Package edu.umd.cloud9.webgraph.data

Examples of edu.umd.cloud9.webgraph.data.AnchorText


                            (float) env.getDefaultDf(), (float) env.getDefaultCf());
  }

  private void preparePostings(String postingsPath) throws Exception {
    postings = new HMapIV<CompressedPositionalPostings>();
    dfs = new HMapII();
    docLengths = new HMapII();

    FSDataInputStream input = fs.open(new Path(postingsPath));
    int termid = input.readInt();
    while(termid != -1) {
      dfs.put(termid, input.readInt());
View Full Code Here


        context.getCounter(LinkCounter.INVALID_URL).increment(1);
        return;
      }

      arrayList.clear();
      arrayList.add(new AnchorText(
          AnchorTextConstants.Type.DOCNO_FIELD.val,
          AnchorTextConstants.EMPTY_STRING, docno));
      keyWord.set(base);
      context.write(keyWord, arrayList);

      // keeping track of the number of documents that have actually been
      // processed
      context.getCounter(LinkCounter.OUTPUT_DOCS).increment(1);

      try
      {
        baseHost = new URI(base).getHost();
      }
      catch (Exception e)
      {
        context.getCounter(LinkCounter.INVALID_URL).increment(1);
        return;
      }

      if (baseHost == null)
      {
        context.getCounter(LinkCounter.INVALID_URL).increment(1);
        return;
      }

      try
      {
        parser.setInputHTML(doc.getContent()); // initializing the
        // parser with new HTML
        // content

        // Setting base URL for the current document
        NodeList nl = parser.parse(null);
        BaseHrefTag baseTag = new BaseHrefTag();
        baseTag.setBaseUrl(base);
        nl.add(baseTag);

        // re-initializing the parser with the fixed content
        parser.setInputHTML(nl.toHtml());

        // listing all LinkTag nodes
        list = parser.extractAllNodesThatMatch(filter);
      }
      catch (ParserException e)
      {
        context.getCounter(LinkCounter.PARSER_FAILED).increment(1);
        return;
      }
      catch (StackOverflowError e)
      {
        context.getCounter(LinkCounter.PARSER_FAILED).increment(1);
        return;
      }

      for (int i = 0; i < list.size(); i++)
      {
        LinkTag link = (LinkTag) list.elementAt(i);
        String anchor = link.getLinkText();
        String url = normalizeURL(link.extractLink());

        if (url == null)
        {
          continue;
        }

        if (url.equals(base))
        { // discard self links
          continue;
        }

        String host = null;
        try
        {
          host = new URI(url).getHost();
        }
        catch (Exception e)
        {
          continue;
        }

        if (host == null)
        {
          continue;
        }

        if (anchor == null)
        {
          anchor = "";
        }

        // normalizing the anchor text
        anchor = normalizer.process(anchor);

        arrayList.clear();
        if (baseHost.equals(host))
        {

          if (!includeInternalLinks)
            continue;

          arrayList.add(new AnchorText(
              AnchorTextConstants.Type.INTERNAL_IN_LINK.val,
              anchor, docno));

        }
        else
        {
          arrayList.add(new AnchorText(
              AnchorTextConstants.Type.EXTERNAL_IN_LINK.val,
              anchor, docno));
        }

        try
        {
          keyWord.set(url);
          context.write(keyWord, arrayList);
        }
        catch (UTFDataFormatException e)
        {
          context.getCounter(LinkCounter.TEXT_TOO_LONG).increment(1);

          keyWord.set(url);
          byte flag = arrayList.get(0).getType();
          arrayList.clear();
          arrayList.add(new AnchorText(flag,
              AnchorTextConstants.EMPTY_STRING, docno));
          context.write(keyWord, arrayList);
        }

      }
View Full Code Here

       
        currentDocument = key.getLeftElement();
        arrayList.clear();
      }
     
      arrayList.add(new AnchorText(AnchorTextConstants.Type.OTHER_TYPES.val, key.getRightElement()));
      int currentIndex = arrayList.size() - 1;
     
      while(values.hasNext()) {
        packet = values.next().get();
       
        //break larger chunks of data to smaller packets - reduces the underlying HashMap costs
        if(arrayList.get(currentIndex).getSize() < AnchorTextConstants.MAXIMUM_SOURCES_PER_PACKET) {
          arrayList.get(currentIndex).addDocument(packet);
        } else {
          arrayList.add(new AnchorText(AnchorTextConstants.Type.OTHER_TYPES.val, key.getRightElement(), packet));
          currentIndex = arrayList.size() - 1;
        }
      }
    }
View Full Code Here

            arrayList.add(data.clone());
          }
        }
      }

      arrayList.add(new AnchorText(AnchorTextConstants.Type.IN_DEGREE.val,
                                   null, indegree));
      arrayList.add(new AnchorText(AnchorTextConstants.Type.URL_FIELD.val,
                                   key.toString()));
      Collections.sort(arrayList);

      //if there was no document number detected,
      //this record would not be emitted.
View Full Code Here

        reporter.incrCounter(LinkCounter.INVALID_URL, 1);
        return;
      }

      arrayList.clear();
      arrayList.add(new AnchorText(AnchorTextConstants.Type.DOCNO_FIELD.val, null, docno));
      keyWord.set(base);
      output.collect(keyWord, arrayList);
      arrayList.clear();

      // keeping track of the number of documents that have actually been
      // processed
      reporter.incrCounter(LinkCounter.OUTPUT_DOCS, 1);

      try {
        baseHost = new URI(base).getHost();
      } catch (Exception e) {
        reporter.incrCounter(LinkCounter.INVALID_URL, 1);
        return;
      }

      if(baseHost == null) {
        reporter.incrCounter(LinkCounter.INVALID_URL, 1);
        return;
      }

      try {
        // initializing the parser with new content
        parser.setInputHTML(doc.getContent());

        // Setting base URL for the current document
        NodeList nl = parser.parse(null);
        BaseHrefTag baseTag = new BaseHrefTag();
        baseTag.setBaseUrl(base);
        nl.add(baseTag);

        // re-initializing the parser with the correct content
        parser.setInputHTML(nl.toHtml());

        // listing all LinkTag nodes
        list = parser.extractAllNodesThatMatch(filter);
      } catch (ParserException e) {
        reporter.incrCounter(LinkCounter.PARSER_FAILED, 1);
        return;
      } catch (StackOverflowError e) {
        reporter.incrCounter(LinkCounter.PARSER_FAILED, 1);
        return;
      }

      for(int i = 0; i < list.size(); i++) {
        LinkTag link = (LinkTag) list.elementAt(i);
        String anchor = link.getLinkText();
        String url = link.extractLink();

        if(url == null) {
          continue;
        }

        if(url.equals(base)) {// discard self links
          continue;
        }

        String host = null;
        try {
          host = new URI(url).getHost();
        } catch (Exception e) {
          continue;
        }

        if(host == null) {
          continue;
        }

        if(anchor == null) {
          anchor = "";
        }

        // normalizing the anchor text
        anchor = normalizer.process(anchor);

        arrayList.clear();
        if(baseHost.equals(host)) {
          if(!includeInternalLinks) {
            continue;
          }

          arrayList.add(new AnchorText(
              AnchorTextConstants.Type.INTERNAL_IN_LINK.val, anchor, docno));
        } else {
          arrayList.add(new AnchorText(
              AnchorTextConstants.Type.EXTERNAL_IN_LINK.val, anchor, docno));
        }

        try {
          keyWord.set(url);
          output.collect(keyWord, arrayList);
        } catch (UTFDataFormatException e) {
          reporter.incrCounter(LinkCounter.TEXT_TOO_LONG, 1);

          keyWord.set(url);
          byte flag = arrayList.get(0).getType();
          arrayList.clear();
          arrayList.add(new AnchorText(flag, AnchorTextConstants.EMPTY_STRING, docno));
          output.collect(keyWord, arrayList);
        }
      }
    }
View Full Code Here

        flag = data.isExternalInLink() ?
          AnchorTextConstants.Type.EXTERNAL_OUT_LINK.val :
          AnchorTextConstants.Type.INTERNAL_OUT_LINK.val;

        arrayList.clear();
        arrayList.add(new AnchorText(flag,
            AnchorTextConstants.EMPTY_STRING, key.get()));
        for(int source : data) {
          keyWord.set(source);
          output.collect(keyWord, arrayList);
        }
View Full Code Here

            arrayList.add(data.clone());
          }
        }
      }

      arrayList.add(new AnchorText(
          AnchorTextConstants.Type.OUT_DEGREE.val, null, outdegree));
      Collections.sort(arrayList);
      output.collect(key, arrayList);
    }
View Full Code Here

public class AnchorTextTest {

  @Test
  public void testConstructors() {
    AnchorText anchor = new AnchorText();
    assertTrue(anchor.isInternalInLink());
    assertEquals(anchor.getText(), AnchorTextConstants.EMPTY_STRING);
    assertEquals(anchor.getSize(), 0);
    assertEquals(anchor.getWeight(), 0, 1e-100);
   
    AnchorText anchor2 = new AnchorText(AnchorTextConstants.Type.EXTERNAL_IN_LINK.val, "text");
    assertEquals(anchor2.getText(), "text");
    assertEquals(anchor2.getSize(), 0);
   
    AnchorText anchor3 = new AnchorText(AnchorTextConstants.Type.EXTERNAL_OUT_LINK.val, "text");
    assertNull(anchor3.getText());
    assertEquals(anchor3.getSize(), 0);
   
    AnchorText anchor4 = new AnchorText(AnchorTextConstants.Type.DOCNO_FIELD.val, "text", 100);
    assertNull(anchor4.getText());
    assertEquals(anchor4.getSize(), 1);
  }
View Full Code Here

    assertEquals(anchor4.getSize(), 1);
  }
 
  @Test
  public void testClone() {
    AnchorText anchor1 = new AnchorText(AnchorTextConstants.Type.EXTERNAL_OUT_LINK.val, "text", 1);
   
    AnchorText anchor2 = anchor1.clone();
    anchor2.setText("some text");
    assertTrue(anchor2.equals(anchor1));
    anchor2.addDocument(2);
    assertNull(anchor2.getText());
    assertEquals(anchor2.getSize(), 2);
    assertTrue(anchor2.equalsIgnoreSources(anchor1));
   
    AnchorText anchor3 = new AnchorText(AnchorTextConstants.Type.DOCNO_FIELD.val, "text");
    anchor3.addDocumentsFrom(anchor2);
    anchor3.addDocument(2);
    assertNull(anchor3.getText());
    assertEquals(anchor3.getSize(), 2);
   
    anchor3.setWeight(1);
    assertEquals(anchor3.getWeight(), 0, 1e-100);
   
    assertEquals(anchor3.compareTo(anchor2), 1);
   
    ByteArrayOutputStream bstream = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(bstream);
   
    try {
      anchor3.write(out);
      out.close();
    }catch(Exception e) {
    }
   
    DataInputStream in = new DataInputStream(new ByteArrayInputStream(bstream.toByteArray()));
    AnchorText readAnchor = new AnchorText();
    try {
      readAnchor.readFields(in);
      in.close();
    }catch(Exception e) {
    }
   
    assertEquals(anchor3, readAnchor);
View Full Code Here

   
  }
 
  @Test
  public void testIterable() {
    AnchorText anchor = new AnchorText(AnchorTextConstants.Type.EXTERNAL_IN_LINK.val, "text");
    anchor.addDocument(1);
    anchor.addDocument(2);
    anchor.addDocument(3);
   
    int[] sources = anchor.getDocuments();
   
    assertEquals(sources[0], 1);
    assertEquals(sources[1], 2);
    assertEquals(sources[2], 3);
   
    anchor.resetToType(AnchorTextConstants.Type.URL_FIELD.val);
    assertEquals(anchor.getSize(), 0);
   
    for(@SuppressWarnings("unused") int s : anchor)
      fail();
  }
View Full Code Here

TOP

Related Classes of edu.umd.cloud9.webgraph.data.AnchorText

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.