Package org.apache.nutch.crawl

Examples of org.apache.nutch.crawl.Inlinks


      return;
    }

    ParseData parseData = (ParseData)value;
    Outlink[] outlinks = parseData.getOutlinks();
    Inlinks inlinks = new Inlinks();
   
    String fromUrlCriginalColectionName=null;
    String fromUrlTimestamp=null;
    if (collectionType.equals(Global.COLLECTION_TYPE_MULTIPLE)) {         
      fromUrlCriginalColectionName=SqlSearcher.getCollectionNameOriginal(collection);     
      fromUrlTimestamp=SqlSearcher.getTimestampOriginal(collection);     
    }
   
    for (int i = 0; i < outlinks.length; i++) {
      Outlink outlink = outlinks[i];
      String toUrl = outlink.getToUrl();
          
      if (this.nwIgnoreInternalLinks)
      {
        String toHost = getHost(toUrl);
       
        if (toHost == null || toHost.equals(fromHost)) { // internal link             
          continue;                               // skip it
        }
      }

      if (this.nwUrlNormalizers != null)
      {
        try {         
          toUrl = this.nwUrlNormalizers. normalize(toUrl, URLNormalizers.SCOPE_LINKDB);
        }
        catch (Exception e) {
          LOG.warn("Skipping " + toUrl + ":" + e);
          toUrl = null;
        }
      }
     
      if (toUrl != null && this.nwUrlFilters != null) {
        try {
          toUrl = this.nwUrlFilters.filter(toUrl); // filter the url
          if (toUrl==null) { 
            LOG.info("LINKDB URL FILTERED")
          }
        }
        catch (Exception e) {
          LOG.warn("Skipping " + toUrl + ":" + e);
          toUrl = null;
        }
      }
   
      if (toUrl == null) {
        continue;
      }

      inlinks.clear();
   
      String anchor = outlink.getAnchor();        // truncate long anchors
      
      if (anchor.length() > this.nwMaxAnchorLength) {
        anchor = anchor.substring(0, this.nwMaxAnchorLength);
      }
      inlinks.add(new Inlink(fromUrl, anchor));   // collect inverted link     
      if (collectionType.equals(Global.COLLECTION_TYPE_MULTIPLE)) {
        try {         
            String toUrlNearTimestamp = sqlsearcher.selectNearTimestamp(toUrl, fromUrlTimestamp);
        LOG.debug("LinkDB: toUrlNearTimestamp: " + toUrlNearTimestamp + " toUrl:" + toUrl + " fromUrlTimestamp " + fromUrlTimestamp);
          if (toUrlNearTimestamp!=null) {
View Full Code Here


  public void reduce(WritableComparable key, Iterator values,
                     OutputCollector output, Reporter reporter)
    throws IOException {    
   
    Inlinks inlinks = null;
    CrawlDatum dbDatum = null;
    CrawlDatum fetchDatum = null;
    CrawlDatum redir = null;
    ParseData parseData = null;
    ParseText parseText = null;
    Float pagerank = null; // TODO MC
    while (values.hasNext()) {
      Object value = ((ObjectWritable)values.next()).get(); // unwrap
         
      if (value instanceof Inlinks) {
        inlinks = (Inlinks)value;
      }
      else if (value instanceof CrawlDatum) {
         
        CrawlDatum datum = (CrawlDatum)value;       
        if (CrawlDatum.hasDbStatus(datum))
          dbDatum = datum;
        else if (CrawlDatum.hasFetchStatus(datum))
          fetchDatum = datum;
        else if (CrawlDatum.STATUS_LINKED == datum.getStatus())
          // redirected page
          redir = datum;
        else
          throw new RuntimeException("Unexpected status: "+datum.getStatus());
      }
      else if (value instanceof ParseData) {
        parseData = (ParseData)value;
      }
      else if (value instanceof ParseText) {
        parseText = (ParseText)value;
      }      
      else if (value instanceof FloatWritable) {  // TODO MC
        pagerank = ((FloatWritable)value).get();
      }
      else if (LOG.isWarnEnabled()) {
        LOG.warn("Unrecognized type: "+value.getClass());
      }
    }     
   
    if (collectionType.equals(Global.COLLECTION_TYPE_TREC)) {
      LOG.info("index TREC: "+key.toString()+" "+(redir==null)+" "+(fetchDatum == null)+" "+(dbDatum == null)+" "+(parseText == null)+" "+(parseData == null)+" "+(inlinks==null)+" "+(pagerank==null));
    }
   
    if (redir != null) { // does not work - see http://www.mail-archive.com/nutch-commits@lucene.apache.org/msg01971.html
      // XXX page was redirected - what should we do?
      // XXX discard it for now
     
      LOG.info("index REDIR:"+redir); // sanity check
      return;
    }
       

    if (collectionType.equals(Global.COLLECTION_TYPE_TREC)) {
      if (fetchDatum == null /*|| dbDatum == null*/
          || parseText == null || parseData == null) {
        return;                                     // only have inlinks
      }
    }
    else {
      if (fetchDatum == null || dbDatum == null
            || parseText == null || parseData == null) {
            return;                                     // only have inlinks
        }
    }
      
    Document doc = new Document();
    Metadata metadata = parseData.getContentMeta();
  
    if (metadata.get(Nutch.SEGMENT_NAME_KEY)==null || metadata.get(Nutch.SIGNATURE_KEY)==null) {
      LOG.error("Metadata empty:"+key+" "+parseData.toString());
      return;
    }
   
    // add segment, used to map from merged index back to segment files
    doc.add(new Field("segment", metadata.get(Nutch.SEGMENT_NAME_KEY),
            Field.Store.YES, Field.Index.NO));

    // add digest, used by dedup
    doc.add(new Field("digest", metadata.get(Nutch.SIGNATURE_KEY),
            Field.Store.YES, Field.Index.NO));
        
    Parse parse = new ParseImpl(parseText, parseData);
    try {
      // run indexing filters
      doc = this.filters.filter(doc, parse, (Text)key, fetchDatum, inlinks);
    } catch (IndexingException e) {
      if (LOG.isWarnEnabled()) { LOG.warn("Error indexing "+key+": "+e); }
      return;
    }
   
    float boost = 1.0f;
    // run scoring filters
    if (dbDatum!=null || !collectionType.equals(Global.COLLECTION_TYPE_TREC)) {
      try {
        boost = this.scfilters.indexerScore((Text)key, doc, dbDatum,
              fetchDatum, parse, inlinks, boost);
      } catch (ScoringFilterException e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Error calculating score " + key + ": " + e);
        }
        return;
      }
    }            
   
    // apply boost to all indexed fields.
    //    doc.setBoost(boost); - it uses the default 1.0f. if set, all fields will have this value boosted
    // store boost for use by explain and dedup
    doc.add(new Field("boost", Float.toString(boost), Field.Store.YES, Field.Index.NO));       
    doc.add(new Field("inlinks", (inlinks==null) ? "0" : Integer.toString(inlinks.size()), Field.Store.YES, Field.Index.NO));
    doc.add(new Field("outlinks", (parseData.getOutlinks()==null) ? "0" : Integer.toString(parseData.getOutlinks().length), Field.Store.YES, Field.Index.NO));
    doc.add(new Field("pagerank", (pagerank==null) ? "0" : Float.toString(pagerank), Field.Store.YES, Field.Index.NO));
   
    output.collect(key, new ObjectWritable(doc));    
  }
View Full Code Here

    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);

    IndexingFilters filters = new IndexingFilters(conf);
    filters.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
      new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
      "http://www.example.com/"), new CrawlDatum(), new Inlinks());
  }
View Full Code Here

    conf.addResource("crawl-tests.xml");

    IndexingFilters filters = new IndexingFilters(conf);
    NutchDocument doc = filters.filter(null, new ParseImpl("text", new ParseData(
      new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
      "http://www.example.com/"), new CrawlDatum(), new Inlinks());
    
    Assert.assertNull(doc);
  }
View Full Code Here

    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1);

    IndexingFilters filters1 = new IndexingFilters(conf);
    NutchDocument fdoc1 = filters1.filter(new NutchDocument(),new ParseImpl("text",new ParseData(
      new ParseStatus(),"title",new Outlink[0],new Metadata())),new Text("http://www.example.com/"),
      new CrawlDatum(),new Inlinks());

    // add another index filter
    String class2 = "org.apache.nutch.indexer.metadata.MetadataIndexer";
    // set content metadata
    Metadata md = new Metadata();
    md.add("example","data");
    // set content metadata property defined in MetadataIndexer
    conf.set("index.content.md","example");
    // add MetadataIndxer filter
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
    IndexingFilters filters2 = new IndexingFilters(conf);
    NutchDocument fdoc2 = filters2.filter(new NutchDocument(),new ParseImpl("text",new ParseData(
      new ParseStatus(),"title",new Outlink[0],md)),new Text("http://www.example.com/"),
      new CrawlDatum(),new Inlinks());
    Assert.assertEquals(fdoc1.getFieldNames().size(),fdoc2.getFieldNames().size());
  }
View Full Code Here

    NutchDocument doc = new NutchDocument();
    doc.add("id", url);
    Text urlText = new Text(url);

    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);

    byte[] signature = SignatureFactory.getSignature(conf).calculate(content,
        parse);
    parse.getData().getContentMeta()
View Full Code Here

  public void setUp() throws Exception {
    conf = NutchConfiguration.create();
    parse = new ParseImpl();
    url = new Text("http://nutch.apache.org/index.html");
    crawlDatum = new CrawlDatum();
    inlinks = new Inlinks();
    filter = new StaticFieldIndexer();
  }
View Full Code Here

    ParseImpl parse = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", parseData);

    CrawlDatum crawlDatum = new CrawlDatum();
    crawlDatum.setFetchTime(100L);

    Inlinks inlinks = new Inlinks();

    try {
      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), crawlDatum, inlinks);
    } catch(Exception e){
      e.printStackTrace();
View Full Code Here

  }

  public void reduce(Text key, Iterator<NutchWritable> values,
                     OutputCollector<Text, NutchIndexAction> output, Reporter reporter)
    throws IOException {
    Inlinks inlinks = null;
    CrawlDatum dbDatum = null;
    CrawlDatum fetchDatum = null;
    ParseData parseData = null;
    ParseText parseText = null;
View Full Code Here

    AnchorIndexingFilter filter = new AnchorIndexingFilter();
    filter.setConf(conf);
    Assert.assertNotNull(filter);
    NutchDocument doc = new NutchDocument();
    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
    Inlinks inlinks = new Inlinks();
    inlinks.add(new Inlink("http://test1.com/", "text1"));
    inlinks.add(new Inlink("http://test2.com/", "text2"));
    inlinks.add(new Inlink("http://test3.com/", "text2"));
    try {
      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), inlinks);
    } catch(Exception e){
      e.printStackTrace();
      Assert.fail(e.getMessage());
View Full Code Here

TOP

Related Classes of org.apache.nutch.crawl.Inlinks

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.