Examples of org.apache.nutch.parse.ParseImpl

org.apache.nutch.parse.ParseImpl
The result of parsing a page's raw content. @see Parser#getParse(Content)

        }
      }


      // get parse metadata
      Metadata metadata = parseData.getContentMeta();
      Parse parse = new ParseImpl(parseText, parseData);


      // handle redirect urls
      Text reprUrlText = (Text)fetchDatum.getMetaData().get(
        Nutch.WRITABLE_REPR_URL_KEY);
      String reprUrl = reprUrlText != null ? reprUrlText.toString() : null;
      String url = key.toString();
      String fieldUrl = (reprUrl != null) ? reprUrl : url;
      String host = URLUtil.getHost(fieldUrl);


      // add segment, used to map from merged index back to segment files
      FieldWritable segField = new FieldWritable(Fields.SEGMENT,
        metadata.get(Nutch.SEGMENT_NAME_KEY), FieldType.CONTENT, false, true,
        false);
      fieldsList.add(segField);


      // add digest, used by dedup
      FieldWritable digestField = new FieldWritable(Fields.DIGEST,
        metadata.get(Nutch.SIGNATURE_KEY), FieldType.CONTENT, false, true,
        false);
      fieldsList.add(digestField);


      // url is both stored and indexed, so it's both searchable and returned
      fieldsList.add(new FieldWritable(Fields.URL, fieldUrl, FieldType.CONTENT,
        true, true, true));
      fieldsList.add(new FieldWritable(Fields.SEG_URL, url, FieldType.CONTENT,
        false, true, false));


      if (reprUrl != null) {
        // also store the original url as both stored and indexed
        fieldsList.add(new FieldWritable(Fields.ORIG_URL, url,
          FieldType.CONTENT, true, true, true));
      }


      if (host != null) {
        // add host as un-stored, indexed and tokenized
        FieldWritable hostField = new FieldWritable(Fields.HOST, host,
          FieldType.CONTENT, true, false, true);
        fieldsList.add(hostField);


        // add site as un-stored, indexed and un-tokenized
        FieldWritable siteField = new FieldWritable(Fields.SITE, host,
          FieldType.CONTENT, true, false, false);
        fieldsList.add(siteField);
      }


      // content is indexed, so that it's searchable, but not stored in index
      fieldsList.add(new FieldWritable(Fields.CONTENT, parse.getText(),
        FieldType.CONTENT, true, false, true));


      // title
      String title = parse.getData().getTitle();
      if (title.length() > MAX_TITLE_LENGTH) { // truncate title if needed
        title = title.substring(0, MAX_TITLE_LENGTH);
      }
      // add title indexed and stored so that it can be displayed
      fieldsList.add(new FieldWritable(Fields.TITLE, title, FieldType.CONTENT,
        true, true, true));


      // add cached content/summary display policy, if available
      String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY);
      if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
        fieldsList.add(new FieldWritable(Fields.CACHE, caching,
          FieldType.CONTENT, false, true, false));
      }

View Full Code Here

    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());


    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
                                        outlinks, content.getMetadata());
    return ParseResult.createParseResult(content.getUrl(), 
                                         new ParseImpl(text, parseData));
  }

View Full Code Here

    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());


    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
                                        outlinks, content.getMetadata(),
                                        metadata);
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
    // any filter?
    //return HtmlParseFilters.filter(content, parse, root);
  }

View Full Code Here

              // Score at this stage is 1.0f.
              metaData.set(Nutch.SCORE_KEY, Float.toString(datum.getScore())); // TODO MC
                                
              // WritableComparable outkey = new UTF8(d.urlString);
              WritableComparable outkey = new Text(url); 
              Writable outvalue = new FetcherOutput(datum, null, new ParseImpl(parse));                 
                    
              // output.collect(outkey, outvalue); 
              Text key=Nutchwax.generateWaxKey(outkey, collectionName);
              output.collect(key, outvalue);                        
            }

View Full Code Here

  private Configuration conf;
    
  public Parse getParse(Content content)
  {
//    return new ParseImpl(content.getUrl(),  TODO MC BUG - don't index url as content
    return new ParseImpl("",
      new ParseData(ParseStatus.STATUS_SUCCESS,
      "", new Outlink[0], content.getMetadata()));
  }

View Full Code Here


    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
                                        outlinks, content.getMetadata(),
                                        metadata);
    parseData.setConf(this.conf);
    return new ParseImpl(text, parseData);
    // any filter?
    //return HtmlParseFilters.filter(content, parse, root);
  }

View Full Code Here


    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
      outlinks, content.getMetadata());
    parseData.setConf(this.conf);
    
    return new ParseImpl(text, parseData);
  }

View Full Code Here


    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
                                        outlinks, content.getMetadata(),
                                        metadata);
    parseData.setConf(this.conf);
    return new ParseImpl(text, parseData);
  }

View Full Code Here

          kbPerSecond));
      }
    }


    Writable v = new FetcherOutput(datum, null,
      parse != null ? new ParseImpl(parse) : null);       
    if (collectionType.equals(Global.COLLECTION_TYPE_MULTIPLE)) {
      LOG.info("multiple: "+SqlSearcher.getCollectionNameWithTimestamp(this.collectionName,arcData.getDate())+" "+url); 
      output.collect(Nutchwax.generateWaxKey(url,SqlSearcher.getCollectionNameWithTimestamp(this.collectionName,arcData.getDate())), v); 
    }
    else {

View Full Code Here

    String class1 = "NonExistingFilter";
    String class2 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);


    IndexingFilters filters = new IndexingFilters(conf);
    filters.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
      new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
      "http://www.example.com/"), new CrawlDatum(), new Inlinks());
  }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.nutch.parse.ParseImpl

org.apache.nutch.crawl.TextProfileSignature

org.apache.nutch.indexer.anchor.TestAnchorIndexingFilter

org.apache.nutch.indexer.basic.TestBasicIndexingFilter

org.apache.nutch.indexer.field.BasicFields$Extractor

org.apache.nutch.indexer.IndexerMapReduce

org.apache.nutch.indexer.more.TestMoreIndexingFilter

org.apache.nutch.indexer.staticfield.TestStaticFieldIndexerTest

org.apache.nutch.indexer.TestIndexingFilters

org.apache.nutch.parse.ext.ExtParser

org.apache.nutch.parse.ext.WaxExtParser

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by Oracle Inc. Contact coftware#gmail.com.