Examples of org.apache.nutch.indexer.IndexingFilters

org.apache.nutch.indexer.IndexingFilters
Creates and caches the plugins that implement {@link IndexingFilter}.

    
    if (LOG.isInfoEnabled()) {
      LOG.info("fetching: " + url);
    }
        
    IndexingFilters indexers = new IndexingFilters(conf);
    
    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    CrawlDatum datum = new CrawlDatum();
    
    Content content = protocol.getProtocolOutput(new Text(url), datum)
        .getContent();


    // store the guessed content type in the crawldatum
    if (content.getContentType() != null) datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType()));


    if (content == null) {
      System.out.println("No content for " + url);
      return 0;
    }
    
    contentType = content.getContentType();
    
    if (contentType == null) {
      return -1;
    }
    
    if (LOG.isInfoEnabled()) {
      LOG.info("parsing: " + url);
      LOG.info("contentType: " + contentType);
    }


    ParseResult parseResult = new ParseUtil(conf).parse(content);
    
    NutchDocument doc = new NutchDocument();
    Text urlText = new Text(url);


    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    try {
      indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
      e.printStackTrace();
    }


    for (String fname : doc.getFieldNames()) {

View Full Code Here


    if (LOG.isInfoEnabled()) {
      LOG.info("fetching: " + url);
    }


    IndexingFilters indexers = new IndexingFilters(conf);


    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    CrawlDatum datum = new CrawlDatum();


    ProtocolOutput output = protocol.getProtocolOutput(new Text(url), datum);
    
    if (!output.getStatus().isSuccess()) {
      System.out.println("Fetch failed with protocol status: " + output.getStatus());
      return 0;
    }
         
    Content content = output.getContent();


    if (content == null) {
      System.out.println("No content for " + url);
      return 0;
    }


    contentType = content.getContentType();


    if (contentType == null) {
      return -1;
    }


    // store the guessed content type in the crawldatum
    datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(contentType));


    if (ParseSegment.isTruncated(content)) {
      LOG.warn("Content is truncated, parse may fail!");
    }


    if (LOG.isInfoEnabled()) {
      LOG.info("parsing: " + url);
      LOG.info("contentType: " + contentType);
    }


    ParseResult parseResult = new ParseUtil(conf).parse(content);


    NutchDocument doc = new NutchDocument();
    Text urlText = new Text(url);


    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    try {
      doc = indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
      e.printStackTrace();
    }


    if (doc == null) {

View Full Code Here

    
    if (LOG.isInfoEnabled()) {
      LOG.info("fetching: " + url);
    }
        
    IndexingFilters indexers = new IndexingFilters(conf);
    
    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    CrawlDatum datum = new CrawlDatum();
    
    Content content = protocol.getProtocolOutput(new Text(url), datum)
        .getContent();
    
    if (content == null) {
      System.out.println("No content for " + url);
      return 0;
    }
    
    contentType = content.getContentType();
    
    if (contentType == null) {
      return -1;
    }
    
    if (LOG.isInfoEnabled()) {
      LOG.info("parsing: " + url);
      LOG.info("contentType: " + contentType);
    }
    
    ParseResult parseResult = new ParseUtil(conf).parse(content);
    
    NutchDocument doc = new NutchDocument();
    Text urlText = new Text(url);


    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    try {
      indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
      e.printStackTrace();
    }


    for (String fname : doc.getFieldNames()) {

View Full Code Here


    if (LOG.isInfoEnabled()) {
      LOG.info("fetching: " + url);
    }


    IndexingFilters indexers = new IndexingFilters(conf);


    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    CrawlDatum datum = new CrawlDatum();


    Content content = protocol.getProtocolOutput(new Text(url), datum)
        .getContent();


    if (content == null) {
      System.out.println("No content for " + url);
      return 0;
    }


    contentType = content.getContentType();


    if (contentType == null) {
      return -1;
    }


    // store the guessed content type in the crawldatum
    datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(contentType));


    if (LOG.isInfoEnabled()) {
      LOG.info("parsing: " + url);
      LOG.info("contentType: " + contentType);
    }


    ParseResult parseResult = new ParseUtil(conf).parse(content);


    NutchDocument doc = new NutchDocument();
    Text urlText = new Text(url);


    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    try {
      doc = indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
      e.printStackTrace();
    }


    if (doc == null) {

View Full Code Here

TOP

Related Classes of org.apache.nutch.indexer.IndexingFilters

org.apache.nutch.indexer.IndexingFiltersChecker

org.apache.nutch.plugin.Extension

org.apache.nutch.plugin.ExtensionPoint

org.apache.nutch.util.ObjectCache

java.util.HashMap

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by ORACLE Inc. Contact coftware#gmail.com.