Package org.apache.nutch.parse

Examples of org.apache.nutch.parse.ParseImpl


    Metadata metadata = new Metadata();
    metadata.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);

    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
      new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());

    Assert.assertEquals("content-disposition not detected", "filename.ext", doc.getFieldValue("title"));
  }
View Full Code Here


  private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
    Metadata metadata = new Metadata();
    metadata.add(Response.CONTENT_TYPE, source);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
    Assert.assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
  }
View Full Code Here

      idx = Math.min(MAX_TITLE_LEN, script.length());
      title = script.substring(0, idx);
    }
    ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks,
                                 c.getMetadata());
    return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd));
  }
View Full Code Here

    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
                                        outlinks, content.getMetadata(),
                                        metadata);
    parseData.setConf(this.conf);
    return new ParseImpl(text, parseData);
  }
View Full Code Here

    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
                                        metadataCollector.getTitle(),
                                        metadataCollector.getOutlinks(),
                                        contentMeta,
                                        metadataCollector.getData());
    return new ParseImpl(metadataCollector.getText(), parseData);
  }
View Full Code Here

    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
                                        metadataCollector.getTitle(),
                                        metadataCollector.getOutlinks(),
                                        contentMeta,
                                        metadataCollector.getData());
    return new ParseImpl(metadataCollector.getText(), parseData);
  }
View Full Code Here

                                              resultTitle, outlinks,
                                              content.getMetadata());
    parseData.setConf(this.conf);

    if (LOG.isTraceEnabled()) { LOG.trace("Zip file parsed sucessfully !!"); }
    return new ParseImpl(resultText, parseData);
  }
View Full Code Here

    String class1 = "NonExistingFilter";
    String class2 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);

    IndexingFilters filters = new IndexingFilters(conf);
    filters.filter(new Document(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
  }
View Full Code Here

        // }

        ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
                contentTitle.toString(), outlinks, content.getMetadata());
        parseData.setConf(this.conf);
        return new ParseImpl(indexText.toString(), parseData);
    }
View Full Code Here

    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());

    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
                                        outlinks, content.getMetadata());
    parseData.setConf(this.conf);
    return new ParseImpl(text, parseData);
  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.parse.ParseImpl

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.