Examples of org.apache.nutch.parse.Parse

org.apache.nutch.parse.Parse
The result of parsing a page's raw content. @see Parser#getParse(Content)

  private Configuration conf;
  private String[] headings;
  private boolean multiValued = false;


  public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
    Parse parse = parseResult.get(content.getUrl());


    for (int i = 0 ; headings != null && i < headings.length ; i++ ) {
      List<String> discoveredHeadings = getElement(doc, headings[i]);


      if (discoveredHeadings.size() > 0) {
        for (String heading : discoveredHeadings) {
          if (heading != null) {
            heading.trim();


            if (heading.length() > 0) {
              parse.getData().getParseMeta().add(headings[i], heading);
            }
          }
        }
      }
    }

View Full Code Here


  public String getTextContent(String fileName) throws ProtocolException, ParseException {
    String urlString = "file:" + sampleDir + fileSeparator + fileName;
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());
    return parse.getText();
  }

View Full Code Here


  /**
   * For every sample file whose name starts with "ootest", fetches the file,
   * parses it with the "parse-tika" extension and checks that some text was
   * extracted.
   *
   * <p>NOTE: this excerpt ends after the assertion; the closing of the loop
   * and of the method are not visible here.
   */
  @Test
  public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Content content;
    Parse parse;
    Configuration conf = NutchConfiguration.create();
    Protocol protocol;
    // one factory, built once, is reused for every sample file
    ProtocolFactory factory = new ProtocolFactory(conf);


    // printed for information only; expectedText is not compared below
    System.out.println("Expected : "+expectedText);


    for (int i=0; i<sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];


      // skip every sample whose file name does not start with "ootest"
      if (sampleFiles[i].startsWith("ootest")==false) continue;


      protocol = factory.getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());


      // collapse each run of spaces, tabs and line breaks into a single space
      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();


      // simply test for the presence of a text - the ordering of the elements may differ from what was expected
      // in the previous tests
      // NOTE(review): text cannot be null here, because replaceAll/trim above
      // would already have thrown; only the length check is effective.
      Assert.assertTrue(text!=null && text.length() > 0);

View Full Code Here

  @Test
  public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;


    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];


      Configuration conf = NutchConfiguration.create();
      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());


      Assert.assertEquals("121", parse.getData().getMeta("width"));
      Assert.assertEquals("48", parse.getData().getMeta("height"));
    }
  }

View Full Code Here

  public void testIt() throws ProtocolException, ParseException {


    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;


    Configuration conf = NutchConfiguration.create();
    urlString = "file:" + sampleDir + fileSeparator + rtfFile;
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString),
        new CrawlDatum()).getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
        .get(content.getUrl());
    String text = parse.getText();
    Assert.assertEquals("The quick brown fox jumps over the lazy dog", text.trim());


    String title = parse.getData().getTitle();
    Metadata meta = parse.getData().getParseMeta();


    Assert.assertEquals("test rft document", title);
    Assert.assertEquals("tests", meta.get(DublinCore.SUBJECT));


  }

View Full Code Here

   */
  public ParseResult filter(Content content, ParseResult parseResult,
    HTMLMetaTags metaTags, DocumentFragment doc) {
    
    // get parse obj
    Parse parse = parseResult.get(content.getUrl());
    // Trying to find the document's rel-tags
    Parser parser = new Parser(doc);
    Set<?> tags = parser.getRelTags();
    Iterator<?> iter = tags.iterator();
    Metadata metadata = parse.getData().getParseMeta();
    while (iter.hasNext())
      metadata.add(REL_TAG, (String) iter.next());


    return parseResult;
  }

View Full Code Here

  @Test
  public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;


    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];


      Configuration conf = NutchConfiguration.create();
      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());


      int index = parse.getText().indexOf(expectedText);
      Assert.assertTrue(index > 0);
    }
  }

View Full Code Here

  /**
   * Parses each sample file with the "parse-tika" extension and checks that
   * exactly two outlinks were extracted.
   *
   * <p>NOTE: this excerpt ends after the outlink-count assertion; the rest of
   * the method is not visible here.
   */
  @Test
  public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;


    // one configuration is shared by all sample files
    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];


      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString),
          new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-tika",
          content).get(content.getUrl());


      // check that there are 2 outlinks:
      // unlike the original parse-rss
      // tika ignores the URL and description of the channel


      // http://test.channel.com
      // http://www-scf.usc.edu/~mattmann/
      // http://www.nutch.org


      ParseData theParseData = parse.getData();


      Outlink[] theOutlinks = theParseData.getOutlinks();


      // fails with the message below unless exactly two outlinks were found
      Assert.assertTrue("There aren't 2 outlinks read!",
          theOutlinks.length == 2);

View Full Code Here

  private Configuration conf;
  
  /**
   * Collects outlinks by walking the document and, when any are found, puts a
   * rebuilt parse into {@code parseResult} whose outlinks are the new ones
   * followed by those already on the parse.
   *
   * <p>NOTE: the closing brace of this method is not visible in this excerpt.
   *
   * @param content     the fetched content; supplies the URL key and base URL
   * @param parseResult the parse result to read from and update
   * @param metaTags    passed through to {@code walk}
   * @param doc         the document fragment that is walked
   * @return the same {@code parseResult} instance that was passed in
   */
  public ParseResult filter(Content content, ParseResult parseResult,
    HTMLMetaTags metaTags, DocumentFragment doc) {


    Parse parse = parseResult.get(content.getUrl());


    String url = content.getBaseUrl();
    ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
    // presumably walk() appends the outlinks it discovers to the list; its
    // body is not shown here - verify against its definition
    walk(doc, parse, metaTags, url, outlinks);
    if (outlinks.size() > 0) {
      Outlink[] old = parse.getData().getOutlinks();
      String title = parse.getData().getTitle();
      List<Outlink> list = Arrays.asList(old);
      // existing outlinks are appended after the newly collected ones
      outlinks.addAll(list);
      ParseStatus status = parse.getData().getStatus();
      String text = parse.getText();
      Outlink[] newlinks = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]);
      // status, title and both metadata objects are carried over unchanged;
      // only the outlink array differs
      ParseData parseData = new ParseData(status, title, newlinks,
                                          parse.getData().getContentMeta(),
                                          parse.getData().getParseMeta());


      // replace original parse obj with new one
      parseResult.put(content.getUrl(), new ParseText(text), parseData);
    }
    return parseResult;

View Full Code Here

  }


  /**
   * Passes every entry of the page's parse metadata, by name and with all of
   * its values, to {@code addIndexedMetatags}.
   *
   * <p>NOTE: this excerpt ends inside the loop; the rest of the method is not
   * visible here.
   *
   * @param content     the fetched content; its URL selects the parse
   * @param parseResult the parse result holding the parse for that URL
   * @param metaTags    not used in the visible part of the method
   * @param doc         not used in the visible part of the method
   */
  public ParseResult filter(Content content, ParseResult parseResult,
      HTMLMetaTags metaTags, DocumentFragment doc) {


    Parse parse = parseResult.get(content.getUrl());
    Metadata metadata = parse.getData().getParseMeta();


    // check in the metadata first : the tika-parser
    // might have stored the values there already
    for (String mdName : metadata.names()) {
      // the same metadata object is both the source of the values and the
      // first argument; what the helper does with it is not shown here
      addIndexedMetatags(metadata, mdName, metadata.getValues(mdName));

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.nutch.parse.Parse

com.atlantbh.nutch.filter.index.omit.OmitIndexingFilterTest

com.atlantbh.nutch.filter.xpath.XPathHtmlParserFilterTest

com.atlantbh.nutch.filter.xpath.XPathIndexingFilterTest

com.atlantbh.nutch.index.alternativedataflow.AlternativeDataFlowIndexingFilterTest

com.atlantbh.nutch.index.alternativedataflow.flow.CsvDataFlowTest

org.apache.nutch.analysis.lang.HTMLLanguageParser

org.apache.nutch.analysis.lang.LanguageIdentifier

org.apache.nutch.analysis.lang.TestHTMLLanguageParser

org.apache.nutch.indexer.field.BasicFields$Extractor

org.apache.nutch.indexer.IndexerMapReduce

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.