Examples of org.apache.nutch.parse.Parse

org.apache.nutch.parse.Parse
The result of parsing a page's raw content. @see Parser#getParse(Content)

      List<Outlink> list = Arrays.asList(old);
      outlinks.addAll(list);
      ParseStatus status = parse.getParseStatus();
      String text = parse.getText();
      Outlink[] newlinks = outlinks.toArray(new Outlink[outlinks.size()]);
      return new Parse(text, title, newlinks, status);
    }
    return parse;
  }

View Full Code Here

      title = script.substring(0, idx);
    } else {
      idx = Math.min(MAX_TITLE_LEN, script.length());
      title = script.substring(0, idx);
    }
    Parse parse =
      new Parse(script, title, outlinks, ParseStatusUtils.STATUS_SUCCESS);
    return parse;
  }

View Full Code Here

    try {
      protocol = ProtocolFactory.getProtocol(url);
      Content content = protocol.getProtocolOutput(url).getContent();
      String contentType = content.getContentType();
      Parser parser = ParserFactory.getParser(contentType, url);
      Parse parse = parser.getParse(content);
      System.out.println("text:" + parse.getText());
      return parse.getText();


    } catch (ProtocolNotFound e) {
      e.printStackTrace();
    } catch (ProtocolException e) {
      e.printStackTrace();

View Full Code Here

      /* loop through the test documents and validate result */
      for (int t = 0; t < docs.length; t++) {


        Content content = getContent(docs[t]);
        Parser parser = ParserFactory.getParser("text/html", URL);
        Parse parse = parser.getParse(content);


        assertEquals(metalanguages[t], (String) parse.getData().get(
            HTMLLanguageParser.META_LANG_NAME));


      }
    } catch (Exception e) {
      e.printStackTrace(System.out);

View Full Code Here

  private boolean multiValued = false;


  public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
    this.doc = doc;


    Parse parse = parseResult.get(content.getUrl());


    for (int i = 0 ; headings != null && i < headings.length ; i++ ) {
      List<String> discoveredHeadings = getElement(headings[i]);


      if (discoveredHeadings.size() > 0) {
        for (String heading : discoveredHeadings) {
          if (heading != null) {
            heading.trim();


            if (heading.length() > 0) {
              parse.getData().getParseMeta().add(headings[i], heading);
            }
          }
        }
      }
    }

View Full Code Here

    try {
      String urlString = "file:" + sampleDir + fileSeparator + fileName;     
      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
      Content content = protocol.getProtocolOutput(new Text(urlString),
          new CrawlDatum()).getContent();
      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
      metadata = parse.getData().getParseMeta();
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.toString());
    }
    return metadata;

View Full Code Here


    NutchDocument doc = new NutchDocument();
    Text urlText = new Text(url);


    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    try {
      doc = indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
      e.printStackTrace();
    }

View Full Code Here

    doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));


    // add digest, used by dedup
    doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY));


    final Parse parse = new ParseImpl(parseText, parseData);
    try {
      // extract information from dbDatum and pass it to
      // fetchDatum so that indexing filters can use it
      final Text url = (Text) dbDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
      if (url != null) {

View Full Code Here


  public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;
    Configuration conf = NutchConfiguration.create();


    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];


      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();


      parse = new ParseUtil(conf).parse(content).get(content.getUrl());


      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
      assertTrue(sampleTexts[i].equals(text));
    }
  }

View Full Code Here

  public void testIt() throws ProtocolException, ParseException {


    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;


    Configuration conf = NutchConfiguration.create();
    urlString = "file:" + sampleDir + fileSeparator + rtfFile;
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
                      .getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-rtf", content);
    String text = parse.getText();
    assertEquals("The quick brown fox jumps over the lazy dog", text.trim());


    String title = parse.getData().getTitle();
    Metadata meta = parse.getData().getParseMeta();
    assertEquals("test rft document", title);
    assertEquals("tests", meta.get(DublinCore.SUBJECT));

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.nutch.parse.Parse

com.atlantbh.nutch.filter.index.omit.OmitIndexingFilterTest

com.atlantbh.nutch.filter.xpath.XPathHtmlParserFilterTest

com.atlantbh.nutch.filter.xpath.XPathIndexingFilterTest

com.atlantbh.nutch.index.alternativedataflow.AlternativeDataFlowIndexingFilterTest

com.atlantbh.nutch.index.alternativedataflow.flow.CsvDataFlowTest

org.apache.nutch.analysis.lang.HTMLLanguageParser

org.apache.nutch.analysis.lang.LanguageIdentifier

org.apache.nutch.analysis.lang.TestHTMLLanguageParser

org.apache.nutch.indexer.field.BasicFields$Extractor

org.apache.nutch.indexer.IndexerMapReduce

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.