Examples of org.apache.nutch.parse.Parse

org.apache.nutch.parse.Parse
The result of parsing a page's raw content. @see Parser#getParse(Content)

    NutchDocument doc = new NutchDocument();
    doc.add("id", url);
    Text urlText = new Text(url);


    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);


    byte[] signature = SignatureFactory.getSignature(conf).calculate(content,
        parse);
    parse.getData().getContentMeta()
        .set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
    String digest = parse.getData().getContentMeta().get(Nutch.SIGNATURE_KEY);
    doc.add("digest", digest);


    try {
      doc = indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {

View Full Code Here

    in.readFully(bytes);
    ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
        "application/rss+xml", new Metadata(), conf));
    for (Entry<Text, Parse> entry : parseResult) {
      System.out.println("key: " + entry.getKey());
      Parse parse = entry.getValue();
      System.out.println("data: " + parse.getData());
      System.out.println("text: " + parse.getText() + "\n");
    }
  }

View Full Code Here


  private void addToMap(ParseResult parseResult, SyndFeed feed,
      String feedLink, SyndEntry entry, Content content) {
    String link = entry.getLink(), text = null, title = null;
    Metadata parseMeta = new Metadata(), contentMeta = content.getMetadata();
    Parse parse = null;
    SyndContent description = entry.getDescription();


    try {
      link = normalizers.normalize(link, URLNormalizers.SCOPE_OUTLINK);


      if (link != null)
        link = filters.filter(link);
    } catch (Exception e) {
      e.printStackTrace();
      return;
    }


    if (link == null)
      return;


    title = stripTags(entry.getTitleEx());


    if (feedLink != null)
      parseMeta.set("feed", feedLink);


    addFields(parseMeta, contentMeta, feed, entry);


    // some item descriptions contain markup text in them,
    // so we temporarily set their content-type to parse them
    // with another plugin
    String contentType = contentMeta.get(Response.CONTENT_TYPE);


    if (description != null)
      text = description.getValue();


    if (text == null) {
      List<?> contents = entry.getContents();
      StringBuilder buf = new StringBuilder();
      for (Object syndContent: contents) {
        buf.append(((SyndContent)syndContent).getValue());
      }
      text = buf.toString();
    }


    try {
      Parser parser = parserFactory.getParsers(contentType, link)[0];
      parse = parser.getParse(
          new Content(link, link, text.getBytes(), contentType, contentMeta,
              conf)).get(link);
    } catch (ParserNotFound e) { /* ignore */
    }


    if (parse != null) {
      ParseData data = parse.getData();
      data.getContentMeta().remove(Response.CONTENT_TYPE);
      mergeMetadata(data.getParseMeta(), parseMeta);
      parseResult.put(link, new ParseText(parse.getText()), new ParseData(
          ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(), data
              .getContentMeta(), data.getParseMeta()));
    } else {
      contentMeta.remove(Response.CONTENT_TYPE);
      parseResult.put(link, new ParseText(text), new ParseData(

View Full Code Here

    try {
      ParseUtil parser = new ParseUtil(NutchConfiguration.create());
      /* loop through the test documents and validate result */
      for (int t = 0; t < docs.length; t++) {
        Content content = getContent(docs[t]);
        Parse parse = parser.parse(content).get(content.getUrl());
        Assert.assertEquals(metalanguages[t], (String) parse.getData().getParseMeta().get(Metadata.LANGUAGE));
      }
    } catch (Exception e) {
      e.printStackTrace(System.out);
      Assert.fail(e.toString());
    }

View Full Code Here

          try {
            Metadata metadata = new Metadata();
            metadata.set(Response.CONTENT_LENGTH, Long.toString(entry.getSize()));
            metadata.set(Response.CONTENT_TYPE, contentType);
            Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
            Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl());
            ParseData theParseData = parse.getData();
            Outlink[] theOutlinks = theParseData.getOutlinks();
            
            for(int count = 0; count < theOutlinks.length; count++) {
              outLinksList.add(new Outlink(theOutlinks[count].getToUrl(), theOutlinks[count].getAnchor()));
            }
            
            resultText += entry.getName() + " " + parse.getText() + " ";
          } catch (ParseException e) {
            if (LOG.isInfoEnabled()) { 
              LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage());
            }
          }

View Full Code Here

  public void testEncodingDetection() {
    for (String[] testPage : encodingTestPages) {
      String name = testPage[0];
      Charset charset = Charset.forName(testPage[1]);
      byte[] contentBytes = testPage[2].getBytes(charset);
      Parse parse = parse(contentBytes);
      String text = parse.getText();
      String title = parse.getData().getTitle();
      String keywords = parse.getData().getMeta("keywords");
      LOG.info(name);
      LOG.info("title:\t" + title);
      LOG.info("keywords:\t" + keywords);
      LOG.info("text:\t" + text);
      Assert.assertEquals("Title not extracted properly (" + name + ")",

View Full Code Here

  @Test
  public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;
    Configuration conf = NutchConfiguration.create();


    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];


      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();


      parse = new ParseUtil(conf).parse(content).get(content.getUrl());


      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
      Assert.assertTrue(sampleTexts[i].equals(text));
    }
  }

View Full Code Here

  @Test
  public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;


    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];


      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-zip",content).get(content.getUrl());
      Assert.assertTrue(parse.getText().equals(expectedText));
    }
  }

View Full Code Here

    doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));


    // add digest, used by dedup
    doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY));


    final Parse parse = new ParseImpl(parseText, parseData);
    try {
      // extract information from dbDatum and pass it to
      // fetchDatum so that indexing filters can use it
      final Text url = (Text) dbDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
      if (url != null) {

View Full Code Here

    try {
      String urlString = "file:" + sampleDir + fileSeparator + fileName;
      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
      Content content = protocol.getProtocolOutput(new Text(urlString),
          new CrawlDatum()).getContent();
      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
      metadata = parse.getData().getParseMeta();
    } catch (Exception e) {
      e.printStackTrace();
      Assert.fail(e.toString());
    }
    return metadata;

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.nutch.parse.Parse

com.atlantbh.nutch.filter.index.omit.OmitIndexingFilterTest

com.atlantbh.nutch.filter.xpath.XPathHtmlParserFilterTest

com.atlantbh.nutch.filter.xpath.XPathIndexingFilterTest

com.atlantbh.nutch.index.alternativedataflow.AlternativeDataFlowIndexingFilterTest

com.atlantbh.nutch.index.alternativedataflow.flow.CsvDataFlowTest

org.apache.nutch.analysis.lang.HTMLLanguageParser

org.apache.nutch.analysis.lang.LanguageIdentifier

org.apache.nutch.analysis.lang.TestHTMLLanguageParser

org.apache.nutch.indexer.field.BasicFields$Extractor

org.apache.nutch.indexer.IndexerMapReduce

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.