Package org.apache.nutch.parse

Examples of org.apache.nutch.parse.Parse


  // Configuration handle for this filter; not read anywhere in the visible excerpt.
  private Configuration conf;
  // Names handed one by one to getElement(doc, name) in filter(); each name is also
  // the parse-metadata key under which the discovered text is stored.
  // May be null, in which case filter() collects nothing.
  private String[] headings;
  // Not referenced in the visible excerpt -- presumably controls whether more than
  // one value per heading is kept; TODO confirm against the full class.
  private boolean multiValued = false;

  public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
    Parse parse = parseResult.get(content.getUrl());

    for (int i = 0 ; headings != null && i < headings.length ; i++ ) {
      List<String> discoveredHeadings = getElement(doc, headings[i]);

      if (discoveredHeadings.size() > 0) {
        for (String heading : discoveredHeadings) {
          if (heading != null) {
            heading.trim();

            if (heading.length() > 0) {
              parse.getData().getParseMeta().add(headings[i], heading);
            }
          }
        }
      }
    }
View Full Code Here


  public String getTextContent(String fileName) throws ProtocolException, ParseException {
    String urlString = "file:" + sampleDir + fileSeparator + fileName;
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());
    return parse.getText();
  }
View Full Code Here

  @Test
  public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Content content;
    Parse parse;
    Configuration conf = NutchConfiguration.create();
    Protocol protocol;
    ProtocolFactory factory = new ProtocolFactory(conf);

    System.out.println("Expected : "+expectedText);

    for (int i=0; i<sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      if (sampleFiles[i].startsWith("ootest")==false) continue;

      protocol = factory.getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());

      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();

      // simply test for the presence of a text - the ordering of the elements may differ from what was expected
      // in the previous tests
      Assert.assertTrue(text!=null && text.length() > 0);
View Full Code Here

  @Test
  public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;

    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      Configuration conf = NutchConfiguration.create();
      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());

      Assert.assertEquals("121", parse.getData().getMeta("width"));
      Assert.assertEquals("48", parse.getData().getMeta("height"));
    }
  }
View Full Code Here

  public void testIt() throws ProtocolException, ParseException {

    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;

    Configuration conf = NutchConfiguration.create();
    urlString = "file:" + sampleDir + fileSeparator + rtfFile;
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString),
        new CrawlDatum()).getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
        .get(content.getUrl());
    String text = parse.getText();
    Assert.assertEquals("The quick brown fox jumps over the lazy dog", text.trim());

    String title = parse.getData().getTitle();
    Metadata meta = parse.getData().getParseMeta();

    Assert.assertEquals("test rft document", title);
    Assert.assertEquals("tests", meta.get(DublinCore.SUBJECT));

  }
View Full Code Here

   */
  public ParseResult filter(Content content, ParseResult parseResult,
    HTMLMetaTags metaTags, DocumentFragment doc) {
   
    // get parse obj
    Parse parse = parseResult.get(content.getUrl());
    // Trying to find the document's rel-tags
    Parser parser = new Parser(doc);
    Set<?> tags = parser.getRelTags();
    Iterator<?> iter = tags.iterator();
    Metadata metadata = parse.getData().getParseMeta();
    while (iter.hasNext())
      metadata.add(REL_TAG, (String) iter.next());

    return parseResult;
  }
View Full Code Here

  @Test
  public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;

    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      Configuration conf = NutchConfiguration.create();
      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());

      int index = parse.getText().indexOf(expectedText);
      Assert.assertTrue(index > 0);
    }
  }
View Full Code Here

  @Test
  public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;

    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString),
          new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-tika",
          content).get(content.getUrl());

      // check that there are 2 outlinks:
      // unlike the original parse-rss
      // tika ignores the URL and description of the channel

      // http://test.channel.com
      // http://www-scf.usc.edu/~mattmann/
      // http://www.nutch.org

      ParseData theParseData = parse.getData();

      Outlink[] theOutlinks = theParseData.getOutlinks();

      Assert.assertTrue("There aren't 2 outlinks read!",
          theOutlinks.length == 2);
View Full Code Here

  private Configuration conf;
 
  public ParseResult filter(Content content, ParseResult parseResult,
    HTMLMetaTags metaTags, DocumentFragment doc) {

    Parse parse = parseResult.get(content.getUrl());

    String url = content.getBaseUrl();
    ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
    walk(doc, parse, metaTags, url, outlinks);
    if (outlinks.size() > 0) {
      Outlink[] old = parse.getData().getOutlinks();
      String title = parse.getData().getTitle();
      List<Outlink> list = Arrays.asList(old);
      outlinks.addAll(list);
      ParseStatus status = parse.getData().getStatus();
      String text = parse.getText();
      Outlink[] newlinks = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]);
      ParseData parseData = new ParseData(status, title, newlinks,
                                          parse.getData().getContentMeta(),
                                          parse.getData().getParseMeta());

      // replace original parse obj with new one
      parseResult.put(content.getUrl(), new ParseText(text), parseData);
    }
    return parseResult;
View Full Code Here

  }

  public ParseResult filter(Content content, ParseResult parseResult,
      HTMLMetaTags metaTags, DocumentFragment doc) {

    Parse parse = parseResult.get(content.getUrl());
    Metadata metadata = parse.getData().getParseMeta();

    // check in the metadata first : the tika-parser
    // might have stored the values there already
    for (String mdName : metadata.names()) {
      addIndexedMetatags(metadata, mdName, metadata.getValues(mdName));
View Full Code Here

TOP

Related Classes of org.apache.nutch.parse.Parse

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.