Examples of ParseData


Examples of org.apache.nutch.parse.ParseData

    }
    Path parseDir = new Path(segment, ParseData.DIR_NAME);
    if (fs.exists(fetchDir) && fs.isDirectory(fetchDir)) {
      cnt = 0L;
      long errors = 0L;
      ParseData value = new ParseData();
      MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir, getConf());
      for (int i = 0; i < mreaders.length; i++) {
        while (mreaders[i].next(key, value)) {
          cnt++;
          if (!value.getStatus().isSuccess()) errors++;
        }
        mreaders[i].close();
      }
      stats.parsed = cnt;
      stats.parseErrors = errors;
View Full Code Here

Examples of org.apache.nutch.parse.ParseData

  public void testContent() throws Exception {

    Parse parse = new ParseUtil(NutchConfiguration.create())
                        .parseByExtensionId("parse-mspowerpoint", this.content);

    ParseData data = parse.getData();
    String text = parse.getText();

    assertTrue("No content extracted length ==0", text.length() > 0);
   
    this.dumpToFile(this.testFile.getName(), data, text);
View Full Code Here

Examples of org.apache.nutch.parse.ParseData

  public void testMeta() throws Exception {

    Parse parse = new ParseUtil(NutchConfiguration.create())
                        .parseByExtensionId("parse-mspowerpoint", content);
   
    ParseData data = parse.getData();

    final FileExtensionFilter titleFilter = new FileExtensionFilter(
        this.testFile.getName() + ".meta");
    final File[] titleFiles = this.sampleDir.listFiles(titleFilter);

    if (titleFiles.length > 0) {
      assertEquals("Document Title", this.fileToString(titleFiles[0]),
          "Title: " + data.getTitle() + LINE_SEPARATOR +
          "Outlinks: " + data.getOutlinks().length + LINE_SEPARATOR);
    } else {
      assertTrue("Document Title length ==0", data.getTitle().length() > 0);
      LOG.info("Comparison file for Title not available: "
          + this.testFile.getName() + ".meta");
    }
  }
View Full Code Here

Examples of org.apache.nutch.parse.ParseData

      List list = Arrays.asList(old);
      outlinks.addAll(list);
      ParseStatus status = parse.getData().getStatus();
      String text = parse.getText();
      Outlink[] newlinks = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]);
      ParseData parseData = new ParseData(status, title, newlinks,
                                          parse.getData().getContentMeta(),
                                          parse.getData().getParseMeta());
      parseData.setConf(this.conf);
      parse = new ParseImpl(text, parseData);
    }
    return parse;
  }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.