Package net.nutch.parse

Examples of net.nutch.parse.ParseData


      SegmentWriter sw = new SegmentWriter(nfs, outDir, true);
      LOG.fine(" - opening first output segment in " + outDir.getName());
      FetcherOutput fo = new FetcherOutput();
      Content co = new Content();
      ParseText pt = new ParseText();
      ParseData pd = new ParseData();
      int outputCnt = 0;
      for (int n = 0; n < ir.maxDoc(); n++) {
        if (ir.isDeleted(n)) {
          //System.out.println("-del");
          continue;
View Full Code Here


    LOG.info("Input: " + total + " entries in " + readers.size() + " segments.");
    if (!parsed)
      LOG.warning(" - some input segments are non-parsed, forcing non-parsed output!");
    FetcherOutput fo = new FetcherOutput();
    Content co = new Content();
    ParseData pd = new ParseData();
    ParseText pt = new ParseText();
    long outputCnt = 0L;
    int segCnt = 1;
    File outDir = new File(output, SegmentWriter.getNewSegmentName());
    LOG.info("Writing output in " + output);
View Full Code Here

  public synchronized boolean next(FetcherOutput fo, Content co,
          ParseText pt, ParseData pd) throws IOException {
    boolean valid = true;
    Content rco = (co == null) ? _co : co;
    ParseText rpt = (pt == null) ? _pt : pt;
    ParseData rpd = (pd == null) ? _pd : pd;
    if (fetcherReader.next(fo) == null) valid = false;
    if (contentReader != null)
      if (contentReader.next(rco) == null) valid = false;
    if (parseTextReader != null)
      if (parseTextReader.next(rpt) == null) valid = false;
View Full Code Here

   */
  public synchronized void dump(boolean sorted, PrintStream output) throws Exception {
    reset();
    FetcherOutput fo = new FetcherOutput();
    Content co = new Content();
    ParseData pd = new ParseData();
    ParseText pt = new ParseText();
    long recNo = 0L;
    if (!sorted) {
      while(next(fo, co, pt, pd)) {
        output.println("Recno:: " + recNo++);
        output.println("FetcherOutput::\n" + fo.toString());
        if (contentReader != null)
          output.println("Content::\n" + co.toString());
        if (parseDataReader != null)
          output.println("ParseData::\n" + pd.toString());
        if (parseTextReader != null)
          output.println("ParseText::\n" + pt.toString());
        output.println("");
      }
    } else {
      File unsortedFile = new File(segmentDir, ".unsorted");
      File sortedFile = new File(segmentDir, ".sorted");
      nfs.delete(unsortedFile);
      nfs.delete(sortedFile);
      SequenceFile.Writer seqWriter = new SequenceFile.Writer(nfs,
              unsortedFile.toString(), UTF8.class, LongWritable.class);
      FetchListEntry fle;
      LongWritable rec = new LongWritable();
      UTF8 url = new UTF8();
      String urlString;
      while (fetcherReader.next(fo) != null) {
        fle = fo.getFetchListEntry();
        urlString = fle.getPage().getURL().toString();
        rec.set(recNo);
        url.set(urlString);
        seqWriter.append(url, rec);
        recNo++;
      }
      seqWriter.close();
      // sort the SequenceFile
      long start = System.currentTimeMillis();

      SequenceFile.Sorter sorter = new SequenceFile.Sorter(nfs,
              new UTF8.Comparator(), LongWritable.class);

      sorter.sort(unsortedFile.toString(), sortedFile.toString());

      float localSecs = (System.currentTimeMillis() - start) / 1000.0f;
      LOG.info(" - sorted: " + recNo + " entries in " + localSecs + "s, "
        + (recNo/localSecs) + " entries/s");

      nfs.delete(unsortedFile);
      SequenceFile.Reader seqReader = new SequenceFile.Reader(nfs, sortedFile.toString());
      while (seqReader.next(url, rec)) {
        recNo = rec.get();
        get(recNo, fo, co, pt, pd);
        output.println("Recno:: " + recNo++);
        output.println("FetcherOutput::\n" + fo.toString());
        if (contentReader != null)
          output.println("Content::\n" + co.toString());
        if (parseDataReader != null)
          output.println("ParseData::\n" + pd.toString());
        if (parseTextReader != null)
          output.println("ParseText::\n" + pt.toString());
        output.println("");
      }
      seqReader.close();
View Full Code Here

        break;
      case OP_ANCHORS:
        value = new ArrayWritable(UTF8.class);
        break;
      case OP_PARSEDATA:
        value = new ParseData();
        break;
      case OP_PARSETEXT:
        value = new ParseText();
        break;
      case OP_FETCHDATE:
View Full Code Here

      Properties meta = new Properties();
      meta.setProperty("Content-Type", "text/html");
      meta.setProperty("Host", "http://localhost");
      meta.setProperty("Connection", "Keep-alive, close");
      Content co = new Content(url, "http://www.example.com", content.toString().getBytes("UTF-8"), "text/html", meta);
      ParseData pd = new ParseData("Hello from Page " + i, new Outlink[0], meta);
      StringBuffer text = new StringBuffer("Hello from Page" + i);
      if (unique) {
        text.append("\nCreated at epoch time: " + System.currentTimeMillis() + ", " + r.nextLong());
      }
      for (int k = 0; k < 10; k++) {
View Full Code Here

      title = "";

    // collect outlink
    Outlink[] outlinks = new Outlink[0];

    ParseData parseData = new ParseData(title, outlinks, metadata);
    return new ParseImpl(text, parseData);
    // any filter?
    //return HtmlParseFilters.filter(content, parse, root);
  }
View Full Code Here

    // collect meta data
    Properties metadata = new Properties();
    metadata.putAll(content.getMetadata()); // copy through

    ParseData parseData = new ParseData(title, outlinks, metadata);
    return new ParseImpl(text, parseData);
    // any filter?
    //return HtmlParseFilters.filter(content, parse, root);
  }
View Full Code Here

    // collect meta data
    Properties metaData = new Properties();
    metaData.putAll(content.getMetadata()); // copy through

    ParseData parseData = new ParseData(title, outlinks, metaData);
    return new ParseImpl(text, parseData);
  }
View Full Code Here

TOP

Related Classes of net.nutch.parse.ParseData

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.