Package org.apache.nutch.parse

Examples of org.apache.nutch.parse.ParseText


    FileStatus[] stats = fs.listStatus(out);
    // there should be just one path
    assertEquals(1, stats.length);
    Path outSeg = stats[0].getPath();
    Text k = new Text();
    ParseText v = new ParseText();
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(outSeg, ParseText.DIR_NAME), conf);
    int cnt1 = 0, cnt2 = 0;
    for (MapFile.Reader r : readers) {
      while (r.next(k, v)) {
        String ks = k.toString();
        String vs = v.getText();
        if (ks.startsWith("seg1-")) {
          cnt1++;
          assertTrue(vs.startsWith("seg1 "));
        } else if (ks.startsWith("seg2-")) {
          cnt2++;
View Full Code Here


    CrawlDatum lastG = null;
    CrawlDatum lastF = null;
    CrawlDatum lastSig = null;
    Content lastC = null;
    ParseData lastPD = null;
    ParseText lastPT = null;
    String lastGname = null;
    String lastFname = null;
    String lastSigname = null;
    String lastCname = null;
    String lastPDname = null;
    String lastPTname = null;
    TreeMap linked = new TreeMap();
    while (values.hasNext()) {
      ObjectWritable wrapper = (ObjectWritable)values.next();
      Object o = wrapper.get();
      if (o instanceof CrawlDatum) {
        CrawlDatum val = (CrawlDatum)o;
        // check which output dir it belongs to
        UTF8 part = (UTF8)val.getMetaData().get(SEGMENT_PART_KEY);
        if (part == null)
          throw new IOException("Null segment part, key=" + key);
        UTF8 uName = (UTF8)val.getMetaData().get(SEGMENT_NAME_KEY);
        if (uName == null)
          throw new IOException("Null segment name, key=" + key);
        String name = uName.toString();
        String partString = part.toString();
        if (partString.equals(CrawlDatum.GENERATE_DIR_NAME)) {
          if (lastG == null) {
            lastG = val;
            lastGname = name;
          } else {
            // take newer
            if (lastGname.compareTo(name) < 0) {
              lastG = val;
              lastGname = name;
            }
          }
        } else if (partString.equals(CrawlDatum.FETCH_DIR_NAME)) {
          if (lastF == null) {
            lastF = val;
            lastFname = name;
          } else {
            // take newer
            if (lastFname.compareTo(name) < 0) {
              lastF = val;
              lastFname = name;
            }
          }
        } else if (partString.equals(CrawlDatum.PARSE_DIR_NAME)) {
          if (val.getStatus() == CrawlDatum.STATUS_SIGNATURE) {
            if (lastSig == null) {
              lastSig = val;
              lastSigname = name;
            } else {
              // take newer
              if (lastSigname.compareTo(name) < 0) {
                lastSig = val;
                lastSigname = name;
              }
            }
            continue;
          }
          // collect all LINKED values from the latest segment
          ArrayList segLinked = (ArrayList)linked.get(name);
          if (segLinked == null) {
            segLinked = new ArrayList();
            linked.put(name, segLinked);
          }
          segLinked.add(val);
        } else {
          throw new IOException("Cannot determine segment part: " + partString);
        }
      } else if (o instanceof Content) {
        String name = ((Content)o).getMetadata().get(SEGMENT_NAME_KEY.toString());
        if (lastC == null) {
          lastC = (Content)o;
          lastCname = name;
        } else {
          if (lastCname.compareTo(name) < 0) {
            lastC = (Content)o;
            lastCname = name;
          }
        }
      } else if (o instanceof ParseData) {
        String name = ((ParseData)o).getParseMeta().get(SEGMENT_NAME_KEY.toString());
        if (lastPD == null) {
          lastPD = (ParseData)o;
          lastPDname = name;
        } else {
          if (lastPDname.compareTo(name) < 0) {
            lastPD = (ParseData)o;
            lastPDname = name;
          }
        }
      } else if (o instanceof ParseText) {
        String text = ((ParseText)o).getText();
        String name = null;
        int idx = text.indexOf(nameMarker, nameMarker.length());
        if (idx != -1) {
          name = text.substring(nameMarker.length(), idx);
        } else {
          throw new IOException("Missing segment name marker in ParseText, key " + key + ": " + text);
        }
        if (lastPT == null) {
          lastPT = (ParseText)o;
          lastPTname = name;
        } else {
          if (lastPTname.compareTo(name) < 0) {
            lastPT = (ParseText)o;
            lastPTname = name;
          }
        }
      }
    }
    curCount++;
    UTF8 sliceName = null;
    ObjectWritable wrapper = new ObjectWritable();
    if (sliceSize > 0) {
      sliceName = new UTF8(String.valueOf(curCount / sliceSize));
    }
    // now output the latest values
    if (lastG != null) {
      if (sliceName != null) {
        lastG.getMetaData().put(SEGMENT_SLICE_KEY, sliceName);
      }
      wrapper.set(lastG);
      output.collect(key, wrapper);
    }
    if (lastF != null) {
      if (sliceName != null) {
        lastF.getMetaData().put(SEGMENT_SLICE_KEY, sliceName);
      }
      wrapper.set(lastF);
      output.collect(key, wrapper);
    }
    if (lastSig != null) {
      if (sliceName != null) {
        lastSig.getMetaData().put(SEGMENT_SLICE_KEY, sliceName);
      }
      wrapper.set(lastSig);
      output.collect(key, wrapper);
    }
    if (lastC != null) {
      if (sliceName != null) {
        lastC.getMetadata().set(sliceMarker, sliceName.toString());
      }
      wrapper.set(lastC);
      output.collect(key, wrapper);
    }
    if (lastPD != null) {
      if (sliceName != null) {
        lastPD.getParseMeta().set(sliceMarker, sliceName.toString());
      }
      wrapper.set(lastPD);
      output.collect(key, wrapper);
    }
    if (lastPT != null) {
      if (sliceName != null) {
        lastPT = new ParseText(sliceMarker + sliceName + sliceMarker
                + lastPT.getText());
      }
      wrapper.set(lastPT);
      output.collect(key, wrapper);
    }
    if (linked.size() > 0) {
View Full Code Here

              ((ParseData)o).setParseMeta(new Metadata());
            }
            ((ParseData)o).getParseMeta().set(SEGMENT_NAME_KEY.toString(), segment);
          } else if (o instanceof ParseText) {
            String text = ((ParseText)o).getText();
            o = new ParseText(SEGMENT_NAME_KEY.toString() +
                    segment + SEGMENT_NAME_KEY.toString() + text);
            wrapper.set(o);
          } else {
            throw new IOException("Unknown value type: " + o.getClass().getName() + "(" + o + ")");
          }
View Full Code Here

                int idx = text.indexOf(nameMarker, nameMarker.length());
                if (idx != -1) {
                  text = text.substring(idx + nameMarker.length());
                }
              }
              o = new ParseText(text);
            }
            pt_out = ensureMapFile(slice, ParseText.DIR_NAME, ParseText.class);
            pt_out.append(key, o);
          }
        }
View Full Code Here

    CrawlDatum lastG = null;
    CrawlDatum lastF = null;
    CrawlDatum lastSig = null;
    Content lastC = null;
    ParseData lastPD = null;
    ParseText lastPT = null;
    String lastGname = null;
    String lastFname = null;
    String lastSigname = null;
    String lastCname = null;
    String lastPDname = null;
View Full Code Here

    }

    String feedDesc = stripTags(feed.getDescriptionEx());
    String feedTitle = stripTags(feed.getTitleEx());

    parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(
        new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0],
        content.getMetadata()));

    return parseResult;
  }
View Full Code Here

    if (parse != null) {
      ParseData data = parse.getData();
      data.getContentMeta().remove(Response.CONTENT_TYPE);
      mergeMetadata(data.getParseMeta(), parseMeta);
      parseResult.put(link, new ParseText(parse.getText()), new ParseData(
          ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(), data
              .getContentMeta(), data.getParseMeta()));
    } else {
      contentMeta.remove(Response.CONTENT_TYPE);
      parseResult.put(link, new ParseText(text), new ParseData(
          ParseStatus.STATUS_FAILURE, title, new Outlink[0], contentMeta,
          parseMeta));
    }

  }
View Full Code Here

    throws IOException {
    Inlinks inlinks = null;
    CrawlDatum dbDatum = null;
    CrawlDatum fetchDatum = null;
    ParseData parseData = null;
    ParseText parseText = null;
    while (values.hasNext()) {
      final Writable value = values.next().get(); // unwrap
      if (value instanceof Inlinks) {
        inlinks = (Inlinks)value;
      } else if (value instanceof CrawlDatum) {
View Full Code Here

    MapFile.Writer w = new MapFile.Writer(conf, fs, ptPath.toString(), Text.class, ParseText.class);
    long curSize = 0;
    countSeg1 = 0;
    while (curSize < blkSize * 2) {
      k.set("seg1-" + df.format(countSeg1));
      w.append(k, new ParseText("seg1 text " + countSeg1));
      countSeg1++;
      curSize += 40; // roughly ...
    }
    w.close();
    System.err.println(" - done: " + countSeg1 + " records.");
    System.err.println("Creating large segment 2...");
    ptPath = new Path(new Path(seg2, ParseText.DIR_NAME), "part-00000");
    w = new MapFile.Writer(conf, fs, ptPath.toString(), Text.class, ParseText.class);
    curSize = 0;
    countSeg2 = 0;
    while (curSize < blkSize * 2) {
      k.set("seg2-" + df.format(countSeg2));
      w.append(k, new ParseText("seg2 text " + countSeg2));
      countSeg2++;
      curSize += 40; // roughly ...
    }
    w.close();
    System.err.println(" - done: " + countSeg2 + " records.");
View Full Code Here

    FileStatus[] stats = fs.listStatus(out);
    // there should be just one path
    assertEquals(1, stats.length);
    Path outSeg = stats[0].getPath();
    Text k = new Text();
    ParseText v = new ParseText();
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(outSeg, ParseText.DIR_NAME), conf);
    int cnt1 = 0, cnt2 = 0;
    for (MapFile.Reader r : readers) {
      while (r.next(k, v)) {
        String ks = k.toString();
        String vs = v.getText();
        if (ks.startsWith("seg1-")) {
          cnt1++;
          assertTrue(vs.startsWith("seg1 "));
        } else if (ks.startsWith("seg2-")) {
          cnt2++;
View Full Code Here

TOP

Related Classes of org.apache.nutch.parse.ParseText

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.