Package net.nutch.protocol

Examples of net.nutch.protocol.Content


      delta = s1;
      nfs.mkdirs(outDir);
      SegmentWriter sw = new SegmentWriter(nfs, outDir, true);
      LOG.fine(" - opening first output segment in " + outDir.getName());
      FetcherOutput fo = new FetcherOutput();
      Content co = new Content();
      ParseText pt = new ParseText();
      ParseData pd = new ParseData();
      int outputCnt = 0;
      for (int n = 0; n < ir.maxDoc(); n++) {
        if (ir.isDeleted(n)) {
View Full Code Here


    }

    LOG.info("fetching: "+url);

    Protocol protocol = ProtocolFactory.getProtocol(url);
    Content content = protocol.getContent(url);

    if (force) {
      content.setContentType(contentType);
    } else {
      contentType = content.getContentType();
    }

    if (contentType == null) {
      System.err.println("");
      System.exit(-1);
View Full Code Here

    }
    LOG.info("Input: " + total + " entries in " + readers.size() + " segments.");
    if (!parsed)
      LOG.warning(" - some input segments are non-parsed, forcing non-parsed output!");
    FetcherOutput fo = new FetcherOutput();
    Content co = new Content();
    ParseData pd = new ParseData();
    ParseText pt = new ParseText();
    long outputCnt = 0L;
    int segCnt = 1;
    File outDir = new File(output, SegmentWriter.getNewSegmentName());
View Full Code Here

   * be performed for all streams anyway, to ensure that the whole entry is valid.
   */
  public synchronized boolean next(FetcherOutput fo, Content co,
          ParseText pt, ParseData pd) throws IOException {
    boolean valid = true;
    Content rco = (co == null) ? _co : co;
    ParseText rpt = (pt == null) ? _pt : pt;
    ParseData rpd = (pd == null) ? _pd : pd;
    if (fetcherReader.next(fo) == null) valid = false;
    if (contentReader != null)
      if (contentReader.next(rco) == null) valid = false;
View Full Code Here

   * @throws Exception
   */
  public synchronized void dump(boolean sorted, PrintStream output) throws Exception {
    reset();
    FetcherOutput fo = new FetcherOutput();
    Content co = new Content();
    ParseData pd = new ParseData();
    ParseText pt = new ParseText();
    long recNo = 0L;
    if (!sorted) {
      while(next(fo, co, pt, pd)) {
        output.println("Recno:: " + recNo++);
        output.println("FetcherOutput::\n" + fo.toString());
        if (contentReader != null)
          output.println("Content::\n" + co.toString());
        if (parseDataReader != null)
          output.println("ParseData::\n" + pd.toString());
        if (parseTextReader != null)
          output.println("ParseText::\n" + pt.toString());
        output.println("");
      }
    } else {
      File unsortedFile = new File(segmentDir, ".unsorted");
      File sortedFile = new File(segmentDir, ".sorted");
      nfs.delete(unsortedFile);
      nfs.delete(sortedFile);
      SequenceFile.Writer seqWriter = new SequenceFile.Writer(nfs,
              unsortedFile.toString(), UTF8.class, LongWritable.class);
      FetchListEntry fle;
      LongWritable rec = new LongWritable();
      UTF8 url = new UTF8();
      String urlString;
      while (fetcherReader.next(fo) != null) {
        fle = fo.getFetchListEntry();
        urlString = fle.getPage().getURL().toString();
        rec.set(recNo);
        url.set(urlString);
        seqWriter.append(url, rec);
        recNo++;
      }
      seqWriter.close();
      // sort the SequenceFile
      long start = System.currentTimeMillis();

      SequenceFile.Sorter sorter = new SequenceFile.Sorter(nfs,
              new UTF8.Comparator(), LongWritable.class);

      sorter.sort(unsortedFile.toString(), sortedFile.toString());

      float localSecs = (System.currentTimeMillis() - start) / 1000.0f;
      LOG.info(" - sorted: " + recNo + " entries in " + localSecs + "s, "
        + (recNo/localSecs) + " entries/s");

      nfs.delete(unsortedFile);
      SequenceFile.Reader seqReader = new SequenceFile.Reader(nfs, sortedFile.toString());
      while (seqReader.next(url, rec)) {
        recNo = rec.get();
        get(recNo, fo, co, pt, pd);
        output.println("Recno:: " + recNo++);
        output.println("FetcherOutput::\n" + fo.toString());
        if (contentReader != null)
          output.println("Content::\n" + co.toString());
        if (parseDataReader != null)
          output.println("ParseData::\n" + pd.toString());
        if (parseTextReader != null)
          output.println("ParseText::\n" + pt.toString());
        output.println("");
View Full Code Here

  public void testId3v2() throws ProtocolException, ParseException {

    String urlString;
    Protocol protocol;
    Content content;
    Parser parser;
    Parse parse;

    urlString = "file:" + sampleDir + fileSeparator + id3v2;
    protocol = ProtocolFactory.getProtocol(urlString);
    content = protocol.getContent(urlString);

    parser = ParserFactory.getParser(content.getContentType(), urlString);
    parse = parser.getParse(content);
    Properties metadata = parse.getData().getMetadata();
    assertEquals("postgresql comment id3v2", metadata.getProperty("COMM-Text"));
    assertEquals("postgresql composer id3v2", metadata.getProperty("TCOM-Text"));
    assertEquals("02", metadata.getProperty("TRCK-Text"));
View Full Code Here

  public void testId3v1() throws ProtocolException, ParseException {

    String urlString;
    Protocol protocol;
    Content content;
    Parser parser;
    Parse parse;

    urlString = "file:" + sampleDir + fileSeparator + id3v1;
    protocol = ProtocolFactory.getProtocol(urlString);
    content = protocol.getContent(urlString);
    parser = ParserFactory.getParser(content.getContentType(), urlString);
    parse = parser.getParse(content);

    Properties metadata = parse.getData().getMetadata();
    assertEquals("postgresql comment id3v1", metadata.getProperty("COMM-Text"));
    assertEquals("postgresql artist id3v1", metadata.getProperty("TPE1-Text"));
View Full Code Here

  }

  public void testNone() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parser parser;
    Parse parse;

    urlString = "file:" + sampleDir + fileSeparator + none;
    protocol = ProtocolFactory.getProtocol(urlString);
    content = protocol.getContent(urlString);
    parser = ParserFactory.getParser(content.getContentType(), urlString);
    try {
      parse = parser.getParse(content);
      Properties metadata = parse.getData().getMetadata();
    } catch (ParseException e) {
      return;
View Full Code Here

   */
  private static String getUrlContent(String url) {
    Protocol protocol;
    try {
      protocol = ProtocolFactory.getProtocol(url);
      Content content = protocol.getContent(url);
      String contentType = content.getContentType();
      Parser parser = ParserFactory.getParser(contentType, url);
      Parse parse = parser.getParse(content);
      System.out.println("text:" + parse.getText());
      return parse.getText();

View Full Code Here

    }
    in.close();
    byte[] bytes = out.toByteArray();

    Parser parser = ParserFactory.getParser(contentType, url);
    Content content =
      new Content(url, url, bytes, contentType, new Properties());
    Parse parse = parser.getParse(content);

    Properties metadata = parse.getData().getMetadata();
    assertEquals(license, metadata.get("License-Url"));
    assertEquals(location, metadata.get("License-Location"));
View Full Code Here

TOP

Related Classes of net.nutch.protocol.Content

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.