Package: org.apache.nutch.crawl

Examples of org.apache.nutch.crawl.CrawlDatum


    Parse parse;

    Configuration conf = NutchConfiguration.create();
    urlString = "file:" + sampleDir + fileSeparator + id3v2;
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
                      .getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content);
    Metadata metadata = parse.getData().getParseMeta();
    assertEquals("postgresql comment id3v2", metadata.get("COMM-Text"));
    assertEquals("postgresql composer id3v2", metadata.get("TCOM-Text"));
View Full Code Here


    Parse parse;

    Configuration conf = NutchConfiguration.create();
    urlString = "file:" + sampleDir + fileSeparator + id3v1;
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
                      .getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content);

    Metadata metadata = parse.getData().getParseMeta();
    assertEquals("postgresql comment id3v1", metadata.get("COMM-Text"));
View Full Code Here

    Parse parse;

    Configuration conf = NutchConfiguration.create();
    urlString = "file:" + sampleDir + fileSeparator + none;
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
                      .getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content);
//    Metadata metadata = parse.getData().getParseMeta();
    if (parse.getData().getStatus().isSuccess()) {
      fail("Expected ParseException");
View Full Code Here

    for (int i=0; i<sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = factory.getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

      parse = parser.getParse(content);

      String text = parse.getText().replaceAll("[ \t\r\n]+", " ");
      assertTrue(expectedText.equals(text));
View Full Code Here

   
    if (robotRules == null) {                     // cache miss
      if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
      try {
        Response response = http.getResponse(new URL(url, "/robots.txt"),
                                             new CrawlDatum(), true);

        if (response.getCode() == 200)               // found rules: parse them
          robotRules = parseRules(response.getContent());
        else if ( (response.getCode() == 403) && (!allowForbidden) )
          robotRules = FORBID_ALL_RULES;            // use forbid all
View Full Code Here

      file.setMaxContentLength(maxContentLength);

    // set log level
    //LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));

    Content content = file.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

    System.err.println("Content-Type: " + content.getContentType());
    System.err.println("Content-Length: " +
                       content.getMetadata().get(Response.CONTENT_LENGTH));
    System.err.println("Last-Modified: " +
View Full Code Here

   */
  private static String getUrlContent(String url, Configuration conf) {
    Protocol protocol;
    try {
      protocol = new ProtocolFactory(conf).getProtocol(url);
      Content content = protocol.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();
      Parse parse = new ParseUtil(conf).parse(content);
      System.out.println("text:" + parse.getText());
      return parse.getText();

    } catch (ProtocolNotFound e) {
View Full Code Here

        } else {
          LOG.debug("-feeding " + feed + " input urls ...");
          while (feed > 0 && hasMore) {
            try {
              Text url = new Text();
              CrawlDatum datum = new CrawlDatum();
              hasMore = reader.next(url, datum);
              if (hasMore) {
                queues.addFetchItem(url, datum);
                cnt++;
                feed--;
View Full Code Here

                    output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_REDIR_PERM);
                    Text redirUrl = new Text(newUrl);
                    if (maxRedirect > 0) {
                      redirecting = true;
                      redirectCount++;
                      fit = FetchItem.create(redirUrl, new CrawlDatum(), byIP);
                      FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
                      fiq.addInProgressFetchItem(fit);
                      if (LOG.isDebugEnabled()) {
                        LOG.debug(" - content redirect to " + redirUrl + " (fetching now)");
                      }
                    } else {
                      output(redirUrl, new CrawlDatum(), null, null, CrawlDatum.STATUS_LINKED);
                      if (LOG.isDebugEnabled()) {
                        LOG.debug(" - content redirect to " + redirUrl + " (fetching later)");
                      }
                    }
                  } else if (LOG.isDebugEnabled()) {
                    LOG.debug(" - content redirect skipped: " +
                             (newUrl != null ? "to same url" : "filtered"));
                  }
                }
                break;

              case ProtocolStatus.MOVED:         // redirect
              case ProtocolStatus.TEMP_MOVED:
                int code;
                if (status.getCode() == ProtocolStatus.MOVED) {
                  code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
                } else {
                  code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
                }
                output(fit.url, fit.datum, content, status, code);
                String newUrl = status.getMessage();
                newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
                newUrl = this.urlFilters.filter(newUrl);
                if (newUrl != null && !newUrl.equals(fit.url.toString())) {
                  Text redirUrl = new Text(newUrl);
                  if (maxRedirect > 0) {
                    redirecting = true;
                    redirectCount++;
                    fit = FetchItem.create(redirUrl, new CrawlDatum(), byIP);
                    FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
                    fiq.addInProgressFetchItem(fit);
                    if (LOG.isDebugEnabled()) {
                      LOG.debug(" - protocol redirect to " + redirUrl + " (fetching now)");
                    }
                  } else {
                    output(redirUrl, new CrawlDatum(), null, null, CrawlDatum.STATUS_LINKED);
                    if (LOG.isDebugEnabled()) {
                      LOG.debug(" - protocol redirect to " + redirUrl + " (fetching later)");
                    }
                  }
                } else if (LOG.isDebugEnabled()) {
View Full Code Here

  public void reduce(WritableComparable key, Iterator values,
                     OutputCollector output, Reporter reporter)
    throws IOException {
    Inlinks inlinks = null;
    CrawlDatum dbDatum = null;
    CrawlDatum fetchDatum = null;
    CrawlDatum redir = null;
    ParseData parseData = null;
    ParseText parseText = null;
    while (values.hasNext()) {
      Object value = ((ObjectWritable)values.next()).get(); // unwrap
      if (value instanceof Inlinks) {
        inlinks = (Inlinks)value;
      } else if (value instanceof CrawlDatum) {
        CrawlDatum datum = (CrawlDatum)value;
        if (CrawlDatum.hasDbStatus(datum))
          dbDatum = datum;
        else if (CrawlDatum.hasFetchStatus(datum))
          fetchDatum = datum;
        else if (CrawlDatum.STATUS_LINKED == datum.getStatus())
          // redirected page
          redir = datum;
        else
          throw new RuntimeException("Unexpected status: "+datum.getStatus());
      } else if (value instanceof ParseData) {
        parseData = (ParseData)value;
      } else if (value instanceof ParseText) {
        parseText = (ParseText)value;
      } else if (LOG.isWarnEnabled()) {
View Full Code Here

TOP

Related Classes of org.apache.nutch.crawl.CrawlDatum

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., which is owned by Oracle, Inc. Contact: coftware@gmail.com.