Package org.apache.nutch.crawl

Examples of org.apache.nutch.crawl.CrawlDatum


                }

                reporter.incrCounter("FetcherOutlinks", "outlinks_following", 1);

                // Create new FetchItem with depth incremented
                FetchItem fit = FetchItem.create(new Text(followUrl), new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1);
                fetchQueues.addFetchItem(fit);

                outlinkCounter++;
              }
            }
View Full Code Here


  }

  public String getTextContent(String fileName) throws ProtocolException, ParseException {
    String urlString = "file:" + sampleDir + fileSeparator + fileName;
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());
    return parse.getText();
  }
View Full Code Here

    public void run() {
      synchronized (OldFetcher.this) {activeThreads++;} // count threads
     
      try {
        Text key = new Text();
        CrawlDatum datum = new CrawlDatum();
       
        while (true) {
          // TODO : NUTCH-258 ...
          // If something bad happened, then exit
          // if (conf.getBoolean("fetcher.exit", false)) {
          //   break;
          // ]
         
          try {                                   // get next entry from input
            if (!input.next(key, datum)) {
              break;                              // at eof, exit
            }
          } catch (IOException e) {
            if (LOG.isErrorEnabled()) {
              LOG.error("fetcher caught:"+e.toString());
            }
            break;
          }

          synchronized (OldFetcher.this) {
            lastRequestStart = System.currentTimeMillis();
          }

          // url may be changed through redirects.
          Text url = new Text(key);

          Text reprUrlWritable =
            (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
          if (reprUrlWritable == null) {
            reprUrl = key.toString();
          } else {
            reprUrl = reprUrlWritable.toString();
          }

          try {
            if (LOG.isInfoEnabled()) { LOG.info("fetching " + url); }

            // fetch the page
            redirectCount = 0;
            do {
              if (LOG.isDebugEnabled()) {
                LOG.debug("redirectCount=" + redirectCount);
              }
              redirecting = false;
              Protocol protocol = this.protocolFactory.getProtocol(url.toString());
              ProtocolOutput output = protocol.getProtocolOutput(url, datum);
              ProtocolStatus status = output.getStatus();
              Content content = output.getContent();
              ParseStatus pstatus = null;

              String urlString = url.toString();
              if (reprUrl != null && !reprUrl.equals(urlString)) {
                datum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
                    new Text(reprUrl));
              }

              switch(status.getCode()) {
View Full Code Here

            LOG.debug(" - " + redirType + " redirect to " +
                      url + " (fetching now)");
          }
          return url;
        } else {
          CrawlDatum newDatum = new CrawlDatum();
          if (reprUrl != null) {
            newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
                new Text(reprUrl));
          }
          output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
          if (LOG.isDebugEnabled()) {
            LOG.debug(" - " + redirType + " redirect to " +
View Full Code Here

  /** Increase the score by a sum of inlinked scores. */
  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List<CrawlDatum> inlinked) throws ScoringFilterException {
    float adjust = 0.0f;
    for (int i = 0; i < inlinked.size(); i++) {
      CrawlDatum linked = inlinked.get(i);
      adjust += linked.getScore();
    }
    if (old == null) old = datum;
    datum.setScore(old.getScore() + adjust);
  }
View Full Code Here

      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      if (sampleFiles[i].startsWith("ootest")==false) continue;

      protocol = factory.getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());

      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();

      // simply test for the presence of a text - the ordering of the elements may differ from what was expected
View Full Code Here

      // Overwrite the key with the normalized URL
      key.set(url);

      if (value instanceof CrawlDatum) {
        CrawlDatum datum = (CrawlDatum)value;

        if (datum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP ||
            datum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM ||
            datum.getStatus() == CrawlDatum.STATUS_FETCH_GONE) {

            // Tell the reducer to get rid of all instances of this key
            output.collect(key, new NutchWritable(new BooleanWritable(true)));
        }
      }
      else if (value instanceof ParseData) {
        // get the parse data and the outlinks from the parse data, along with
        // the fetch time for those links
        ParseData data = (ParseData)value;
        long fetchTime = getFetchTime(data);
        Outlink[] outlinkAr = data.getOutlinks();
        Map<String, String> outlinkMap = new LinkedHashMap<String, String>();

        // normalize urls and put into map
        if (outlinkAr != null && outlinkAr.length > 0) {
          for (int i = 0; i < outlinkAr.length; i++) {
            Outlink outlink = outlinkAr[i];
            String toUrl = normalizeUrl(outlink.getToUrl());

            if (filterUrl(toUrl) == null) {
              continue;
            }

            // only put into map if the url doesn't already exist in the map or
            // if it does and the anchor for that link is null, will replace if
            // url is existing
            boolean existingUrl = outlinkMap.containsKey(toUrl);
            if (toUrl != null
              && (!existingUrl || (existingUrl && outlinkMap.get(toUrl) == null))) {
              outlinkMap.put(toUrl, outlink.getAnchor());
            }
          }
        }

        // collect the outlinks under the fetch time
        for (String outlinkUrl : outlinkMap.keySet()) {
          String anchor = outlinkMap.get(outlinkUrl);
          LinkDatum datum = new LinkDatum(outlinkUrl, anchor, fetchTime);
          output.collect(key, new NutchWritable(datum));
        }
      }
      else if (value instanceof LinkDatum) {
        LinkDatum datum = (LinkDatum)value;
        String linkDatumUrl = normalizeUrl(datum.getUrl());

        if (filterUrl(linkDatumUrl) != null) {
          datum.setUrl(linkDatumUrl);

          // collect existing outlinks from existing OutlinkDb
          output.collect(key, new NutchWritable(datum));
        }
      }
View Full Code Here

    fos.write(expectedText.getBytes());
    fos.close();

    // get nutch content
    Protocol protocol = new ProtocolFactory(NutchConfiguration.create()).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    protocol = null;
  }
View Full Code Here

    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      Configuration conf = NutchConfiguration.create();
      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());

      Assert.assertEquals("121", parse.getData().getMeta("width"));
      Assert.assertEquals("48", parse.getData().getMeta("height"));
    }
View Full Code Here

    Configuration conf = NutchConfiguration.create();
    urlString = "file:" + sampleDir + fileSeparator + rtfFile;
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString),
        new CrawlDatum()).getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
        .get(content.getUrl());
    String text = parse.getText();
    Assert.assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
View Full Code Here

TOP

Related Classes of org.apache.nutch.crawl.CrawlDatum

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.