Package org.apache.nutch.crawl

Examples of org.apache.nutch.crawl.CrawlDatum


          String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY);
          if (sig != null) {
            byte[] signature = StringUtil.fromHexString(sig);
            if (signature != null) {
              // append a CrawlDatum with a signature
              CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0.0f);
              d.setSignature(signature);
              crawlOut.append(key, d);
            }
          }

          // collect outlinks for subsequent db update
          Outlink[] links = parseData.getOutlinks();
          if (ignoreExternalLinks) {
            try {
              fromHost = new URL(fromUrl).getHost().toLowerCase();
            } catch (MalformedURLException e) {
              fromHost = null;
            }
          } else {
            fromHost = null;
          }

          String[] toUrls = new String[links.length];
          int validCount = 0;
          for (int i = 0; i < links.length; i++) {
            String toUrl = links[i].getToUrl();
            try {
              toUrl = urlNormalizers.normalize(toUrl, URLNormalizers.SCOPE_OUTLINK); // normalize the url
              toUrl = filters.filter(toUrl);   // filter the url
            } catch (Exception e) {
              toUrl = null;
            }
            // ignore links to self (or anchors within the page)
            if (fromUrl.equals(toUrl)) toUrl = null;
            if (toUrl != null) validCount++;
            toUrls[i] = toUrl;
          }
          CrawlDatum adjust = null;
          // compute score contributions and adjustment to the original score
          for (int i = 0; i < toUrls.length; i++) {
            if (toUrls[i] == null) continue;
            if (ignoreExternalLinks) {
              try {
                toHost = new URL(toUrls[i]).getHost().toLowerCase();
              } catch (MalformedURLException e) {
                toHost = null;
              }
              if (toHost == null || !toHost.equals(fromHost)) { // external links
                continue; // skip it
              }
            }
            CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
            Text targetUrl = new Text(toUrls[i]);
            adjust = null;
            try {
              adjust = scfilters.distributeScoreToOutlink((Text)key, targetUrl,
                      parseData, target, null, links.length, validCount);
View Full Code Here


    if (LOG.isInfoEnabled()) { LOG.info("fetching: "+url); }

    Configuration conf = NutchConfiguration.create();
    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    Content content = protocol.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();

    if (force) {
      content.setContentType(contentType);
    } else {
      contentType = content.getContentType();
View Full Code Here

    for (int i=0; i<sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      Configuration conf = NutchConfiguration.create();
      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-pdf", content);

      int index = parse.getText().indexOf(expectedText);
      assertTrue(index > 0);
    }
View Full Code Here

  public void map(WritableComparable key, Writable value, OutputCollector output,
      Reporter reporter) throws IOException {
    newKey.set(key.toString());
    if (withMetadata) {
      CrawlDatum datum = (CrawlDatum)value;
      MapWritable meta = datum.getMetaData();
      if (meta.size() > 0) {
        MapWritable newMeta = new MapWritable();
        Iterator it = meta.keySet().iterator();
        while (it.hasNext()) {
          WritableComparable k = (WritableComparable)it.next();
          Writable v = meta.get(k);
          if (k instanceof UTF8) {
            Text t = new Text(k.toString());
            k = t;
          }
          newMeta.put(k, v);
        }
        datum.setMetaData(newMeta);
      }
    }
    output.collect(newKey, value);
  }
View Full Code Here

    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);

    IndexingFilters filters = new IndexingFilters(conf);
    filters.filter(new Document(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
  }
View Full Code Here

    String url = args[0];
    Configuration conf = NutchConfiguration.create();
    RSSParser parser = new RSSParser();
    parser.setConf(conf);
    Protocol protocol = new ProtocolFactory(conf).getProtocol(url);
    Content content = protocol.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();
    Parse parse = parser.getParse(content);
    System.out.println("data: "+ parse.getData());
    System.out.println("text: "+parse.getText());
  }
View Full Code Here

      datum2.put(new DummyWritable(i), new DummyWritable(i));
    }
    assertEquals(100, datum2.size());
    testWritable(datum2);

    CrawlDatum c = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 1f);
    c.setMetaData(new MapWritable());
    for (int i = 0; i < 100; i++) {
      c.getMetaData().put(new LongWritable(i), new Text("" + 1));
    }
    testWritable(c);
  }
View Full Code Here

    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-zip",content);
      assertTrue(parse.getText().equals(expectedText));
    }
  }
View Full Code Here

    fos.write(expectedText.getBytes());
    fos.close();

    // get nutch content
    Protocol protocol = new ProtocolFactory(NutchConfiguration.create()).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    protocol = null;
  }
View Full Code Here

    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = factory.getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString),
                                           new CrawlDatum()).getContent();
      parse = parser.parseByExtensionId("parse-msexcel", content);

      assertTrue(parse.getText().equals(expectedText));
    }
  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.crawl.CrawlDatum

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.