Examples of org.apache.nutch.storage.WebPage

org.apache.nutch.storage.WebPage

  conf.setInt("indexer.max.title.length", 10);
  BasicIndexingFilter filter = new BasicIndexingFilter();
  filter.setConf(conf);
  assertNotNull(filter);
  NutchDocument doc = new NutchDocument();
  WebPage page = new WebPage();
  page.putToInlinks(new Utf8("http://exceedmaximumtitleurl.org/"), new Utf8("exceeding title site"));
  page.setTitle(new Utf8("This title exceeds maximum characters"));
  try {
    filter.filter(doc, "http://www.apache.org/", page);
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());

View Full Code Here

      LOG.info("fetching: " + url);
    }


    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    WebPage page = new WebPage();
    
    ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page);
    
    if(!protocolOutput.getStatus().isSuccess()) {
      LOG.error("Fetch failed with protocol status: "
          + ProtocolStatusUtils.getName(protocolOutput.getStatus().getCode())
          + ": " + ProtocolStatusUtils.getMessage(protocolOutput.getStatus()));
      return (-1);
    }
    Content content = protocolOutput.getContent();
    
    if (content == null) {
      LOG.error("No content for " + url);
      return (-1);
    }
    page.setBaseUrl(new org.apache.avro.util.Utf8(url));
    page.setContent(ByteBuffer.wrap(content.getContent()));


    if (force) {
      content.setContentType(contentType);
    } else {
      contentType = content.getContentType();
    }


    if (contentType == null) {
      LOG.error("Failed to determine content type!");
      return (-1);
    }


    page.setContentType(new Utf8(contentType));


    if (ParserJob.isTruncated(url, page)) {
      LOG.warn("Content is truncated, parse may fail!");
    }


    Parse parse = new ParseUtil(conf).parse(url, page);


    if (parse == null) {
      LOG.error("Problem with parse - check log");
      return (-1);
    }
    
    // Calculate the signature
    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(page);
    
    if (LOG.isInfoEnabled()) {
      LOG.info("parsing: " + url);
      LOG.info("contentType: " + contentType);
      LOG.info("signature: " + StringUtil.toHexString(signature));
    }




    LOG.info("---------\nUrl\n---------------\n");
    System.out.print(url + "\n");
    LOG.info("---------\nMetadata\n---------\n");
    Map<Utf8, ByteBuffer> metadata = page.getMetadata();
    StringBuffer sb = new StringBuffer();
    if (metadata != null) {
      Iterator<Entry<Utf8, ByteBuffer>> iterator = metadata.entrySet()
          .iterator();
      while (iterator.hasNext()) {

View Full Code Here

    Result<String, WebPage> result = datastore.execute(query);
    boolean found = false;
    // should happen only once
    while (result.next()) {
      try {
        WebPage page = result.get();
        String skey = result.getKey();
        // we should not get to this point but nevermind
        if (page == null || skey == null)
          break;
        found = true;

View Full Code Here

        System.exit(-1);
      } else // root is required parameter
        url = args[i];
    }


    ProtocolOutput out = http.getProtocolOutput(url, new WebPage());
    Content content = out.getContent();


    System.out.println("Status: " + out.getStatus());
    if (content != null) {
      System.out.println("Content Type: " + content.getContentType());

View Full Code Here

     MoreIndexingFilter filter = new MoreIndexingFilter();
     filter.setConf(conf);
     assertNotNull(filter);
     NutchDocument doc = new NutchDocument();
     try{
       filter.filter(doc, "http://nutch.apache.org/index.html", new WebPage());
     }
     catch(Exception e){
       e.printStackTrace();
       fail(e.getMessage());
     }

View Full Code Here

  }
  
  private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    WebPage page = new WebPage();
    String url = "http://www.example.com/";
    page.setContent(ByteBuffer.wrap("text".getBytes()));
    page.setTitle(new Utf8("title"));
    page.putToHeaders(EncodingDetector.CONTENT_TYPE_UTF8, new Utf8(source));
    NutchDocument doc = filter.filter(new NutchDocument(), url, page);
    assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
  }

View Full Code Here

    
    private void advance() throws Exception, IOException {
      hasNext = res.next();
      if (hasNext && batchId != null) {
        do {
          WebPage page = res.get();
          Utf8 mark = Mark.UPDATEDB_MARK.checkMark(page);
          if (NutchJob.shouldProcess(mark, batchId)) {
            return;
          } else {
            if (LOG.isDebugEnabled()) {

View Full Code Here

    if (robotRules == null) {                     // cache miss
      URL redir = null;
      if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
      try {
        Response response = ((HttpBase)http).getResponse(new URL(url, "/robots.txt"),
                                             new WebPage(), true);
        // try one level of redirection ?
        if (response.getCode() == 301 || response.getCode() == 302) {
          String redirection = response.getHeader("Location");
          if (redirection == null) {
            // some versions of MS IIS are known to mangle this header
            redirection = response.getHeader("location");
          }
          if (redirection != null) {
            if (!redirection.startsWith("http")) {
              // RFC says it should be absolute, but apparently it isn't
              redir = new URL(url, redirection);
            } else {
              redir = new URL(redirection);
            }
            
            response = ((HttpBase)http).getResponse(redir, new WebPage(), true);
          }
        }


        if (response.getCode() == 200)               // found rules: parse them
          robotRules =  parseRules(url.toString(), response.getContent(),

View Full Code Here

    file.setConf(NutchConfiguration.create());


    if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength
      file.setMaxContentLength(maxContentLength);


    Content content = file.getProtocolOutput(urlString, new WebPage())
        .getContent();


    System.out.println("Content-Type: " + content.getContentType());
    System.out.println("Content-Length: "
        + content.getMetadata().get(Response.CONTENT_LENGTH));

View Full Code Here

  byte[] bytes = new byte[(int) file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
  in.readFully(bytes);
  in.close();


  WebPage page = new WebPage();
  page.setBaseUrl(new Utf8(urlString));
  page.setContent(ByteBuffer.wrap(bytes));
  String mtype = mimeutil.getMimeType(file);
  page.setContentType(new Utf8(mtype));


  parse = new ParseUtil(conf).parse(urlString, page);


  String title = parse.getTitle();
  String text = parse.getText();

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.nutch.storage.WebPage

org.apache.gora.persistency.impl.StateManagerImpl

org.apache.nutch.analysis.lang.TestHTMLLanguageParser

org.apache.nutch.api.DbReader$DbIterator

org.apache.nutch.crawl.DbUpdateReducer

org.apache.nutch.crawl.InjectorJob$UrlMapper

org.apache.nutch.crawl.TestGenerator

org.apache.nutch.crawl.TestInjector

org.apache.nutch.crawl.TestURLPartitioner

org.apache.nutch.crawl.WebTableReader

org.apache.nutch.fetcher.FetcherReducer$FetcherThread

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.