Examples of net.matuschek.http.HttpDoc

net.matuschek.http.HttpDoc
An HTTP document. It consists of the contents and HTTP headers. @author Daniel Matuschek (daniel@matuschek.net) @author ptah @version $Id: HttpDoc.java,v 1.11 2004/08/09 17:36:49 matuschd Exp $

    }


    log.info("retrieving " + urlString);
    httpTool.setReferer(referer);


    HttpDoc doc = null;
    Vector links = null;
    boolean cached = false;


    // look in the cache first, but only for static pages
    boolean reScan = true;
    if ((docManager != null && allowCaching)
      && (task.getMethod() == HttpConstants.GET)
      && (task.getParamString() == null)) {
      doc = docManager.retrieveFromCache(u);
/*      if (doc != null) {
        try {
          links = ((UrlCollector) docManager).retrieveLinks(doc);
        } catch (IOException e) {
          log.info("Could not get links for " + u + ": " + e.getMessage());
          links = null;
        } 
      }*/
      
      if (doc != null) {
        countCache++;
        long lastRetrieved = doc.getDateAsMilliSeconds();
        double ageInSeconds = (now - lastRetrieved) / 1000;
        if (ageInSeconds < 0) {
          log.warn("DocumentAge < 0!");
        }
        reScan = maxDocumentAge >= 0 && ageInSeconds > maxDocumentAge;
        if (reScan) {
          long lastModified = doc.getLastModifiedAsMilliSeconds();
          Date lastModifiedDate = new Date(lastModified);
          httpTool.setIfModifiedSince(lastModifiedDate);
        }
      } else {
        httpTool.setIfModifiedSince(null);
      }
    }


    // if not found in cache, retrieve from the web page
    if (reScan) {
      HttpDoc newDoc;
      boolean error = false;
      try {
        if (u.getProtocol().equalsIgnoreCase("file")) {
          // retrieve from file
          newDoc = retrieveFileURL(u, httpTool.getIfModifiedSince());
        } else {
          // retrieve from Web
          newDoc = httpTool.retrieveDocument(u, task.getMethod(), task.getParamString());
          if (newDoc != null) {
            newDoc.setDate(now);
          }
          sleepNow();
        }
        
        if (newDoc!= null && !newDoc.isNotModified()) {
          if (!(newDoc.isOk() || newDoc.isRedirect())) {
            error = true;
          }
        } else {
          // (newDoc == null || newDoc.isNotModified()) && doc != null 
          // -> Not modified
          // -> refresh time stamp
          if (doc != null) {
            doc.setDate(now);
            doc.setCached(false);
            newDoc = null;
          }
        }
      } catch (HttpException hex) {
        error = true; newDoc = null;
      }
      if (error) {
        int retry = task.retry();
        if (retry <= maxRetries) {
          synchronized(visited) {
            todo.add(task);
            visited.remove(task);
          }
          log.info("Adding " + u + " for retry no. " + retry);
          return;
        } else {
          doc = docManager.retrieveFromCache(u);
          if (doc == null) {
            log.warn("Unsuccessfull retries for " + u);
            return;
          } else {
            long docDate = doc.getDateAsMilliSeconds();
            long age = (now - docDate);
            age /= 1000;
            if (expirationAge < 0 || age < expirationAge) {
              newDoc = doc;
              cached = true;
              log.info("Cached document not expired: " + u);
            } else {
              log.warn("Cached document expired: " + u);
              docManager.removeDocument(u);
              return;
            }
          }
        }
      }
      
      if (newDoc != null) {
        countWeb++;
        doc = newDoc;
        links = null; // force recalculation of links
        countRefresh++;
      } else {
        cached = true;
        countNoRefresh++;
      }
    } else {
      cached = true;
      log.debug("Page " + u + " retrieved from cache");
    }


    // Add it to the visited vector
    // needs to be synchronized with todo-list
//    visited.add(task); 
    
    // got a NULL document, that doc was not retrieved
    // usually, it was not downloaded because a rule didn't allow
    // to download it
    if (doc == null) {
      log.info("not downloaded " + u);
      return;
    }


    // Duplicate check
    String duplicate=null;
    if (duplicateCheck) {
      duplicate = getContentVisitedURL(doc);
      if (duplicate != null) {
        log.info("URLs with same content found: " + urlString + " = " + duplicate);
      } else {  
        try {
          duplicate = docManager.findDuplicate(doc);
          if (duplicate != null) {
            log.info("URLs with same content found in cache: " + urlString + " = " + duplicate);
          }
        } catch (IOException e) {
          e.printStackTrace();
        }
      }
      
      if (duplicate != null) {
        String pureDuplicate = removeParameters(duplicate);
        String pureUrl = removeParameters(urlString);
        if (!pureUrl.equals(pureDuplicate) && !cached) {
          // different url not yet stored -> store it
          try {
            // retrieve links from original
            HttpDoc linksDoc = docManager.retrieveFromCache(new URL(duplicate));
            if (linksDoc != null) {    
              doc.setLinks(linksDoc.getLinks());
            }
            docManager.storeDocument(doc);
          } catch (Exception e) {
            e.printStackTrace();
          }
        }
        RobotTask newTask;
        try {
          newTask = createRobotTask(new URL(duplicate), depth, referer);
          // check already here for visited tasks to save memory
          if (!visited.contains(newTask)) {
            addTask(newTask);
          }
        } catch (MalformedURLException e) {
          e.printStackTrace(); // Can't happen
        }
        return;
      } 
    }


    // was it an UnAuthorized document ?
    if (doc.isUnauthorized()) {
      log.info("got HTTP Unauthorized for URL " + u);
    }


    if (doc.isOk() || cached) {
      // callback
      if (webRobotCallback != null) {
        int contentLength=0;
        if (doc.getContent() != null) { contentLength=doc.getContent().length; }
        webRobotCallback.webRobotRetrievedDoc(urlString, contentLength);
      }


      // extract links
      try {
        if (doc.isHTML() && (depth > 0)) {
          // solving encoding problem
          // HtmlDocument htmlDoc = new HtmlDocument(u, doc.getContent());
          HtmlDocument htmlDoc = null;
          HttpHeader contentTypeHeader = doc.getHeader("Content-type");
          if (contentTypeHeader != null) {
            String contentType = contentTypeHeader.getValue();
            int index = contentType.toLowerCase().indexOf("charset=");
            if (index > 0) {
              htmlDoc = new HtmlDocument(u, doc.getContent(), contentType.substring(index+8));
            } else {
              htmlDoc = new HtmlDocument(u, doc.getContent());
            }
          } else {
            htmlDoc = new HtmlDocument(u, doc.getContent());
          }
  
          // add links
          
          // this depth-check is critical!
          // otherwise far too many RobotTasks will be created
          // this will cause a premature OutOfMemoryException!
          if (depth > 0) {
            if (duplicate != null) {
              HttpDoc linksDoc = docManager.retrieveFromCache(new URL(duplicate));
              doc.setLinks(linksDoc.getLinks());
            } else if (cached) {
            } 
            if (links == null) {
              links = htmlDoc.getLinks();
              doc.setLinks(links);

View Full Code Here

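  /**
   * Retrieves a file from the local file system.
   * @param url the url of the file to retrieve
   * @param ifModifiedSince the file content is only read if the file's
   *        last-modified time is later than this date; may be null, in
   *        which case a time of 0 is used for the comparison
   * @return HttpDoc containing the content and mime type
   * @throws HttpException if any exception occurs while accessing the file
   */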
  private HttpDoc retrieveFileURL(URL url, Date ifModifiedSince) throws HttpException {
    // Builds an HttpDoc for a file URL and reports the outcome through
    // HTTP-style status codes (not found / ok / not modified).
    HttpDoc doc = new HttpDoc();


    try {
      String host = url.getHost();
      String filename = url.getFile();
      if ((host == null) || (host.equals(""))) {
        // local file
        // remove leading / or \
        // NOTE(review): exactly one leading separator is stripped, so a URL
        // path such as /tmp/x becomes the relative path tmp/x. Presumably
        // intended for Windows-style /C:/dir paths -- confirm the behaviour
        // expected on other platforms.
        if ((filename.startsWith("\\")) || (filename.startsWith("/"))) {
          filename = filename.substring(1);
        }
      } else {
        // URL names a host: build a //host/path name
        // (presumably a UNC-style network path -- TODO confirm)
        filename = "//" + host + filename;
      }
      // get the mimetype and put in the http header
      // (no content-type header is added when the lookup returns null)
      // NOTE(review): the header name is lower-case here while the link
      // extraction code reads it back as "Content-type" -- confirm that the
      // header lookup is case-insensitive.
      String mimetypestr = getMimeTypeForFilename(filename);
      if (mimetypestr != null) {
        HttpHeader header = new HttpHeader("content-type", mimetypestr);
        doc.addHeader(header);
      }
      
      // get the content from the file
      File file = new File(filename);
      if (!file.exists()) {
        // Early return: the document carries only the not-found code (and
        // possibly the content-type header); last-modified, date and URL
        // are NOT set on this path.
        doc.setHttpCode("httpcode " + HttpConstants.HTTP_NOTFOUND);
        return doc;
      }
      long fileLastModified = file.lastModified();
      // a missing ifModifiedSince is treated as time 0 for the comparison
      long ifModifiedSinceTime = ifModifiedSince == null ? 0 : ifModifiedSince.getTime();
      if (fileLastModified > ifModifiedSinceTime) {
        // file changed after the reference date: read the whole content
        byte[] content = readFileToByteArray(file);
        doc.setContent(content);
        doc.setHttpCode("httpcode " + HttpConstants.HTTP_OK);
      } else {
        // unchanged: report "not modified" and leave the content unset
        doc.setHttpCode("httpcode " + HttpConstants.HTTP_NOTMODIFIED);
      }
      // metadata is set for both the "ok" and the "not modified" result
      doc.setLastModified(fileLastModified);
      doc.setDate(System.currentTimeMillis());
      doc.setURL(url);
      
      return doc;
    } catch (Exception e) {
      // Any failure is reported as an HttpException carrying only the
      // message text; the original exception is not attached as a cause.
      throw new HttpException(e.getMessage());
    }

View Full Code Here

    throws Exception
  {
    BasicConfigurator.configure();
    
    HttpTool tool = new HttpTool();
    HttpDoc doc = tool.retrieveDocument(new URL("http://usul27:a1rrakis@www.atkpremium.com/members/styles/standard/pages/index.php?thispage=modelupdate&thisupdate=083735&thismodel=len004"),
          HttpConstants.GET,null);
    HtmlDocument html=new HtmlDocument(new URL("http://localhost"), doc.getContent());
    for (URL u: html.getLinks()) {
      System.out.println(u);
    }
    
    //    System.out.println(doc);

View Full Code Here

TOP

Related Classes of net.matuschek.http.HttpDoc

net.matuschek.spider.WebRobot

net.matuschek.util.MD5

TryIt

java.util.StringTokenizer

java.util.Date

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact: coftware#gmail.com.