// Source code of net.matuschek.http.HttpDocToFile

package net.matuschek.http;


/************************************************
    Copyright (c) 2001/2002 by Daniel Matuschek
*************************************************/




import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;
import java.util.StringTokenizer;


import org.apache.log4j.Category;


/**
 * DocumentManager that will store document contents in a file.
 *
 * @author Daniel Matuschek 
 * @version $Revision: 1.11 $
 */
public class HttpDocToFile extends AbstractHttpDocManager
{
  /**
   * directory where the files will be created
   */
  private String baseDir;




  /**
   * the object will not store files smaller then this size !
   */
  private int minFileSize;
  


  /**
   * defines if special characters in the URL should be replaced
   * by "normal" characters
   * @see #setReplaceAllSpecials(boolean)
   */
  private boolean replaceAllSpecials = false;




  /**
   * defines, if CGIs should be stored on disc. 
   *
   * @see #setStoreCGI
   */
  private boolean storeCGI = true;


  /** Log4J logging */
  private Category log;




  
  /**
   * creates a new HttpDocToFile object that will store the
   * documents in the given directory
   */
  public HttpDocToFile(String baseDir) {
    this.baseDir = baseDir;
    log = Category.getInstance(getClass().getName());
  }
  


  /**
   * store document (that means write it to disk)
   * @param doc the document to store
   * @exception DocManagerException if the document can't be stored
   * (some IO error occured)
   */
  public void storeDocument(HttpDoc doc) 
    throws DocManagerException
  {
    if ((doc == null) || (doc.getContent() == null)) {
      return;
    }
    
    /* 
     * write file only, if this was NOT a cached document
     * (in this case we have it already on harddisk)
     */
    if (doc.isCached()) {
      return;
    }




    if ((! storeCGI)
  && (doc.getURL().toString().indexOf('?') >= 0)) {
      // do not store dynamic pages, because storeCGI is false
      // and the URL contains a "?"
      return;
    }




    String filename = url2Filename(doc.getURL());
    if (doc.getContent().length >= minFileSize) {
      try {
    createDirs(filename);
    BufferedOutputStream os = 
      new BufferedOutputStream(new FileOutputStream(filename));
    os.write(doc.getContent());
    os.flush();
    os.close();
      } catch (IOException e) {
    throw new DocManagerException(e.getMessage());
      }
    }
  }




  /**
   * Gets the cacheFile of the given URL if its document was stored.
   * @param url
   * @return cacheFile
   */
  protected File getCacheFile(URL url) {
    // does the file exists on the filesystem ?
    File cacheFile = new File(url2Filename(url));
    if (! (cacheFile.exists() && (cacheFile.isFile()))) {
    return null;
    }
    return cacheFile;
  }


  /**
   * Gets the extension of the given URL if its document was stored.
   * @param url
   * @return String
   */
  protected String getExtension(URL url) {
    // is it dynamic ?
    if ((url.toString().indexOf('?') >= 0) 
    || (url.toString().indexOf("cgi") >= 0)) {
      return null;
    }
    
      // do we have an filename extension ?
      // without it is not possible to guess the MIME type.
      String path = url.getPath();
      String ext = null;
  
      if (path.indexOf(".") < 0) {
      return null;
      }
  
      StringTokenizer st = new StringTokenizer(path,".");
      while (st.hasMoreTokens()) {
      ext = st.nextToken();
      }
      // no extension if ext contains a "/"
      if (ext.indexOf("/") >= 0) {
      return null;
      }
      
      return ext;
  }
  
  /**
   * Removes a document that was stored previous from the file system. Because
   * the HttpDocToFile does not store the HTTP headers, only the Content-Type
   * header will exists. Even this header may not be correct. It will only use a
   * simple heuristic to determine the possible MIME type.
   */
  public void removeDocument(URL u) {
  String ext = getExtension(u);
  if (ext == null) return;
  File cacheFile = getCacheFile(u);
  if (cacheFile == null) return ;
  
  cacheFile.delete();
  }


  /**
   * Gets a document that was stored previous from the file system.
   * Because the HttpDocToFile does not store the HTTP headers, only
   * the Content-Type header will exists. Even this header may not 
   * be correct. It will only use a simple heuristic to determine the
   * possible MIME type.
   *
   * @return null, if this document was not stored before or it seems
   * to be a dynamic document.
   */
  public HttpDoc retrieveFromCache(URL u) {
  String ext = getExtension(u);
  if (ext == null) return null;
  File cacheFile = getCacheFile(u);
  if (cacheFile == null) return null;
    
    // create a buffer;
    long size = cacheFile.length();
    if (size > Integer.MAX_VALUE) {
      log.info("File too large");
      return null;
    }


    byte[] buff = new byte[(int) size];


    // read the file
    try {
      FileInputStream fi = new FileInputStream(cacheFile);
      fi.read(buff);
    } catch (IOException e) {
      log.info("Could not read cached document "+e.getMessage());
      return null;
    }
    
    // create a new HttpDoc object
    HttpDoc doc = new HttpDoc();


    // and set the content and the header
    doc.setHttpCode("HTTP/1.0 200 OK");
    doc.setContent(buff);
    
   
    // now guess the MIME type
    String mimetype = null;


    if (ext.equals("html") 
  || ext.equals("htm")
  || ext.equals("shtml")
  || ext.equals("asp")
  || ext.equals("php")
  || ext.equals("jsp")) {
      mimetype="text/html";
    } else {
      mimetype="application/unknown";
    }


    doc.addHeader(new HttpHeader("Content-Type",mimetype));    
    doc.setURL(u);
    doc.setCached(true);
  
    return doc;
  }
  


  /**
   * gets the value of baseDir
   * @return the value of baseDir
   */
  public String getBaseDir() {
    return baseDir;
  }
  


  /**
   * sets the value of basedir
   * @param baseDir the new value of baseDir
   */
  public void setBaseDir(String baseDir) {
    this.baseDir = baseDir;
  }
  


  /**
   * converts an URL to a filename http://host/path will 
   * be converted to basedir/host/path
   * @param URL a URL to convert, must not be null
   * @return a pathname
   */
  protected String url2Filename(URL u) {
    StringBuffer sb = new StringBuffer();


    sb.append(baseDir);
    sb.append(File.separatorChar);
    sb.append(u.getHost());
    sb.append(u.getFile());


    // is there a query part ?
    // that is something after the file name seperated by ?
    String query = u.getQuery();
    if ((query != null) &&
  (!query.equals(""))) {
      sb.append(File.separatorChar);
      sb.append(query);
    }


    // filename that ends with /
    // are directories, we will name the file "index.html"
    if (sb.charAt(sb.length()-1) == '/') {
      sb.append("index.html");
    } 


    // postprocess filename (replace special characters)
    for (int i=0; i<sb.length(); i++) {
      char c=sb.charAt(i);
      char newc=(char)0;


      // replace / by operating system file name separator
      if (c == '/') {
  newc = File.separatorChar;
      }
      
      // replace special characters from CGIs
      if (replaceAllSpecials) {
  if ((c == '?')
      || (c == '=')
      || (c == '&')) {
    newc = '-';
  }
      }


      if ((newc != (char)0) 
    && (newc != c)) {
  sb.setCharAt(i,newc);
      }
    }


    return sb.toString();
  }
  


  /** 
   * creates all directories that are needed to place the 
   * file filename if they don't exists 
   * @param filename the full path name of a file
   */
  protected void createDirs(String filename) throws IOException {
    int pos = -1;
    // look for the last directory separator in the filename
    for (int i = filename.length() - 1; i >= 0; i--) {
      if (filename.charAt(i) == File.separatorChar) {
  pos = i;
  i = -1;
      }
    }
    File dir = new File(filename.substring(0, pos));
    dir.mkdirs();
  }
  


  /**
   * gets the value of minFileSize. Files smaller then this size
   * (in Bytes) will not be saved to disk !
   * @return the value of minFileSize 
   */
  public int getMinFileSize() {
    return minFileSize;
  }


  
  /**
   * sets the value of minFileSize
   * @param minFileSize the new value of minFileSize
   * @see #getMinFileSize()
   */
  public void setMinFileSize(int minFileSize) {
    this.minFileSize = minFileSize;
  }




  /**
   * Get the value of replaceAllSpecials.
   *
   * if replaceAllSpecials is true, all sepcial characters in the URL
   * will be replaced by "-". This is useful for operating system that
   * can't handle files with special characters in the filename (e.g.
   * Windows)
   *
   * @return value of replaceAllSpecials.
   */
  public boolean isReplaceAllSpecials() {
    return replaceAllSpecials;
  }
  


  /**
   * Set the value of replaceAllSpecials.
   *
   * if replaceAllSpecials is true, all sepcial characters in the URL
   * will be replaced by "-". This is useful for operating system that
   * can't handle files with special characters in the filename (e.g.
   * Windows)
   *
   * @param v  Value to assign to replaceAllSpecials.
   */
  public void setReplaceAllSpecials(boolean  v) {
    this.replaceAllSpecials = v;
  } 




  /**
   * Get the value of storeCGI
   *
   * If this is true, the object will store ALL retrieved documents,
   * otherwise it will store only documents from URLs that do not
   * have a "?" in the URL
   */
  public boolean getStoreCGI() {
    return storeCGI;
  }
  


  /**
   * Set the value of storeCGI.
   *
   * If this is true, the object will store ALL retrieved documents,
   * otherwise it will store only documents from URLs that do not
   * have a "?" in the URL
   *
   * @param v  Value to assign to storeCGI.
   */
  public void setStoreCGI(boolean v) {
    this.storeCGI = v;
  } 


}
// Source code of net.matuschek.http.HttpDocToFile

// Related classes of net.matuschek.http.HttpDocToFile