// Source Code of net.matuschek.http.HttpDocCache

//////////////////////////////////////////////////////////////////////////////
// Copyright (c) Insiders Wissensbasierte Systeme GmbH, Germany
//////////////////////////////////////////////////////////////////////////////


package net.matuschek.http;


import java.io.*;
import java.net.*;
import java.util.*;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipOutputStream;


import net.matuschek.util.MD5;
import org.apache.log4j.Category;


/**
 * Full implementation of HttpDocManager interface.
 * Caches documents, links and headers in ZIP-files.
 * Documents with same content will be detected 
 * and share the same content-storage.
 *
 * @author Oliver Schmidt
 * @version $Revision: 1.2 $
 */
public class HttpDocCache implements HttpDocManager {


  /**
   * Internally used header name to mark duplicates. Headers with this name
   * are left out when headers are written to a document archive, and
   * processDocument only collects URLs of documents that do not carry it.
   */
  protected final static String CONTENT_DUPLICATE = "Content-Duplicate";
  
  /**
   * Use MD5 encoding for filenames. Read by generateFilename on every call
   * and by setStorageDir, which the constructor invokes; changing the flag
   * after construction therefore only affects generateFilename.
   */
  public boolean useMD5 = true;
  
  /** log4j logging instance shared by all instances of this class */
  protected static Category log =
    Category.getInstance(HttpDocCache.class.getName());


  /** collection of visited URLs; filled by processDocument, listed by toString */
  private Collection urls = new LinkedList();


  /** storage main directory; setStorageDir makes sure it ends with File.separator */
  protected String storagedir;
  
  /**
   * File that holds directory information ("directory.csv" inside the
   * storage directory). Stays null unless setStorageDir ran with useMD5 set.
   * Also serves as the lock object in writeDirectoryInfo.
   */
  protected File storageDirectoryFile = null;
  
  /**
   * Subdirectory name for links. Only referenced by removeDocument; nothing
   * in this class creates or writes to this directory.
   */
  protected final static String LINKS = "links" + File.separator;
  
  /** subdirectory name for content (one zip plus one user list per distinct content) */
  protected final static String CONTENT = "content" + File.separator;
  
  /** subdirectory name for document information (one zip per cached URL) */
  protected final static String DOCUMENTS = "documents" + File.separator;
  
  /**
   * Constructor
   * @param storageDirectory
   */
  public HttpDocCache(String storageDirectory) {
    setStorageDir(storageDirectory);
  }
  
  /**
   * Append-mode stream on the directory file. Opened by setStorageDir when
   * useMD5 is set, written by writeDirectoryInfo, closed and reset to null
   * by finish().
   */
  private FileOutputStream storageDirectoryStream = null;
  
  /**
   * Set storage directory and create directories if necessary.
   * @param newStoragedir
   */
  private void setStorageDir(String newStoragedir) {
    storagedir = newStoragedir;
    
    if (!storagedir.endsWith(File.separator)) {
      storagedir = storagedir + File.separator;
    }
    
    // create the directories, if they do not exist yet.
    File storagedirFile = new File(storagedir + DOCUMENTS);
    if (!storagedirFile.exists()) {
      storagedirFile.mkdirs();
    }
    File contentFile = new File(storagedir + CONTENT);
    if (!contentFile.exists()) {
      contentFile.mkdirs();
    }
    
    if (useMD5) {
      storageDirectoryFile = new File(storagedir + "directory.csv");
      try {
        storageDirectoryStream = new FileOutputStream(storageDirectoryFile.getPath(), true);
        if (!storageDirectoryFile.exists()) {
          storageDirectoryStream.write(("Path,URL" + LF).getBytes());
        }
      } catch (Exception e) {
        log.error(e.getMessage());
      }
    }
  }
  
  /** double-quote character put around each field of a directory file line */
  final static String QUOTE = "\"";
  /** platform line separator, used as line break in the files written by this class */
  final static String LF = System.getProperty("line.separator");


  /**
   * Method store.
   * stores the document to the storage directory
   * @param doc the document to be stored
   * @param links to be stored (optional)
   * @return String
   * @throws DocManagerException if the document cannot be written to the directory
   */
  public void storeDocument(HttpDoc doc) throws DocManagerException {
    List links = doc.getLinks();
     
    // don�t store cached documents
    if (doc.isCached()) {
      return;
    }
    
    // get the content type
    String filename = generateFilename(doc.getURL().toExternalForm());
    
    String filepath = storagedir + DOCUMENTS + filename;
    checkStoragePathFor(DOCUMENTS, filename);
          
    try {
      File f = new File(filepath + ".zip");
      if (!f.exists()) {
        writeDirectoryInfo(doc, filename);
      }
  
      // write it to the file
      OutputStream fs = new BufferedOutputStream(new FileOutputStream(f));
      ZipOutputStream zos = new ZipOutputStream(fs);
      zos.setLevel(9);
      
      try {
  //      writeContentToZipFile(doc, zos);
        storeContent(doc);
        writeHeadersToZipFile(doc, zos);
        writeUrlToZipFile(doc, zos);
        if (links != null) {
          writeLinksToZipFile(links, zos);
        }
      } catch (Throwable e){
        System.out.println(e);
      } finally {
        zos.close();
        fs.close();
        long date = doc.getDateAsMilliSeconds();
        f.setLastModified(date > 0 ? date : System.currentTimeMillis());
      }
    } catch (IOException ioex) {
      DocManagerException ex = new DocManagerException(ioex.getMessage());
      throw ex;
    }
  }


  /**
   * Write Directory info.
   * @param doc
   * @param filename in cache
   * @throws IOException
   */
  protected void writeDirectoryInfo(HttpDoc doc, String filename)
    throws IOException {
    if (storageDirectoryFile != null) {
      synchronized(storageDirectoryFile) {
        try {
          String directoryInfo = QUOTE + filename + QUOTE + "," + QUOTE + doc.getURL() + QUOTE + LF;
          storageDirectoryStream.write(directoryInfo.getBytes());
        } catch (Exception e) {
          log.warn(e.getMessage());
          storageDirectoryStream.close();
        }
      }
    }
  }


  /**
   * Write content to zipFile
   * @param doc
   * @param zos
   * @throws IOException
   */
  protected void writeContentToZipFile(HttpDoc doc, ZipOutputStream zos)
    throws IOException {
    String contenttype = doc.getHeaderValue(HttpHeader.CONTENT_TYPE);
    String extension = getExtensionFromContenttype(contenttype);
    ZipEntry zipEntry = new ZipEntry("content" + extension);
    long date = doc.getLastModifiedAsMilliSeconds();
    if (date < 0) {
      date = doc.getDateAsMilliSeconds();
    }
    zipEntry.setTime(date);
    zos.putNextEntry(zipEntry);
    zos.write(doc.getContent());
    zos.closeEntry();
  }


  /**
   * Write headers to zipFile.
   * @param doc
   * @param zos
   * @return ZipEntry
   * @throws IOException
   */
  protected ZipEntry writeHeadersToZipFile(HttpDoc doc, ZipOutputStream zos) throws IOException {
    StringBuffer comment = new StringBuffer();
    Vector headers = doc.getHttpHeader();
    for (Iterator iter = headers.iterator(); iter.hasNext();) {
      HttpHeader header = (HttpHeader) iter.next();
      if (!header.getName().equals(CONTENT_DUPLICATE)) {
        comment.append(header.toString());
        if (iter.hasNext()) {
          comment.append(LF);
        }
      }
    }
    ZipEntry ze = new ZipEntry("header");
    zos.putNextEntry(ze);
    zos.write(comment.toString().getBytes());
    long date = doc.getDateAsMilliSeconds();
    ze.setTime(date > 0 ? date : System.currentTimeMillis());
    zos.closeEntry();
    return ze;
  }
  
  /**
   * Read headers from ZipFile
   * @param doc
   * @param zf
   * @return boolean
   * @throws IOException
   */
  protected boolean readHeadersFromZipFile(HttpDoc doc, ZipFile zf) throws IOException {
    ZipEntry ze = zf.getEntry("header");
    if (ze != null) {
      InputStream is = zf.getInputStream(ze);
      BufferedReader reader = new BufferedReader(new InputStreamReader(is));
      while (reader.ready()) {
        String line = reader.readLine();
        int pos = line.indexOf(": ");
        if (pos >= 0) {
          String name = line.substring(0, pos);
          String value = line.substring(pos + 2);
          HttpHeader header = new HttpHeader(name, value);
          doc.addHeader(header);
        }
      }
      reader.close();
      return true;
    }
    return false;
  }
  
  /**
   * Read links from ZipFile
   * @param doc
   * @param zf
   * @return boolean
   * @throws IOException
   */
  protected boolean readLinksFromZipFile(HttpDoc doc, ZipFile zf) throws IOException {
    ZipEntry ze = zf.getEntry("links");
    List links = doc.getLinks();
    if (links == null) {
      links = new Vector();
      doc.setLinks(links);
    } else {
      links.clear();
    }
    
    if (ze != null) {
      InputStream is = zf.getInputStream(ze);
      BufferedReader reader = new BufferedReader(new InputStreamReader(is));
      while (reader.ready()) {
        String line = reader.readLine();
        if (line != null) {
          URL url = new URL(line);
          links.add(url);
        }
      }
      reader.close();
      return true;
    }
    return false;
  }
  
  /**
   * Write Url to ZipFile.
   * @param doc
   * @param zos
   * @return ZipEntry
   * @throws IOException
   */
  protected ZipEntry writeUrlToZipFile(HttpDoc doc, ZipOutputStream zos) throws IOException {
    String url = doc.getURL().toString();
    ZipEntry ze = new ZipEntry("url");
    zos.putNextEntry(ze);
    zos.write(url.getBytes());
    long date = doc.getDateAsMilliSeconds();
    ze.setTime(date > 0 ? date : System.currentTimeMillis());
    zos.closeEntry();
    return ze;
  }
  
  /**
   * Get File of document content users.
   * @param doc
   * @return File
   */
  private File getContentUsersFile(HttpDoc doc) {
    File f = null;
    byte[] content = doc.getContent();
    if (content.length != 0) {
      String md5 = doc.getContentMD5();
      f = contentFile(md5, ".txt");
    }
    return f;
  }
  
  /**
   * Returns URL-String of duplicate content (if found).
   * @see net.matuschek.http.HttpDocManager#findDuplicate(HttpDoc)
   */
  public String findDuplicate(HttpDoc doc) throws IOException {
    String duplicate = null;
    File f = getContentUsersFile(doc);
    if (f != null) {
      String urlString = doc.getURL().toString();
      if (f.exists()) {
        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
        while (reader.ready()) {
          String line = reader.readLine();
          if (line.equals(urlString)) {
            break;
          } else if (duplicate == null) {
            duplicate = line; 
          }
        }
        reader.close();
      } 
    }
    return duplicate;
  }
  
  /**
   * Creates a file with a name created by the content, containing the URL.
   * @param doc
   */  
  protected void storeContent(HttpDoc doc) throws IOException {
    if (doc.getContent().length == 0) 
      return;
    File f = getContentUsersFile(doc);
    String urlString = doc.getURL().toString();
    String md5 = doc.getContentMD5();
    
    // is content user?
    boolean found = false;
    if (f.exists()) {
      BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
      try {
        while (reader.ready()) {
          String line = reader.readLine();
          if (line.equals(urlString)) {
            found = true; break;
          }
        }
      } finally {
        reader.close();
      }
    } 
    
    // write content
    File fzip = contentFile(md5, ".zip");
    if (!fzip.exists()) {
      checkStoragePathFor(CONTENT, useFirstCharactersAsDirectories(md5));
      OutputStream fs = new BufferedOutputStream(new FileOutputStream(fzip));
      ZipOutputStream zos = null;
      try {
        zos = new ZipOutputStream(fs);
        zos.setLevel(9);
        writeContentToZipFile(doc, zos);
      } finally {
        if (zos != null) {
          zos.close();
        } else {
          fs.close();
        }
      }
    } else {
      fzip.setLastModified(System.currentTimeMillis());
    }
    
    // append user
    if (!found) {
      FileOutputStream os = new FileOutputStream(f.getPath(), true);
      try {
        os.write((urlString + LF).getBytes());
      } finally {
        os.close();
      }
    }
  }


  /**
   * Write links to ZipFile.
   * @param links
   * @param ZipOutputStream
   */  
  protected void writeLinksToZipFile(List links, ZipOutputStream zs)
    throws IOException {
    HashSet storedLinks = new HashSet();
    ZipEntry zipEntry = new ZipEntry("links");
    zs.putNextEntry(zipEntry);
    for (Iterator iter = links.iterator(); iter.hasNext();) {
      URL url = (URL) iter.next();
      if (!storedLinks.contains(url)) {
        zs.write((url.toString() + LF).getBytes());
        storedLinks.add(url);
      }
    }
    zs.closeEntry();
  }
  
  /**
   * Collects Urls (duplicates will be skipped).
   * 
   * @param doc a HttpDoc object to process. This may also be null
   * @exception DocManagerException will be thrown if an error occurs
   * while processing the document.
   * @see net.matuschek.http.HttpDocManager#processDocument(net.matuschek.http.HttpDoc)
   */
  public void processDocument(HttpDoc doc) throws DocManagerException {
    log.info(
      "Processing "
        + doc.getURL().toExternalForm()
        + doc.getHttpHeader());
        
    // collect URL (only if content is no duplicate)
    HttpHeader duplicate = doc.getHeader(CONTENT_DUPLICATE);
    if (duplicate == null) {
      urls.add(doc.getURL());
    }
  }


  /**
   * retrieves a document from the cache.
   * @param url
   * @see net.matuschek.http.HttpDocManager#retrieveFromCache(java.net.URL)
   */
  public HttpDoc retrieveFromCache(java.net.URL url) {
    HttpDoc doc = null;
    File f = null;
    try {
      String filename0 = url.toExternalForm(); 
      String filename = generateFilename(filename0) + ".zip";
      f = new File(storagedir + DOCUMENTS + filename);
          
      if (f.exists()) {
        log.info("retrieve " + f);
        
        // create document and read it from file
        doc = new HttpDoc();
        doc.setURL(url);
        ZipFile zf = new ZipFile(f);
        
        // read headers
        readHeadersFromZipFile(doc, zf);
        
        // read links
        readLinksFromZipFile(doc, zf);
        
        doc.setCached(true);
        
        // read content
        String md5 = doc.getContentMD5();
        File contentFile = contentFile(md5, ".zip");
        if (contentFile.exists()) {
          ZipFile contentZip = new ZipFile(contentFile);
          readContentFromZipFile(doc, contentZip);
          contentZip.close();
        } else {
          doc.setContent(new byte[0]);
        }
        zf.close();
      } 
    } catch (Exception e) {
      log.warn("removing invalid file " + f);
      f.delete();
      doc = null;
    }
        
    return doc;
  }
  
  /**
   * Read content from ZipFile
   * @param doc
   * @param contentZip
   * @throws IOException
   */
  protected void readContentFromZipFile(HttpDoc doc, ZipFile contentZip)
    throws IOException {
    byte[] content = null;
    for (Enumeration enumeration = contentZip.entries(); enumeration.hasMoreElements();) {
      ZipEntry zipEntry = (ZipEntry) enumeration.nextElement();
      if (zipEntry.getName().startsWith("content")) {
        InputStream is = contentZip.getInputStream(zipEntry);
        int length = (int) zipEntry.getSize();
        content = new byte[length]; 
        int startPos = 0;
        while (startPos < length) {
          startPos += is.read(content, startPos, length - startPos);
        }
        is.close();
        break;
      }
    }
    doc.setContent(content);
  }
  
  /**
   * Remove document from cache.
   * @param url
   * @see net.matuschek.http.HttpDocManager#removeDocument(URL)
   */
  public void removeDocument(URL url) {
    HttpDoc doc = retrieveFromCache(url);
    
    File f = null;
    try {
      String filename0 = url.toExternalForm(); 
      String filename = generateFilename(filename0) + ".zip";
      
      f = new File(storagedir + LINKS + filename);
      if (f.exists()) {
        f.delete();
      }
      
      deleteContent(doc);
      f = new File(storagedir + DOCUMENTS + filename);
      if (f.exists()) {
        f.delete();
      }
    } catch (Exception ex) {
      log.error(ex);
    }
  }
  
  /**
   * Deletes stored content for the given document
   * @param document
   */  
  private void deleteContent(HttpDoc doc) throws IOException {
    byte[] content = doc.getContent();
    if (content.length == 0) {
      return;
    }
    String urlString = doc.getURL().toString();
    String md5 = doc.getContentMD5();
    File f = contentFile(md5, ".txt");
    ArrayList entries = new ArrayList();
    if (f.exists()) {
      BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
      while (reader.ready()) {
        String line = reader.readLine();
        if (!line.equals(urlString)) {
          entries.add(line);
        }
      }
      reader.close();
    }
    if (entries.size() > 0) {
      FileOutputStream os = new FileOutputStream(f.getPath(), false);
      for (Iterator iter = entries.iterator(); iter.hasNext();) {
        String line = (String) iter.next();
        os.write((line + LF).getBytes());
      }
      os.close();
    } else {
      f.delete();
      File fzip = contentFile(md5, ".zip");
      if (fzip.exists()) {
        fzip.delete();
      }
    }
  }
  
  /**
   * List collected URLs.
   * @see java.lang.Object#toString()
   */
  public String toString() {
    StringBuffer sb = new StringBuffer(1000);
    for (Iterator i = urls.iterator(); i.hasNext();) {
      sb.append(i.next()).append("\n");
    }
    return sb.toString();
  }


  /**
   * Uses the first storageDirDepth characters of filename as paths
   * @param filename
   */
  private final String useFirstCharactersAsDirectories(String filename) {
    int n = storageDirDepth;
    if (n > filename.length()) n = filename.length();
    char dir[] = new char[n*2];
    for (int i=0; i<n; i++) {
      dir[i*2] = filename.charAt(i);
      dir[i*2+1] = File.separatorChar;
    }
    return new String(dir);
  }
  
  /**
   * Checks if the storage path for the given file exists and creates it if necessary.
   * @param subdirectory
   * @param filename
   */
  private final void checkStoragePathFor(String subdirectory, String filename) {
    if (!subdirectory.endsWith(File.separator)) {
      subdirectory += File.separator;
    }
    String head = filename.substring(0, storageDirDepth*2);
    File path = new File(storagedir + subdirectory + head);
    if (!path.exists()) {
      path.mkdirs();
    }
  }
  
  /**
   * Generate a valid filename for the given docURI.
   * @param docURI
   * @return String
   */
  protected String generateFilename(String docURI) {
    if (useMD5) {
      MD5 md5 = new MD5(docURI);
      String hex = md5.asHex();
      if (storageDirDepth > 0) {
        return useFirstCharactersAsDirectories(hex) + hex.substring(storageDirDepth);
      }
      return hex;
    } else {
      StringBuffer buf = new StringBuffer(docURI.length());
      
      for (int i = 0; i < docURI.length(); i++) {
        char c = docURI.charAt(i);
        switch (c) {
          case '/' : buf.append("&slash;"); break;
          case '\\' : buf.append("&backslash"); break;
          case ':' : buf.append("&colon;"); break;
          case '*' : buf.append("&asterisk;"); break;
          case '?' : buf.append("&question;"); break;
          case '\"' : buf.append("&quot;"); break;
          case '<' : buf.append("&lt;"); break;
          case '>' : buf.append("&gt;"); break;
          case '|' : buf.append("&or;"); break;
          default : buf.append(c); break;
        }
      }
      docURI = buf.toString();
      
      return docURI;
    }
  }


  /**
   * Returns a File with the mapping of this content to its URLs.
   * @param content
   * @return long
   */
  protected File contentFile(String hex, String extension) {
    return new File(storagedir + CONTENT + useFirstCharactersAsDirectories(hex) + hex.substring(storageDirDepth) + extension);
  }
  
  /**
   * Close storageDirectory File.
   * @see net.matuschek.http.HttpDocManager#finish()
   */
  public void finish() {
    if (storageDirectoryStream != null) {
      try {
        storageDirectoryStream.close();
        storageDirectoryStream = null;
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
  
  /**
   * Calls finish and super.finalize().
   * @see java.lang.Object#finalize()
   */
  protected void finalize() throws Throwable { 
    finish();
    super.finalize();
  }
  
  /**
   * Depth of source set directory.
   * (depth = number of used subdirectory levels)
   * The first storageDirDepth characters of file will be used
   * as directories.
   */
  protected int storageDirDepth = 0;
  
  /**
   * Sets the desired directory depth of the source set directory
   * (depth = number of used subdirectory levels)
   * 
   * @param desired depth of source set directory.
   */
  public void setStorageDirDepth(int depth) {  storageDirDepth = depth; }
  
  /**
   * Method getstorageDirDepth.
   * returns the directory depth of the source set directory
   * @param desired depth of source set directory.
   * @return the directory depth of the source set directory
   */
  public int getStorageDirDepth() { return storageDirDepth; }
  
  /**
   * Get relevant part of contenttype and get default extension for it.
   * @param contenttype
   * @return extension
   */
  private String getExtensionFromContenttype(String contenttype) {
    String extension = null;
    if (contenttype != null){
      String strContentType = null;
      int pos = contenttype.indexOf(';');
      if (pos > 0) {
        strContentType = contenttype.substring(0, pos).trim();
      } else {
        strContentType = contenttype.trim();
      }
      extension = getDefaultExtension(strContentType);
    }
    
    if (extension == null) {
      extension = "";
    } else {
      extension = "." + extension;
    }
    return extension;
  }


  /**
   * Get default extension for given contentType.
   * @param contentType
   * @return default extension or null
   */
  protected String getDefaultExtension(String contentType) {
    if (contentType == null) {
      return null;
    } else if (contentType.indexOf("text/html") >= 0) {
      return ".html";
    } else if (contentType.indexOf("text/") >= 0) {
      return ".txt";
    } else {
      return null;
    }
  }
}
// Source Code of net.matuschek.http.HttpDocCache
//
// Related Classes of net.matuschek.http.HttpDocCache