//////////////////////////////////////////////////////////////////////////////
// Copyright (c) Insiders Wissensbasierte Systeme GmbH, Germany
//////////////////////////////////////////////////////////////////////////////
package net.matuschek.http;
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipOutputStream;
import net.matuschek.util.MD5;
import org.apache.log4j.Category;
/**
* Full implementation of HttpDocManager interface.
* Caches documents, links and headers in ZIP-files.
* Documents with same content will be detected
* and share the same content-storage.
*
* @author Oliver Schmidt
* @version $Revision: 1.2 $
*/
public class HttpDocCache implements HttpDocManager {
/**
 * Internally used header name to mark duplicates. processDocument() only
 * collects URLs of documents that do not carry this header, and
 * writeHeadersToZipFile() never writes it to the stored "header" entry.
 */
protected final static String CONTENT_DUPLICATE = "Content-Duplicate";
/**
 * If true, generateFilename() uses the MD5 hex digest of the URL as file
 * name; otherwise the URL itself is used with reserved characters escaped.
 * Note: the constructor calls setStorageDir(), which reads this flag, so at
 * construction time the flag always has its initial value (true).
 */
public boolean useMD5 = true;
/** log4j logging instance */
protected static Category log =
Category.getInstance(HttpDocCache.class.getName());
/** URLs collected by processDocument(); listed by toString() */
private Collection urls = new LinkedList();
/** storage main directory; setStorageDir() makes sure it ends with File.separator */
protected String storagedir;
/**
 * File "directory.csv" inside the storage directory (set by setStorageDir()
 * when useMD5 is true). writeDirectoryInfo() appends one line per new
 * document and also uses this object as its lock.
 */
protected File storageDirectoryFile = null;
/** subdirectory name for links (including trailing separator) */
protected final static String LINKS = "links" + File.separator;
/** subdirectory name for content (including trailing separator) */
protected final static String CONTENT = "content" + File.separator;
/** subdirectory name for document information (including trailing separator) */
protected final static String DOCUMENTS = "documents" + File.separator;
/**
 * Creates a document cache that keeps all of its files below the given
 * directory. The directory layout is prepared right away by
 * setStorageDir().
 *
 * @param storageDirectory path of the storage main directory
 */
public HttpDocCache(String storageDirectory) {
    super();
    this.setStorageDir(storageDirectory);
}
/**
 * Append-mode stream on the directory file; opened by setStorageDir(),
 * written by writeDirectoryInfo(), closed and reset to null by finish().
 */
private FileOutputStream storageDirectoryStream = null;
/**
 * Sets the storage directory and creates the "documents" and "content"
 * subdirectories if necessary. When MD5 file names are in use, the
 * directory file "directory.csv" is opened for appending as well; a
 * "Path,URL" header line is written if the file is new or empty.
 * Failures while opening the directory file are logged, not thrown.
 *
 * @param newStoragedir path of the storage main directory (a trailing
 *        file separator is appended if missing)
 */
private void setStorageDir(String newStoragedir) {
    storagedir = newStoragedir;
    if (!storagedir.endsWith(File.separator)) {
        storagedir = storagedir + File.separator;
    }
    // create the directories, if they do not exist yet.
    File storagedirFile = new File(storagedir + DOCUMENTS);
    if (!storagedirFile.exists()) {
        storagedirFile.mkdirs();
    }
    File contentFile = new File(storagedir + CONTENT);
    if (!contentFile.exists()) {
        contentFile.mkdirs();
    }
    if (useMD5) {
        storageDirectoryFile = new File(storagedir + "directory.csv");
        // This has to be decided BEFORE the stream is opened: opening a
        // FileOutputStream in append mode creates the file, so a check
        // made afterwards would never see a missing file.
        boolean needsHeader =
            !storageDirectoryFile.exists() || storageDirectoryFile.length() == 0;
        try {
            storageDirectoryStream =
                new FileOutputStream(storageDirectoryFile.getPath(), true);
            if (needsHeader) {
                storageDirectoryStream.write(("Path,URL" + LF).getBytes());
            }
        } catch (Exception e) {
            // keep the stack trace, the message alone is often not enough
            log.error(e.getMessage(), e);
        }
    }
}
/** double quote character; delimits the fields written to the directory file */
final static String QUOTE = "\"";
/** platform line separator; terminates the text lines this class writes */
final static String LF = System.getProperty("line.separator");
/**
 * Stores the document in the cache.
 * The content goes to the shared content storage (see storeContent());
 * headers, URL and links (if any) are written to a ZIP file in the
 * documents directory. Documents that were themselves retrieved from the
 * cache are not stored again.
 * <p>
 * Errors while writing the individual parts are logged and do not abort
 * the call (best effort); the ZIP file is always closed and its
 * modification time is set to the document date, or to the current time
 * if the document has no date.
 *
 * @param doc the document to be stored
 * @throws DocManagerException if the document file cannot be opened or
 *         closed
 */
public void storeDocument(HttpDoc doc) throws DocManagerException {
    List links = doc.getLinks();
    // don't store cached documents
    if (doc.isCached()) {
        return;
    }
    String filename = generateFilename(doc.getURL().toExternalForm());
    String filepath = storagedir + DOCUMENTS + filename;
    checkStoragePathFor(DOCUMENTS, filename);
    try {
        File f = new File(filepath + ".zip");
        if (!f.exists()) {
            // first time this URL is stored: register it in the directory file
            writeDirectoryInfo(doc, filename);
        }
        OutputStream fs = new BufferedOutputStream(new FileOutputStream(f));
        ZipOutputStream zos = new ZipOutputStream(fs);
        zos.setLevel(9);
        try {
            // the content is not part of this ZIP file; it lives in the
            // content storage so that duplicates can share it
            storeContent(doc);
            writeHeadersToZipFile(doc, zos);
            writeUrlToZipFile(doc, zos);
            if (links != null) {
                writeLinksToZipFile(links, zos);
            }
        } catch (Throwable e) {
            // deliberate best effort, but report through the logger
            // (with stack trace) instead of System.out
            log.error("could not store " + doc.getURL() + " completely", e);
        } finally {
            // nested so that a failing close() cannot skip the steps after it
            try {
                zos.close();
            } finally {
                try {
                    fs.close();
                } finally {
                    long date = doc.getDateAsMilliSeconds();
                    f.setLastModified(date > 0 ? date : System.currentTimeMillis());
                }
            }
        }
    } catch (IOException ioex) {
        throw new DocManagerException(ioex.getMessage());
    }
}
/**
 * Appends one line <code>"filename","url"</code> to the directory file.
 * Does nothing if there is no directory file or if its stream is not
 * open (it could not be opened, or finish() has already been called).
 * If writing fails, the problem is logged and the stream is closed and
 * dropped, so later calls become no-ops.
 *
 * @param doc document whose URL is recorded
 * @param filename name of the document file in the cache
 * @throws IOException if closing the stream after a failed write fails
 */
protected void writeDirectoryInfo(HttpDoc doc, String filename)
        throws IOException {
    if (storageDirectoryFile == null) {
        return;
    }
    synchronized (storageDirectoryFile) {
        if (storageDirectoryStream == null) {
            // without this guard the catch block below would itself fail
            // with a NullPointerException when it tries to close the stream
            return;
        }
        try {
            String directoryInfo =
                QUOTE + filename + QUOTE + "," + QUOTE + doc.getURL() + QUOTE + LF;
            storageDirectoryStream.write(directoryInfo.getBytes());
        } catch (Exception e) {
            log.warn(e.getMessage(), e);
            // give up on the directory file: close it and forget the stream
            FileOutputStream broken = storageDirectoryStream;
            storageDirectoryStream = null;
            broken.close();
        }
    }
}
/**
 * Writes the document content as a single entry named "content" plus an
 * extension derived from the Content-Type header.
 * The entry time is the document's last-modified time; if that is not
 * available the document date is used, and if that is missing as well,
 * the current time (the same fallback the header and URL entries use).
 *
 * @param doc document whose content is written
 * @param zos ZIP stream to write to
 * @throws IOException if writing to the ZIP stream fails
 */
protected void writeContentToZipFile(HttpDoc doc, ZipOutputStream zos)
        throws IOException {
    String contenttype = doc.getHeaderValue(HttpHeader.CONTENT_TYPE);
    String extension = getExtensionFromContenttype(contenttype);
    ZipEntry zipEntry = new ZipEntry("content" + extension);
    long date = doc.getLastModifiedAsMilliSeconds();
    if (date < 0) {
        date = doc.getDateAsMilliSeconds();
    }
    if (date <= 0) {
        // neither timestamp is usable
        date = System.currentTimeMillis();
    }
    zipEntry.setTime(date);
    zos.putNextEntry(zipEntry);
    zos.write(doc.getContent());
    zos.closeEntry();
}
/**
 * Writes the HTTP headers of the document to a ZIP entry named "header",
 * one header per line (as returned by HttpHeader.toString()). The
 * internal Content-Duplicate marker header is not written.
 * The entry time is the document date, or the current time if the
 * document has no date.
 *
 * @param doc document whose headers are written
 * @param zos ZIP stream to write to
 * @return the ZipEntry that was written
 * @throws IOException if writing to the ZIP stream fails
 */
protected ZipEntry writeHeadersToZipFile(HttpDoc doc, ZipOutputStream zos) throws IOException {
    StringBuffer comment = new StringBuffer();
    Vector headers = doc.getHttpHeader();
    for (Iterator iter = headers.iterator(); iter.hasNext();) {
        HttpHeader header = (HttpHeader) iter.next();
        if (!header.getName().equals(CONTENT_DUPLICATE)) {
            comment.append(header.toString());
            if (iter.hasNext()) {
                comment.append(LF);
            }
        }
    }
    ZipEntry ze = new ZipEntry("header");
    // The time must be set before putNextEntry(): that call writes the
    // entry's local header, so a later setTime() would only reach the
    // central directory and leave the two records inconsistent.
    long date = doc.getDateAsMilliSeconds();
    ze.setTime(date > 0 ? date : System.currentTimeMillis());
    zos.putNextEntry(ze);
    zos.write(comment.toString().getBytes());
    zos.closeEntry();
    return ze;
}
/**
 * Reads the "header" entry of the given ZIP file and adds every line of
 * the form <code>name: value</code> as a header to the document. Lines
 * without the ": " separator are ignored.
 *
 * @param doc document that receives the headers
 * @param zf ZIP file of the cached document
 * @return true if the ZIP file has a "header" entry, false otherwise
 * @throws IOException if the entry cannot be read
 */
protected boolean readHeadersFromZipFile(HttpDoc doc, ZipFile zf) throws IOException {
    ZipEntry ze = zf.getEntry("header");
    if (ze == null) {
        return false;
    }
    BufferedReader reader =
        new BufferedReader(new InputStreamReader(zf.getInputStream(ze)));
    try {
        // readLine() returning null is the end-of-stream signal;
        // ready() only says whether a read would block
        String line;
        while ((line = reader.readLine()) != null) {
            int pos = line.indexOf(": ");
            if (pos >= 0) {
                String name = line.substring(0, pos);
                String value = line.substring(pos + 2);
                doc.addHeader(new HttpHeader(name, value));
            }
        }
    } finally {
        reader.close();
    }
    return true;
}
/**
 * Reads the "links" entry of the given ZIP file (one URL per line) into
 * the link list of the document. An existing link list is cleared
 * first; if the document has none, a new one is created and set.
 *
 * @param doc document that receives the links
 * @param zf ZIP file of the cached document
 * @return true if the ZIP file has a "links" entry, false otherwise
 * @throws IOException if the entry cannot be read or a line is not a
 *         valid URL (MalformedURLException)
 */
protected boolean readLinksFromZipFile(HttpDoc doc, ZipFile zf) throws IOException {
    ZipEntry ze = zf.getEntry("links");
    List links = doc.getLinks();
    if (links == null) {
        links = new Vector();
        doc.setLinks(links);
    } else {
        links.clear();
    }
    if (ze == null) {
        return false;
    }
    BufferedReader reader =
        new BufferedReader(new InputStreamReader(zf.getInputStream(ze)));
    try {
        // loop until readLine() reports end of stream; ready() is not an
        // end-of-stream test
        String line;
        while ((line = reader.readLine()) != null) {
            links.add(new URL(line));
        }
    } finally {
        // also reached when a malformed line aborts the loop
        reader.close();
    }
    return true;
}
/**
 * Writes the URL of the document to a ZIP entry named "url".
 * The entry time is the document date, or the current time if the
 * document has no date.
 *
 * @param doc document whose URL is written
 * @param zos ZIP stream to write to
 * @return the ZipEntry that was written
 * @throws IOException if writing to the ZIP stream fails
 */
protected ZipEntry writeUrlToZipFile(HttpDoc doc, ZipOutputStream zos) throws IOException {
    String url = doc.getURL().toString();
    ZipEntry ze = new ZipEntry("url");
    // set the time before putNextEntry(), which writes the local header;
    // setting it afterwards would only affect the central directory
    long date = doc.getDateAsMilliSeconds();
    ze.setTime(date > 0 ? date : System.currentTimeMillis());
    zos.putNextEntry(ze);
    zos.write(url.getBytes());
    zos.closeEntry();
    return ze;
}
/**
 * Returns the ".txt" file in the content storage that lists the URLs
 * using the content of the given document. The file is located via the
 * content MD5 of the document.
 *
 * @param doc document whose content is looked up
 * @return the content users file, or null if the document content is empty
 */
private File getContentUsersFile(HttpDoc doc) {
    if (doc.getContent().length == 0) {
        return null;
    }
    return contentFile(doc.getContentMD5(), ".txt");
}
/**
 * Returns URL-String of duplicate content (if found).
 * The content users file of the document is scanned from the top: the
 * first listed URL is returned, unless the document's own URL is the
 * first entry, in which case there is no duplicate.
 *
 * @param doc document to check
 * @return URL string of a document with the same content, or null if
 *         the content is empty, unknown, or first used by this document
 * @throws IOException if the content users file cannot be read
 * @see net.matuschek.http.HttpDocManager#findDuplicate(HttpDoc)
 */
public String findDuplicate(HttpDoc doc) throws IOException {
    String duplicate = null;
    File f = getContentUsersFile(doc);
    if (f != null) {
        String urlString = doc.getURL().toString();
        if (f.exists()) {
            BufferedReader reader =
                new BufferedReader(new InputStreamReader(new FileInputStream(f)));
            try {
                // a null line marks the end of the file; ready() does not
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.equals(urlString)) {
                        break;
                    } else if (duplicate == null) {
                        duplicate = line;
                    }
                }
            } finally {
                reader.close();
            }
        }
    }
    return duplicate;
}
/**
 * Stores the content of the document in the content storage.
 * Two files named after the content MD5 are maintained: a ".zip" file
 * holding the content itself (written only if it does not exist yet,
 * otherwise just touched) and a ".txt" file listing the URLs that use
 * this content (the document's URL is appended if not yet listed).
 * Documents with empty content are ignored.
 *
 * @param doc document whose content is stored
 * @throws IOException if one of the files cannot be read or written
 */
protected void storeContent(HttpDoc doc) throws IOException {
    if (doc.getContent().length == 0) {
        return;
    }
    File f = getContentUsersFile(doc);
    String urlString = doc.getURL().toString();
    String md5 = doc.getContentMD5();
    // is this URL already registered as a user of the content?
    boolean found = false;
    if (f.exists()) {
        BufferedReader reader =
            new BufferedReader(new InputStreamReader(new FileInputStream(f)));
        try {
            // read until readLine() returns null; ready() is not an
            // end-of-file test and readLine() may return null after it
            String line;
            while (!found && (line = reader.readLine()) != null) {
                found = line.equals(urlString);
            }
        } finally {
            reader.close();
        }
    }
    // write content
    File fzip = contentFile(md5, ".zip");
    if (!fzip.exists()) {
        checkStoragePathFor(CONTENT, useFirstCharactersAsDirectories(md5));
        OutputStream fs = new BufferedOutputStream(new FileOutputStream(fzip));
        ZipOutputStream zos = null;
        try {
            zos = new ZipOutputStream(fs);
            zos.setLevel(9);
            writeContentToZipFile(doc, zos);
        } finally {
            // closing the ZIP stream also closes the underlying stream
            if (zos != null) {
                zos.close();
            } else {
                fs.close();
            }
        }
    } else {
        fzip.setLastModified(System.currentTimeMillis());
    }
    // append user
    if (!found) {
        FileOutputStream os = new FileOutputStream(f.getPath(), true);
        try {
            os.write((urlString + LF).getBytes());
        } finally {
            os.close();
        }
    }
}
/**
 * Writes the given links to a ZIP entry named "links", one URL per line.
 * A link that occurs more than once in the list is written only once.
 *
 * @param links list of java.net.URL objects
 * @param zs ZIP stream to write to
 * @throws IOException if writing to the ZIP stream fails
 */
protected void writeLinksToZipFile(List links, ZipOutputStream zs)
        throws IOException {
    Set alreadyWritten = new HashSet();
    zs.putNextEntry(new ZipEntry("links"));
    Iterator it = links.iterator();
    while (it.hasNext()) {
        URL link = (URL) it.next();
        // add() returns false for an element that is already in the set
        if (alreadyWritten.add(link)) {
            zs.write((link.toString() + LF).getBytes());
        }
    }
    zs.closeEntry();
}
/**
 * Collects Urls (duplicates will be skipped).
 * The URL of the document is added to the list of collected URLs unless
 * the document carries the internal Content-Duplicate header.
 *
 * @param doc a HttpDoc object to process. This may also be null, in
 *        which case nothing happens.
 * @exception DocManagerException will be thrown if an error occurs
 * while processing the document.
 * @see net.matuschek.http.HttpDocManager#processDocument(net.matuschek.http.HttpDoc)
 */
public void processDocument(HttpDoc doc) throws DocManagerException {
    if (doc == null) {
        // null is explicitly allowed by the contract above
        return;
    }
    log.info(
        "Processing "
            + doc.getURL().toExternalForm()
            + doc.getHttpHeader());
    // collect URL (only if content is no duplicate)
    HttpHeader duplicate = doc.getHeader(CONTENT_DUPLICATE);
    if (duplicate == null) {
        urls.add(doc.getURL());
    }
}
/**
 * Retrieves a document from the cache.
 * Headers and links are read from the document ZIP file, the content
 * from the content storage (located via the content MD5 of the
 * document); if no content file exists the content is set to an empty
 * array. The returned document is marked as cached.
 * <p>
 * If anything goes wrong while reading, the document file is considered
 * invalid: it is deleted and null is returned.
 *
 * @param url URL of the wanted document
 * @return the cached document, or null if it is not in the cache or
 *         its cache file was invalid
 * @see net.matuschek.http.HttpDocManager#retrieveFromCache(java.net.URL)
 */
public HttpDoc retrieveFromCache(java.net.URL url) {
    HttpDoc doc = null;
    File f = null;
    ZipFile zf = null;
    try {
        String filename = generateFilename(url.toExternalForm()) + ".zip";
        f = new File(storagedir + DOCUMENTS + filename);
        if (f.exists()) {
            log.info("retrieve " + f);
            // create document and read it from file
            doc = new HttpDoc();
            doc.setURL(url);
            zf = new ZipFile(f);
            // read headers
            readHeadersFromZipFile(doc, zf);
            // read links
            readLinksFromZipFile(doc, zf);
            doc.setCached(true);
            // read content
            String md5 = doc.getContentMD5();
            File contentZipFile = contentFile(md5, ".zip");
            if (contentZipFile.exists()) {
                ZipFile contentZip = new ZipFile(contentZipFile);
                try {
                    readContentFromZipFile(doc, contentZip);
                } finally {
                    contentZip.close();
                }
            } else {
                doc.setContent(new byte[0]);
            }
        }
    } catch (Exception e) {
        log.warn("removing invalid file " + f, e);
        doc = null;
        // release the handle first; an open file cannot be deleted on
        // every platform
        if (zf != null) {
            try {
                zf.close();
            } catch (IOException ignored) {
                // the file is removed anyway
            }
            zf = null;
        }
        // f is still null if the failure happened before it was assigned
        if (f != null) {
            f.delete();
        }
    } finally {
        if (zf != null) {
            try {
                zf.close();
            } catch (IOException e) {
                log.warn("could not close " + f, e);
            }
        }
    }
    return doc;
}
/**
 * Reads the first entry whose name starts with "content" from the given
 * ZIP file and sets it as content of the document. If there is no such
 * entry, the content is set to null.
 * <p>
 * The entry is read until end of stream, so it does not matter whether
 * the entry size is recorded in the ZIP file (getSize() may return -1)
 * or whether the stream ends early.
 *
 * @param doc document that receives the content
 * @param contentZip ZIP file from the content storage
 * @throws IOException if the entry cannot be read
 */
protected void readContentFromZipFile(HttpDoc doc, ZipFile contentZip)
        throws IOException {
    byte[] content = null;
    for (Enumeration entries = contentZip.entries(); entries.hasMoreElements();) {
        ZipEntry zipEntry = (ZipEntry) entries.nextElement();
        if (zipEntry.getName().startsWith("content")) {
            // the declared size is only a hint for the initial capacity
            long declaredSize = zipEntry.getSize();
            int capacity = 8192;
            if (declaredSize > 0 && declaredSize <= Integer.MAX_VALUE) {
                capacity = (int) declaredSize;
            }
            ByteArrayOutputStream collected = new ByteArrayOutputStream(capacity);
            InputStream is = contentZip.getInputStream(zipEntry);
            try {
                byte[] chunk = new byte[8192];
                int count;
                // read() returns -1 at end of stream; that value must not
                // be added to an offset
                while ((count = is.read(chunk)) != -1) {
                    collected.write(chunk, 0, count);
                }
            } finally {
                is.close();
            }
            content = collected.toByteArray();
            break;
        }
    }
    doc.setContent(content);
}
/**
 * Removes a document from the cache: its file in the links directory,
 * its registration in the content storage and its document file.
 * Errors are logged, not thrown.
 *
 * @param url URL of the document to remove
 * @see net.matuschek.http.HttpDocManager#removeDocument(URL)
 */
public void removeDocument(URL url) {
    // needed to find the content that belongs to the URL
    HttpDoc doc = retrieveFromCache(url);
    File f = null;
    try {
        String filename0 = url.toExternalForm();
        String filename = generateFilename(filename0) + ".zip";
        f = new File(storagedir + LINKS + filename);
        if (f.exists()) {
            f.delete();
        }
        // retrieveFromCache() returns null for documents that are not
        // cached (or whose cache file was invalid); there is no content
        // registration to remove then
        if (doc != null) {
            deleteContent(doc);
        }
        f = new File(storagedir + DOCUMENTS + filename);
        if (f.exists()) {
            f.delete();
        }
    } catch (Exception ex) {
        log.error("could not remove " + url + " from cache", ex);
    }
}
/**
 * Removes the given document from the users of its content.
 * The document URL is taken out of the content users file. If other
 * URLs still use the content, the file is rewritten with the remaining
 * entries; otherwise the users file and the content ZIP file are both
 * deleted. Documents without content are ignored.
 *
 * @param doc document whose content registration is removed
 * @throws IOException if the content users file cannot be read or written
 */
private void deleteContent(HttpDoc doc) throws IOException {
    byte[] content = doc.getContent();
    if (content == null || content.length == 0) {
        return;
    }
    String urlString = doc.getURL().toString();
    String md5 = doc.getContentMD5();
    File f = contentFile(md5, ".txt");
    // collect all users except this document
    ArrayList entries = new ArrayList();
    if (f.exists()) {
        BufferedReader reader =
            new BufferedReader(new InputStreamReader(new FileInputStream(f)));
        try {
            // a null line marks the end of the file; ready() does not
            String line;
            while ((line = reader.readLine()) != null) {
                if (!line.equals(urlString)) {
                    entries.add(line);
                }
            }
        } finally {
            reader.close();
        }
    }
    if (entries.size() > 0) {
        FileOutputStream os = new FileOutputStream(f.getPath(), false);
        try {
            for (Iterator iter = entries.iterator(); iter.hasNext();) {
                String line = (String) iter.next();
                os.write((line + LF).getBytes());
            }
        } finally {
            os.close();
        }
    } else {
        // nobody uses the content any more
        f.delete();
        File fzip = contentFile(md5, ".zip");
        if (fzip.exists()) {
            fzip.delete();
        }
    }
}
/**
 * Returns the collected URLs as text, one URL per line (each followed
 * by a newline character).
 * @see java.lang.Object#toString()
 */
public String toString() {
    StringBuffer listing = new StringBuffer(1000);
    Iterator collected = urls.iterator();
    while (collected.hasNext()) {
        listing.append(collected.next());
        listing.append("\n");
    }
    return listing.toString();
}
/**
 * Builds a relative directory path from the leading characters of the
 * given name: each of the first storageDirDepth characters (or fewer,
 * if the name is shorter) becomes one directory level, followed by the
 * file separator.
 *
 * @param filename name whose leading characters are used
 * @return the directory path; empty if no character is used
 */
private final String useFirstCharactersAsDirectories(String filename) {
    final int levels = Math.min(storageDirDepth, filename.length());
    final char[] path = new char[levels * 2];
    int out = 0;
    for (int in = 0; in < levels; in++) {
        path[out++] = filename.charAt(in);
        path[out++] = File.separatorChar;
    }
    return String.valueOf(path);
}
/**
 * Checks if the storage path for the given file exists and creates it
 * if necessary. The first storageDirDepth*2 characters of the file name
 * are taken as its directory part (one character plus separator per
 * level); a shorter file name is used as a whole.
 *
 * @param subdirectory subdirectory of the storage directory
 * @param filename file name relative to the subdirectory
 */
private final void checkStoragePathFor(String subdirectory, String filename) {
    if (!subdirectory.endsWith(File.separator)) {
        subdirectory += File.separator;
    }
    // clamp to the name length: substring() throws for an end index
    // beyond the end of the string (or below zero)
    int headLength = Math.max(0, Math.min(storageDirDepth * 2, filename.length()));
    String head = filename.substring(0, headLength);
    File path = new File(storagedir + subdirectory + head);
    if (!path.exists()) {
        path.mkdirs();
    }
}
/**
* Generate a valid filename for the given docURI.
* @param docURI
* @return String
*/
protected String generateFilename(String docURI) {
if (useMD5) {
MD5 md5 = new MD5(docURI);
String hex = md5.asHex();
if (storageDirDepth > 0) {
return useFirstCharactersAsDirectories(hex) + hex.substring(storageDirDepth);
}
return hex;
} else {
StringBuffer buf = new StringBuffer(docURI.length());
for (int i = 0; i < docURI.length(); i++) {
char c = docURI.charAt(i);
switch (c) {
case '/' : buf.append("&slash;"); break;
case '\\' : buf.append("&backslash"); break;
case ':' : buf.append(":"); break;
case '*' : buf.append("&asterisk;"); break;
case '?' : buf.append("&question;"); break;
case '\"' : buf.append("""); break;
case '<' : buf.append("<"); break;
case '>' : buf.append(">"); break;
case '|' : buf.append("∨"); break;
default : buf.append(c); break;
}
}
docURI = buf.toString();
return docURI;
}
}
/**
 * Returns the file in the content storage that belongs to the given
 * content hash. The leading storageDirDepth characters of the hash are
 * used as directory levels, the rest as file name.
 *
 * @param hex content hash as hex string
 * @param extension file extension including the dot (for example ".zip")
 * @return the File inside the content subdirectory
 */
protected File contentFile(String hex, String extension) {
    StringBuffer path = new StringBuffer();
    path.append(storagedir);
    path.append(CONTENT);
    path.append(useFirstCharactersAsDirectories(hex));
    path.append(hex.substring(storageDirDepth));
    path.append(extension);
    return new File(path.toString());
}
/**
 * Close storageDirectory File.
 * Closes the stream on the directory file, if it is open. A failure to
 * close is logged; the stream is dropped in any case, so calling this
 * method again is harmless.
 * @see net.matuschek.http.HttpDocManager#finish()
 */
public void finish() {
    if (storageDirectoryStream == null) {
        return;
    }
    try {
        storageDirectoryStream.close();
    } catch (IOException e) {
        log.warn("could not close " + storageDirectoryFile, e);
    } finally {
        // never keep a reference to a stream that was closed (or failed to)
        storageDirectoryStream = null;
    }
}
/**
 * Calls finish and super.finalize().
 * super.finalize() is called even if finish() fails.
 * @see java.lang.Object#finalize()
 */
protected void finalize() throws Throwable {
    try {
        finish();
    } finally {
        super.finalize();
    }
}
/**
 * Depth of source set directory.
 * (depth = number of used subdirectory levels)
 * The first storageDirDepth characters of a file name will be used
 * as directories, one character per level. The default 0 means that
 * no subdirectory levels are used.
 */
protected int storageDirDepth = 0;
/**
 * Sets the directory depth of the source set directory, i.e. the
 * number of subdirectory levels that are derived from file names.
 *
 * @param depth desired depth of source set directory
 */
public void setStorageDirDepth(int depth) {
    this.storageDirDepth = depth;
}
/**
 * Returns the directory depth of the source set directory, i.e. the
 * number of subdirectory levels that are derived from file names.
 *
 * @return the directory depth of the source set directory
 */
public int getStorageDirDepth() {
    return this.storageDirDepth;
}
/**
 * Get relevant part of contenttype and get default extension for it.
 * Parameters after a ';' (for example a charset) are cut off before the
 * default extension is looked up via getDefaultExtension().
 *
 * @param contenttype value of the Content-Type header, may be null
 * @return the extension with exactly one leading dot, or an empty
 *         string if there is no default extension for the content type
 */
private String getExtensionFromContenttype(String contenttype) {
    String extension = null;
    if (contenttype != null) {
        int pos = contenttype.indexOf(';');
        String strContentType =
            (pos > 0) ? contenttype.substring(0, pos) : contenttype;
        extension = getDefaultExtension(strContentType.trim());
    }
    if (extension == null) {
        return "";
    }
    // getDefaultExtension() may already deliver the dot (".html");
    // adding another one would produce names like "content..html"
    if (extension.startsWith(".")) {
        return extension;
    }
    return "." + extension;
}
/**
 * Get default extension for given contentType.
 * Content types containing "text/html" map to ".html", all other
 * content types containing "text/" map to ".txt". The returned value
 * includes the leading dot.
 *
 * @param contentType content type, may be null
 * @return default extension, or null if the content type is null or
 *         has no default extension
 */
protected String getDefaultExtension(String contentType) {
    if (contentType == null) {
        return null;
    }
    if (contentType.indexOf("text/html") != -1) {
        return ".html";
    }
    if (contentType.indexOf("text/") != -1) {
        return ".txt";
    }
    return null;
}
}