package net.matuschek.http;
/************************************************
Copyright (c) 2001/2002 by Daniel Matuschek
*************************************************/
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;
import java.util.StringTokenizer;
import org.apache.log4j.Category;
/**
* DocumentManager that will store document contents in a file.
*
* @author Daniel Matuschek
* @version $Revision: 1.11 $
*/
public class HttpDocToFile extends AbstractHttpDocManager
{

  /**
   * directory where the files will be created
   */
  private String baseDir;

  /**
   * the object will not store files smaller than this size (in bytes)
   */
  private int minFileSize;

  /**
   * defines if special characters in the URL should be replaced
   * by "normal" characters
   * @see #setReplaceAllSpecials(boolean)
   */
  private boolean replaceAllSpecials = false;

  /**
   * defines, if CGIs should be stored on disc.
   *
   * @see #setStoreCGI
   */
  private boolean storeCGI = true;

  /** Log4J logging */
  private Category log;


  /**
   * creates a new HttpDocToFile object that will store the
   * documents in the given directory
   *
   * @param baseDir directory below which all files will be created
   */
  public HttpDocToFile(String baseDir) {
    this.baseDir = baseDir;
    log = Category.getInstance(getClass().getName());
  }


  /**
   * store document (that means write it to disk).
   *
   * Nothing is written if the document or its content is null, if the
   * document came from the cache, if it is a dynamic page (URL contains
   * a "?") while storeCGI is false, or if its content is smaller than
   * minFileSize.
   *
   * @param doc the document to store
   * @exception DocManagerException if the document can't be stored
   * (some IO error occured)
   */
  public void storeDocument(HttpDoc doc)
    throws DocManagerException
  {
    if ((doc == null) || (doc.getContent() == null)) {
      return;
    }

    /*
     * write file only, if this was NOT a cached document
     * (in this case we have it already on harddisk)
     */
    if (doc.isCached()) {
      return;
    }

    if ((! storeCGI)
        && (doc.getURL().toString().indexOf('?') >= 0)) {
      // do not store dynamic pages, because storeCGI is false
      // and the URL contains a "?"
      return;
    }

    String filename = url2Filename(doc.getURL());
    byte[] content = doc.getContent();

    if (content.length < minFileSize) {
      return;
    }

    try {
      createDirs(filename);
      BufferedOutputStream os =
        new BufferedOutputStream(new FileOutputStream(filename));
      try {
        os.write(content);
        os.flush();
      } finally {
        // always release the file handle, even if the write failed
        os.close();
      }
    } catch (IOException e) {
      throw new DocManagerException(e.getMessage());
    }
  }


  /**
   * Gets the cacheFile of the given URL if its document was stored.
   *
   * @param url the URL of the document
   * @return the file that belongs to this URL or null, if there is no
   * regular file with this name on the file system
   */
  protected File getCacheFile(URL url) {
    // does the file exists on the filesystem ?
    File cacheFile = new File(url2Filename(url));

    if (cacheFile.exists() && cacheFile.isFile()) {
      return cacheFile;
    }

    return null;
  }


  /**
   * Gets the filename extension of the given URL.
   *
   * @param url the URL to examine
   * @return the text after the last "." of the URL path or null, if
   * the URL looks dynamic (contains "?" or "cgi") or if the path has
   * no usable extension
   */
  protected String getExtension(URL url) {
    String urlString = url.toString();

    // is it dynamic ?
    if ((urlString.indexOf('?') >= 0)
        || (urlString.indexOf("cgi") >= 0)) {
      return null;
    }

    // do we have an filename extension ?
    // without it is not possible to guess the MIME type.
    String path = url.getPath();
    if (path.indexOf(".") < 0) {
      return null;
    }

    // the extension is the last non-empty token separated by "."
    String ext = null;
    StringTokenizer st = new StringTokenizer(path, ".");
    while (st.hasMoreTokens()) {
      ext = st.nextToken();
    }

    // a path that consists only of dots has no token at all (ext is
    // still null); an ext that contains a "/" is a path element and
    // not an extension
    if ((ext == null) || (ext.indexOf("/") >= 0)) {
      return null;
    }

    return ext;
  }


  /**
   * Removes a document that was stored previous from the file system.
   *
   * Uses the same lookup as {@link #retrieveFromCache(URL)}: nothing
   * is removed for URLs without a filename extension, for URLs that
   * look dynamic or if there is no file for this URL.
   *
   * @param u the URL of the document to remove
   */
  public void removeDocument(URL u) {
    String ext = getExtension(u);
    if (ext == null) {
      return;
    }

    File cacheFile = getCacheFile(u);
    if (cacheFile == null) {
      return;
    }

    if (! cacheFile.delete()) {
      log.info("Could not delete cached document " + cacheFile.getPath());
    }
  }


  /**
   * Gets a document that was stored previous from the file system.
   * Because the HttpDocToFile does not store the HTTP headers, only
   * the Content-Type header will exists. Even this header may not
   * be correct. It will only use a simple heuristic to determine the
   * possible MIME type.
   *
   * @param u the URL of the document
   * @return null, if this document was not stored before, if it seems
   * to be a dynamic document or if the file could not be read.
   */
  public HttpDoc retrieveFromCache(URL u) {
    String ext = getExtension(u);
    if (ext == null) {
      return null;
    }

    File cacheFile = getCacheFile(u);
    if (cacheFile == null) {
      return null;
    }

    // create a buffer;
    long size = cacheFile.length();
    if (size > Integer.MAX_VALUE) {
      log.info("File too large");
      return null;
    }

    byte[] buff = new byte[(int) size];

    // read the file
    FileInputStream fi = null;
    try {
      fi = new FileInputStream(cacheFile);

      // a single read() may return less than requested, so loop
      // until the whole buffer is filled
      int offset = 0;
      while (offset < buff.length) {
        int count = fi.read(buff, offset, buff.length - offset);
        if (count < 0) {
          // file is shorter than File.length() reported
          throw new IOException("unexpected end of file");
        }
        offset += count;
      }
    } catch (IOException e) {
      log.info("Could not read cached document "+e.getMessage());
      return null;
    } finally {
      if (fi != null) {
        try {
          fi.close();
        } catch (IOException ignored) {
          // nothing useful can be done if closing a stream that was
          // only read from fails
        }
      }
    }

    // create a new HttpDoc object
    HttpDoc doc = new HttpDoc();

    // and set the content and the header
    doc.setHttpCode("HTTP/1.0 200 OK");
    doc.setContent(buff);

    // now guess the MIME type
    String mimetype = null;
    if (ext.equals("html")
        || ext.equals("htm")
        || ext.equals("shtml")
        || ext.equals("asp")
        || ext.equals("php")
        || ext.equals("jsp")) {
      mimetype="text/html";
    } else {
      mimetype="application/unknown";
    }

    doc.addHeader(new HttpHeader("Content-Type",mimetype));
    doc.setURL(u);
    doc.setCached(true);

    return doc;
  }


  /**
   * gets the value of baseDir
   * @return the value of baseDir
   */
  public String getBaseDir() {
    return baseDir;
  }


  /**
   * sets the value of basedir
   * @param baseDir the new value of baseDir
   */
  public void setBaseDir(String baseDir) {
    this.baseDir = baseDir;
  }


  /**
   * converts an URL to a filename http://host/path will
   * be converted to basedir/host/path
   *
   * NOTE: java.net.URL.getFile() is documented to return the path
   * plus the query, so for URLs with a query the query text appears
   * twice in the result (once after the path and once as an own path
   * element). This is kept unchanged, because files that were stored
   * before are looked up by the very same name.
   *
   * NOTE(review): the URL path is used as is; it looks like ".."
   * elements in it could lead to a name outside of baseDir - confirm
   * whether the callers only pass normalized URLs.
   *
   * @param u a URL to convert, must not be null
   * @return a pathname
   */
  protected String url2Filename(URL u) {
    StringBuffer sb = new StringBuffer();

    sb.append(baseDir);
    sb.append(File.separatorChar);
    sb.append(u.getHost());
    sb.append(u.getFile());

    // is there a query part ?
    // that is something after the file name seperated by ?
    String query = u.getQuery();
    if ((query != null) &&
        (!query.equals(""))) {
      sb.append(File.separatorChar);
      sb.append(query);
    }

    // filename that ends with /
    // are directories, we will name the file "index.html"
    if (sb.charAt(sb.length()-1) == '/') {
      sb.append("index.html");
    }

    // postprocess filename (replace special characters)
    for (int i=0; i<sb.length(); i++) {
      char c=sb.charAt(i);

      if (replaceAllSpecials
          && ((c == '?') || (c == '=') || (c == '&'))) {
        // replace special characters from CGIs
        sb.setCharAt(i,'-');
      } else if ((c == '/') && (File.separatorChar != '/')) {
        // replace / by operating system file name separator
        sb.setCharAt(i,File.separatorChar);
      }
    }

    return sb.toString();
  }


  /**
   * creates all directories that are needed to place the
   * file filename if they don't exists
   *
   * @param filename the full path name of a file
   * @exception IOException if a needed directory does not exist and
   * could not be created
   */
  protected void createDirs(String filename) throws IOException {
    // look for the last directory separator in the filename
    int pos = filename.lastIndexOf(File.separatorChar);

    // no separator at all or only a leading one:
    // there is no parent directory that could be created
    if (pos <= 0) {
      return;
    }

    File dir = new File(filename.substring(0, pos));

    // mkdirs() also returns false if the directory already exists
    if ((! dir.mkdirs()) && (! dir.isDirectory())) {
      throw new IOException("Could not create directory " + dir.getPath());
    }
  }


  /**
   * gets the value of minFileSize. Files smaller than this size
   * (in Bytes) will not be saved to disk !
   * @return the value of minFileSize
   */
  public int getMinFileSize() {
    return minFileSize;
  }


  /**
   * sets the value of minFileSize
   * @param minFileSize the new value of minFileSize
   * @see #getMinFileSize()
   */
  public void setMinFileSize(int minFileSize) {
    this.minFileSize = minFileSize;
  }


  /**
   * Get the value of replaceAllSpecials.
   *
   * if replaceAllSpecials is true, the special characters "?", "="
   * and "&amp;" in the file name will be replaced by "-". This is
   * useful for operating system that can't handle files with special
   * characters in the filename (e.g. Windows)
   *
   * @return value of replaceAllSpecials.
   */
  public boolean isReplaceAllSpecials() {
    return replaceAllSpecials;
  }


  /**
   * Set the value of replaceAllSpecials.
   *
   * if replaceAllSpecials is true, the special characters "?", "="
   * and "&amp;" in the file name will be replaced by "-". This is
   * useful for operating system that can't handle files with special
   * characters in the filename (e.g. Windows)
   *
   * @param v Value to assign to replaceAllSpecials.
   */
  public void setReplaceAllSpecials(boolean v) {
    this.replaceAllSpecials = v;
  }


  /**
   * Get the value of storeCGI
   *
   * If this is true, the object will store ALL retrieved documents,
   * otherwise it will store only documents from URLs that do not
   * have a "?" in the URL
   *
   * @return value of storeCGI.
   */
  public boolean getStoreCGI() {
    return storeCGI;
  }


  /**
   * Set the value of storeCGI.
   *
   * If this is true, the object will store ALL retrieved documents,
   * otherwise it will store only documents from URLs that do not
   * have a "?" in the URL
   *
   * @param v Value to assign to storeCGI.
   */
  public void setStoreCGI(boolean v) {
    this.storeCGI = v;
  }

}