package org.sf.mustru.docs;
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.sf.mustru.crawl.ClassifyDoc;
import org.sf.mustru.utils.Constants;
import com.sleepycat.bind.tuple.TupleBinding;
import com.sleepycat.bind.tuple.TupleInput;
import com.sleepycat.bind.tuple.TupleOutput;
/**
 * A Web page document class that can be indexed.
 *
 * Adds one attribute to IndexableDoc: the text type produced by a
 * ClassifyDoc classifier. Instances install a PageDocDb as their
 * Berkeley DB tuple binding.
 */
public class PageDoc extends IndexableDoc
{
  private String textType = "";   //*-- type of text

  /**
   * Create an empty Web page document and install a PageDocDb binding.
   */
  public PageDoc()
  { super(); setBdbBinding( new PageDocDb() ); }

  /**
   * Create a Web page document and install a PageDocDb binding.
   *
   * @param ifile handed unchanged to the IndexableDoc constructor; per the
   *              original comment, the name of the file containing the text
   */
  public PageDoc(String ifile)
  { super(ifile); setBdbBinding( new PageDocDb() ); }

  /**
   * Load the page specific information: the file type, the text type and
   * the file location.
   *
   * The file type is always "webpage" and the text type is whatever
   * cdoc.classifyTextContents(this) returns. The file location is reset to
   * the empty string; if the file name starts with the directory returned
   * by Constants.getWEBDIR(), the location becomes the file name with that
   * leading directory replaced by "http://localhost".
   *
   * @param cdoc classifier used to determine the text type of this document
   */
  public void loadSpecific (ClassifyDoc cdoc)
  {
    setFileType("webpage");
    setTextType(cdoc.classifyTextContents(this));

    //*-- check if the URL is local
    String webDir = Constants.getWEBDIR();
    String filename = getFileName();
    setFileLocation("");

    //*-- nothing to map without a configured web directory or a file name
    if (webDir == null || webDir.length() == 0 || filename == null)
    { return; }

    //*-- compare the directory as a literal prefix; building a regular
    //*-- expression from it breaks on paths that contain characters such
    //*-- as '.', '\', '(' or '+'
    if (filename.startsWith(webDir))
    { setFileLocation("http://localhost" + filename.substring(webDir.length())); }
  }

  /**
   * Create the Lucene index entry for this document and add it to the
   * passed index writer.
   *
   * The entry has four fields: "key" (file name), "type" (file type) and
   * "category" (text type) are stored but not indexed, while "contents"
   * is tokenized for searching but not stored.
   *
   * @param ramIW           index writer that receives the new document
   * @param storeTermVector not used by this implementation
   * @throws IOException if the index writer cannot add the document
   */
  public void loadIndex(IndexWriter ramIW, boolean storeTermVector) throws IOException
  {
    Document doc = new Document();
    doc.add( new Field("key", getFileName(), Field.Store.YES, Field.Index.NO) );
    doc.add( new Field("contents", getContents().toString(), Field.Store.NO, Field.Index.TOKENIZED) );
    doc.add( new Field("type", getFileType(), Field.Store.YES, Field.Index.NO) );
    doc.add( new Field("category", getTextType(), Field.Store.YES, Field.Index.NO) );
    ramIW.addDocument(doc);
  }

  /**
   * @return the Berkeley DB tuple binding of this document; the bdbBinding
   *         field is not declared in this class (presumably inherited from
   *         IndexableDoc)
   */
  public TupleBinding getBdbBinding()
  { return bdbBinding; }

  /**
   * @param bdbBinding the Berkeley DB tuple binding to use for this document
   */
  public void setBdbBinding(TupleBinding bdbBinding)
  { this.bdbBinding = bdbBinding; }

  /**
   * @return the string form of the parent class followed by the text type
   */
  public String toString()
  {
    //*-- add text specific data
    return super.toString() + " Text type: " + getTextType();
  }

  /**
   * @return the type of text in this document
   */
  public String getTextType()
  { return textType; }

  /**
   * @param textType the type of text in this document
   */
  public void setTextType(String textType)
  { this.textType = textType; }
}
/**
 * Berkeley DB binding for PageDoc.
 *
 * A tuple entry consists of the data handled by the shared
 * IndexableDocBinding followed by one string, the text type of the page.
 */
final class PageDocDb extends TupleBinding
{
  //*-- shared binding that handles the leading, generic part of every entry
  static final IndexableDocBinding idb = new IndexableDocBinding();

  /**
   * Build a PageDoc from a tuple entry.
   *
   * @param ti tuple input positioned at the start of an entry written by
   *           objectToEntry
   * @return a new PageDoc loaded from the generic data and the text type
   */
  public Object entryToObject(TupleInput ti)
  {
    //*-- the generic part is read first; it is kept in a local variable so
    //*-- that the binding itself holds no per-call state
    IndexableDoc generic = (IndexableDoc) idb.entryToObject(ti);
    PageDoc page = new PageDoc();
    page.loadGeneric(generic);

    //*-- read the page specific information that follows the generic part
    page.setTextType(ti.readString());
    return page;
  }

  /**
   * Write a PageDoc to a tuple entry.
   *
   * @param o  the document to write; must be a PageDoc
   * @param to tuple output that receives the generic data followed by the
   *           text type
   */
  public void objectToEntry(Object o, TupleOutput to)
  {
    idb.objectToEntry(o, to);

    //*-- write the page specific information to the tuple output
    PageDoc page = (PageDoc) o;
    to.writeString(page.getTextType());
  }
}