import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.sf.mustru.crawl.ClassifyDoc;
import com.sleepycat.bind.tuple.TupleBinding;
import com.sleepycat.bind.tuple.TupleInput;
import com.sleepycat.bind.tuple.TupleOutput;
* Book documents judged based on the length of the document.
* Will try to find the title, author, and language from a Gutenberg like book
public class BookDoc extends IndexableDoc
private String textType = "";
//*-- pre-compiled RE patterns
private static Pattern titlePattern = Pattern.compile("^Title:(.*)$", Pattern.MULTILINE);
private static Pattern authorPattern = Pattern.compile("^Author:(.*)$", Pattern.MULTILINE);
private static Pattern languagePattern = Pattern.compile("^Language:(.*)$", Pattern.MULTILINE);
private static Pattern nlinePattern = Pattern.compile(System.getProperty("line.separator"));
* A book document class that can be indexed
* String (Optional) The name of the file containing the text
public BookDoc() { super(); setBdbBinding( new BookDocDb() ); }
public BookDoc(String ifile) { super(ifile); setBdbBinding( new BookDocDb() ); }
* load text specific information, type of text - article, book, etc.
* category of text
public void loadSpecific (ClassifyDoc cdoc)
//*-- identify the possible title, author, and language
//*-- split the contents into lines
Matcher matcher = null;
String[] arr = nlinePattern.split ( getContents());
for (int i = 0; i < arr.length; i++)
matcher = titlePattern.matcher(arr[i]);
if (matcher.matches()) setTitle(;
matcher = authorPattern.matcher(arr[i]);
if (matcher.matches()) setAuthor(;
matcher = languagePattern.matcher(arr[i]);
if (matcher.matches()) setLanguage(;
if (i > 50) break; //*-- stop searching after the first 50 lines
} //*-- end of for
//*-- create the Lucene Index
public void loadIndex(IndexWriter ramIW, boolean storeTermVector) throws IOException
Document doc = new Document();
doc.add( new Field("key", getFileName(), Field.Store.YES, Field.Index.NO) );
doc.add(new Field("contents", getContents().toString(), Field.Store.NO, Field.Index.TOKENIZED) );
doc.add( new Field("type", getFileType(), Field.Store.YES, Field.Index.NO) );
doc.add( new Field("category", getTextType(), Field.Store.YES, Field.Index.NO) );
public TupleBinding getBdbBinding()
{ return bdbBinding; }
public void setBdbBinding(TupleBinding bdbBinding)
{ this.bdbBinding = bdbBinding; }
public String toString()
StringBuffer sb = new StringBuffer();
//*-- add book specific data
sb.append(" Text type: "); sb.append(getTextType() );
return sb.toString();
public String getTextType()
{ return textType; }
public void setTextType(String textType)
{ this.textType = textType; }
* Berkeley DB binding for BookDoc
final class BookDocDb extends TupleBinding
static IndexableDocBinding idb = new IndexableDocBinding();
IndexableDoc idoc;
public Object entryToObject(TupleInput ti)
idoc = (IndexableDoc) idb.entryToObject(ti);
BookDoc o = new BookDoc();
o.loadGeneric(idoc); idoc = null;
//*-- write any text specific information to o
return o;
public void objectToEntry(Object o, TupleOutput to)
idb.objectToEntry(o, to);
//*-- write text specific to the tuple output
BookDoc tdoc = (BookDoc) o;