// Source Code of org.sf.mustru.docs.BookDocDb

package org.sf.mustru.docs;


import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.sf.mustru.crawl.ClassifyDoc;


import com.sleepycat.bind.tuple.TupleBinding;
import com.sleepycat.bind.tuple.TupleInput;
import com.sleepycat.bind.tuple.TupleOutput;


/**
 * Book documents judged based on the length of the document. 
 * Will try to find the title, author, and language from a Gutenberg like book
 */
public class BookDoc extends IndexableDoc
{
  private String textType = "";
  
  //*-- pre-compiled RE patterns
  private static Pattern titlePattern   = Pattern.compile("^Title:(.*)$", Pattern.MULTILINE);
  private static Pattern authorPattern   = Pattern.compile("^Author:(.*)$", Pattern.MULTILINE);                                    
  private static Pattern languagePattern = Pattern.compile("^Language:(.*)$", Pattern.MULTILINE);
  private static Pattern nlinePattern   = Pattern.compile(System.getProperty("line.separator"));
   
  /**
   * A book document class that can be indexed
   * String (Optional) The name of the file containing the text
   */
  public BookDoc() { super(); setBdbBinding( new BookDocDb() ); }
  public BookDoc(String ifile) { super(ifile); setBdbBinding( new BookDocDb() ); }
  
  /**
   * load text specific information, type of text - article, book, etc.
   * category of text
   */
  public void loadSpecific (ClassifyDoc cdoc)
  {  
   setTextType(cdoc.classifyTextContents(this));
   setFileType("book");


   //*-- identify the possible title, author, and language
   //*-- split the contents into lines
   Matcher matcher = null;
   String[] arr = nlinePattern.split ( getContents());
   for (int i = 0; i < arr.length; i++)
   { 
    matcher = titlePattern.matcher(arr[i]);     
    if (matcher.matches()) setTitle(matcher.group(1));


    matcher = authorPattern.matcher(arr[i]);     
    if (matcher.matches()) setAuthor(matcher.group(1));


    matcher = languagePattern.matcher(arr[i]);     
    if (matcher.matches()) setLanguage(matcher.group(1));


    if (i > 50) break;   //*-- stop searching after the first 50 lines 
   } //*-- end of for


  }


  //*-- create the Lucene Index
  public void loadIndex(IndexWriter ramIW, boolean storeTermVector) throws IOException
  {
   Document doc = new Document();
   doc.add( new Field("key", getFileName(), Field.Store.YES, Field.Index.NO) );
   doc.add(new Field("contents", getContents().toString(), Field.Store.NO, Field.Index.TOKENIZED) );
   doc.add( new Field("type", getFileType(), Field.Store.YES, Field.Index.NO) );
   doc.add( new Field("category", getTextType(), Field.Store.YES, Field.Index.NO) );
   ramIW.addDocument(doc);
  }


  public TupleBinding getBdbBinding() 
  { return bdbBinding; }


  public void setBdbBinding(TupleBinding bdbBinding) 
  { this.bdbBinding = bdbBinding; }


  public String toString()
  {
   StringBuffer sb = new StringBuffer();
   sb.append(super.toString());


   //*-- add book specific data
   sb.append(" Text type: "); sb.append(getTextType() );


   return sb.toString();
  }


  public String getTextType() 
  { return textType; }


  public void setTextType(String textType) 
  { this.textType = textType; }


}


/**
 * Berkeley DB binding for BookDoc
 *
 */
final class BookDocDb extends TupleBinding
{
 static IndexableDocBinding idb = new IndexableDocBinding();
 IndexableDoc idoc;


 public Object entryToObject(TupleInput ti)
 {
  idoc = (IndexableDoc) idb.entryToObject(ti);
  BookDoc o = new BookDoc();
  o.loadGeneric(idoc); idoc = null;


  //*-- write any text specific information to o
  o.setTextType(ti.readString());


  return o;
 }


 public void objectToEntry(Object o, TupleOutput to)
 { 
  idb.objectToEntry(o, to);


  //*-- write text specific to the tuple output
  BookDoc tdoc = (BookDoc) o;
  to.writeString(tdoc.getTextType());


 }


}
// Source Code of org.sf.mustru.docs.BookDocDb

// Related Classes of org.sf.mustru.docs.BookDocDb