// Source Code of org.sf.mustru.crawl.ClassifyDoc

package org.sf.mustru.crawl;


import java.io.FileInputStream;
import java.io.IOException;
import java.io.ObjectInputStream;


import org.apache.log4j.Logger;
import org.sf.mustru.docs.IndexableDoc;
import org.sf.mustru.utils.Constants;


import com.aliasi.classify.JointClassification;
import com.aliasi.classify.LMClassifier;


/**
 * Classify a document into a type - book, article, Web page, etc. and
 * by content into a category - finance, health, or sports
 *
 */
public class ClassifyDoc 
{
  static Logger logger = Logger.getLogger(ClassifyDoc.class.getName());
  private LMClassifier fileTypeClassifier = null;    //*-- classifier for file types
  private LMClassifier textTypeClassifier = null;    //*-- classifier for text types
  
  /**
   * Classification of files and text contents using pre-defined models 
   */
  public ClassifyDoc() 
  { 
   //*-- read the classification models 
   String fmodelFile = Constants.FTYPE_CLASS_MODEL;  //*-- file type model - article, page, book, etc. 
   String tmodelFile = Constants.TTYPE_CLASS_MODEL;  //*-- text type model - business, sports, health
   try
   { ObjectInputStream oi = new ObjectInputStream( new FileInputStream(fmodelFile) );
     fileTypeClassifier = (LMClassifier) oi.readObject();
     oi.close();  
     oi = new ObjectInputStream( new FileInputStream(tmodelFile) );
     textTypeClassifier = (LMClassifier) oi.readObject();
     oi.close();  
   }
   catch (IOException ie) { logger.error("Failed to read the classifier model: " + ie.getMessage() ); }
   catch (ClassNotFoundException ce) { logger.error("Could not find LMClassifier class: " + ce.getMessage() ); }
   
  }
  
  /**
   * Classify the passed text string into one of the file types - book, letter, or text.
   * @param idoc IndexableDoc Document to be classified
   * @return String Type of file
   */
  public String classifyFileType(IndexableDoc idoc)
  {
   //*-- use the top n bytes of the contents to identify the file type
   int length = idoc.getContents().length();
   String contents = (length > 500) ? idoc.getContents().toString().substring(0, 500): 
                  idoc.getContents().toString();
   
   //*-- return book if the length of the document seems large
   if (length > Constants.DB_DOC_STORE_LIMIT) return "book";
   
   //*-- assign a default file type of text when insufficient evidence
   JointClassification jc =  fileTypeClassifier.classifyJoint(contents);
   return ( (jc.score(0) > -2.5) ? jc.bestCategory(): "text");
  }


  /**
   * Classify the passed text string into one of the pre-defined categories - rec, business, news, etc.
   * @param idoc IndexableDoc The document whose contents are to be classified
   * @return String The closest category for the passed string
   */
  public String classifyTextContents(IndexableDoc idoc)
  {
   //*-- assign a default text type of "" when insufficient evidence
   JointClassification jc =  textTypeClassifier.classifyJoint(idoc.getContents().toString());
   return ( (jc.score(0) > -2.5) ? jc.bestCategory(): "miscellaneous");
  }
  
}
// Source Code of org.sf.mustru.crawl.ClassifyDoc

// Related Classes of org.sf.mustru.crawl.ClassifyDoc