package org.sf.mustru.crawl;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import org.apache.log4j.Logger;
import org.sf.mustru.docs.IndexableDoc;
import org.sf.mustru.utils.Constants;
import com.aliasi.classify.JointClassification;
import com.aliasi.classify.LMClassifier;
/**
 * Classifies a document in two ways using pre-trained language-model classifiers:
 * by file type (e.g. book, article, Web page) and by text content
 * (e.g. finance, health, sports). The set of labels actually produced depends on
 * the serialized models named by {@link Constants#FTYPE_CLASS_MODEL} and
 * {@link Constants#TTYPE_CLASS_MODEL}.
 *
 * If a model cannot be loaded the failure is logged and the corresponding
 * classify method falls back to its default label.
 */
public class ClassifyDoc
{
    static Logger logger = Logger.getLogger(ClassifyDoc.class.getName());

    //*-- number of leading characters of a document used to decide its file type
    private static final int FILE_TYPE_SAMPLE_LENGTH = 500;

    //*-- the best category is accepted only when its score exceeds this value
    private static final double MIN_ACCEPT_SCORE = -2.5;

    //*-- labels returned when a classifier is unavailable or the evidence is insufficient
    private static final String DEFAULT_FILE_TYPE = "text";
    private static final String DEFAULT_TEXT_TYPE = "miscellaneous";

    private final LMClassifier fileTypeClassifier; //*-- classifier for file types, null if loading failed
    private final LMClassifier textTypeClassifier; //*-- classifier for text types, null if loading failed

    /**
     * Loads the file type and text type classification models from the files named
     * in {@link Constants}. Each model is loaded independently, so a failure to read
     * one does not prevent the other from being used. Failures are logged, not thrown.
     */
    public ClassifyDoc()
    {
        fileTypeClassifier = loadClassifier(Constants.FTYPE_CLASS_MODEL); //*-- file type model
        textTypeClassifier = loadClassifier(Constants.TTYPE_CLASS_MODEL); //*-- text type model
    }

    /**
     * Deserializes a classifier from the passed model file. The underlying file
     * is always closed, including when deserialization fails.
     * @param modelFile String Name of the file containing the serialized classifier
     * @return LMClassifier The classifier, or null if it could not be read
     */
    private static LMClassifier loadClassifier(String modelFile)
    {
        FileInputStream fis = null;
        try
        {
            fis = new FileInputStream(modelFile);
            ObjectInputStream oi = new ObjectInputStream(fis);
            return (LMClassifier) oi.readObject();
        }
        catch (IOException ie)
        {
            //*-- pass the exception itself so that the stack trace is not lost
            logger.error("Failed to read the classifier model: " + ie.getMessage(), ie);
        }
        catch (ClassNotFoundException ce)
        {
            logger.error("Could not find LMClassifier class: " + ce.getMessage(), ce);
        }
        finally
        {
            //*-- closing the file stream releases the handle even if the object stream was never created
            if (fis != null)
            {
                try { fis.close(); }
                catch (IOException ie) { logger.warn("Failed to close the classifier model: " + ie.getMessage(), ie); }
            }
        }
        return null;
    }

    /**
     * Classify the passed document into one of the file types. Documents longer than
     * {@link Constants#DB_DOC_STORE_LIMIT} are reported as "book" without consulting
     * the classifier; otherwise the leading characters of the contents are classified.
     * @param idoc IndexableDoc Document to be classified
     * @return String Type of file; "text" when the best score is too low or the
     *         file type model is not available
     */
    public String classifyFileType(IndexableDoc idoc)
    {
        int length = idoc.getContents().length();

        //*-- return book if the length of the document seems large; no need to sample the text
        if (length > Constants.DB_DOC_STORE_LIMIT) return "book";

        //*-- without a model there is no evidence, use the default file type
        if (fileTypeClassifier == null) return DEFAULT_FILE_TYPE;

        //*-- use the top n characters of the contents to identify the file type
        String contents = idoc.getContents().toString();
        if (length > FILE_TYPE_SAMPLE_LENGTH) contents = contents.substring(0, FILE_TYPE_SAMPLE_LENGTH);

        //*-- assign a default file type of text when insufficient evidence
        JointClassification jc = fileTypeClassifier.classifyJoint(contents);
        return ( (jc.score(0) > MIN_ACCEPT_SCORE) ? jc.bestCategory(): DEFAULT_FILE_TYPE);
    }

    /**
     * Classify the full contents of the passed document into one of the categories
     * known to the text type model.
     * @param idoc IndexableDoc The document whose contents are to be classified
     * @return String The closest category for the contents; "miscellaneous" when the
     *         best score is too low or the text type model is not available
     */
    public String classifyTextContents(IndexableDoc idoc)
    {
        //*-- without a model there is no evidence, use the default text type
        if (textTypeClassifier == null) return DEFAULT_TEXT_TYPE;

        //*-- assign a default text type of "miscellaneous" when insufficient evidence
        JointClassification jc = textTypeClassifier.classifyJoint(idoc.getContents().toString());
        return ( (jc.score(0) > MIN_ACCEPT_SCORE) ? jc.bestCategory(): DEFAULT_TEXT_TYPE);
    }
}