Package org.sf.mustru.crawl

Source Code of org.sf.mustru.crawl.ClassifyDoc

package org.sf.mustru.crawl;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.ObjectInputStream;

import org.apache.log4j.Logger;
import org.sf.mustru.docs.IndexableDoc;
import org.sf.mustru.utils.Constants;

import com.aliasi.classify.JointClassification;
import com.aliasi.classify.LMClassifier;

/**
* Classify a document into a type - book, article, Web page, etc. and
* by content into a category - finance, health, or sports
*
*/
public class ClassifyDoc
{
  static Logger logger = Logger.getLogger(ClassifyDoc.class.getName());
  private LMClassifier fileTypeClassifier = null;    //*-- classifier for file types
  private LMClassifier textTypeClassifier = null;    //*-- classifier for text types
 
  /**
   * Classification of files and text contents using pre-defined models
   */
  public ClassifyDoc()
  {
   //*-- read the classification models
   String fmodelFile = Constants.FTYPE_CLASS_MODEL;  //*-- file type model - article, page, book, etc.
   String tmodelFile = Constants.TTYPE_CLASS_MODEL;  //*-- text type model - business, sports, health
   try
   { ObjectInputStream oi = new ObjectInputStream( new FileInputStream(fmodelFile) );
     fileTypeClassifier = (LMClassifier) oi.readObject();
     oi.close()
     oi = new ObjectInputStream( new FileInputStream(tmodelFile) );
     textTypeClassifier = (LMClassifier) oi.readObject();
     oi.close()
   }
   catch (IOException ie) { logger.error("Failed to read the classifier model: " + ie.getMessage() ); }
   catch (ClassNotFoundException ce) { logger.error("Could not find LMClassifier class: " + ce.getMessage() ); }
  
  }
 
  /**
   * Classify the passed text string into one of the file types - book, letter, or text.
   * @param idoc IndexableDoc Document to be classified
   * @return String Type of file
   */
  public String classifyFileType(IndexableDoc idoc)
  {
   //*-- use the top n bytes of the contents to identify the file type
   int length = idoc.getContents().length();
   String contents = (length > 500) ? idoc.getContents().toString().substring(0, 500):
                  idoc.getContents().toString();
  
   //*-- return book if the length of the document seems large
   if (length > Constants.DB_DOC_STORE_LIMIT) return "book";
  
   //*-- assign a default file type of text when insufficient evidence
   JointClassification jc =  fileTypeClassifier.classifyJoint(contents);
   return ( (jc.score(0) > -2.5) ? jc.bestCategory(): "text");
  }

  /**
   * Classify the passed text string into one of the pre-defined categories - rec, business, news, etc.
   * @param idoc IndexableDoc The document whose contents are to be classified
   * @return String The closest category for the passed string
   */
  public String classifyTextContents(IndexableDoc idoc)
  {
   //*-- assign a default text type of "" when insufficient evidence
   JointClassification jc =  textTypeClassifier.classifyJoint(idoc.getContents().toString());
   return ( (jc.score(0) > -2.5) ? jc.bestCategory(): "miscellaneous");
  }
 
}
TOP

Related Classes of org.sf.mustru.crawl.ClassifyDoc

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.