Package: org.sf.mustru.filters

Examples of org.sf.mustru.filters.StarHandler


{ PropertyConfigurator.configure (Constants.LOG4J_FILE)}
public void ttestPdfFile()
{
  System.out.println("Started testPdfFile");
  Properties props = null; StarHandler sh = null;
  try { props = new Properties(); props.load(new FileInputStream(Constants.FILTER_FILE)); sh = new StarHandler(props); }
  catch (IOException e) { throw new RuntimeException("Could not read filtersFile + " + e.getMessage() ); }
 
  String filename = "/home/manuk/html/akr/ebooks/MYSQLmanual-a4.pdf";
  IndexableDoc doc = new IndexableDoc();
  sh.getDocument(filename, doc);
  int contentSizeLimit = Constants.DOC_LENGTH_MAXLIMIT;
  int textSize = doc.getContents().length();
  if (textSize > contentSizeLimit )
       doc.setContents( (new StringBuffer( doc.getContents().substring(0, contentSizeLimit ) ) ) );
 
View Full Code Here


  public static void main(String[] args)
  {
   //*-- get a generic document handler to extract text
   System.out.println("Started TestFilters...");
   setUp();
   Properties props = null; StarHandler sh = null;
   try { props = new Properties(); props.load(new FileInputStream(Constants.FILTER_FILE)); sh = new StarHandler(props); }
   catch (IOException e) { throw new RuntimeException("Could not read filtersFile + " + e.getMessage() ); }
 
   for (int i = 0; i < sampleFiles.length; i++)
   { String filename = sampleDir + sampleFiles[i];
     IndexableDoc doc = new IndexableDoc();
     sh.getDocument(filename, doc);
     String contents = doc.getContents().toString();
     contents = (contents.length() > 2000) ? contents.substring(0, 2000): contents;
     System.out.println("File: " + sampleFiles[i] + " has content: " + contents);
   }
   System.out.println("Finished TestFilters");
View Full Code Here

  */
public void run()
{     
  //*-- read the filters file and get a generic handler to extract text 
  fileReadTime -=  new Date().getTime();
  Properties props = null; StarHandler sh = null;
  try { props = new Properties(); props.load(new BufferedInputStream(new FileInputStream(Constants.FILTER_FILE)));
  sh = new StarHandler(props); }
  catch (IOException e) { crawlManager.cleanUp("Could not read filtersFile " + e.getMessage()); }
  if (sh == null) crawlManager.cleanUp("The generic handler to filter files was not found");

  //*-- read the document types file to classify documents into types - text, image, audio, book, letter, etc.
  Properties dprops = null;
  try { dprops = new Properties(); dprops.load(new BufferedInputStream(new FileInputStream(Constants.DOCTYPES_FILE))); }
  catch (IOException e)  { crawlManager.cleanUp("Could not read doctypesFile " + e.getMessage()); }

  //*-- read the media suffixes and associated file types properties file
  Properties mprops = new Properties();
  try { mprops.load(new BufferedInputStream(new FileInputStream( Constants.MEDIATYPES_FILE))); }
  catch ( IOException e) { crawlManager.cleanUp("Could not read media types file " + e.getMessage()); }
  fileReadTime += new Date().getTime();

  currentDoc = crawlManager.getCurrentDoc(threadNum);

  long startMemory = Runtime.getRuntime().freeMemory();
  logger.info("Free memory at start of scan: " + startMemory);

  //*-- scan the list of files and process depending on thread number
  String iDocument = ""; IndexableDoc idoc = null; boolean completed = false;
  FILELOOP: while (!completed)
  {
   //*-- distribute the load evenly
   if ( (++currentDoc % numThreads) != threadNum continue FILELOOP;

   //*-- for a restart skip documents scanned earlier
   if (currentDoc < crawlConfig.getStartPosition()) continue FILELOOP;

   //*-- terminate if all files have been processed
   if (currentDoc >= crawlManager.totalNumFiles() ) { completed = true; break FILELOOP; }

   //*-- periodically synchronize the indexes
   syncTime -= new Date().getTime();
   long freeMemory = Runtime.getRuntime().freeMemory();
   //  if (freeMemory < startMemory)  mergeIndexes(false);
   if ( ( (currentDoc % 1000) == 0) && (docsProcessed > 0) ) mergeIndexes(false);
   syncTime += new Date().getTime();          
   try
   {
    //*-- check if the document was indexed prior to the modification date of the file
    iDocument = crawlManager.getFileName(currentDoc); initDoc(iDocument, tempDoc);
    //logger.info(currentDoc + ": Started File: " + iDocument + "<----");
    if (checkIndexed(iDocument)) continue FILELOOP;

    //*-- extract the text from the document and create an indexable document
    docFilterTime -= new Date().getTime();
    sh.getDocument(iDocument, tempDoc);
    docFilterTime += new Date().getTime();

    //*-- get the signature of the document text and check for duplicates
    if (duplicateDoc()) continue FILELOOP;

View Full Code Here

TOP

Related Classes of org.sf.mustru.filters.StarHandler

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.