Package: org.sf.mustru.filters

Examples of org.sf.mustru.filters.StarHandler


{ PropertyConfigurator.configure (Constants.LOG4J_FILE)}
public void ttestPdfFile()
{
  System.out.println("Started testPdfFile");
  Properties props = null; StarHandler sh = null;
  try { props = new Properties(); props.load(new FileInputStream(Constants.FILTER_FILE)); sh = new StarHandler(props); }
  catch (IOException e) { throw new RuntimeException("Could not read filtersFile + " + e.getMessage() ); }
 
  String filename = "/home/manuk/html/akr/ebooks/MYSQLmanual-a4.pdf";
  IndexableDoc doc = new IndexableDoc();
  sh.getDocument(filename, doc);
  int contentSizeLimit = Constants.DOC_LENGTH_MAXLIMIT;
  int textSize = doc.getContents().length();
  if (textSize > contentSizeLimit )
       doc.setContents( (new StringBuffer( doc.getContents().substring(0, contentSizeLimit ) ) ) );
 
View Full Code Here


  public static void main(String[] args)
  {
   //*-- get a generic document handler to extract text
   System.out.println("Started TestFilters...");
   setUp();
   Properties props = null; StarHandler sh = null;
   try { props = new Properties(); props.load(new FileInputStream(Constants.FILTER_FILE)); sh = new StarHandler(props); }
   catch (IOException e) { throw new RuntimeException("Could not read filtersFile + " + e.getMessage() ); }
 
   for (int i = 0; i < sampleFiles.length; i++)
   { String filename = sampleDir + sampleFiles[i];
     IndexableDoc doc = new IndexableDoc();
     sh.getDocument(filename, doc);
     String contents = doc.getContents().toString();
     contents = (contents.length() > 2000) ? contents.substring(0, 2000): contents;
     System.out.println("File: " + sampleFiles[i] + " has content: " + contents);
   }
   System.out.println("Finished TestFilters");
View Full Code Here

  */
public void run()
{     
  //*-- read the filters file and get a generic handler to extract text 
  fileReadTime -=  new Date().getTime();
  Properties props = null; StarHandler sh = null;
  try { props = new Properties(); props.load(new BufferedInputStream(new FileInputStream(Constants.FILTER_FILE)));
  sh = new StarHandler(props); }
  catch (IOException e) { crawlManager.cleanUp("Could not read filtersFile " + e.getMessage()); }
  if (sh == null) crawlManager.cleanUp("The generic handler to filter files was not found");

  //*-- read the document types file to classify documents into types - text, image, audio, book, letter, etc.
  Properties dprops = null;
  try { dprops = new Properties(); dprops.load(new BufferedInputStream(new FileInputStream(Constants.DOCTYPES_FILE))); }
  catch (IOException e)  { crawlManager.cleanUp("Could not read doctypesFile " + e.getMessage()); }

  //*-- read the media suffixes and associated file types properties file
  Properties mprops = new Properties();
  try { mprops.load(new BufferedInputStream(new FileInputStream( Constants.MEDIATYPES_FILE))); }
  catch ( IOException e) { crawlManager.cleanUp("Could not read media types file " + e.getMessage()); }
  fileReadTime += new Date().getTime();

  currentDoc = crawlManager.getCurrentDoc(threadNum);

  long startMemory = Runtime.getRuntime().freeMemory();
  logger.info("Free memory at start of scan: " + startMemory);

  //*-- scan the list of files and process depending on thread number
  String iDocument = ""; IndexableDoc idoc = null; boolean completed = false;
  FILELOOP: while (!completed)
  {
   //*-- distribute the load evenly
   if ( (++currentDoc % numThreads) != threadNum continue FILELOOP;

   //*-- for a restart skip documents scanned earlier
   if (currentDoc < crawlConfig.getStartPosition()) continue FILELOOP;

   //*-- terminate if all files have been processed
   if (currentDoc >= crawlManager.totalNumFiles() ) { completed = true; break FILELOOP; }

   //*-- periodically synchronize the indexes
   syncTime -= new Date().getTime();
   long freeMemory = Runtime.getRuntime().freeMemory();
   //  if (freeMemory < startMemory)  mergeIndexes(false);
   if ( ( (currentDoc % 1000) == 0) && (docsProcessed > 0) ) mergeIndexes(false);
   syncTime += new Date().getTime();          
   try
   {
    //*-- check if the document was indexed prior to the modification date of the file
    iDocument = crawlManager.getFileName(currentDoc); initDoc(iDocument, tempDoc);
    //logger.info(currentDoc + ": Started File: " + iDocument + "<----");
    if (checkIndexed(iDocument)) continue FILELOOP;

    //*-- extract the text from the document and create an indexable document
    docFilterTime -= new Date().getTime();
    sh.getDocument(iDocument, tempDoc);
    docFilterTime += new Date().getTime();

    //*-- get the signature of the document text and check for duplicates
    if (duplicateDoc()) continue FILELOOP;

View Full Code Here

TOP

Related Classes of org.sf.mustru.filters.StarHandler

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.