Package org.sf.mustru.filters

Source Code of org.sf.mustru.filters.DocHandler

package org.sf.mustru.filters;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FilePermission;
import java.io.IOException;
import java.io.StringWriter;
import java.io.PrintWriter;

import org.sf.mustru.docs.IndexableDoc;
//import org.sf.mustru.docs.TextDoc;
import org.sf.mustru.utils.*;

import org.apache.log4j.Logger;
import org.apache.poi.hdf.extractor.WordDocument;

/**
* Class to extract text from Microsoft Word documents. First try using the Apache
* POI classes, if that fails, then try antiword.
*/
public class DocHandler implements HandlerInterface
{

static Logger logger = Logger.getLogger(DocHandler.class.getName());

/**
  * empty constructor
  */
public DocHandler() { super(); }

/**
  *  Try extracting the text from the Word document using the POI classes.
  *  If that fails, then try antiword
  * 
  *  @param ifile The name of the file to be indexed
  */
public void getDocument(String ifile, IndexableDoc doc)
{
  String bodyText = null;
  try
  { logger.info("Using POI classes to extract text from Word document: " + ifile);
  WordDocument wd = new WordDocument( new FileInputStream( new File(ifile) ) );
  StringWriter docTextWriter = new StringWriter();
  wd.writeAllText(new PrintWriter(docTextWriter));   
  docTextWriter.close();

  //*-- if no text was extracted, try antiword
  bodyText = docTextWriter.toString(); bodyText = StringTools.filterChars(bodyText);
  if (bodyText.length() == 0) bodyText = tryAntiword(ifile);

  }
  catch (OutOfMemoryError exc)
  { logger.error("Ran out of memory for " + ifile + " or could be corrupt file " + exc.getMessage());
  bodyText = tryAntiword(ifile); }
  catch (Exception e)
  { logger.warn("Could not extract text with POI from " + ifile + " " + e.getMessage());
  bodyText = tryAntiword(ifile); }


  //*-- as a last resort, try the plain text handler
  if ( (bodyText != null) && (bodyText.length() == 0) )
  { new TxtHandler().getDocument(ifile, doc); return}

  //TextDoc doc = new TextDoc(ifile); 
  bodyText = StringTools.filterChars(bodyText);
  doc.setContentsnew StringBuffer(bodyText) );
  doc.setFileType("text");
  doc.setFileName(ifile);

  return;
}

/**
  * Run antiword to try and extract text from the Word document
  * @param ifile Word file
  * @return String Extracted text
  */
private static synchronized String tryAntiword(String ifile)
{
  String outfile = Constants.ANTIWORDDIR + File.separator + "run_antiword";
  createBatchFile(ifile, outfile);
  String[] cmdline = {outfile};
  String bodyText = ExecProgram.runProgram(cmdline);
  if (bodyText.length() > 0)
  { logger.info("Antiword did extract text from " + ifile)}
  else
  { logger.warn("Antiword did not extract text from " + ifile); }

  //*-- remove the temporary file created
  //   outfile += (Constants.OSNAME.endsWith("x")) ?  ".sh": ".bat";
  //   File tfile = new File(outfile); tfile.delete();

  return(bodyText);
}
/**
  * @param ifile Name of MS word file to be indexed
  * @param outfile Name of the output batch file
  */
private static void createBatchFile(String ifile, String outfile)
{
  String permissions = "read,execute";
  PrintWriter pw = null; FileOutputStream fos = null;
  try
  {
   //*-- create the Linux script
   if (Constants.OSNAME.endsWith("x"))
   {
    String tfile = outfile + ".sh";
    fos = new FileOutputStream(new File(tfile));
    new FilePermission(tfile, permissions);
    pw = new PrintWriter(fos);
    pw.println("#!/bin/sh");
    pw.println("#*-- Generated Linux script to run antiword");
    pw.println("export HOME=\"" + Constants.ANTIWORDDIR + "\"");
    pw.println("cd $HOME");
    pw.println("antiword \"" + ifile + "\"");
   }
   //*-- create the windows script
   else
   {
    String tfile = outfile + ".bat";
    fos = new FileOutputStream(new File(tfile));
    new FilePermission(tfile, permissions);
    pw = new PrintWriter(fos);
    pw.println("@ECHO OFF");
    pw.println("REM *-- Generated Windows script to run antiword");
    pw.println("set HOME=" + Constants.ANTIWORDDIR);
    pw.println("set PATH=" + Constants.CYGWINDIR + ";%PATH%");
    ifile = ifile.replace('/''\\');
    pw.println("%HOME%\\antiword.exe \"" + ifile + "\"");     
   }
  }
  catch (IOException ie)
  { logger.error("Could not create antiword batch file" + ie.getMessage()); }
  finally
  { try { if (pw != null) pw.close();
  if (fos != null)  { fos.flush(); fos.close(); }
  }
  catch (IOException ie) { logger.error("Ignore error"); } }
}
}
TOP

Related Classes of org.sf.mustru.filters.DocHandler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.