package org.sf.mustru.filters;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FilePermission;
import java.io.IOException;
import java.io.StringWriter;
import java.io.PrintWriter;
import org.sf.mustru.docs.IndexableDoc;
//import org.sf.mustru.docs.TextDoc;
import org.sf.mustru.utils.*;
import org.apache.log4j.Logger;
import org.apache.poi.hdf.extractor.WordDocument;
/**
 * Handler that extracts text from Microsoft Word documents. The Apache POI
 * classes are tried first; if they fail or produce no text, the external
 * antiword program is run through a generated script; if that also yields
 * nothing, the document is handed to the plain text handler.
 */
public class DocHandler implements HandlerInterface
{
  static Logger logger = Logger.getLogger(DocHandler.class.getName());

  /**
   * Empty constructor.
   */
  public DocHandler() { super(); }

  /**
   * Extract the text of a Word document and store it in the passed document.
   * The POI classes are tried first, then antiword, and finally the plain
   * text handler when no text could be extracted at all.
   *
   * @param ifile The name of the file to be indexed
   * @param doc   The document that receives the contents, file type and file name
   */
  public void getDocument(String ifile, IndexableDoc doc)
  {
    String bodyText = null;
    try
    {
      logger.info("Using POI classes to extract text from Word document: " + ifile);
      FileInputStream in = new FileInputStream( new File(ifile) );
      try
      {
        WordDocument wd = new WordDocument(in);
        StringWriter docTextWriter = new StringWriter();
        wd.writeAllText(new PrintWriter(docTextWriter));
        docTextWriter.close();
        bodyText = StringTools.filterChars(docTextWriter.toString());
      }
      finally
      {
        //*-- always release the file handle, and do it before antiword reads the same file
        try { in.close(); }
        catch (IOException ie)
        { logger.warn("Could not close " + ifile + " " + ie.getMessage()); }
      }

      //*-- if no text was extracted, try antiword
      if (bodyText.length() == 0) bodyText = tryAntiword(ifile);
    }
    catch (OutOfMemoryError exc)
    {
      //*-- deliberate best effort: a huge or corrupt file should not abort the indexing run
      logger.error("Ran out of memory for " + ifile + " or could be corrupt file " + exc.getMessage());
      bodyText = tryAntiword(ifile);
    }
    catch (Exception e)
    {
      logger.warn("Could not extract text with POI from " + ifile + " " + e.getMessage());
      bodyText = tryAntiword(ifile);
    }

    //*-- as a last resort, try the plain text handler; a missing (null) text is
    //*-- treated like an empty one instead of failing in the StringBuffer constructor
    if ( (bodyText == null) || (bodyText.length() == 0) )
    { new TxtHandler().getDocument(ifile, doc); return; }

    //*-- text that came from antiword has not been filtered yet
    bodyText = StringTools.filterChars(bodyText);
    doc.setContents( new StringBuffer(bodyText) );
    doc.setFileType("text");
    doc.setFileName(ifile);
  }

  /**
   * Run antiword to try and extract text from the Word document. The method
   * is static synchronized because every call rewrites the same script file.
   * The generated script is left on disk and overwritten by the next call.
   *
   * @param ifile Word file
   * @return String Extracted text, or an empty string if the script could not
   *         be written or no text was returned
   */
  private static synchronized String tryAntiword(String ifile)
  {
    String outfile = Constants.ANTIWORDDIR + File.separator + "run_antiword";

    //*-- never run a script left over from an earlier document: it would
    //*-- return the text of the wrong file
    if (!createBatchFile(ifile, outfile))
    {
      logger.warn("Antiword was not run for " + ifile + " because its script could not be written");
      return "";
    }

    //*-- the command is the script name without extension, as before
    //*-- NOTE(review): presumably ExecProgram resolves the .sh/.bat extension - confirm
    String[] cmdline = {outfile};
    String bodyText = ExecProgram.runProgram(cmdline);
    if (bodyText == null) bodyText = "";

    if (bodyText.length() > 0)
    { logger.info("Antiword did extract text from " + ifile); }
    else
    { logger.warn("Antiword did not extract text from " + ifile); }
    return(bodyText);
  }

  /**
   * Write the script that runs antiword on the passed file: a shell script
   * (outfile + ".sh") when Constants.OSNAME ends with "x", otherwise a
   * Windows batch file (outfile + ".bat").
   *
   * @param ifile   Name of MS word file to be indexed
   * @param outfile Name of the output batch file, without extension
   * @return true if the script was written completely, false otherwise
   */
  private static boolean createBatchFile(String ifile, String outfile)
  {
    boolean shellScript = Constants.OSNAME.endsWith("x");
    File script = new File(outfile + (shellScript ? ".sh" : ".bat"));
    boolean written = false;
    PrintWriter pw = null; FileOutputStream fos = null;
    try
    {
      fos = new FileOutputStream(script);
      pw = new PrintWriter(fos);

      //*-- create the Linux script
      if (shellScript)
      {
        pw.println("#!/bin/sh");
        pw.println("#*-- Generated Linux script to run antiword");
        pw.println("export HOME=\"" + Constants.ANTIWORDDIR + "\"");
        pw.println("cd \"$HOME\"");
        pw.println("antiword " + quoteForShell(ifile));
      }
      //*-- create the windows script
      else
      {
        pw.println("@ECHO OFF");
        pw.println("REM *-- Generated Windows script to run antiword");
        pw.println("set HOME=" + Constants.ANTIWORDDIR);
        pw.println("set PATH=" + Constants.CYGWINDIR + ";%PATH%");
        //*-- a single % in a batch file starts a variable reference, so double it
        String winfile = ifile.replace('/', '\\').replace("%", "%%");
        pw.println("%HOME%\\antiword.exe \"" + winfile + "\"");
      }

      //*-- PrintWriter swallows write errors; checkError flushes and reports them
      written = !pw.checkError();
      if (!written)
      { logger.error("Could not write antiword batch file " + script.getPath()); }
    }
    catch (IOException ie)
    { logger.error("Could not create antiword batch file: " + ie.getMessage(), ie); }
    finally
    {
      //*-- closing the writer also closes the underlying stream
      if (pw != null) { pw.close(); }
      else if (fos != null)
      {
        try { fos.close(); }
        catch (IOException ie)
        { logger.error("Could not close antiword batch file " + script.getPath(), ie); }
      }
    }

    //*-- actually grant read and execute on the script; constructing a
    //*-- FilePermission object does not change the mode of the file
    if (written && !(script.setReadable(true) && script.setExecutable(true)))
    { logger.warn("Could not make antiword batch file executable: " + script.getPath()); }

    return written;
  }

  /**
   * Quote a value for use as one argument in a /bin/sh command line.
   * The value is wrapped in single quotes and every embedded single quote is
   * written as '\'' so that no character is interpreted by the shell.
   *
   * @param arg Value to quote
   * @return String The quoted value
   */
  private static String quoteForShell(String arg)
  {
    return "'" + arg.replace("'", "'\\''") + "'";
  }
}