package org.sf.mustru.filters;
//import java.io.FileInputStream;
//import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.log4j.Logger;
import org.apache.poi.hslf.extractor.QuickButCruddyTextExtractor;
import org.sf.mustru.docs.IndexableDoc;
import org.sf.mustru.utils.*;
/**
 * Handler that extracts the text of a PowerPoint file using the Apache POI
 * classes and stores it in an {@link IndexableDoc}.
 */
public class PptHandler implements HandlerInterface
{
    static Logger logger = Logger.getLogger(PptHandler.class.getName());

    /**
     * Empty constructor.
     */
    public PptHandler() { super(); }

    /**
     * Extract the text from a PowerPoint file and store the plain text contents,
     * the file type ("slide") and the file name in the passed document.
     *
     * Extraction is best effort: if the file cannot be read or parsed, the
     * problem is logged and the document contents are set to an empty buffer.
     *
     * @param ifile name of the PowerPoint file to read
     * @param doc   document that receives the extracted contents, file type and file name
     */
    public void getDocument(String ifile, IndexableDoc doc)
    {
        String bodyText = "";
        QuickButCruddyTextExtractor qbt = null;
        try
        {
            logger.info("Extracting text from PPT file " + ifile);
            //*-- use the cruddy extractor instead of PowerPointExtractor since it seems to
            //*-- handle all kinds of PPT files
            qbt = new QuickButCruddyTextExtractor(ifile);
            bodyText = qbt.getTextAsString();
        }
        catch (OutOfMemoryError e)
        {
            //*-- pass the throwable along so that the stack trace is not lost
            logger.error("Cannot allocate memory, file may be corrupt " + ifile + " " + e.getMessage(), e);
        }
        catch (Exception e)
        {
            logger.error("Cannot extract text from a PowerPoint document " + ifile + " " + e.getMessage(), e);
        }
        finally
        {
            //*-- release the input stream opened by the extractor; a failure to close
            //*-- must not discard text that was already extracted
            if (qbt != null)
            {
                try { qbt.close(); }
                catch (Exception e)
                { logger.warn("Could not close PowerPoint extractor for " + ifile + " " + e.getMessage(), e); }
            }
        }

        if (bodyText != null) { bodyText = StringTools.filterChars(bodyText); }
        //*-- new StringBuffer((String) null) throws a NullPointerException, so
        //*-- fall back to an empty string when no text is available
        if (bodyText == null) { bodyText = ""; }

        doc.setContents(new StringBuffer(bodyText));
        doc.setFileType("slide");
        doc.setFileName(ifile);
    }
}