Package org.sf.mustru.filters

Source Code of org.sf.mustru.filters.SxwHandler$OOHandler

package org.sf.mustru.filters;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

import org.apache.log4j.Logger;
import org.sf.mustru.docs.IndexableDoc;
//import org.sf.mustru.docs.TextDoc;
import org.sf.mustru.utils.*;

/**
* A class to extract text from OpenOffice files using the Xerces XML parser.
*/
public class SxwHandler implements HandlerInterface
{
  static Logger logger = Logger.getLogger(SxwHandler.class.getName());
  StringBuffer content = new StringBuffer();

  /**
   * empty constructor
   */
  public SxwHandler() { super(); }
 
  /**
   *  Extract text from an OpenOffice file -
   *  a. Unzip the file.
   *  b. Create the contents.xml file in a directory with OpenOffice dtds.
   *  c. Use an XML parser to remove the text.
   */
  public void getDocument (String ifile, IndexableDoc doc)
  {
    //*-- fetch an unique thread code to generate an unique temporary file name
    int threadCode = Thread.currentThread().hashCode();
   
    //*-- a. unzip the OpenOffice file
    logger.info("Extracting from OpenOffice file " + ifile);
 
    ZipFile zFile;
    try
     { zFile = new ZipFile(new File(ifile)); }
    catch (IOException e)
     { logger.error("Could not open OpenOffice file " + ifile + " " + e.getMessage() );
       return; }

    //*-- b. Extract the content.xml file and write to a file
    ZipEntry zEntry = zFile.getEntry("content.xml");
    InputStream xmlStream = null; PrintWriter outp = null;
    BufferedReader iReader = null;
    String outfile = "";
    try
     {
      //*-- create an input stream for the XML file
      xmlStream = zFile.getInputStream(zEntry);
      iReader = new BufferedReader( new InputStreamReader(xmlStream, "UTF-8") );
     
      //*-- generate the output file name and dump the XML contents
      String iline; outfile = Constants.OFFICEDIR + File.separator + "TEMP_content_" + threadCode + ".xml";
     
      outp = new PrintWriter(new FileWriter(outfile));
      while ( (iline = iReader.readLine()) != null ) { outp.println(iline); }
      outp.flush();
     }
    catch (IOException e)
     { logger.error("Could not read text from OpenOffice file: " + ifile + " " + e.getMessage()); }
    finally
     {
      if (outp != null) outp.close();  
      try
       { if (iReader != null) iReader.close();
         if (xmlStream != null) xmlStream.close()
         if (zFile != null) zFile.close();    
       }
      catch (IOException exc) { logger.error("Ignore error"); }
     }

    //*-- parse the content.xml file with the SAXParser
    try
    {
     XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
     parser.setContentHandler(new OOHandler());
     parser.parse(outfile);
    }
    catch (OutOfMemoryError oe) { logger.error("Ran out of memory for " + outfile + " or could be corrupt file " + oe.getMessage()); }
    catch (SAXException se) { logger.error("Could not parse XML file" + outfile + " " + se.getMessage()); }
    catch (IOException  ie) { logger.error("Could not read  XML file" + outfile + " " + ie.getMessage()); }
   
    //*-- remove the temporary content.xml file
    File tfile = new File(outfile); tfile.delete();
   
    //*-- return the contents
    String bodyText = content.toString();
    if (bodyText != null) { bodyText = StringTools.filterChars(bodyText); }
    doc.setContents ( new StringBuffer(bodyText) );
    doc.setFileType("text"); doc.setFileName(ifile);
    return;
  }
 
  /**
   * Inner class to append the text of the OpenOffice file
   */
  class OOHandler extends DefaultHandler
   {
    public void characters(char[] ch, int start, int length)
     { content.append(" "); content.append( new String(ch, start, length)); }
   }
}
TOP

Related Classes of org.sf.mustru.filters.SxwHandler$OOHandler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.