Package net.sf.regain.crawler.preparator

Source Code of net.sf.regain.crawler.preparator.JacobMsWordPreparator

/*
* regain - A file search engine providing plenty of formats
* Copyright (C) 2004  Til Schneider
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*
* Contact: Til Schneider, info@murfman.de
*
* CVS information:
*  $RCSfile$
*   $Source$
*     $Date: 2008-10-25 18:35:21 +0200 (Sa, 25 Okt 2008) $
*   $Author: thtesche $
* $Revision: 349 $
*/
package net.sf.regain.crawler.preparator;

import java.util.HashSet;
import java.util.Map;

import net.sf.regain.RegainException;
import net.sf.regain.RegainToolkit;
import net.sf.regain.crawler.config.PreparatorConfig;
import net.sf.regain.crawler.document.RawDocument;

import org.apache.log4j.Logger;

import com.jacob.com.ComFailException;
import com.jacob.com.ComThread;
import com.jacob.com.Dispatch;
import com.jacob.com.Variant;

import de.filiadata.lucene.spider.generated.msoffice2000.word.Application;
import de.filiadata.lucene.spider.generated.msoffice2000.word.Document;
import de.filiadata.lucene.spider.generated.msoffice2000.word.Documents;
import de.filiadata.lucene.spider.generated.msoffice2000.word.GroupShapes;
import de.filiadata.lucene.spider.generated.msoffice2000.word.HeaderFooter;
import de.filiadata.lucene.spider.generated.msoffice2000.word.Paragraph;
import de.filiadata.lucene.spider.generated.msoffice2000.word.Paragraphs;
import de.filiadata.lucene.spider.generated.msoffice2000.word.Section;
import de.filiadata.lucene.spider.generated.msoffice2000.word.Sections;
import de.filiadata.lucene.spider.generated.msoffice2000.word.Selection;
import de.filiadata.lucene.spider.generated.msoffice2000.word.Shape;
import de.filiadata.lucene.spider.generated.msoffice2000.word.Shapes;
import de.filiadata.lucene.spider.generated.msoffice2000.word.WdHeaderFooterIndex;

/**
* Präpariert ein Microsoft-Word-Dokument für die Indizierung mit Hilfe der
* <a href="http://danadler.com/jacob/">Jacob-API</a>, wobei
* <a href="http://www.bigatti.it/projects/jacobgen/">Jacobgen</a>
* genutzt wurde, um den Zugriff zu erleichtern.
* <p>
* Dabei werden die Rohdaten des Dokuments von Formatierungsinformation befreit,
* es wird der Titel extrahiert.
*
* @author Til Schneider, www.murfman.de
*/
public class JacobMsWordPreparator extends AbstractJacobMsOfficePreparator {

  /** The logger for this class */
  private static Logger mLog = Logger.getLogger(JacobMsWordPreparator.class);

  /**
   * The word application. Is <code>null</code> as long as no document was
   * processed.
   */
  private Application mWordApplication;
 
  /**
   * The word style names (style == format template) that are used by paragraphs
   * holding a headline. Is <code>null</code> if no headline styles were
   * configured.
   */
  private HashSet mHeadlineStyleNameSet;
 

  /**
   * Creates a new instance of JacobMsPowerPointPreparator.
   *
   * @throws RegainException If creating the preparator failed.
   */
  public JacobMsWordPreparator() throws RegainException {
    super(new String[] { "doc", "dot" });
  }


  /**
   * Initializes the preparator.
   *
   * @param config The configuration
   * @throws RegainException If the configuration has an error.
   */
  public void init(PreparatorConfig config) throws RegainException {
    super.init(config);
   
    Map main = config.getSectionWithName("main");
    if (main != null) {
      String headlineStyles = (String) main.get("headlineStyles");
      if (headlineStyles != null) {
        String[] styleArr = RegainToolkit.splitString(headlineStyles, ";", true);
        mHeadlineStyleNameSet = new HashSet();
        for (int i = 0; i < styleArr.length; i++) {
          mHeadlineStyleNameSet.add(styleArr[i]);
        }
      }
    }
  }


  /**
   * Präpariert ein Dokument für die Indizierung.
   *
   * @param rawDocument Das zu pr�pariernde Dokument.
   *
   * @throws RegainException Wenn die Pr�paration fehl schlug.
   */
  public void prepare(RawDocument rawDocument) throws RegainException {
    if (mWordApplication == null) {
      // COM-Thread initialisieren
      ComThread.InitSTA();

      // Neue Word-Applikation erstellen
      mLog.info("Starting MS Word");
      mWordApplication = new Application();

      // Word unsichtbar machen
      // mWordApplication.setVisible(false);
      Dispatch.put(mWordApplication, "Visible", new Variant(false));
    }

    try {
      // Dokument �ffnen (Bei Konvertierung nicht fragen und Read only)
      // Workaround: Wenn das Dokument von einer anderen Person bearbeitet wird,
      //             dann erscheint ein Popup. Um das zu verhindern, wird in
      //             jedem Fall (auch bei file-Dokumenten) anstatt der
      //             Originaldatei eine tempor�re Kopie genutzt, da diese
      //             unm�glich von jemandem bearbeitet werden kann.
      String fileName = rawDocument.getContentAsFile(true).getAbsolutePath();
      Documents docs = mWordApplication.getDocuments();
      Document doc = docs.open(new Variant(fileName),
                               new Variant(false),    // confirmConversions
                               new Variant(true));    // readOnly

      // iterate through the sections
      StringBuffer content = new StringBuffer(DEFAULT_BUFFER_SIZE);
      Sections sections = doc.getSections();
      for (int i = 1; i <= sections.getCount(); i++) {
        Section sec = sections.item(i);

        // Get the header of the first section as title
        if (i == 1) {
          int headerFirstPage = WdHeaderFooterIndex.wdHeaderFooterFirstPage;
          HeaderFooter firstHeader = sec.getHeaders().item(headerFirstPage);
          String title = firstHeader.getRange().getText();
          setTitle(title);
        }

        // Get the text
        sec.getRange().select();
        content.append(getSelection(mWordApplication) + "\n");
      }

      // iterate through the shapes
      Shapes shapes = doc.getShapes();
      for (int i = 1; i <= shapes.getCount(); i++) {
        Shape shape = shapes.item(new Variant(i));
        appendShape(shape, content);
      }
     
      // iterate through the paragraphs and extract the headlines
      StringBuffer headlines = null;
      if ((mHeadlineStyleNameSet != null) && (! mHeadlineStyleNameSet.isEmpty())) {
        Paragraphs paragraphs = doc.getParagraphs();
        for (int i = 1; i <= paragraphs.getCount(); i++) {
          Paragraph paragraph = paragraphs.item(i);
         
          // Get the name of the style for this paragraph
          // NOTE: See the Style class for getting other values from the style
          Object styleDispatch = paragraph.getFormat().getStyle().getDispatch();
          String formatName = Dispatch.get(styleDispatch, "NameLocal").toString();
         
          if (mHeadlineStyleNameSet.contains(formatName)) {
            // This paragraph is a headline -> add it to the headlines StringBuffer
           
            // Extract the text
            paragraph.getRange().select();
            String text = getSelection(mWordApplication);
            text = removeBinaryStuff(text);
           
            // Add it to the headlines
            if (headlines == null) {
              headlines = new StringBuffer();
            }
            headlines.append(text + "\n");
           
            if (mLog.isDebugEnabled()) {
              mLog.debug("Extracted headline: '" + text + "'");
            }
          }
        }
      }
     
      // Read the document properties
      readProperties(doc);
     
      // Set the extracted text and the headlines
      setCleanedContent(content.toString());
      if (headlines != null) {
        setHeadlines(headlines.toString());
      }

      // Dokument schlie�en (ohne Speichern)
      doc.close(new Variant(false));
    }
    catch (ComFailException exc) {
      throw new RegainException("Using COM failed.", exc);
    }
  }

 
  /**
   * Gets the currently selected text from a Word application.
   *
   * @param wordAppl The Word application to get the selected text from.
   * @return The currently selected text.
   */
  private String getSelection(Application wordAppl) {
    Selection sel = wordAppl.getSelection();
    // Alternative (VB): sel.moveEndWhile(?? cset:=vbCr ??, WdConstants.wdBackward);
    // Alternative (VB): Call app.ActiveDocument.Bookmarks.Item("\endofdoc").Select()
    sel.moveEnd();
    sel.copy();
    return sel.getText();
  }

 
  /**
   * Appends the text content of a shape to a StringBuffer.
   *
   * @param shape The shape to add.
   * @param buffer The buffer where to append the text
   */
  private void appendShape(Shape shape, StringBuffer buffer) {
    String shapeName = shape.getName();
    if (shapeName.startsWith("Text Box ")) {
      shape.getTextFrame().getTextRange().select();
      buffer.append(getSelection(mWordApplication) + "\n");
    }
    else if (shapeName.startsWith("Group ")) {
      GroupShapes group = shape.getGroupItems();
      for (int i = 1; i <= group.getCount(); i++) {
        Shape child = group.item(new Variant(i));
        appendShape(child, buffer);
      }
    }
  }
 
 
  /**
   * Removes all characters that are less that 32 from the given String
   *
   * @param text The String where to remove the binary stuff.
   * @return The cleaned String.
   */
  private String removeBinaryStuff(String text) {
    StringBuffer newText = new StringBuffer(text.length());
   
    for (int j = 0; j < text.length(); j++) {
      char c = text.charAt(j);
      if (c >= 32) {
        newText.append(c);
      }
    }
   
    return newText.toString();
  }
 

  /**
   * Frees all resources reserved by the preparator.
   * <p>
   * Is called at the end of the crawler process after all documents were
   * processed.
   *
   * @throws RegainException If freeing the resources failed.
   */
  public void close() throws RegainException {
    if (mWordApplication != null) {
      try {
        // Word schlie�en
        mWordApplication.quit();
        mLog.info("Closed MS Word");
      }
      catch (Throwable thr) {
        throw new RegainException("Using COM failed.", thr);
      }
      finally {
        // Alle Ressourcen des COM-Threads freigeben
        ComThread.Release();
      }
    }
  }


  /* VB source code

  'ObjectWord ist das Word-Object
  If objWord Is Nothing Then
    objWord = New Word.Application
  End If

  'Wenn Du keinen With-Block machst mu�t Du bei VB überall das objWord
  'vornedran schreiben! So reicht ein "."
  With objWord

    '### Word Sichtbar/unsichtbar ###
    .Visible = False

    '### Oeffnen des Dokuments ###
    .Documents.Open("Dokumentenname").Activate()

    '### Header+footer kopieren Anfang ###

    '### Ueberschrift kopieren header ###
    .Documents.Item(("Dokumentenname").Activate()
    strUeberschrift = .ActiveDocument.Sections.Item(1).Headers.Item(Word.WdHeaderFooterIndex.wdHeaderFooterFirstPage).Range.Text

    '### Hauptteil kopieren Anfang ###
    .Documents.Item(("Dokumentenname").Activate()
    .ActiveDocument.Sections.Item(1).Range.Select()
    Call .Selection.MoveEndWhile(cset:=vbCr, Count:=Word.WdConstants.wdBackward)
    .Selection.Copy()
    strVariable = .Selection.Text

    '### kopiere textfeld in fussnote ###
    .Documents.Item("Dokumentenname").Activate()
    .ActiveDocument.StoryRanges.Item(7).Select()
    strText = .Selection.Text

    '### um das ende zu markieren
    Call .ActiveDocument.Bookmarks.Item("\endofdoc").Select()

    '### zum schliessen de dokumentes - ohn zu speichern                              Call
    .Documents.Close(savechanges:=Word.WdSaveOptions.wdDoNotSaveChanges)
  */

}
 
TOP

Related Classes of net.sf.regain.crawler.preparator.JacobMsWordPreparator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.