Package org.apache.lenya.lucene.index

Source Code of org.apache.lenya.lucene.index.ConfigurableDocumentCreator

/*
* ConfigurableDocumentCreator.java
*
* Created on 17 March 2003, 15:01
*/

package org.apache.lenya.lucene.index;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.lang.reflect.Method;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.apache.lenya.lucene.parser.HTMLParser;
import org.apache.lenya.lucene.parser.HTMLParserFactory;
import org.apache.lenya.lucene.parser.StringCleaner;
import org.apache.lenya.xml.DocumentHelper;
import org.apache.lenya.xml.NamespaceHelper;
import org.xml.sax.InputSource;

/**
*
* @author  hrt
*/
public class ConfigurableDocumentCreator
    extends AbstractDocumentCreator {
   
    public static final String LUCENE_NAMESPACE = "http://www.wyona.org/2003/lucene";
    public static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
   
    public ConfigurableDocumentCreator(String stylesheet) {
        this.stylesheet = stylesheet;
    }
   
    private String stylesheet;
   
    public String getStylesheet() {
        return stylesheet;
    }
   
    public Document getDocument(File file, File htdocsDumpDir)
        throws Exception {
           
        // System.out.println(getClass().getName() + ": indexing " + file.getAbsolutePath());
       
        try {
           
            // transform source document into lucene document
           
            NamespaceHelper documentHelper = new NamespaceHelper(XHTML_NAMESPACE, "xhtml", "html");
            org.w3c.dom.Document sourceDocument = documentHelper.getDocument();
            Element rootNode = sourceDocument.getDocumentElement();
           
            String bodyText = getBodyText(file);
            Element bodyElement = documentHelper.createElement("body", bodyText);
            rootNode.appendChild(bodyElement);
           
            DOMSource documentSource = new DOMSource(sourceDocument);
            Writer documentWriter = new StringWriter();
           
            TransformerFactory tFactory = TransformerFactory.newInstance();
            Transformer documentTransformer = tFactory.newTransformer(new StreamSource(new StringReader(getStylesheet())));
            documentTransformer.setOutputProperty(OutputKeys.INDENT, "yes");
            documentTransformer.setOutputProperty(OutputKeys.ENCODING, "ISO-8859-1");
           
            String fileName = file.getName();
            if (fileName.endsWith(".pdf.txt")) {
                fileName = fileName.substring(0, fileName.lastIndexOf(".txt"));
            }
           
            documentTransformer.setParameter("filename", fileName);
            documentTransformer.transform(documentSource, new StreamResult(documentWriter));
           
            dumpLuceneDocument(file, documentWriter);
           
            DocumentBuilder builder = DocumentHelper.createBuilder();
            org.w3c.dom.Document luceneDocument
                = builder.parse(new InputSource(new StringReader(documentWriter.toString())));
           
            NamespaceHelper helper = new NamespaceHelper(LUCENE_NAMESPACE, "luc", luceneDocument);
            Element root = luceneDocument.getDocumentElement();
            Element fieldElements[] = helper.getChildren(root, "field");
           
            Document document = super.getDocument(file, htdocsDumpDir);

            Class parameterTypes[] = { String.class, String.class };
           
            for (int i = 0; i < fieldElements.length; i++) {
                String name = fieldElements[i].getAttribute("name");
                String type = fieldElements[i].getAttribute("type");
                String text = getText(fieldElements[i]);
               
                Method method = Field.class.getMethod(type, parameterTypes);
               
                String args[] = { name, text };
               
                Field field = (Field) method.invoke(null, args);
                document.add(field);
                //System.out.println("Adding field of type " + type +": " + name + " = " + text);
            }
           
            return document;
        }
        catch (Exception e) {
            throw e;
        }
    }

    /**
     * Writes the lucene XML document to a file.
     */
    protected static void dumpLuceneDocument(File file, Writer writer)
            throws IOException {
        File luceneDocumentFile = new File(file.getAbsolutePath() + ".xml");
        luceneDocumentFile.createNewFile();
        FileWriter fileWriter = new FileWriter(luceneDocumentFile);
        fileWriter.write(writer.toString());
        fileWriter.close();
    }
           
    public static String getText(Node node) {
        StringBuffer result = new StringBuffer();
        if (!node.hasChildNodes())
            return "";

        NodeList list = node.getChildNodes();
        for (int i = 0; i < list.getLength(); i++) {
            Node subnode = list.item(i);
            if (subnode.getNodeType() == Node.TEXT_NODE) {
                result.append(subnode.getNodeValue());
            }
            else if (subnode.getNodeType() ==
                Node.CDATA_SECTION_NODE) {
                result.append(subnode.getNodeValue());
            }
            else if (subnode.getNodeType() ==
                Node.ENTITY_REFERENCE_NODE) {
              // Recurse into the subtree for text
              // (and ignore comments)
              result.append(getText(subnode));
            }
        }
        return result.toString();
    }
   
    public static String getBodyText(File file) throws Exception {
       
        HTMLParser parser = HTMLParserFactory.newInstance(file);
        parser.parse(file);
        Reader reader = parser.getReader();
        Writer writer = new StringWriter();
       
        int c;
        while ((c = reader.read()) != -1)
           writer.write(c);
       
        String content = writer.toString();
        reader.close();
        writer.close();
       
        content = StringCleaner.clean(content);
       
        return content;
    }
   
}
TOP

Related Classes of org.apache.lenya.lucene.index.ConfigurableDocumentCreator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.