Source Code of org.apache.lenya.lucene.index.ConfigurableDocumentCreator

/*
 * Copyright  1999-2004 The Apache Software Foundation
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 */


/* $Id: ConfigurableDocumentCreator.java,v 1.11 2004/05/16 23:23:13 michi Exp $  */


package org.apache.lenya.lucene.index;


import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.lang.reflect.Method;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

import org.apache.lenya.lucene.parser.HTMLParser;
import org.apache.lenya.lucene.parser.HTMLParserFactory;
import org.apache.lenya.lucene.parser.StringCleaner;
import org.apache.lenya.xml.DocumentHelper;
import org.apache.lenya.xml.NamespaceHelper;
import org.apache.log4j.Category;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;


/**
 * Uses XSLT to transform a XML into a Lucene document
 */
public class ConfigurableDocumentCreator extends AbstractDocumentCreator {
    Category log = Category.getInstance(ConfigurableDocumentCreator.class);
  
    public static final String LUCENE_NAMESPACE = "http://apache.org/cocoon/lenya/lucene/1.0";
    public static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";


    /**
     * Creates a new ConfigurableDocumentCreator object.
     *
     * @param stylesheet DOCUMENT ME!
     */
    public ConfigurableDocumentCreator(String stylesheet) {
        this.stylesheet = stylesheet;
    }


    private String stylesheet;


    /**
     * DOCUMENT ME!
     *
     * @return DOCUMENT ME!
     */
    public String getStylesheet() {
        return stylesheet;
    }


    /**
     * Transform source document into lucene document and generate a Lucene Document instance
     *
     * @param file DOCUMENT ME!
     * @param htdocsDumpDir DOCUMENT ME!
     *
     * @return DOCUMENT ME!
     *
     * @throws Exception DOCUMENT ME!
     */
    public Document getDocument(File file, File htdocsDumpDir) throws Exception {
        log.debug(".getDocument() : indexing " + file.getAbsolutePath());
        try {


            org.w3c.dom.Document sourceDocument = null;
            DocumentBuilderFactory parserFactory = DocumentBuilderFactory.newInstance();
            parserFactory.setValidating(false);
            parserFactory.setNamespaceAware(true);
            parserFactory.setIgnoringElementContentWhitespace(true);
            DocumentBuilder mybuilder = parserFactory.newDocumentBuilder();
            sourceDocument = mybuilder.parse(file.getAbsolutePath());




// FIXME: What is this good for: <?xml version="1.0"?><body>...</body>
/*
            NamespaceHelper documentHelper = new NamespaceHelper(XHTML_NAMESPACE, "xhtml", "html");
            org.w3c.dom.Document sourceDocument = documentHelper.getDocument();


            Element rootNode = sourceDocument.getDocumentElement();


            String bodyText = getBodyText(file);
            Element bodyElement = documentHelper.createElement("body", bodyText);
            rootNode.appendChild(bodyElement);
*/








            DOMSource documentSource = new DOMSource(sourceDocument);
            Writer documentWriter = new StringWriter();


            TransformerFactory tFactory = TransformerFactory.newInstance();
            Transformer documentTransformer = tFactory.newTransformer(new StreamSource(new StringReader(getStylesheet())));
            documentTransformer.setOutputProperty(OutputKeys.INDENT, "yes");
            documentTransformer.setOutputProperty(OutputKeys.ENCODING, "ISO-8859-1");


            String fileName = file.getName();


            if (fileName.endsWith(".pdf.txt")) {
                fileName = fileName.substring(0, fileName.lastIndexOf(".txt"));
            }


            documentTransformer.setParameter("filename", fileName);
            documentTransformer.transform(documentSource, new StreamResult(documentWriter));


            // DEBUG: debug lucene documents
            //dumpLuceneDocument(file, documentWriter);


            DocumentBuilder builder = DocumentHelper.createBuilder();
            org.w3c.dom.Document luceneDocument = builder.parse(new InputSource(new StringReader(documentWriter.toString())));


            NamespaceHelper helper = new NamespaceHelper(LUCENE_NAMESPACE, "luc", luceneDocument);
            Element root = luceneDocument.getDocumentElement();
            Element[] fieldElements = helper.getChildren(root, "field");


            Document document = super.getDocument(file, htdocsDumpDir);


            Class[] parameterTypes = { String.class, String.class };


            for (int i = 0; i < fieldElements.length; i++) {
                String name = fieldElements[i].getAttribute("name");
                String type = fieldElements[i].getAttribute("type");
                String text = getText(fieldElements[i]);


                Method method = Field.class.getMethod(type, parameterTypes);


                String[] args = { name, text };


                Field field = (Field) method.invoke(null, args);
                document.add(field);


            }


            return document;
        } catch (Exception e) {
            throw e;
        }
    }


    /**
     * Writes the lucene XML document to a file.
     */
    protected void dumpLuceneDocument(File file, Writer writer) throws IOException {
        log.debug(".dumpLuceneDocument(): Dump document: " + file.getAbsolutePath());


        File luceneDocumentFile = new File(file.getAbsolutePath() + ".xluc");
        luceneDocumentFile.createNewFile();


        FileWriter fileWriter = new FileWriter(luceneDocumentFile);
        fileWriter.write(writer.toString());
        fileWriter.close();
    }


    /**
     * DOCUMENT ME!
     *
     * @param node DOCUMENT ME!
     *
     * @return DOCUMENT ME!
     */
    public static String getText(Node node) {
        StringBuffer result = new StringBuffer();


        if (!node.hasChildNodes()) {
            return "";
        }


        NodeList list = node.getChildNodes();


        for (int i = 0; i < list.getLength(); i++) {
            Node subnode = list.item(i);


            if (subnode.getNodeType() == Node.TEXT_NODE) {
                result.append(subnode.getNodeValue());
            } else if (subnode.getNodeType() == Node.CDATA_SECTION_NODE) {
                result.append(subnode.getNodeValue());
            } else if (subnode.getNodeType() == Node.ENTITY_REFERENCE_NODE) {
                // Recurse into the subtree for text
                // (and ignore comments)
                result.append(getText(subnode));
            }
        }


        return result.toString();
    }


    /**
     * DOCUMENT ME!
     *
     * @param file DOCUMENT ME!
     *
     * @return DOCUMENT ME!
     *
     * @throws Exception DOCUMENT ME!
     */
    public static String getBodyText(File file) throws Exception {
        HTMLParser parser = HTMLParserFactory.newInstance(file);
        parser.parse(file);


        Reader reader = parser.getReader();
        Writer writer = new StringWriter();


        int c;


        while ((c = reader.read()) != -1)
            writer.write(c);


        String content = writer.toString();
        reader.close();
        writer.close();


        content = StringCleaner.clean(content);


        return content;
    }
}
Source Code of org.apache.lenya.lucene.index.ConfigurableDocumentCreator

Related Classes of org.apache.lenya.lucene.index.ConfigurableDocumentCreator