// Source code of org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexUpdate

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.jackrabbit.oak.plugins.index.lucene;


import static com.google.common.base.Preconditions.checkArgument;
import static org.apache.jackrabbit.oak.plugins.index.IndexUtils.getString;
import static org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory.newPathField;
import static org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory.newPropertyField;
import static org.apache.jackrabbit.oak.plugins.index.lucene.TermFactory.newPathTerm;


import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Set;
import java.util.TreeSet;


import org.apache.jackrabbit.JcrConstants;
import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.api.CommitFailedException;
import org.apache.jackrabbit.oak.api.PropertyState;
import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.spi.state.NodeBuilder;
import org.apache.jackrabbit.oak.spi.state.NodeState;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.search.PrefixQuery;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.WriteOutContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


class LuceneIndexUpdate implements Closeable, LuceneIndexConstants {


    private static final Logger log = LoggerFactory
            .getLogger(LuceneIndexUpdate.class);


    private static IndexWriterConfig getIndexWriterConfig() {
        // FIXME: Hack needed to make Lucene work in an OSGi environment
        Thread thread = Thread.currentThread();
        ClassLoader loader = thread.getContextClassLoader();
        thread.setContextClassLoader(IndexWriterConfig.class.getClassLoader());
        try {
            IndexWriterConfig config = new IndexWriterConfig(VERSION, ANALYZER);
            config.setMergeScheduler(new SerialMergeScheduler());
            return config;
        } finally {
            thread.setContextClassLoader(loader);
        }
    }


    private static final IndexWriterConfig config = getIndexWriterConfig();


    /**
     * Parser used for extracting text content from binary properties for full
     * text indexing.
     */
    private final Parser parser;


    /**
     * The media types supported by the parser used.
     */
    private Set<MediaType> supportedMediaTypes;


    private final String path;


    private final Set<String> updates = new TreeSet<String>();


    private final IndexWriter writer;


    private final Set<Integer> propertyTypes;


    public LuceneIndexUpdate(String path, NodeBuilder index, Parser parser)
            throws CommitFailedException {
        this.path = path;
        this.parser = parser;
        this.propertyTypes = buildPropertyTypes(index);
        try {
            writer = new IndexWriter(new ReadWriteOakDirectory(
                    index.child(INDEX_DATA_CHILD_NAME)), config);
        } catch (IOException e) {
            throw new CommitFailedException("Lucene", 1,
                    "Failed to update the full text search index", e);
        }
    }


    private Set<Integer> buildPropertyTypes(NodeBuilder index) {
        PropertyState ps = index.getProperty(INCLUDE_PROPERTY_TYPES);
        if (ps == null) {
            return new HashSet<Integer>();
        }
        Set<Integer> includes = new HashSet<Integer>();
        for (String inc : ps.getValue(Type.STRINGS)) {
            // TODO add more types as needed
            if (Type.STRING.toString().equalsIgnoreCase(inc)) {
                includes.add(Type.STRING.tag());
            } else if (Type.BINARY.toString().equalsIgnoreCase(inc)) {
                includes.add(Type.STRING.tag());
            }
        }
        return includes;
    }


    public void insert(String path, NodeBuilder value)
            throws CommitFailedException {
        // null value can come from a deleted node, followed by a deleted
        // property event which would trigger an update on the previously
        // deleted node
        if (value == null) {
            return;
        }
        checkArgument(path.startsWith(this.path));
        String key = path.substring(this.path.length());
        if ("".equals(key)) {
            key = "/";
        }
        if (!key.startsWith("/")) {
            key = "/" + key;
        }
        if (updates.contains(key)) {
            return;
        }
        updates.add(key);
        try {
            writer.updateDocument(newPathTerm(key),
                    makeDocument(key, value.getNodeState()));
        } catch (IOException e) {
            throw new CommitFailedException("Lucene", 1,
                    "Failed to update the full text search index", e);
        }
    }


    public void remove(String path) throws CommitFailedException {
        checkArgument(path.startsWith(this.path));
        try {
            deleteSubtreeWriter(writer, path.substring(this.path.length()));
        } catch (IOException e) {
            throw new CommitFailedException("Lucene", 1,
                    "Failed to update the full text search index", e);
        }
    }


    public void apply() throws CommitFailedException {
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException e) {
                throw new CommitFailedException("Lucene", 1,
                        "Failed to update the full text search index", e);
            }
        }
    }


    private void deleteSubtreeWriter(IndexWriter writer, String path)
            throws IOException {
        // TODO verify the removal of the entire sub-hierarchy
        if (!path.startsWith("/")) {
            path = "/" + path;
        }
        writer.deleteDocuments(newPathTerm(path));
        if (!path.endsWith("/")) {
            path += "/";
        }
        writer.deleteDocuments(new PrefixQuery(newPathTerm(path)));
    }


    private Document makeDocument(String path, NodeState state) {
        Document document = new Document();
        document.add(newPathField(path));
        for (PropertyState property : state.getProperties()) {
            if (propertyTypes.isEmpty()
                    || propertyTypes.contains(property.getType().tag())) {
                if (Type.BINARY.tag() == property.getType().tag()) {
                    addBinaryValue(document, property, state);
                } else {
                    String pname = property.getName();
                    for (String v : property.getValue(Type.STRINGS)) {
                        document.add(newPropertyField(pname, v));
                    }
                }
            }
        }
        return document;
    }


    private void addBinaryValue(Document doc, PropertyState property,
            NodeState state) {
        String type = getString(state, JcrConstants.JCR_MIMETYPE);
        if (type == null || !isSupportedMediaType(type)) {
            return;
        }
        Metadata metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, type);
        // jcr:encoding is not mandatory
        String encoding = getString(state, JcrConstants.JCR_ENCODING);
        if (encoding != null) {
            metadata.set(Metadata.CONTENT_ENCODING, encoding);
        }


        String name = property.getName();
        for (Blob v : property.getValue(Type.BINARIES)) {
            doc.add(newPropertyField(name, parseStringValue(v, metadata)));
        }
    }


    /**
     * Returns <code>true</code> if the provided type is among the types
     * supported by the Tika parser we are using.
     *
     * @param type the type to check.
     * @return whether the type is supported by the Tika parser we are using.
     */
    private boolean isSupportedMediaType(final String type) {
        if (supportedMediaTypes == null) {
            supportedMediaTypes = parser.getSupportedTypes(null);
        }
        return supportedMediaTypes.contains(MediaType.parse(type));
    }


    private String parseStringValue(Blob v, Metadata metadata) {
        WriteOutContentHandler handler = new WriteOutContentHandler();
        try {
            InputStream stream = v.getNewStream();
            try {
                parser.parse(stream, handler, metadata, new ParseContext());
            } finally {
                stream.close();
            }
        } catch (LinkageError e) {
            // Capture and ignore errors caused by extraction libraries
            // not being present. This is equivalent to disabling
            // selected media types in configuration, so we can simply
            // ignore these errors.
        } catch (Throwable t) {
            // Capture and report any other full text extraction problems.
            // The special STOP exception is used for normal termination.
            if (!handler.isWriteLimitReached(t)) {
                log.debug("Failed to extract text from a binary property."
                        + " This is a fairly common case, and nothing to"
                        + " worry about. The stack trace is included to"
                        + " help improve the text extraction feature.", t);
                return "TextExtractionError";
            }
        }
        return handler.toString();
    }


    @Override
    public void close() throws IOException {
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException e) {
                //
            }
        }
    }
}
// Source code of org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexUpdate

// Related classes of org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexUpdate