Source Code of org.xbib.elasticsearch.index.analysis.skos.engine.jena.SKOSEngineImpl

/**
 * Copyright 2010 Bernhard Haslhofer
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.xbib.elasticsearch.index.analysis.skos.engine.jena;


import com.hp.hpl.jena.ontology.AnnotationProperty;
import com.hp.hpl.jena.ontology.ObjectProperty;
import com.hp.hpl.jena.rdf.model.Literal;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.ResIterator;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.rdf.model.StmtIterator;
import com.hp.hpl.jena.update.GraphStore;
import com.hp.hpl.jena.update.GraphStoreFactory;
import com.hp.hpl.jena.update.UpdateAction;
import com.hp.hpl.jena.update.UpdateFactory;
import com.hp.hpl.jena.update.UpdateRequest;
import com.hp.hpl.jena.util.FileManager;
import com.hp.hpl.jena.vocabulary.RDF;


import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;


import org.xbib.elasticsearch.index.analysis.skos.engine.SKOSEngine;
import org.xbib.elasticsearch.plugin.analysis.SKOSAnalysisPlugin;


/**
 * A Lucene-backed SKOSEngine Implementation.
 *
 * Each SKOS concept is stored/indexed as a Lucene document.
 *
 * All labels are converted to lowercase
 */
public class SKOSEngineImpl implements SKOSEngine {


    /**
     * Records the total number of matches
     */
    public static class AllDocCollector extends Collector {


        private final List<Integer> docs = new ArrayList<Integer>();
        private int base;


        @Override
        public boolean acceptsDocsOutOfOrder() {
            return true;
        }


        @Override
        public void collect(int doc) throws IOException {
            docs.add(doc + base);
        }


        public List<Integer> getDocs() {
            return docs;
        }


        @Override
        public void setNextReader(AtomicReaderContext context) throws IOException {
            base = context.docBase;
        }


        @Override
        public void setScorer(Scorer scorer) throws IOException {
            // not needed
        }
    }
    /*
     * Names of the Lucene document fields written for each concept:
     * the concept URI, its three kinds of lexical labels, and the URIs of
     * the concepts it is linked to.
     */
    private static final String FIELD_URI = "uri";
    private static final String FIELD_PREF_LABEL = "pref";
    private static final String FIELD_ALT_LABEL = "alt";
    private static final String FIELD_HIDDEN_LABEL = "hidden";
    private static final String FIELD_BROADER = "broader";
    private static final String FIELD_NARROWER = "narrower";
    private static final String FIELD_BROADER_TRANSITIVE = "broaderTransitive";
    private static final String FIELD_NARROWER_TRANSITIVE = "narrowerTransitive";
    private static final String FIELD_RELATED = "related";
    /**
     * The input SKOS model. Only assigned when a constructor actually loads
     * and indexes a vocabulary.
     */
    private Model skosModel;
    /**
     * The location of the concept index (in memory or on disk, depending on
     * the constructor used).
     */
    private Directory indexDir;
    /**
     * Provides read access to the concept index; created at the end of every
     * constructor.
     */
    private IndexSearcher searcher;
    /**
     * Language tags of the labels that are written to the index.
     *
     * If null, labels in all languages are indexed; otherwise a label is only
     * indexed when its language tag is contained in this set.
     */
    private Set<String> languages;
    /**
     * The analyzer handed to the index writer; every constructor assigns a
     * SimpleAnalyzer.
     *
     * SimpleAnalyzer = LetterTokenizer + LowerCaseFilter
     */
    private final Analyzer analyzer;


    /**
     * This constructor loads the SKOS model from a given InputStream using the
     * given serialization language parameter, which must be either N3, RDF/XML,
     * or TURTLE.
     *
     * @param inputStream the input stream
     * @param lang the serialization language
     * @throws IOException if the model cannot be loaded
     */
    public SKOSEngineImpl(InputStream inputStream,
            String lang) throws IOException {


        if (!("N3".equals(lang) || "RDF/XML".equals(lang) || "TURTLE".equals(lang))) {
            throw new IOException("Invalid RDF serialization format");
        }


        analyzer = new SimpleAnalyzer(SKOSAnalysisPlugin.getLuceneVersion());


        skosModel = ModelFactory.createDefaultModel();


        skosModel.read(inputStream, null, lang);


        indexDir = new RAMDirectory();
        entailSKOSModel();
        indexSKOSModel();


        searcher = new IndexSearcher(DirectoryReader.open(indexDir));
    }


    /**
     * This constructor loads the SKOS model from a given filename or URI,
     * starts the indexing process and sets up the index searcher.
     *
     * @param languages the languages to be considered
     * @param indexPath index path
     * @param filenameOrURI file name or URI
     * @throws IOException
     */
    public SKOSEngineImpl(String indexPath, String filenameOrURI,
            String... languages) throws IOException {
        analyzer = new SimpleAnalyzer(SKOSAnalysisPlugin.getLuceneVersion());


        String langSig = "";
        if (languages != null) {
            this.languages = new TreeSet<String>(Arrays.asList(languages));
            langSig = "-" + join(this.languages.iterator(), '.');
        }


        String name = getName(filenameOrURI);
        File dir = new File(indexPath + name + langSig);


        indexDir = FSDirectory.open(dir);
        if (!dir.isDirectory()) {


            // load the skos model from the given file
            FileManager fileManager = new FileManager();
            fileManager.addLocatorFile();
            fileManager.addLocatorURL();
            fileManager.addLocatorClassLoader(SKOSEngineImpl.class.getClassLoader());


            if (getExtension(filenameOrURI).equals("zip")) {
                fileManager.addLocatorZip(filenameOrURI);
                filenameOrURI = getBaseName(filenameOrURI);
            }


            skosModel = fileManager.loadModel(filenameOrURI);
            entailSKOSModel();
            indexSKOSModel();
        }


        searcher = new IndexSearcher(DirectoryReader.open(indexDir));
    }




    /**
     * This constructor loads the SKOS model from a given InputStream using the
     * given serialization language parameter, which must be either N3, RDF/XML,
     * or TURTLE.
     *
     * @param inputStream the input stream
     * @param format the serialization language
     * @throws IOException if the model cannot be loaded
     */
    public SKOSEngineImpl(InputStream inputStream,
                          String format, String... languages) throws IOException {


        if (!("N3".equals(format) || "RDF/XML".equals(format) || "TURTLE".equals(format))) {
            throw new IOException("Invalid RDF serialization format");
        }
        if (languages != null) {
            this.languages = new TreeSet<String>(Arrays.asList(languages));
        }


        analyzer = new SimpleAnalyzer(SKOSAnalysisPlugin.getLuceneVersion());


        skosModel = ModelFactory.createDefaultModel();


        skosModel.read(inputStream, null, format);


        indexDir = new RAMDirectory();
        entailSKOSModel();
        indexSKOSModel();


        searcher = new IndexSearcher(DirectoryReader.open(indexDir));
    }


    private void entailSKOSModel() {
        GraphStore graphStore = GraphStoreFactory.create(skosModel);
        String sparqlQuery =
                "PREFIX skos: <http://www.w3.org/2004/02/skos/core#>\n"
                + "PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n"
                + "INSERT { ?subject rdf:type skos:Concept }\n"
                + "WHERE {\n"
                + "{ ?subject skos:prefLabel ?text } UNION\n"
                + "{ ?subject skos:altLabel ?text } UNION\n"
                + "{ ?subject skos:hiddenLabel ?text }\n"
                + "}";
        UpdateRequest request = UpdateFactory.create(sparqlQuery);
        UpdateAction.execute(request, graphStore);
    }


    /**
     * Creates lucene documents from SKOS concept. In order to allow language
     * restrictions, one document per language is created.
     */
    private Document createDocumentsFromConcept(Resource skos_concept) {
        Document conceptDoc = new Document();


        String conceptURI = skos_concept.getURI();
        Field uriField = new Field(FIELD_URI, conceptURI, StringField.TYPE_STORED);
        conceptDoc.add(uriField);


        // store the preferred lexical labels
        indexAnnotation(skos_concept, conceptDoc, SKOS.prefLabel, FIELD_PREF_LABEL);


        // store the alternative lexical labels
        indexAnnotation(skos_concept, conceptDoc, SKOS.altLabel, FIELD_ALT_LABEL);


        // store the hidden lexical labels
        indexAnnotation(skos_concept, conceptDoc, SKOS.hiddenLabel,
                FIELD_HIDDEN_LABEL);


        // store the URIs of the broader concepts
        indexObject(skos_concept, conceptDoc, SKOS.broader, FIELD_BROADER);


        // store the URIs of the broader transitive concepts
        indexObject(skos_concept, conceptDoc, SKOS.broaderTransitive,
                FIELD_BROADER_TRANSITIVE);


        // store the URIs of the narrower concepts
        indexObject(skos_concept, conceptDoc, SKOS.narrower, FIELD_NARROWER);


        // store the URIs of the narrower transitive concepts
        indexObject(skos_concept, conceptDoc, SKOS.narrowerTransitive,
                FIELD_NARROWER_TRANSITIVE);


        // store the URIs of the related concepts
        indexObject(skos_concept, conceptDoc, SKOS.related, FIELD_RELATED);


        return conceptDoc;
    }


    /**
     * Returns the alternative labels stored for a concept.
     *
     * @param conceptURI the URI of the concept to look up
     * @return the stored values of the "alt" field (lower-cased at indexing
     *         time), or null if the lookup does not find the concept
     * @throws IOException if the index cannot be read
     */
    @Override
    public String[] getAltLabels(String conceptURI) throws IOException {
        return readConceptFieldValues(conceptURI, FIELD_ALT_LABEL);
    }


    @Override
    public String[] getAltTerms(String label) throws IOException {
        List<String> result = new ArrayList<String>();


        // convert the query to lower-case
        String queryString = label.toLowerCase();


        try {
            String[] conceptURIs = getConcepts(queryString);


            for (String conceptURI : conceptURIs) {
                String[] altLabels = getAltLabels(conceptURI);
                if (altLabels != null) {
                    result.addAll(Arrays.asList(altLabels));
                }
            }
        } catch (Exception e) {
            System.err
                    .println("Error when accessing SKOS Engine.\n" + e.getMessage());
        }


        return result.toArray(new String[result.size()]);
    }


    /**
     * Returns the hidden labels stored for a concept.
     *
     * @param conceptURI the URI of the concept to look up
     * @return the stored values of the "hidden" field (lower-cased at
     *         indexing time), or null if the lookup does not find the concept
     * @throws IOException if the index cannot be read
     */
    @Override
    public String[] getHiddenLabels(String conceptURI) throws IOException {
        return readConceptFieldValues(conceptURI, FIELD_HIDDEN_LABEL);
    }


    /**
     * Returns the URIs of the concepts stored as broader than the given one.
     *
     * @param conceptURI the URI of the concept to look up
     * @return the stored values of the "broader" field, or null if the lookup
     *         does not find the concept
     * @throws IOException if the index cannot be read
     */
    @Override
    public String[] getBroaderConcepts(String conceptURI) throws IOException {
        return readConceptFieldValues(conceptURI, FIELD_BROADER);
    }


    /**
     * Returns the preferred and alternative labels of the concepts stored as
     * broader than the given one.
     *
     * @param conceptURI the URI of the concept whose broader concepts are resolved
     * @return the labels gathered by getLabels for the "broader" field
     * @throws IOException if the index cannot be read
     */
    @Override
    public String[] getBroaderLabels(String conceptURI) throws IOException {
        return getLabels(conceptURI, FIELD_BROADER);
    }


    /**
     * Returns the URIs of the concepts stored as transitively broader than
     * the given one.
     *
     * @param conceptURI the URI of the concept to look up
     * @return the stored values of the "broaderTransitive" field, or null if
     *         the lookup does not find the concept
     * @throws IOException if the index cannot be read
     */
    @Override
    public String[] getBroaderTransitiveConcepts(String conceptURI)
            throws IOException {
        return readConceptFieldValues(conceptURI, FIELD_BROADER_TRANSITIVE);
    }


    /**
     * Returns the preferred and alternative labels of the concepts stored as
     * transitively broader than the given one.
     *
     * @param conceptURI the URI of the concept whose related entries are resolved
     * @return the labels gathered by getLabels for the "broaderTransitive" field
     * @throws IOException if the index cannot be read
     */
    @Override
    public String[] getBroaderTransitiveLabels(String conceptURI)
            throws IOException {
        return getLabels(conceptURI, FIELD_BROADER_TRANSITIVE);
    }


    @Override
    public String[] getConcepts(String label) throws IOException {
        List<String> concepts = new ArrayList<String>();


        // convert the query to lower-case
        String queryString = label.toLowerCase();


        AllDocCollector collector = new AllDocCollector();


        DisjunctionMaxQuery query = new DisjunctionMaxQuery(0.0f);
        query.add(new TermQuery(new Term(FIELD_PREF_LABEL, queryString)));
        query.add(new TermQuery(new Term(FIELD_ALT_LABEL, queryString)));
        query.add(new TermQuery(new Term(FIELD_HIDDEN_LABEL, queryString)));
        searcher.search(query, collector);


        for (Integer hit : collector.getDocs()) {
            Document doc = searcher.doc(hit);
            String conceptURI = doc.getValues(FIELD_URI)[0];
            concepts.add(conceptURI);
        }


        return concepts.toArray(new String[concepts.size()]);
    }


    private String[] getLabels(String conceptURI, String field)
            throws IOException {
        List<String> labels = new ArrayList<String>();
        String[] concepts = readConceptFieldValues(conceptURI, field);


        for (String aConceptURI : concepts) {
            String[] prefLabels = getPrefLabels(aConceptURI);
            labels.addAll(Arrays.asList(prefLabels));


            String[] altLabels = getAltLabels(aConceptURI);
            labels.addAll(Arrays.asList(altLabels));
        }


        return labels.toArray(new String[labels.size()]);
    }


    /**
     * Returns the URIs of the concepts stored as narrower than the given one.
     *
     * @param conceptURI the URI of the concept to look up
     * @return the stored values of the "narrower" field, or null if the
     *         lookup does not find the concept
     * @throws IOException if the index cannot be read
     */
    @Override
    public String[] getNarrowerConcepts(String conceptURI) throws IOException {
        return readConceptFieldValues(conceptURI, FIELD_NARROWER);
    }


    /**
     * Returns the preferred and alternative labels of the concepts stored as
     * narrower than the given one.
     *
     * @param conceptURI the URI of the concept whose narrower concepts are resolved
     * @return the labels gathered by getLabels for the "narrower" field
     * @throws IOException if the index cannot be read
     */
    @Override
    public String[] getNarrowerLabels(String conceptURI) throws IOException {
        return getLabels(conceptURI, FIELD_NARROWER);
    }


    /**
     * Returns the URIs of the concepts stored as transitively narrower than
     * the given one.
     *
     * @param conceptURI the URI of the concept to look up
     * @return the stored values of the "narrowerTransitive" field, or null if
     *         the lookup does not find the concept
     * @throws IOException if the index cannot be read
     */
    @Override
    public String[] getNarrowerTransitiveConcepts(String conceptURI)
            throws IOException {
        return readConceptFieldValues(conceptURI, FIELD_NARROWER_TRANSITIVE);
    }


    /**
     * Returns the preferred and alternative labels of the concepts stored as
     * transitively narrower than the given one.
     *
     * @param conceptURI the URI of the concept whose related entries are resolved
     * @return the labels gathered by getLabels for the "narrowerTransitive" field
     * @throws IOException if the index cannot be read
     */
    @Override
    public String[] getNarrowerTransitiveLabels(String conceptURI)
            throws IOException {
        return getLabels(conceptURI, FIELD_NARROWER_TRANSITIVE);
    }


    /**
     * Returns the preferred labels stored for a concept.
     *
     * @param conceptURI the URI of the concept to look up
     * @return the stored values of the "pref" field (lower-cased at indexing
     *         time), or null if the lookup does not find the concept
     * @throws IOException if the index cannot be read
     */
    @Override
    public String[] getPrefLabels(String conceptURI) throws IOException {
        return readConceptFieldValues(conceptURI, FIELD_PREF_LABEL);
    }


    /**
     * Returns the URIs of the concepts stored as related to the given one.
     *
     * @param conceptURI the URI of the concept to look up
     * @return the stored values of the "related" field, or null if the lookup
     *         does not find the concept
     * @throws IOException if the index cannot be read
     */
    @Override
    public String[] getRelatedConcepts(String conceptURI) throws IOException {
        return readConceptFieldValues(conceptURI, FIELD_RELATED);
    }


    /**
     * Returns the preferred and alternative labels of the concepts stored as
     * related to the given one.
     *
     * @param conceptURI the URI of the concept whose related concepts are resolved
     * @return the labels gathered by getLabels for the "related" field
     * @throws IOException if the index cannot be read
     */
    @Override
    public String[] getRelatedLabels(String conceptURI) throws IOException {
        return getLabels(conceptURI, FIELD_RELATED);
    }


    private void indexAnnotation(Resource skos_concept, Document conceptDoc,
            AnnotationProperty property, String field) {
        StmtIterator stmt_iter = skos_concept.listProperties(property);
        while (stmt_iter.hasNext()) {
            Literal labelLiteral = stmt_iter.nextStatement().getObject()
                    .as(Literal.class);
            String label = labelLiteral.getLexicalForm();
            String labelLang = labelLiteral.getLanguage();


            if (this.languages != null && !this.languages.contains(labelLang)) {
                continue;
            }


            // converting label to lower-case
            label = label.toLowerCase();


            Field labelField = new Field(field, label, StringField.TYPE_STORED);


            conceptDoc.add(labelField);
        }
    }


    private void indexObject(Resource skos_concept, Document conceptDoc,
            ObjectProperty property, String field) {
        StmtIterator stmt_iter = skos_concept.listProperties(property);
        while (stmt_iter.hasNext()) {
            RDFNode concept = stmt_iter.nextStatement().getObject();


            if (!concept.canAs(Resource.class)) {
                System.err.println("Error when indexing relationship of concept "
                        + skos_concept.getURI() + " .");
                continue;
            }


            Resource resource = concept.as(Resource.class);


            Field conceptField = new Field(field, resource.getURI(),
                    TextField.TYPE_STORED);


            conceptDoc.add(conceptField);
        }
    }


    /**
     * Creates the synonym index
     *
     * @throws IOException
     */
    private void indexSKOSModel() throws IOException {
        IndexWriterConfig cfg = new IndexWriterConfig(SKOSAnalysisPlugin.getLuceneVersion(), analyzer);
        IndexWriter writer = new IndexWriter(indexDir, cfg);
        writer.getConfig().setRAMBufferSizeMB(48);


        /* iterate SKOS concepts, create Lucene docs and add them to the index */
        ResIterator concept_iter = skosModel.listResourcesWithProperty(RDF.type,
                SKOS.Concept);
        while (concept_iter.hasNext()) {
            Resource skos_concept = concept_iter.next();


            Document concept_doc = createDocumentsFromConcept(skos_concept);


            writer.addDocument(concept_doc);
        }


        writer.close();
    }


    /**
     * Returns the values of a given field for a given concept
     */
    private String[] readConceptFieldValues(String conceptURI, String field)
            throws IOException {


        Query query = new TermQuery(new Term(FIELD_URI, conceptURI));


        TopDocs docs = searcher.search(query, 1);


        ScoreDoc[] results = docs.scoreDocs;


        if (results.length != 1) {
            System.out.println("Unknown concept " + conceptURI);
            return null;
        }


        Document conceptDoc = searcher.doc(results[0].doc);


        return conceptDoc.getValues(field);
    }


    private String join(Iterator iterator, char separator) {
        // handle null, zero and one elements before building a buffer
        if (iterator == null) {
            return null;
        }
        if (!iterator.hasNext()) {
            return "";
        }
        Object first = iterator.next();
        if (!iterator.hasNext()) {
            return first == null ? "" : first.toString();
        }
        // two or more elements
        StringBuilder buf = new StringBuilder();
        if (first != null) {
            buf.append(first);
        }
        while (iterator.hasNext()) {
            buf.append(separator);
            Object obj = iterator.next();
            if (obj != null) {
                buf.append(obj);
            }
        }
        return buf.toString();
    }
    
    private String getName(String filename) {
        if (filename == null) {
            return null;
        }
        int index = indexOfLastSeparator(filename);
        return filename.substring(index + 1);
    }
    private static final char UNIX_SEPARATOR = '/';
    private static final char WINDOWS_SEPARATOR = '\\';


    private int indexOfLastSeparator(String filename) {
        if (filename == null) {
            return -1;
        }
        int lastUnixPos = filename.lastIndexOf(UNIX_SEPARATOR);
        int lastWindowsPos = filename.lastIndexOf(WINDOWS_SEPARATOR);
        return Math.max(lastUnixPos, lastWindowsPos);
    }


    private String getExtension(String filename) {
        if (filename == null) {
            return null;
        }
        int index = indexOfExtension(filename);
        if (index == -1) {
            return "";
        } else {
            return filename.substring(index + 1);
        }
    }


    private int indexOfExtension(String filename) {
        if (filename == null) {
            return -1;
        }
        int extensionPos = filename.lastIndexOf(EXTENSION_SEPARATOR);
        int lastSeparator = indexOfLastSeparator(filename);
        return (lastSeparator > extensionPos ? -1 : extensionPos);
    }
    public static final char EXTENSION_SEPARATOR = '.';


    /**
     * Returns the file name with both its directory part and its extension
     * removed, by applying getName and then removeExtension.
     *
     * @param filename the path to reduce; may be null
     * @return the bare name, or null for null input
     */
    private String getBaseName(String filename) {
        return removeExtension(getName(filename));
    }


    private String removeExtension(String filename) {
        if (filename == null) {
            return null;
        }
        int index = indexOfExtension(filename);
        if (index == -1) {
            return filename;
        } else {
            return filename.substring(0, index);
        }
    }    
}
Source Code of org.xbib.elasticsearch.index.analysis.skos.engine.jena.SKOSEngineImpl

Related Classes of org.xbib.elasticsearch.index.analysis.skos.engine.jena.SKOSEngineImpl