Source Code of org.dbpedia.spotlight.lucene.search.BaseSearcher

/**
 * Copyright 2011 Pablo Mendes, Max Jakob
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.dbpedia.spotlight.lucene.search;


import com.google.common.collect.Ordering;
import com.google.common.primitives.Ints;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.MapFieldSelector;
import org.apache.lucene.index.*;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.*;
import org.apache.lucene.store.RAMDirectory;
import org.dbpedia.spotlight.exceptions.SearchException;
import org.dbpedia.spotlight.exceptions.TimeoutException;
import org.dbpedia.spotlight.lucene.LuceneFeatureVector;
import org.dbpedia.spotlight.lucene.LuceneManager;
import org.dbpedia.spotlight.model.*;
import org.dbpedia.spotlight.model.vsm.FeatureVector;


import java.io.Closeable;
import java.io.IOException;
import java.util.*;


/**
 * This class manages an index from surface form to candidate resources (surrogates)
 * @author pablomendes
 */
public class BaseSearcher implements Closeable {


    protected final Log LOG = LogFactory.getLog(getClass());


    LuceneManager mLucene;
    IndexSearcher mSearcher;
    public IndexReader mReader;


    //TODO create method that iterates over all documents in the index and computes this. (if takes too long, think about storing somewhere at indexing time)
    private double mNumberOfOccurrences = 69772256;


    protected BaseSearcher() {}  // allow subclasses to rewrite the super below


    public long objectCreationTime = 0;


    public BaseSearcher(LuceneManager lucene) throws IOException {
        this(lucene,false);
    }


    public BaseSearcher(LuceneManager lucene, boolean inMemory) throws IOException {
        this.mLucene = lucene;
        if (inMemory) {
            LOG.info("Creating in-memory lucene searcher... (may take 2-3 minutes and several GB of RAM)");
            this.mLucene.mContextIndexDir = new RAMDirectory(this.mLucene.mContextIndexDir);
        }
        LOG.info("Using index at: "+this.mLucene.mContextIndexDir);
        this.mReader = LuceneManager.openIndexReader(this.mLucene.mContextIndexDir); //will open single or multireader
        this.mSearcher = new IndexSearcher(this.mReader);
        LOG.debug("Done.");


        //If the DBpediaResourceFactory uses SQL, the mapping from Lucene docID to URI must be cached:
        if(mLucene.getDBpediaResourceFactory() instanceof DBpediaResourceFactorySQL) {
            /*
               Here, the Cache for the URIs is called (but not read) for the first time and will be
               created from the index and held within the Lucene cache manager.
            */
            LOG.debug("Caching all URIs" );
            FieldCache.DEFAULT.getStrings(mReader, LuceneManager.DBpediaResourceField.URI.toString());
            LOG.debug("Done.");
        }
        
    }


    public int getNumberOfEntries() {
        return this.mReader.numDocs();    // can use maxDoc?
    }


    public double getNumberOfOccurrences() {
        return this.mNumberOfOccurrences;
    }


    /**
     * Generic search method that will be used by other public methods in this class
     * @param query
     * @return
     * @throws IOException
     * @throws ParseException
     */
    protected List<FeatureVector> search(Query query) throws SearchException {
        // List of results
        List<FeatureVector> surrogates = new ArrayList<FeatureVector>();


        // Iterate through the results:
        for (ScoreDoc hit : getHits(query)) {
            DBpediaResource resource = getDBpediaResource(hit.doc);
            TermFreqVector vector = getVector(hit.doc);
            surrogates.add(new LuceneFeatureVector(resource, vector));
        }


        //LOG.debug(surrogates.size()+" hits found.");
        return surrogates;
    }


    public boolean isDeleted(int docNo) {
        return mReader.isDeleted(docNo);
    }


    public Document getFullDocument(int docNo) throws SearchException {
        Document document;
        try {
            document = mReader.document(docNo);
        } catch (IOException e) {
            throw new SearchException("Error reading document "+docNo,e);
        }
        return document;
    }


    public Document getDocument(int docNo, FieldSelector selector) throws SearchException {
        Document document;
        try {
            document = mReader.document(docNo, selector);
            //document = mReader.document(docNo, fieldSelector);
        } catch (IOException e) {
            throw new SearchException("Error reading document "+docNo,e);
        }
        //LOG.debug("docNo:"+docNo);
        return document;
    }


    public List<Document> getDocuments(DBpediaResource res, FieldSelector fieldSelector) throws SearchException {
        //LOG.trace("Retrieving documents for resource: "+res);


        // search index for surface form
        List<Document> documents = new ArrayList<Document>();


        // Iterate through the results:
        for (ScoreDoc hit : getHits(mLucene.getQuery(res))) {
            documents.add(getDocument(hit.doc, fieldSelector));
        }
        //LOG.debug(documents.size()+" documents found.");


        // return set of surrogates
        return documents;
    }


    /**
     * Basic search method used by all searches to the index.
     * @param query
     * @param n
     * @return
     * @throws SearchException
     */
    public ScoreDoc[] getHits(Query query, int n, int timeout, Filter filter) throws SearchException {
        ScoreDoc[] hits = null;
        try {
            //LOG.debug("Start search. timeout="+timeout);
            long start = System.nanoTime();
            TopScoreDocCollector collector = TopScoreDocCollector.create(n, false);
            //TimeLimitingCollector collector = new TimeLimitingCollector(tCollector, timeout);  //TODO try to bring this back later
            mSearcher.search(query, filter, collector);
            //mSearcher.
            hits = collector.topDocs().scoreDocs;
            long end = System.nanoTime();
            LOG.debug(String.format("Done search in %f ms. hits.length=%d",(end-start) / 1000000.0, hits.length));
        } catch (TimeLimitingCollector.TimeExceededException timedOutException) {
            throw new TimeoutException("Timeout (>"+timeout+"ms searching for surface form "+query.toString(),timedOutException);
        } catch (Exception e) {
            throw new SearchException("Error searching for surface form "+query.toString(),e);
        }
        //LOG.debug(hits.length+" hits found.");
        return hits;
    }


    /**
     * Search method with default filter
     * @param query
     * @param n number of results to return
     * @param timeout number of miliseconds before giving up this query
     * @return array of document id,score
     * @throws SearchException
     */
    public ScoreDoc[] getHits(Query query, int n, int timeout) throws SearchException {
        Filter filter = null; //TODO surfaceForm filter here?
        return getHits(query, n, timeout, filter);
    }


    // Uses default timeout
    public ScoreDoc[] getHits(Query query, int n) throws SearchException {
       return getHits(query, n, 5000);
    }


    // Uses default maxHits and timeout
    public ScoreDoc[] getHits(Query query) throws SearchException {
        return getHits(query, mLucene.topResultsLimit());
    }


    /**
     * Retrieves all DBpedia Resources in the index that are within a set of allowed URIs and match the input text.
     * Uses Lucene's MoreLikeThis rather than ICF as defined in DBpedia Spotlight's paper.
     * It is faster, but doesn't take into account selectional preferences of words wrt resources.
     *
     * @param context text containing a URI mention
     * @param resources allowed URIs
     * @return
     */
    public ScoreDoc[] getHits(Text context, Set<DBpediaResource> resources) throws SearchException {
        try {
            return getHits(mLucene.getQuery(context, resources, this.mReader));
        } catch (IOException e) {
            throw new SearchException("Error while executing query. ",e);
        }
    }


    public TermFreqVector getVector(int docNo) throws SearchException {
        TermFreqVector vector = null;
        try {
            vector = mReader.getTermFreqVector(docNo, LuceneManager.DBpediaResourceField.CONTEXT.toString());
            if (vector==null)
                throw new IllegalStateException("TermFreqVector for document "+docNo+" is null.");
        } catch (IOException e) {
            throw new SearchException("Error reading TermFreqVector for surrogate "+docNo,e);
        }
        //LOG.debug("vector:"+vector);
        return vector;
    }


    @Override
    public void close() throws IOException {
        //LOG.debug("Closing searcher.");
        mSearcher.close();
        //mReader.close();
    }


    /**
     * Creates a DBpediaResource with all fields stored in the index
     * @deprecated  Use getDBpediaResource(int docNo, String[] fieldsToLoad)
     * TODO REMOVE. This is bad because the fields being loaded are opaque to the caller. Some need only URI, others need everything.
     * @param docNo
     * @return
     */
    public DBpediaResource getDBpediaResource(int docNo) throws SearchException {
        DBpediaResource r = null;
        String method = "";
        long start = System.nanoTime();
        DBpediaResourceFactory f = mLucene.getDBpediaResourceFactory();
        if (f==null) {
            method = "lucene";
            String[] fields = {LuceneManager.DBpediaResourceField.TYPE.toString(),
                    LuceneManager.DBpediaResourceField.URI.toString(),
                    LuceneManager.DBpediaResourceField.URI_COUNT.toString()
            };
            r = getDBpediaResource(docNo,fields); // load all available info from Lucene
        } else {
           method = "database";
           //String[] fields = {LuceneManager.DBpediaResourceField.URI.toString()};
           //r = getDBpediaResource(docNo,fields);   // load only URI from Lucene
           r = getCachedDBpediaResource(docNo);   // load URI from Lucene's Cache
           r = mLucene.getDBpediaResourceFactory().from(r.uri()); // load the rest of the info from DB
        }
        long end = System.nanoTime();
        //LOG.debug(String.format("DBpediaResource (%s) creation with %s took %f ms.", r.uri(), method, (end-start) / 1000000.0) );
        objectCreationTime += (end-start);


        return r;
    }


    /**
     * This is an experimental function to evaluate the feasibility of caching all URIs in Lucene
     * @param docNo
     * @return
     * @throws SearchException
     */
    public DBpediaResource getCachedDBpediaResource(int docNo) throws SearchException {
        try {
            String[] uris = FieldCache.DEFAULT.getStrings(mReader, LuceneManager.DBpediaResourceField.URI.toString());
            return new DBpediaResource(uris[docNo]);
        } catch (IOException e) {
            throw new SearchException("Error getting cached DBpediaResource.",e);
        }
    }








    /**
     * CONTEXT SEARCHER and SURROGATE SEARCHER
     * Loads only a few fields (faster)
     * TODO FACTORY move to Factory
     * @param docNo
     * @return
     * @throws SearchException
     */
    public DBpediaResource getDBpediaResource(int docNo, String[] fieldsToLoad) throws SearchException {


        FieldSelector fieldSelector = new MapFieldSelector(fieldsToLoad);
        Document document = getDocument(docNo, fieldSelector);
        Field uriField = document.getField(LuceneManager.DBpediaResourceField.URI.toString());
        if (uriField==null)
            throw new SearchException("Cannot find URI for document "+document);


        String uri = uriField.stringValue();
        if (uri==null)
            throw new SearchException("Cannot find URI for document "+document);


        DBpediaResource resource = new DBpediaResource(uri);


        for (String fieldName: fieldsToLoad) {
            Field field = document.getField(fieldName);
            if (field != null) Factory.setField(resource, LuceneManager.DBpediaResourceField.valueOf(fieldName), document);
        }
        if (resource.prior() == 0.0) { // adjust prior
            resource.setPrior(resource.support() / this.getNumberOfOccurrences());
        }
        return resource;
    }


//        // Returns the first URI that can be found in the document number docNo   (old, superceded by Factory.setField)
//    //TODO move to Factory
//    //TODO why is this overriding BaseSearcher? can merge?
//    public DBpediaResource getDBpediaResource(int docNo) throws SearchException {
//
//        FieldSelector fieldSelector = new MapFieldSelector(onlyUriAndTypes);
//
//        LOG.trace("Getting document number " + docNo + "...");
//        Document document = createDocument(docNo, fieldSelector);
//        String uri = document.get(LuceneManager.DBpediaResourceField.URI.toString());
//        if (uri==null)
//            throw new SearchException("Cannot find URI for document "+document);
//
//        LOG.trace("Setting URI, types and support...");
//        DBpediaResource resource = new DBpediaResource(uri);
//        resource.setTypes( getDBpediaTypes(document) );
//        resource.setSupport( getSupport(document) ); //TODO this can be optimized for time performance by adding a support field. (search for the most likely URI then becomes a bit more complicated)
//
//        //LOG.debug("uri:"+uri);
//        return resource;
//    }


    public SurfaceForm getSurfaceForm(int docNo) throws SearchException {
        String[] onlyUriAndTypes = {LuceneManager.DBpediaResourceField.SURFACE_FORM.toString()};
        FieldSelector fieldSelector = new MapFieldSelector(onlyUriAndTypes);
        Document document = getDocument(docNo,fieldSelector);
        Field sfField = document.getField(LuceneManager.DBpediaResourceField.SURFACE_FORM.toString());
        if (sfField==null)
            throw new SearchException("Cannot find SurfaceForm for document "+document);


        String sf = sfField.stringValue();
        if (sf==null)
            throw new SearchException("Cannot find URI for document "+document);


        //LOG.debug("uri:"+uri);
        return new SurfaceForm(sf);
    }


    public boolean isContainedInIndex(SurfaceForm sf) {
        try {
            if (getHits(mLucene.getQuery(sf), 1).length > 0)
                return true;
        } catch (SearchException e) {
            LOG.info("SearchException in isContainedInIndex("+sf+"): "+e);
        }
        return false;
    }


//    public Document termDocs(DBpediaResource resource) throws IOException {
//
//        Term uriTerm = new Term(LuceneManager.DBpediaResourceField.URI.toString(),resource.uri());
//        mReader.terms(uriTerm);
//        TermDocs t = mReader.termDocs();
//
//        return null;
//    }




  /**
   * Computes a term frequency map for the index at the specified location.
   * @param 
   * @return a Boolean OR query.
   * @throws Exception if one is thrown.
     * @author sujitpal (computeTopTermQuery in http://sujitpal.blogspot.com/2009/02/summarization-with-lucene.html)
     * @author pablomendes adapted from sujitpal
   */
    public static List<Map.Entry<Term,Integer>> getTopTerms(IndexReader mReader) throws IOException {


        final Map<Term,Integer> frequencyMap = new HashMap<Term,Integer>();


        TermEnum terms = mReader.terms(); //TODO check what can we do about fields here. should have only top terms for context field?
        while (terms.next()) {
            Term term = terms.term();
            int frequency = mReader.docFreq(term); // DF
            frequencyMap.put(term, frequency);
        }


        // sort the term map by frequency descending
        Ordering descOrder = new Ordering<Map.Entry<Term,Integer>>() {
            public int compare(Map.Entry<Term,Integer> left, Map.Entry<Term,Integer> right) {
                return Ints.compare(right.getValue(), left.getValue());
            }
        };
        List<Map.Entry<Term,Integer>> sorted = descOrder.sortedCopy(frequencyMap.entrySet());


        return sorted;
    }


    /**
     * Warms up the index with the n most common terms.
     *
     * TODO warm up property with surface form + context
     *      Currently this only gets the most common terms (which are all probably from CONTEXT field, so not much gain is seen for the ICF
     *      Best would be to get a list of X most common surface forms and execute a query with that surface form and the top Y most common context terms
     * @param n
     * @return
     * @throws IOException
     */
    public void warmUp(int n) {


        try {
            long start = System.nanoTime();
            List<Map.Entry<Term,Integer>> terms = getTopTerms(mReader);
            LOG.info(String.format("Index has %s terms. Will warm up cache with the %s top terms.", terms.size(), n));


            for (int i=0; i<n; i++) {
                Term t = terms.get(i).getKey();
                //TODO For a second-level cache warmUp we need surface forms and context together, but this is app dependent
//                String[] commonSurfaceForms = {"*"}; //get a list of surface forms from somewhere
//                for (String sf: commonSurfaceForms) {
//                    getHits(new CandidateResourceQuery(new Term(LuceneManager.DBpediaResourceField.SURFACE_FORM.toString(), sf), t));
//                }


                //int count = terms.get(i).getValue();
                //LOG.trace(String.format("Text: %s, Count: %s", t.text(), count));


                getHits(new TermQuery(t), 3, 1000); // warm up first-level cache (lucene's own)


            }




            String time = new Long( (System.nanoTime() - start) / 1000000000 ).toString();
            //Files.write(Joiner.on("\n").join(terms), new File("/home/pablo/workspace/dbpa/trunk/src/web/topterms.tsv"), Charset.defaultCharset()); //TODO use one charset consistently throughout
            LOG.info(String.format("Warm up took %s ms.",time));
        } catch (Exception e) {
            LOG.error("Error warming up the cache. Ignoring. "); //TODO Throw SetupException
            e.printStackTrace();
        }
    }


    /**
     * for testing QueryAutoStopWordsAnalyzer
     * @return
     */
    public IndexReader getIndexReader() {
        return mReader;
    }


}
Source Code of org.dbpedia.spotlight.lucene.search.BaseSearcher

Related Classes of org.dbpedia.spotlight.lucene.search.BaseSearcher