Source Code of org.dbpedia.spotlight.lucene.index.CandidateIndexer

/*
 * Copyright 2011 DBpedia Spotlight Development Team
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 *  Check our project website for information on how to acknowledge the authors and how to contribute to the project: http://spotlight.dbpedia.org
 */


package org.dbpedia.spotlight.lucene.index;


import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.document.Document;
import org.apache.lucene.store.FSDirectory;
import org.dbpedia.spotlight.exceptions.IndexException;
import org.dbpedia.spotlight.lucene.LuceneManager;
import org.dbpedia.spotlight.model.Candidate;
import org.dbpedia.spotlight.model.DBpediaResource;
import org.dbpedia.spotlight.model.SpotlightConfiguration;
import org.dbpedia.spotlight.model.SurfaceForm;
import org.semanticweb.yars.nx.Node;
import org.semanticweb.yars.nx.parser.NxParser;


import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.List;
import java.util.Scanner;


/**
 * Class to index surrogates mapping (surface form -> set of resources) in lucene.
 * This does not index context (paragraphs around an entity mention). For that see @link{org.dbpedia.spotlight.index.OccurrenceContextIndexer}
 * @author pablomendes
 * @author maxjakob
 */
public class CandidateIndexer extends BaseIndexer<Candidate> {


    final static Log LOG = LogFactory.getLog(BaseIndexer.class);


    /**
     * Constructs a surrogate indexer that follows the policy specified by the {@link org.dbpedia.spotlight.lucene.LuceneManager} implementation used.
     * @param indexManager For a caseInsensitive behavior, use {@link org.dbpedia.spotlight.lucene.LuceneManager.CaseInsensitiveSurfaceForms}.
     * @throws java.io.IOException
     */
    public CandidateIndexer(LuceneManager indexManager) throws IOException {
        super(indexManager);
    }


    public void add(SurfaceForm surfaceForm, DBpediaResource resource) throws IndexException {
        Document newDoc = mLucene.createDocument(surfaceForm, resource);
        try {
            mWriter.addDocument(newDoc); // do not commit for faster indexing.
        } catch (IOException e) {
            throw new IndexException("Error adding candidate map to the index. ", e);
        }


        LOG.trace("Added to " + mLucene.directory().toString() + ": " + surfaceForm.toString() + " -> " + resource.toString());
    }


    public void add(List<SurfaceForm> surfaceForms, DBpediaResource resource) throws IndexException {
        Document newDoc = mLucene.createDocument(surfaceForms.get(0), resource);
        for (int i=1; i<surfaceForms.size(); i++) {
            newDoc.add(mLucene.getField(surfaceForms.get(i)));
        }
        try {
            mWriter.addDocument(newDoc); // do not commit for faster indexing.
        } catch (IOException e) {
            throw new IndexException("Error adding candidate map to the index. ", e);
        }


        LOG.trace("Added to " + mLucene.directory().toString() + ": " + surfaceForms.toString() + " -> " + resource.toString());
    }


    public void add(SurfaceForm surfaceForm, DBpediaResource resource, int nTimes) throws IndexException {
        Document newDoc = mLucene.createDocument(surfaceForm, resource, nTimes);
        try {
            mWriter.addDocument(newDoc); //TODO ATTENTION need to merge with existing doc if URI is already in index
        } catch (IOException e) {
            throw new IndexException("Error adding candidate map to the index. ", e);
        }


        LOG.trace("Added to " + mLucene.directory().toString() + ": " + surfaceForm.toString() + " -> " + resource.toString());
    }


    public void add(Candidate candidate) throws IndexException {
        add(candidate.surfaceForm(), candidate.resource());
    }


    /**
     * Index surrogates mapping from a triples file.
     */
    public void addFromNTfile(File surfaceFormsDataSet) throws IOException, IndexException {
        LOG.info("Indexing candidate map from "+surfaceFormsDataSet.getName()+" to "+mLucene.directory()+"...");


        NxParser nxParser = new NxParser(new FileInputStream(surfaceFormsDataSet), false);
        while (nxParser.hasNext()) {
            Node[] nodes = nxParser.next();
            String resourceString = nodes[0].toString().replace(SpotlightConfiguration.DEFAULT_NAMESPACE,"");
            String surfaceFormString = nodes[2].toString();
            List<SurfaceForm> surfaceForms = AddSurfaceFormsToIndex.fromTitlesToAlternativesJ(new SurfaceForm(surfaceFormString));
            add(surfaceForms, new DBpediaResource(resourceString));
        }


        LOG.info("Done.");
    }


    /**
     * Index surrogates mapping from a tab separated file.
     */
    public void addFromTSVfile(File surfaceFormsDataSet) throws IOException, IndexException {
        LOG.info("Indexing candidate map from "+surfaceFormsDataSet.getName()+" to "+mLucene.directory()+"...");


        String separator = "\t";
        Scanner tsvScanner = new Scanner(new FileInputStream(surfaceFormsDataSet), "UTF-8");


        while(tsvScanner.hasNextLine()) {
            String[] line = tsvScanner.nextLine().split(separator);
            String surfaceFormString = line[0];
            String resourceString = line[1];
            //TODO read counts and set DBpediaResource.support
            // int countSfRes = new Integer(line[2])
            DBpediaResource res = new DBpediaResource(resourceString);
            // DBpediaResource res = new DBpediaResource(resourceString,countSfRes)
            List<SurfaceForm> surfaceForms = AddSurfaceFormsToIndex.fromTitlesToAlternativesJ(new SurfaceForm(surfaceFormString));
            add(surfaceForms, res);
        }


        LOG.info("Done.");
    }


    /**
     * Index surrogates mapping from a tab separated file.
     */
    public void addFromCounts(File surfaceFormsDataSet, int minCount) throws IOException, IndexException {
        LOG.info("Indexing candidate map from "+surfaceFormsDataSet.getName()+" to "+mLucene.directory()+"...");


        String separator = "\t";
        Scanner tsvScanner = new Scanner(new FileInputStream(surfaceFormsDataSet), "UTF-8");


        while(tsvScanner.hasNextLine()) {
            String[] line = tsvScanner.nextLine().split(separator);
            try {
            String countAndSf = line[0];
            int count = Integer.valueOf(countAndSf.substring(0,7).trim());
            String resourceString = countAndSf.substring(8);
            String surfaceFormString = line[1];
            if (count>minCount)
                add(new SurfaceForm(surfaceFormString), new DBpediaResource(resourceString), count);
                List<SurfaceForm> surfaceForms = AddSurfaceFormsToIndex.fromTitlesToAlternativesJ(new SurfaceForm(surfaceFormString));
                add(surfaceForms, new DBpediaResource(resourceString));
            } catch(ArrayIndexOutOfBoundsException e) {
                LOG.error("Error parsing line: "+line);
                e.printStackTrace();
            }
        }


        LOG.info("Done.");
    }


    /**
     * Optimize the index to speed up queries.
     *
     * @throws java.io.IOException
     */
    public void optimize() throws IOException {
        LOG.info("Optimizing candidate map index in "+mLucene.directory()+" ...");
        mWriter.optimize();
        LOG.info("Done.");
    }


    /**
     * Example:
     * java CandidateIndexer candidateMap.count candidateMapCI 2 --case-sensitive --overwrite
     *
     * @param args
     * @throws IOException
     * @throws IndexException
     */
    public static void main(String[] args) throws IOException, IndexException {
        String inputFileName = args[0];  // DBpedia surface forms mapping
    String outputDirName = args[1];  // target Lucene mContextIndexDir
        int minCount = 3;
        String luceneManagerType = "case-insensitive"; //case-insensitive
        boolean shouldOverwrite = false;


        try { minCount = Integer.valueOf(args[2]); } catch(ArrayIndexOutOfBoundsException ignored) {}
        try { luceneManagerType = args[3]; } catch(ArrayIndexOutOfBoundsException ignored) {}
        try { shouldOverwrite = args[4].contains("overwrite"); } catch(Exception ignored) {}


        LuceneManager mLucene;
        if (luceneManagerType.contains("case-sensitive")) {
            mLucene = new LuceneManager.CaseSensitiveSurfaceForms(FSDirectory.open(new File(outputDirName)));
        } else if (luceneManagerType.contains("buffered")){
            mLucene = new LuceneManager.BufferedMerging(FSDirectory.open(new File(outputDirName)));
        } else if (luceneManagerType.contains("phonetic")){
            mLucene = new LuceneManager.PhoneticSurfaceForms(FSDirectory.open(new File(outputDirName)));
        } else {
            mLucene = new LuceneManager.CaseInsensitiveSurfaceForms(FSDirectory.open(new File(outputDirName)));
        }
        mLucene.shouldOverwrite = shouldOverwrite;




        CandidateIndexer si = new CandidateIndexer(mLucene);


        if (inputFileName.toLowerCase().endsWith(".nt")) {
            si.addFromNTfile(new File(inputFileName));
        }
        else if (inputFileName.toLowerCase().endsWith(".tsv")) {
            si.addFromTSVfile(new File(inputFileName));
        }
        else if (inputFileName.toLowerCase().endsWith(".count")) {
            si.addFromCounts(new File(inputFileName), minCount);
        }
        si.optimize();
        si.close();
    }
}
Source Code of org.dbpedia.spotlight.lucene.index.CandidateIndexer

Related Classes of org.dbpedia.spotlight.lucene.index.CandidateIndexer