Package org.dbpedia.spotlight.disambiguate

Source Code of org.dbpedia.spotlight.disambiguate.CustomScoresDisambiguator

/**
* Copyright 2011 Pablo Mendes, Max Jakob
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.dbpedia.spotlight.disambiguate;

import com.google.common.collect.Ordering;
import com.google.common.primitives.Doubles;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.store.FSDirectory;
import org.dbpedia.spotlight.exceptions.InputException;
import org.dbpedia.spotlight.exceptions.ItemNotFoundException;
import org.dbpedia.spotlight.exceptions.SearchException;
import org.dbpedia.spotlight.exceptions.TimeoutException;
import org.dbpedia.spotlight.io.DataLoader;
import org.dbpedia.spotlight.lucene.search.LuceneCandidateSearcher;
import org.dbpedia.spotlight.model.*;
import org.dbpedia.spotlight.lucene.LuceneManager;

import java.io.File;
import java.io.IOException;
import java.util.*;

/**
* Preliminary class that uses a list of weights, one per DBpediaResource, as the only cue for deciding which surrogate to choose for a given surface form occurrence.
* For example, this has been used with a list of prior probabilities computed offline in a Hadoop/Pig job.
*
* @author pablomendes
*/
public class CustomScoresDisambiguator implements Disambiguator {

    Log LOG = LogFactory.getLog(this.getClass());

    Map<String,Double> scores = new HashMap<String,Double>();

    CandidateSearcher surrogateSearcher;

    public CustomScoresDisambiguator(CandidateSearcher surrogates, DataLoader loader) {
        this.surrogateSearcher = surrogates;
        if (loader!=null)
            scores = loader.loadPriors();
        LOG.debug(loader+": "+ scores.size()+" scores loaded.");
    }

    public List<SurfaceFormOccurrence> spotProbability(List<SurfaceFormOccurrence> sfOccurrences) {
        return sfOccurrences; //FIXME IMPLEMENT
    }

    @Override
    public DBpediaResourceOccurrence disambiguate(SurfaceFormOccurrence sfOccurrence) throws SearchException, ItemNotFoundException, InputException {
        List<DBpediaResourceOccurrence> candidates = bestK(sfOccurrence, 1);
        if (candidates.size()==0)
            throw new ItemNotFoundException(sfOccurrence.surfaceForm()+" was not found in the index.");
        return candidates.get(0);
    }

    public List<DBpediaResourceOccurrence> disambiguate(List<SurfaceFormOccurrence> sfOccurrences) throws SearchException, InputException {
        List<DBpediaResourceOccurrence> disambiguated = new ArrayList<DBpediaResourceOccurrence>();
        for (SurfaceFormOccurrence sfOcc: sfOccurrences) {
            try {
                disambiguated.add(disambiguate(sfOcc));
            } catch (TimeoutException e) {
                LOG.error("Could not disambiguate. Surface form took too long: "+sfOcc.surfaceForm()+": "+e);
            } catch (ItemNotFoundException e) {
                LOG.error("Could not disambiguate. Surface form not found: "+sfOcc.surfaceForm()+": "+e);
            }
        }
        return disambiguated;
    }

    @Override
    public List<DBpediaResourceOccurrence> bestK(SurfaceFormOccurrence sfOccurrence, int k) throws SearchException, ItemNotFoundException {
        Set<DBpediaResource> candidates = surrogateSearcher.getCandidates(sfOccurrence.surfaceForm());

        if (candidates.size()==0)
            return new LinkedList<DBpediaResourceOccurrence>();

        List<DBpediaResourceOccurrence> all = getScores(sfOccurrence, candidates);

        Ordering descOrder = new Ordering<DBpediaResourceOccurrence>() {
            public int compare(DBpediaResourceOccurrence left, DBpediaResourceOccurrence right) {
                return Doubles.compare(right.similarityScore(), left.similarityScore());

            }
        };

        return descOrder.sortedCopy(all).subList(0, Math.min(k, all.size()));
    }

    protected List<DBpediaResourceOccurrence> getScores(SurfaceFormOccurrence sfOccurrence, Set<DBpediaResource> candidates) {
         List<DBpediaResourceOccurrence> occurrences = new ArrayList<DBpediaResourceOccurrence>();
        try {
            for(DBpediaResource r: candidates) {
                Double score = scores.get(r);
                if (score ==null) {
                    LOG.debug("No score found for URI: "+r);
                    score = 0.0;
                }
                DBpediaResourceOccurrence occ = new DBpediaResourceOccurrence(r,
                    sfOccurrence.surfaceForm(),
                    sfOccurrence.context(),
                    sfOccurrence.textOffset(),
                    score);
                occurrences.add(occ);
            }
        } catch (NullPointerException e2) {
            LOG.error("NullPointerException here. Resource: "+candidates);
        }
        return occurrences;
    }




    public static void main(String[] args) throws IOException {
      String luceneIndexFileName = "data/apple-example/LuceneIndex-apple50_test";
      String resourcePriorsFileName = "data/apple-example/3apples-scores.tsv";

      // Lucene Manager - Controls indexing and searching
      LuceneManager luceneManager = new LuceneManager(FSDirectory.open(new File(luceneIndexFileName)));

        try {
            new CustomScoresDisambiguator(new LuceneCandidateSearcher(luceneManager, false), new DataLoader(new DataLoader.TSVParser(), new File("data/Distinct-surfaceForm-By-uri.grouped")));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }


    @Override
    public String name() {
        return this.getClass().getSimpleName();
    }

    @Override
    public int support(DBpediaResource resource) throws SearchException {
        // for the WikiPageContext, the training size is always 1 page per resource
        return 1;
    }

    @Override
    public List<Explanation> explain(DBpediaResourceOccurrence goldStandardOccurrence, int nExplanations) throws SearchException {
        throw new SearchException("Not implemented yet.");
    }

    @Override
    public int ambiguity(SurfaceForm sf) throws SearchException {
        int s = 0;
        try {
            s = surrogateSearcher.getCandidates(sf).size();
        } catch (ItemNotFoundException e) {
            s = 0; // surface form not found
        }
        return s;
    }

    @Override
    public int contextTermsNumber(DBpediaResource resource) throws SearchException {
        return 0// prior works without context
    }

    @Override
    public double averageIdf(Text context) throws IOException {
        throw new IOException(this.getClass()+" has no index available to calculate averageIdf");
    }
}
TOP

Related Classes of org.dbpedia.spotlight.disambiguate.CustomScoresDisambiguator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.