Source Code of com.bericotech.clavin.extractor.ApacheExtractor

package com.bericotech.clavin.extractor;


import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;


import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;


/*#####################################################################
 * 
 * CLAVIN (Cartographic Location And Vicinity INdexer)
 * ---------------------------------------------------
 * 
 * Copyright (C) 2012-2013 Berico Technologies
 * http://clavin.bericotechnologies.com
 * 
 * ====================================================================
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 * 
 * ====================================================================
 * 
 * ApacheExtractor.java
 * 
 *###################################################################*/


/**
 * Extracts location names from unstructured text documents using a
 * named entity recognizer (Apache OpenNLP Name Finder).
 *
 */
public class ApacheExtractor implements LocationExtractor {
    
    // the actual named entity recognizer (NER) object
    private NameFinderME nameFinder;
    
    // used to tokenize plain text into the OpenNLP format
    private TokenizerME tokenizer;


    // used to split the input into sentences before finding names
    private SentenceDetectorME sentenceDetector;
    
    // resource files used by Apache OpenNLP Name Finder
    private static final String pathToNERModel = "/en-ner-location.bin";
    private static final String pathToTokenizerModel = "/en-token.bin";
    private static final String pathToSentenceDetectorModel = "/en-sent.bin";


    
    /**
     * Builds an {@link ApacheExtractor} by instantiating the OpenNLP
     * Name Finder and Tokenizer.
     * 
     * @throws IOException 
     */
    public ApacheExtractor() throws IOException {
        nameFinder = new NameFinderME(new TokenNameFinderModel(ApacheExtractor.class.getResourceAsStream(pathToNERModel)));
        tokenizer = new TokenizerME(new TokenizerModel(ApacheExtractor.class.getResourceAsStream(pathToTokenizerModel)));
        sentenceDetector = new SentenceDetectorME(new SentenceModel(ApacheExtractor.class.getResourceAsStream(pathToSentenceDetectorModel)));
    }
    
    /**
     * Extracts location names from unstructured text using the named
     * entity recognizer (NER) feature provided by the Apache OpenNLP
     * Name Finder.
     * 
     * @param plainText     Contents of text document
     * @return List of location names and positions
     */
    public List<LocationOccurrence> extractLocationNames(String plainText) {
        if(plainText == null) {
            throw new IllegalArgumentException("plaintext input to extractLocationNames should not be null");
        }


        List<LocationOccurrence> nerResults = new ArrayList<LocationOccurrence>();


        // The values used in these Spans are string character offsets
        Span sentenceSpans[] = sentenceDetector.sentPosDetect(plainText);


        // Each sentence gets processed on its own
        for (Span sentenceSpan : sentenceSpans) {


            // find the start and end position of this sentence in the document
            String sentence = plainText.substring(sentenceSpan.getStart(), sentenceSpan.getEnd());


            // tokenize the text into the required OpenNLP format
            String[] tokens = tokenizer.tokenize(sentence);


            //the values used in these Spans are string character offsets of each token from the sentence beginning
            Span[] tokenPositionsWithinSentence = tokenizer.tokenizePos(sentence);


            // find the location names in the tokenized text
            // the values used in these Spans are NOT string character offsets, they are indices into the 'tokens' array
            Span names[] = nameFinder.find(tokens);




            //for each name that got found, create our corresponding occurrence
            for (Span name : names) {


                //find offsets relative to the start of the sentence
                int beginningOfFirstWord = tokenPositionsWithinSentence[name.getStart()].getStart();
                // -1 because the high end of a Span is noninclusive
                int endOfLastWord = tokenPositionsWithinSentence[name.getEnd() - 1].getEnd();


                //to get offsets relative to the document as a whole, just add the offset for the sentence itself
                int startOffsetInDoc = sentenceSpan.getStart() + beginningOfFirstWord;
                int endOffsetInDoc = sentenceSpan.getStart() + endOfLastWord;


                //look back into the original input string to figure out what the text is that I got a hit on
                String nameInDocument = plainText.substring(startOffsetInDoc, endOffsetInDoc);


                // add to List of results to return
                nerResults.add(new LocationOccurrence(nameInDocument, startOffsetInDoc));
            }


        }


        // this is necessary to maintain consistent results across
        // multiple runs on the same data, which is what we want
        nameFinder.clearAdaptiveData();


        return nerResults;
    }


}
Source Code of com.bericotech.clavin.extractor.ApacheExtractor

Related Classes of com.bericotech.clavin.extractor.ApacheExtractor