Package org.dbpedia.spotlight.tagging.lingpipe

Source Code of org.dbpedia.spotlight.tagging.lingpipe.LingPipeFactory

package org.dbpedia.spotlight.tagging.lingpipe;

import com.aliasi.hmm.HiddenMarkovModel;
import com.aliasi.hmm.HmmDecoder;
import com.aliasi.sentences.SentenceModel;
import com.aliasi.tag.Tagger;
import com.aliasi.tokenizer.RegExTokenizerFactory;
import com.aliasi.tokenizer.TokenizerFactory;
import com.aliasi.util.Streams;

import java.io.*;

/**
* Factory for LingPipe part-of-speech tagging, sentence segmentation and tokenization models.
*
* <p>Note that the part-of-speech tagger is currently held as Singleton instance.
* This is only possible if the underlying HM model is thread-safe, for more information
* on this issue, cf. the LingPipe
* <a href="http://alias-i.com/lingpipe/docs/api/com/aliasi/hmm/HmmDecoder.html">HmmDecoder class</a>.
* </p>
*
* @author Joachim Daiber
*/

public class LingPipeFactory {

  /**
   * Default RegEx-based word tokenizer.
   */
  private TokenizerFactory tokenizerFactory
      = new RegExTokenizerFactory("(-|'|\\d|\\p{L})+|\\S");

  private File sentenceModelFile;
  private SentenceModel sentenceModel;

 
  /**
   * The Hidden Markov model part-of-speech tagger.
   */
  private HmmDecoder hmmDecoder;

 
  /**
   * Create a new LingPipe factory with the tagger model file and sentence model provided
   * as parameters. On instantiation of the object, the HMM will be read from the tagger
   * model file and the part-of-speech tagger and sentence segmentizer are created.
   *
   * @param taggerModelFile File containing the serialized hidden Markov model for PoS tagging
   * @param sentenceModel Model for sentence segmentation
   * @throws IOException The file was not found or could not be read.
   */

  public LingPipeFactory(File taggerModelFile, SentenceModel sentenceModel) throws IOException {
    this.setTaggerModelFile(taggerModelFile);
    this.setSentenceModel(sentenceModel);
  }

  /**
   * Set the tagger model file to taggerModelFile, read the model and
   * create the HmmDecoder.
   *
   * @param taggerModelFile serialized HM model file
   * @throws IOException The file was not found or could not be read.
   */
  private void setTaggerModelFile(File taggerModelFile) throws IOException {

    FileInputStream fileIn;
    HiddenMarkovModel hmm;

    try {
      fileIn = new FileInputStream(taggerModelFile);
      ObjectInputStream objIn = new ObjectInputStream(fileIn);
      hmm = (HiddenMarkovModel) objIn.readObject();
      Streams.closeInputStream(objIn);
    } catch (FileNotFoundException e) {
      throw new IOException("Could not find POS tagger model "+taggerModelFile,e);
    } catch (ClassNotFoundException e) {
      throw new IOException("Could not decode POS tagger model in "+taggerModelFile,e);
    }

    hmmDecoder = new HmmDecoder(hmm);
  }


  private void setSentenceModel(SentenceModel sentenceModel) {
    this.sentenceModel = sentenceModel;
  }

  private void setSentenceModelFile(File sentenceModelFile) {
    this.sentenceModelFile = sentenceModelFile;
    //TODO: enable serialized sentence models
  }

  /**
   * Get the Singleton part-of-speech tagger instance.
   *
   * @return part-of-speech tagger instance
   */
  public Tagger getPoSTaggerInstance() {
    return hmmDecoder;
  }

  /**
   * Get the Singleton sentence segmentizer instance.
   *
   * @return sentence segmentizer instance.
   */
  public SentenceModel getSentenceModelInstance() {
    return sentenceModel;
  }

  public void setTokenizerFactory(TokenizerFactory tokenizerFactory) {
    tokenizerFactory = tokenizerFactory;
  }


  /**
   * Get the tokenizer factory instance that can be used for word
   * tokenization.
   *
   * @return word tokenizer instance.
   */
  public TokenizerFactory getTokenizerFactoryInstance() {
    return tokenizerFactory;
  }

}
TOP

Related Classes of org.dbpedia.spotlight.tagging.lingpipe.LingPipeFactory

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.