Package opennlp.ccg.parse.postagger

Source Code of opennlp.ccg.parse.postagger.POSTagger

///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2010 Dennis N. Mehay
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//////////////////////////////////////////////////////////////////////////////
package opennlp.ccg.parse.postagger;

import opennlp.ccg.parse.tagger.Constants.TaggingAlgorithm;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import opennlp.ccg.parse.tagger.TaggedWord;
import opennlp.ccg.lexicon.Word;
import opennlp.ccg.parse.postagger.ml.POSPriorModel;
import opennlp.ccg.parse.postagger.ml.POSTagFex;
import opennlp.ccg.parse.tagger.ml.MaxentModel;
import opennlp.ccg.parse.tagger.ml.ZLMEM;
import opennlp.ccg.parse.tagger.sequencescoring.SequenceScorer;
import opennlp.ccg.util.Pair;
import opennlp.ccg.parse.tagger.util.ConfigFileProcessor;

/**
* Interface for POS taggers.
*
* @author Dennis N. Mehay
*/
public abstract class POSTagger {

    protected SequenceScorer posSeqMod = null;
    public TaggingAlgorithm alg = TaggingAlgorithm.FORWARDBACKWARD;
    public double beta = 1.0;
    protected boolean includeGold = false;

    /** In goes a list of {@code Word}s; out comes a list of {@code TaggedWord}s*/
    public abstract List<TaggedWord> tagSentence(List<Word> sentence);

    /** Set the tagging algorithm. */   
    public void setTaggingAlgorithm(TaggingAlgorithm newAlg) {
        alg = newAlg;
        posSeqMod.setAlgorithm(alg);
    }
   
    /** Set the beam width (by default, it's 1.0 -- i.e., single-best). */
    public void setBeta(double beta) {
        this.beta = beta;
    }

    /** Say whether or not we will include gold tags (e.g., for training). */
    public void setIncludeGold(boolean includeGoldOrNot) { includeGold = includeGoldOrNot; }
   
    /**
     * Filter the POS tags by the beta filter and return the (potentially) trimmed-down results.
     * It is assumed that the tags of the tagged word are sorted in descending order of
     * probability.
     */
    public List<TaggedWord> betaBestFilter(List<TaggedWord> sentence) {
        List<TaggedWord> res = new ArrayList<TaggedWord>(sentence.size());
        for (TaggedWord tw : sentence) {
            Word w = tw.getWord();
            double best = tw.getPOSTagging().get(0).a;
            int endIndex = 0;
            for (Pair<Double, String> tagging : tw.getPOSTagging()) {
                if (tagging.a >= (beta * best)) {
                    endIndex++;
                } else {
                    break;
                }
            }
            List<Pair<Double, String>> tmpTagging = new ArrayList<Pair<Double, String>>(tw.getPOSTagging().size());
            for (Pair<Double, String> tg : tw.getPOSTagging()) {
                tmpTagging.add(tg);
            }
            tmpTagging.subList(endIndex, tmpTagging.size()).clear();
            TaggedWord twTmp = new TaggedWord(w);
            twTmp.setPOSTagging(tmpTagging);
            res.add(twTmp);
        }
        return res;
    }
   
    /**
     * Build a POS tagger from a config file.
     * A non-dummy config file might contain:
     * ...
     * # this is a comment
     * taggerType=basic
     * priorModel=/home/.../posprior/prior.flm
     * # note that CaSE dOES Not matter for the key (but does for the value, e.g. a file name).
     * PRIORmodelvocab=/home/.../posprior/vocab.voc
     * # you can also repeat opions. the last one will take effect.
     * priormodelvocab=/home/.../posprior/vocab2.voc
     *
     * # did you see that empty line get ignored?
     * # notice that spaces around the '=' get ignored, as well.
     * maxentModel = /home/.../maxentmodels/myposmod.mod
     * # this last must be an ARPA-formatted n-gram model over POS tags (7-grams work well).
     * sequenceModel=/home/.../pos.lm
     * # lastly, the tagging beam width (1.0 means "single-best" -- i.e., a unitagger).
     * beta=0.1
     */
    public static POSTagger posTaggerFactory(String configFile) {
        POSTagger res = null;
        try {

            String[] pathKeys = { "maxentmodel", "priormodel", "priormodelvocab", "sequencemodel"};
            Map<String, String> opts = ConfigFileProcessor.readInConfig(configFile, pathKeys);
            if (opts.get("taggertype").equalsIgnoreCase("dummy")) {
                return new DummyPOSTagger();
            }
            MaxentModel mem = new ZLMEM(new File(opts.get("maxentmodel")));
            POSPriorModel posPrior = null;
      if (opts.get("priormodel") != null) {
    posPrior = new POSPriorModel(opts.get("priormodel"), opts.get("priormodelvocab"));
      }
            POSTagFex fexer = new POSTagFex(posPrior);

            res = new BasicPOSTagger(mem, fexer, opts.get("sequencemodel"));
            res.setBeta(Double.parseDouble(opts.get("beta")));
            TaggingAlgorithm alg = (opts.get("taggingalgorithm") == null || opts.get("taggingalgorithm").equals("forward-backward")) ?
                TaggingAlgorithm.FORWARDBACKWARD : TaggingAlgorithm.FORWARD;
            res.setTaggingAlgorithm(alg);

            res.setIncludeGold((opts.get("includegold") == null || opts.get("includegold").equals("false")) ? false : true);
           
        } catch (FileNotFoundException ex) {
            Logger.getLogger(POSTagger.class.getName()).log(Level.SEVERE, null, ex);
        } catch (IOException ex) {
            Logger.getLogger(POSTagger.class.getName()).log(Level.SEVERE, null, ex);
        }

        return res;
    }
}
TOP

Related Classes of opennlp.ccg.parse.postagger.POSTagger

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.