Source Code of opennlp.ccg.parse.postagger.ml.POSPriorModel

///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2010 Dennis N. Mehay
// 
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// 
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
// 
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//////////////////////////////////////////////////////////////////////////////
package opennlp.ccg.parse.postagger.ml;


import opennlp.ccg.parse.supertagger.util.ProbPairComparator;


import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import opennlp.ccg.lexicon.DefaultTokenizer;
import opennlp.ccg.lexicon.Word;
import opennlp.ccg.ngrams.ConditionalProbabilityTable;
import opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator;
import opennlp.ccg.util.Interner;
import opennlp.ccg.util.Pair;


/**
 * (c) (2009) Dennis N. Mehay
 * @author Dennis N. Mehay
 * 
 * Model for predicting p(POS | word).  Uses an ARPA-formatted
 * SRILM-trained "unigram" factored LM for this, where each "unigram" is
 * a bundle of word:pos.
 */
public class POSPriorModel extends ConditionalProbabilityTable {


    public static final String WORD = DefaultTokenizer.WORD_ATTR;
    public static final String POS_TAG = DefaultTokenizer.POS_ATTR;
    private Interner<Pair<String, String>> pairs = new Interner<Pair<String, String>>();
    /** 
     * Re-usable list for attr-val pairs of word-pos-supertag inputs to the prior model
     * (i.e., for predicting p(STag | word, POS). 
     */
    public List<Pair<String, String>> attrVals = new ArrayList<Pair<String, String>>(5);
    /** 
     * A comparator for sorting Pair<Double,String>'s where the Double is a probability
     * (effectively sorts by descending order of probability).
     */
    private ProbPairComparator ppcomp = new ProbPairComparator();
    /** All the priors. Reference them when getting beta-best, beta-worst, etc. */
    List<Pair<Double, String>> priors = new ArrayList<Pair<Double, String>>(1000);


    /** String[] of all possible POS outcomes. */
    private String[] posVocab = null;


    /** Construct a prior model with the FLM config file and corresponding vocab file. */
    public POSPriorModel(String flmFile, String vocabFile) throws IOException {
        super(flmFile);
        String post = null;
        BufferedReader br = new BufferedReader(new FileReader(new File(vocabFile)));
        post = br.readLine().trim();


        // get next POS tag from the vocab.
        while ((post != null) && !post.trim().startsWith(POS_TAG + "-")) {
            post = br.readLine();
        }
        if (post != null) {
            post = post.trim().split("-")[1];
        }


        Collection<String> allSupertags = new HashSet<String>();


        // find out how many outcomes we have.
        int cnt = 0;
        while (post != null) {
            cnt++;
            allSupertags.add(post);
            while ((post != null) && !post.trim().startsWith(POS_TAG + "-")) {
                post = br.readLine();
            }
            if (post != null) {
                post = post.trim().split("-")[1];
            }
        }


        // initialize the arrays to this size.
        posVocab = new String[cnt];
        


        cnt = 0;
        // fill the vocab array with all possible POS tags.
        for (String posTag : allSupertags) {
            posVocab[cnt++] = posTag.intern();
        }
    }


    /** Get the prior probability of this POS/word combo. */
    public double getPriorOf(String pos, String word) {
        attrVals.clear();
        Pair<String, String> surfaceForm = pairs.intern(new Pair<String, String>(WORD, word.intern()));
        attrVals.add(surfaceForm);
        Pair<String, String> partOfSpeech = pairs.intern(new Pair<String, String>(POS_TAG, pos.intern()));
        attrVals.add(partOfSpeech);
        return score(attrVals);
    }


    /** Get the POS-dict restricted prior distribution (sorted descending by prob.) */
    public List<Pair<Double, String>> getPriors(Word w) {
        List<Pair<Double, String>> sortedTags = new ArrayList<Pair<Double, String>>(posVocab.length);
        for (String postag : posVocab) {
            sortedTags.add(new Pair<Double, String>(getPriorOf(postag, w.getForm()), postag));
        }
        Collections.sort(sortedTags, ppcomp);
        return sortedTags;
    }
    /* added by DCE, to facilitate use in hypertagging
     * Identical to above method, but accepts a String (name of EP) rather than
     * a Word object.
     */
    public List<Pair<Double, String>> getPriors(String s) {
      s.intern();
      List<Pair<Double, String>> sortedTags = new ArrayList<Pair<Double, String>>(posVocab.length);
      for (String postag : posVocab) {
          sortedTags.add(new Pair<Double, String>(getPriorOf(postag, s), postag));
      }
      Collections.sort(sortedTags, ppcomp);
      return sortedTags;
    }
    
    public static void main(String[] args) throws IOException {
        String usage = "\nPOSPriorModel -vocab <vocabfile> (-c <corpus> [default = <stdin>]) (-o <output> [default = <stdout>])\n";
        if (args.length > 0 && args[0].equals("-h")) {
            System.out.println(usage);
            System.exit(0);
        }


        SRILMFactoredBundleCorpusIterator in = null;
        BufferedWriter out = null;
        BufferedWriter voc = null;


        try {
            String inputCorp = "<stdin>", output = "<stdout>", vocabFile = "vocab.voc";


            for (int i = 0; i < args.length; i++) {
                if (args[i].equals("-c")) { inputCorp = args[++i]; continue; }
                if (args[i].equals("-o")) { output = args[++i];    continue; }
                if (args[i].equals("-vocab")) { vocabFile = args[++i];  continue; }
                System.out.println("Unrecognized option: " + args[i]);
            }


            try {
                in = new SRILMFactoredBundleCorpusIterator(
                        (inputCorp.equals("<stdin>")) ? new BufferedReader(new InputStreamReader(System.in)) : new BufferedReader(new FileReader(new File(inputCorp))));
            } catch (FileNotFoundException ex) {
                System.err.print("Input corpus " + inputCorp + " not found.  Exiting...");
                Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex);
                System.exit(
                        -1);
            }


            try {
                out = (output.equals("<stdout>")) ? new BufferedWriter(new OutputStreamWriter(System.out)) : new BufferedWriter(new FileWriter(new File(output)));
            } catch (IOException ex) {
                System.err.print("Output file " + output + " not found.  Exiting...");
                Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex);
                System.exit(
                        -1);
            }


            try {
                voc = new BufferedWriter(new FileWriter(new File(vocabFile)));
            } catch (IOException ex) {
                Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex);
            }


            Map<String, Integer> vocab = new HashMap<String, Integer>();
            for (List<Word> inLine : in) {
                for (Word w : inLine) {
                    String pos = POS_TAG + "-" + DefaultTokenizer.escape(w.getPOS()),
                        wform = WORD + "-" + DefaultTokenizer.escape(w.getForm());


                    vocab.put(pos, (vocab.get(pos) == null) ? 1 : vocab.get(pos) + 1);
                    vocab.put(wform, (vocab.get(wform) == null) ? 1 : vocab.get(wform) + 1);
                    out.write(wform + ":" + pos + " ");
                }
                out.write(System.getProperty("line.separator"));
            }
            out.flush();


            for (String str : vocab.keySet()) {
                    voc.write(str + System.getProperty("line.separator"));
            }
            voc.flush();
        } finally {
            try {
                out.close();
                in.close();
                voc.close();
            } catch (IOException ex) {
                Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
    }
}
Source Code of opennlp.ccg.parse.postagger.ml.POSPriorModel

Related Classes of opennlp.ccg.parse.postagger.ml.POSPriorModel