Source Code of edu.stanford.nlp.tagger.maxent.ReadDataTagged

/**
 * Title:        StanfordMaxEnt<p>
 * Description:  A Maximum Entropy Toolkit<p>
 * Copyright:    Copyright (c) Kristina Toutanova<p>
 * Company:      Stanford University<p>
 */
package edu.stanford.nlp.tagger.maxent;


import edu.stanford.nlp.io.NumberRangesFileFilter;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.WordTag;
import edu.stanford.nlp.trees.*;


import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.StringTokenizer;




/**
 * Reads tagged data from a file and creates a dictionary.
 * The tagged data has to be whitespace-separated items, with the word and
 * tag split off by a delimiter character, which is found as the last instance
 * of the delimiter character in the item.
 *
 * @author Kristina Toutanova
 * @version 1.0
 */
public class ReadDataTagged {


  private final String filename;
  private ArrayList<DataWordTag> v = new ArrayList<DataWordTag>();
  private int numElements = 0;
  private final PairsHolder pairs;


  private static final String eosWord = "EOS";
  private static final String eosTag = "EOS";
  //TODO: make a class DataHolder that holds the dict, tags, pairs, etc, for tagger
  // and pass it around


  protected ReadDataTagged(TaggerConfig config, PairsHolder pairs) {
    this.pairs = pairs;
    this.filename = config.getFile();
    try {
      if (config.getInitFromTrees()) {
        initFromTrees(config);
      } else {
        init(config.getTagSeparator(), config.getEncoding());
      }
    } catch (Exception e) {
      System.err.println("Error reading data from " + filename);
      e.printStackTrace();
    }
  }




  /** Frees the memory that is stored in this object by dropping the word-tag data.
   */
  void release() {
    v = null;
  }




  DataWordTag get(int index) {
    return v.get(index);
  }


  private void initFromTrees(TaggerConfig config) throws Exception {
    System.err.println("Training a tagger from treebank" + filename);
    ArrayList<String> words = new ArrayList<String>();
    ArrayList<String> tags = new ArrayList<String>();
    int numSentences = 0;
    int numWords = 0;


    int maxLen = Integer.MIN_VALUE;
    int minLen = Integer.MAX_VALUE;
    TreeReaderFactory trf = new LabeledScoredTreeReaderFactory();
    TreeTransformer transformer = config.getTreeTransformer();
    TreeNormalizer normalizer = config.getTreeNormalizer();
    DiskTreebank treebank = new DiskTreebank(trf, config.getEncoding());
    if (config.getTreeRange() != null) {
      treebank.loadPath(filename, new NumberRangesFileFilter(config.getTreeRange(), true));
    } else {
      treebank.loadPath(filename);
    }
    for (Tree t : treebank) {
      if (normalizer != null) {
        t = normalizer.normalizeWholeTree(t, t.treeFactory());
      }
      if (transformer != null) {
        t = t.transform(transformer);
      }
      Sentence<TaggedWord> yield = t.taggedYield();
      for(TaggedWord tw : yield) {
        if(tw != null) {
          words.add(tw.word());
          tags.add(tw.tag());
          if (!GlobalHolder.tagTokens.containsKey(tw.tag())) {
            GlobalHolder.tagTokens.put(tw.tag(), new HashSet<String>());
          }
          GlobalHolder.tagTokens.get(tw.tag()).add(tw.word());
        }
      }
      maxLen = (yield.length() > maxLen ? yield.length() : maxLen);
      minLen = (yield.length() < minLen ? yield.length() : minLen);
      words.add(eosWord);
      tags.add(eosTag);
      numElements = numElements + yield.length() + 1;
      // iterate over the words in the sentence
      for (int i = 0; i < yield.length() + 1; i++) {
        History h = new History(numWords+numSentences, numWords+numSentences + yield.length(), numWords+numSentences + i, pairs);
        String tag = tags.get(i);
        String word = words.get(i);
        pairs.add(new WordTag(word,tag));
        int y = GlobalHolder.tags.add(tag);
        DataWordTag dat = new DataWordTag(h, y);
        v.add(dat);
        GlobalHolder.dict.add(word, tag);


      }
      numSentences++;
      numWords += yield.length();
      words.clear();
      tags.clear();
      if ((numSentences % 100000) == 0) System.err.println("Read " + numSentences + " sentences, min " + minLen + " words, max " + maxLen + " words ... [still reading]");


    }


    System.err.println("Read " + numWords + " words from " + filename + " [done].");
    System.err.println("Read " + numSentences + " sentences, min " + minLen + " words, max " + maxLen + " words.");
  }




  // Read the data.
  private void init(String tagSeparator, String encoding) throws IOException {
    ArrayList<String> words = new ArrayList<String>();
    ArrayList<String> tags = new ArrayList<String>();
    int numSentences = 0;
    int numWords = 0;
    int endPos = 0;
    int prevPos = 0;
    BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(filename), encoding));


    int maxLen = Integer.MIN_VALUE;
    int minLen = Integer.MAX_VALUE;


    //loop over sentences
    for  (String s; (s = in.readLine()) != null; ) {
      StringTokenizer st = new StringTokenizer(s);
      //loop over words in a single sentence


      while (st.hasMoreTokens()) {
        String token = st.nextToken();
        numWords++;
        int indexUnd = token.lastIndexOf(tagSeparator);
        if (indexUnd < 0) {
          throw new RuntimeException("Data format error: can't find delimiter \"" + tagSeparator + "\" in word \"" + token + "\" (line " + numSentences + " of " + filename + ')');
        }
        String word = token.substring(0, indexUnd).intern();
        String tag = token.substring(indexUnd + 1).intern();
        words.add(word);
        tags.add(tag);
        if(!GlobalHolder.tagTokens.containsKey(tag)) {
          GlobalHolder.tagTokens.put(tag, new HashSet<String>());
        }
        GlobalHolder.tagTokens.get(tag).add(word);
        endPos++;
      }


      if (endPos > maxLen) maxLen = endPos;
      if (endPos < minLen) minLen = endPos;


      // add the EOS as well
      words.add(eosWord);
      tags.add(eosTag);
      numElements = numElements + endPos + 1;


      // iterate over the words in the sentence
      for (int i = 0; i < endPos + 1; i++) {
        History h = new History(prevPos, prevPos + endPos, prevPos + i, pairs);
        String tag = tags.get(i);
        String word = words.get(i);
        pairs.add(new WordTag(word,tag));
        int y = GlobalHolder.tags.add(tag);
        DataWordTag dat = new DataWordTag(h, y);
        v.add(dat);
        GlobalHolder.dict.add(word, tag);
      }


      numSentences++;
      prevPos += endPos + 1;
      endPos = 0;
      words.clear();
      tags.clear();
      if ((numSentences % 100000) == 0) System.err.println("Read " + numSentences + " sentences, min " + minLen + " words, max " + maxLen + " words ... [still reading]");
    }


    in.close();
    System.err.println("Read " + numWords + " words from " + filename + " [done].");
    System.err.println("Read " + numSentences + " sentences, min " + minLen + " words, max " + maxLen + " words.");
  }


  /*
  public void addPartTakingVerb(History h, String tag) {
    if (!(tag.equals(rpTag) || tag.equals(rbTag) || tag.equals(inTag))) {
      return;
    }
    String cWord = ExtractorFrames.cWord.extract(h);
    if (GlobalHolder.dict.getCount(cWord, rpTag) == 0) {
      return;
    }
    String verb = ExtractorParticles.extractLV(h, 1);
    if (!verb.startsWith("NA"))
    // add it
    {
      GlobalHolder.dict.addVPTaking(verb, tag, cWord);
    }
  }




  public void addPRRBINTakingVerb(History h, String tag) {
    if (!(tag.equals(rpTag) || tag.equals(rbTag) || tag.equals(inTag))) {
      return;
    }
    String word = ExtractorFrames.cWord.extract(h);
    String verb = ((ExtractorLastVerb) (ExtractorFrames.lastVerb)).extractLV(h, 6);
    // add it
    GlobalHolder.dict.add(word + "|" + verb, tag); //correct later
  }


  public void addThatTakingVerb(History h, String word, String tag) {
    if (!(word.equals(thatWord))) {
      return;
    }
    if (!(tag.equals(inTag))) {
      return;
    }
    String s = ExtractorFrames.lastVerbThat.extract(h, 0);
    String verb = ExtractorFrames.lastVerbThat.extractLV(h);
    if (!s.equals("0"))
    // add it
    {
      GlobalHolder.dict.addVThatTaking(verb);
    }
  }
  */


  /** Returns the number of tokens in the data read, which is the number of words
   *  plus one end sentence token per sentence.
   *  @return The number of tokens in the data
   */
  public int getSize() {
    return numElements;
  }


  /*
  public static void main(String[] args) {
  ReadDataTagged rDT1=new ReadDataTagged("testhuge.txt");
  Dictionary dOld=GlobalHolder.dict;
  GlobalHolder.dict=new Dictionary();
  ReadDataTagged rDT2=new ReadDataTagged("trainhuge.txt");
  // how many ambiguous praticles are there in testhuge, how many amb. INs, how many amb. RBs
  // how many of the amb. praticles in trainhuge have appeared as a particle with the same verb before
  // how many of the amb. particles have appeared with the same verb before as INs or RBs
  // how many of the RPs have appeared with the same verb before
  // read in sequentially the testhuge dictionary
  int numARP=0;
  int numAIN=0;
  int numARB=0;
  int[] napp=new int[3];
  int[][] appbefore=new int[3][3]; // RP RB IN
  int[][] nappbefore=new int[3][3];
  Object[] arr=dOld.dict.keySet().toArray();


   for(int i=0;i<arr.length;i++)
  {
    String word=(String)arr[i];
    if(word.indexOf("|")==-1) continue;
    String wordA=word.substring(0,word.indexOf("|"));
    if(GlobalHolder.dict.sum(wordA)==0)
    {
     System.out.println(" unknown "+wordA);
     continue;
    }
    if(GlobalHolder.dict.getTags(wordA).length==1) continue; // unambiguous
    numARP+=dOld.getCount(word,rpTag);
    numARB+=dOld.getCount(word,rbTag);
    numAIN+=dOld.getCount(word,inTag);
    int numRP=dOld.getCount(word,rpTag);
    int numRB=dOld.getCount(word,rbTag);
    int numIN=dOld.getCount(word,inTag);
    TagCount tC=GlobalHolder.dict.get(word);
    if(tC==null){
    napp[0]+=numRP;
    napp[1]+=numRB;
    napp[2]+=numIN;
    continue;
    }
    else{// the word was seen before with the same verb
     int numRPM=GlobalHolder.dict.getCount(word,rpTag);
     int numRBM=GlobalHolder.dict.getCount(word,rbTag);
     int numINM=GlobalHolder.dict.getCount(word,inTag);
     if(numRPM>0){
      appbefore[0][0]+=numRP;
      appbefore[1][0]+=numRB;
      appbefore[2][0]+=numIN;
      }
     else{
      nappbefore[0][0]+=numRP;
      nappbefore[1][0]+=numRB;
      nappbefore[2][0]+=numIN;
        }
      if(numRBM>0){
      appbefore[0][1]+=numRP;
      appbefore[1][1]+=numRB;
      appbefore[2][1]+=numIN;
      }
     else{
      nappbefore[0][1]+=numRP;
      nappbefore[1][1]+=numRB;
      nappbefore[2][1]+=numIN;
        }


      if(numINM>0){
      appbefore[0][2]+=numRP;
      appbefore[1][2]+=numRB;
      appbefore[2][2]+=numIN;
      }
     else{
      nappbefore[0][2]+=numRP;
      nappbefore[1][2]+=numRB;
      nappbefore[2][2]+=numIN;
        }


    } // else the word was seen before


   }// for


   System.out.println(numARP+" "+numARB+" "+numAIN);
   System.out.println(" not appeared at all before "+ napp[0]+" "+napp[1]+" "+napp[2]);
   for(int i=0;i<3;i++)
    for(int j=0;j<3;j++){
   System.out.println(" napp as this before "+i+" "+j+" "+nappbefore[i][j]);
   }
   for(int i=0;i<3;i++)
    for(int j=0;j<3;j++){
   System.out.println(" napp as this before "+i+" "+j+" "+nappbefore[i][j]);
   System.out.println(" appeared as this before " +i+" "+j+" "+appbefore[i][j]);
   }


    //saveTreebankToFile(args[0],(Integer.parseInt(args[1]), Integer.parseInt(args[2]));


    //System.out.println(" ambg "+countAmbiguous+" unambg "+countUnAmbiguous+" disamb "+countAmbDisamb);


  //}
  */


  /**
   * Save the treebank sections to a file in format [word_tag ]+
   * per sentence, one sentence per line.
   */
  /*
  public static void saveTreebankToFile(String filename, int start,int end) {
    String treebankPath="/dfs/ah/1/tmp/klein/corpora/Treebank3/parsed/mrg/wsj";


    Treebank trainTreebank = new MemoryTreebank(new TreeReaderFactory() {
        public TreeReader newTreeReader(Reader in) {
          return new PennTreeReader(in,
                        new LabeledScoredTreeFactory(
                               new StringLabelFactory()),
                               new BobChrisTreeNormalizer());
        }
      });
    FileFilter filter=new NumberRangeFileFilter(start,end,true);
    trainTreebank.loadPath(treebankPath, filter);
    try{
      PrintWriter bw=new PrintWriter(new BufferedWriter(new FileWriter(filename)));


      for (Tree nextTree : trainTreebank) {
        Sentence s=nextTree.taggedYield();


        //System.out.println(s.toString());


        for (int i=0;i<s.length();i++) {
          TaggedWord w=(TaggedWord)s.getWord(i);
          String st=w.value();
          //System.out.println("value is "+st+" tag is "+w.tag());
          bw.print(st+'_'+w.tag()+' ');
        }
        bw.println();
      }


      bw.close();


    } catch(Exception e){
      e.printStackTrace();
    }
  }
  */
}
Source Code of edu.stanford.nlp.tagger.maxent.ReadDataTagged

Related Classes of edu.stanford.nlp.tagger.maxent.ReadDataTagged