Package opennlp.tools.ngram

Examples of opennlp.tools.ngram.MutableDictionary


        iterations = Integer.parseInt(args[ai++]);
      }
      GISModel mod;
      if (dict != null) {
        System.err.println("Building dictionary");
        MutableDictionary mdict = new MutableDictionary(cutoff);
        DataStream data = new opennlp.maxent.PlainTextByLineDataStream(new java.io.FileReader(inFile));
        while(data.hasNext()) {
          String tagStr = (String) data.nextToken();
          String[] tt = tagStr.split(" ");
          String[] words = new String[tt.length];
          for (int wi=0;wi<words.length;wi++) {
            words[wi] = tt[wi].substring(0,tt[wi].lastIndexOf('_'));
          }
          mdict.add(words,1,true);
        }
        System.out.println("Saving the dictionary");
        mdict.persist(new File(dict));
      }
      EventStream es;
      if (encoding == null) {
        if (dict == null) {
          es = new POSEventStream(new PlainTextByLineDataStream(new InputStreamReader(new FileInputStream(inFile))));
View Full Code Here


   * @param rules The head rules for the parses.
   * @param cutoff The minimum number of entries required for the n-gram to be saved as part of the dictionary.
   * @return A dictionary object.
   */
  private static MutableDictionary buildDictionary(DataStream data, HeadRules rules, int cutoff) {
    MutableDictionary mdict = new MutableDictionary(cutoff);
    while(data.hasNext()) {
      String parseStr = (String) data.nextToken();
      Parse p = Parse.parseParse(parseStr);
      p.updateHeads(rules);
      Parse[] pwords = p.getTagNodes();
      String[] words = new String[pwords.length];
      //add all uni-grams
      for (int wi=0;wi<words.length;wi++) {
        words[wi] = pwords[wi].toString();
      }
      mdict.add(words,1,true);
      //add tri-grams and bi-grams for initial sequence
      Parse[] chunks = collapsePunctuation(ParserEventStream.getInitialChunks(p),rules.getPunctuationTags());
      String[] cwords = new String[chunks.length];
      for (int wi=0;wi<cwords.length;wi++) {
        cwords[wi] = chunks[wi].getHead().toString();
      }
      mdict.add(cwords,3,false);
      //emulate reductions to produce additional n-grams
      int ci = 0;
      while (ci < chunks.length) {
        //System.err.println("chunks["+ci+"]="+chunks[ci].getHead().toString()+" chunks.length="+chunks.length);
        if (lastChild(chunks[ci], chunks[ci].getParent(),rules.getPunctuationTags())) {
          //perform reduce
          int reduceStart = ci;
          while (reduceStart >=0 && chunks[reduceStart].getParent() == chunks[ci].getParent()) {
            reduceStart--;
          }
          reduceStart++;
          chunks = ParserEventStream.reduceChunks(chunks,ci,chunks[ci].getParent());
          ci = reduceStart;
          if (chunks.length != 0) {
            String[] window = new String[5];
            int wi = 0;
            if (ci-2 >= 0) window[wi++] = chunks[ci-2].getHead().toString();
            if (ci-1 >= 0) window[wi++] = chunks[ci-1].getHead().toString();
            window[wi++] = chunks[ci].getHead().toString();
            if (ci+1 < chunks.length) window[wi++] = chunks[ci+1].getHead().toString();
            if (ci+2 < chunks.length) window[wi++] = chunks[ci+2].getHead().toString();
            if (wi < 5) {
              String[] subWindow = new String[wi];
              for (int swi=0;swi<wi;swi++) {
                subWindow[swi]=window[swi];
              }
              window = subWindow;
            }
            if (window.length >=3) {
              mdict.add(window,3,false);
            }
            else if (window.length == 2) {
              mdict.add(window,2,false);
            }
          }
          ci=reduceStart-1; //ci will be incremented at end of loop
        }
        ci++;
View Full Code Here

      Parse.useFunctionTags(true);
    }
    if (dict || all) {
      System.err.println("Building dictionary");
      DataStream data = new opennlp.maxent.PlainTextByLineDataStream(new java.io.FileReader(inFile));
      MutableDictionary mdict = buildDictionary(data, rules, cutoff);
      System.out.println("Saving the dictionary");
      mdict.persist(dictFile);
    }
    if (tag || all) {
      System.err.println("Training tagger");
      System.err.println("Loading Dictionary");
      Dictionary tridict = new Dictionary(dictFile.toString());
View Full Code Here

TOP

Related Classes of opennlp.tools.ngram.MutableDictionary

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.