Package: opennlp.ccg.lexicon

Usage examples of opennlp.ccg.lexicon.Tokenizer


        // Snippet: load a grammar, wire up a parser with a reflectively-created
        // sign scorer and a supertagger, then parse each input line and collect
        // the n-best parses as regression-test items.
        // NOTE(review): this fragment is truncated; braces opened here are
        // closed in code outside the visible excerpt.

        // load grammar
        URL grammarURL = new File(grammarfile).toURI().toURL();
        System.out.println("Loading grammar from URL: " + grammarURL);
        Grammar grammar = new Grammar(grammarURL);
        // the grammar's lexicon supplies the tokenizer used to format output words
        Tokenizer tokenizer = grammar.lexicon.tokenizer;
        System.out.println();
       
        // set up parser
        Parser parser = new Parser(grammar);
        // instantiate scorer from a configured class name
        try {
            System.out.println("Instantiating parsing sign scorer from class: " + parseScorerClass);
            // NOTE(review): Class.newInstance() is deprecated since Java 9;
            // prefer Class.forName(...).getDeclaredConstructor().newInstance()
            SignScorer parseScorer = (SignScorer) Class.forName(parseScorerClass).newInstance();
            parser.setSignScorer(parseScorer);
            System.out.println();
        } catch (Exception exc) {
            // NOTE(review): new RuntimeException(exc) is the idiomatic
            // equivalent of this cast-plus-initCause construction
            throw (RuntimeException) new RuntimeException().initCause(exc);
        }
        // instantiate supertagger: either reflectively from a class name, or
        // from a WordAndPOSDictionaryLabellingStrategy config file
        try {
          Supertagger supertagger;
          if (supertaggerClass != null) {
                System.out.println("Instantiating supertagger from class: " + supertaggerClass);
                supertagger = (Supertagger) Class.forName(supertaggerClass).newInstance();
          }
          else {
            System.out.println("Instantiating supertagger from config file: " + stconfig);
            supertagger = WordAndPOSDictionaryLabellingStrategy.supertaggerFactory(stconfig);
          }
            parser.setSupertagger(supertagger);
            System.out.println();
        } catch (Exception exc) {
            throw (RuntimeException) new RuntimeException().initCause(exc);
        }
       
        // loop through input, one sentence per line
        // NOTE(review): FileReader uses the platform default charset, and the
        // reader is not closed within this excerpt -- confirm in full source
        BufferedReader in = new BufferedReader(new FileReader(inputfile));
        String line;
        // reused scratch map for predicate info, cleared per parse
        Map<String,String> predInfoMap = new HashMap<String,String>();
        System.out.println("Parsing " + inputfile);
        System.out.println();
        // sequential sentence counter used to build ids "s1", "s2", ...
        // NOTE(review): count is never incremented in the visible excerpt --
        // presumably done past the cut, otherwise every id would be "s1"
        int count = 1;
        while ((line = in.readLine()) != null) {
          String id = "s" + count;
          try {
            // parse it
            System.out.println(line);
      parser.parse(line);
      // cap the number of results at the configured n-best list size
      int numParses = Math.min(nbestListSize, parser.getResult().size());
      for (int i=0; i < numParses; i++) {
          Sign thisParse = parser.getResult().get(i);
          // convert lf
          Category cat = thisParse.getCategory();
          LF convertedLF = null;
          String predInfo = null;
          if (cat.getLF() != null) {
        // convert the flat LF: compact it and convert nominals, using the
        // index nominal of a copy of the category
        LF flatLF = cat.getLF();
        cat = cat.copy();
        Nominal index = cat.getIndexNominal();
        convertedLF = HyloHelper.compactAndConvertNominals(flatLF, index, thisParse);
        // get pred info from the original (flat) LF
        predInfoMap.clear();
        Testbed.extractPredInfo(flatLF, predInfoMap);
        predInfo = Testbed.getPredInfo(predInfoMap);
          }
          // add test item, sign; the id gets a "-<rank>" suffix when nbest > 1
          Element item = RegressionInfo.makeTestItem(grammar, line, 1, convertedLF);
          String actualID = (nbestListSize == 1) ? id : id + "-" + (i+1);
          item.setAttribute("info", actualID);
          outRoot.addContent(item);
          signMap.put(actualID, thisParse);
          // Add parsed words as a separate LF element
          Element fullWordsElt = new Element("full-words");
          fullWordsElt.addContent(tokenizer.format(thisParse.getWords()));
          item.addContent(fullWordsElt);
          // attach pred info as an attribute of its own element, when present
          if (predInfo != null) {
        Element predInfoElt = new Element("pred-info");
        predInfoElt.setAttribute("data", predInfo);
        item.addContent(predInfoElt);
View Full Code Here


        // Snippet: build a KenLM-backed n-gram model of the given order and
        // score a token sequence, printing the load time and debug output.
        // NOTE(review): fragment is truncated; lm/words/logprob may be used
        // past the visible cut.
        KenNgramModel lm = new KenNgramModel(Integer.parseInt(order), lmfile, false, lowercase, splitNEs, '_', false);
  // enable per-word debug output while scoring
  lm.debugScore = true;
        // elapsed whole seconds since 'start' (presumably captured before
        // model loading -- not visible in this fragment)
        int secs = (int) (System.currentTimeMillis() - start) / 1000;
        System.out.println("secs: " + secs);
        System.out.println();
        Tokenizer tokenizer = new DefaultTokenizer();
        List<Word> words = tokenizer.tokenize(tokens);
        System.out.println("scoring: " + tokens);
        System.out.println();
        // second arg true: assumed to mean "score as a complete sentence" --
        // TODO confirm against NgramScorer.setWordsToScore
        lm.setWordsToScore(words, true);
        lm.prepareToScoreWords();
        double logprob = lm.logprob();
View Full Code Here

        System.out.println();
        // System.out.println("trie map: ");
        // System.out.println(lm.trieMapRoot.toString());
        // System.out.println();
       
        // Snippet: tokenize the input with the default tokenizer and score it
        // with debug output enabled; 'lm' and 'tokens' are defined outside
        // this truncated excerpt.
        Tokenizer tokenizer = new DefaultTokenizer();
        List<Word> words = tokenizer.tokenize(tokens);
        System.out.println("scoring: " + tokens);
        System.out.println();
        lm.debugScore = true;
        // second arg true: assumed "complete sentence" flag -- TODO confirm
        lm.setWordsToScore(words, true);
        lm.prepareToScoreWords();
View Full Code Here

        System.out.println();
        // System.out.println("trie map: ");
        // System.out.println(lm.trieMapRoot.toString());
        // System.out.println();
       
        // Snippet (duplicate of the previous example): tokenize the input and
        // score it with debug output enabled; 'lm' and 'tokens' come from
        // outside this truncated excerpt.
        Tokenizer tokenizer = new DefaultTokenizer();
        List<Word> words = tokenizer.tokenize(tokens);
        System.out.println("scoring: " + tokens);
        System.out.println();
        lm.debugScore = true;
        lm.setWordsToScore(words, true);
        lm.prepareToScoreWords();
View Full Code Here

            // compute stats, show outcome
            rCount++;
            totalScore += score;
            if (gramcomplete) totalScoreComplete += score;
            // itemRank 1 means the best realization exactly matches the test
            // item's orthography; otherwise reset to 0 and search the n-best
            // edges below for the matching rank
            int itemRank = 1;
            Tokenizer tokenizer = grammar.lexicon.tokenizer;
            String itemOrth = tokenizer.getOrthography(tokenizer.tokenize(testItem.sentence));
            if (!bestRealization.equals(itemOrth)) {
                itemRank = 0;
                List<Edge> bestEdges = chart.bestEdges();
                // NOTE(review): loop body is truncated in this excerpt
                for (int j = 0; j < bestEdges.size(); j++) {
                    Edge edge = bestEdges.get(j);
View Full Code Here

        // Snippet: dump testbed sentences to a text file, optionally with
        // factors, semantic-class replacement, even/odd filtering, or
        // reversed word order. Truncated: opening/closing context not shown.
        if (reverse) option += ", reversed";
        System.out.println("Writing text file" + option + ": " + filename);
        System.out.println();
        PrintWriter tOut = new PrintWriter(new BufferedWriter(new FileWriter(filename)));
        // collects distinct output lines (use not visible in this excerpt)
        HashSet<String> unique = new HashSet<String>();
        Tokenizer tokenizer = grammar.lexicon.tokenizer;
        // loop through files
        for (File f : getXMLFiles(tbFile)) {
          // load testbed
          System.out.println("Loading testbed from: " + f);
          RegressionInfo tbInfo = new RegressionInfo(grammar, f);
          int numItems = tbInfo.numberOfItems();
          // do each test item
          for (int i = 0; i < numItems; i++) {
              // check even/odd only
              if (i % 2 == 1 && evenOnly) continue;
              if (i % 2 == 0 && oddOnly) continue;
              RegressionInfo.TestItem testItem = tbInfo.getItem(i);
            // check grammatical: skip items with no parses
            if (testItem.numOfParses == 0) continue;
              String s = testItem.sentence;
              // get parsed words if doing more than just text
              List<Word> words = null;
              if (semClassReplacement || withFactors) {
                  // use words from sign or pre-parsed full words if available
                if (testItem.sign != null)
                  words = testItem.sign.getWords();
                else if (testItem.fullWords != null)
                      words = tokenizer.tokenize(testItem.fullWords, true);
                  // otherwise parse
                  else words = grammar.getParsedWords(s);
              }
              else words = tokenizer.tokenize(s);
              // reverse, if apropos: copy words back-to-front between fresh
              // <s>/</s> delimiters, dropping any existing delimiter words
              if (reverse) {
                  List<Word> tmp = words;
                  words = new ArrayList<Word>(words.size());
                  words.add(Word.createWord("<s>"));
                  for (int j = tmp.size()-1; j >= 0; j--) {
                      Word w = tmp.get(j);
                      // NOTE(review): == compares references, not string
                      // content; this only skips delimiters if form strings
                      // are interned -- likely should be equals(). Confirm
                      // against Word.getForm's interning behavior.
                      if (w.getForm() == "<s>" || w.getForm() == "</s>") continue; // skip <s> or </s>
                      words.add(w);
                  }
                  words.add(Word.createWord("</s>"));
              }
              // write str, add to unique set; plain orthography unless
              // factors were requested
              String str = (!withFactors)
                  ? tokenizer.getOrthography(words, semClassReplacement)
                  : tokenizer.format(words, semClassReplacement);
              tOut.println(str);
              unique.add(str);
              System.out.print("."); // indicate progress
          }
          System.out.println();
View Full Code Here

    /**
     * Writes one line of derivation factors per grammatical test item in the
     * given testbed file(s) to a text file, using the saved sign of each item.
     * NOTE(review): this fragment is truncated -- the enclosing loops/method
     * close outside the excerpt, and tOut is not closed within it.
     *
     * @param tbFile   testbed XML file, or directory of testbed XML files
     * @param filename path of the text file to write
     * @throws IOException if the output file cannot be opened or written
     */
    private void writeDerivationFactors(File tbFile, String filename) throws IOException {
        // open text file
        System.out.println("Writing derivation factors file: " + filename);
        System.out.println();
        PrintWriter tOut = new PrintWriter(new BufferedWriter(new FileWriter(filename)));
        Tokenizer tokenizer = grammar.lexicon.tokenizer;
        // loop through files
        for (File f : getXMLFiles(tbFile)) {
          // load testbed
          System.out.println("Loading testbed from: " + f);
          RegressionInfo tbInfo = new RegressionInfo(grammar, f);
          int numItems = tbInfo.numberOfItems();
          // do each test item, using the saved sign
          for (int i = 0; i < numItems; i++) {
              RegressionInfo.TestItem testItem = tbInfo.getItem(i);
            if (testItem.numOfParses == 0) continue; // check grammatical
            Sign sign = testItem.sign;
            // one space-separated formatted factor word per derivation factor
            List<Word> factors = GenerativeSyntacticModel.getFactors(sign);
            for (Word w : factors) {
              tOut.print(tokenizer.format(w));
              tOut.print(" ");
            }
            tOut.println();
              System.out.print("."); // indicate progress
          }
View Full Code Here

TOP

Related Classes of opennlp.ccg.lexicon.Tokenizer

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.