Source Code of org.languagetool.dev.ConfusionSetCoverage

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.dev;


import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.language.English;
import org.languagetool.rules.ConfusionProbabilityRule;
import org.languagetool.rules.ConfusionSetLoader;
import org.languagetool.tools.StringTools;


import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Map;


/**
 * Measure how many homophones there are per sentence on average. Useful so we can estimate
 * the number of ngram lookups needed by the confusion rule.
 * @since 2.7
 */
class ConfusionSetCoverage {


  private int sentences = 0;
  private int homophones = 0;
  private int lookupsNeeded = 0;


  private void run(String filename) throws IOException {
    Map<String, ConfusionProbabilityRule.ConfusionSet> confusionSet = getConfusionSet();
    try (FileReader reader = new FileReader(filename)) {
      String text = StringTools.readerToString(reader);
      JLanguageTool languageTool = new JLanguageTool(new English());
      List<AnalyzedSentence> analyzedSentences = languageTool.analyzeText(text);
      for (AnalyzedSentence sentence : analyzedSentences) {
        runOnSentence(sentence, confusionSet);
      }
    }
    System.out.println("Homophones set: " + confusionSet.size() + " items");
    System.out.println("Sentences: " + sentences);
    System.out.println("Homophones: " + homophones + " = " + ((float)homophones/sentences) + " per sentence");
    System.out.println("Lookups: " + lookupsNeeded + " = " + ((float)lookupsNeeded/sentences) + " per sentence");
    System.out.println(" (Lookups is the number of lookups needed to see which word in the homophones set " +
            "is more common. Actually even more ngram lookups will be needed, depending on what ngrams we have.)");
  }


  private Map<String, ConfusionProbabilityRule.ConfusionSet> getConfusionSet() throws IOException {
    ConfusionSetLoader loader = new ConfusionSetLoader();
    InputStream homophoneStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream("/en/homophones.txt");
    return loader.loadConfusionSet(homophoneStream);
  }


  private void runOnSentence(AnalyzedSentence sentence, Map<String, ConfusionProbabilityRule.ConfusionSet> confusionSet) {
    sentences++;
    for (AnalyzedTokenReadings token : sentence.getTokensWithoutWhitespace()) {
      String tokenStr = token.getToken();
      if (confusionSet.containsKey(tokenStr)) {
        homophones++;
        lookupsNeeded += confusionSet.get(tokenStr).getSet().size();
      }
    }
  }


  public static void main(String[] args) throws IOException {
    if (args.length != 1) {
      System.out.println("Usage: " + ConfusionSetCoverage.class.getSimpleName() + " <textfile>");
      System.exit(1);
    }
    ConfusionSetCoverage coverage = new ConfusionSetCoverage();
    coverage.run(args[0]);
  }


}
Source Code of org.languagetool.dev.ConfusionSetCoverage

Related Classes of org.languagetool.dev.ConfusionSetCoverage