Source Code of org.languagetool.dev.HomophoneOccurrenceDumper

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.dev;


import org.apache.lucene.index.Fields;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.languagetool.JLanguageTool;
import org.languagetool.languagemodel.LuceneLanguageModel;
import org.languagetool.rules.ConfusionProbabilityRule;
import org.languagetool.rules.ConfusionSetLoader;


import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.Set;


/**
 * Dump the occurrences of homophone 3grams to STDOUT. Useful to have a more
 * compare file with homophone occurrences, as searching the homophones and
 * their contexts in the Lucene index requires iterating all terms and is
 * thus slow.
 * @since 2.8
 */
class HomophoneOccurrenceDumper extends LuceneLanguageModel {


  private static final int MIN_COUNT = 1000;


  HomophoneOccurrenceDumper(File topIndexDir) throws IOException {
    super(topIndexDir);
  }


  /**
   * Get the context (left and right words) for the given word(s). This is slow,
   * as it needs to scan the whole index.
   */
  Map<String,Long> getContext(String... tokens) throws IOException {
    Objects.requireNonNull(tokens);
    TermsEnum iterator = getIterator();
    Map<String,Long> result = new HashMap<>();
    BytesRef byteRef;
    int i = 0;
    while ((byteRef = iterator.next()) != null) {
      String term = new String(byteRef.bytes, byteRef.offset, byteRef.length);
      for (String token : tokens) {
        if (term.contains(" " + token + " ")) {
          String[] split = term.split(" ");
          if (split.length == 3) {
            long count = getCount(split[0], split[1], split[2]);
            result.put(term, count);
          }
        }
      }
      /*if (i++ > 1_000_000) { // comment in for faster testing with subsets of the data
        break;
      }*/
    }
    return result;
  }


  private void run(String homophonePath) throws IOException {
    System.err.println("Loading homophones from " + homophonePath + ", minimum occurrence: " + MIN_COUNT);
    ConfusionSetLoader confusionSetLoader = new ConfusionSetLoader();
    InputStream inputStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(homophonePath);
    Map<String,ConfusionProbabilityRule.ConfusionSet> map = confusionSetLoader.loadConfusionSet(inputStream);
    Set<String> confusionTerms = map.keySet();
    dumpOccurrences(confusionTerms);
  }


  private void dumpOccurrences(Set<String> tokens) throws IOException {
    Objects.requireNonNull(tokens);
    TermsEnum iterator = getIterator();
    BytesRef byteRef;
    int i = 0;
    while ((byteRef = iterator.next()) != null) {
      String term = new String(byteRef.bytes, byteRef.offset, byteRef.length);
      String[] split = term.split(" ");
      if (split.length == 3) {
        String token = split[1];
        if (tokens.contains(token)) {
          long count = getCount(split[0], split[1], split[2]);
          if (count >= MIN_COUNT) {
            System.out.println(token + "\t" + count + "\t" + split[0] + " " + split[1] + " " + split[2]);
          }
        }
      }
      if (i % 10_000 == 0) {
        System.err.println(i + "...");
      }
      i++;
    }
  }


  private TermsEnum getIterator() throws IOException {
    LuceneSearcher luceneSearcher = getLuceneSearcher(3);
    Fields fields = MultiFields.getFields(luceneSearcher.getReader());
    Terms terms = fields.terms("ngram");
    return terms.iterator(null);
  }


  public static void main(String[] args) throws IOException {
    if (args.length != 1) {
      System.out.println("Usage: " + HomophoneOccurrenceDumper.class.getSimpleName() + " <indexDir>");
      System.exit(1);
    }
    HomophoneOccurrenceDumper dumper = new HomophoneOccurrenceDumper(new File(args[0]));
    dumper.run("/en/homophones.txt");
  }
}
Source Code of org.languagetool.dev.HomophoneOccurrenceDumper

Related Classes of org.languagetool.dev.HomophoneOccurrenceDumper