Package org.languagetool.commandline

Source Code of org.languagetool.commandline.Main

/* LanguageTool, a natural language style checker
* Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
* USA
*/
package org.languagetool.commandline;

import org.apache.tika.language.LanguageIdentifier;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.MultiThreadedJLanguageTool;
import org.languagetool.bitext.TabBitextReader;
import org.languagetool.language.English;
import org.languagetool.rules.Rule;
import org.languagetool.rules.bitext.BitextRule;
import org.languagetool.tools.JnaTools;
import org.languagetool.tools.StringTools;
import org.languagetool.tools.Tools;
import org.xml.sax.SAXException;

import javax.xml.parsers.ParserConfigurationException;
import java.io.*;
import java.nio.charset.Charset;
import java.util.*;

/**
* The command line tool to check plain text files.
*
* @author Daniel Naber
*/
class Main {

  /*
   * Size threshold used in two places: files whose length is below it are
   * checked in one go, and in line-by-line mode the text buffer is flushed
   * once its length reaches it.
   */
  private static final int MAX_FILE_SIZE = 64000;

  // when set, the JLanguageTool instance is told to write its output to System.err
  private final boolean verbose;
  // passed through to the check routines; also suppresses the "Working on ..." messages
  private final boolean apiFormat;
  // tag the text instead of checking it
  private final boolean taggerOnly;
  // print the corrected text instead of reporting matches
  private final boolean applySuggestions;
  // detect the language from the input instead of relying on the configured one
  private final boolean autoDetect;
  // applied to the sentence tokenizer of a language detected in line-by-line mode
  private final boolean singleLineBreakMarksParagraph;
  // rule ids handed to Tools.selectRules() whenever a JLanguageTool instance is (re)created
  private final String[] enabledRules;
  private final String[] disabledRules;
  private final Language motherTongue;
 
  // the checker in use; replaced by setBitextMode() and changeLanguage()
  private JLanguageTool lt;
  // set by setProfilingMode(): rules are then timed one at a time
  private boolean profileRules;
  // set by setBitextMode(): input is read via a TabBitextReader
  private boolean bitextMode;
  // checker for the source language, only set in bitext mode
  private JLanguageTool srcLt;
  // bitext rules remaining after applying the disabled/enabled rule ids, only set in bitext mode
  private List<BitextRule> bRules;
  // rule taken from the active rule list for the current run in runOnFileLineByLine()
  private Rule currentRule;

  Main(final boolean verbose, final boolean taggerOnly,
      final Language language, final Language motherTongue,
      final String[] disabledRules, final String[] enabledRules,
      final boolean enabledOnly,
      final boolean apiFormat, boolean applySuggestions,
      boolean autoDetect, boolean singleLineBreakMarksParagraph, File languageModelIndexDir) throws IOException,
      SAXException, ParserConfigurationException {
    this.verbose = verbose;
    this.apiFormat = apiFormat;
    this.taggerOnly = taggerOnly;
    this.applySuggestions = applySuggestions;
    this.autoDetect = autoDetect;
    this.enabledRules = enabledRules;
    this.disabledRules = disabledRules;
    this.motherTongue = motherTongue;
    this.singleLineBreakMarksParagraph = singleLineBreakMarksParagraph;
    profileRules = false;
    bitextMode = false;
    srcLt = null;
    bRules = null;
    lt = new MultiThreadedJLanguageTool(language, motherTongue);
    lt.activateDefaultPatternRules();
    lt.activateDefaultFalseFriendRules();
    if (languageModelIndexDir != null) {
      lt.activateLanguageModelRules(languageModelIndexDir);
    }
    Tools.selectRules(lt, disabledRules, enabledRules, enabledOnly);
  }

  boolean isSpellCheckingActive() {
    List<Rule> rules = lt.getAllActiveRules();
    for (Rule rule : rules) {
      if (rule.isDictionaryBasedSpellingRule()) {
        return true;
      }
    }
    return false;
  }
 
  JLanguageTool getJLanguageTool() {
    return lt;
  }
 
  private void setListUnknownWords(final boolean listUnknownWords) {
    lt.setListUnknownWords(listUnknownWords);
  }
 
  /**
   * Asks {@link JLanguageTool} to remove its temporary files; called by
   * {@code main} once checking has finished.
   */
  private void cleanUp() {
    JLanguageTool.removeTemporaryFiles();
  }
 
  private void setProfilingMode() {
    profileRules = true;
  }

  private void setBitextMode(final Language sourceLang,
      final String[] disabledRules, final String[] enabledRules) throws IOException, ParserConfigurationException, SAXException {
    bitextMode = true;
    final Language target = lt.getLanguage();
    lt = new MultiThreadedJLanguageTool(target, null);
    srcLt = new MultiThreadedJLanguageTool(sourceLang);
    lt.activateDefaultPatternRules();
    Tools.selectRules(lt, disabledRules, enabledRules);
    Tools.selectRules(srcLt, disabledRules, enabledRules);
    bRules = Tools.getBitextRules(sourceLang, lt.getLanguage());

    List<BitextRule> bRuleList = new ArrayList<>(bRules);
    for (final BitextRule bitextRule : bRules) {
      for (final String disabledRule : disabledRules) {
        if (bitextRule.getId().equals(disabledRule)) {
          bRuleList.remove(bitextRule);
        }
      }
    }
    bRules = bRuleList;
    if (enabledRules.length > 0) {
      bRuleList = new ArrayList<>();
      for (final String enabledRule : enabledRules) {
        for (final BitextRule bitextRule : bRules) {
          if (bitextRule.getId().equals(enabledRule)) {
            bRuleList.add(bitextRule);
          }
        }
      }
      bRules = bRuleList;
    }
  }

  private void runOnFile(final String filename, final String encoding,
      final boolean listUnknownWords, final boolean xmlFiltering) throws IOException {
    boolean oneTime = false;
    if (!isStdIn(filename)) {
      if (autoDetect) {
        Language language = detectLanguageOfFile(filename, encoding);
        if (language == null) {
          System.err.println("Could not detect language well enough, using English");
          language = new English();
        }
        changeLanguage(language, motherTongue, disabledRules, enabledRules);
        System.out.println("Using " + language.getName() + " for file " + filename);
      }
      final File file = new File(filename);
      // run once on file if the file size < MAX_FILE_SIZE or
      // when we use the bitext mode (we use a bitext reader
      // instead of a direct file access)
      oneTime = file.length() < MAX_FILE_SIZE || bitextMode;
    }
    if (oneTime) {
      runOnFileInOneGo(filename, encoding, listUnknownWords, xmlFiltering);
    } else {
      runOnFileLineByLine(filename, encoding, listUnknownWords);
    }
  }

  private void runOnFileInOneGo(String filename, String encoding, boolean listUnknownWords, boolean xmlFiltering) throws IOException {
    if (bitextMode) {
      final TabBitextReader reader = new TabBitextReader(filename, encoding);
      if (applySuggestions) {
        CommandLineTools.correctBitext(reader, srcLt, lt, bRules);
      } else {
        CommandLineTools.checkBitext(reader, srcLt, lt, bRules, apiFormat);
      }
    } else {
      final String text = getFilteredText(filename, encoding, xmlFiltering);
      if (applySuggestions) {
        System.out.print(Tools.correctText(text, lt));
      } else if (profileRules) {
        CommandLineTools.profileRulesOnText(text, lt);
      } else if (!taggerOnly) {
        CommandLineTools.checkText(text, lt, apiFormat, 0);
      } else {
        CommandLineTools.tagText(text, lt);
      }
      if (listUnknownWords) {
        System.out.println("Unknown words: " + lt.getUnknownWords());
      }
    }
  }

  private void runOnFileLineByLine(String filename, String encoding, boolean listUnknownWords) throws IOException {
    if (verbose) {
      lt.setOutput(System.err);
    }
    if (!apiFormat && !applySuggestions) {
      if (isStdIn(filename)) {
        System.out.println("Working on STDIN...");
      } else {
        System.out.println("Working on " + filename + "...");
      }
    }
    if (profileRules && isStdIn(filename)) {
      throw new IllegalArgumentException("Profiling mode cannot be used with input from STDIN");
    }
    int runCount = 1;
    final List<Rule> rules = lt.getAllActiveRules();
    if (profileRules) {
      System.out.printf("Testing %d rules\n", rules.size());
      System.out.println("Rule ID\tTime\tSentences\tMatches\tSentences per sec.");
      runCount = rules.size();
    }
    InputStreamReader isr = null;
    BufferedReader br = null;
    int lineOffset = 0;
    int tmpLineOffset = 0;
    final List<String> unknownWords = new ArrayList<>();
    StringBuilder sb = new StringBuilder();
    for (int ruleIndex = 0; !rules.isEmpty() && ruleIndex < runCount; ruleIndex++) {
      currentRule = rules.get(ruleIndex);
      int matches = 0;
      long sentences = 0;
      final long startTime = System.currentTimeMillis();
      try {
        isr = getInputStreamReader(filename, encoding);
        br = new BufferedReader(isr);
        String line;
        int lineCount = 0;
        while ((line = br.readLine()) != null) {
          sb.append(line);
          lineCount++;
          // to detect language from the first input line
          if (lineCount == 1 && autoDetect) {
            Language language = detectLanguageOfString(line);
            if (language == null) {
              System.err.println("Could not detect language well enough, using English");
              language = new English();
            }
            System.out.println("Language used is: " + language.getName());
            language.getSentenceTokenizer().setSingleLineBreaksMarksParagraph(
                    singleLineBreakMarksParagraph);
            changeLanguage(language, motherTongue, disabledRules, enabledRules);
          }
          sb.append('\n');
          tmpLineOffset++;
          if (lt.getLanguage().getSentenceTokenizer().singleLineBreaksMarksPara()) {
            matches = handleLine(matches, lineOffset, sb);
            sentences += lt.getSentenceCount();
            if (profileRules) {
              sentences += lt.sentenceTokenize(sb.toString()).size();
            }
            rememberUnknownWords(listUnknownWords, unknownWords);
            sb = new StringBuilder();
            lineOffset = tmpLineOffset;
          } else {
            if ("".equals(line) || sb.length() >= MAX_FILE_SIZE) {
              matches = handleLine(matches, lineOffset, sb);
              sentences += lt.getSentenceCount();
              if (profileRules) {
                sentences += lt.sentenceTokenize(sb.toString()).size();
              }
              rememberUnknownWords(listUnknownWords, unknownWords);
              sb = new StringBuilder();
              lineOffset = tmpLineOffset;
            }
          }
        }
      } finally {
        if (sb.length() > 0) {
          matches = handleLine(matches, tmpLineOffset - 1, sb);
          sentences += lt.getSentenceCount();
          if (profileRules) {
            sentences += lt.sentenceTokenize(sb.toString()).size();
          }
          if (apiFormat && !taggerOnly && !applySuggestions) {
              System.out.println("</matches>");
          }
          rememberUnknownWords(listUnknownWords, unknownWords);
        }
        printTimingInformation(listUnknownWords, rules, unknownWords, ruleIndex, matches, sentences, startTime);
        if (br != null) {
          br.close();
        }
        if (isr != null) {
          isr.close();
        }
      }
    }
  }

  private void rememberUnknownWords(boolean listUnknownWords, List<String> unknownWords) {
    if (listUnknownWords && !taggerOnly) {
      for (String word : lt.getUnknownWords()) {
        if (!unknownWords.contains(word)) {
          unknownWords.add(word);
        }
      }
    }
  }

  private InputStreamReader getInputStreamReader(String filename, String encoding)
          throws UnsupportedEncodingException, FileNotFoundException {
    final InputStreamReader isr;
    if (!isStdIn(filename)) {
      final File file = new File(filename);
      if (encoding != null) {
        isr = new InputStreamReader(new BufferedInputStream(
            new FileInputStream(file)), encoding);
      } else {
        isr = new InputStreamReader(new BufferedInputStream(
            new FileInputStream(file)));
      }
    } else {
      if (encoding != null) {
        isr = new InputStreamReader(new BufferedInputStream(System.in), encoding);
      } else {
        isr = new InputStreamReader(new BufferedInputStream(System.in));
      }
    }
    return isr;
  }

  private boolean isStdIn(String filename) {
    return "-".equals(filename);
  }

  private void printTimingInformation(final boolean listUnknownWords, final List<Rule> rules,
      final List<String> unknownWords, final int ruleIndex, final int matches, final long sentences, final long startTime) {
    if (!applySuggestions) {
      final long endTime = System.currentTimeMillis();
      final long time = endTime - startTime;
      final float timeInSeconds = time / 1000.0f;
      final float sentencesPerSecond = sentences / timeInSeconds;
      if (apiFormat) {
        System.out.println("<!--");
      }
      if (profileRules) {
        //TODO: run 10 times, line in runOnce mode, and use median
        System.out.printf(Locale.ENGLISH,
            "%s\t%d\t%d\t%d\t%.1f", rules.get(ruleIndex).getId(),
            time, sentences, matches, sentencesPerSecond);
        System.out.println();
      } else {
        System.out.printf(Locale.ENGLISH,
            "Time: %dms for %d sentences (%.1f sentences/sec)", time,
            sentences, sentencesPerSecond);
        System.out.println();
      }
      if (listUnknownWords) {
        Collections.sort(unknownWords);
        System.out.println("Unknown words: " + unknownWords);
      }
      if (apiFormat) {
        System.out.println("-->");
      }
    }
  }

  private int handleLine(final int matchNo, final int lineOffset,
      final StringBuilder sb) throws IOException {
    int matches = matchNo;
    if (applySuggestions) {
      System.out.print(Tools.correctText(StringTools.filterXML(sb.toString()),
          lt));
    } else if (profileRules) {
      matches += Tools.profileRulesOnLine(StringTools.filterXML(sb.toString()),
          lt, currentRule);
    } else if (!taggerOnly) {
      if (matches == 0) {
        matches += CommandLineTools.checkText(StringTools.filterXML(sb.toString()), lt,
            apiFormat, -1, lineOffset, matches,
            StringTools.XmlPrintMode.START_XML);
      } else {
        matches += CommandLineTools.checkText(StringTools.filterXML(sb.toString()), lt,
            apiFormat, -1, lineOffset, matches,
            StringTools.XmlPrintMode.CONTINUE_XML);
      }
    } else {
      CommandLineTools.tagText(StringTools.filterXML(sb.toString()), lt);
    }
    return matches;
  }

  private void runRecursive(final String filename, final String encoding,
      final boolean listUnknown, final boolean xmlFiltering) {
    final File dir = new File(filename);
    final File[] files = dir.listFiles();
    if (files == null) {
      throw new IllegalArgumentException(dir.getAbsolutePath() + " is not a directory, cannot use recursion");
    }
    for (final File file : files) {
      try {
        if (file.isDirectory()) {
          runRecursive(file.getAbsolutePath(), encoding, listUnknown, xmlFiltering);
        } else {
          runOnFile(file.getAbsolutePath(), encoding, listUnknown, xmlFiltering);
        }
      } catch (Exception e) {
        throw new RuntimeException("Could not check text in file " + file, e);
      }
    }   
  }

  /**
   * Loads filename and filters out XML. Note that the XML
   * filtering can lead to incorrect positions in the list of matching rules.
   */
  private String getFilteredText(final String filename, final String encoding, boolean xmlFiltering) throws IOException {
    if (verbose) {
      lt.setOutput(System.err);
    }
    if (!apiFormat && !applySuggestions) {
      System.out.println("Working on " + filename + "...");
    }
    // don't use StringTools.readStream() as that might add newlines which aren't there:
    final String fileContents = StringTools.streamToString(new FileInputStream(filename), encoding != null ? encoding : Charset.defaultCharset().name());
    if (xmlFiltering) {
      return StringTools.filterXML(fileContents);
    } else {
      return fileContents;
    }
  }

  private void changeLanguage(Language language, Language motherTongue,
                              String[] disabledRules, String[] enabledRules) {
    try {
      lt = new MultiThreadedJLanguageTool(language, motherTongue);
      lt.activateDefaultPatternRules();
      lt.activateDefaultFalseFriendRules();
      Tools.selectRules(lt, disabledRules, enabledRules);
      if (verbose) {
        lt.setOutput(System.err);
      }
    } catch (Exception e) {
      throw new RuntimeException("Could not create LanguageTool instance for language " + language, e);
    }
  }

  /**
   * Command line tool to check plain text files.
   */
  public static void main(final String[] args) throws IOException, ParserConfigurationException, SAXException {
    JnaTools.setBugWorkaroundProperty();
    final CommandLineParser commandLineParser = new CommandLineParser();
    CommandLineOptions options = null;
    try {
       options = commandLineParser.parseOptions(args);
    } catch (WrongParameterNumberException e) {
      commandLineParser.printUsage();
      System.exit(1);
    } catch (IllegalArgumentException e) {
      System.err.println(e.toString());
      System.exit(1);
    } catch (UnknownParameterException e) {
      if (e.getMessage() != null) {
        System.err.println(e.getMessage());
      } else {
        System.err.println(e.toString());
      }
      commandLineParser.printUsage(System.err);
      System.exit(1);
    }
    if (options.isPrintUsage()) {
      commandLineParser.printUsage();
      System.exit(1);
    }
    if (options.isPrintVersion()) {
      System.out.println("LanguageTool version " + JLanguageTool.VERSION + " (" + JLanguageTool.BUILD_DATE + ")");
      System.exit(0);
    }
    if (options.isPrintLanguages()) {
      printLanguages();
      System.exit(0);
    }

    if (options.getFilename() == null) {
      options.setFilename("-");
    }

    String languageHint = null;
    if (options.getLanguage() == null) {
      if (!options.isApiFormat() && !options.isAutoDetect()) {
        System.err.println("No language specified, using English (no spell checking active, " +
                "specify a language variant like 'en-GB' if available)");
      }
      options.setLanguage(new English());
    } else if (!options.isApiFormat() && !options.isApplySuggestions()) {
      languageHint = "Expected text language: " + options.getLanguage().getName();
    }

    options.getLanguage().getSentenceTokenizer().setSingleLineBreaksMarksParagraph(
            options.isSingleLineBreakMarksParagraph());
    final Main prg = new Main(options.isVerbose(), options.isTaggerOnly(), options.getLanguage(), options.getMotherTongue(),
            options.getDisabledRules(), options.getEnabledRules(),  options.getUseEnabledOnly(), options.isApiFormat(), options.isApplySuggestions(),
            options.isAutoDetect(), options.isSingleLineBreakMarksParagraph(), options.getLanguageModel());
    if (prg.lt.getAllActiveRules().size() == 0) {
      throw new RuntimeException("WARNING: No rules are active. Please make sure your rule ids are correct: " +
              Arrays.toString(options.getEnabledRules()));
    }
    if (languageHint != null) {
      String spellHint = prg.isSpellCheckingActive() ?
              "" : " (no spell checking active, specify a language variant like 'en-GB' if available)";
      System.out.println(languageHint + spellHint);
    }
    prg.setListUnknownWords(options.isListUnknown());
    if (options.isProfile()) {
      prg.setProfilingMode();
    }
    if (options.isBitext()) {
      if (options.getMotherTongue() == null) {
        throw new IllegalArgumentException("You have to set the source language (as mother tongue) in bitext mode");
      }
      prg.setBitextMode(options.getMotherTongue(), options.getDisabledRules(), options.getEnabledRules());
    }
    if (options.isRecursive()) {
      prg.runRecursive(options.getFilename(), options.getEncoding(), options.isListUnknown(), options.isXmlFiltering());
    } else {
      prg.runOnFile(options.getFilename(), options.getEncoding(), options.isListUnknown(), options.isXmlFiltering());
    }
    prg.cleanUp();
  }

  private static void printLanguages() {
    final List<String> languages = new ArrayList<>();
    for (Language language : Language.REAL_LANGUAGES) {
      languages.add(language.getShortNameWithCountryAndVariant() + " " + language.getName());
    }
    Collections.sort(languages);
    for (String s : languages) {
      System.out.println(s);
    }
  }

  // for language auto detect
  private static Language detectLanguageOfFile(final String filename, final String encoding) throws IOException {
    final String text = StringTools.readStream(new FileInputStream(filename), encoding);
    return detectLanguageOfString(text);
  }

  private static Language detectLanguageOfString(final String text) {
    final LanguageIdentifier identifier = new LanguageIdentifier(text);
    final Language lang = Language.getLanguageForShortName(identifier.getLanguage());
    return lang;
  }

}
TOP

Related Classes of org.languagetool.commandline.Main

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.