// Source Code of org.languagetool.dev.conversion.RuleCoverage

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2010 Daniel Naber (http://www.languagetool.org)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */


package org.languagetool.dev.conversion;


import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


import morfologik.stemming.Dictionary;
import morfologik.stemming.DictionaryIterator;
import morfologik.stemming.DictionaryLookup;
import morfologik.stemming.IStemmer;
import morfologik.stemming.WordData;


import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.language.English;
import org.languagetool.rules.RuleMatch;
import org.languagetool.rules.patterns.Element;
import org.languagetool.rules.patterns.PatternRule;
import org.languagetool.rules.patterns.PatternRuleLoader;


public class RuleCoverage {


    // LanguageTool instance that generated examples are checked against
    private JLanguageTool tool;    
    // sequential iterator over the dictionary entries; candidate example words are drawn from it
    private DictionaryIterator dictIterator;
    // lookup into the dictionary, used to fetch the POS tags and stems (lemmas) of a word
    private DictionaryLookup dictLookup;
    // language whose rules and dictionary are used
    private Language language;
    // path of the dictionary file
    private String filename;
    // dictionary file that dictLookup and dictIterator are created from
    private File dictFile;
    
    // fragments that are wrapped around a single rule string so it can be parsed as a complete rule file
    private String ruleFileHeader = RuleConverter.xmlHeader;
    private String categoriesString = "<category name=\"test\">";
    private String endCategoriesString = "</category>";
    private String endRulesString = "</rules>"; 
    
    // matches a bracketed character set that contains no hyphen, e.g. [.,;:?!];
    // the capturing group is inside the repetition, so group 1 holds the last character of the set
    private static Pattern regexSet = Pattern.compile("^\\[([^\\-])*?\\]$");


    // default constructor; defaults to English
    public RuleCoverage() throws IOException {
      language = new English();
      tool = new JLanguageTool(language);
        tool.activateDefaultPatternRules();
        tool.disableRule("UPPERCASE_SENTENCE_START");
        tool.disableRule("EN_UNPAIRED_BRACKETS");
        tool.disableRule("EN_A_VS_AN");
        setupDictionaryFiles();
    }
    
    // disable some of the default rules in the constructors
    //TODO: disable the right rules for each language
    // though this matters less when we return an array of all covering rules
    public RuleCoverage(Language language) throws IOException {
      this.language = language;
      tool = new JLanguageTool(language);
        tool.activateDefaultPatternRules();
        setupDictionaryFiles();
    }
    
    // for testing purposes, defaults to English
    public RuleCoverage(String dictFileName) throws IOException {
      language = new English();
      tool = new JLanguageTool(language);
        tool.activateDefaultPatternRules();
        tool.disableRule("UPPERCASE_SENTENCE_START");
        tool.disableRule("EN_UNPAIRED_BRACKETS");
        tool.disableRule("EN_A_VS_AN");
        this.filename = dictFileName;
        this.dictFile = new File(filename);
        setupDictionaryFiles();
    }
    
    public JLanguageTool getLanguageTool() {
      return tool;
    }


    // not really used anymore
    public void evaluateRules(String grammarfile) throws IOException {
        List<PatternRule> rules = loadPatternRules(grammarfile);
        for (PatternRule rule : rules) {
            String example = generateIncorrectExample(rule);
            System.out.println("Rule " + rule.getId() + " is covered by " + isCoveredBy(example) + " for example " + example);
        }
    }
    
    // not really used anymore
    public void splitOutCoveredRules(String grammarfile, String discardfile) throws IOException {
      List<PatternRule> rules = loadPatternRules(grammarfile);
      
      PrintWriter w = new PrintWriter(new OutputStreamWriter(new FileOutputStream(grammarfile),"UTF-8"));
      PrintWriter w2 = null;
      int discardedRules = 0;
      
        
      for (PatternRule rule : rules) {
        String example = generateIncorrectExample(rule);
        if (isCoveredBy(example) == null) {
          w.write(rule.toXML());
        } else {
          if (w2 == null) {
            w2 = new PrintWriter(new OutputStreamWriter(new FileOutputStream(discardfile),"UTF-8")); 
          }
          discardedRules++;
          w2.write(rule.toXML());
        }
      }
      
      if (discardedRules > 0) {
        System.out.println(Integer.toString(discardedRules) + " rules already covered, written to " + discardfile);
      }
      w.close();
      if (w2 != null) {
        w2.close();
      }
    }
    
    /**
     * Returns true if the input string is covered by an existing JLanguageTool error 
     * @param str input error string
     * @return true if (entire) string is considered an error, false o.w.; this doesn't work
     */
    public boolean isCovered(String str) throws IOException {
        List<RuleMatch> matches = tool.check(str);
        return (matches.size() > 0);        
    }
    
    /**
     * Returns a list of covering rules for the given example string
     */
    public String[] isCoveredBy(String str) throws IOException {
      List<RuleMatch> matches = tool.check(str);
      ArrayList<String> coverages = new ArrayList<>();
      if (matches.size() > 0) {
        for (RuleMatch match : matches) {
          coverages.add(match.getRule().getId());
        }
      }
      return coverages.toArray(new String[coverages.size()]);
    }
    
    public String[] isCoveredBy(PatternRule rule) throws IOException {
      ArrayList<String> coverages = new ArrayList<>();
      String example = generateIncorrectExample(rule);
    List<RuleMatch> matches = tool.check(example);
    if (matches.size() > 0) {
        for (RuleMatch match : matches) {
          coverages.add(match.getRule().getId());
        }
      }
      return coverages.toArray(new String[coverages.size()]);
    }
    
    public ArrayList<String[]> isCoveredBy(List<PatternRule> rules) throws IOException {
      ArrayList<String[]> coverages = new ArrayList<>();
      for (PatternRule rule : rules) {
        String[] cov = isCoveredBy(rule);
        coverages.add(cov);
      }
      return coverages;
    }
    
    /**
     * Generates an error string that matches the given PatternRule object 
     */
    public String generateIncorrectExample(PatternRule patternrule) {
        ArrayList<String> examples = new ArrayList<>();
        List<Element> elements = patternrule.getElements();
        for (int i=0;i<elements.size();i++) {
          List<Element> prevExceptions;
          if (i == elements.size()-1) {
            prevExceptions = new ArrayList<>();
          } else {
            prevExceptions = elements.get(i+1).getPreviousExceptionList();
            if (prevExceptions == null) prevExceptions = new ArrayList<>();
          }
            examples.add(getSpecificExample(elements.get(i),prevExceptions,elements,examples));
        }
        // it's okay to not deal with apostrophes as long as we turn off the unpaired brackets rule, for English at least
        StringBuilder sb = new StringBuilder();
        //TODO: doesn't deal with spacebefore=no
        for (String example : examples) {
          sb.append(example + " ");
        }
        String s = sb.toString().replaceAll("\\ \\.\\ ", "").trim();  // to fix the period problem 
        return s;
    }
    
    // Not using this method yet
//    public String generateCorrectExample(PatternRule patternrule) {
//      String incorrectExample = generateIncorrectExample(patternrule);
//      AnalyzedSentence analyzedSentence = null;
//      try {
//        analyzedSentence = tool.getAnalyzedSentence(incorrectExample);
//        RuleMatch[] ruleMatches = patternrule.match(analyzedSentence);
//        for (RuleMatch rm : ruleMatches) {
//          patternrule.addRuleMatch(rm);
//        }
//      } catch (IOException e) {
//        e.printStackTrace();
//      }
//
//      ArrayList<String> examples = new ArrayList<String>();
//      List<Match> matches = patternrule.getSuggestionMatches();
//      ArrayList<Element> elements = new ArrayList<Element>();
//      for (Match m : matches) {
//        int ref = m.getTokenRef();
//        Element refElement = patternrule.getElements().get(ref);
//        elements.add(refElement);
//      }
//      for (int i=0;i<elements.size();i++) {
//          List<Element> prevExceptions;
//          if (i == elements.size()-1) {
//            prevExceptions = new ArrayList<Element>();
//          } else {
//            prevExceptions = elements.get(i+1).getPreviousExceptionList();
//            if (prevExceptions == null) prevExceptions = new ArrayList<Element>();
//          }
//            examples.add(getSpecificExample(elements.get(i),prevExceptions,elements,examples));
//        }
//        // it's okay to not deal with apostrophes as long as we turn off the unpaired brackets rule, for English at least
//        StringBuilder sb = new StringBuilder();
//        //TODO: doesn't deal with spacebefore=no
//        for (String example : examples) {
//          sb.append(example + " ");
//        }
//        String s = sb.toString().replaceAll("\\ \\.\\ ", ".").trim();  // to fix the period problem 
//        return s;
//    }
//    
    
    /**
     * Generates a word that matches the given Element 
     */
    //TODO: doesn't deal with skipped tokens
    @SuppressWarnings("unchecked")
  public String getSpecificExample(Element e, List<Element> prevExceptions, List<Element> elements, ArrayList<String> examples) {
        // if this is part of (the first of) a list of and-ed tokens
      if (e.hasAndGroup()) {
          List<Element> andGroup = e.getAndGroup();
          andGroup.add(e); // add the token itself to the and group, so we can process them together
          // still, if one of the tokens in the and group is just a (non-regexp) token, we can return that as the example
          for (Element and : andGroup) {
            if (isJustToken(and)) {
              return and.getString();
            }
            if (isPunctuation(and)) {
              return getOnePunc(and);
            }
          }
          // get the patterns of all the and-ed elements, to make processing faster
          List<Pattern> tokenPatterns = new ArrayList<>(andGroup.size());
          List<Pattern> posPatterns = new ArrayList<>(andGroup.size());
          // get all the exceptions and attributes
          List<Element> allExceptions = new ArrayList<>();
          allExceptions.addAll(prevExceptions);  // add all the exceptions from the next token with scope="previous"
          for (int a=0;a<andGroup.size();a++) {
            Element and = andGroup.get(a);
            List<Element> ex = and.getExceptionList();
            if (ex != null) {
              allExceptions.addAll(and.getExceptionList());
            }
            if (and.isReferenceElement()) {
              and = getReferenceElement(and,elements,examples);  // gets the string for the element if it's a match token
            }
            String andPostag = and.getPOStag();
            String andToken = and.getString();
            tokenPatterns.add(Pattern.compile(andToken));
            if (andPostag != null) {
              if (and.isPOStagRegularExpression()) {
                posPatterns.add(Pattern.compile(andPostag));
              } else {
                posPatterns.add(Pattern.compile(Pattern.quote(andPostag)));
              }
              
            } else {
              posPatterns.add(null);
            }
            andGroup.set(a,and);
          }
          // get exceptions in attribute form for faster processings
          List<List<Pattern>> exceptionAttributes = getExceptionAttributes(allExceptions);
          
          // do the dictionary iteration thing; this part could take a while, depending on how far through the dict we have to go
          int numResets = 0;
            while (numResets < 2) {
              if (!dictIterator.hasNext()) {
                dictIterator = resetDictIterator();
                numResets++;
              }
                String word = dictIterator.next().getWord().toString();
                // check if the word meets all the and-ed criteria
                boolean matched = true;
                for (int i=0;i<andGroup.size();i++) {
                  if (!isExampleOf(word, tokenPatterns.get(i), posPatterns.get(i), andGroup.get(i))) {
                    matched = false;
                    break;
                  }
                }
                if (matched) {
                  if (!inExceptionList(word, exceptionAttributes, allExceptions)) {
                    return word;
                  }
                } 
            } 
        } 
      // just a single (non-and-ed) token
      else {
        if (e.isReferenceElement()) {
          e = getReferenceElement(e, elements, examples);
        }
          String token = e.getString();
          String postag = e.getPOStag();
            List<Element> exceptions = e.getExceptionList();
            if (exceptions == null) {
              exceptions = new ArrayList<>();
            }
            exceptions.addAll(prevExceptions);
            
            List<List<Pattern>> exceptionAttributes = getExceptionAttributes(exceptions);


            if (e.isSentenceStart()) {
                return "";
            }
            // <token>word</token>
            if (isJustToken(e)) {
                return token;
            }
            if (isPunctuation(e)) {
          return getOnePunc(e);
        }
            
            // need smarter example generation, especially for simple or-ed lists of words. 
            if (isSimpleOrRegex(e)) {
              // pick an element from the or-ed list at random
              return randomOredElement(e);
            }
            
            Pattern tokenPattern = Pattern.compile(token);
            Pattern posPattern;
            if (postag != null) {
              if (e.isPOStagRegularExpression()) {
                posPattern = Pattern.compile(postag);
              } else {
                posPattern = Pattern.compile(Pattern.quote(postag));
              }
              
              if (postag.equals("SENT_END")) {
                posPattern = null;
              }
              
            } else {
              posPattern = null;
            }
            
            // only allows approx. one pass through the dictionary
            int numResets = 0;
            while (numResets < 2) {
              if (!dictIterator.hasNext()) {
                dictIterator = resetDictIterator();
                numResets++;
              }
                String word = dictIterator.next().getWord().toString();
                if (isExampleOf(word, tokenPattern, posPattern, e) &&
                  !inExceptionList(word, exceptionAttributes, exceptions)) {
                    return word;
                }
            } 
        }
   
        return null;  // if no example can be found
    }
    
    /**
     * Returns an element with the string set as the previously matched element
     */
    private Element getReferenceElement(Element e, List<Element> elements, ArrayList<String> examples) {
      int r = e.getMatch().getTokenRef();
      Element newElement = new Element(examples.get(r), elements.get(r).isCaseSensitive(), false, false);
      newElement.setNegation(e.getNegation());
      return newElement;
      
    }
    
    /**
     * Gets all the attributes of each element of the exception, so we don't have to keep compiling the Pattern,
     * which wastes a lot of time
     */
    @SuppressWarnings("unchecked")
  private List<List<Pattern>> getExceptionAttributes(List<Element> exceptions) {
      if (exceptions.size() == 0) {
        return new ArrayList<>();
      } 
      int size = exceptions.size();
      List<List<Pattern>> ret = new ArrayList<>(6);
      List<Pattern> tokenPatterns = new ArrayList<>(size);
      List<Pattern> posPatterns = new ArrayList<>(size);
      for (Element e : exceptions) {
        String token = e.getString();
        String postag = e.getPOStag();
        Pattern tokenPattern = Pattern.compile(token);
        Pattern posPattern;
            if (postag != null) {
              posPattern = Pattern.compile(postag);
            } else {
              posPattern = null;
            }
            
            tokenPatterns.add(tokenPattern);
            posPatterns.add(posPattern);
            
      }
      ret.add(tokenPatterns);
      ret.add(posPatterns);
      return ret;
    }
    
    /**
     * Returns a random one of the or-ed elements. Random seems like the right thing to do here.
     * Only applied to simple or-ed lists of words, e.g. this|that|those
     */
    private String randomOredElement(Element e) {
      String[] split = e.getString().split("\\|");
      Random rng = new Random();
      int index = rng.nextInt(split.length);
      return split[index];
    }
    
    /** 
     * Faster version of inExceptionList, because we don't have to re-compile the Patterns for the exception elements
     */
    @SuppressWarnings("unchecked")
  private boolean inExceptionList(String word, List<List<Pattern>> exceptionAttributes, List<Element> exceptions) {
      if (exceptions.size() == 0) {
        return false;
      }
      List<Pattern> tokenPatterns = exceptionAttributes.get(0);
      List<Pattern> posPatterns = exceptionAttributes.get(1);
      
      for (int i=0;i<exceptions.size();i++) {
        Element curException = exceptions.get(i);
        if (isExampleOf(word,tokenPatterns.get(i),
            posPatterns.get(i),
            curException)) {
          return true;
        }
      }
      return false;
    }
    
    
    /**
     * Faster version of isExampleOf, since you don't have to recompile the Patterns every time
     */
    public boolean isExampleOf(String word, Pattern tokenPattern, Pattern posPattern, Element e) {
      if (tokenPattern.pattern().isEmpty() && posPattern == null) {
          return true;
        }
      boolean tokenMatches = true;
        boolean postagMatches = false;
        boolean isTokenEmpty = e.getString().isEmpty();
        boolean hasPosTag = (posPattern != null);
        boolean negate = e.getNegation();
        boolean postagNegate = e.getPOSNegation();
        boolean inflected = e.isInflected();
        
        if (posPattern == null) {
          postagMatches = true;
        }
        if (!isTokenEmpty) {
          Matcher m;
          boolean matches = false;
          // checking inflected matches
          if (inflected) {
            if (isInflectedStringMatch(word,e)) {
              matches = true;
            }
          } else {
            m = tokenPattern.matcher(word);
            if (m.matches()) matches = true;
          }
            
            if (matches) {
                if (negate) {
                    tokenMatches = false; 
                }
            } else {
                if (!negate) {
                    tokenMatches = false;
                }
            }
        }
        if (hasPosTag) {
            List<String> postags = getPosTags(word);
            for (String s : postags) {
                Matcher m = posPattern.matcher(s);
                if (m.matches()) {
                    if (!postagNegate) {
                        postagMatches = true;
                        break;
                    }
                } else {
                    if (postagNegate) {
                        postagMatches = true;
                        break;
                    }
                }
            }
            if (postags.size() == 0) {
                postagMatches = false;
            }
            
        }
        return (tokenMatches && postagMatches);
    }
    
    private boolean isInflectedStringMatch(String word, Element e) {
      Matcher m;
      Pattern lemmaPattern = Pattern.compile(RuleConverter.glueWords(getLemmas(e)));
    List<String> wordLemmas = getLemmas(word);
    for (String lemma : wordLemmas) {
      m = lemmaPattern.matcher(lemma);
      if (m.matches()) {
        return true;
      }
    }
    return false;
    }
    
    /**
     * Returns a list of the word's POS tags
     */
    private List<String> getPosTags(String word) {
        List<WordData> lwd = dictLookup.lookup(word);
        ArrayList<String> postags = new ArrayList<>();
        for (WordData wd : lwd) {
            postags.add(wd.getTag().toString());
        }
        return postags;
    }
    /**
     * Returns an or-ed group of the lemmas of a word
     */
    private ArrayList<String> getLemmas(String word) {
      List<WordData> lwd = dictLookup.lookup(word);
      ArrayList<String> lemmas = new ArrayList<>();
      for (WordData wd : lwd) {
        if (!lemmas.contains(wd.getStem())) {
          lemmas.add(wd.getStem().toString());
        }
      }
      return lemmas;
    }
    
    // returns the lemmas of an element; 
    // the point of this method is that so we can get the lemmas of a bunch of or-ed words
    private ArrayList<String> getLemmas(Element e) {
      if (!e.isRegularExpression()) {
        return getLemmas(e.getString());
      } else {
        if (isOrRegex(e)) {
          ArrayList<String> lemmas = new ArrayList<>();
          String[] words = e.getString().split("\\|");
          for (String word : words) {
            lemmas.addAll(getLemmas(word));
          }
          return lemmas;
        }
        return null;
      }
    }
    
    
    /**
     * Returns true if the element has a (non-regexp, non-negated) token and no exception list
     */
    private static boolean isJustToken(Element e) {
      return (!e.getString().isEmpty() && !e.isRegularExpression() && !e.getNegation() && e.getExceptionList() == null);
    }
    
    /**
     * Returns true if the given element's string is a regex set of punctuation.
     * e.g. ['"] or [.,;:?!]
     */
    public static boolean isPunctuation(Element e) {
      if (regexSet.matcher(e.getString()).matches() && !e.getNegation() && e.getPOStag() == null) {
        return true;
      }
      return false;
    }
    
    /**
     * Grabs the first element of a punctuation set matched by the above method.
     */
    public String getOnePunc(Element e) {
      String set = e.getString();
      Matcher m = regexSet.matcher(set);
      m.find();
      return m.group(1);
    }
    
    /** 
     * Returns true if the element is an or-ed list of words, without a specified pos-tag.
     * e.g. can|could|would|should
     */
    private static boolean isSimpleOrRegex(Element e) {
      // any number of conditions that could halt this check
      if (e.getString().isEmpty()) return false;
      if (e.getPOStag() != null) return false;
      if (e.getNegation()) return false;
      if (!e.isRegularExpression()) return false;
      if (e.hasAndGroup()) return false;
      if (e.getExceptionList() != null) return false;
      if (e.isReferenceElement()) return false;
      if (e.isSentenceStart()) return false;
      
      String token = e.getString();
      String[] ors = token.split("\\|");
      for (String s : ors) {
        if (RuleConverter.isRegex(s)) {
          return false;
        }
      }
      return true;
    }
    
    private static boolean isOrRegex(Element e) {
      if (e.getString().isEmpty()) return false;
      String token = e.getString();
      String[] ors = token.split("\\|");
      for (String s : ors) {
        if (RuleConverter.isRegex(s)) {
          return false;
        }
      }
      return true; 
    }
    
    // ** DICTIONARY METHODS ** 
    
    private DictionaryIterator resetDictIterator() {
        DictionaryIterator ret = null;
        try {
          ret = new DictionaryIterator(Dictionary.read(dictFile), Charset.forName("utf8").newDecoder(), true);
        } catch (IOException e) {
          throw new RuntimeException("Could not read " + dictFile, e);
        }
        return ret;        
    }
    
    private IStemmer loadDictionary() throws IOException {
        IStemmer dictLookup = new DictionaryLookup(Dictionary.read(dictFile));
        return dictLookup;
    }
    
    // try several ways to open the dictionary file
    private void setupDictionaryFiles() {
       try {
         filename = "" +  JLanguageTool.getDataBroker().getResourceDir() + "/" + 
               language.getShortName() + "/" + language.getName().toLowerCase() + ".dict";
         dictFile = new File(filename);
          dictLookup = (DictionaryLookup) loadDictionary();
          dictIterator = resetDictIterator();
        } catch (IOException e) {
          try {
            // a different formulation of the filename
            filename = "./src/" +  JLanguageTool.getDataBroker().getResourceDir() + "/" + 
              language.getShortName() + "/" + language.getName().toLowerCase() + ".dict";
            dictFile = new File(filename);
            dictLookup = (DictionaryLookup) loadDictionary();
            dictIterator = resetDictIterator();
          } catch (IOException e2) {
            throw new RuntimeException(e2);
          }
        }
    }
    
    public List<PatternRule> loadPatternRules(final String filename)
        throws IOException {
      final PatternRuleLoader ruleLoader = new PatternRuleLoader();
      InputStream is = this.getClass().getResourceAsStream(filename);
      if (is == null) {
        // happens for external rules plugged in as an XML file:
        return ruleLoader.getRules(new File(filename));
      } else {
        return ruleLoader.getRules(is, filename);
      }
    }
    
    public List<PatternRule> parsePatternRule(final String ruleString) {
      final PatternRuleLoader ruleLoader = new PatternRuleLoader();
      String ruleFileString = ruleFileHeader + categoriesString + ruleString + endCategoriesString + endRulesString;
      InputStream is = new ByteArrayInputStream(ruleFileString.getBytes());
      try {
        return ruleLoader.getRules(is, null);
      } catch (IOException e) {
        return new ArrayList<>();
      }
    }
    
    public List<PatternRule> parsePatternRuleExtraTokens(final String ruleString) {
      String rs = ruleString;
      rs = rs.replace("<pattern>\n", "<pattern>\n<token/>\n");
    rs = rs.replace("</pattern>\n", "<token/>\n</pattern>\n");
      final PatternRuleLoader ruleLoader = new PatternRuleLoader();
      String ruleFileString = ruleFileHeader + categoriesString + rs + endCategoriesString + endRulesString;
      InputStream is = new ByteArrayInputStream(ruleFileString.getBytes());
      try {
        return ruleLoader.getRules(is, null);
      } catch (IOException e) {
        return new ArrayList<>();
      }
    }
    
    public void enableRule(String id) {
      tool.enableDefaultOffRule(id);
    }
    
    
}
// Source Code of org.languagetool.dev.conversion.RuleCoverage

// Related Classes of org.languagetool.dev.conversion.RuleCoverage