Package org.languagetool

Examples of org.languagetool.AnalyzedTokenReadings


              continue;
            }
            l.add(new AnalyzedToken(word, null, null));
          }
        }
        tokenReadings.add(new AnalyzedTokenReadings(l, pos));
        pos += word.length();
        break;
      }
    }
View Full Code Here


    int pos = 0;
    for (String chunkTag : chunkTags) {
      int startPos = pos;
      int endPos = startPos + tokens[i].length();
      //System.out.println("OPEN: " + tokens[i]);
      AnalyzedTokenReadings readings = getAnalyzedTokenReadingsFor(startPos, endPos, tokenReadings);
      result.add(new ChunkTaggedToken(tokens[i], Collections.singletonList(new ChunkTag(chunkTag)), readings));
      pos = endPos;
      i++;
    }
    return result;
View Full Code Here

    return result;
  }

  private void assignChunksToReadings(List<ChunkTaggedToken> chunkTaggedTokens) {
    for (ChunkTaggedToken taggedToken : chunkTaggedTokens) {
      AnalyzedTokenReadings readings = taggedToken.getReadings();
      if (readings != null) {
        readings.setChunkTags(taggedToken.getChunkTags());
      }
    }
  }
View Full Code Here

      if (l.isEmpty()) {
        l.add(new AnalyzedToken(word, null, null));
      }

      tokenReadings.add(new AnalyzedTokenReadings(l, pos));
      pos += word.length();
    }

    return tokenReadings;
  }
View Full Code Here

      String originalTokenStr = tokenReadings.getToken();
      if (ignoreTaggedWords && tokenReadings.isTagged()) {
        continue;
      }
      String tokenString = originalTokenStr.toLowerCase(getLocale());
      AnalyzedTokenReadings analyzedTokenReadings=null;
      String infinitive=null;
      int i=0;
      while (i<2 && analyzedTokenReadings == null) {
        Matcher m = desinencies_1conj[i].matcher(tokenString);
        if (m.matches()) {
View Full Code Here

              + line + ", " + "expected 3 semicolon-separated parts, got "
              + parts.length);
        }
        final AnalyzedToken analyzedToken = new AnalyzedToken(parts[1],
            parts[2], null);
        map.put(parts[0], new AnalyzedTokenReadings(analyzedToken, 0));
      }
    }
    return map;
  }
View Full Code Here

      // This is important when the reference points to a token with min="0", which has not been
      // matched... the subsequent match elements need to be renumbered, I guess, and that one
      // silently discarded
      idx = tokens.length - 1;
    }
    AnalyzedTokenReadings firstMatchTokenObj = tokens[idx];
    boolean startsWithUppercase = StringTools.startsWithUppercase(firstMatchTokenObj.getToken())
        && matchPreservesCase(rule.getSuggestionMatches(), rule.getMessage())
        && matchPreservesCase(rule.getSuggestionMatchesOutMsg(), rule.getSuggestionsOutMsg());

    if (firstMatchTokenObj.isSentenceStart() && tokens.length > firstMatchToken + correctedStPos + 1) {
      // make uppercasing work also at sentence start:
      firstMatchTokenObj = tokens[firstMatchToken + correctedStPos + 1];
      startsWithUppercase = StringTools.startsWithUppercase(firstMatchTokenObj.getToken());
    }
    if (firstMarkerMatchToken == -1) {
      firstMarkerMatchToken = firstMatchToken;
    }
    int fromPos = tokens[firstMarkerMatchToken].getStartPos();
    // FIXME: this is fishy, assumes that comma should always come before whitespace:
    if (errMessage.contains(SUGGESTION_START_TAG + ",") && firstMarkerMatchToken >= 1) {
      fromPos = tokens[firstMarkerMatchToken - 1].getStartPos()
          + tokens[firstMarkerMatchToken - 1].getToken().length();
    }
    if (lastMarkerMatchToken == -1) {
      lastMarkerMatchToken = lastMatchToken;
    }
    final AnalyzedTokenReadings token = tokens[Math.min(lastMarkerMatchToken, tokens.length-1)];
    int toPos = token.getStartPos() + token.getToken().length();
    if (fromPos < toPos) { // this can happen with some skip="-1" when the last token is not matched
      //now do some spell-checking:
      if (!(errMessage.contains(PatternRuleHandler.PLEASE_SPELL_ME) && errMessage.contains(MISTAKE))) {
        final String clearMsg = errMessage.replaceAll(PatternRuleHandler.PLEASE_SPELL_ME, "").replaceAll(MISTAKE, "");
        final RuleMatch ruleMatch = new RuleMatch(rule, fromPos, toPos, clearMsg,
View Full Code Here

          }
        } else {
          l.add(new AnalyzedToken(word, null, null));
        }
      }
      tokenReadings.add(new AnalyzedTokenReadings(l, pos));
      pos += word.length();
    }

    return tokenReadings;
  }
View Full Code Here

    }
  }

  // non-private for tests
  String getBetterAlternativeOrNull(AnalyzedTokenReadings[] tokens, int pos, ConfusionSet confusionSet) {
    AnalyzedTokenReadings token = tokens[pos];
    //
    // TODO: LT's tokenization is different to the Google one. E.g. Google "don't" vs LT "don ' t"
    //
    String next = getStringAtOrNull(tokens, pos + 1);
    String next2 = getStringAtOrNull(tokens, pos + 2);
    String prev = getStringAtOrNull(tokens, pos - 1);
    String prev2 = getStringAtOrNull(tokens, pos - 2);
    if ((next + next2 + prev + prev2).contains(",")) {
      // v1 of Google ngram corpus doesn't contain commas, so we better stop instead of getting confused:
      return null;
    }
    @SuppressWarnings("UnnecessaryLocalVariable")
    double textScore = score(token.getToken(), next, next2, prev, prev2);
    if (textScore >= MAX_TEXT_SCORE) {
      // too common, let's assume it is not an error
      return null;
    }
    double bestScore = textScore;
    String betterAlternative = null;
    for (String alternative : confusionSet.set) {
      if (alternative.equalsIgnoreCase(token.getToken())) {
        // this is the text variant, calculated above already...
        continue;
      }
      double alternativeScore = score(alternative, next, next2, prev, prev2);
      if (alternativeScore >= bestScore + MIN_SCORE_DIFF && alternativeScore >= MIN_ALTERNATIVE_SCORE) {
View Full Code Here

    final AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();

    RuleMatch prevRuleMatch = null;
    final Queue<AnalyzedTokenReadings> prevTokens = new ArrayBlockingQueue<>(MAX_TERMS);
    for (int i = 0; i < tokens.length + MAX_TERMS-1; i++) {
      final AnalyzedTokenReadings token;
      // we need to extend the token list so we find matches at the end of the original list:
      if (i >= tokens.length) {
        token = new AnalyzedTokenReadings(new AnalyzedToken("", "", null), prevTokens.peek().getStartPos());
      } else {
        token = tokens[i];
      }
      if (i == 0) {
        addToQueue(token, prevTokens);
        continue;
      }
      if (token.isImmunized()) {
        continue;
      }

      final AnalyzedTokenReadings firstMatchToken = prevTokens.peek();
      final List<String> stringsToCheck = new ArrayList<>();
      final List<String> origStringsToCheck = new ArrayList<>();    // original upper/lowercase spelling
      final Map<String, AnalyzedTokenReadings> stringToToken =
              getStringToTokenMap(prevTokens, stringsToCheck, origStringsToCheck);
      // iterate backwards over all potentially incorrect strings to make
      // sure we match longer strings first:
      for (int k = stringsToCheck.size()-1; k >= 0; k--) {
        final String stringToCheck = stringsToCheck.get(k);
        final String origStringToCheck = origStringsToCheck.get(k);
        if (incorrectCompounds.contains(stringToCheck)) {
          final AnalyzedTokenReadings atr = stringToToken.get(stringToCheck);
          String msg = null;
          final List<String> replacement = new ArrayList<>();
          if (!noDashSuggestion.contains(stringToCheck)) {
            replacement.add(origStringToCheck.replace(' ', '-'));
            msg = withHyphenMessage;
          }
          if (isNotAllUppercase(origStringToCheck) && !onlyDashSuggestion.contains(stringToCheck)) {
            replacement.add(mergeCompound(origStringToCheck));
            msg = withoutHyphenMessage;
          }
          final String[] parts = stringToCheck.split(" ");
          if (parts.length > 0 && parts[0].length() == 1) {
            replacement.clear();
            replacement.add(origStringToCheck.replace(' ', '-'));
            msg = withHyphenMessage;
          } else if (replacement.isEmpty() || replacement.size() == 2) {     // isEmpty shouldn't happen
            msg = withOrWithoutHyphenMessage;
          }
          final RuleMatch ruleMatch = new RuleMatch(this, firstMatchToken.getStartPos(),
              atr.getStartPos() + atr.getToken().length(), msg, shortDesc);
          // avoid duplicate matches:
          if (prevRuleMatch != null && prevRuleMatch.getFromPos() == ruleMatch.getFromPos()) {
            prevRuleMatch = ruleMatch;
            break;
          }
View Full Code Here

TOP

Related Classes of org.languagetool.AnalyzedTokenReadings

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.