Examples of morfologik.stemming.IStemmer

morfologik.stemming.IStemmer

        }
        return ret;        
    }
    
    private IStemmer loadDictionary() throws IOException {
        IStemmer dictLookup = new DictionaryLookup(Dictionary.read(dictFile));
        return dictLookup;
    }

View Full Code Here

    this.manualSynthesizer = manualSynthesizer;
  }


  @Override
  protected void initSynthesizer() throws IOException {
    synthesizer = new IStemmer() { // null synthesiser 
      @Override
      public List<WordData> lookup(CharSequence word) {
        return new ArrayList<>();
      }
    };

View Full Code Here

        return !dictLookup.lookup(word).isEmpty();
    }
    
    // this should be general, not specific to English
    private static IStemmer loadDictionary() {
        IStemmer dictLookup;
        URL url = JLanguageTool.getDataBroker().getFromResourceDirAsUrl(ENGLISH_DICT);
        File dictFile;
        try {
          dictFile = new File(url.toURI());
        } catch (URISyntaxException e) {

View Full Code Here


    boolean firstWord = true;
    final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
    int pos = 0;


    final IStemmer morfologik = new DictionaryLookup(dictionary);


    for (String word : sentenceTokens) {
      final List<AnalyzedToken> l = new ArrayList<>();
      String[] taggerTokens = lexiconLookup(word, morfologik);
      if (firstWord && taggerTokens == null && ignoreCase) { // e.g. "Das" -> "das" at start of sentence

View Full Code Here

  @Override
  public final List<AnalyzedTokenReadings> tag(
      final List<String> sentenceTokens) throws IOException {
    final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
    int pos = 0;
    final IStemmer morfologik = new DictionaryLookup(getDictionary());
    if (manualTagger == null && plaintextDictPath != null) {
      manualTagger = new ManualTagger(JLanguageTool.getDataBroker().getFromResourceDirAsStream(plaintextDictPath));
    }


    for (final String word : sentenceTokens) {
      final List<AnalyzedToken> l = new ArrayList<>();
      final String lowerCaseWord = word.toLowerCase(RO_LOCALE);
      final List<WordData> taggerTokens = morfologik.lookup(lowerCaseWord);
      if (taggerTokens != null) {
        for (WordData wd : taggerTokens) {
          final String[] tagsArr = wd.getStem().toString().split("\\+");
          for (final String currTag : tagsArr) {
            l.add(new AnalyzedToken(word,

View Full Code Here

    List<AnalyzedToken> taggerTokens;
    List<AnalyzedToken> lowerTaggerTokens;
    List<AnalyzedToken> upperTaggerTokens;
    final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
    int pos = 0;
    final IStemmer dictLookup = new DictionaryLookup(getDictionary());


    for (String word : sentenceTokens) {
      final List<AnalyzedToken> l = new ArrayList<>();
      final String lowerWord = word.toLowerCase(conversionLocale);
      taggerTokens = asAnalyzedTokenList(word, dictLookup.lookup(word));
      lowerTaggerTokens = asAnalyzedTokenList(word, dictLookup.lookup(lowerWord));
      final boolean isLowercase = word.equals(lowerWord);
      final boolean isMixedCase = StringTools.isMixedCase(word);


      //normal case
      addTokens(taggerTokens, l);


      //tag non-lowercase (alluppercase or startuppercase), but not mixedcase word with lowercase word tags
      if (!isLowercase && !isMixedCase) {
        addTokens(lowerTaggerTokens, l);
      }


      //tag lowercase word with startuppercase word tags
      if (tagLowercaseWithUppercase) {
        if (lowerTaggerTokens.isEmpty() && taggerTokens.isEmpty()) {
          if (isLowercase) {
            upperTaggerTokens = asAnalyzedTokenList(word,
                dictLookup.lookup(StringTools.uppercaseFirstChar(word)));
            if (!upperTaggerTokens.isEmpty()) {
              addTokens(upperTaggerTokens, l);
            }
          }
        }

View Full Code Here

    List<AnalyzedToken> taggerTokens;
    List<AnalyzedToken> lowerTaggerTokens;
    List<AnalyzedToken> upperTaggerTokens;
    final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
    int pos = 0;
    final IStemmer dictLookup = new DictionaryLookup(getDictionary());


    Matcher matcher;
    for (String word : sentenceTokens) {
      String probeWord = word;


      // This loop happens when we need to retry probing the dictionary
      // which happens rarely when trying to remove suffixes -mañ, -se, etc.
      for (;;) {
        final List<AnalyzedToken> l = new ArrayList<>();
        final String lowerWord = probeWord.toLowerCase(conversionLocale);
        taggerTokens = asAnalyzedTokenList(word, dictLookup.lookup(probeWord));
        lowerTaggerTokens = asAnalyzedTokenList(word, dictLookup.lookup(lowerWord));
        final boolean isLowercase = probeWord.equals(lowerWord);


        // Normal case.
        addTokens(taggerTokens, l);


        if (!isLowercase) {
          // Lowercase.
          addTokens(lowerTaggerTokens, l);
        }


        // Uppercase.
        if (lowerTaggerTokens.isEmpty() && taggerTokens.isEmpty()) {
          if (isLowercase) {
            upperTaggerTokens = asAnalyzedTokenList(word,
                dictLookup.lookup(StringTools.uppercaseFirstChar(probeWord)));
            if (!upperTaggerTokens.isEmpty()) {
              addTokens(upperTaggerTokens, l);
            }
          }
          if (l.isEmpty()) {

View Full Code Here

    } else if (ADD_IND_DETERMINER.equals(posTag)) {
      final AvsAnRule rule = new AvsAnRule(null);
      return new String[] { rule.suggestAorAn(token.getToken()) };
    }


    final IStemmer synthesizer = createStemmer();
    final List<WordData> wordData = synthesizer.lookup(token.getLemma() + "|" + posTag);
    final List<String> wordForms = new ArrayList<>();
    for (WordData wd : wordData) {
      wordForms.add(wd.getStem().toString());
    }
    return wordForms.toArray(new String[wordForms.size()]);

View Full Code Here

        return !dictLookup.lookup(word).isEmpty();
    }
    
    // this should be general, not specific to English
    private static IStemmer loadDictionary() {
        IStemmer dictLookup;
        URL url = JLanguageTool.getDataBroker().getFromResourceDirAsUrl(ENGLISH_DICT);
        File dictFile;
        try {
          dictFile = new File(url.toURI());
        } catch (URISyntaxException e) {

View Full Code Here

      throws IOException {
    initializeIfRequired();


    final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
    int pos = 0;
    final IStemmer dictLookup = new DictionaryLookup(getDictionary());


    for (String word : sentenceTokens) {
      final List<AnalyzedToken> l = new ArrayList<>();
      final String lowerWord = word.toLowerCase(conversionLocale);
      final boolean isLowercase = word.equals(lowerWord);
      final boolean isMixedCase = StringTools.isMixedCase(word);
      List<AnalyzedToken> manualTaggerTokens=manualTagsAsAnalyzedTokenList(word, manualTagger.lookup(word));
      List<AnalyzedToken> manualLowerTaggerTokens=manualTagsAsAnalyzedTokenList(word, manualTagger.lookup(lowerWord));


      // normal case, manual tagger
      addTokens(manualTaggerTokens, l);
      // normal case, tagger dictionary
      if (manualTaggerTokens.isEmpty()) {
        addTokens(asAnalyzedTokenList(word, dictLookup.lookup(word)), l);
      }
      // tag non-lowercase words (alluppercase or startuppercase but not mixedcase)
      // with lowercase word tags
      if (!isLowercase && !isMixedCase) {
        // manual tagger
        addTokens(manualLowerTaggerTokens, l);
        // tagger dictionary
        if (manualLowerTaggerTokens.isEmpty()) {
          addTokens(asAnalyzedTokenList(word, dictLookup.lookup(lowerWord)), l);
        }
      }
      // additional tagging with prefixes
      if (l.isEmpty() && !isMixedCase) {
        addTokens(additionalTags(word), l);

View Full Code Here

0 1

TOP

Related Classes of morfologik.stemming.IStemmer

org.languagetool.dev.conversion.AtdRuleConverter

org.languagetool.dev.conversion.RuleCoverage

org.languagetool.synthesis.ca.CatalanSynthesizer

org.languagetool.synthesis.en.EnglishSynthesizer

org.languagetool.synthesis.ManualSynthesizerAdapter

org.languagetool.synthesis.pl.PolishSynthesizer

org.languagetool.tagging.BaseTagger

org.languagetool.tagging.br.BretonTagger

org.languagetool.tagging.ca.CatalanTagger

org.languagetool.tagging.de.GermanTagger

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.