Package org.apache.lucene.analysis.shingle

Examples of org.apache.lucene.analysis.shingle.ShingleFilter$InputWindowToken


  @Test
  public void testShingleFilteredAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_31);
    TokenStream ts = analyzer.tokenStream(null, reader);
    ShingleFilter sf = new ShingleFilter(ts, 3);
    TokenStream f = new BloomTokenFilter(getFilter(shingleKeepTokens),  true, sf);
    validateTokens(expectedShingleTokens, f);
  }
View Full Code Here


      Reader reader2 = (normMap == null ? reader : new MappingCharFilter(normMap,reader));
         
    final Tokenizer source = new WhitespaceTokenizer(matchVersion, reader2);
      TokenStream tokenStream = new LowerCaseFilter(matchVersion, source);
      tokenStream = new ShingleFilter(tokenStream, 2, 3);
      tokenStream = new NTermStopFilter(matchVersion, tokenStream, ntermStopFilterRules);   
      return new TokenStreamComponents(source, tokenStream);
    }
View Full Code Here

    StringTuple value = it.next();

    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size

    if (maxNGramSize >= 2) {
      ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxNGramSize);
      sf.reset();
      try {
        do {
          String term = sf.getAttribute(CharTermAttribute.class).toString();
          if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram
            int termId = dictionary.get(term);
            vector.setQuick(termId, vector.getQuick(termId) + 1);
          }
        } while (sf.incrementToken());

        sf.end();
      } finally {
        Closeables.close(sf, true);
      }
    } else {
      for (String term : value.getEntries()) {
View Full Code Here

        DataBag result;
        if (minGramSize == 1 && maxGramSize == 1) {
            result = fillBag(filtered);
        } else {
            ShingleFilter nGramStream = new ShingleFilter(filtered, minGramSize, maxGramSize);       
            nGramStream.setOutputUnigrams(outputUnigrams);               
            PatternReplaceFilter replacer = new PatternReplaceFilter(nGramStream, SHINGLE_FILLER, NOFIELD, true);
            result = fillBag(replacer);
        }
        return result;
    }
View Full Code Here

            this.fillerToken = fillerToken;
            this.name = name;
        }

        public TokenStream create(TokenStream tokenStream) {
            ShingleFilter filter = new ShingleFilter(tokenStream, minShingleSize, maxShingleSize);
            filter.setOutputUnigrams(outputUnigrams);
            filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
            filter.setTokenSeparator(tokenSeparator);
            filter.setFillerToken(fillerToken);
            return filter;
        }
View Full Code Here

   *  Tests PositionFilter setting all but the first positionIncrement to zero.
   * @throws java.io.IOException @see Token#next(Token)
   */
  public void test6GramFilterNoPositions() throws Exception {

    ShingleFilter filter = new ShingleFilter(new TestTokenStream(TEST_TOKEN), 6);
    assertTokenStreamContents(new PositionFilter(filter),
               SIX_GRAM_NO_POSITIONS_TOKENS,
               SIX_GRAM_NO_POSITIONS_INCREMENTS);
  }
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.shingle.ShingleFilter$InputWindowToken

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.