Package org.apache.lucene.analysis.core

Examples of org.apache.lucene.analysis.core.WhitespaceTokenizer
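WhitespaceTokenizer splits input on whitespace only; it does not lower-case, strip punctuation, or apply any other normalization, which is why the excerpts below almost always chain extra filters behind it. Here is a minimal, self-contained sketch of consuming its tokens directly, assuming the Lucene 4.x constructor that takes a Version and a Reader (the form used in the excerpts below); the class name and input text are illustrative only.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class WhitespaceTokenizerDemo {
  public static void main(String[] args) throws IOException {
    Tokenizer tok = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader("Foo bar-baz 42"));
    CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
    tok.reset();                              // required before the first incrementToken()
    while (tok.incrementToken()) {
      System.out.println(term.toString());    // prints: Foo, bar-baz, 42 (case and punctuation kept)
    }
    tok.end();
    tok.close();
  }
}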


      // Tail of a createComponents(...) override: an optional character-mapping
      // normalization step feeds a whitespace-tokenized, lower-cased stream that
      // is shingled into 2- and 3-token phrases and passed through a custom stop filter.
      NormalizeCharMap normMap = builder.build();

      Reader reader2 = (normMap == null ? reader : new MappingCharFilter(normMap, reader));

      final Tokenizer source = new WhitespaceTokenizer(matchVersion, reader2);
      TokenStream tokenStream = new LowerCaseFilter(matchVersion, source);
      tokenStream = new ShingleFilter(tokenStream, 2, 3);
      tokenStream = new NTermStopFilter(matchVersion, tokenStream, ntermStopFilterRules);
      return new TokenStreamComponents(source, tokenStream);
    }
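Most of the excerpts on this page override Analyzer.createComponents(String, Reader), with the WhitespaceTokenizer as the source and the last filter as the sink. As a point of reference, here is a complete, minimal sketch of that pattern and of how such an analyzer is consumed; the class name is hypothetical, the chain (whitespace + lower-case) is the simplest one appearing on this page, and Lucene 4.x is assumed.

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class WhitespaceLowerCaseAnalyzer extends Analyzer {

  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    // The tokenizer is the "source"; the end of the filter chain is the "sink".
    Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_43, reader);
    TokenStream sink = new LowerCaseFilter(Version.LUCENE_43, source);
    return new TokenStreamComponents(source, sink);
  }

  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new WhitespaceLowerCaseAnalyzer();
    TokenStream ts = analyzer.tokenStream("body", new StringReader("Hello Lucene World"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString());    // hello, lucene, world
    }
    ts.end();
    ts.close();
  }
}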


    // Delegating constructor: the second argument presumably toggles the
    // `stemming` flag used in createComponents(...) below.
    this(matchVersion, true);
  }

  @Override
  protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    Tokenizer source = new WhitespaceTokenizer(matchVersion, reader);
    TokenStream filter = new LowerCaseEntityPreservingFilter(source);

    if (stemming) {
      // Porter stemmer ignores words which are marked as keywords
      filter = new PorterStemFilter(filter);
      // ... (excerpt truncated)


    }

    @Override
    protected TokenStreamComponents createComponents(final String fieldName,
            final Reader reader) {
        WhitespaceTokenizer src = new WhitespaceTokenizer(matchVersion, reader);
        TokenStream tok = new LowerCaseFilter(matchVersion, src);
        // Split on intra-word delimiters, keeping word and number parts and
        // stripping trailing English possessives ('s).
        tok = new WordDelimiterFilter(tok,
                WordDelimiterFilter.GENERATE_WORD_PARTS
                        | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE
                        | WordDelimiterFilter.GENERATE_NUMBER_PARTS, null);
        // ... (excerpt truncated)

            // Tail of a per-field createComponents(...); the first branch is truncated
            // in this excerpt. Fields prefixed "_ngram_" get a 3-4 character
            // NGramTokenizer; the default branch builds a whitespace + lower-case +
            // word-delimiter chain.
            stream = new WordDelimiterFilter(new LowerCaseFilter(Version.LUCENE_43, source),
                    WordDelimiterFilter.ALPHANUM, CharArraySet.EMPTY_SET);
        } else if (fieldName.startsWith("_ngram_")) {
            source = new NGramTokenizer(reader, 3, 4);
            stream = source;    // pass the tokenizer output straight through for n-gram fields
        } else {
            source = new WhitespaceTokenizer(Version.LUCENE_43, reader);
            stream = new WordDelimiterFilter(new LowerCaseFilter(Version.LUCENE_43, source),
                    WordDelimiterFilter.ALPHANUM, CharArraySet.EMPTY_SET);
        }

        return new TokenStreamComponents(source, stream);
    }
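The excerpt above chooses the tokenizer per field by branching on fieldName inside a single createComponents(...). Where the relevant field names are known exactly (rather than matched by prefix, as with "_ngram_" above), a similar effect can be had with Lucene's PerFieldAnalyzerWrapper. The sketch below is only an illustration of that alternative, not the code the excerpt comes from; the field name "_ngram_title" is made up.

import java.io.Reader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.util.Version;

public class PerFieldExample {

  public static Analyzer build() {
    // 3-4 character n-grams for selected fields, plain whitespace tokenization elsewhere.
    Analyzer ngram = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new NGramTokenizer(reader, 3, 4);
        return new TokenStreamComponents(source);
      }
    };
    Map<String, Analyzer> perField = new HashMap<String, Analyzer>();
    perField.put("_ngram_title", ngram);    // exact field name, illustrative
    return new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_43), perField);
  }
}

PerFieldAnalyzerWrapper dispatches on exact field names only, so the prefix test in the excerpt cannot be expressed this way; the in-analyzer branch remains the simpler choice when field names are open-ended.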


    /**
     * Checks that the pair-concatenating filter interleaves each pair of
     * adjacent tokens, concatenated, with the original tokens.
     */
    public void testExamples() throws IOException {
        Tokenizer wsTokenizer = new WhitespaceTokenizer(LuceneUtils.CURRENT_VERSION, new StringReader("one two three"));
        TokenStream filter = new TokenPairConcatenatingFilter(wsTokenizer);
        assertTokenStreamContents(filter,
                new String[]{"one", "onetwo", "two", "twothree", "three"});
    }
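For comparison, the same assertTokenStreamContents helper (from Lucene's BaseTokenStreamTestCase) can be pointed at the tokenizer alone. The sketch below is written in the style of the test above and reuses its LuceneUtils.CURRENT_VERSION constant; it only illustrates that WhitespaceTokenizer keeps case and punctuation intact.

    public void testWhitespaceSplittingOnly() throws IOException {
        // Splits on runs of whitespace; no lower-casing, no punctuation stripping.
        Tokenizer tok = new WhitespaceTokenizer(LuceneUtils.CURRENT_VERSION,
                new StringReader("Foo  bar-Baz qux."));
        assertTokenStreamContents(tok, new String[]{"Foo", "bar-Baz", "qux."});
    }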

    /**
     * Checks that clear() resets the filter's internal state.
     *
     * @throws java.io.IOException
     */
    @Test
    public void testClear() throws IOException {

        TokenStream ts = new WhitespaceTokenizer(LuceneUtils.CURRENT_VERSION, new StringReader("one two three"));
        TokenPairConcatenatingFilter filter = new TokenPairConcatenatingFilter(ts);
        assertTokenStreamContents(filter, new String[]{"one", "onetwo", "two", "twothree", "three"});

        assertNotNull(filter.getPreviousWord());
        filter.clear();
        // ... (excerpt truncated)

    /**
     * Builds the analysis chain for this analyzer: whitespace tokenization
     * followed by lower-casing.
     *
     * @param fieldName the name of the field being analyzed
     * @param reader the reader containing the input
     * @return the TokenStreamComponents
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        final Tokenizer source = new WhitespaceTokenizer(version, reader);
        TokenStream stream = source;
        stream = new LowerCaseFilter(version, stream);
        return new TokenStreamComponents(source, stream);
    }

    /**
     * Builds the analysis chain for this analyzer: whitespace tokenization,
     * lower-casing, then a version-tokenizing filter.
     *
     * @param fieldName the name of the field being analyzed
     * @param reader the reader containing the input
     * @return the TokenStreamComponents
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        final Tokenizer source = new WhitespaceTokenizer(version, reader);
        TokenStream stream = source;
        stream = new LowerCaseFilter(version, stream);
        stream = new VersionTokenizingFilter(stream);
        return new TokenStreamComponents(source, stream);
    }
