/*
 * Package org.apache.lucene.analysis.core
 *
 * Examples of org.apache.lucene.analysis.core.WhitespaceTokenizer
 */
public class TestSmartChineseFactories extends BaseTokenStreamTestCase {
  /**
   * Test showing the behavior with whitespace: the input sentence contains no
   * whitespace, so the WhitespaceTokenizer emits it as a single token which the
   * smart-Chinese filter then re-segments.
   */
  public void testSimple() throws Exception {
    String sentence = "我购买了道具和服装。";
    // Pre-tokenize on whitespace; the whole sentence arrives as one token here.
    WhitespaceTokenizer ws = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(sentence));
    SmartChineseWordTokenFilterFactory factory = new SmartChineseWordTokenFilterFactory();
    TokenStream ts = factory.create(ws);
    // TODO: fix smart chinese to not emit punctuation tokens
    // at the moment: you have to clean up with WDF, or use the stoplist, etc
    assertTokenStreamContents(ts,
View Full Code Here


    // NOTE(review): fragment — initParams and reader are declared above this
    // excerpt; confirm their setup in the full source.
    initParams.put(MorfologikFilterFactory.DICTIONARY_SCHEMA_ATTRIBUTE,
        "morfologik");
    MorfologikFilterFactory factory = new MorfologikFilterFactory();
    factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
    factory.init(initParams);
    TokenStream ts = factory.create(new WhitespaceTokenizer(TEST_VERSION_CURRENT,
        reader));
    // Presumably the inflected Polish forms are reduced to these base forms — verify.
    assertTokenStreamContents(ts, new String[] {"rower", "bilet"});
  }
View Full Code Here

*/
public class TestStempelPolishStemFilterFactory extends BaseTokenStreamTestCase {
  public void testBasics() throws Exception {
    StringReader document = new StringReader("studenta studenci");
    StempelPolishStemFilterFactory factory = new StempelPolishStemFilterFactory();
    TokenStream ts = factory.create(new WhitespaceTokenizer(TEST_VERSION_CURRENT, document));
    assertTokenStreamContents(ts,
        new String[] { "student", "student" });
  }
View Full Code Here

    // Load the configured tokenizer factory when one was specified (tf != null);
    // otherwise leave it null so the analyzer below falls back to whitespace.
    final TokenizerFactory factory = tf == null ? null : loadTokenizerFactory(loader, tf);
   
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // No factory configured -> plain whitespace tokenization.
        Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_31, reader) : factory.create(reader);
        // Lower-case the tokens only when matching should be case-insensitive.
        TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_31, tokenizer) : tokenizer;
        return new TokenStreamComponents(tokenizer, stream);
      }
    };
View Full Code Here

    // Tail of a test whose setup is above this excerpt: front 3-grams of the
    // tokens in `input`, with matching start/end offsets.
    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 3, 3);
    assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
  }
 
  public void testReset() throws Exception {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
    EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
    assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3});
    tokenizer.setReader(new StringReader("abcde"));
    assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3});
  }
View Full Code Here

    Reader reader = new StringReader("This is a Test");
    ICUNormalizer2FilterFactory factory = new ICUNormalizer2FilterFactory();
    factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
    // No arguments: exercise the factory's default normalization form.
    Map<String, String> args = Collections.emptyMap();
    factory.init(args);
    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
    TokenStream stream = factory.create(tokenizer);
    // The default form case-folds, so "This"/"Test" come out lower-cased.
    assertTokenStreamContents(stream, new String[] { "this", "is", "a", "test" });
  }
View Full Code Here

    Reader reader = new StringReader("簡化字");
    ICUTransformFilterFactory factory = new ICUTransformFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    // ICU transliterator ID: convert Traditional to Simplified Chinese.
    args.put("id", "Traditional-Simplified");
    factory.init(args);
    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
    TokenStream stream = factory.create(tokenizer);
    // The traditional input maps to its simplified equivalent.
    assertTokenStreamContents(stream, new String[] { "简化字" });
  }
View Full Code Here

    Reader reader = new StringReader("Российская Федерация");
    ICUTransformFilterFactory factory = new ICUTransformFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("id", "Cyrillic-Latin");
    factory.init(args);
    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
    TokenStream stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "Rossijskaâ""Federaciâ" });
   
    // backward (invokes Latin-Cyrillic)
    reader = new StringReader("Rossijskaâ Federaciâ");
    args.put("direction", "reverse");
    factory.init(args);
    tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
    stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "Российская", "Федерация" });
  }
View Full Code Here

  /** basic tests to ensure the folding is working */
  public void test() throws Exception {
    Reader reader = new StringReader("Résumé");
    ICUFoldingFilterFactory factory = new ICUFoldingFilterFactory();
    factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
    TokenStream stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "resume" });
  }
View Full Code Here

  /**
   * Decomposition of a compound whose final component ("ef") is exactly the
   * minimum dictionary-word length.
   */
  public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
    CharArraySet dict = makeDictionary("ab", "cd", "ef");

    // NOTE(review): this MockTokenizer appears unused in the visible excerpt —
    // the filter below wraps a fresh WhitespaceTokenizer instead. Confirm
    // against the full source whether it is dead code there too.
    Tokenizer tokenizer = new MockTokenizer(new StringReader("abcdef"), MockTokenizer.WHITESPACE, false);
    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
      new WhitespaceTokenizer(TEST_VERSION_CURRENT,
        new StringReader(
          "abcdef")
        ),
      dict,
      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.core.WhitespaceTokenizer

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.