Package org.apache.lucene.analysis

Examples of org.apache.lucene.analysis.KeywordTokenizer


    assertEquals( "Big"new String(termBuffer, 0, termBuffer.length));
    termBuffer = "BIG".toCharArray();
    factory.processWord(termBuffer, 0, termBuffer.length, 0 );
    assertEquals( "BIG"new String(termBuffer, 0, termBuffer.length));
   
    Tokenizer tokenizer = new KeywordTokenizer(new StringReader("Hello thEre my Name is Ryan"));
    TokenStream stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "Hello there my name is ryan" });
   
    // now each token
    factory.onlyFirstWord = false;
    tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"));
    stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" });
   
    // now only the long words
    factory.minWordLength = 3;
    tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan" ));
    stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "Hello", "There", "my", "Name", "is", "Ryan" });
   
    // without prefix
    tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" ));
    stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "Mckinley" });
   
    // Now try some prefixes
    factory = new CapitalizationFilterFactory();
    args.put( "okPrefix", "McK" )// all words
    factory.init( args );
    tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" ));
    stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "McKinley" });
   
    // now try some stuff with numbers
    factory.forceFirstLetter = false;
    factory.onlyFirstWord = false;
    tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("1st 2nd third" ));
    stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "1st", "2nd", "Third" });
   
    factory.forceFirstLetter = true
    tokenizer = new KeywordTokenizer(new StringReader("the The the" ));
    stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "The The the" });
  }
View Full Code Here


  public void testMaxWordCount2() throws Exception {
    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
    args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2");
    CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
    factory.init(args);
    Tokenizer tokenizer = new KeywordTokenizer(new StringReader(
        "one two three four"));
    TokenStream ts = factory.create(tokenizer);
    assertTokenStreamContents(ts, new String[] {"one two three four"});
  }
View Full Code Here

    args.put("language", "tr");
    args.put("strength", "primary");
    factory.init(args);
    factory.inform(new StringMockSolrResourceLoader(""));
    TokenStream tsUpper = factory.create(
        new KeywordTokenizer(new StringReader(turkishUpperCase)));
    TokenStream tsLower = factory.create(
        new KeywordTokenizer(new StringReader(turkishLowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
  }
View Full Code Here

    args.put("strength", "primary");
    args.put("decomposition", "canonical");
    factory.init(args);
    factory.inform(new StringMockSolrResourceLoader(""));
    TokenStream tsUpper = factory.create(
        new KeywordTokenizer(new StringReader(turkishUpperCase)));
    TokenStream tsLower = factory.create(
        new KeywordTokenizer(new StringReader(turkishLowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
  }
View Full Code Here

    args.put("strength", "identical");
    args.put("decomposition", "full");
    factory.init(args);
    factory.inform(new StringMockSolrResourceLoader(""));
    TokenStream tsFull = factory.create(
        new KeywordTokenizer(new StringReader(fullWidth)));
    TokenStream tsHalf = factory.create(
        new KeywordTokenizer(new StringReader(halfWidth)));
    assertCollatesToSame(tsFull, tsHalf);
  }
View Full Code Here

    args.put("strength", "secondary");
    args.put("decomposition", "no");
    factory.init(args);
    factory.inform(new StringMockSolrResourceLoader(""));
    TokenStream tsUpper = factory.create(
        new KeywordTokenizer(new StringReader(upperCase)));
    TokenStream tsLower = factory.create(
        new KeywordTokenizer(new StringReader(lowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
  }
View Full Code Here

    args.put("custom", "rules.txt");
    args.put("strength", "primary");
    factory.init(args);
    factory.inform(new StringMockSolrResourceLoader(tailoredRules));
    TokenStream tsUmlaut = factory.create(
        new KeywordTokenizer(new StringReader(germanUmlaut)));
    TokenStream tsOE = factory.create(
        new KeywordTokenizer(new StringReader(germanOE)));

    assertCollatesToSame(tsUmlaut, tsOE);
  }
View Full Code Here

    args.put("locale", "tr");
    args.put("strength", "primary");
    factory.init(args);
    factory.inform(new StringMockSolrResourceLoader(""));
    TokenStream tsUpper = factory.create(
        new KeywordTokenizer(new StringReader(turkishUpperCase)));
    TokenStream tsLower = factory.create(
        new KeywordTokenizer(new StringReader(turkishLowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
  }
View Full Code Here

    args.put("strength", "primary");
    args.put("decomposition", "canonical");
    factory.init(args);
    factory.inform(new StringMockSolrResourceLoader(""));
    TokenStream tsUpper = factory.create(
        new KeywordTokenizer(new StringReader(turkishUpperCase)));
    TokenStream tsLower = factory.create(
        new KeywordTokenizer(new StringReader(turkishLowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
  }
View Full Code Here

    args.put("strength", "secondary");
    args.put("decomposition", "no");
    factory.init(args);
    factory.inform(new StringMockSolrResourceLoader(""));
    TokenStream tsUpper = factory.create(
        new KeywordTokenizer(new StringReader(upperCase)));
    TokenStream tsLower = factory.create(
        new KeywordTokenizer(new StringReader(lowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
  }
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.KeywordTokenizer

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.