Package: org.apache.lucene.analysis

Examples of org.apache.lucene.analysis.WhitespaceTokenizer


public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase {

  public void testDefaults() throws Exception {
    DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
    factory.init(new HashMap<String, String>());
    TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));

    TokenStream filteredStream = factory.create(inputStream);
    assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
    assertTokenStreamContents(filteredStream, new String[] { "international", "ANTR" });
  }
View Full Code Here


    Map<String, String> parameters = new HashMap<String, String>();
    parameters.put("inject", "false");
    parameters.put("maxCodeLength", "8");
    factory.init(parameters);

    TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));

    TokenStream filteredStream = factory.create(inputStream);
    assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
    assertTokenStreamContents(filteredStream, new String[] { "ANTRNXNL" });
  }
View Full Code Here

   * Ensure that reset() removes any state (buffered tokens)
   */
  public void testReset() throws Exception {
    DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
    factory.init(new HashMap<String, String>());
    TokenStream inputStream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));

    TokenStream filteredStream = factory.create(inputStream);
    CharTermAttribute termAtt = filteredStream.addAttribute(CharTermAttribute.class);
    assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
   
View Full Code Here

  /**
   * Ensure the filter actually lowercases (and a bit more) greek text.
   */
  public void testNormalization() throws Exception {
    Reader reader = new StringReader("Μάϊος ΜΆΪΟΣ");
    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
    GreekLowerCaseFilterFactory factory = new GreekLowerCaseFilterFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    TokenStream stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "μαιοσ", "μαιοσ" });
  }
View Full Code Here

*/
public class TestPortugueseLightStemFilterFactory extends BaseTokenTestCase {
  public void testStemming() throws Exception {
    Reader reader = new StringReader("evidentemente");
    PortugueseLightStemFilterFactory factory = new PortugueseLightStemFilterFactory();
    TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
    assertTokenStreamContents(stream, new String[] { "evident" });
  }
View Full Code Here

*/
public class TestStemmerOverrideFilterFactory extends BaseTokenTestCase {
  public void testKeywords() throws IOException {
    // our stemdict stems dogs to 'cat'
    Reader reader = new StringReader("testing dogs");
    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
    StemmerOverrideFilterFactory factory = new StemmerOverrideFilterFactory();
    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
    ResourceLoader loader = new SolrResourceLoader(null, null);
    args.put("dictionary", "stemdict.txt");
    factory.init(args);
View Full Code Here

    assertTokenStreamContents(ts, new String[] { "test", "cat" });
  }
 
  public void testKeywordsCaseInsensitive() throws IOException {
    Reader reader = new StringReader("testing DoGs");
    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
    StemmerOverrideFilterFactory factory = new StemmerOverrideFilterFactory();
    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
    ResourceLoader loader = new SolrResourceLoader(null, null);
    args.put("dictionary", "stemdict.txt");
    args.put("ignoreCase", "true");
View Full Code Here

  /**
   * Ensure the filter actually decompounds text.
   */
  public void testDecompounding() throws Exception {
    Reader reader = new StringReader("I like to play softball");
    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
    DictionaryCompoundWordTokenFilterFactory factory = new DictionaryCompoundWordTokenFilterFactory();
    ResourceLoader loader = new SolrResourceLoader(null, null);
    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
    args.put("dictionary", "compoundDictionary.txt");
    factory.init(args);
View Full Code Here

*/
public class TestRussianLightStemFilterFactory extends BaseTokenTestCase {
  public void testStemming() throws Exception {
    Reader reader = new StringReader("журналы");
    RussianLightStemFilterFactory factory = new RussianLightStemFilterFactory();
    TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
    assertTokenStreamContents(stream, new String[] { "журнал" });
  }
View Full Code Here

    /* default behavior */
    factoryDefault.init(args);
    factoryDefault.inform(loader);
   
    TokenStream ts = factoryDefault.create(
        new WhitespaceTokenizer(BaseTokenTestCase.DEFAULT_VERSION, new StringReader(testText)));
    BaseTokenTestCase.assertTokenStreamContents(ts,
        new String[] { "I", "borrowed", "5", "400", "00", "540000", "at", "25", "interest", "rate", "interestrate" });

    ts = factoryDefault.create(
        new WhitespaceTokenizer(BaseTokenTestCase.DEFAULT_VERSION, new StringReader("foo\u200Dbar")));
    BaseTokenTestCase.assertTokenStreamContents(ts,
        new String[] { "foo", "bar", "foobar" });

   
    /* custom behavior */
    WordDelimiterFilterFactory factoryCustom = new WordDelimiterFilterFactory();
    // use a custom type mapping
    args.put("types", "wdftypes.txt");
    factoryCustom.init(args);
    factoryCustom.inform(loader);
   
    ts = factoryCustom.create(
        new WhitespaceTokenizer(BaseTokenTestCase.DEFAULT_VERSION, new StringReader(testText)));
    BaseTokenTestCase.assertTokenStreamContents(ts,
        new String[] { "I", "borrowed", "$5,400.00", "at", "25%", "interest", "rate", "interestrate" });
   
    /* test custom behavior with a char > 0x7F, because we had to make a larger byte[] */
    ts = factoryCustom.create(
        new WhitespaceTokenizer(BaseTokenTestCase.DEFAULT_VERSION, new StringReader("foo\u200Dbar")));
    BaseTokenTestCase.assertTokenStreamContents(ts,
        new String[] { "foo\u200Dbar" });
  }
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.WhitespaceTokenizer

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.