Package org.apache.lucene.analysis.util

Examples of org.apache.lucene.analysis.util.ClasspathResourceLoader


        ("  Don't,break.at?/(punct)!  \u201Cnice\u201D\r\n\r\n85_At:all; `really\" +2=3$5,&813 !@#%$^)(*@#$   ");
    ICUTokenizerFactory factory = new ICUTokenizerFactory();
    final Map<String,String> args = new HashMap<String,String>();
    args.put(ICUTokenizerFactory.RULEFILES, "Latn:Latin-break-only-on-whitespace.rbbi");
    factory.init(args);
    factory.inform(new ClasspathResourceLoader(this.getClass()));
    TokenStream stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] { "Don't,break.at?/(punct)!", "\u201Cnice\u201D", "85_At:all;", "`really\"""+2=3$5,&813", "!@#%$^)(*@#$" },
        new String[] { "<ALPHANUM>",               "<ALPHANUM>",       "<ALPHANUM>", "<ALPHANUM>", "<NUM>",       "<OTHER>" });
  }
View Full Code Here


        ("One-two punch.  Brang-, not brung-it.  This one--not that one--is the right one, -ish.");
    ICUTokenizerFactory factory = new ICUTokenizerFactory();
    final Map<String,String> args = new HashMap<String,String>();
    args.put(ICUTokenizerFactory.RULEFILES, "Latn:Latin-dont-break-on-hyphens.rbbi");
    factory.init(args);
    factory.inform(new ClasspathResourceLoader(getClass()));
    TokenStream stream = factory.create(reader);
    assertTokenStreamContents(stream,
        new String[] { "One-two", "punch",
            "Brang", "not", "brung-it",
            "This", "one", "not", "that", "one", "is", "the", "right", "one", "ish" });
View Full Code Here

        ("Some English.  Немного русский.  ข้อความภาษาไทยเล็ก ๆ น้อย ๆ  More English.");
    ICUTokenizerFactory factory = new ICUTokenizerFactory();
    final Map<String,String> args = new HashMap<String,String>();
    args.put(ICUTokenizerFactory.RULEFILES, "Cyrl:KeywordTokenizer.rbbi,Thai:KeywordTokenizer.rbbi");
    factory.init(args);
    factory.inform(new ClasspathResourceLoader(getClass()));
    TokenStream stream = factory.create(reader);
    assertTokenStreamContents(stream, new String[] { "Some", "English",
        "Немного русский.  ",
        "ข้อความภาษาไทยเล็ก ๆ น้อย ๆ  ",
        "More", "English" });
View Full Code Here

/** basic tests for {@link ICUTokenizerFactory} **/
public class TestICUTokenizerFactory extends BaseTokenStreamTestCase {
  public void testMixedText() throws Exception {
    Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี  This is a test ກວ່າດອກ");
    ICUTokenizerFactory factory = new ICUTokenizerFactory(new HashMap<String,String>());
    factory.inform(new ClasspathResourceLoader(getClass()));
    TokenStream stream = factory.create(newAttributeFactory(), reader);
    assertTokenStreamContents(stream,
        new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี",
        "This", "is", "a", "test", "ກວ່າ", "ດອກ"});
  }
View Full Code Here

    Reader reader = new StringReader
        ("  Don't,break.at?/(punct)!  \u201Cnice\u201D\r\n\r\n85_At:all; `really\" +2=3$5,&813 !@#%$^)(*@#$   ");
    final Map<String,String> args = new HashMap<>();
    args.put(ICUTokenizerFactory.RULEFILES, "Latn:Latin-break-only-on-whitespace.rbbi");
    ICUTokenizerFactory factory = new ICUTokenizerFactory(args);
    factory.inform(new ClasspathResourceLoader(this.getClass()));
    TokenStream stream = factory.create(newAttributeFactory(), reader);
    assertTokenStreamContents(stream,
        new String[] { "Don't,break.at?/(punct)!", "\u201Cnice\u201D", "85_At:all;", "`really\"""+2=3$5,&813", "!@#%$^)(*@#$" },
        new String[] { "<ALPHANUM>",               "<ALPHANUM>",       "<ALPHANUM>", "<ALPHANUM>", "<NUM>",       "<OTHER>" });
  }
View Full Code Here

    Reader reader = new StringReader
        ("One-two punch.  Brang-, not brung-it.  This one--not that one--is the right one, -ish.");
    final Map<String,String> args = new HashMap<>();
    args.put(ICUTokenizerFactory.RULEFILES, "Latn:Latin-dont-break-on-hyphens.rbbi");
    ICUTokenizerFactory factory = new ICUTokenizerFactory(args);
    factory.inform(new ClasspathResourceLoader(getClass()));
    TokenStream stream = factory.create(newAttributeFactory(), reader);
    assertTokenStreamContents(stream,
        new String[] { "One-two", "punch",
            "Brang", "not", "brung-it",
            "This", "one", "not", "that", "one", "is", "the", "right", "one", "ish" });
View Full Code Here

    Reader reader = new StringReader
        ("Some English.  Немного русский.  ข้อความภาษาไทยเล็ก ๆ น้อย ๆ  More English.");
    final Map<String,String> args = new HashMap<>();
    args.put(ICUTokenizerFactory.RULEFILES, "Cyrl:KeywordTokenizer.rbbi,Thai:KeywordTokenizer.rbbi");
    ICUTokenizerFactory factory = new ICUTokenizerFactory(args);
    factory.inform(new ClasspathResourceLoader(getClass()));
    TokenStream stream = factory.create(newAttributeFactory(), reader);
    assertTokenStreamContents(stream, new String[] { "Some", "English",
        "Немного русский.  ",
        "ข้อความภาษาไทยเล็ก ๆ น้อย ๆ  ",
        "More", "English" });
View Full Code Here

  public void test() throws Exception {
    Reader reader = new StringReader("foo foobar super-duper-trooper");
    TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    stream = tokenFilterFactory("Length",
        Version.LUCENE_4_3, new ClasspathResourceLoader(getClass()),
        "min", "4",
        "max", "10",
        "enablePositionIncrements", "false").create(stream);
    assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 1 });
  }
View Full Code Here

   * @deprecated Remove this test in Lucene 5.0 */
  @Deprecated
  public void testSynonymsOld() throws Exception {
    Reader reader = new StringReader("GB");
    TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    stream = tokenFilterFactory("Synonym", Version.LUCENE_3_3, new ClasspathResourceLoader(getClass()),
        "synonyms", "synonyms.txt").create(stream);
    assertTrue(stream instanceof SlowSynonymFilter);
    assertTokenStreamContents(stream,
        new String[] { "GB", "gib", "gigabyte", "gigabytes" },
        new int[] { 1, 0, 0, 0 });
View Full Code Here

    TokenStream stream = tokenizerFactory("UAX29URLEmail").create(reader);
    assertTokenStreamContents(stream,
        new String[] {"ざ"});
   
    reader = new StringReader("ざ");
    stream = tokenizerFactory("UAX29URLEmail", Version.LUCENE_3_1, new ClasspathResourceLoader(getClass())).create(reader);
    assertTokenStreamContents(stream,
        new String[] {"さ"}); // old broken behavior
  }
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.util.ClasspathResourceLoader

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle. Contact coftware#gmail.com.