Package org.apache.lucene.analysis

Examples of org.apache.lucene.analysis.CharFilter
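
The excerpts below come from Lucene test classes that exercise several CharFilter implementations (MappingCharFilter, ICUNormalizer2CharFilter, JapaneseIterationMarkCharFilter). As a reference point, here is a minimal sketch of the contract every CharFilter fulfils: transform the characters read from a wrapped Reader and report, via correct(), how offsets in the transformed output map back to the original input. PassThroughCharFilter is an illustrative name only and performs no transformation.

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.CharFilter;

// Illustrative pass-through CharFilter: read() delegates unchanged and
// correct() maps offsets 1:1, so correctOffset(n) == n. Real filters such as
// MappingCharFilter change the character stream and return shifted offsets.
class PassThroughCharFilter extends CharFilter {

  PassThroughCharFilter(Reader input) {
    super(input); // CharFilter stores the wrapped Reader in the protected 'input' field
  }

  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    return input.read(cbuf, off, len); // no transformation
  }

  @Override
  protected int correct(int currentOff) {
    return currentOff; // output offset == input offset
  }
}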


    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
    assertTokenStreamContents(ts, new String[]{"fclef"}, new int[]{0}, new int[]{2}, 2);
  }

  public void testFullWidthChar() throws Exception {
    CharFilter cs = new MappingCharFilter( normMap, new StringReader( "\uff01") );
    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
    assertTokenStreamContents(ts, new String[]{"full-width-exclamation"}, new int[]{0}, new int[]{1}, 1);
  }
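
The normMap these MappingCharFilter tests rely on is built in the test class's setup code, which the excerpts omit. Below is a minimal sketch of how such a map could be assembled, assuming the NormalizeCharMap.Builder API; the concrete mappings are inferred from the expected tokens in the excerpts (e.g. "aa" -> "a", "\uff01" -> "full-width-exclamation") and may not match the original test exactly.

import java.io.Reader;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;

class NormMapSketch {
  // Mappings inferred from the expected tokens in the excerpts above.
  static NormalizeCharMap buildNormMap() {
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("aa", "a");      // 2 chars collapse to 1
    builder.add("bbb", "b");     // 3 -> 1
    builder.add("cccc", "cc");   // 4 -> 2
    builder.add("h", "i");       // 1 -> 1 replacement
    builder.add("j", "jj");      // 1 -> 2 expansion
    builder.add("k", "kkk");     // 1 -> 3 expansion
    builder.add("ll", "llll");   // 2 -> 4 expansion
    builder.add("\uff01", "full-width-exclamation"); // full-width '!'
    return builder.build();
  }

  // Usage mirrors the tests: wrap the incoming Reader before it reaches the tokenizer.
  static Reader wrap(Reader in) {
    return new MappingCharFilter(buildNormMap(), in);
  }
}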


  //  bbb,16,19 =>    b,16,19
  //   aa,20,22 =>    a,20,22
  //
  public void testTokenStream() throws Exception {
    String testString = "h i j k ll cccc bbb aa";
    CharFilter cs = new MappingCharFilter( normMap, new StringReader( testString ) );
    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
    assertTokenStreamContents(ts,
      new String[]{"i","i","jj","kkk","llll","cc","b","a"},
      new int[]{0,2,4,6,8,11,16,20},
      new int[]{1,3,5,7,10,15,19,22},
      testString.length());
  }

  // aaaa,0,4 => a,0,4
  //   ll,5,7 => llllllll,5,7
  //    h,8,9 => i,8,9
  public void testChained() throws Exception {
    String testString = "aaaa ll h";
    CharFilter cs = new MappingCharFilter( normMap,
        new MappingCharFilter( normMap, new StringReader( testString ) ) );
    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
    assertTokenStreamContents(ts,
      new String[]{"a","llllllll","i"},
      new int[]{0,5,8},
      new int[]{4,7,9},
      testString.length());
  }

  public void testNormalization() throws IOException {
    String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
    Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
    String expectedOutput = normalizer.normalize(input);

    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input), normalizer);
    char[] tempBuff = new char[10];
    StringBuilder output = new StringBuilder();
    // Read the filtered stream in small chunks; after each chunk, map the amount of
    // output produced so far back to an input offset with correctOffset() and verify
    // that the partial output equals the normalization of the corresponding input prefix.
    while (true) {
      int length = reader.read(tempBuff);
      if (length == -1) {
        break;
      }
      output.append(tempBuff, 0, length);
      assertEquals(output.toString(), normalizer.normalize(input.substring(0, reader.correctOffset(output.length()))));
    }

    assertEquals(expectedOutput, output.toString());
  }
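
In the tests above the ICUNormalizer2CharFilter is read directly; in an indexing chain it would normally sit in front of a tokenizer. The sketch below shows that wiring, assuming the Lucene 4.x Analyzer API (createComponents taking a Reader) that these excerpts appear to target; NfkcCfAnalyzer is a made-up name, not a class from the original tests.

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter;
import org.apache.lucene.util.Version;
import com.ibm.icu.text.Normalizer2;

class NfkcCfAnalyzer extends Analyzer {
  private final Version matchVersion;
  private final Normalizer2 normalizer =
      Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);

  NfkcCfAnalyzer(Version matchVersion) {
    this.matchVersion = matchVersion;
  }

  @Override
  protected Reader initReader(String fieldName, Reader reader) {
    // Normalize the raw character stream before the tokenizer sees it;
    // the CharFilter keeps offsets correctable for highlighting.
    return new ICUNormalizer2CharFilter(reader, normalizer);
  }

  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer source = new WhitespaceTokenizer(matchVersion, reader);
    return new TokenStreamComponents(source);
  }
}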

  public void testTokenStream() throws IOException {
    // '℃', '№', '㈱', '㌘', then 'サ', 'ソ' and '㌰' each followed by the combining voiced sound mark (U+3099)
    String input = "℃ № ㈱ ㌘ ザ ゾ ㌰゙";

    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
      Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE));

    Tokenizer tokenStream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);

    assertTokenStreamContents(tokenStream,
      // ... (expected tokens and offsets omitted in this excerpt)

    assertAnalyzesTo(keywordAnalyzer, "。。。", new String[]{"。。。"});
  }

  public void testKanjiOnly() throws IOException {
    // Test kanji only repetition marks
    CharFilter filter = new JapaneseIterationMarkCharFilter(
        new StringReader("時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。"),
        true, // kanji
        false // no kana
    );
    assertCharFilterEquals(filter, "時時、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。");
  }

  public void testKanaOnly() throws IOException {
    // Test kana only repetition marks
    CharFilter filter = new JapaneseIterationMarkCharFilter(
        new StringReader("時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。"),
        false, // no kanji
        true   // kana
    );
    assertCharFilterEquals(filter, "時々、おおのさんと一緒にお寿司が食べたいです。abcところどころ。");
  }

  public void testNone() throws IOException {
    // Test no repetition marks
    CharFilter filter = new JapaneseIterationMarkCharFilter(
        new StringReader("時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。"),
        false, // no kanji
        false  // no kana
    );
    assertCharFilterEquals(filter, "時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。");
  }
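
The assertCharFilterEquals helper used by these JapaneseIterationMarkCharFilter tests is defined elsewhere in the test class and not shown in the excerpts. A plausible minimal version simply drains the filter and compares the result; the sketch below is an assumption, not the original helper (assertEquals is org.junit.Assert.assertEquals).

  // Hypothetical stand-in for the helper used above: read the whole filtered
  // stream into a String and compare it against the expected text.
  static void assertCharFilterEquals(CharFilter filter, String expected) throws IOException {
    StringBuilder actual = new StringBuilder();
    char[] buffer = new char[1024];
    int read;
    while ((read = filter.read(buffer)) != -1) {
      actual.append(buffer, 0, read);
    }
    assertEquals(expected, actual.toString());
  }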

  public void testTokenStream2() throws IOException {
    // '㌰' + combining voiced sound mark (U+3099), '5', '℃', '№', '㈱', '㌘', 'サ' + voiced mark, 'ソ' + voiced mark
    String input = "㌰゙5℃№㈱㌘ザゾ";

    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
      Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

    Tokenizer tokenStream = new NGramTokenizer(TEST_VERSION_CURRENT, reader, 1, 1);

    assertTokenStreamContents(tokenStream,
      // ... (expected 1-gram tokens omitted in this excerpt)

  }
 
  public void testMassiveLigature() throws IOException {
    String input = "\uFDFA";

    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
      Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

    Tokenizer tokenStream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);

    assertTokenStreamContents(tokenStream,
      // ... (expected tokens and offsets omitted in this excerpt)
