Package org.apache.lucene.analysis.standard

Examples of org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer


  @Deprecated
  public void testVersion36() throws Exception {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new UAX29URLEmailTokenizer(Version.LUCENE_3_6, reader);
        return new TokenStreamComponents(tokenizer);
      }
    };
    assertAnalyzesTo(a, "this is just a t\u08E6st lucene@apache.org", // new combining mark in 6.1
        new String[] { "this", "is", "just", "a", "t", "st", "lucene@apache.org" });
View Full Code Here


  @Deprecated
  public void testVersion40() throws Exception {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new UAX29URLEmailTokenizer(Version.LUCENE_4_0, reader);
        return new TokenStreamComponents(tokenizer);
      }
    };
    // U+061C is a new combining mark in 6.3, found using "[[\p{WB:Format}\p{WB:Extend}]&[^\p{Age:6.2}]]"
    // on the online UnicodeSet utility: <http://unicode.org/cldr/utility/list-unicodeset.jsp>
View Full Code Here

    char whitespace[] = new char[4094];
    Arrays.fill(whitespace, ' ');
    sb.append(whitespace);
    sb.append("testing 1234");
    String input = sb.toString();
    UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
    BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
  }
View Full Code Here

    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents
        (String fieldName, Reader reader) {

        Tokenizer tokenizer = new UAX29URLEmailTokenizer(Version.LUCENE_31, reader);
        return new TokenStreamComponents(tokenizer);
      }
    };
    checkOneTerm(a, "ざ", "さ"); // hiragana Bug
    checkOneTerm(a, "ザ", "ザ"); // katakana Works
View Full Code Here

  @Deprecated
  public void testMailtoBackwards()  throws Exception {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new UAX29URLEmailTokenizer(Version.LUCENE_34, reader);
        return new TokenStreamComponents(tokenizer);
      }
    };
    assertAnalyzesTo(a, "mailto:test@example.org",
        new String[] { "mailto:test", "example.org" });
View Full Code Here

  @Deprecated
  public void testVersion36() throws Exception {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new UAX29URLEmailTokenizer(Version.LUCENE_36, reader);
        return new TokenStreamComponents(tokenizer);
      }
    };
    assertAnalyzesTo(a, "this is just a t\u08E6st lucene@apache.org", // new combining mark in 6.1
        new String[] { "this", "is", "just", "a", "t", "st", "lucene@apache.org" });
View Full Code Here

  @Deprecated
  public void testVersion40() throws Exception {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new UAX29URLEmailTokenizer(Version.LUCENE_40, reader);
        return new TokenStreamComponents(tokenizer);
      }
    };
    // U+061C is a new combining mark in 6.3, found using "[[\p{WB:Format}\p{WB:Extend}]&[^\p{Age:6.2}]]"
    // on the online UnicodeSet utility: <http://unicode.org/cldr/utility/list-unicodeset.jsp>
View Full Code Here

    char whitespace[] = new char[4094];
    Arrays.fill(whitespace, ' ');
    sb.append(whitespace);
    sb.append("testing 1234");
    String input = sb.toString();
    UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
    BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
  }
View Full Code Here

    Analyzer a = new ReusableAnalyzerBase() {
      @Override
      protected TokenStreamComponents createComponents
        (String fieldName, Reader reader) {

        Tokenizer tokenizer = new UAX29URLEmailTokenizer(reader);
        return new TokenStreamComponents(tokenizer);
      }
    };
    checkOneTerm(a, "ざ", "さ"); // hiragana Bug
    checkOneTerm(a, "ザ", "ザ"); // katakana Works
View Full Code Here

    }

    @Override
    public Tokenizer create() {
        if (version.onOrAfter(Version.LUCENE_4_7)) {
            UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer();
            tokenizer.setMaxTokenLength(maxTokenLength);
            return tokenizer;
        } else {
            UAX29URLEmailTokenizer40 tokenizer = new UAX29URLEmailTokenizer40();
            tokenizer.setMaxTokenLength(maxTokenLength);
            return tokenizer;
        }
    }
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.