Package: org.apache.lucene.analysis

Code examples demonstrating usage of org.apache.lucene.analysis.Tokenizer


  public void testCharacterEncoding1() throws Exception {
    final Reader reader = new StringReader("{ \"http://test.com/M\u00F6ller\" : \"M\u00F6ller\" }");
    final Map<String,String> args = this.getDefaultInitArgs();
    final JsonTokenizerFactory factory = new JsonTokenizerFactory();
    factory.init(args);
    final Tokenizer stream = factory.create(reader);
    this.assertTokenStreamContents(stream,
        new String[] {"http://test.com/Möller", "Möller"});
  }
View Full Code Here


  public void testCharacterEncoding2() throws Exception {
    final Reader reader = new StringReader("{ \"http://test.com/Möller\" : \"Möller\" }");
    final Map<String,String> args = this.getDefaultInitArgs();
    final JsonTokenizerFactory factory = new JsonTokenizerFactory();
    factory.init(args);
    final Tokenizer stream = factory.create(reader);
    this.assertTokenStreamContents(stream,
        new String[] {"http://test.com/Möller", "Möller"});
  }
View Full Code Here

      }
    }

    @Override
    public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
      Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
      if (tokenizer == null) {
        tokenizer = new SingleCharTokenizer(reader);
        setPreviousTokenStream(tokenizer);
      } else
        tokenizer.reset(reader);
      return tokenizer;
    }
View Full Code Here

  }


 
  public void testReset() throws Exception {
    Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
    TokenStream filter = new ShingleFilter(wsTokenizer, 2);
    assertTokenStreamContents(filter,
      new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
      new int[]{0,0,7,7,14,14,19}, new int[]{6,13,13,18,18,27,27},
      new String[]{TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE},
      new int[]{1,0,1,0,1,0,1}
    );
    wsTokenizer.reset(new StringReader("please divide this sentence"));
    assertTokenStreamContents(filter,
      new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
      new int[]{0,0,7,7,14,14,19}, new int[]{6,13,13,18,18,27,27},
      new String[]{TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE},
      new int[]{1,0,1,0,1,0,1}
View Full Code Here

        done = false;
      }
    }

    public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
      Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
      if (tokenizer == null) {
        tokenizer = new SingleCharTokenizer(reader);
        setPreviousTokenStream(tokenizer);
      } else
        tokenizer.reset(reader);
      return tokenizer;
    }
View Full Code Here

  public void testRandomStrings() throws Exception {
    final Transliterator transform = Transliterator.getInstance("Any-Latin");
    Analyzer a = new ReusableAnalyzerBase() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
        return new TokenStreamComponents(tokenizer, new ICUTransformFilter(tokenizer, transform));
      }
    };
    checkRandomData(random, a, 1000*RANDOM_MULTIPLIER);
  }
View Full Code Here

  }
 
  public void testWithKeywordAttribute() throws IOException {
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    set.add("yourselves");
    Tokenizer tokenizer = new MockTokenizer(new StringReader("yourselves yours"), MockTokenizer.WHITESPACE, false);
    TokenStream filter = new PorterStemFilter(new KeywordMarkerFilter(tokenizer, set));  
    assertTokenStreamContents(filter, new String[] {"yourselves", "your"});
  }
View Full Code Here

  }

  @Override
  public final TokenStream reusableTokenStream(final String fieldName, final Reader reader) throws IOException {

    Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
    if (tokenizer == null) {
      tokenizer = new KeywordListTokenizer(reader);
      setPreviousTokenStream(tokenizer);
    } else {
      tokenizer.reset(reader);
    }
    return tokenizer;
  }
View Full Code Here

  public static void main(String[] args) throws Exception {
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    while (true) {
      System.out.print("Text: ");
      String line = in.readLine();
      Tokenizer tokenizer = new NutchDocumentTokenizer(new StringReader(line));
      TermAttribute termAtt = tokenizer.getAttribute(TermAttribute.class);
      System.out.print("Tokens: ");
      while (tokenizer.incrementToken()) {
        System.out.print(termAtt.term());
        System.out.print(" ");
      }
      System.out.println();
    }
View Full Code Here

  public void testGraphDups() throws Exception {

    final Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
       
        return new TokenStreamComponents(tokenizer) {
          int tokenStreamCounter = 0;
          final TokenStream[] tokenStreams = new TokenStream[] {
            new CannedTokenStream(new Token[] {
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.Tokenizer

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.