Package org.apache.lucene.analysis

Examples of org.apache.lucene.analysis.TokenFilter
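
Every TokenFilter wraps another TokenStream (a Tokenizer or another filter)
and transforms the tokens it produces. As a reference point for the excerpts
below, here is a minimal sketch of a custom filter using the attribute-based
incrementToken() API of Lucene 4.x; the class name LowerCaseSketchFilter and
its behavior are illustrative and appear in none of the excerpts:

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    // sketch only: lower-cases every token from the wrapped stream
    public final class LowerCaseSketchFilter extends TokenFilter {
      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

      public LowerCaseSketchFilter(TokenStream input) {
        super(input);
      }

      @Override
      public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
          return false; // wrapped stream is exhausted
        }
        final char[] buffer = termAtt.buffer();
        final int length = termAtt.length();
        for (int i = 0; i < length; i++) {
          buffer[i] = Character.toLowerCase(buffer[i]); // edit the term in place
        }
        return true;
      }
    }

Such a filter is chained exactly like the ones below: pass a Tokenizer or
another TokenFilter to its constructor and consume the outermost stream. Note
that some of the excerpts on this page instead use the older Token-based
next() API from early Lucene versions.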


    // vowel shortening: long vowels normalize to their short forms
    check("आईऊॠॡऐऔीूॄॣैौ", "अइउऋऌएओिुृॢेो");
  }

  // Test helper: tokenizes the input on whitespace, wraps it in the
  // filter under test, and asserts the stream yields the expected token.
  private void check(String input, String output) throws IOException {
    Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    TokenFilter tf = new HindiNormalizationFilter(tokenizer);
    assertTokenStreamContents(tf, new String[] { output });
  }


    check("कठिन", "कठिन");
  }
 
  private void check(String input, String output) throws IOException {
    Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    TokenFilter tf = new HindiStemFilter(tokenizer);
    assertTokenStreamContents(tf, new String[] { output });
  }

    // Chain: whitespace tokenizer -> FakeStandardTokenizer (a test helper
    // that tags tokens with StandardTokenizer-style types so that
    // CJKBigramFilter will bigram them) -> StopFilter with an empty stop
    // set (removes nothing) -> CJK bigrams.
    Analyzer analyzer = new Analyzer() {

      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenFilter filter = new FakeStandardTokenizer(tokenizer);
        filter = new StopFilter(TEST_VERSION_CURRENT, filter, CharArraySet.EMPTY_SET);
        filter = new CJKBigramFilter(filter);
        return new TokenStreamComponents(tokenizer, filter);
      }
    };

  public void testElision() throws Exception {
    String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
    Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(test));
    // elide the French articles "l'" and "M'" (case-sensitive match)
    CharArraySet articles = new CharArraySet(TEST_VERSION_CURRENT, asSet("l", "M"), false);
    TokenFilter filter = new ElisionFilter(tokenizer, articles);
    List<String> tas = filter(filter); // test helper: collects the emitted terms
    assertEquals("embrouille", tas.get(4)); // "l'embrouille" -> "embrouille"
    assertEquals("O'brian", tas.get(6));    // "O" is not an article, so untouched
    assertEquals("enfin", tas.get(7));      // "M'enfin" -> "enfin"
  }

    // Tokenizes and stems a document with SnowballFilter, collecting each
    // stemmed term into the features collection. Uses the old Token-based
    // TokenStream API (next()/termBuffer()) from early Lucene versions.
    public void addWords(Object obj, Collection<String> features) {
        String document = obj.toString().toLowerCase();
        StandardTokenizer tokenizer = new StandardTokenizer(new StringReader(document));
        tokenizer.setMaxTokenLength(20); // skip pathologically long tokens
        TokenFilter psf = new SnowballFilter(tokenizer, "English");
        Token t;
        StringBuilder sb = new StringBuilder();
        try {
            while ((t = psf.next()) != null) {
                sb.setLength(0);
                sb.append(t.termBuffer(), 0, t.termLength());
                //System.out.println(sb.toString());
                features.add(sb.toString());
            }
        } catch (IOException e) { // closes the excerpt's unfinished try block
            throw new RuntimeException(e);
        }
    }

  public void testFirstPosInc() throws Exception {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        // inject synonyms, then remove stop words without leaving
        // position-increment gaps at the removed positions
        TokenFilter filter = new MockSynonymFilter(tokenizer);
        StopFilter stopfilter = new StopFilter(Version.LUCENE_43, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        stopfilter.setEnablePositionIncrements(false);
        return new TokenStreamComponents(tokenizer, stopfilter);
      }
    };

  public void testInvalidOffsets() throws Exception {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        // ASCIIFoldingFilter can change a token's length (æ -> ae), so the
        // edge n-gram filter downstream must tolerate offsets that no
        // longer line up with the original text
        TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
        filters = new EdgeNGramTokenFilter(Version.LUCENE_43, filters, EdgeNGramTokenFilter.Side.FRONT, 2, 15);
        return new TokenStreamComponents(tokenizer, filters);
      }
    };
    assertAnalyzesTo(analyzer, "mosfellsbær", // expected terms elided in this excerpt

  public void testInvalidOffsets() throws Exception {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        // same idea as above, but with all bigrams (min and max size 2)
        // rather than front edge n-grams
        TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
        filters = new NGramTokenFilter(TEST_VERSION_CURRENT, filters, 2, 2);
        return new TokenStreamComponents(tokenizer, filters);
      }
    };
    assertAnalyzesTo(analyzer, "mosfellsbær", // expected terms elided in this excerpt

    if (log == null)
      throw new IllegalArgumentException("logStream must not be null");

    // Wraps the child analyzer so that every token its stream produces is
    // also printed to the log. Uses the old Token-based next() API.
    return new Analyzer() {
      public TokenStream tokenStream(final String fieldName, Reader reader) {
        return new TokenFilter(child.tokenStream(fieldName, reader)) {
          private int position = -1; // advanced inside the toString(Token) helper

          public Token next() throws IOException {
            Token token = input.next(); // from filter super class
            log.println(toString(token));
            return token;
          }
          // the private toString(Token) helper is elided in this excerpt
        };
      }
    };
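A hedged usage sketch for the logging wrapper above. The excerpt does not
show the enclosing method, so the factory name getLoggingAnalyzer and its
parameter order are assumptions; the consumption loop uses the same old
Token-based API as the excerpt:

    // hypothetical factory call; the method name and parameter order are assumed
    Analyzer debug = getLoggingAnalyzer(new StandardAnalyzer(), System.err);

    // pulling tokens from the stream prints each one to the log as a side effect
    TokenStream stream = debug.tokenStream("body", new StringReader("the quick brown fox"));
    while (stream.next() != null) {
      // tokens are consumed purely for their logging side effect
    }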

    if (maxTokens == Integer.MAX_VALUE)
      return child; // no need to wrap

    // Wraps the child analyzer so that each token stream emits at most
    // maxTokens tokens.
    return new Analyzer() {
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new TokenFilter(child.tokenStream(fieldName, reader)) {
          private int todo = maxTokens;

          public Token next() throws IOException {
            return --todo >= 0 ? input.next() : null;
          }
        };
      }
    };
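The same token-limiting idea expressed in the newer attribute-based API might
look like the sketch below. Lucene itself ships this behavior as
LimitTokenCountFilter; the class name MaxTokensSketchFilter here is
illustrative only:

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;

    public final class MaxTokensSketchFilter extends TokenFilter {
      private final int maxTokens;
      private int emitted = 0;

      public MaxTokensSketchFilter(TokenStream input, int maxTokens) {
        super(input);
        this.maxTokens = maxTokens;
      }

      @Override
      public boolean incrementToken() throws IOException {
        if (emitted >= maxTokens || !input.incrementToken()) {
          return false; // limit reached or wrapped stream exhausted
        }
        emitted++;
        return true;
      }

      @Override
      public void reset() throws IOException {
        super.reset();
        emitted = 0; // streams may be reused after reset()
      }
    }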
