Package org.apache.lucene.analysis

Examples of org.apache.lucene.analysis.Analyzer$TokenStreamComponents


  /*
   * This test is the same as the above, except using an ideographic space as a separator.
   * This tests to ensure the stopwords are working correctly.
   */
  public void testChineseStopWordsDefaultTwoPhrasesIdeoSpace() throws Exception {
    Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT); /* will load stopwords */
    String sentence = "我购买了道具和服装 我购买了道具和服装。";
    String result[] = { "我", "购买", "了", "道具", "和", "服装", "我", "购买", "了", "道具", "和", "服装" };
    assertAnalyzesTo(ca, sentence, result);
  }
View Full Code Here


  /*
   * Check that position increments after stopwords are correct,
   * when stopfilter is configured with enablePositionIncrements
   */
  public void testChineseStopWords2() throws Exception {
    Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT); /* will load stopwords */
    String sentence = "Title:San"; // : is a stopword
    String result[] = { "titl", "san"};
    int startOffsets[] = { 0, 6 };
    int endOffsets[] = { 5, 9 };
    int posIncr[] = { 1, 2 };
View Full Code Here

    int posIncr[] = { 1, 2 };
    assertAnalyzesTo(ca, sentence, result, startOffsets, endOffsets, posIncr);
  }
 
  public void testChineseAnalyzer() throws Exception {
    Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true);
    String sentence = "我购买了道具和服装。";
    String[] result = { "我", "购买", "了", "道具", "和", "服装" };
    assertAnalyzesTo(ca, sentence, result);
  }
View Full Code Here

        new int[] { 0, 1, 3, 4, 6, 7 },
        new int[] { 1, 3, 4, 6, 7, 9 });
  }
 
  public void testReusableTokenStream() throws Exception {
    Analyzer a = new SmartChineseAnalyzer(Version.LUCENE_CURRENT);
    assertAnalyzesToReuse(a, "我购买 Tests 了道具和服装",
        new String[] { "我", "购买", "test", "了", "道具", "和", "服装"},
        new int[] { 0, 1, 4, 10, 11, 13, 14 },
        new int[] { 1, 3, 9, 11, 13, 14, 16 });
    assertAnalyzesToReuse(a, "我购买了道具和服装。",
View Full Code Here

    check("áá", "áá"); // token is too short: diacritics are not removed
    check("ááá", "aaa"); // normally, diacritics are removed
  }
 
  public void testReusableTokenStream() throws Exception {
    Analyzer a = new BrazilianAnalyzer(Version.LUCENE_CURRENT);
    checkReuse(a, "boa", "boa");
    checkReuse(a, "boainain", "boainain");
    checkReuse(a, "boas", "boas");
    checkReuse(a, "bôas", "boas"); // removes diacritic: different from snowball portugese
  }
View Full Code Here

  }

  @Override
  public int doLogic() throws Exception {
    List<Fieldable> fields = doc.getFields();
    Analyzer analyzer = getRunData().getAnalyzer();
    int tokenCount = 0;
    for(final Fieldable field : fields) {
      final TokenStream stream;
      final TokenStream streamValue = field.tokenStreamValue();

      if (streamValue != null)
        stream = streamValue;
      else {
        // the field does not have a TokenStream,
        // so we have to obtain one from the analyzer
        final Reader reader;        // find or make Reader
        final Reader readerValue = field.readerValue();

        if (readerValue != null)
          reader = readerValue;
        else {
          String stringValue = field.stringValue();
          if (stringValue == null)
            throw new IllegalArgumentException("field must have either TokenStream, String or Reader value");
          stringReader.init(stringValue);
          reader = stringReader;
        }
       
        // Tokenize field
        stream = analyzer.reusableTokenStream(field.name(), reader);
      }

      // reset the TokenStream to the first token
      stream.reset();
View Full Code Here

   check("ophoping", "ophop");
   check("ophouden", "ophoud");
  }
 
  public void testReusableTokenStream() throws Exception {
    Analyzer a = new DutchAnalyzer(Version.LUCENE_CURRENT);
    checkOneTermReuse(a, "lichaamsziek", "lichaamsziek");
    checkOneTermReuse(a, "lichamelijk", "licham");
    checkOneTermReuse(a, "lichamelijke", "licham");
    checkOneTermReuse(a, "lichamelijkheden", "licham");
  }
View Full Code Here

      return new WhitespaceTokenizer(reader);
    }
  }
 
  public void testLUCENE1678BWComp() throws Exception {
    Analyzer a = new DutchSubclassAnalyzer(Version.LUCENE_CURRENT);
    checkOneTermReuse(a, "lichaamsziek", "lichaamsziek");
    checkOneTermReuse(a, "lichamelijk", "lichamelijk");
    checkOneTermReuse(a, "lichamelijke", "lichamelijke");
    checkOneTermReuse(a, "lichamelijkheden", "lichamelijkheden");
  }
View Full Code Here

            fail("unexpected IOException");
        }
    }
   
    public void testReusableTokenStream() throws Exception {
      Analyzer a = new RussianAnalyzer(Version.LUCENE_CURRENT);
      assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
          new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" });
      assertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
          new String[] { "знан", "хран", "тайн" });
    }
View Full Code Here

*/
public class TestDemo extends LuceneTestCase {

  public void testDemo() throws IOException, ParseException {

    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);

    // Store the index in memory:
    Directory directory = new RAMDirectory();
    // To store an index on disk, use this instead:
    //Directory directory = FSDirectory.open("/tmp/testindex");
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.Analyzer$TokenStreamComponents

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle. Contact coftware#gmail.com.