Package org.apache.lucene.analysis

Examples of org.apache.lucene.analysis.WhitespaceTokenizer
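
The snippets below appear to be collected from Lucene test code. As a reference point, here is a minimal sketch, not taken from any of the excerpts, of the consumption pattern they all build on; it assumes the pre-4.0 API (the Reader-only tokenizer constructor and TermAttribute) that the snippets themselves use.

    // Requires java.io.StringReader, org.apache.lucene.analysis.Tokenizer,
    // org.apache.lucene.analysis.WhitespaceTokenizer and
    // org.apache.lucene.analysis.tokenattributes.TermAttribute.
    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("one two\tthree"));
    TermAttribute termAtt = tokenizer.addAttribute(TermAttribute.class);
    while (tokenizer.incrementToken()) {
      System.out.println(termAtt.term());   // prints "one", "two", "three"
    }
    tokenizer.end();
    tokenizer.close();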


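Decompounding with a word list: a DictionaryCompoundWordTokenFilter wraps a WhitespaceTokenizer and splits the Swedish compound "Basfiolsfodralmakaregesäll" against the entries in dict.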
    String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
        "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiols", "Makare", "Gesäll",
        "Sko", "Vind", "Rute", "Torkare", "Blad", "Fiolsfodral" };

    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
        new WhitespaceTokenizer(new StringReader("Basfiolsfodralmakaregesäll")),
        dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, true);

    assertTokenStreamContents(tf, new String[] { "Basfiolsfodralmakaregesäll", "Bas",
        // (the remaining expected subword tokens are elided in this excerpt)


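Decompounding with a hyphenation grammar: a HyphenationCompoundWordTokenFilter wraps the WhitespaceTokenizer; the tokenizer is later reset with a fresh Reader and the filter is reset as well, verifying that the pair can be reused.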
    // (the setup that obtained the hyphenation grammar reader is elided in this excerpt)

    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
        .getHyphenationTree(reader);

    Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader(
        "Rindfleischüberwachungsgesetz"));
    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
        wsTokenizer, hyphenator, dict,
        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
   
    TermAttribute termAtt = tf.getAttribute(TermAttribute.class);
    assertTrue(tf.incrementToken());
    assertEquals("Rindfleischüberwachungsgesetz", termAtt.term());
    assertTrue(tf.incrementToken());
    assertEquals("Rind", termAtt.term());
    wsTokenizer.reset(new StringReader("Rindfleischüberwachungsgesetz"));
    tf.reset();
    assertTrue(tf.incrementToken());
    assertEquals("Rindfleischüberwachungsgesetz", termAtt.term());
  }

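A constructor fragment that only forwards Version.LUCENE_CURRENT to its superclass, followed by a tokenStream() override that returns a plain WhitespaceTokenizer over the incoming Reader: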
      super(org.apache.lucene.util.Version.LUCENE_CURRENT);
    }
 
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
      return new WhitespaceTokenizer(reader);
    }

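A deliberately non-reusable analyzer for testing: it alternates between a LetterTokenizer and a WhitespaceTokenizer on successive tokenStream() calls.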
  private class NonreusableAnalyzer extends Analyzer {
    int invocationCount = 0;
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
      if (++invocationCount % 2 == 0)
        return new WhitespaceTokenizer(reader);
      else
        return new LetterTokenizer(reader);
    }

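A synonym-test analyzer: the constructor fragment stores maxSynonyms, and tokenStream() chains a WhitespaceTokenizer, then a LowerCaseFilter, then a SynonymTokenFilter capped at maxSynonyms expansions.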
      this.maxSynonyms = maxSynonyms;
    }
   
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
      TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
      ts = new LowerCaseFilter(TEST_VERSION_CURRENT, ts);
      ts = new SynonymTokenFilter(ts, synonyms, maxSynonyms);
      return ts;
    }

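The reusableTokenStream() pattern: the WhitespaceTokenizer / LowerCaseFilter / SynonymTokenFilter chain is built once, cached in a SavedStreams holder via setPreviousTokenStream(), and on later calls the cached tokenizer is simply reset with the new Reader.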
    public TokenStream reusableTokenStream(String fieldName, Reader reader)
        throws IOException {
      SavedStreams streams = (SavedStreams) getPreviousTokenStream();
      if (streams == null) {
        streams = new SavedStreams();
        streams.source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
        streams.result = new LowerCaseFilter(TEST_VERSION_CURRENT, streams.source);
        streams.result = new SynonymTokenFilter(streams.result, synonyms, maxSynonyms);
        setPreviousTokenStream(streams);
      } else {
        streams.source.reset(reader);
        // (the remainder of the method, which returns streams.result, is elided in this excerpt)
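
The SavedStreams holder itself is not part of the excerpt; judging from how its fields are used above (source is reset like a Tokenizer, result is the assembled filter chain), it is presumably just a small holder class along these lines:

    private static class SavedStreams {
      Tokenizer source;      // the reusable WhitespaceTokenizer
      TokenStream result;    // the filter chain built on top of it
    }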

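An IndexWriter exception test: the analyzer wraps each WhitespaceTokenizer in a CrashingFilter, which, as the name suggests, is there to throw while documents are being indexed.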
  public void testDocumentsWriterExceptions() throws IOException {
    Analyzer analyzer = new Analyzer() {
      @Override
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new CrashingFilter(fieldName, new WhitespaceTokenizer(reader));
      }
    };

    for(int i=0;i<2;i++) {
      MockRAMDirectory dir = new MockRAMDirectory();

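The multi-threaded variant of the same exception test, again feeding a CrashingFilter from a WhitespaceTokenizer; NUM_THREAD and NUM_ITER control the load.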
  public void testDocumentsWriterExceptionThreads() throws Exception {
    Analyzer analyzer = new Analyzer() {
      @Override
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new CrashingFilter(fieldName, new WhitespaceTokenizer(reader));
      }
    };

    final int NUM_THREAD = 3;
    final int NUM_ITER = 100;

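The same crashing analyzer applied to a single document: crashDoc gets a "crash" field whose value ("do it on token 4") apparently tells the CrashingFilter when to fail.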
    w.addDocument(doc);

    Analyzer analyzer = new Analyzer() {
      @Override
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new CrashingFilter(fieldName, new WhitespaceTokenizer(reader));
      }
    };

    Document crashDoc = new Document();
    crashDoc.add(new Field("crash", "do it on token 4", Field.Store.YES,
        // (the Field.Index argument and the rest of the test are elided in this excerpt)

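Pre-analyzed fields: setTokenStream() attaches a fresh WhitespaceTokenizer to the binary field f and the string field f2 before each addDocument() call, with commits and a final optimize() forcing segment merges. (The byte array b and the IndexWriter w come from the elided part of the test.)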
    for(int i=0;i<50;i++)
      b[i] = (byte) (i+77);

    Document doc = new Document();
    Field f = new Field("binary", b, 10, 17, Field.Store.YES);
    f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc1field1")));
    Field f2 = new Field("string", "value", Field.Store.YES,Field.Index.ANALYZED);
    f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc1field2")));
    doc.add(f);
    doc.add(f2);
    w.addDocument(doc);
   
    // add 2 docs to test in-memory merging
    f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc2field1")));
    f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc2field2")));
    w.addDocument(doc);
 
    // force segment flush so we can force a segment merge with doc3 later.
    w.commit();

    f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc3field1")));
    f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc3field2")));

    w.addDocument(doc);
    w.commit();
    w.optimize();   // force segment merge.