Package org.apache.lucene.analysis

Examples of org.apache.lucene.analysis.WhitespaceTokenizer


    assertFalse(filter.incrementToken());
  }

  public void testIntEncoding() throws Exception {
    String test = "The quick|1 red|2 fox|3 jumped over the lazy|5 brown|99 dogs|83";
    DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new WhitespaceTokenizer(new StringReader(test)), '|', new IntegerEncoder());
    TermAttribute termAtt = filter.getAttribute(TermAttribute.class);
    PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
    assertTermEquals("The", filter, termAtt, payAtt, null);
    assertTermEquals("quick", filter, termAtt, payAtt, PayloadHelper.encodeInt(1));
    assertTermEquals("red", filter, termAtt, payAtt, PayloadHelper.encodeInt(2));
View Full Code Here


  public void test() throws IOException {
    TokenTypeSinkFilter sinkFilter = new TokenTypeSinkFilter("D");
    String test = "The quick red fox jumped over the lazy brown dogs";

    TeeSinkTokenFilter ttf = new TeeSinkTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))));
    SinkTokenStream sink = ttf.newSinkTokenStream(sinkFilter);
   
    boolean seenDogs = false;

    TermAttribute termAtt = ttf.addAttribute(TermAttribute.class);
View Full Code Here

      super(matchVersion);
    }

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
      return new WhitespaceTokenizer(reader);
    }
View Full Code Here

  }

  public void test() throws IOException {
    String test = "The quick red fox jumped over the lazy brown dogs";

    TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(new WhitespaceTokenizer(new StringReader(test)));
    int count = 0;
    PayloadAttribute payloadAtt = nptf.getAttribute(PayloadAttribute.class);
    OffsetAttribute offsetAtt = nptf.getAttribute(OffsetAttribute.class);
   
    while (nptf.incrementToken()) {
View Full Code Here

  }

  public void test() throws IOException {
    String test = "The quick red fox jumped over the lazy brown dogs";

    NumericPayloadTokenFilter nptf = new NumericPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))), 3, "D");
    boolean seenDogs = false;
    TermAttribute termAtt = nptf.getAttribute(TermAttribute.class);
    TypeAttribute typeAtt = nptf.getAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = nptf.getAttribute(PayloadAttribute.class);
    while (nptf.incrementToken()) {
View Full Code Here

  }

  public void test() throws IOException {
    String test = "The quick red fox jumped over the lazy brown dogs";

    TypeAsPayloadTokenFilter nptf = new TypeAsPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))));
    int count = 0;
    TermAttribute termAtt = nptf.getAttribute(TermAttribute.class);
    TypeAttribute typeAtt = nptf.getAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = nptf.getAttribute(PayloadAttribute.class);
   
View Full Code Here

        FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS, false);
  }
 
 
  public void testReset() throws Exception {
    Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
    TokenStream filter = new ShingleFilter(wsTokenizer, 2);
    assertTokenStreamContents(filter,
      new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
      new int[]{0,0,7,7,14,14,19}, new int[]{6,13,13,18,18,27,27},
      new String[]{TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE},
      new int[]{1,0,1,0,1,0,1}
    );
    wsTokenizer.reset(new StringReader("please divide this sentence"));
    assertTokenStreamContents(filter,
      new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
      new int[]{0,0,7,7,14,14,19}, new int[]{6,13,13,18,18,27,27},
      new String[]{TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE,"shingle",TypeAttributeImpl.DEFAULT_TYPE},
      new int[]{1,0,1,0,1,0,1}
View Full Code Here

    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
        .getHyphenationTree(reader);

    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
        new WhitespaceTokenizer(new StringReader(
            "Rindfleischüberwachungsgesetz Drahtschere abba")), hyphenator,
        dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
    assertTokenStreamContents(tf, new String[] { "Rindfleischüberwachungsgesetz", "Rind",
View Full Code Here

    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
        .getHyphenationTree(reader);

    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
        new WhitespaceTokenizer(new StringReader(
            "Rindfleischüberwachungsgesetz")), hyphenator, dict,
        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
    assertTokenStreamContents(tf, new String[] { "Rindfleischüberwachungsgesetz",
        "Rindfleisch", "fleisch", "überwachungsgesetz", "gesetz" }, new int[] {
View Full Code Here

    String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
        "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",
        "Sko", "Vind", "Rute", "Torkare", "Blad" };

    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
        new WhitespaceTokenizer(
            new StringReader(
                "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba")),
        dict);

    assertTokenStreamContents(tf, new String[] { "Bildörr", "Bil", "dörr", "Bilmotor",
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.WhitespaceTokenizer

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.