// Source Code of org.apache.solr.analysis.TestWordDelimiterFilter$LargePosIncTokenFilter

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.apache.solr.analysis;


import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;


import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;


/**
 * New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
 */
public class TestWordDelimiterFilter extends AbstractSolrTestCase {
  public String getSchemaFile() { return "solr/conf/schema.xml"; }
  public String getSolrConfigFile() { return "solr/conf/solrconfig.xml"; }




  public void posTst(String v1, String v2, String s1, String s2) {
    assertU(adoc("id",  "42",
                 "subword", v1,
                 "subword", v2));
    assertU(commit());


    // there is a positionIncrementGap of 100 between field values, so
    // we test if that was maintained.
    assertQ("position increment lost",
            req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~90")
            ,"//result[@numFound=0]"
    );
    assertQ("position increment lost",
            req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~110")
            ,"//result[@numFound=1]"
    );
  }




  public void testRetainPositionIncrement() {
    posTst("foo","bar","foo","bar");
    posTst("-foo-","-bar-","foo","bar");
    posTst("foo","bar","-foo-","-bar-");


    posTst("123","456","123","456");
    posTst("/123/","/456/","123","456");


    posTst("/123/abc","qwe/456/","abc","qwe");


    posTst("zoo-foo","bar-baz","foo","bar");
    posTst("zoo-foo-123","456-bar-baz","foo","bar");
  }


  public void testNoGenerationEdgeCase() {
    assertU(adoc("id", "222", "numberpartfail", "123.123.123.123"));
  }


  public void testIgnoreCaseChange() {


    assertU(adoc("id",  "43",
                 "wdf_nocase", "HellO WilliAM",
                 "subword", "GoodBye JonEs"));
    assertU(commit());
    
    assertQ("no case change",
            req("wdf_nocase:(hell o am)")
            ,"//result[@numFound=0]"
    );
    assertQ("case change",
            req("subword:(good jon)")
            ,"//result[@numFound=1]"
    );
  }




  public void testPreserveOrignalTrue() {


    assertU(adoc("id",  "144",
                 "wdf_preserve", "404-123"));
    assertU(commit());
    
    assertQ("preserving original word",
            req("wdf_preserve:404")
            ,"//result[@numFound=1]"
    );
    
    assertQ("preserving original word",
        req("wdf_preserve:123")
        ,"//result[@numFound=1]"
    );


    assertQ("preserving original word",
        req("wdf_preserve:404-123*")
        ,"//result[@numFound=1]"
    );


  }


  /***
  public void testPerformance() throws IOException {
    String s = "now is the time-for all good men to come to-the aid of their country.";
    Token tok = new Token();
    long start = System.currentTimeMillis();
    int ret=0;
    for (int i=0; i<1000000; i++) {
      StringReader r = new StringReader(s);
      TokenStream ts = new WhitespaceTokenizer(r);
      ts = new WordDelimiterFilter(ts, 1,1,1,1,0);


      while (ts.next(tok) != null) ret++;
    }


    System.out.println("ret="+ret+" time="+(System.currentTimeMillis()-start));
  }
  ***/




  public void testOffsets() throws IOException {


    // test that subwords and catenated subwords have
    // the correct offsets.
    WordDelimiterFilter wdf = new WordDelimiterFilter(
            new TokenStream() {
              Token t;
              public Token next() throws IOException {
                if (t!=null) return null;
                t = new Token("foo-bar", 5, 12);  // actual
                return t;
              }
            },
    1,1,0,0,1,1,0);


    int i=0;
    for(Token t; (t=wdf.next())!=null;) {
      String termText = new String(t.termBuffer(), 0, t.termLength());
      if (termText.equals("foo")) {
        assertEquals(5, t.startOffset());
        assertEquals(8, t.endOffset());
        i++;
      }
      if (termText.equals("bar")) {
        assertEquals(9, t.startOffset());
        assertEquals(12, t.endOffset());
        i++;
      }
      if (termText.equals("foobar")) {
        assertEquals(5, t.startOffset());
        assertEquals(12, t.endOffset());
        i++;
      }
    }
    assertEquals(3,i); // make sure all 3 tokens were generated


    // test that if splitting or catenating a synonym, that the offsets
    // are not altered (they would be incorrect).
    wdf = new WordDelimiterFilter(
            new TokenStream() {
              Token t;
              public Token next() throws IOException {
                if (t!=null) return null;
                t = new Token("foo-bar", 5, 6);  // a synonym
                return t;
              }
            },
    1,1,0,0,1,1,0);
    for(Token t; (t=wdf.next())!=null;) {
      assertEquals(5, t.startOffset());
      assertEquals(6, t.endOffset());
    }
  }
  
  public void testOffsetChange() throws Exception
  {
    WordDelimiterFilter wdf = new WordDelimiterFilter(
      new TokenStream() {
        Token t;
        public Token next() {
         if (t != null) return null;
         t = new Token("übelkeit)", 7, 16);
         return t;
        }
      },
      1,1,0,0,1,1,0
    );
    
    Token t = wdf.next();
    
    assertNotNull(t);
    assertEquals("übelkeit", t.term());
    assertEquals(7, t.startOffset());
    assertEquals(15, t.endOffset());
  }
  
  public void testOffsetChange2() throws Exception
  {
    WordDelimiterFilter wdf = new WordDelimiterFilter(
      new TokenStream() {
        Token t;
        public Token next() {
         if (t != null) return null;
         t = new Token("(übelkeit", 7, 17);
         return t;
        }
      },
      1,1,0,0,1,1,0
    );
    
    Token t = wdf.next();
    
    assertNotNull(t);
    assertEquals("übelkeit", t.term());
    assertEquals(8, t.startOffset());
    assertEquals(17, t.endOffset());
  }
  
  public void testOffsetChange3() throws Exception
  {
    WordDelimiterFilter wdf = new WordDelimiterFilter(
      new TokenStream() {
        Token t;
        public Token next() {
         if (t != null) return null;
         t = new Token("(übelkeit", 7, 16);
         return t;
        }
      },
      1,1,0,0,1,1,0
    );
    
    Token t = wdf.next();
    
    assertNotNull(t);
    assertEquals("übelkeit", t.term());
    assertEquals(8, t.startOffset());
    assertEquals(16, t.endOffset());
  }
  
  public void testOffsetChange4() throws Exception
  {
    WordDelimiterFilter wdf = new WordDelimiterFilter(
      new TokenStream() {
        private Token t;
        public Token next() {
         if (t != null) return null;
         t = new Token("(foo,bar)", 7, 16);
         return t;
        }
      },
      1,1,0,0,1,1,0
    );
    
    Token t = wdf.next();
    
    assertNotNull(t);
    assertEquals("foo", t.term());
    assertEquals(8, t.startOffset());
    assertEquals(11, t.endOffset());
    
    t = wdf.next();
    
    assertNotNull(t);
    assertEquals("bar", t.term());
    assertEquals(12, t.startOffset());
    assertEquals(15, t.endOffset());
  }


  public void testAlphaNumericWords(){
     assertU(adoc("id",  "68","numericsubword","Java/J2SE"));
     assertU(commit());


     assertQ("j2se found",
            req("numericsubword:(J2SE)")
            ,"//result[@numFound=1]"
    );
      assertQ("no j2 or se",
            req("numericsubword:(J2 OR SE)")
            ,"//result[@numFound=0]"
    );
  }


  public void testProtectedWords(){
    assertU(adoc("id", "70","protectedsubword","c# c++ .net Java/J2SE"));
    assertU(commit());


    assertQ("java found",
            req("protectedsubword:(java)")
            ,"//result[@numFound=1]"
    );


    assertQ(".net found",
            req("protectedsubword:(.net)")
            ,"//result[@numFound=1]"
    );


    assertQ("c# found",
            req("protectedsubword:(c#)")
            ,"//result[@numFound=1]"
    );


    assertQ("c++ found",
            req("protectedsubword:(c++)")
            ,"//result[@numFound=1]"
    );


    assertQ("c found?",
            req("protectedsubword:c")
            ,"//result[@numFound=0]"
    );
    assertQ("net found?",
            req("protectedsubword:net")
            ,"//result[@numFound=0]"
    );
  }




  public void doSplit(final String input, String... output) throws Exception {
    WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
      boolean done=false;
      @Override
      public Token next() throws IOException {
        if (done) return null;
        done = true;
        return new Token(input,0,input.length());
      }
    }
            ,1,1,0,0,0
    );


    for(String expected : output) {
      Token t = wdf.next();
      assertEquals(expected, t.term());
    }


    assertEquals(null, wdf.next());
  }


  public void testSplits() throws Exception {
    doSplit("basic-split","basic","split");
    doSplit("camelCase","camel","Case");


    // non-space marking symbol shouldn't cause split
    // this is an example in Thai    
    doSplit("\u0e1a\u0e49\u0e32\u0e19","\u0e1a\u0e49\u0e32\u0e19");




  }
  
  public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
    WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
      boolean done=false;
      @Override
      public Token next() throws IOException {
        if (done) return null;
        done = true;
        return new Token(input,0,input.length());
      }
    }
            ,1,1,0,0,0,1,0,1,stemPossessive,null
    );


    for(String expected : output) {
      Token t = wdf.next();
      assertEquals(expected, t.term());
    }


    assertEquals(null, wdf.next());
  }
  
  /*
   * Test option that allows disabling the special "'s" stemming, instead treating the single quote like other delimiters. 
   */
  public void testPossessives() throws Exception {
    doSplitPossessive(1, "ra's", "ra");
    doSplitPossessive(0, "ra's", "ra", "s");
  }
  
  /*
   * Set a large position increment gap of 10 if the token is "largegap" or "/"
   */
  private final class LargePosIncTokenFilter extends TokenFilter {
    private TermAttribute termAtt;
    private PositionIncrementAttribute posIncAtt;
    
    protected LargePosIncTokenFilter(TokenStream input) {
      super(input);
      termAtt = (TermAttribute) addAttribute(TermAttribute.class);
      posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
    }


    @Override
    public boolean incrementToken() throws IOException {
      if (input.incrementToken()) {
        if (termAtt.term().equals("largegap") || termAtt.term().equals("/"))
          posIncAtt.setPositionIncrement(10);
        return true;
      } else {
        return false;
      }
    }  
  }
  
  public void testPositionIncrements() throws Exception {
    final CharArraySet protWords = new CharArraySet(new HashSet<String>(Arrays.asList("NUTCH")), false);
    
    /* analyzer that uses whitespace + wdf */
    Analyzer a = new Analyzer() {
      public TokenStream tokenStream(String field, Reader reader) {
        return new WordDelimiterFilter(
            new WhitespaceTokenizer(reader),
            1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
      }
    };


    /* in this case, works as expected. */
    assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
        new int[] { 0, 9 },
        new int[] { 6, 13 },
        new int[] { 1, 1 });
    
    /* only in this case, posInc of 2 ?! */
    assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
        new int[] { 0, 9, 12, 9 },
        new int[] { 6, 12, 13, 13 },
        new int[] { 1, 1, 1, 0 });
    
    assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
        new int[] { 0, 9, 15 },
        new int[] { 6, 14, 19 },
        new int[] { 1, 1, 1 });
    
    /* analyzer that will consume tokens with large position increments */
    Analyzer a2 = new Analyzer() {
      public TokenStream tokenStream(String field, Reader reader) {
        return new WordDelimiterFilter(
            new LargePosIncTokenFilter(
            new WhitespaceTokenizer(reader)),
            1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
      }
    };
    
    /* increment of "largegap" is preserved */
    assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" },
        new int[] { 0, 7, 16 },
        new int[] { 6, 15, 20 },
        new int[] { 1, 10, 1 });
    
    /* the "/" had a position increment of 10, where did it go?!?!! */
    assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
        new int[] { 0, 9 },
        new int[] { 6, 13 },
        new int[] { 1, 11 });
    
    /* in this case, the increment of 10 from the "/" is carried over */
    assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
        new int[] { 0, 9, 12, 9 },
        new int[] { 6, 12, 13, 13 },
        new int[] { 1, 11, 1, 0 });
    
    assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
        new int[] { 0, 9, 15 },
        new int[] { 6, 14, 19 },
        new int[] { 1, 11, 1 });
    
    Analyzer a3 = new Analyzer() {
      public TokenStream tokenStream(String field, Reader reader) {
        StopFilter filter = new StopFilter(
            new WhitespaceTokenizer(reader), StandardAnalyzer.STOP_WORDS_SET);
        filter.setEnablePositionIncrements(true);
        return new WordDelimiterFilter(filter, 
            1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
      }
    };


    assertAnalyzesTo(a3, "lucene.solr", 
        new String[] { "lucene", "solr", "lucenesolr" },
        new int[] { 0, 7, 0 },
        new int[] { 6, 11, 11 },
        new int[] { 1, 1, 0 });


    /* the stopword should add a gap here */
    assertAnalyzesTo(a3, "the lucene.solr", 
        new String[] { "lucene", "solr", "lucenesolr" }, 
        new int[] { 4, 11, 4 }, 
        new int[] { 10, 15, 15 },
        new int[] { 2, 1, 0 });
  }


  private void assertAnalyzesTo(Analyzer a, String input, String[] output,
      int startOffsets[], int endOffsets[], int posIncs[]) throws Exception {


    TokenStream ts = a.tokenStream("dummy", new StringReader(input));
    TermAttribute termAtt = (TermAttribute) ts
        .getAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) ts
        .getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts
        .getAttribute(PositionIncrementAttribute.class);
    for (int i = 0; i < output.length; i++) {
      assertTrue(ts.incrementToken());
      assertEquals(output[i], termAtt.term());
      assertEquals(startOffsets[i], offsetAtt.startOffset());
      assertEquals(endOffsets[i], offsetAtt.endOffset());
      assertEquals(posIncs[i], posIncAtt.getPositionIncrement());
    }
    assertFalse(ts.incrementToken());
    ts.close();
  }
}
// Source Code of org.apache.solr.analysis.TestWordDelimiterFilter$LargePosIncTokenFilter

// Related Classes of org.apache.solr.analysis.TestWordDelimiterFilter$LargePosIncTokenFilter