/* analyzer that uses a whitespace tokenizer + WordDelimiterFilter */
Analyzer a = new Analyzer() {
  @Override
  public TokenStreamComponents createComponents(String field, Reader reader) {
    Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    return new TokenStreamComponents(tokenizer,
        new WordDelimiterFilter(tokenizer, flags, protWords));
  }
};
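/*
 * In the assertAnalyzesTo calls below the arrays are, in order: expected
 * terms, start offsets, end offsets, token types (null = not checked),
 * position increments, and position lengths (null = not checked). The
 * trailing false relaxes the offset well-formedness check, as WDF's
 * catenated tokens can produce out-of-order offsets.
 */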
/* the "/" is all delimiter characters: WDF removes it without leaving a gap (posIncs stay 1) */
assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
new int[] { 0, 9 },
new int[] { 6, 13 },
null,
new int[] { 1, 1 },
null,
false);
/* "solR" splits on the case change into "sol" + "R"; the catenated "solR" stacks on top with a position increment of 0 */
assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
new int[] { 0, 9, 12, 9 },
new int[] { 6, 12, 13, 13 },
null,
new int[] { 1, 1, 1, 0 },
null,
false);
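/*
 * A minimal sketch, not part of the original assertions, for dumping the
 * terms and position increments checked just above; the field name "f" is
 * an arbitrary choice and VERBOSE is the usual LuceneTestCase flag.
 */
if (VERBOSE) {
  TokenStream ts = a.tokenStream("f", new StringReader("LUCENE / solR"));
  CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    // e.g. "sol +1", "R +1", then "solR +0" for the stacked catenated token
    System.out.println(termAtt + " +" + posIncAtt.getPositionIncrement());
  }
  ts.end();
  ts.close();
}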
assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
new int[] { 0, 9, 15 },
new int[] { 6, 14, 19 },
null,
new int[] { 1, 1, 1 },
null,
false);
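/*
 * The a2 analyzer below wraps the tokenizer in LargePosIncTokenFilter,
 * which is defined elsewhere in this test class. As an illustration only,
 * a filter producing the increments the assertions below rely on could look
 * like this hypothetical sketch, which widens the position increment of
 * "largegap" and "/" to 10:
 */
class IllustrativeLargePosIncFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

  IllustrativeLargePosIncFilter(TokenStream in) {
    super(in);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    // widen the gap for the two trigger terms used by the assertions below
    if ("largegap".equals(termAtt.toString()) || "/".equals(termAtt.toString())) {
      posIncAtt.setPositionIncrement(10);
    }
    return true;
  }
}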
/* analyzer that will consume tokens with large position increments */
Analyzer a2 = new Analyzer() {
  @Override
  public TokenStreamComponents createComponents(String field, Reader reader) {
    Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    return new TokenStreamComponents(tokenizer,
        new WordDelimiterFilter(new LargePosIncTokenFilter(tokenizer), flags, protWords));
  }
};
/* the increment of 10 on "largegap" is preserved */
assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" },
new int[] { 0, 7, 16 },
new int[] { 6, 15, 20 },
null,
new int[] { 1, 10, 1 },
null,
false);
/* the "/" had a position increment of 10, where did it go?!?!! */
assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
new int[] { 0, 9 },
new int[] { 6, 13 },
null,
new int[] { 1, 11 },
null,
false);
/* in this case, the increment of 10 from the "/" is carried over */
assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
new int[] { 0, 9, 12, 9 },
new int[] { 6, 12, 13, 13 },
null,
new int[] { 1, 11, 1, 0 },
null,
false);
assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
new int[] { 0, 9, 15 },
new int[] { 6, 14, 19 },
null,
new int[] { 1, 11, 1 },
null,
false);
/* analyzer that runs a StopFilter (with position increments enabled) before WordDelimiterFilter */
Analyzer a3 = new Analyzer() {
  @Override
  public TokenStreamComponents createComponents(String field, Reader reader) {
    Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    StopFilter filter = new StopFilter(TEST_VERSION_CURRENT,
        tokenizer, StandardAnalyzer.STOP_WORDS_SET);
    filter.setEnablePositionIncrements(true);
    return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(filter, flags, protWords));
  }
};
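/*
 * With setEnablePositionIncrements(true) the StopFilter leaves a hole where
 * it removes a stopword, and WDF is expected to carry that hole through:
 * for an input like "the lucene.solr" the first output token "lucene"
 * should arrive with a position increment of 2 (an illustrative expectation,
 * not one of the original assertions).
 */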
assertAnalyzesTo(a3, "lucene.solr",
new String[] { "lucene", "solr", "lucenesolr" },