A simple class that stores Strings as char[]'s in a hash table. Note that this is not a general purpose class. For example, it cannot remove items from the set, nor does it resize its hash table to be smaller, etc. It is designed to be quick to test if a char[] is in the set without the necessity of converting it to a String first.
Please note: This class implements {@link java.util.Set Set} but does not behave like it should in all cases. The generic type is {@code Set<Object>}, because you can add any object to it that has a string representation. The add methods will use {@link Object#toString} and store the result using a {@code char[]} buffer. The {@code contains()} methods behave the same way. The {@link #iterator()} returns an {@code Iterator<Object>}. For type safety, {@link #stringIterator()} is also provided.
private TermAttribute termAtt;
/**
 * Creates a filter that wraps the given token stream.
 *
 * <p>The stop table is built from {@code STOP_WORDS}, and the term attribute
 * is registered on this stream so it can be read while filtering.
 *
 * @param in the upstream token stream to wrap
 */
public ChineseFilter(TokenStream in) {
  super(in);
  // NOTE(review): the false flag is presumably CharArraySet's ignore-case
  // switch (i.e. case-sensitive matching) -- confirm against CharArraySet.
  this.stopTable = new CharArraySet(Arrays.asList(STOP_WORDS), false);
  this.termAtt = addAttribute(TermAttribute.class);
}
/**
 * Replaces the set of articles used by this filter.
 *
 * <p>A {@link CharArraySet} argument is stored as-is (no copy is made); any
 * other set is copied into a new {@code CharArraySet}.
 *
 * @param articles the articles to use from now on
 */
public void setArticles(Set<?> articles) {
  // NOTE(review): the true flag is presumably CharArraySet's ignore-case
  // switch -- confirm against CharArraySet.
  this.articles = (articles instanceof CharArraySet)
      ? (CharArraySet) articles
      : new CharArraySet(articles, true);
}
/**
* Constructs an elision filter with standard stop words
*/
protected ElisionFilter ( TokenStream input ) {
super ( input ) ;
this .articles = new CharArraySet ( Arrays.asList (
"l" , "m" , "t" , "qu" , "n" , "s" , "j" ) , true ) ;
termAtt = addAttribute( TermAttribute. class ) ;
}
/**
* Constructs an elision filter with an array of stop words
*/
public ElisionFilter ( TokenStream input, String [] articles ) {
super ( input ) ;
this .articles = new CharArraySet ( Arrays.asList ( articles ) , true ) ;
termAtt = addAttribute( TermAttribute. class ) ;
}
this .onlyLongestMatch=onlyLongestMatch;
if ( dictionary instanceof CharArraySet ) {
this .dictionary = ( CharArraySet ) dictionary;
} else {
this .dictionary = new CharArraySet ( dictionary.size () , false ) ;
addAllLowerCase( this .dictionary, dictionary ) ;
}
termAtt = addAttribute ( TermAttribute. class ) ;
offsetAtt = addAttribute( OffsetAttribute. class ) ;
* @param dictionary
* @return { @link Set } of lowercased terms
*/
public static final Set makeDictionary(final String[] dictionary) {
  // Builds a set pre-sized to the number of input words and fills it through
  // addAllLowerCase, so the returned set holds the lowercased terms.
  // Open question kept from the original author: is the set below really
  // case insensitive, given that it is created with the flag set to false?
  final CharArraySet lowerCasedTerms = new CharArraySet(dictionary.length, false);
  addAllLowerCase(lowerCasedTerms, Arrays.asList(dictionary));
  return lowerCasedTerms;
}
stopWords = getWordSet ( loader, stopWordFiles, ignoreCase ) ;
} catch ( IOException e ) {
throw new RuntimeException ( e ) ;
}
} else {
stopWords = new CharArraySet ( luceneMatchVersion, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase ) ;
}
}
stopWords = getWordSet ( loader, stopWordFiles, ignoreCase ) ;
} catch ( IOException e ) {
throw new RuntimeException ( e ) ;
}
} else {
stopWords = new CharArraySet ( luceneMatchVersion, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase ) ;
}
}
/**
 * Sets the keep-word list by copying {@code words} into a new
 * {@link CharArraySet} built with the current match version and the current
 * {@code ignoreCase} setting.
 *
 * <p>NOTE: if {@code ignoreCase == true}, the words are expected to be
 * lowercase.
 *
 * @param words the words to keep
 */
public void setWords(Set<String> words) {
  final CharArraySet keepWords = new CharArraySet(luceneMatchVersion, words, ignoreCase);
  this.words = keepWords;
}
this .words = new CharArraySet ( luceneMatchVersion, words, ignoreCase ) ;
}
/**
 * Updates the case-sensitivity setting.
 *
 * <p>If a word set is already present and the setting actually changes, the
 * set is rebuilt from its current contents using the new setting. The new
 * value is recorded in either case.
 *
 * @param ignoreCase the new ignore-case setting
 */
public void setIgnoreCase(boolean ignoreCase) {
  final boolean settingChanged = this.ignoreCase != ignoreCase;
  if (settingChanged && this.words != null) {
    // Rebuild so the stored set is constructed with the new flag.
    this.words = new CharArraySet(luceneMatchVersion, this.words, ignoreCase);
  }
  this.ignoreCase = ignoreCase;
}
Related Classes of org.apache.lucene.analysis.CharArraySet
Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact
coftware#gmail.com .