Package org.apache.lucene.analysis

Examples of org.apache.lucene.analysis.CharArraySet$CharArraySetIterator


  /*
   * convenience ctor to enable deprecated ctors to set posInc explicitly
   */
  private TypeAwareStopFilter(Version matchVersion, boolean enablePositionIncrements, TokenStream input, Set<?> stopWords, boolean ignoreCase) {
    super(enablePositionIncrements, input);
    this.stopWords = stopWords instanceof CharArraySet ? (CharArraySet) stopWords : new CharArraySet(matchVersion, stopWords, ignoreCase);
    Collections.sort(blockedTypes);
  }
View Full Code Here


   * @param stopWords    An array of stopwords
   * @param ignoreCase   If true, all words are lower cased first.
   * @return a Set containing the words
   */
  public static final Set<Object> makeStopSet(Version matchVersion, String[] stopWords, boolean ignoreCase) {
    CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.length, ignoreCase);
    stopSet.addAll(Arrays.asList(stopWords));
    return stopSet;
  }
View Full Code Here

   * @param stopWords    A List of Strings or char[] or any other toString()-able list representing the stopwords
   * @param ignoreCase   if true, all words are lower cased first
   * @return A Set ({@link org.apache.lucene.analysis.CharArraySet}) containing the words
   */
  public static final Set<Object> makeStopSet(Version matchVersion, List<?> stopWords, boolean ignoreCase) {
    CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.size(), ignoreCase);
    stopSet.addAll(stopWords);
    return stopSet;
  }
View Full Code Here

   
  }
 
  @Test
  public void testPositionIncrements() throws Exception {
    final CharArraySet protWords = new CharArraySet(DEFAULT_VERSION, new HashSet<String>(Arrays.asList("NUTCH")), false);
   
    /* analyzer that uses whitespace + wdf */
    Analyzer a = new Analyzer() {
      @Override
      public TokenStream tokenStream(String field, Reader reader) {
View Full Code Here

    TokenStream stream = factory.create(new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false));
    assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 3, 2 });
      
    // Now force case
    stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
    assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 3 });

    // Test Stopwords (ignoreCase via the setter instead)
    factory = new KeepWordFilterFactory();
    args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
    factory.init( args );
    factory.inform( loader );
    factory.setIgnoreCase(true);
    factory.setWords( words );
    assertTrue(factory.isIgnoreCase());
    assertFalse(factory.isEnablePositionIncrements());
    stream = factory.create(new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false));
    assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 1, 1 });
   
    // Now force case and posIncr
    factory = new KeepWordFilterFactory();
    args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
    args.put( "ignoreCase", "false" );
    args.put( "enablePositionIncrements", "true" );
    factory.init( args );
    factory.inform( loader );
    factory.setWords( words );   
    assertFalse(factory.isIgnoreCase());
    assertTrue(factory.isEnablePositionIncrements());
    stream = factory.create(new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false));
    assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 3 });
      
    // Now force case
    stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
    assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 1 });
  }
View Full Code Here

      }
    } 
  }
 
  public void testPositionIncrements() throws Exception {
    final CharArraySet protWords = new CharArraySet(new HashSet<String>(Arrays.asList("NUTCH")), false);
   
    /* analyzer that uses whitespace + wdf */
    Analyzer a = new Analyzer() {
      public TokenStream tokenStream(String field, Reader reader) {
        return new WordDelimiterFilter(
View Full Code Here

    this.onlyLongestMatch=onlyLongestMatch;
   
    if (dictionary instanceof CharArraySet) {
      this.dictionary = (CharArraySet) dictionary;
    } else {
      this.dictionary = new CharArraySet(dictionary.size(), false);
      addAllLowerCase(this.dictionary, dictionary);
    }
   
    termAtt = addAttribute(TermAttribute.class);
    offsetAtt = addAttribute(OffsetAttribute.class);
View Full Code Here

   * @param dictionary
   * @return {@link Set} of lowercased terms
   */
  public static final Set makeDictionary(final String[] dictionary) {
    // is the below really case insensitive?
    CharArraySet dict = new CharArraySet(dictionary.length, false);
    addAllLowerCase(dict, Arrays.asList(dictionary));
    return dict;
  }
View Full Code Here

  final CharArraySet words;

 
  public KeepWordFilter(TokenStream in, Set<String> words, boolean ignoreCase ) {
    super(in);
    this.words = new CharArraySet(words, ignoreCase);
  }
View Full Code Here

      try {
        File protectedWordFiles = new File(wordFiles);
        if (protectedWordFiles.exists()) {
          List<String> wlist = loader.getLines(wordFiles);
          //This cast is safe in Lucene
          protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
        } else  {
          List<String> files = StrUtils.splitFileNames(wordFiles);
          for (String file : files) {
            List<String> wlist = loader.getLines(file.trim());
            if (protectedWords == null)
              protectedWords = new CharArraySet(wlist, false);
            else
              protectedWords.addAll(wlist);
          }
        }
      } catch (IOException e) {
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.CharArraySet$CharArraySetIterator

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.