Package org.apache.lucene.analysis.miscellaneous

Examples of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter

ASCIIFoldingFilter converts alphabetic, numeric, and symbolic Unicode characters that are not among the first 127 ASCII characters (the "Basic Latin" Unicode block) into their ASCII equivalents, if one exists. For example, 'à' will be replaced by 'a'. See: http://en.wikipedia.org/wiki/Latin_characters_in_Unicode
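The filter wraps any TokenStream and rewrites each term in place. Below is a minimal, self-contained sketch of that behaviour (written against the Lucene 4.x API used by the snippets that follow; the sample text and tokenizer choice are illustrative only):

    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class ASCIIFoldingExample {
        public static void main(String[] args) throws Exception {
            // tokenize on whitespace, then fold each term to its ASCII equivalent
            TokenStream stream = new ASCIIFoldingFilter(
                    new WhitespaceTokenizer(Version.LUCENE_45, new StringReader("déjà vu")));
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term.toString()); // prints "deja", then "vu"
            }
            stream.end();
            stream.close();
        }
    }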

     * @return the {@link TokenStreamComponents} for this analyzer.
     */
    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
        // tokenize with StandardTokenizer, then standard-filter, lower-case and ASCII-fold the terms
        final Tokenizer source = new StandardTokenizer(Geonet.LUCENE_VERSION, reader);
        ASCIIFoldingFilter asciiFoldingFilter = new ASCIIFoldingFilter(new LowerCaseFilter(Geonet.LUCENE_VERSION,
                new StandardFilter(Geonet.LUCENE_VERSION, source)));

        if (this.stopwords != null && !this.stopwords.isEmpty()) {
            return new TokenStreamComponents(source, new StopFilter(Geonet.LUCENE_VERSION, asciiFoldingFilter, this.stopwords)) {
                @Override


            @Override
            protected Analyzer.TokenStreamComponents createComponents(String fieldName, Reader reader) {
                StandardTokenizer source = new StandardTokenizer(Version.LUCENE_45, reader);

                TokenStream filter = new ASCIIFoldingFilter(new LemmagenFilter(
                        new LowerCaseFilter(TEST_VERSION_CURRENT,
                                new StandardFilter(TEST_VERSION_CURRENT, source)), "mlteast-sk", TEST_VERSION_CURRENT));
                return new Analyzer.TokenStreamComponents(source, filter);
            }
        };

public final class ISOLatin1Analyzer extends Analyzer {

  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    StandardTokenizer tokenizer = new StandardTokenizer( TestConstants.getTargetLuceneVersion(), reader );
    TokenStream filter = new ASCIIFoldingFilter( tokenizer );
    return new TokenStreamComponents( tokenizer, filter );
  }

    }
  }
 
  @Override
  public ASCIIFoldingFilter create(TokenStream input) {
    return new ASCIIFoldingFilter(input, preserveOriginal);
  }
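For context, here is a hedged sketch of how such a factory is typically instantiated and used (assuming the Lucene 4.4+ style constructor that takes a Map of arguments; the argument values and sample text are illustrative only):

    import java.io.StringReader;
    import java.util.HashMap;
    import java.util.Map;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
    import org.apache.lucene.util.Version;

    public class ASCIIFoldingFactoryExample {
        public static void main(String[] args) throws Exception {
            Map<String, String> factoryArgs = new HashMap<String, String>();
            factoryArgs.put("preserveOriginal", "true"); // keep the unfolded token alongside the folded one
            ASCIIFoldingFilterFactory factory = new ASCIIFoldingFilterFactory(factoryArgs);

            TokenStream folded = factory.create(
                    new WhitespaceTokenizer(Version.LUCENE_45, new StringReader("résumé")));
            // 'folded' now emits both "resume" and the original "résumé" at the same position
        }
    }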

  public void testInvalidOffset() throws Exception {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
        filters = new WordTokenFilter(filters);
        return new TokenStreamComponents(tokenizer, filters);
      }
    };
   

  public void testInvalidOffsets() throws Exception {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
        filters = new EdgeNGramTokenFilter(Version.LUCENE_43, filters, EdgeNGramTokenFilter.Side.FRONT, 2, 15);
        return new TokenStreamComponents(tokenizer, filters);
      }
    };
    assertAnalyzesTo(analyzer, "mosfellsbær",

  public void testInvalidOffsets() throws Exception {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
        filters = new NGramTokenFilter(TEST_VERSION_CURRENT, filters, 2, 2);
        return new TokenStreamComponents(tokenizer, filters);
      }
    };
    assertAnalyzesTo(analyzer, "mosfellsbær",

        // if the word terminates with $, output word$; otherwise output all lemmas, or word$ if the word is out of vocabulary (OOV)
        final StreamLemmasFilter src = new StreamLemmasFilter(reader, dictRadix, prefixesTree, SPECIAL_TOKENIZATION_CASES, commonWords, lemmaFilter);
        src.setSuffixForExactMatch(originalTermSuffix);
        src.setKeepOriginalWord(true);

        TokenStream tok = new ASCIIFoldingFilter(src);
        tok = new AlwaysAddSuffixFilter(tok, '$', true) {
            @Override
            protected boolean possiblySkipFilter() {
                if (HebrewTokenizer.tokenTypeSignature(HebrewTokenizer.TOKEN_TYPES.Hebrew).equals(typeAtt.type())) {
                    if (keywordAtt.isKeyword())

        // ignores $ and always outputs all lemmas plus the original word$
        // (this is the behaviour when analyzerType == AnalyzerType.INDEXING)
        final StreamLemmasFilter src = new StreamLemmasFilter(reader, dictRadix, prefixesTree, SPECIAL_TOKENIZATION_CASES, commonWords, lemmaFilter);
        src.setKeepOriginalWord(true);

        TokenStream tok = new ASCIIFoldingFilter(src);
        tok = new AlwaysAddSuffixFilter(tok, '$', true) {
            @Override
            protected boolean possiblySkipFilter() {
                if (HebrewTokenizer.tokenTypeSignature(HebrewTokenizer.TOKEN_TYPES.Hebrew).equals(typeAtt.type())) {
                    if (keywordAtt.isKeyword())

        // if the word terminates with $, output word$; otherwise output all lemmas, or word$ if the word is out of vocabulary (OOV)
        final StreamLemmasFilter src = new StreamLemmasFilter(reader, dictRadix, prefixesTree, SPECIAL_TOKENIZATION_CASES, commonWords, lemmaFilter);
        src.setKeepOriginalWord(false);
        src.setSuffixForExactMatch(originalTermSuffix);

        TokenStream tok = new ASCIIFoldingFilter(src);
        //tok = new SuffixKeywordFilter(tok, '$');
        tok = new AlwaysAddSuffixFilter(tok, '$', true) {
            @Override
            protected boolean possiblySkipFilter() {
                if (HebrewTokenizer.tokenTypeSignature(HebrewTokenizer.TOKEN_TYPES.Hebrew).equals(typeAtt.type())) {


