Package org.apache.lucene.analysis.charfilter

Examples of org.apache.lucene.analysis.charfilter.NormalizeCharMap
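
NormalizeCharMap is an immutable map from input character sequences to replacement sequences. It is assembled once with NormalizeCharMap.Builder and then applied to a character stream by MappingCharFilter, which rewrites the text before any tokenizer sees it while recording each edit so token offsets can be corrected back to the original input. Below is a minimal, self-contained sketch of that pattern; the class name NormalizeCharMapDemo and the sample mappings are illustrative, not taken from the snippets that follow.

    import java.io.StringReader;

    import org.apache.lucene.analysis.charfilter.MappingCharFilter;
    import org.apache.lucene.analysis.charfilter.NormalizeCharMap;

    public class NormalizeCharMapDemo {
      public static void main(String[] args) throws Exception {
        // Collect match -> replacement pairs, then freeze them into an
        // immutable NormalizeCharMap (matching is greedy: the longest
        // pattern at a given point wins).
        NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
        builder.add("ph", "f");     // illustrative mapping
        builder.add("&amp;", "&");  // illustrative mapping: decode one entity
        NormalizeCharMap map = builder.build();

        // MappingCharFilter rewrites the stream as it is read.
        MappingCharFilter filter =
            new MappingCharFilter(map, new StringReader("phone &amp; fax"));
        StringBuilder out = new StringBuilder();
        for (int ch = filter.read(); ch != -1; ch = filter.read()) {
          out.append((char) ch);
        }
        filter.close();
        System.out.println(out); // prints: fone & fax
      }
    }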


   
    final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("mtqlpi", "");
    builder.add("mwoknt", "jjp");
    builder.add("tcgyreo", "zpfpajyws");
    final NormalizeCharMap map = builder.build();
   
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer t = new MockTokenizer(new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(reader), MockTokenFilter.ENGLISH_STOPSET, false, -65);
        // truncated in the original; a minimal completion that installs the
        // map as a char filter via initReader
        return new TokenStreamComponents(t);
      }

      @Override
      protected Reader initReader(String fieldName, Reader reader) {
        return new MappingCharFilter(map, reader);
      }
    };


  // so in this case we behave like WordDelimiterFilter (WDF), and preserve any modified offsets
  public void testInvalidOffsets() throws Exception {
    final CharArraySet dict = makeDictionary("fall");
    final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("ü", "ue");
    final NormalizeCharMap normMap = builder.build();
   
    Analyzer analyzer = new Analyzer() {

      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // truncated in the original; minimal completion with the compound filter under test
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer,
            new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict));
      }

      @Override
      protected Reader initReader(String fieldName, Reader reader) {
        return new MappingCharFilter(normMap, reader);
      }
    };
  }

   

  public void testNormalizeWinDelimToLinuxDelim() throws Exception {
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("\\", "/");
    NormalizeCharMap normMap = builder.build();
    String path = "c:\\a\\b\\c";
    Reader cs = new MappingCharFilter(normMap, new StringReader(path));
    PathHierarchyTokenizer t = new PathHierarchyTokenizer( cs );
    assertTokenStreamContents(t,
        new String[]{"c:", "c:/a", "c:/a/b", "c:/a/b/c"},
        // completed from the truncated source: start offsets, end offsets,
        // position increments, and the final offset
        new int[]{0, 0, 0, 0},
        new int[]{2, 4, 6, 8},
        new int[]{1, 0, 0, 0},
        path.length());
  }
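
Because MappingCharFilter records every replacement it makes, the offsets asserted above point back into the original "c:\a\b\c" string, even though PathHierarchyTokenizer only ever saw the forward-slash form.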

    // create MappingCharFilter
    // (equivalent mapping-rule file syntax: "&uuml;" => "ü")
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("&uuml;", "ü");
    NormalizeCharMap normMap = builder.build();
    CharFilter charStream = new MappingCharFilter( normMap, new StringReader( INPUT ) );

    // create PatternTokenizer
    TokenStream stream = new PatternTokenizer(charStream, Pattern.compile("[,;/\\s]+"), -1);
    assertTokenStreamContents(stream,
        // completed from the truncated source, assuming the elided
        // INPUT = "G&uuml;nther G&uuml;nther is here" from the same test
        new String[] { "Günther", "Günther", "is", "here" },
        new int[] { 0, 13, 26, 29 },
        new int[] { 12, 25, 28, 33 },
        INPUT.length());


  /** test that offsets are correct when MappingCharFilter is applied first */
  public void testChangedOffsets() throws IOException {
    final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("a", "一二");
    builder.add("b", "二三");
    final NormalizeCharMap norm = builder.build();
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
        return new TokenStreamComponents(tokenizer, new CJKBigramFilter(tokenizer));
      }

      @Override
      protected Reader initReader(String fieldName, Reader reader) {
        return new MappingCharFilter(norm, reader);
      }
    };

    // truncated in the original; completed with the offsets the mapping produces
    assertAnalyzesTo(analyzer, "ab", new String[] { "一二", "二二", "二三" },
        new int[] { 0, 0, 1 }, new int[] { 1, 1, 2 });
  }
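
Note that both bigrams beginning inside the expansion of "a" are corrected back to start offset 0: MappingCharFilter can only map offsets to the boundaries of the original characters, so every output character produced from the one-character match "a" inherits that character's span in the input.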

      builder.add("[", " | ");
      builder.add("]", " | ");
      builder.add("\u00AB", " | ");
      builder.add("\u00BB", " | ");

      NormalizeCharMap normMap = builder.build();

      Reader reader2 = (normMap == null ? reader : new MappingCharFilter(normMap,reader));
         
    final Tokenizer source = new WhitespaceTokenizer(matchVersion, reader2);
      TokenStream tokenStream = new LowerCaseFilter(matchVersion, source);
View Full Code Here
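
In a custom Analyzer, the more idiomatic place to install a MappingCharFilter is an override of Analyzer#initReader, as the compound-word and CJK examples above do, rather than wrapping the Reader inside createComponents.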
