Source Code of org.apache.solr.analysis.SlowSynonymFilter

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.apache.solr.analysis;


import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;


import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;


/** SynonymFilter handles multi-token synonyms with variable position increment offsets.
 * <p>
 * The matched tokens from the input stream may be optionally passed through (includeOrig=true)
 * or discarded.  If the original tokens are included, the position increments may be modified
 * to retain absolute positions after merging with the synonym tokenstream.
 * <p>
 * Generated synonyms will start at the same position as the first matched source token.
 * @deprecated (3.4) use {@link SynonymFilterFactory} instead. only for precise index backwards compatibility. this factory will be removed in Lucene 5.0
 */
@Deprecated
final class SlowSynonymFilter extends TokenFilter {


  private final SlowSynonymMap map;  // Map<String, SynonymMap>
  private Iterator<AttributeSource> replacement;  // iterator over generated tokens


  public SlowSynonymFilter(TokenStream in, SlowSynonymMap map) {
    super(in);
    if (map == null)
      throw new IllegalArgumentException("map is required");


    this.map = map;
    // just ensuring these attributes exist...
    addAttribute(CharTermAttribute.class);
    addAttribute(PositionIncrementAttribute.class);
    addAttribute(OffsetAttribute.class);
    addAttribute(TypeAttribute.class);
  }




  /*
   * Need to worry about multiple scenarios:
   *  - need to go for the longest match
   *    a b => foo      #shouldn't match if "a b" is followed by "c d"
   *    a b c d => bar
   *  - need to backtrack - retry matches for tokens already read
   *     a b c d => foo
   *       b c => bar
   *     If the input stream is "a b c x", one will consume "a b c d"
   *     trying to match the first rule... all but "a" should be
   *     pushed back so a match may be made on "b c".
   *  - don't try and match generated tokens (thus need separate queue)
   *    matching is not recursive.
   *  - handle optional generation of original tokens in all these cases,
   *    merging token streams to preserve token positions.
   *  - preserve original positionIncrement of first matched token
   */
  @Override
  public boolean incrementToken() throws IOException {
    while (true) {
      // if there are any generated tokens, return them... don't try any
      // matches against them, as we specifically don't want recursion.
      if (replacement!=null && replacement.hasNext()) {
        copy(this, replacement.next());
        return true;
      }


      // common case fast-path of first token not matching anything
      AttributeSource firstTok = nextTok();
      if (firstTok == null) return false;
      CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class);
      SlowSynonymMap result = map.submap!=null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null;
      if (result == null) {
        copy(this, firstTok);
        return true;
      }


      // fast-path failed, clone ourselves if needed
      if (firstTok == this)
        firstTok = cloneAttributes();
      // OK, we matched a token, so find the longest match.


      matched = new LinkedList<AttributeSource>();


      result = match(result);


      if (result==null) {
        // no match, simply return the first token read.
        copy(this, firstTok);
        return true;
      }


      // reuse, or create new one each time?
      ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>(result.synonyms.length + matched.size() + 1);


      //
      // there was a match... let's generate the new tokens, merging
      // in the matched tokens (position increments need adjusting)
      //
      AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
      boolean includeOrig = result.includeOrig();


      AttributeSource origTok = includeOrig ? firstTok : null;
      PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
      int origPos = firstPosIncAtt.getPositionIncrement();  // position of origTok in the original stream
      int repPos=0; // curr position in replacement token stream
      int pos=0;  // current position in merged token stream


      for (int i=0; i<result.synonyms.length; i++) {
        Token repTok = result.synonyms[i];
        AttributeSource newTok = firstTok.cloneAttributes();
        CharTermAttribute newTermAtt = newTok.addAttribute(CharTermAttribute.class);
        OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);


        OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);


        newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
        newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
        repPos += repTok.getPositionIncrement();
        if (i==0) repPos=origPos;  // make position of first token equal to original


        // if necessary, insert original tokens and adjust position increment
        while (origTok != null && origPos <= repPos) {
          PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
          origPosInc.setPositionIncrement(origPos-pos);
          generated.add(origTok);
          pos += origPosInc.getPositionIncrement();
          origTok = matched.isEmpty() ? null : matched.removeFirst();
          if (origTok != null) {
            origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
            origPos += origPosInc.getPositionIncrement();
          }
        }


        newPosIncAtt.setPositionIncrement(repPos - pos);
        generated.add(newTok);
        pos += newPosIncAtt.getPositionIncrement();
      }


      // finish up any leftover original tokens
      while (origTok!=null) {
        PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
        origPosInc.setPositionIncrement(origPos-pos);
        generated.add(origTok);
        pos += origPosInc.getPositionIncrement();
        origTok = matched.isEmpty() ? null : matched.removeFirst();
        if (origTok != null) {
          origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
          origPos += origPosInc.getPositionIncrement();
        }
      }


      // what if we replaced a longer sequence with a shorter one?
      // a/0 b/5 =>  foo/0
      // should I re-create the gap on the next buffered token?


      replacement = generated.iterator();
      // Now return to the top of the loop to read and return the first
      // generated token.. The reason this is done is that we may have generated
      // nothing at all, and may need to continue with more matching logic.
    }
  }




  //
  // Defer creation of the buffer until the first time it is used to
  // optimize short fields with no matches.
  //
  private LinkedList<AttributeSource> buffer;
  private LinkedList<AttributeSource> matched;


  private boolean exhausted;


  private AttributeSource nextTok() throws IOException {
    if (buffer!=null && !buffer.isEmpty()) {
      return buffer.removeFirst();
    } else {
      if (!exhausted && input.incrementToken()) {
        return this;
      } else {
        exhausted = true;
        return null;
      }
    }
  }


  private void pushTok(AttributeSource t) {
    if (buffer==null) buffer=new LinkedList<AttributeSource>();
    buffer.addFirst(t);
  }


  private SlowSynonymMap match(SlowSynonymMap map) throws IOException {
    SlowSynonymMap result = null;


    if (map.submap != null) {
      AttributeSource tok = nextTok();
      if (tok != null) {
        // clone ourselves.
        if (tok == this)
          tok = cloneAttributes();
        // check for positionIncrement!=1?  if>1, should not match, if==0, check multiple at this level?
        CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
        SlowSynonymMap subMap = map.submap.get(termAtt.buffer(), 0, termAtt.length());


        if (subMap != null) {
          // recurse
          result = match(subMap);
        }


        if (result != null) {
          matched.addFirst(tok);
        } else {
          // push back unmatched token
          pushTok(tok);
        }
      }
    }


    // if no longer sequence matched, so if this node has synonyms, it's the match.
    if (result==null && map.synonyms!=null) {
      result = map;
    }


    return result;
  }


  private void copy(AttributeSource target, AttributeSource source) {
    if (target != source)
      source.copyTo(target);
  }


  @Override
  public void reset() throws IOException {
    input.reset();
    replacement = null;
    exhausted = false;
  }
}
Source Code of org.apache.solr.analysis.SlowSynonymFilter

Related Classes of org.apache.solr.analysis.SlowSynonymFilter