// Package org.wltea.analyzer.lucene
//
// Source Code of org.wltea.analyzer.lucene.IKTokenizer

/**
*
*/
package org.wltea.analyzer.lucene;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.wltea.analyzer.IKSegmentation;
import org.wltea.analyzer.Lexeme;


/**
* IK Analyzer v3.2
* Lucene3.0 Tokenizer适配器类
* 它封装了IKSegmentation实现
*
* @author 林良益
*
*/
public final class IKTokenizer extends Tokenizer {
 
  //IK分词器实现
  private IKSegmentation _IKImplement;
  //词元文本属性
  private TermAttribute termAtt;
  //词元位移属性
  private OffsetAttribute offsetAtt;
  //记录最后一个词元的结束位置
  private int finalOffset;

  /**
   * Lucene Tokenizer适配器类构造函数
   * @param in
   * @param isMaxWordLength 当为true时,分词器进行最大词长切分;当为false是,采用最细粒度切分
   */
  public IKTokenizer(Reader in , boolean isMaxWordLength) {
      super(in);
      offsetAtt = addAttribute(OffsetAttribute.class);
      termAtt = addAttribute(TermAttribute.class);
    _IKImplement = new IKSegmentation(in , isMaxWordLength);
 
 
  @Override
  public final boolean incrementToken() throws IOException {
    //清除所有的词元属性
    clearAttributes();
    Lexeme nextLexeme = _IKImplement.next();
    if(nextLexeme != null){
      //将Lexeme转成Attributes
      //设置词元文本
      termAtt.setTermBuffer(nextLexeme.getLexemeText());
      //设置词元长度
      termAtt.setTermLength(nextLexeme.getLength());
      //设置词元位移
      offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
      //记录分词的最后位置
      finalOffset = nextLexeme.getEndPosition();
      //返会true告知还有下个词元
      return true;
    }
    //返会false告知词元输出完毕
    return false;
  }
 
  /*
   * (non-Javadoc)
   * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
   */
  public void reset(Reader input) throws IOException {
    super.reset(input);
    _IKImplement.reset(input);
 
 
  @Override
  public final void end() {
      // set final offset
    offsetAtt.setOffset(finalOffset, finalOffset);
  }
 
}
// TOP
//
// Related Classes of org.wltea.analyzer.lucene.IKTokenizer
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.