Package com.chenlb.mmseg4j.analysis

Source Code of com.chenlb.mmseg4j.analysis.MMSegTokenizer

package com.chenlb.mmseg4j.analysis;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import com.chenlb.mmseg4j.MMSeg;
import com.chenlb.mmseg4j.Seg;
import com.chenlb.mmseg4j.Word;

public class MMSegTokenizer extends Tokenizer {

  private MMSeg mmSeg;
 
  private CharTermAttribute termAtt;
  private OffsetAttribute offsetAtt;
  private TypeAttribute typeAtt;
 
  public MMSegTokenizer(Seg seg, Reader input) {
    super(input);
    mmSeg = new MMSeg(input, seg);
   
    termAtt = (CharTermAttribute)addAttribute(CharTermAttribute.class);
    offsetAtt = (OffsetAttribute)addAttribute(OffsetAttribute.class);
    typeAtt = (TypeAttribute)addAttribute(TypeAttribute.class);
  }
 
  public void reset(Reader input) throws IOException {
    super.reset(input);
    mmSeg.reset(input);
  }

/*//lucene 2.9 以下
   public Token next(Token reusableToken) throws IOException {
    Token token = null;
    Word word = mmSeg.next();
    if(word != null) {
      //lucene 2.3
      reusableToken.clear();
      reusableToken.setTermBuffer(word.getSen(), word.getWordOffset(), word.getLength());
      reusableToken.setStartOffset(word.getStartOffset());
      reusableToken.setEndOffset(word.getEndOffset());
      reusableToken.setType(word.getType());
     
      token = reusableToken;
     
      //lucene 2.4
      //token = reusableToken.reinit(word.getSen(), word.getWordOffset(), word.getLength(), word.getStartOffset(), word.getEndOffset(), word.getType());
    }
   
    return token;
  }*/

  //lucene 2.9/3.0
  @Override
  public boolean incrementToken() throws IOException {
    clearAttributes();
    Word word = mmSeg.next();
    if(word != null) {
      //lucene 3.0
      //termAtt.setTermBuffer(word.getSen(), word.getWordOffset(), word.getLength());
      //lucene 3.1
      termAtt.copyBuffer(word.getSen(), word.getWordOffset(), word.getLength());
      offsetAtt.setOffset(word.getStartOffset(), word.getEndOffset());
      typeAtt.setType(word.getType());
      return true;
    } else {
      end();
      return false;
    }
  }
}
TOP

Related Classes of com.chenlb.mmseg4j.analysis.MMSegTokenizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.