Examples of Lexeme


Examples of net.sourceforge.chaperon.model.lexicon.Lexeme

    if ((violations!=null) && (violations.getViolationCount()>0))
      throw new IllegalArgumentException("Lexicon is not valid: "+violations.getViolation(0));

    LexicalAutomaton automaton = new LexicalAutomaton(lexicon.getLexemeCount());

    Lexeme lexeme;
    PatternAutomaton definition = null;

    for (int i = 0; i<lexicon.getLexemeCount(); i++)
    {
      lexeme = lexicon.getLexeme(i);
      automaton.setLexemeSymbol(i, (lexeme.getSymbol()!=null) ? lexeme.getSymbol().getName() : null);

      definition = (new PatternAutomatonBuilder(lexeme.getDefinition())).getPatternAutomaton();
      if (definition!=null)
        automaton.setLexemeDefinition(i, definition);
      else
        throw new IllegalArgumentException("Couldn't create PatternAutomaton for "+
                                           lexeme.getSymbol()+" of \""+lexeme.getDefinition()+"\"");
    }

    this.automaton = automaton;
  }
View Full Code Here

Examples of org.wltea.analyzer.Lexeme

            if(hit.getBegin() > doneIndex + 1){
              //输出并处理从doneIndex+1 到 seg.start - 1之间的未知词段
              processUnknown(segmentBuff , context , doneIndex + 1 , hit.getBegin()- 1);
            }
            //输出当前的词
            Lexeme newLexeme = new Lexeme(context.getBuffOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CJK_NORMAL);
            context.addLexeme(newLexeme);
            //更新goneIndex,标识已处理
            if(doneIndex < context.getCursor()){
              doneIndex = context.getCursor();
            }
           
            if(hit.isPrefix()){//同时也是前缀
             
            }else{ //后面不再可能有匹配了
              //移出当前的hit
              hitList.remove(hit);
            }
           
          }else if(hit.isPrefix()){//前缀,未匹配成词
           
          }else if(hit.isUnmatch()){//不匹配
            //移出当前的hit
            hitList.remove(hit);
          }
        }
      }
     
      //处理以input为开始的一个新hit
      Hit hit = Dictionary.matchInMainDict(segmentBuff, context.getCursor() , 1);
      if(hit.isMatch()){//匹配成词
        //判断是否有不可识别的词段
        if(context.getCursor() > doneIndex + 1){
          //输出并处理从doneIndex+1 到 context.getCursor()- 1之间的未知
          processUnknown(segmentBuff , context , doneIndex + 1 , context.getCursor()- 1);
        }
        //输出当前的词
        Lexeme newLexeme = new Lexeme(context.getBuffOffset() , context.getCursor() , 1 , Lexeme.TYPE_CJK_NORMAL);
        context.addLexeme(newLexeme);
        //更新doneIndex,标识已处理
        if(doneIndex < context.getCursor()){
          doneIndex = context.getCursor();
        }
View Full Code Here

Examples of org.wltea.analyzer.Lexeme

   * @param segmentBuff
   * @param uBegin 起始位置
   * @param uEnd 终止位置
   */
  private void processUnknown(char[] segmentBuff , Context context , int uBegin , int uEnd){
    Lexeme newLexeme = null;
   
    Hit hit = Dictionary.matchInPrepDict(segmentBuff, uBegin, 1);   
    if(hit.isUnmatch()){//不是副词或介词     
      if(uBegin > 0){//处理姓氏
        hit = Dictionary.matchInSurnameDict(segmentBuff, uBegin - 1 , 1);
        if(hit.isMatch()){
          //输出姓氏
          newLexeme = new Lexeme(context.getBuffOffset() , uBegin - 1 , 1 , Lexeme.TYPE_CJK_SN);
          context.addLexeme(newLexeme);   
        }
      }     
    }
   
    //以单字输出未知词段
    for(int i = uBegin ; i <= uEnd ; i++){
      newLexeme = new Lexeme(context.getBuffOffset() , i , 1 , Lexeme.TYPE_CJK_UNKNOWN);
      context.addLexeme(newLexeme);   
    }
   
    hit = Dictionary.matchInPrepDict(segmentBuff, uEnd, 1);
    if(hit.isUnmatch()){//不是副词或介词
      int length = 1;
      while(uEnd < context.getAvailable() - length){//处理后缀词
        hit = Dictionary.matchInSuffixDict(segmentBuff, uEnd + 1 , length);
        if(hit.isMatch()){
          //输出后缀
          newLexeme = new Lexeme(context.getBuffOffset() , uEnd + 1 , length , Lexeme.TYPE_CJK_SF);
          context.addLexeme(newLexeme);
          break;
        }
        if(hit.isUnmatch()){
          break;
View Full Code Here

Examples of org.wltea.analyzer.Lexeme

 
  @Override
  public final boolean incrementToken() throws IOException {
    //清除所有的词元属性
    clearAttributes();
    Lexeme nextLexeme = _IKImplement.next();
    if(nextLexeme != null){
      //将Lexeme转成Attributes
      //设置词元文本
      termAtt.setTermBuffer(nextLexeme.getLexemeText());
      //设置词元长度
      termAtt.setTermLength(nextLexeme.getLength());
      //设置词元位移
      offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
      //记录分词的最后位置
      finalOffset = nextLexeme.getEndPosition();
      //返会true告知还有下个词元
      return true;
    }
    //返会false告知词元输出完毕
    return false;
View Full Code Here

Examples of org.wltea.analyzer.Lexeme

   * @param context
   */
  private void outputNumLexeme(Context context){
    if(nStart > -1 && nEnd > -1){
      //生成已切分的词元
      Lexeme newLexeme = new Lexeme(context.getBuffOffset() ,nStart , nEnd - nStart + 1 , Lexeme.TYPE_NUM );
      context.addLexeme(newLexeme);
      fCaN = true;
    }
  }
View Full Code Here

Examples of org.wltea.analyzer.Lexeme

   * @param context
   */
  private void outputCountLexeme(Context context){
    if(countStart > -1 && countEnd > -1){
      //生成已切分的词元
      Lexeme countLexeme = new Lexeme(context.getBuffOffset() ,countStart , countEnd - countStart + 1 , Lexeme.TYPE_NUMCOUNT);
      context.addLexeme(countLexeme);
    }

  } 
View Full Code Here

Examples of org.wltea.analyzer.Lexeme

        //不在忽略尾部的链接字符
        end = context.getCursor();         
       
      }else{
        //生成已切分的词元
        Lexeme newLexeme = new Lexeme(context.getBuffOffset() , start , end - start + 1 , Lexeme.TYPE_LETTER);
        context.addLexeme(newLexeme);
        //设置当前分词器状态为“待处理”
        start = -1;
        end = -1;
      }     
    }
   
    //context.getCursor() == context.getAvailable() - 1读取缓冲区最后一个字符,直接输出
    if(context.getCursor() == context.getAvailable() - 1){
      if(start != -1 && end != -1){
        //生成已切分的词元
        Lexeme newLexeme = new Lexeme(context.getBuffOffset() , start , end - start + 1 , Lexeme.TYPE_LETTER);
        context.addLexeme(newLexeme);
      }
      //设置当前分词器状态为“待处理”
      start = -1;
      end = -1;
View Full Code Here

Examples of org.wltea.analyzer.Lexeme

      if(CharacterHelper.isEnglishLetter(input)){
        //记录当前指针位置为结束位置
        letterEnd =  context.getCursor();
      }else{
        //生成已切分的词元
        Lexeme newLexeme = new Lexeme(context.getBuffOffset() , letterStart , letterEnd - letterStart + 1 , Lexeme.TYPE_LETTER);
        context.addLexeme(newLexeme);
        //设置当前分词器状态为“待处理”
        letterStart = -1;
        letterEnd = -1;
      }
    }
   
    //context.getCursor() == context.getAvailable() - 1读取缓冲区最后一个字符,直接输出
    if(context.getCursor() == context.getAvailable() - 1){
      if(letterStart != -1 && letterEnd != -1){
        //生成已切分的词元
        Lexeme newLexeme = new Lexeme(context.getBuffOffset() , letterStart , letterEnd - letterStart + 1 , Lexeme.TYPE_LETTER);
        context.addLexeme(newLexeme);
      }
      //设置当前分词器状态为“待处理”
      letterStart = -1;
      letterEnd = -1;
View Full Code Here

Examples of org.wltea.analyzer.Lexeme

//      e.printStackTrace();
//    }
    System.out.println(t);
    IKSegmentation ikSeg = new IKSegmentation(new StringReader(t) ,false);
    try {
      Lexeme l = null;
      while( (l = ikSeg.next()) != null){
        System.out.println(l);
      }
    } catch (IOException e) {
      // TODO Auto-generated catch block
View Full Code Here

Examples of org.wltea.analyzer.Lexeme

   
    for(String t : testStr){
      System.out.println(t);
      IKSegmentation ikSeg = new IKSegmentation(new StringReader(t) , false);
      try {
        Lexeme l = null;
        while( (l = ikSeg.next()) != null){
          System.out.println(l);
        }
      } catch (IOException e) {
        // TODO Auto-generated catch block
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.