Package org.wltea.analyzer.dic

Examples of org.wltea.analyzer.dic.Hit


          }
        }
      }
     
      //处理以input为开始的一个新hit
      Hit hit = Dictionary.matchInMainDict(segmentBuff, context.getCursor() , 1);
      if(hit.isMatch()){//匹配成词
        //判断是否有不可识别的词段
        if(context.getCursor() > doneIndex + 1){
          //输出并处理从doneIndex+1 到 context.getCursor()- 1之间的未知
          processUnknown(segmentBuff , context , doneIndex + 1 , context.getCursor()- 1);
        }
        //输出当前的词
        Lexeme newLexeme = new Lexeme(context.getBuffOffset() , context.getCursor() , 1 , Lexeme.TYPE_CJK_NORMAL);
        context.addLexeme(newLexeme);
        //更新doneIndex,标识已处理
        if(doneIndex < context.getCursor()){
          doneIndex = context.getCursor();
        }

        if(hit.isPrefix()){//同时也是前缀
          //向词段队列增加新的Hit
          hitList.add(hit);
        }
       
      }else if(hit.isPrefix()){//前缀,未匹配成词
        //向词段队列增加新的Hit
        hitList.add(hit);
       
      }else if(hit.isUnmatch()){//不匹配,当前的input不是词,也不是词前缀,将其视为分割性的字符
        if(doneIndex >= context.getCursor()){
          //当前不匹配的字符已经被处理过了,不需要再processUnknown
          return;
        }
       
        //输出从doneIndex到当前字符(含当前字符)之间的未知词
        processUnknown(segmentBuff , context , doneIndex + 1 , context.getCursor());
        //更新doneIndex,标识已处理
        doneIndex = context.getCursor();
      }
     
    }else {//输入的不是中文(CJK)字符
      if(hitList.size() > 0
          &&  doneIndex < context.getCursor() - 1){
        for(Hit hit : hitList){
          //判断是否有不可识别的词段
          if(doneIndex < hit.getEnd()){
            //输出并处理从doneIndex+1 到 seg.end之间的未知词段
            processUnknown(segmentBuff , context , doneIndex + 1 , hit.getEnd());
          }
        }
      }
      //清空词段队列
      hitList.clear();
      //更新doneIndex,标识已处理
      if(doneIndex < context.getCursor()){
        doneIndex = context.getCursor();
      }
    }
   
    //缓冲区结束临界处理
    if(context.getCursor() == context.getAvailable() - 1){ //读取缓冲区结束的最后一个字符     
      if( hitList.size() > 0 //队列中还有未处理词段
        && doneIndex < context.getCursor()){//最后一个字符还未被输出过
        for(Hit hit : hitList){
          //判断是否有不可识别的词段
          if(doneIndex < hit.getEnd() ){
            //输出并处理从doneIndex+1 到 seg.end之间的未知词段
            processUnknown(segmentBuff , context , doneIndex + 1 , hit.getEnd());
          }
        }
      }
      //清空词段队列
      hitList.clear();;
View Full Code Here


   * @param uEnd 终止位置
   */
  private void processUnknown(char[] segmentBuff , Context context , int uBegin , int uEnd){
    Lexeme newLexeme = null;
   
    Hit hit = Dictionary.matchInPrepDict(segmentBuff, uBegin, 1);   
    if(hit.isUnmatch()){//不是副词或介词     
      if(uBegin > 0){//处理姓氏
        hit = Dictionary.matchInSurnameDict(segmentBuff, uBegin - 1 , 1);
        if(hit.isMatch()){
          //输出姓氏
          newLexeme = new Lexeme(context.getBuffOffset() , uBegin - 1 , 1 , Lexeme.TYPE_CJK_SN);
          context.addLexeme(newLexeme);   
        }
      }     
    }
   
    //以单字输出未知词段
    for(int i = uBegin ; i <= uEnd ; i++){
      newLexeme = new Lexeme(context.getBuffOffset() , i , 1 , Lexeme.TYPE_CJK_UNKNOWN);
      context.addLexeme(newLexeme);   
    }
   
    hit = Dictionary.matchInPrepDict(segmentBuff, uEnd, 1);
    if(hit.isUnmatch()){//不是副词或介词
      int length = 1;
      while(uEnd < context.getAvailable() - length){//处理后缀词
        hit = Dictionary.matchInSuffixDict(segmentBuff, uEnd + 1 , length);
        if(hit.isMatch()){
          //输出后缀
          newLexeme = new Lexeme(context.getBuffOffset() , uEnd + 1 , length , Lexeme.TYPE_CJK_SF);
          context.addLexeme(newLexeme);
          break;
        }
        if(hit.isUnmatch()){
          break;
        }
        length++;
      }
    }   
View Full Code Here

   * 处理中文量词
   * @param segmentBuff
   * @param context
   */
  private void processCount(char[] segmentBuff , Context context){
    Hit hit = null;

    if(countStart == -1){
      hit = Dictionary.matchInQuantifierDict(segmentBuff , context.getCursor() , 1);
    }else{
      hit = Dictionary.matchInQuantifierDict(segmentBuff , countStart , context.getCursor() - countStart + 1);
    }
   
    if(hit != null){
      if(hit.isPrefix()){
        if(countStart == -1){
          //设置量词的开始
          countStart = context.getCursor();
        }
      }
     
      if(hit.isMatch()){
        if(countStart == -1){
          countStart = context.getCursor();
        }
        //设置量词可能的结束
        countEnd = context.getCursor();
        //输出可能存在的量词
        outputCountLexeme(context);
      }
     
      if(hit.isUnmatch()){
        if(countStart != -1){
          //重置量词状态
          countStart = -1;
          countEnd = -1;
        }
View Full Code Here

      e1.printStackTrace();
    }
   
    System.out.println(new Date() + " begin march");
    long begintime = System.currentTimeMillis();
    Hit hit = null;
    int umCount = 0;
    int mCount = 0;
    for(String word : allWords){
      char[] chars = word.toCharArray();
      hit = _root_.match(chars , 0, chars.length);
      if(hit.isUnmatch()){
        //System.out.println(word);
        umCount++;
      }else{
        mCount++;
        //System.out.println(mCount + " : " + word);
View Full Code Here

        e1.printStackTrace();
      }
     
      System.out.println(new Date() + " begin march");
      long begintime = System.currentTimeMillis();
      Hit hit = null;
      int umCount = 0;
      int mCount = 0;
      for(String word : allWords){     
        hit = Dictionary.matchInMainDict(word.toCharArray(), 0, word.length());
        if(hit.isUnmatch()){
          System.out.println(word);
          umCount++;
        }else{
          mCount++;
        }
View Full Code Here

          }
        }
      }
     
      //处理以input为开始的一个新hit
      Hit hit = Dictionary.matchInMainDict(segmentBuff, context.getCursor() , 1);
      if(hit.isMatch()){//匹配成词
        //判断是否有不可识别的词段
        if(context.getCursor() > doneIndex + 1){
          //输出并处理从doneIndex+1 到 context.getCursor()- 1之间的未知
          processUnknown(segmentBuff , context , doneIndex + 1 , context.getCursor()- 1);
        }
        //输出当前的词
        Lexeme newLexeme = new Lexeme(context.getBuffOffset() , context.getCursor() , 1 , Lexeme.TYPE_CJK_NORMAL);
        context.addLexeme(newLexeme);
        //更新doneIndex,标识已处理
        if(doneIndex < context.getCursor()){
          doneIndex = context.getCursor();
        }

        if(hit.isPrefix()){//同时也是前缀
          //向词段队列增加新的Hit
          hitList.add(hit);
        }
       
      }else if(hit.isPrefix()){//前缀,未匹配成词
        //向词段队列增加新的Hit
        hitList.add(hit);
       
      }else if(hit.isUnmatch()){//不匹配,当前的input不是词,也不是词前缀,将其视为分割性的字符
        if(doneIndex >= context.getCursor()){
          //当前不匹配的字符已经被处理过了,不需要再processUnknown
          return;
        }
       
        //输出从doneIndex到当前字符(含当前字符)之间的未知词
        processUnknown(segmentBuff , context , doneIndex + 1 , context.getCursor());
        //更新doneIndex,标识已处理
        doneIndex = context.getCursor();
      }
     
    }else {//输入的不是中文(CJK)字符
      if(hitList.size() > 0
          &&  doneIndex < context.getCursor() - 1){
        for(Hit hit : hitList){
          //判断是否有不可识别的词段
          if(doneIndex < hit.getEnd()){
            //输出并处理从doneIndex+1 到 seg.end之间的未知词段
            processUnknown(segmentBuff , context , doneIndex + 1 , hit.getEnd());
          }
        }
      }
      //清空词段队列
      hitList.clear();
      //更新doneIndex,标识已处理
      if(doneIndex < context.getCursor()){
        doneIndex = context.getCursor();
      }
    }
   
    //缓冲区结束临界处理
    if(context.getCursor() == context.getAvailable() - 1){ //读取缓冲区结束的最后一个字符     
      if( hitList.size() > 0 //队列中还有未处理词段
        && doneIndex < context.getCursor()){//最后一个字符还未被输出过
        for(Hit hit : hitList){
          //判断是否有不可识别的词段
          if(doneIndex < hit.getEnd() ){
            //输出并处理从doneIndex+1 到 seg.end之间的未知词段
            processUnknown(segmentBuff , context , doneIndex + 1 , hit.getEnd());
          }
        }
      }
      //清空词段队列
      hitList.clear();;
View Full Code Here

   * @param uEnd 终止位置
   */
  private void processUnknown(char[] segmentBuff , Context context , int uBegin , int uEnd){
    Lexeme newLexeme = null;
   
    Hit hit = Dictionary.matchInPrepDict(segmentBuff, uBegin, 1);   
    if(hit.isUnmatch()){//不是副词或介词     
      if(uBegin > 0){//处理姓氏
        hit = Dictionary.matchInSurnameDict(segmentBuff, uBegin - 1 , 1);
        if(hit.isMatch()){
          //输出姓氏
          newLexeme = new Lexeme(context.getBuffOffset() , uBegin - 1 , 1 , Lexeme.TYPE_CJK_SN);
          context.addLexeme(newLexeme);   
        }
      }     
    }
   
    //以单字输出未知词段
    for(int i = uBegin ; i <= uEnd ; i++){
      newLexeme = new Lexeme(context.getBuffOffset() , i , 1 , Lexeme.TYPE_CJK_UNKNOWN);
      context.addLexeme(newLexeme);   
    }
   
    hit = Dictionary.matchInPrepDict(segmentBuff, uEnd, 1);
    if(hit.isUnmatch()){//不是副词或介词
      int length = 1;
      while(uEnd < context.getAvailable() - length){//处理后缀词
        hit = Dictionary.matchInSuffixDict(segmentBuff, uEnd + 1 , length);
        if(hit.isMatch()){
          //输出后缀
          newLexeme = new Lexeme(context.getBuffOffset() , uEnd + 1 , length , Lexeme.TYPE_CJK_SF);
          context.addLexeme(newLexeme);
          break;
        }
        if(hit.isUnmatch()){
          break;
        }
        length++;
      }
    }   
View Full Code Here

   * 处理中文量词
   * @param segmentBuff
   * @param context
   */
  private void processCount(char[] segmentBuff , Context context){
    Hit hit = null;

    if(countStart == -1){
      hit = Dictionary.matchInQuantifierDict(segmentBuff , context.getCursor() , 1);
    }else{
      hit = Dictionary.matchInQuantifierDict(segmentBuff , countStart , context.getCursor() - countStart + 1);
    }
   
    if(hit != null){
      if(hit.isPrefix()){
        if(countStart == -1){
          //设置量词的开始
          countStart = context.getCursor();
        }
      }
     
      if(hit.isMatch()){
        if(countStart == -1){
          countStart = context.getCursor();
        }
        //设置量词可能的结束
        countEnd = context.getCursor();
        //输出可能存在的量词
        outputCountLexeme(context);
      }
     
      if(hit.isUnmatch()){
        if(countStart != -1){
          //重置量词状态
          countStart = -1;
          countEnd = -1;
        }
View Full Code Here

      e1.printStackTrace();
    }
   
    System.out.println(new Date() + " begin march");
    long begintime = System.currentTimeMillis();
    Hit hit = null;
    int umCount = 0;
    int mCount = 0;
    for(String word : allWords){     
      hit = _root_.match(word.toCharArray());
      if(hit.isUnmatch()){
        System.out.println(word);
        umCount++;
      }else{
        mCount++;
        System.out.println(mCount + " : " + word);
View Full Code Here

        e1.printStackTrace();
      }
     
      System.out.println(new Date() + " begin march");
      long begintime = System.currentTimeMillis();
      Hit hit = null;
      int umCount = 0;
      int mCount = 0;
      for(String word : allWords){     
        hit = Dictionary.matchInMainDict(word.toCharArray(), 0, word.length());
        if(hit.isUnmatch()){
          System.out.println(word);
          umCount++;
        }else{
          mCount++;
        }
View Full Code Here

TOP

Related Classes of org.wltea.analyzer.dic.Hit

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.