Package net.paoding.analysis.knife

Source Code of net.paoding.analysis.knife.NumberKnife

/**
* Copyright 2007 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package net.paoding.analysis.knife;

import net.paoding.analysis.dictionary.Dictionary;
import net.paoding.analysis.dictionary.Hit;

/**
*
* @author Zhiliang Wang [qieqie.wang@gmail.com]
*
*/
public class NumberKnife extends CombinatoricsKnife implements DictionariesWare {

  private Dictionary units;
 
  public NumberKnife() {
  }

  public NumberKnife(Dictionaries dictionaries) {
    setDictionaries(dictionaries);
  }

  public void setDictionaries(Dictionaries dictionaries) {
    super.setDictionaries(dictionaries);
    units = dictionaries.getUnitsDictionary();
  }
 

  public int assignable(Beef beef, int offset, int index) {
    char ch = beef.charAt(index);
    if (CharSet.isArabianNumber(ch))
      return ASSIGNED;
    if (index > offset) {
      if (CharSet.isLantingLetter(ch) || ch == '.' || ch == '-' || ch == '_') {
        if (CharSet.isLantingLetter(ch)
            || !CharSet.isArabianNumber(beef.charAt(index + 1))) {
          //分词效果
          //123.456    ->123.456/
          //123.abc.34  ->123/123.abc.34/abc/34/  ["abc"、"abc/34"系由LetterKnife分出,非NumberKnife]
          //没有或判断!CharSet.isArabianNumber(beef.charAt(index + 1)),则分出"123.",而非"123"
          //123.abc.34  ->123./123.abc.34/abc/34/
          return POINT;
        }
        return ASSIGNED;
      }
    }
    return LIMIT;
  }
 
  protected int collectLimit(Collector collector, Beef beef,
      int offset, int point, int limit, int dicWordVote) {
    // "123abc"的直接调用super的
    if (point != -1) {
      return super.collectLimit(collector, beef, offset, point, limit, dicWordVote);
    }
    //
    // 2.2两
    //    ^=_point
    //    
    final int _point = limit;
    // 当前尝试判断的字符的位置
    int curTail = offset;
    int number1 = -1;
    int number2 = -1;
    int bitValue = 0;
    int maxUnit = 0;
    //TODO:这里又重复从curTail(其值为offset)判断,重新遍历判断是否为数字,算是一个重复计算
    //但考虑这个计算对中文分词性能影响微乎其微暂时先不优化
    for (; (bitValue = CharSet.toNumber(beef.charAt(curTail))) >= 0; curTail++) {
      //
      if (bitValue == 2
          && (beef.charAt(curTail) == '两' || beef.charAt(curTail) == '俩' || beef
              .charAt(curTail) == '倆')) {
        if (curTail != offset) {
          break;
        }
      }
      // 处理连续汉字个位值的数字:"三四五六"  ->"3456"
      if (bitValue >= 0 && bitValue < 10) {
        if (number2 < 0)
          number2 = bitValue;
        else {
          number2 *= 10;
          number2 += bitValue;
        }
      } else {
        if (number2 < 0) {
          if (number1 < 0) {
            number1 = 1;
          }
          number1 *= bitValue;
        } else {
          if (number1 < 0) {
            number1 = 0;
          }
          if (bitValue >= maxUnit) {
            number1 += number2;
            number1 *= bitValue;
            maxUnit = bitValue;
          } else {
            number1 += number2 * bitValue;
          }
        }
        number2 = -1;
      }
    }
    if (number2 > 0) {
      if (number1 < 0) {
        number1 = number2;
      } else {
        number1 += number2;
      }
    }
    if (number1 >= 0 && curTail > _point) {
      doCollect(collector, String.valueOf(number1), beef, offset, curTail);
    }
    else {
      super.collectLimit(collector, beef, offset, point, limit, dicWordVote);
    }
   
    curTail = curTail > limit ? curTail : limit;
   
    //
    // 后面可能跟了计量单位
    if (units != null && CharSet.isCjkUnifiedIdeographs(beef.charAt(curTail))) {
      Hit wd = null;
      Hit wd2 = null;
      int i = curTail + 1;
      while ((wd = units.search(beef, curTail, i - curTail)).isHit()) {
        wd2 = wd;
        i++;
        if (!wd.isUnclosed()) {
          break;
        }
      }
      i --;
      if (wd2 != null) {
        collector.collect(wd2.getWord().getText(), curTail, i);
        return i;
      }
    }
    //
   
    return curTail > limit ? curTail : -1;
  }


}
TOP

Related Classes of net.paoding.analysis.knife.NumberKnife

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.