Package org.ictclas4j.util

Source Code of org.ictclas4j.util.Utility

package org.ictclas4j.util;

import java.io.DataInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;

import org.ictclas4j.bean.Dictionary;
import org.ictclas4j.bean.PersonName;
import org.ictclas4j.segment.PosTagger;

import com.gftech.util.GFCommon;
import com.gftech.util.GFString;

public class Utility {
  // Size of the GB2312 Chinese-character id space (ids 0..6767, see getGBWord).
  // An older comment in this file says the count includes 5 empty positions
  // between 3756 and 3761.
  public static final int GB_NUM = 6768;

  // Size of the GBK Chinese-character id space (21998 entries).
  public static final int GBK_NUM = 21998;

  // GBK id space plus 36 extra ids
  // (presumably 10 digits and 26 letters -- TODO confirm against GBK_ID).
  public static final int GBK_NUM_EXT = GBK_NUM + 36;

  // Maximum word length.
  // NOTE(review): the comment previously attached here described the number of
  // Chinese characters and appears to belong to GB_NUM instead.
  public static final int WORD_MAXLENGTH = 100;

  // Word types
  public static final int WT_DELIMITER = 0;

  public static final int WT_CHINESE = 1;

  public static final int WT_OTHER = 2;

  // Character types (CT_SINGLE .. CT_OTHER are the values returned by charType)
  public static final int CT_SENTENCE_BEGIN = 1;// sentence begin

  public static final int CT_SENTENCE_END = 4;// sentence end

  public static final int CT_SINGLE = 5;// single-byte character

  public static final int CT_DELIMITER = CT_SINGLE + 1;// delimiter / punctuation

  public static final int CT_CHINESE = CT_SINGLE + 2;// Chinese character (lead byte 176..247)

  public static final int CT_LETTER = CT_SINGLE + 3;// double-byte letter (lead byte 163)

  public static final int CT_NUM = CT_SINGLE + 4;// double-byte digit (lead byte 163)

  public static final int CT_INDEX = CT_SINGLE + 5;// index symbol (lead byte 162)

  public static final int CT_OTHER = CT_SINGLE + 12;// anything else

  public static final int MAX_WORDS = 650;

  public static final int MAX_SEGMENT_NUM = 10;

  public static final String POSTFIX_SINGLE = "�Ӱ���dzش嵥�����̵궴�ɶӷ��帮�Ը۸󹬹������źӺ������������ǽ־����ӿڿ�����¥·������Ū����������������Ȫ��ɽʡ��ˮ����̨̲̳����ͤ��������ϪϿ������������ҤӪ����԰ԷԺբկվ����ׯ�������";

  public static final String[] POSTFIX_MUTIPLE = { "�뵺", "��ԭ", "����", "���", "�󹫹�", "����", "����", "�۹�", "�ɲ�", "�ۿ�",
      "���ٹ�·", "��ԭ", "��·", "��԰", "���͹�", "�ȵ�", "�㳡", "����", "��Ͽ", "��ͬ", "����", "����", "����", "�ֵ�", "�ڰ�", "��ͷ", "ú��",
      "����", "ũ��", "���", "ƽԭ", "����", "Ⱥ��", "ɳĮ", "ɳ��", "ɽ��", "ɽ��", "ˮ��", "���", "����", "��·", "�´�", "ѩ��", "�γ�", "�κ�",
      "�泡", "ֱϽ��", "������", "������", "������", "" };

  public static final String TRANS_ENGLISH = "�������������������°İʰŰͰװݰ������������ȱϱ˱𲨲��������������Ųɲֲ��񳹴��Ĵȴδ����������������µõĵǵϵҵٵ۶����Ŷض����������������Ʒҷѷ�򸣸������ǸɸԸ���������ŹϹ��������������ϺӺպ����������������Ӽּ��ܽ𾩾þӾ��������������¿ƿɿ˿Ͽ����������������������������������������������������������¡¬²³·��������������������éï÷����������������ĦĪīĬķľ������������������������ŦŬŵŷ��������������Ƥƽ��������ǡǿ��������Ȫ��������������������ɣɪɭɯɳɽ������ʥʩʫʯʲʷʿ��˹˾˿��������̩̹����������͡ͼ������������������Τάκ��������������������ϣϲ������Ъл������������ҢҶ��������������ӢӺ����Լ������ղ������������׿������٤��������üν�����������Ľ����������������ɺ����ѷ��������ܽ���������������";

  public static final String TRANS_RUSSIAN = "�������°ͱȱ˲�����Ĵ�µö��Ŷ���������Ǹ�����Ӽ�ݽ𿨿ƿɿ˿���������������������¬³������÷����ķ������ŵ������������������ɫɽ��ʲ˹����̹������ά������ϣл��ҮҶ�������������ǵٸ�����ջ������������������������������ɣɳ��̩ͼ������׿��";

  public static final String TRANS_JAPANESE = "���°˰װٰ�������ȱ��������ʲ˲ֲ������سന�����δ����������µص�ɶ������縣�Ը߹����Źȹع���úƺͺϺӺں���󻧻Ļ漪�ͼѼӼ�������������������þƾտ����ɿ˿�����������������������������¡¹������������ľ��������������Ƭƽ����ǧǰdz����������������Ȫ������������ɭɴɼɽ��������ʥʯʵʸ������ˮ˳˾��̩��������������βδ����������ϸ������СТ����������������������ңҰҲҶһ����������ӣ��������������ԨԪԫԭԶ����������������լ����������ֲ֦֪֮��������������׵��������ܥݶ��޹������";

  // Transliteration types, as returned by GetForeignType: the value names the
  // TRANS_* character table that matched the most characters of a word.
  public static final int TT_ENGLISH = 0;

  public static final int TT_RUSSIAN = 1;

  public static final int TT_JAPANESE = 2;

  // Separator types
  public static final String SEPERATOR_C_SENTENCE = "������������";

  public static final String SEPERATOR_C_SUB_SENTENCE = "����������������";

  public static final String SEPERATOR_E_SENTENCE = "!?:;";

  public static final String SEPERATOR_E_SUB_SENTENCE = ",()\"'";

  public static final String SEPERATOR_LINK = "\n\r ��";

  // Sentence begin and ending string
  public static final String SENTENCE_BEGIN = "ʼ##ʼ";

  public static final String SENTENCE_END = "ĩ##ĩ";

  // Separator placed between two words
  public static final String WORD_SEGMENTER = "@";

  public static final int MAX_WORDS_PER_SENTENCE = 120;

  public static final int MAX_UNKNOWN_PER_SENTENCE = 200;

  public static final int MAX_POS_PER_WORD = 20;

  public static final int LITTLE_FREQUENCY = 6;

  // Kinds of tagging pass (normal, person, place, transliterated person).
  public enum TAG_TYPE {
    TT_NORMAL, TT_PERSON, TT_PLACE, TT_TRANS_PERSON
  };

  // 2079997 == 1993123 + 86874.
  // NOTE(review): the older note "7528283+329805" does not add up to this value.
  public static final int MAX_FREQUENCE = 2079997;// 7528283+329805

  // //1993123+86874

  public static final int MAX_SENTENCE_LEN = 2000;

  public static final double INFINITE_VALUE = 10000.00;

  // Smoothing parameter
  public static final double SMOOTH_PARAM = 0.1;

  public static final String UNKNOWN_PERSON = "δ##��";

  public static final String UNKNOWN_SPACE = "δ##��";

  public static final String UNKNOWN_NUM = "δ##��";

  public static final String UNKNOWN_TIME = "δ##ʱ";

  public static final String UNKNOWN_LETTER = "δ##��";

  public static boolean gbGenerate(String fileName) {
    File file;
    int i, j;
    file = new File(fileName);
    try {
      PrintWriter out = new PrintWriter(new FileOutputStream(file));
      if (!file.canWrite())
        return false;// fail while opening the file
      for (i = 161; i < 255; i++)
        for (j = 161; j < 255; j++)
          out.println("" + i + j + "," + i + "," + j);
      out.close();
    } catch (FileNotFoundException e) {
      e.printStackTrace();
    }
    return true;
  }

  /***************************************************************************
   *
   * Func Name : CC_Generate
   *
   * Description: Generate the Chinese Char List file
   *
   *
   * Parameters : sFilename: the file name for the output CC List
   *
   * Returns : public static boolean Author : Kevin Zhang History : 1.create
   * 2002-1-8
   **************************************************************************/
  public static boolean CC_Generate(String fileName) {
    File file;
    int i, j;
    file = new File(fileName);
    try {
      PrintWriter out = new PrintWriter(new FileOutputStream(file));
      for (i = 176; i < 255; i++)
        for (j = 161; j < 255; j++)
          out.println("" + i + j + "," + i + "," + j);
      out.close();
    } catch (FileNotFoundException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
    return true;
  }

  /***************************************************************************
   *
   * Func Name : CC_Find
   *
   * Description: Find a Chinese sub-string in the Chinese String
   *
   *
   * Parameters : string:Null-terminated string to search
   *
   * strCharSet:Null-terminated string to search for
   *
   * Returns : String Author : Kevin Zhang History : 1.create 2002-1-8
   **************************************************************************/
  public static boolean CC_Find(final byte[] string, final byte[] strCharSet) {
    if (string != null && strCharSet != null) {
      int index = strstr(string, strCharSet);
      if (index != -1 && (index % 2 == 1)) {
        return false;
      }
    }
    return true;
  }

  /***************************************************************************
   *
   * Func Name : charType
   *
   * Description: Judge the type of sChar or (sChar,sChar+1)
   *
   *
   * Parameters : sFilename: the file name for the output CC List
   *
   * Returns : int : the type of char Author : Kevin Zhang History : 1.create
   * 2002-1-8
   **************************************************************************/
  public static int charType(String str) {
    
    if (str != null && str.length() > 0) {
      byte[] b = str.getBytes();
      byte b1 = b[0];
      byte b2 = b.length > 1 ? b[1] : 0;
      int ub1=getUnsigned(b1);
      int ub2=getUnsigned(b2);
      if (ub1 < 128) {
        if ("\"!,.?()[]{}+=".indexOf((char) b1) != -1)
          return CT_DELIMITER;
        return CT_SINGLE;
      } else if (ub1 == 162)
        return CT_INDEX;
      else if (ub1 == 163 && ub2 > 175 && ub2 < 186)
        return CT_NUM;
      else if (ub1 == 163
          && (ub2 >= 193 && ub2 <= 218 || ub2 >= 225
              && ub2 <= 250))
        return CT_LETTER;
      else if (ub1 == 161 || ub1 == 163)
        return CT_DELIMITER;
      else if (ub1 >= 176 && ub1 <= 247)
        return CT_CHINESE;

    }
    return CT_OTHER;

  }

  /***************************************************************************
   *
   * Func Name : GetCCPrefix
   *
   * Description: Get the max Prefix string made up of Chinese Char
   *
   *
   * Parameters : sSentence: the original sentence which includes Chinese or
   * Non-Chinese char
   *
   * Returns : the end of the sub-sentence Author : Kevin Zhang History :
   * 1.create 2002-1-8
   **************************************************************************/
  public static int getCCPrefix(byte[] sSentence) {
    int nLen = sSentence.length;
    int nCurPos = 0;
    while (nCurPos < nLen && getUnsigned(sSentence[nCurPos]) > 175 && getUnsigned(sSentence[nCurPos]) < 248) {
      nCurPos += 2;// Get next Chinese Char
    }
    return nCurPos;
  }

  /***************************************************************************
   *
   * Func Name : IsAllSingleByte
   *
   * Description: Judge the string is all made up of Single Byte Char
   *
   *
   * Parameters : sSentence: the original sentence which includes Chinese or
   * Non-Chinese char
   *
   * Returns : the end of the sub-sentence Author : Kevin Zhang History :
   * 1.create 2002-1-24
   **************************************************************************/
  public static boolean isAllChinese(String str) {
    if (str != null) {

      String temp = str + " ";
      for (int i = 0; i < str.length(); i++) {
        byte[] b = temp.substring(i, i + 1).getBytes();
        if (b.length == 2) {
          int ub0=getUnsigned(b[0]);
          if (!(ub0 < 248 && ub0 > 175)
              || !(ub0 < 253 && ub0 > 160))
            return false;
        }
      }

      return true;
    }
    return false;
  }

  /***************************************************************************
   *
   * Func Name : IsAllNonChinese
   *
   * Description: Judge the string is all made up of Single Byte Char
   *
   *
   * Parameters : sSentence: the original sentence which includes Chinese or
   * Non-Chinese char
   *
   * Returns : the end of the sub-sentence Author : Kevin Zhang History :
   * 1.create 2002-1-24
   **************************************************************************/
  public static boolean isAllNonChinese(byte[] sString) {
    int nLen = sString.length;
    int i = 0;

    while (i < nLen) {
      if (getUnsigned(sString[i]) < 248 && getUnsigned(sString[i]) > 175)
        return false;
      if (sString[i] < 0)
        i += 2;
      else
        i += 1;
    }
    return true;
  }

  /***************************************************************************
   *
   * Func Name : IsAllSingleByte
   *
   * Description: Judge the string is all made up of Single Byte Char
   *
   *
   * Parameters : sSentence: the original sentence which includes Chinese or
   * Non-Chinese char
   *
   * Returns : the end of the sub-sentence Author : Kevin Zhang History :
   * 1.create 2002-1-24
   **************************************************************************/
  public static boolean isAllSingleByte(String str) {
    if (str != null) {
      int len = str.length();
      int i = 0;
      byte[] b = str.getBytes();
      while (i < len && b[i] < 128) {
        i++;
      }
      if (i < len)
        return false;
      return true;
    }
    return false;
  }

  /**
   * Tells whether a string looks like a number.
   *
   * The string is consumed left to right: an optional leading sign, a run of
   * characters from a first digit set, an optional middle delimiter, a run of
   * characters for which {@code GFString.cint} yields 0..9, another optional
   * middle delimiter, and finally a check of the character left at the cursor.
   * The string is accepted when the cursor has reached the end.
   *
   * NOTE(review): the non-ASCII character-set literals in this method appear
   * mis-encoded in this copy of the file; they are kept byte-for-byte.
   *
   * @param str the string to test
   * @return {@code true} if the whole string was consumed; {@code false} for
   *         null or when characters are left over
   */
  public static boolean isAllNum(String str) {

    if (str != null) {
      int i = 0;
      // The trailing space makes substring(0, 1) safe for an empty string.
      String temp = str + " ";
      // Skip one leading sign character if the string starts with one.
      if ("��+��-��".indexOf(temp.substring(0, 1)) != -1)
        i++;
      // Skip the run of characters from the first digit set
      // (presumably the full-width digits -- TODO confirm).
      while (i < str.length() && "��������������������".indexOf(str.substring(i, i + 1)) != -1)
        i++;

      // Get middle delimiter such as .
      if (i < str.length()) {
        String s = str.substring(i, i + 1);
        if ("�á�����".indexOf(s) != -1 || ".".equals(s) || "/".equals(s)) {// delimiter inside a number
          i++;
          // NOTE(review): this loop tests the character at i + 1, not at i --
          // confirm that the look-ahead is intended.
          while (i + 1 < str.length() && "��������������������".indexOf(str.substring(i + 1, i + 2)) != -1)
            i++;
        }
      }

      if (i >= str.length())
        return true;

      // Skip the run of characters that GFString.cint maps to 0..9.
      while (i < str.length() && GFString.cint(str.substring(i, i + 1)) >= 0
          && GFString.cint(str.substring(i, i + 1)) <= 9)
        i++;
      // Get middle delimiter such as .
      if (i < str.length()) {
        String s = str.substring(i, i + 1);
        if ("�á�����".indexOf(s) != -1 || ".".equals(s) || "/".equals(s)) {// delimiter inside a number
          i++;
          // NOTE(review): same look-ahead at i + 1 as above.
          while (i + 1 < str.length() && "0123456789".indexOf(str.substring(i + 1, i + 2)) != -1)
            i++;
        }
      }

      if (i < str.length()) {

        // Step back when the current character is neither in the unit set nor "%".
        // NOTE(review): when it IS one of those, the cursor is not advanced, so
        // the final test below still fails -- confirm whether i++ was intended.
        if ("��ǧ���ڰ�Ǫ����".indexOf(str.substring(i, i + 1)) == -1 && !"%".equals(str.substring(i, i + 1)))
          i--;
      }
      if (i >= str.length())
        return true;
    }
    return false;
  }

  /***************************************************************************
   *
   * Func Name : IsAllIndex
   *
   * Description: Judge the string is all made up of Index Num Char
   *
   *
   * Parameters : sSentence: the original sentence which includes Chinese or
   * Non-Chinese char
   *
   * Returns : the end of the sub-sentence Author : Kevin Zhang History :
   * 1.create 2002-1-24
   **************************************************************************/
  public static boolean isAllIndex(byte[] sString) {
    int nLen = sString.length;
    int i = 0;

    while (i < nLen - 1 && getUnsigned(sString[i]) == 162) {
      i += 2;
    }
    if (i >= nLen)
      return true;
    while (i < nLen && (sString[i] > 'A' - 1 && sString[i] < 'Z' + 1)
        || (sString[i] > 'a' - 1 && sString[i] < 'z' + 1)) {// single
      // byte
      // number
      // char
      i += 1;
    }

    if (i < nLen)
      return false;
    return true;

  }

  /***************************************************************************
   *
   * Func Name : IsAllLetter
   *
   * Description: Judge the string is all made up of Letter Char
   *
   *
   * Parameters : sSentence: the original sentence which includes Chinese or
   * Non-Chinese char
   *
   * Returns : the end of the sub-sentence Author : Kevin Zhang History :
   * 1.create 2002-1-24
   **************************************************************************/
  public static boolean isAllLetter(String str) {
    int i = 0;

    if (str != null) {
      int nLen = str.length();
      byte[] b = str.getBytes();
      while (i < nLen - 1
          && getUnsigned(b[i]) == 163
          && ((getUnsigned(b[i + 1]) >= 193 && getUnsigned(b[i + 1]) <= 218) || (getUnsigned(b[i + 1]) >= 225 && getUnsigned(b[i + 1]) <= 250))) {
        i += 2;
      }
      if (i < nLen)
        return false;
      return true;
    }
    return false;
  }

  /***************************************************************************
   *
   * Func Name : IsAllDelimiter
   *
   * Description: Judge the string is all made up of Delimiter
   *
   *
   * Parameters : sSentence: the original sentence which includes Chinese or
   * Non-Chinese char
   *
   * Returns : the end of the sub-sentence Author : Kevin Zhang History :
   * 1.create 2002-1-24
   **************************************************************************/
  public static boolean isAllDelimiter(byte[] sString) {
    int nLen = sString.length;
    int i = 0;

    while (i < nLen - 1 && (getUnsigned(sString[i]) == 161 || getUnsigned(sString[i]) == 163)) {
      i += 2;
    }
    if (i < nLen)
      return false;
    return true;
  }

  /***************************************************************************
   *
   * Func Name : BinarySearch
   *
   * Description: Lookup the index of nVal in the table nTable which length is
   * nTableLen
   *
   * Parameters : nPOS: the POS value
   *
   * Returns : the index value Author : Kevin Zhang History : 1.create
   * 2002-1-25
   **************************************************************************/
  public static int binarySearch(int val, int[] table) {
    if (table != null) {
      int len = table.length;
      int start = 0, end = len - 1, mid = (start + end) / 2;

      while (start <= end)// Binary search
      {
        if (table[mid] == val) {
          return mid;// find it
        } else if (table[mid] < val) {
          start = mid + 1;
        } else {
          end = mid - 1;
        }
        mid = (start + end) / 2;
      }
    }
    return -1;// Can not find it;
  }

  /***************************************************************************
   *
   * Func Name : IsForeign
   *
   * Description: Decide whether the word is not a Non-fereign word
   *
   * Parameters : sWord: the word
   *
   * Returns : the index value Author : Kevin Zhang History : 1.create
   * 2002-1-26
   **************************************************************************/
  public static boolean isForeign(String word) {
    if (word != null) {
      int foreignCount = getForeignCharCount(word);
      int charCount = word.length();
      if (charCount > 2 || foreignCount >= 1 * charCount / 2)
        return true;
    }
    return false;
  }

  /***************************************************************************
   *
   * Func Name : IsAllForeign
   *
   * Description: Decide whether the word is not a Non-fereign word
   *
   * Parameters : sWord: the word
   *
   * Returns : the index value Author : Kevin Zhang History : 1.create
   * 2002-3-25
   **************************************************************************/
  public static boolean isAllForeign(String sWord) {
    int nForeignCount = getForeignCharCount(sWord);
    if (2 * nForeignCount == sWord.length())
      return true;
    return false;
  }

  /***************************************************************************
   *
   * Func Name : IsForeign
   *
   * Description: Decide whether the word is Chinese Num word
   *
   * Parameters : sWord: the word
   *
   * Returns : the index value Author : Kevin Zhang History : 1.create
   * 2002-1-26
   **************************************************************************/
  public static boolean isAllChineseNum(String word) {// �ٷ�֮������������ϰ˵�ʮ�˷���

    String chineseNum = "���һ�������������߰˾�ʮإ��ǧ����Ҽ��������½��ƾ�ʰ��Ǫ�á�������";//
    String prefix = "�������ϳ�";

    if (word != null) {
      String temp = word + " ";
      for (int i = 0; i < word.length(); i++) {

        if (temp.indexOf("��֮", i) != -1)// �ٷ�֮��
        {
          i += 2;
          continue;
        }

        String tchar = temp.substring(i, i + 1);
        if (chineseNum.indexOf(tchar) == -1 && (i != 0 || prefix.indexOf(tchar) == -1))
          return false;
      }
      return true;
    }

    return false;
  }

  /***************************************************************************
   *
   * Func Name : GetForeignCharCount
   *
   * Description:
   *
   * Parameters : sWord: the word
   *
   * Returns : the index value Author : Kevin Zhang History : 1.create
   * 2002-4-4 2.Modify 2002-5-21
   **************************************************************************/
  public static int getForeignCharCount(String sWord) {
    int nForeignCount, nCount;
    // English char counnts
    nForeignCount = getCharCount(TRANS_ENGLISH, sWord);
    // Japan char counnts
    nCount = getCharCount(TRANS_JAPANESE, sWord);
    if (nForeignCount <= nCount)
      nForeignCount = nCount;
    // Russian char counnts
    nCount = getCharCount(TRANS_RUSSIAN, sWord);
    if (nForeignCount <= nCount)
      nForeignCount = nCount;
    return nForeignCount;
  }

  /**
   * �õ��ַ������ַ����ַ����г��ֵĴ���
   *
   * @param charSet
   * @param word
   * @return
   */
  public static int getCharCount(String charSet, String word) {
    int nCount = 0;

    if (word != null) {
      String temp = word + " ";
      for (int i = 0; i < word.length(); i++) {
        String s = temp.substring(i, i + 1);
        if (charSet.indexOf(s) != -1)
          nCount++;
      }
    }

    return nCount;
  }

  /***************************************************************************
   *
   * Func Name : GetForeignCharCount
   *
   * Description: Return the foreign type
   *
   * Parameters : sWord: the word
   *
   * Returns : the index value Author : Kevin Zhang History : 1.create
   * 2002-4-4 2.Modify 2002-5-21
   **************************************************************************/
  public int GetForeignType(String sWord) {
    int nForeignCount, nCount, nType = TT_ENGLISH;
    nForeignCount = getCharCount(TRANS_ENGLISH, sWord);// English
    // char
    // counnts
    nCount = getCharCount(TRANS_RUSSIAN, sWord);// Russian
    // char
    // counnts
    if (nForeignCount < nCount) {
      nForeignCount = nCount;
      nType = TT_RUSSIAN;
    }
    nCount = getCharCount(TRANS_JAPANESE, sWord);// Japan
    // char
    // counnts
    if (nForeignCount < nCount) {
      nForeignCount = nCount;
      nType = TT_JAPANESE;
    }
    return nType;
  }

  public static byte[] readBytes(DataInputStream in, int len) {
    if (in != null && len > 0) {
      byte[] b = new byte[len];
      try {
        for (int i = 0; i < len; i++)
          b[i] = in.readByte();
      } catch (IOException e) {
        e.printStackTrace();
      }

      return b;
    }

    return null;
  }

  /**
   * Splits a place name into its stem and its postfix.
   *
   * The multi-character postfixes in POSTFIX_MUTIPLE are tried in order. If
   * none applies, the last two bytes of the word are looked up in
   * POSTFIX_SINGLE. The stem is written to {@code sWordRet} and the postfix to
   * {@code sPostfix}, each followed by a 0 byte.
   *
   * NOTE(review): the table's last entry is "", whose byte array is empty, so
   * {@code sMultiPostfix[i][0]} would throw if the scan reaches it -- confirm.
   * NOTE(review): the behaviour of {@code GFCommon.bytesCopy} is not visible
   * here; the descriptions of its calls below are assumptions.
   *
   * @param sWord    bytes of the word to split (platform-encoded)
   * @param sWordRet output buffer for the stem
   * @param sPostfix output buffer for the postfix
   * @return always {@code true}
   */
  public static boolean PostfixSplit(byte[] sWord, byte[] sWordRet, byte[] sPostfix) {
    byte[] sSinglePostfix = POSTFIX_SINGLE.getBytes();
    // The [9] rows are placeholders; each row is replaced by the entry's bytes.
    byte[][] sMultiPostfix = new byte[POSTFIX_MUTIPLE.length][9];
    for (int i = 0; i < sMultiPostfix.length; i++)
      sMultiPostfix[i] = POSTFIX_MUTIPLE[i].getBytes();
    int nPostfixLen = 0, nWordLen = sWord.length;
    int i = 0;

    // Try to get the postfix of an address: walk the table while the word's
    // tail (presumably what bytesCopy extracts) and the candidate postfix make
    // strncmp return false.
    while (sMultiPostfix[i][0] != 0
        && strncmp(GFCommon.bytesCopy(sWord, nWordLen - sMultiPostfix[i].length, sWord.length - nWordLen
            + sMultiPostfix[i].length), 0, sMultiPostfix[i], sMultiPostfix[i].length) == false) {
      i++;
    }
    // NOTE(review): the last argument is the number of table rows, not the
    // length of the selected postfix -- confirm against bytesCopy's contract.
    GFCommon.bytesCopy(sPostfix, sMultiPostfix[i], 0, sMultiPostfix.length);
    nPostfixLen = sMultiPostfix[i].length;// Get the length of place postfix

    if (nPostfixLen == 0) {
      // No multi-character postfix: test the last two bytes of the word
      // against the single-character postfix table.
      sPostfix[2] = 0;
      strncpy(sPostfix, GFCommon.bytesCopy(sWord, nWordLen - 2, 2), 2);
      if (CC_Find(sSinglePostfix, sPostfix))
        nPostfixLen = 2;
    }

    // Get the place name with the postfix erased.
    strncpy(sWordRet, sWord, nWordLen - nPostfixLen);
    sWordRet[nWordLen - nPostfixLen] = 0;
    sPostfix[nPostfixLen] = 0;
    return true;
  }

  /**
   * �Ƚϵڶ����ֽ������Ƿ��ڵ�һ���г���
   *
   * @param b1
   * @param b2
   * @return ���ص�һ�γ�����λ�á����û�г��֣��򷵻أ�1
   */
  public static int strstr(byte[] b1, byte[] b2) {
    boolean flag = true;
    if (b1 != null && b2 != null) {
      for (int i = 0; i < b1.length; i++) {
        if (b1[i] != b2[0])
          continue;
        else {
          if (b1.length - i >= b2.length) {
            for (int j = 0; j < b2.length; j++) {
              if (b2[j] != b1[i + j]) {
                flag = false;
                break;
              }
            }

            if (flag) {
              return i;
            }
          }
        }
      }
    }

    return -1;
  }

  public static int strchr(byte[] bs, byte b) {
    if (bs != null) {
      for (int i = 0; i < bs.length; i++) {
        if (bs[i] == b)
          return i;
      }

    }

    return -1;
  }

  /**
   * �Ƚ������ֽ�����ǰlen���ֽ��Ƿ����
   *
   * @param b1
   * @param b2
   * @param len
   * @return
   */
  public static boolean strncmp(byte[] b1, int startIndex, byte[] b2, int len) {
    if (b1 != null && b2 != null && len > 0) {
      if (b1.length >= len && b2.length >= len) {
        for (int i = startIndex; i < len; i++) {
          if (b1[i] != b2[i])
            return true;
        }
      }
    }

    return false;
  }

  public static int getUnsigned(byte b) {
    if (b > 0)
      return (int) b;
    else
      return (b & 0x7F)|0x80;
  }

  public static void strncpy(byte[] dest, byte[] src, int len) {
    if (dest != null && src != null) {
      if (dest.length >= len && len <= src.length) {
        for (int i = 0; i < len; i++)
          dest[i] = src[i];
      }
    }
  }

  /**
   * ������6768��λ���ж�Ӧ��ID��
   */
  public static int GB_ID(String str) {
    int result = -1;
    if (str != null && str.length() > 0) {
      byte[] b = str.getBytes();
      result = (getUnsigned(b[0]) - 176) * 94 + (getUnsigned(b[1]) - 161);
    }
    return result;
  }

  /**
   * The first char computed by the Chinese Char ID
   *
   * @param id
   * @return
   */
  public static int CC_CHAR1(int id) {
    return (id) / 94 + 176;
  }

  /**
   * The second char computed by the Chinese Char ID
   *
   * @param id
   * @return
   */
  public static int CC_CHAR2(int id) {
    return (id) % 94 + 161;
  }

  public static int strcat(byte[] dest, byte[] src, int len) {
    if (dest != null && src != null && len > 0) {

      for (int i = 0; i < dest.length; i++) {
        if (dest[i] == 0) {
          for (int j = 0; j < len; j++)
            dest[i] = src[j];
          return i;
        }
      }

    }

    return -1;
  }

  public static int strcpy(byte[] dest, byte[] src) {
    return strcpy(dest, src, src.length);
  }

  public static int strcpy(byte[] dest, byte[] src, int len) {
    if (dest != null && src != null && len > 0) {
      int i = 0;
      for (i = 0; i < len; i++) {
        dest[i] = src[i];

      }
      return i;
    }

    return -1;
  }

  /**
   * ����ID�ŵõ���Ӧ��GB����
   *
   * @param id
   *            0--6767
   * @return
   */
  public static String getGBWord(int id) {
    String result = null;

    if (id >= 0 && id < 6768) {
      byte[] b = new byte[2];
      b[0] = (byte) CC_CHAR1(id);
      b[1] = (byte) CC_CHAR2(id);
      try {
        result = new String(b, "GBK");
      } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
      }
    }
    return result;
  }

  public static boolean isSingle(String s) {
    if (s != null && s.getBytes().length == 1)
      return true;
    else
      return false;
  }

  public static int[] removeInvalid(int[] src) {
    int[] result = null;
    int count = 0;
    if (src != null && src.length > 0) {
      for (int i = 0; i < src.length; i++) {
        if (i != 0 && src[i] == 0)
          break;
        else
          count++;
      }

      result = new int[count];
      for (int i = 0; i < count; i++)
        result[i] = src[i];
    }

    return result;
  }

  /**
   * �ж��ַ����Ƿ������
   *
   * @param str
   * @return
   */
  public static boolean isYearTime(String snum) {
    if (snum != null) {
      int len = snum.length();
      String first = snum.substring(0, 1);

      // 1992��, 98��,06��
      if (isAllSingleByte(snum)
          && (len == 4 || len == 2 && (GFString.cint(first) > 4 || GFString.cint(first) == 0)))
        return true;
      if (isAllNum(snum) && (len >= 6 || len == 4 && "������������".indexOf(first) != -1))
        return true;
      if (getCharCount("���һ�����������߰˾�Ҽ��������½��ƾ�", snum) == len && len >= 2)
        return true;
      if (len == 4 && getCharCount("ǧǪ���", snum) == 2)// ��Ǫ�����
        return true;
      if (len == 1 && getCharCount("ǧǪ", snum) == 1)
        return true;
      if (len == 2 && getCharCount("���ұ����켺�����ɹ�", snum) == 1
          && getCharCount("�ӳ���î������δ�����纥", snum.substring(1)) == 1)
        return true;
    }
    return false;
  }

  /**
   * �ж�һ���ַ����������ַ��Ƿ�����һ���ַ���������
   *
   * @param aggr
   *            �ַ�������
   * @param str
   *            ��Ҫ�жϵ��ַ���
   * @return
   */
  public static boolean isInAggregate(String aggr, String str) {
    if (aggr != null && str != null) {
      str += "1";
      for (int i = 0; i < str.length(); i++) {
        String s = str.substring(i, i + 1);
        if (aggr.indexOf(s) == -1)
          return false;
      }
      return true;
    }

    return false;
  }

  /**
   * �жϸ��ַ����Ƿ��ǰ���ַ�
   *
   * @param str
   * @return
   */
  public static boolean isDBCCase(String str) {
    if (str != null) {
      str += " ";
      for (int i = 0; i < str.length(); i++) {
        String s = str.substring(i, i + 1);
        if (s.getBytes().length != 1)
          return false;
      }

      return true;
    }

    return false;
  }

  /**
   * �жϸ��ַ����Ƿ���ȫ���ַ�
   *
   * @param str
   * @return
   */
  public static boolean isSBCCase(String str) {
    if (str != null) {
      str += " ";
      for (int i = 0; i < str.length(); i++) {
        String s = str.substring(i, i + 1);
        if (s.getBytes().length != 2)
          return false;
      }

      return true;
    }

    return false;
  }

  /**
   * �ж��Ƿ���һ�����ַ����ָ�����
   *
   * @param str
   * @return
   */
  public static boolean isDelimiter(String str) {
    if (str != null && ("-".equals(str) || "��".equals(str)))
      return true;
    else
      return false;
  }

  public static boolean isUnknownWord(String word) {
    if (word != null && word.indexOf("δ##") == 0)
      return true;
    else
      return false;
  }

  /**
   * Splits a candidate Chinese person name of 2 to 4 characters into a surname
   * and a given name, using the tagger's unknown-word dictionary.
   *
   * NOTE(review): semantics of {@code Dictionary.isExist} / {@code getFreq}
   * and of their numeric handle arguments (1, 2) are not visible here.
   *
   * @param personTagger tagger that supplies the person dictionary
   * @param word         the candidate name
   * @param index        passed through to the dictionary lookups
   * @return a PersonName holding the surname (possibly null) as first name and
   *         the given name as last name, or null if an argument is null or the
   *         word is rejected
   */
  public static PersonName chineseNameSplit( PosTagger personTagger,String word, int index) {
    PersonName result = null;

    if ( word != null && personTagger != null) {
      Dictionary personDict = personTagger.getUnknownDict();
      int len = word.length();
      // Only words of 2..4 characters are considered.
      if (len < 2 || len > 4)
        return null;
      // Every atom must classify as Chinese or "other".
      String[] atoms = GFString.atomSplit(word);
      for (String s : atoms) {
        if (Utility.charType(s) != Utility.CT_CHINESE && Utility.charType(s) != Utility.CT_OTHER)
          return null;
      }

      // Try a two-character surname first, then a one-character surname.
      String surName = null;
      int surNameLen = 2;
      if (len > 2)
        surName = word.substring(0, surNameLen);
      else if (len == 2)
        surName = word;
      if (!personDict.isExist( surName, 1,index)) {
        surNameLen = 1;
        if (len > 1)
          surName = word.substring(0, surNameLen);
        else if (len == 1) // unreachable: len is at least 2 here
          surName = word;
        if (!personDict.isExist( surName, 1,index)) {
          // No known surname at all.
          surName = null;
          surNameLen = 0;
        }
      }
      String giveName = word.substring(surNameLen);
      if (len > 3) {
        // For 4-character words, drop one more leading character from the
        // given name when that character is itself found by isExist.
        String temp = word.substring(surNameLen, surNameLen + 1);
        if (personDict.isExist( temp, 1,index)) {

          giveName = word.substring(surNameLen + 1);
        }
      }

      // NOTE(review): surName can be null here -- confirm getFreq accepts null.
      double freq = personDict.getFreq( surName, 1,index);
      // NOTE(review): giveName is empty when a 2-character word is entirely a
      // surname, and substring(0, 1) then throws -- confirm.
      String temp = giveName.substring(0, 1);
      double freq2 = personDict.getFreq( temp, 2,index);

      // Rejection rule; applies only when the surname is not two characters long.
      // NOTE(review): the string literals appear mis-encoded in this copy.
      if (surNameLen != 2
          && ((surNameLen == 0 && len > 2) || giveName.length() > 2 || getForeignCharCount(word) >= 3
              && freq < personDict.getFreq( "��", 1,index) / 40 && freq2 < personDict.getFreq( "��", 2,index) / 20 || (freq < 10 && getForeignCharCount(giveName) == (len - surNameLen) / 2)))
        return null;
      // A two-character word that the tagger reports as a given name is rejected.
      if (len == 2 && personTagger.isGivenName(word))
        return null;
      result = new PersonName();
      result.setFirstName(surName);
      result.setLastName(giveName);
    }
    return result;

  }

  public static String int2pos(int pos) {
    String result = "";

    int tag = Math.abs(pos);
    for (int k = 0; k < 4; k++) {
      int value = tag & 0xFF;
      if (value > 0)
        result = (char) value + result;
      tag = tag >> 8;
    }

    return result;
  }

  /**
   * ȡ�ú�����GBK�����е�ID�ţ��������š� ID�ŵı��Ź����ǣ�
   * GBK����ĺ��ֹ��ֳ����飬��21998����GB2312������еij�����Ϊ��һ�飬
   * 0xB0A1--0xF7FE,���ֺ�CC_ID�м����һ�£��ڶ���Ϊ
   * 0x8140--0xA0FE��������Ϊ0xA140--0xFEA0;���Ŀ�Ϊ���ֺ�26
   * ��Ӣ����ĸ�����ִ�Сд����21998-22034����0��1��...x��y��z��
   *
   * @param word
   * @return
   */
  public final static int GBK_ID(String word) {
    int result = -1;
    if (word != null && word.length() > 0) {
      final int height1 = 72, width1 = 94;
      final int count1 = height1 * width1;
      final int height2 = 32, width2 = 191;
      final int count2 = height2 * width2;
      final int width3 = 97; // height3=94
      final int count4 = 21998;
     
      //System.out.println("GBK_ID:"+word);
      word = word.length() > 1 ? word.substring(0, 1) : word;
      byte[] bs = word.getBytes();
      if (bs.length == 2) {
        result = 0;
        int i1 = getUnsigned(bs[0]);
        int i2 = getUnsigned(bs[1]);
        // ��һ��
        if (i1 >= 0xB0 && i1 <= 0xF7 && i2 >= 0xA1 && i2 <= 0xFE) {
          result = (i1 - 0xB0) * width1 + (i2 - 0xA1);
        }
        // �ڶ���
        else if (i1 >= 0x81 && i1 <= 0xA0 && i2 >= 0x40 && i2 <= 0xFE) {
          result += height1 * width1;
          result += (i1 - 0x81) * width2 + (i2 - 0x40);
        }
        // ������
        else if (i1 >= 0xA1 && i1 <= 0xFE && i2 >= 0x40 && i2 <= 0xA0) {
          result += count1 + count2;
          result += (i1 - 0xA1) * width3 + (i2 - 0x40);
        }
      } else if (bs.length == 1) {
        // is number
        if (bs[0] >= 48 && bs[0] <= 57) {
          result = count4 + bs[0] - 48;
        }
        // is lower number
        else if (bs[0] >= 97 && bs[0] <= 122) {
          result = count4 + bs[0] - 97 + 10;
        }
      }
    }
    return result;
  }

  /**
   * ����GBK_ID�õ���Ӧ��GBK����
   *
   * @param gbk_id
   * @return
   */
  public static String getGBKWord(int gbk_id) {
    String result = null;
    final int height1 = 72, width1 = 94;
    final int count1 = height1 * width1;
    final int height2 = 32, width2 = 191;
    final int count2 = count1 + height2 * width2;
    final int height3 = 94, width3 = 97; //
    final int count3 = count2 + height3 * width3;
    final int count4 = 21998;
    byte[] bs = new byte[2];

    if (gbk_id >= 0 && gbk_id < count1) {
      bs[0] = (byte) (gbk_id / width1 + 0xB0);
      bs[1] = (byte) (gbk_id % width1 + 0xA1);
    } else if (gbk_id >= count1 && gbk_id < count2) {
      gbk_id -= count1;
      bs[0] = (byte) (gbk_id / width2 + 0x81);
      bs[1] = (byte) (gbk_id % width2 + 0x40);
    } else if (gbk_id >= count2 && gbk_id < count3) {
      gbk_id -= count2;
      bs[0] = (byte) (gbk_id / width3 + 0xA1);
      bs[1] = (byte) (gbk_id % width3 + 0x40);
    } else if (gbk_id >= count4 && gbk_id <= count4 + 36) {
      bs = new byte[1];
      if (gbk_id >= count4 && gbk_id < count4 + 10)
        bs[0] = (byte) (gbk_id - count4 + 48);
      else
        bs[0] = (byte) (gbk_id - count4 - 10 + 97);
    } else
      return null;
    result = new String(bs);
    return result;
  }

  /**
   * ���ֵ�˳��������ַ������бȽ�
   *
   * @param s1
   * @param s2
   * @return
   */
  public static int compareTo(String s1, String s2) {
    if (s1 == null && s2 == null)
      return 0;
    else if (s1 != null && s2 == null)
      return 1;
    else if (s1 == null && s2 != null)
      return -1;
    else {
      int len = Math.min(s1.length(), s2.length());
      for (int i = 0; i < len; i++) {
        String id1 = s1.substring(i, i + 1);
        String id2 = s2.substring(i, i + 1);
        if (id1.equals(id2))
          continue;

        int rs =  GBK_ID(id1) - GBK_ID(id2);

        if (rs != 0)
          return rs;
      }

      if (s1.length() > s2.length())
        return 1;
      else if (s1.length() < s2.length())
        return -1;
      else
        return 0;
    }

  }

}

TOP

Related Classes of org.ictclas4j.util.Utility

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.