package org.ictclas4j.util;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import org.ictclas4j.bean.Dictionary;
import org.ictclas4j.bean.PersonName;
import org.ictclas4j.segment.PosTagger;
import com.gftech.util.GFCommon;
import com.gftech.util.GFString;
public class Utility {
// GB2312 defines 6768 Chinese-character positions (the few unassigned positions in the middle are counted too)
public static final int GB_NUM = 6768;
// GBK defines 21998 Chinese-character positions (the few unassigned positions in the middle are counted too)
public static final int GBK_NUM = 21998;
// Extended GBK ID space: the GBK characters plus 10 digits and 26 English letters
public static final int GBK_NUM_EXT = GBK_NUM + 36;
// The number of Chinese Char,including 5 empty position between 3756-3761
public static final int WORD_MAXLENGTH = 100;
public static final int WT_DELIMITER = 0;
public static final int WT_CHINESE = 1;
public static final int WT_OTHER = 2;
public static final int CT_SENTENCE_BEGIN = 1;// Sentence begin
public static final int CT_SENTENCE_END = 4;// Sentence ending
public static final int CT_SINGLE = 5;// SINGLE byte
public static final int CT_DELIMITER = CT_SINGLE + 1;// delimiter
public static final int CT_CHINESE = CT_SINGLE + 2;// Chinese Char
public static final int CT_LETTER = CT_SINGLE + 3;// HanYu Pinyin
public static final int CT_NUM = CT_SINGLE + 4;// HanYu Pinyin
public static final int CT_INDEX = CT_SINGLE + 5;// HanYu Pinyin
public static final int CT_OTHER = CT_SINGLE + 12;// Other
public static final int MAX_WORDS = 650;
public static final int MAX_SEGMENT_NUM = 10;
public static final String POSTFIX_SINGLE = "�Ӱ���dzش嵥�����̵궴�ɶӷ��帮�Ը۸������źӺ������������ǽ־����ӿڿ�����¥·������Ū����������������Ȫ��ɽʡ��ˮ����̨̲̳����ͤ��������ϪϿ������������ҤӪ����ԷԺբկվ����ׯ�������";
public static final String[] POSTFIX_MUTIPLE = { "�뵺", "��ԭ", "����", "���", "��", "����", "����", "�۹�", "�ɲ�", "�ۿ�",
"���ٹ�·", "��ԭ", "��·", "��", "����", "�ȵ�", "�㳡", "����", "��Ͽ", "��ͬ", "����", "����", "����", "�ֵ�", "�ڰ�", "��ͷ", "ú��",
"����", "ũ��", "���", "ƽԭ", "����", "Ⱥ��", "ɳĮ", "ɳ��", "ɽ��", "ɽ��", "ˮ��", "���", "����", "��·", "�´�", "ѩ��", "�γ�", "�κ�",
"�泡", "ֱϽ��", "������", "������", "������", "" };
public static final String TRANS_ENGLISH = "�������������������°İʰŰͰװݰ������������ȱϱ˱��������������Ųɲֲ����Ĵȴδ����������������µõĵǵϵҵٵ۶����Ŷض����������������Ʒҷѷ�������ǸɸԸ���������ŹϹ��������������ϺӺպ����������������Ӽּ��ܽþӾ��������������¿ƿɿ˿Ͽ����������������������������������������������������������¡¬²³·��������������������éï÷����������������ĦĪīĬķľ������������������������ŦŬŵŷ��������������Ƥƽ��������ǡǿ��������Ȫ��������������������ɣɪɭɯɳɽ������ʥʩʫʯʲʷʿ��˹˾˿��������̩̹����������͡ͼ������������������Τάκ��������������������ϣϲ������Ъл������������ҢҶ��������������ӢӺ����Լ������ղ������������������٤��������üν�����������Ľ����������������ɺ����ѷ��������ܽ���������������";
public static final String TRANS_RUSSIAN = "�������°ͱȱ˲�����Ĵ�µö��Ŷ���������Ǹ�����Ӽ�ݽƿɿ˿���������������������¬³������÷����ķ������ŵ������������������ɫɽ��ʲ˹����̹������ά������ϣл��ҮҶ�������������ǵٸ�����ջ������������������������������ɣɳ��̩ͼ��������";
public static final String TRANS_JAPANESE = "���°˰װٰ�������ȱ��������ʲ˲ֲ������سന�����δ����������µص�ɶ������縣�Ը߹����Źȹع���úƺͺϺӺں���Ļ漪�ͼѼӼ�������������������þƾտ����ɿ˿�����������������������������¡¹������������ľ��������������Ƭƽ����ǧǰdz����������������Ȫ������������ɭɴɼɽ��������ʥʯʵʸ������ˮ˳˾��̩��������������βδ����������ϸ������СТ����������������������ңҰҲҶһ����������ӣ��������������ԨԪԫԭԶ����������������լ����������ֲ֦֪֮����������������������ܥݶ��������";
// Translation type
public static final int TT_ENGLISH = 0;
public static final int TT_RUSSIAN = 1;
public static final int TT_JAPANESE = 2;
// Seperator type
public static final String SEPERATOR_C_SENTENCE = "������������";
public static final String SEPERATOR_C_SUB_SENTENCE = "����������������";
public static final String SEPERATOR_E_SENTENCE = "!?:;";
public static final String SEPERATOR_E_SUB_SENTENCE = ",()\"'";
public static final String SEPERATOR_LINK = "\n\r ��";
// Sentence begin and ending string
public static final String SENTENCE_BEGIN = "ʼ##ʼ";
public static final String SENTENCE_END = "ĩ##ĩ";
// Seperator between two words
public static final String WORD_SEGMENTER = "@";
public static final int MAX_WORDS_PER_SENTENCE = 120;
public static final int MAX_UNKNOWN_PER_SENTENCE = 200;
public static final int MAX_POS_PER_WORD = 20;
public static final int LITTLE_FREQUENCY = 6;
public enum TAG_TYPE {
TT_NORMAL, TT_PERSON, TT_PLACE, TT_TRANS_PERSON
};
public static final int MAX_FREQUENCE = 2079997;// 7528283+329805
// //1993123+86874
public static final int MAX_SENTENCE_LEN = 2000;
public static final double INFINITE_VALUE = 10000.00;
// Smoothing parameter
public static final double SMOOTH_PARAM = 0.1;
public static final String UNKNOWN_PERSON = "δ##��";
public static final String UNKNOWN_SPACE = "δ##��";
public static final String UNKNOWN_NUM = "δ##��";
public static final String UNKNOWN_TIME = "δ##ʱ";
public static final String UNKNOWN_LETTER = "δ##��";
/**
 * Writes one line for every byte pair (i, j) with 161 &lt;= i, j &lt; 255 to the
 * given file. Each line has the form {@code <i><j>,<i>,<j>} using decimal values.
 *
 * @param fileName path of the file to create or overwrite
 * @return {@code true} if every line was written; {@code false} if the name is
 *         null, the file cannot be opened or written, or a write error occurred
 */
public static boolean gbGenerate(String fileName) {
    if (fileName == null)
        return false;
    File file = new File(fileName);
    PrintWriter out = null;
    try {
        out = new PrintWriter(new FileOutputStream(file));
        if (!file.canWrite())
            return false;// fail while opening the file
        for (int i = 161; i < 255; i++)
            for (int j = 161; j < 255; j++)
                out.println("" + i + j + "," + i + "," + j);
        // PrintWriter swallows IOExceptions; checkError() flushes and reports them.
        return !out.checkError();
    } catch (FileNotFoundException e) {
        e.printStackTrace();
        return false;
    } finally {
        // Always release the file handle, including on the early return above.
        if (out != null)
            out.close();
    }
}
/**
 * Generates the Chinese character list file: one line for every byte pair
 * (i, j) with 176 &lt;= i &lt; 255 and 161 &lt;= j &lt; 255, formatted as
 * {@code <i><j>,<i>,<j>} using decimal values.
 *
 * @param fileName path of the output file to create or overwrite
 * @return {@code true} if every line was written; {@code false} if the name is
 *         null, the file cannot be opened, or a write error occurred
 */
public static boolean CC_Generate(String fileName) {
    if (fileName == null)
        return false;
    PrintWriter out = null;
    try {
        out = new PrintWriter(new FileOutputStream(new File(fileName)));
        for (int i = 176; i < 255; i++)
            for (int j = 161; j < 255; j++)
                out.println("" + i + j + "," + i + "," + j);
        // PrintWriter swallows IOExceptions; checkError() flushes and reports them.
        return !out.checkError();
    } catch (FileNotFoundException e) {
        e.printStackTrace();
        return false;
    } finally {
        if (out != null)
            out.close();
    }
}
/**
 * Tells whether a double-byte (Chinese) byte sequence occurs in another one
 * on a character boundary, i.e. at an even byte offset.
 * <p>
 * Only even offsets are probed, so a match that straddles two double-byte
 * characters is never reported.
 *
 * @param string     the bytes to search in
 * @param strCharSet the bytes to search for
 * @return {@code true} if {@code strCharSet} occurs in {@code string} at an
 *         even offset; {@code false} otherwise, including when either
 *         argument is null or {@code strCharSet} is empty
 */
public static boolean CC_Find(final byte[] string, final byte[] strCharSet) {
    if (string == null || strCharSet == null || strCharSet.length == 0)
        return false;
    int lastStart = string.length - strCharSet.length;
    for (int start = 0; start <= lastStart; start += 2) {
        int matched = 0;
        while (matched < strCharSet.length && string[start + matched] == strCharSet[matched])
            matched++;
        if (matched == strCharSet.length)
            return true;
    }
    return false;
}
/**
 * Classifies the first character of a string by its GBK byte values.
 * <p>
 * The string is encoded explicitly as GBK (falling back to the platform
 * charset only if GBK is unavailable), because the byte ranges tested below
 * are GBK/GB2312 code ranges and do not hold for other encodings.
 *
 * @param str the string whose first character is classified
 * @return one of the {@code CT_*} constants; {@code CT_OTHER} for null, empty
 *         or unclassified input
 */
public static int charType(String str) {
    if (str == null || str.length() == 0)
        return CT_OTHER;
    byte[] b;
    try {
        b = str.getBytes("GBK");
    } catch (UnsupportedEncodingException e) {
        b = str.getBytes();
    }
    if (b.length == 0)
        return CT_OTHER;
    int ub1 = b[0] & 0xFF;
    int ub2 = b.length > 1 ? (b[1] & 0xFF) : 0;
    if (ub1 < 128) {
        // Single-byte character: either an ASCII delimiter or a plain single byte.
        if ("\"!,.?()[]{}+=".indexOf((char) b[0]) != -1)
            return CT_DELIMITER;
        return CT_SINGLE;
    }
    if (ub1 == 162)
        return CT_INDEX;
    if (ub1 == 163 && ub2 > 175 && ub2 < 186)
        return CT_NUM;
    if (ub1 == 163 && ((ub2 >= 193 && ub2 <= 218) || (ub2 >= 225 && ub2 <= 250)))
        return CT_LETTER;
    if (ub1 == 161 || ub1 == 163)
        return CT_DELIMITER;
    if (ub1 >= 176 && ub1 <= 247)
        return CT_CHINESE;
    return CT_OTHER;
}
/**
 * Measures the longest prefix of a byte sequence that consists of
 * double-byte characters whose lead byte lies in 176..247.
 * <p>
 * The position advances two bytes at a time, so the result can exceed the
 * array length by one when the last lead byte has no trail byte.
 *
 * @param sSentence the sentence bytes
 * @return the byte offset at which the prefix ends
 */
public static int getCCPrefix(byte[] sSentence) {
    final int total = sSentence.length;
    int pos;
    for (pos = 0; pos < total; pos += 2) {
        int lead = getUnsigned(sSentence[pos]);
        if (lead <= 175 || lead >= 248)
            break;
    }
    return pos;
}
/**
 * Tells whether a string consists only of double-byte characters whose GBK
 * lead byte lies in 176..247.
 * <p>
 * The string is encoded explicitly as GBK (falling back to the platform
 * charset only if GBK is unavailable). Any single-byte character, or any
 * double-byte character outside that lead-byte range, makes the result
 * {@code false}.
 *
 * @param str the string to test
 * @return {@code true} if every character qualifies (also for the empty
 *         string); {@code false} otherwise or when {@code str} is null
 */
public static boolean isAllChinese(String str) {
    if (str == null)
        return false;
    byte[] b;
    try {
        b = str.getBytes("GBK");
    } catch (UnsupportedEncodingException e) {
        b = str.getBytes();
    }
    int i = 0;
    // Walk the bytes one double-byte character at a time.
    while (i < b.length - 1) {
        int lead = b[i] & 0xFF;
        if (lead <= 175 || lead >= 248)
            break;
        i += 2;
    }
    return i >= b.length;
}
/**
 * Tells whether a byte sequence contains no character whose lead byte lies
 * in 176..247.
 * <p>
 * Bytes with the high bit set are treated as the first byte of a two-byte
 * character and the following byte is skipped.
 *
 * @param sString the bytes to scan
 * @return {@code false} as soon as a lead byte in 176..247 is found;
 *         {@code true} otherwise
 */
public static boolean isAllNonChinese(byte[] sString) {
    final int total = sString.length;
    int pos = 0;
    while (pos < total) {
        final byte current = sString[pos];
        final int lead = getUnsigned(current);
        if (lead > 175 && lead < 248)
            return false;
        pos += (current < 0) ? 2 : 1;
    }
    return true;
}
/**
 * Tells whether a string consists only of single-byte (7-bit ASCII)
 * characters.
 * <p>
 * The check is done on character codes rather than on encoded bytes: a Java
 * {@code byte} is never {@code >= 128}, so a signed byte comparison against
 * 128 can never detect a multi-byte character.
 *
 * @param str the string to test
 * @return {@code true} if every character code is below 128 (also for the
 *         empty string); {@code false} otherwise or when {@code str} is null
 */
public static boolean isAllSingleByte(String str) {
    if (str == null)
        return false;
    for (int i = 0; i < str.length(); i++) {
        if (str.charAt(i) >= 128)
            return false;
    }
    return true;
}
/***************************************************************************
*
* Func Name : IsAllNum
*
* Description: Judge the string is all made up of Num Char
*
*
* Parameters : sSentence: the original sentence which includes Chinese or
* Non-Chinese char
*
* Returns : the end of the sub-sentence Author : Kevin Zhang History :
* 1.create 2002-1-24
**************************************************************************/
public static boolean isAllNum(String str) {
if (str != null) {
int i = 0;
String temp = str + " ";
// �жϿ�ͷ�Ƿ���+-֮��ķ���
if ("��+��-��".indexOf(temp.substring(0, 1)) != -1)
i++;
/** �����ȫ�ǵģ������������������� �ַ�* */
while (i < str.length() && "��������������������".indexOf(str.substring(i, i + 1)) != -1)
i++;
// Get middle delimiter such as .
if (i < str.length()) {
String s = str.substring(i, i + 1);
if ("�����".indexOf(s) != -1 || ".".equals(s) || "/".equals(s)) {// 98��1��
i++;
while (i + 1 < str.length() && "��������������������".indexOf(str.substring(i + 1, i + 2)) != -1)
i++;
}
}
if (i >= str.length())
return true;
while (i < str.length() && GFString.cint(str.substring(i, i + 1)) >= 0
&& GFString.cint(str.substring(i, i + 1)) <= 9)
i++;
// Get middle delimiter such as .
if (i < str.length()) {
String s = str.substring(i, i + 1);
if ("�����".indexOf(s) != -1 || ".".equals(s) || "/".equals(s)) {// 98��1��
i++;
while (i + 1 < str.length() && "0123456789".indexOf(str.substring(i + 1, i + 2)) != -1)
i++;
}
}
if (i < str.length()) {
if ("��ǧ���ڰ�Ǫ����".indexOf(str.substring(i, i + 1)) == -1 && !"%".equals(str.substring(i, i + 1)))
i--;
}
if (i >= str.length())
return true;
}
return false;
}
/**
 * Tells whether a byte sequence consists of double-byte characters whose
 * lead byte is 162, optionally followed by single-byte ASCII letters.
 *
 * @param sString the bytes to test
 * @return {@code true} if the whole sequence matches that shape (also for an
 *         empty array); {@code false} otherwise or when {@code sString} is null
 */
public static boolean isAllIndex(byte[] sString) {
    if (sString == null)
        return false;
    int nLen = sString.length;
    int i = 0;
    while (i < nLen - 1 && (sString[i] & 0xFF) == 162) {
        i += 2;
    }
    if (i >= nLen)
        return true;
    // The bounds check must guard BOTH letter ranges; otherwise sString[i]
    // is read at i == nLen once every remaining byte was a letter.
    while (i < nLen
            && ((sString[i] >= 'A' && sString[i] <= 'Z') || (sString[i] >= 'a' && sString[i] <= 'z'))) {
        i += 1;
    }
    return i >= nLen;
}
/**
 * Tells whether a string consists only of double-byte letters: GBK lead
 * byte 163 with a trail byte in 193..218 or 225..250.
 * <p>
 * The string is encoded explicitly as GBK (falling back to the platform
 * charset only if GBK is unavailable) and the scan is bounded by the byte
 * length, not by the character count.
 *
 * @param str the string to test
 * @return {@code true} if every character is such a letter (also for the
 *         empty string); {@code false} otherwise or when {@code str} is null
 */
public static boolean isAllLetter(String str) {
    if (str == null)
        return false;
    byte[] b;
    try {
        b = str.getBytes("GBK");
    } catch (UnsupportedEncodingException e) {
        b = str.getBytes();
    }
    int nLen = b.length;
    int i = 0;
    while (i < nLen - 1 && (b[i] & 0xFF) == 163) {
        int trail = b[i + 1] & 0xFF;
        boolean letter = (trail >= 193 && trail <= 218) || (trail >= 225 && trail <= 250);
        if (!letter)
            break;
        i += 2;
    }
    return i >= nLen;
}
/**
 * Tells whether a byte sequence consists only of double-byte characters
 * whose lead byte is 161 or 163.
 *
 * @param sString the bytes to test
 * @return {@code true} if the scan consumes the whole array (also for an
 *         empty array); {@code false} otherwise
 */
public static boolean isAllDelimiter(byte[] sString) {
    final int total = sString.length;
    int pos = 0;
    for (; pos < total - 1; pos += 2) {
        int lead = getUnsigned(sString[pos]);
        if (lead != 161 && lead != 163)
            break;
    }
    return pos >= total;
}
/**
 * Looks up a value in an ascending int table by binary search.
 *
 * @param val   the value to find
 * @param table the table to search; may be null
 * @return the index of an element equal to {@code val}, or -1 if there is
 *         none or {@code table} is null
 */
public static int binarySearch(int val, int[] table) {
    if (table == null)
        return -1;
    int lo = 0;
    int hi = table.length - 1;
    while (lo <= hi) {
        int probe = (lo + hi) / 2;
        int current = table[probe];
        if (current == val)
            return probe;
        if (current < val)
            lo = probe + 1;
        else
            hi = probe - 1;
    }
    return -1;
}
/**
 * Decides whether a word is treated as a foreign (transliterated) word.
 * <p>
 * A word qualifies when it is longer than two characters, or when the
 * count reported by {@code getForeignCharCount} is at least half its length
 * (integer division).
 *
 * @param word the word to test
 * @return {@code true} if the word qualifies; {@code false} otherwise or
 *         when {@code word} is null
 */
public static boolean isForeign(String word) {
    if (word == null)
        return false;
    final int foreign = getForeignCharCount(word);
    final int total = word.length();
    return total > 2 || foreign >= 1 * total / 2;
}
/**
 * Decides whether every character of a word is a transliteration character.
 * <p>
 * {@code getForeignCharCount} counts characters, and {@code String.length()}
 * is also a character count, so the two are compared directly. Doubling the
 * count only makes sense for byte lengths of double-byte text.
 *
 * @param sWord the word to test
 * @return {@code true} if the foreign-character count equals the word
 *         length; {@code false} otherwise or when {@code sWord} is null
 */
public static boolean isAllForeign(String sWord) {
    if (sWord == null)
        return false;
    return getForeignCharCount(sWord) == sWord.length();
}
/***************************************************************************
*
* Func Name : IsForeign
*
* Description: Decide whether the word is Chinese Num word
*
* Parameters : sWord: the word
*
* Returns : the index value Author : Kevin Zhang History : 1.create
* 2002-1-26
**************************************************************************/
public static boolean isAllChineseNum(String word) {// �ٷ�֮������������ϰ˵�ʮ�˷���
String chineseNum = "���һ�������������߰˾�ʮإ��ǧ����Ҽ��������½��ƾ�ʰ��Ǫ�á�������";//
String prefix = "�������ϳ�";
if (word != null) {
String temp = word + " ";
for (int i = 0; i < word.length(); i++) {
if (temp.indexOf("��֮", i) != -1)// �ٷ�֮��
{
i += 2;
continue;
}
String tchar = temp.substring(i, i + 1);
if (chineseNum.indexOf(tchar) == -1 && (i != 0 || prefix.indexOf(tchar) == -1))
return false;
}
return true;
}
return false;
}
/**
 * Counts the characters of a word found in each transliteration set
 * (English, Japanese, Russian) and returns the largest of the three counts.
 *
 * @param sWord the word to inspect
 * @return the maximum per-set character count
 */
public static int getForeignCharCount(String sWord) {
    int best = getCharCount(TRANS_ENGLISH, sWord);
    best = Math.max(best, getCharCount(TRANS_JAPANESE, sWord));
    best = Math.max(best, getCharCount(TRANS_RUSSIAN, sWord));
    return best;
}
/**
 * Counts how many characters of {@code word} occur in {@code charSet}.
 * Repeated characters in {@code word} are counted each time.
 *
 * @param charSet the set of accepted characters
 * @param word    the word to inspect; may be null
 * @return the number of characters of {@code word} contained in
 *         {@code charSet}; 0 when {@code word} is null
 */
public static int getCharCount(String charSet, String word) {
    if (word == null)
        return 0;
    int hits = 0;
    for (char c : word.toCharArray()) {
        if (charSet.indexOf(c) >= 0)
            hits++;
    }
    return hits;
}
/**
 * Picks the transliteration type whose character set matches the most
 * characters of a word.
 * <p>
 * Ties keep the earlier candidate in the order English, Russian, Japanese,
 * because a later type only wins with a strictly larger count.
 *
 * @param sWord the word to inspect
 * @return {@code TT_ENGLISH}, {@code TT_RUSSIAN} or {@code TT_JAPANESE}
 */
public int GetForeignType(String sWord) {
    final int english = getCharCount(TRANS_ENGLISH, sWord);
    final int russian = getCharCount(TRANS_RUSSIAN, sWord);
    final int japanese = getCharCount(TRANS_JAPANESE, sWord);
    int type = TT_ENGLISH;
    int best = english;
    if (best < russian) {
        best = russian;
        type = TT_RUSSIAN;
    }
    if (best < japanese) {
        type = TT_JAPANESE;
    }
    return type;
}
/**
 * Reads exactly {@code len} bytes from a stream into a new array.
 * <p>
 * If the stream fails or ends early the exception is printed and the array
 * is still returned; bytes that could not be read are left as zero.
 *
 * @param in  the stream to read from; may be null
 * @param len the number of bytes to read
 * @return a new array of length {@code len}, or null when {@code in} is null
 *         or {@code len} is not positive
 */
public static byte[] readBytes(DataInputStream in, int len) {
    if (in == null || len <= 0)
        return null;
    byte[] b = new byte[len];
    try {
        // One bulk read instead of len single-byte calls.
        in.readFully(b);
    } catch (IOException e) {
        e.printStackTrace();
    }
    return b;
}
/**
 * Splits a place-name byte sequence into the name proper and its postfix,
 * writing the two parts into caller-supplied buffers.
 * <p>
 * The multi-character postfixes in POSTFIX_MUTIPLE are tried first; if none
 * is selected, the last two bytes of the word are looked up in
 * POSTFIX_SINGLE through CC_Find.
 * <p>
 * NOTE(review): the buffers are handled C-style (a 0 byte is written after
 * the copied data), so sWordRet and sPostfix presumably must be larger than
 * the data they receive - confirm against callers.
 *
 * @param sWord    the full word bytes
 * @param sWordRet output buffer receiving the word without its postfix
 * @param sPostfix output buffer receiving the postfix
 * @return always true
 */
public static boolean PostfixSplit(byte[] sWord, byte[] sWordRet, byte[] sPostfix) {
// Both postfix tables are encoded with the platform default charset.
byte[] sSinglePostfix = POSTFIX_SINGLE.getBytes();
byte[][] sMultiPostfix = new byte[POSTFIX_MUTIPLE.length][9];
for (int i = 0; i < sMultiPostfix.length; i++)
sMultiPostfix[i] = POSTFIX_MUTIPLE[i].getBytes();
int nPostfixLen = 0, nWordLen = sWord.length;
int i = 0;
// NOTE(review): the last POSTFIX_MUTIPLE entry is "", whose getBytes() is a
// zero-length array, so sMultiPostfix[i][0] cannot serve as a terminator test
// there - reaching that entry would index an empty array.
// NOTE(review): the loop keeps going while strncmp(...) == false; check this
// against strncmp's return convention.
while (sMultiPostfix[i][0] != 0
&& strncmp(GFCommon.bytesCopy(sWord, nWordLen - sMultiPostfix[i].length, sWord.length - nWordLen
+ sMultiPostfix[i].length), 0, sMultiPostfix[i], sMultiPostfix[i].length) == false) {// Try to get the postfix of an address
i++;
}
// NOTE(review): the last argument is the number of postfix entries, not the
// length of the selected postfix - confirm GFCommon.bytesCopy's parameters.
GFCommon.bytesCopy(sPostfix, sMultiPostfix[i], 0, sMultiPostfix.length);
nPostfixLen = sMultiPostfix[i].length;// Get the length of place postfix
if (nPostfixLen == 0) {
// No multi-character postfix selected: try the last two bytes of the word
// as a single-character postfix.
sPostfix[2] = 0;
strncpy(sPostfix, GFCommon.bytesCopy(sWord, nWordLen - 2, 2), 2);
if (CC_Find(sSinglePostfix, sPostfix))
nPostfixLen = 2;
}
strncpy(sWordRet, sWord, nWordLen - nPostfixLen);
sWordRet[nWordLen - nPostfixLen] = 0;// Get the place name with the postfix erased
sPostfix[nPostfixLen] = 0;
return true;
}
/**
 * Finds the first occurrence of one byte array inside another.
 *
 * @param b1 the bytes to search in
 * @param b2 the bytes to search for
 * @return the index in {@code b1} of the first occurrence of {@code b2}, or
 *         -1 if it does not occur, either argument is null, or {@code b2}
 *         is empty
 */
public static int strstr(byte[] b1, byte[] b2) {
    if (b1 == null || b2 == null || b2.length == 0)
        return -1;
    int lastStart = b1.length - b2.length;
    for (int i = 0; i <= lastStart; i++) {
        // Each candidate position is compared from scratch, so a failed
        // partial match cannot hide a later full match.
        int j = 0;
        while (j < b2.length && b1[i + j] == b2[j])
            j++;
        if (j == b2.length)
            return i;
    }
    return -1;
}
/**
 * Finds the first occurrence of a byte in a byte array.
 *
 * @param bs the bytes to search; may be null
 * @param b  the byte to look for
 * @return the index of the first occurrence, or -1 if absent or
 *         {@code bs} is null
 */
public static int strchr(byte[] bs, byte b) {
    if (bs == null)
        return -1;
    int pos = 0;
    while (pos < bs.length && bs[pos] != b)
        pos++;
    return pos < bs.length ? pos : -1;
}
/**
 * Tells whether two byte arrays hold equal bytes at every index from
 * {@code startIndex} (inclusive) to {@code len} (exclusive).
 * <p>
 * The same index is applied to both arrays.
 *
 * @param b1         the first array
 * @param startIndex the first index to compare
 * @param b2         the second array
 * @param len        the exclusive end index of the compared range
 * @return {@code true} if the compared bytes are all equal; {@code false}
 *         if any differ, if either array is null or shorter than
 *         {@code len}, or if {@code len} is not positive
 */
public static boolean strncmp(byte[] b1, int startIndex, byte[] b2, int len) {
    if (b1 == null || b2 == null || len <= 0)
        return false;
    if (b1.length < len || b2.length < len)
        return false;
    for (int i = startIndex; i < len; i++) {
        if (b1[i] != b2[i])
            return false;
    }
    return true;
}
/**
 * Converts a signed byte to its unsigned value.
 *
 * @param b the byte to convert
 * @return the value of {@code b} in the range 0..255
 */
public static int getUnsigned(byte b) {
    // Masking handles every input uniformly, including 0 (which must map to 0).
    return b & 0xFF;
}
/**
 * Copies the first {@code len} bytes of {@code src} to the start of
 * {@code dest}. Nothing is copied when either array is null or shorter
 * than {@code len}.
 *
 * @param dest the destination array
 * @param src  the source array
 * @param len  the number of bytes to copy
 */
public static void strncpy(byte[] dest, byte[] src, int len) {
    if (dest == null || src == null)
        return;
    if (len > dest.length || len > src.length)
        return;
    if (len > 0)
        System.arraycopy(src, 0, dest, 0, len);
}
/**
 * Computes the ID of a Chinese character from its first two GBK bytes as
 * {@code (lead - 176) * 94 + (trail - 161)}.
 * <p>
 * The string is encoded explicitly as GBK (falling back to the platform
 * charset only if GBK is unavailable).
 *
 * @param str the string whose first character is converted
 * @return the computed ID, or -1 when {@code str} is null or empty or
 *         encodes to fewer than two bytes
 */
public static int GB_ID(String str) {
    if (str == null || str.length() == 0)
        return -1;
    byte[] b;
    try {
        b = str.getBytes("GBK");
    } catch (UnsupportedEncodingException e) {
        b = str.getBytes();
    }
    // A single-byte character has no trail byte to index.
    if (b.length < 2)
        return -1;
    return ((b[0] & 0xFF) - 176) * 94 + ((b[1] & 0xFF) - 161);
}
/**
 * Computes the first byte value of a Chinese character from its ID.
 *
 * @param id the Chinese character ID
 * @return {@code id / 94 + 176}
 */
public static int CC_CHAR1(int id) {
    final int row = id / 94;
    return 176 + row;
}
/**
 * Computes the second byte value of a Chinese character from its ID.
 *
 * @param id the Chinese character ID
 * @return {@code id % 94 + 161}
 */
public static int CC_CHAR2(int id) {
    final int column = id % 94;
    return 161 + column;
}
/**
 * Appends the first {@code len} bytes of {@code src} to {@code dest},
 * starting at the first 0 byte found in {@code dest}.
 *
 * @param dest the destination buffer; its content ends at the first 0 byte
 * @param src  the bytes to append
 * @param len  the number of bytes of {@code src} to append
 * @return the index in {@code dest} where the appended bytes start, or -1
 *         when an argument is null, {@code len} is not positive or exceeds
 *         {@code src.length}, {@code dest} has no 0 byte, or the bytes do
 *         not fit
 */
public static int strcat(byte[] dest, byte[] src, int len) {
    if (dest == null || src == null || len <= 0 || len > src.length)
        return -1;
    for (int i = 0; i < dest.length; i++) {
        if (dest[i] == 0) {
            if (i + len > dest.length)
                return -1;
            // Copy to consecutive positions i, i+1, ... rather than
            // overwriting position i repeatedly.
            System.arraycopy(src, 0, dest, i, len);
            return i;
        }
    }
    return -1;
}
/**
 * Copies all bytes of {@code src} to the start of {@code dest}.
 *
 * @param dest the destination array
 * @param src  the source array
 * @return the number of bytes copied, or -1 when either array is null or
 *         {@code src} is empty
 */
public static int strcpy(byte[] dest, byte[] src) {
    // Guard null here too, so both overloads agree on null handling.
    if (src == null)
        return -1;
    return strcpy(dest, src, src.length);
}
/**
 * Copies the first {@code len} bytes of {@code src} to the start of
 * {@code dest}.
 *
 * @param dest the destination array
 * @param src  the source array
 * @param len  the number of bytes to copy
 * @return {@code len} after copying, or -1 when either array is null or
 *         {@code len} is not positive
 * @throws IndexOutOfBoundsException if {@code len} exceeds the length of
 *         either array
 */
public static int strcpy(byte[] dest, byte[] src, int len) {
    if (dest == null || src == null || len <= 0)
        return -1;
    System.arraycopy(src, 0, dest, 0, len);
    return len;
}
/**
 * Returns the character for a GB ID by building its two bytes with
 * CC_CHAR1 / CC_CHAR2 and decoding them as GBK.
 *
 * @param id the character ID, 0..6767
 * @return the decoded one-character string, or null when {@code id} is out
 *         of range or the GBK charset is unavailable
 */
public static String getGBWord(int id) {
    if (id < 0 || id >= 6768)
        return null;
    final byte[] pair = { (byte) CC_CHAR1(id), (byte) CC_CHAR2(id) };
    try {
        return new String(pair, "GBK");
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
        return null;
    }
}
/**
 * Tells whether a string encodes to exactly one byte in the platform
 * default charset.
 *
 * @param s the string to test; may be null
 * @return {@code true} if {@code s} is non-null and encodes to one byte
 */
public static boolean isSingle(String s) {
    return s != null && s.getBytes().length == 1;
}
/**
 * Returns the leading part of an array up to (not including) the first 0
 * found at index 1 or later. The element at index 0 is always kept, even
 * when it is 0.
 *
 * @param src the source array; may be null
 * @return a new array with the kept prefix, or null when {@code src} is
 *         null or empty
 */
public static int[] removeInvalid(int[] src) {
    if (src == null || src.length == 0)
        return null;
    int keep = 1;
    while (keep < src.length && src[keep] != 0)
        keep++;
    int[] trimmed = new int[keep];
    System.arraycopy(src, 0, trimmed, 0, keep);
    return trimmed;
}
/**
* �ж��ַ����Ƿ������
*
* @param str
* @return
*/
public static boolean isYearTime(String snum) {
if (snum != null) {
int len = snum.length();
String first = snum.substring(0, 1);
// 1992��, 98��,06��
if (isAllSingleByte(snum)
&& (len == 4 || len == 2 && (GFString.cint(first) > 4 || GFString.cint(first) == 0)))
return true;
if (isAllNum(snum) && (len >= 6 || len == 4 && "������������".indexOf(first) != -1))
return true;
if (getCharCount("���һ�����������߰˾�Ҽ��������½��ƾ�", snum) == len && len >= 2)
return true;
if (len == 4 && getCharCount("ǧǪ���", snum) == 2)// ��Ǫ�����
return true;
if (len == 1 && getCharCount("ǧǪ", snum) == 1)
return true;
if (len == 2 && getCharCount("���ұ����켺�����ɹ�", snum) == 1
&& getCharCount("�ӳ���î������δ�����纥", snum.substring(1)) == 1)
return true;
}
return false;
}
/**
 * Tells whether every character of a string belongs to a given set of
 * characters.
 *
 * @param aggr the set of accepted characters
 * @param str  the string to test
 * @return {@code true} if every character of {@code str} occurs in
 *         {@code aggr} (also for the empty string); {@code false} otherwise
 *         or when either argument is null
 */
public static boolean isInAggregate(String aggr, String str) {
    if (aggr == null || str == null)
        return false;
    // Only the caller's characters are checked; no padding character is
    // appended, so the result does not depend on aggr containing '1'.
    for (int i = 0; i < str.length(); i++) {
        if (aggr.indexOf(str.charAt(i)) == -1)
            return false;
    }
    return true;
}
/**
 * Tells whether every character of a string is a half-width character,
 * i.e. encodes to exactly one byte in the platform default charset.
 * <p>
 * A space is appended before checking and is checked like any other
 * character.
 *
 * @param str the string to test
 * @return {@code true} if every character encodes to one byte;
 *         {@code false} otherwise or when {@code str} is null
 */
public static boolean isDBCCase(String str) {
    if (str == null)
        return false;
    final String padded = str + " ";
    final int total = padded.length();
    int pos = 0;
    while (pos < total) {
        String single = padded.substring(pos, pos + 1);
        if (single.getBytes().length != 1)
            return false;
        pos++;
    }
    return true;
}
/**
 * Tells whether every character of a string is a full-width character,
 * i.e. encodes to exactly two bytes in GBK.
 * <p>
 * Each character is encoded explicitly as GBK (falling back to the platform
 * charset only if GBK is unavailable). No padding character is appended,
 * because a single-byte pad would fail the two-byte test for every input.
 *
 * @param str the string to test
 * @return {@code true} if every character encodes to two bytes (also for
 *         the empty string); {@code false} otherwise or when {@code str}
 *         is null
 */
public static boolean isSBCCase(String str) {
    if (str == null)
        return false;
    for (int i = 0; i < str.length(); i++) {
        String ch = str.substring(i, i + 1);
        byte[] encoded;
        try {
            encoded = ch.getBytes("GBK");
        } catch (UnsupportedEncodingException e) {
            encoded = ch.getBytes();
        }
        if (encoded.length != 2)
            return false;
    }
    return true;
}
/**
 * Tells whether a string is one of the two hyphen-like delimiter strings
 * recognised here: the ASCII "-" or the second literal below.
 *
 * @param str the string to test; may be null
 * @return {@code true} if {@code str} equals one of the two delimiters
 */
public static boolean isDelimiter(String str) {
    if (str == null)
        return false;
    return "-".equals(str) || "��".equals(str);
}
/**
 * Tells whether a word starts with the marker prefix used by the
 * UNKNOWN_* placeholder words.
 *
 * @param word the word to test; may be null
 * @return {@code true} if {@code word} is non-null and starts with the
 *         marker prefix
 */
public static boolean isUnknownWord(String word) {
    return word != null && word.startsWith("δ##");
}
/**
 * Tries to split a 2-4 character word into a surname and a given name using
 * the tagger's unknown-word dictionary.
 * <p>
 * Returns null when either argument is null, when the word length is outside
 * 2..4, when any atom of the word is neither CT_CHINESE nor CT_OTHER, or when
 * the frequency/foreign-character heuristics below reject the split.
 *
 * @param personTagger tagger supplying the dictionary and the given-name test
 * @param word         the candidate person name
 * @param index        passed through to the dictionary lookups; its meaning
 *                     is not visible here
 * @return a PersonName whose first name is the surname part and whose last
 *         name is the given-name part, or null if the word is rejected
 */
public static PersonName chineseNameSplit( PosTagger personTagger,String word, int index) {
PersonName result = null;
if ( word != null && personTagger != null) {
Dictionary personDict = personTagger.getUnknownDict();
int len = word.length();
if (len < 2 || len > 4)
return null;
// Every atom of the word must be classified as Chinese or "other".
String[] atoms = GFString.atomSplit(word);
for (String s : atoms) {
if (Utility.charType(s) != Utility.CT_CHINESE && Utility.charType(s) != Utility.CT_OTHER)
return null;
}
// First try a two-character surname (the whole word when len == 2).
String surName = null;
int surNameLen = 2;
if (len > 2)
surName = word.substring(0, surNameLen);
else if (len == 2)
surName = word;
if (!personDict.isExist( surName, 1,index)) {
// Fall back to a one-character surname. len is at least 2 here, so
// the len == 1 branch below cannot be taken.
surNameLen = 1;
if (len > 1)
surName = word.substring(0, surNameLen);
else if (len == 1)
surName = word;
if (!personDict.isExist( surName, 1,index)) {
// No surname found at all.
surName = null;
surNameLen = 0;
}
}
String giveName = word.substring(surNameLen);
if (len > 3) {
// For four-character words, skip one more character if it is itself
// found by the same dictionary lookup used for surnames.
String temp = word.substring(surNameLen, surNameLen + 1);
if (personDict.isExist( temp, 1,index)) {
giveName = word.substring(surNameLen + 1);
}
}
double freq = personDict.getFreq( surName, 1,index);
// NOTE(review): when len == 2 and the two-character surname matched,
// giveName is "" and substring(0, 1) throws StringIndexOutOfBoundsException.
String temp = giveName.substring(0, 1);
double freq2 = personDict.getFreq( temp, 2,index);
// Rejection heuristics; they apply only when no two-character surname matched.
if (surNameLen != 2
&& ((surNameLen == 0 && len > 2) || giveName.length() > 2 || getForeignCharCount(word) >= 3
&& freq < personDict.getFreq( "��", 1,index) / 40 && freq2 < personDict.getFreq( "��", 2,index) / 20 || (freq < 10 && getForeignCharCount(giveName) == (len - surNameLen) / 2)))
return null;
if (len == 2 && personTagger.isGivenName(word))
return null;
result = new PersonName();
result.setFirstName(surName);
result.setLastName(giveName);
}
return result;
}
/**
 * Decodes a packed part-of-speech value into its tag string.
 * <p>
 * The absolute value is read as up to four bytes, lowest byte last in the
 * result; zero bytes are skipped.
 *
 * @param pos the packed value (its sign is ignored)
 * @return the decoded tag, possibly empty
 */
public static String int2pos(int pos) {
    StringBuilder text = new StringBuilder();
    int remaining = Math.abs(pos);
    for (int step = 0; step < 4; step++) {
        int low = remaining & 0xFF;
        if (low > 0)
            text.insert(0, (char) low);
        remaining >>= 8;
    }
    return text.toString();
}
/**
 * Returns the sequential ID of the first character of a word in the GBK
 * code table.
 * <p>
 * IDs are assigned in four groups:
 * <ol>
 * <li>0xB0A1-0xF7FE (72 rows of 94): IDs from 0, same numbering as GB_ID;</li>
 * <li>0x8140-0xA0FE (32 rows of 191): IDs following group 1;</li>
 * <li>0xA140-0xFEA0 (94 rows of 97): IDs following group 2;</li>
 * <li>digits '0'-'9' and lower-case letters 'a'-'z': IDs 21998-22033.</li>
 * </ol>
 * The character is encoded explicitly as GBK (falling back to the platform
 * charset only if GBK is unavailable).
 *
 * @param word the word; only its first character is used
 * @return the ID; -1 for null or empty input and for single-byte characters
 *         that are not digits or lower-case letters; 0 for a two-byte
 *         character outside the three ranges above
 */
public final static int GBK_ID(String word) {
    if (word == null || word.length() == 0)
        return -1;
    final int height1 = 72, width1 = 94;
    final int count1 = height1 * width1;
    final int height2 = 32, width2 = 191;
    final int count2 = height2 * width2;
    final int width3 = 97; // height3=94
    final int count4 = 21998;
    String first = word.length() > 1 ? word.substring(0, 1) : word;
    byte[] bs;
    try {
        bs = first.getBytes("GBK");
    } catch (UnsupportedEncodingException e) {
        bs = first.getBytes();
    }
    if (bs.length == 2) {
        int i1 = bs[0] & 0xFF;
        int i2 = bs[1] & 0xFF;
        // Group 1
        if (i1 >= 0xB0 && i1 <= 0xF7 && i2 >= 0xA1 && i2 <= 0xFE)
            return (i1 - 0xB0) * width1 + (i2 - 0xA1);
        // Group 2
        if (i1 >= 0x81 && i1 <= 0xA0 && i2 >= 0x40 && i2 <= 0xFE)
            return count1 + (i1 - 0x81) * width2 + (i2 - 0x40);
        // Group 3
        if (i1 >= 0xA1 && i1 <= 0xFE && i2 >= 0x40 && i2 <= 0xA0)
            return count1 + count2 + (i1 - 0xA1) * width3 + (i2 - 0x40);
        // NOTE(review): a two-byte character outside all three groups yields
        // 0, the same ID as the first group-1 character; kept for
        // compatibility with existing callers.
        return 0;
    }
    if (bs.length == 1) {
        // Digit
        if (bs[0] >= 48 && bs[0] <= 57)
            return count4 + bs[0] - 48;
        // Lower-case letter
        if (bs[0] >= 97 && bs[0] <= 122)
            return count4 + bs[0] - 97 + 10;
    }
    return -1;
}
/**
 * Returns the character for a GBK ID; the inverse of the GBK_ID numbering.
 * <p>
 * IDs 0..21997 map to two-byte GBK characters in three consecutive groups
 * (0xB0A1-0xF7FE, 0x8140-0xA0FE, 0xA140-0xFEA0); IDs 21998..22007 map to
 * '0'-'9' and 22008..22033 to 'a'-'z'. The bytes are decoded explicitly as
 * GBK (falling back to the platform charset only if GBK is unavailable).
 *
 * @param gbk_id the ID to convert
 * @return the one-character string, or null when the ID is out of range
 */
public static String getGBKWord(int gbk_id) {
    final int height1 = 72, width1 = 94;
    final int count1 = height1 * width1;
    final int height2 = 32, width2 = 191;
    final int count2 = count1 + height2 * width2;
    final int height3 = 94, width3 = 97;
    final int count3 = count2 + height3 * width3;
    final int count4 = 21998;
    byte[] bs = new byte[2];
    if (gbk_id >= 0 && gbk_id < count1) {
        bs[0] = (byte) (gbk_id / width1 + 0xB0);
        bs[1] = (byte) (gbk_id % width1 + 0xA1);
    } else if (gbk_id >= count1 && gbk_id < count2) {
        gbk_id -= count1;
        bs[0] = (byte) (gbk_id / width2 + 0x81);
        bs[1] = (byte) (gbk_id % width2 + 0x40);
    } else if (gbk_id >= count2 && gbk_id < count3) {
        gbk_id -= count2;
        bs[0] = (byte) (gbk_id / width3 + 0xA1);
        bs[1] = (byte) (gbk_id % width3 + 0x40);
    } else if (gbk_id >= count4 && gbk_id < count4 + 36) {
        // 10 digits followed by 26 lower-case letters: exactly 36 IDs, so
        // the upper bound is exclusive.
        bs = new byte[1];
        if (gbk_id < count4 + 10)
            bs[0] = (byte) (gbk_id - count4 + 48);
        else
            bs[0] = (byte) (gbk_id - count4 - 10 + 97);
    } else {
        return null;
    }
    try {
        return new String(bs, "GBK");
    } catch (UnsupportedEncodingException e) {
        return new String(bs);
    }
}
/**
 * Compares two strings character by character using GBK_ID ordering.
 * <p>
 * A null string sorts before any non-null string. Within the common
 * length, the first pair of differing characters whose GBK IDs differ
 * decides the result; if none decides, the shorter string sorts first.
 *
 * @param s1 the first string; may be null
 * @param s2 the second string; may be null
 * @return a negative value, zero, or a positive value as {@code s1} sorts
 *         before, equal to, or after {@code s2}
 */
public static int compareTo(String s1, String s2) {
    if (s1 == null)
        return s2 == null ? 0 : -1;
    if (s2 == null)
        return 1;
    final int shared = Math.min(s1.length(), s2.length());
    for (int pos = 0; pos < shared; pos++) {
        String left = s1.substring(pos, pos + 1);
        String right = s2.substring(pos, pos + 1);
        if (!left.equals(right)) {
            int diff = GBK_ID(left) - GBK_ID(right);
            if (diff != 0)
                return diff;
        }
    }
    if (s1.length() == s2.length())
        return 0;
    return s1.length() > s2.length() ? 1 : -1;
}
}