int ch = 0; //character being extracted
int chBytes; //num bytes consumed by current char (1 - 4)
final StringBuilder tempString = new StringBuilder();
SCRIPT currentScript = SCRIPT.NONE;
boolean inControl = false;
//decode and extract a character
while (curOffset < len) {
// based on "valid UTF-8 byte sequences" in the Unicode 5.0 book
final int curByte = buff[curOffset] & 0xFF; //ensure we are not comparing signed bytes to ints
if (curByte <= 0x7F) {
chBytes = 1;
ch = curByte;
} else if (curByte <= 0xC1) {
break;
} else if (curByte <= 0xDF) {
if (len - curOffset < 2) {
break;
}
final int curByte_1 = buff[curOffset + 1] & 0xFF;
if (curByte_1 >= 0x80 && curByte_1 <= 0xBF) {
chBytes = 2;
ch = (((curByte & 0x1f) << 6) + (curByte_1 & 0x3f));
} else {
break;
}
} else if (curByte == 0xE0) {
if (len - curOffset < 3) {
break;
}
final int curByte_1 = buff[curOffset + 1] & 0xFF;
final int curByte_2 = buff[curOffset + 2] & 0xFF;
if (curByte_1 >= 0xA0 && curByte_1 <= 0xBF
&& curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
chBytes = 3;
ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
} else {
break;
}
} else if (curByte <= 0xEC) {
if (len - curOffset < 3) {
break;
}
final int curByte_1 = buff[curOffset + 1] & 0xFF;
final int curByte_2 = buff[curOffset + 2] & 0xFF;
if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
&& curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
chBytes = 3;
ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
} else {
break;
}
} else if (curByte == 0xED) {
if (len - curOffset < 3) {
break;
}
final int curByte_1 = buff[curOffset + 1] & 0xFF;
final int curByte_2 = buff[curOffset + 2] & 0xFF;
if (curByte_1 >= 0x80 && curByte_1 <= 0x9F
&& curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
chBytes = 3;
ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
} else {
break;
}
} else if (curByte <= 0xEF) {
if (len - curOffset < 3) {
break;
}
final int curByte_1 = buff[curOffset + 1] & 0xFF;
final int curByte_2 = buff[curOffset + 2] & 0xFF;
if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
&& curByte_2 >= 0x80 && curByte_2 <= 0xBF) {
chBytes = 3;
ch = (((curByte & 0x0f) << 12) + ((curByte_1 & 0x3f) << 6) + (curByte_2 & 0x3f));
} else {
break;
}
} else if (curByte == 0xF0) {
if (len - curOffset < 4) {
break;
}
final int curByte_1 = buff[curOffset + 1] & 0xFF;
final int curByte_2 = buff[curOffset + 2] & 0xFF;
final int curByte_3 = buff[curOffset + 3] & 0xFF;
if (curByte_1 >= 0x90 && curByte_1 <= 0xBF
&& curByte_2 >= 0x80 && curByte_2 <= 0xBF
&& curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
chBytes = 4;
ch = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
} else {
break;
}
} else if (curByte <= 0xF3) {
if (len - curOffset < 4) {
break;
}
final int curByte_1 = buff[curOffset + 1] & 0xFF;
final int curByte_2 = buff[curOffset + 2] & 0xFF;
final int curByte_3 = buff[curOffset + 3] & 0xFF;
if (curByte_1 >= 0x80 && curByte_1 <= 0xBF
&& curByte_2 >= 0x80 && curByte_2 <= 0xBF
&& curByte_3 >= 0x80 && curByte_3 <= 0xBF) {
chBytes = 4;
ch = (((curByte & 0x07) << 18) + ((curByte_1 & 0x3f) << 12) + ((curByte_2 & 0x3f) << 6) + (curByte_3 & 0x3f));
} else {
break;
}
} else {
break;
}
curOffset += chBytes;
//stop extracting if the decoded code point is beyond the range covered by the unicode table
if (ch > StringExtractUnicodeTable.UNICODE_TABLE_SIZE - 1) {
break;
}
//look up the script of the decoded character (ch) in the unicode table
SCRIPT scriptFound = unicodeTable.getScript(ch);
if (scriptFound == SCRIPT.NONE) {
break;
}