* @return Instance of EncodingInfo encapsulating all encoding-related data.
*/
protected static EncodingInfo getEncodingName(byte[] b4, int count) {
    // Auto-detects the encoding from the first (up to four) bytes of the
    // input. Only the first `count` bytes of `b4` are examined; whenever
    // there are too few bytes to decide, the answer defaults to UTF-8.
    //
    // As used below, the Boolean argument of EncodingInfo is TRUE for
    // big-endian, FALSE for little-endian and null when byte order is
    // unknown or not applicable; the optional trailing `true` is passed
    // only on the branches where a byte order mark (BOM) was matched.
    //
    // Boolean.TRUE / Boolean.FALSE are used instead of the deprecated
    // `new Boolean(...)` constructor to avoid needless allocations.

    if (count < 2) {
        return new EncodingInfo("UTF-8", null);
    }

    // Mask with 0xFF so that bytes are compared as unsigned values.
    final int b0 = b4[0] & 0xFF;
    final int b1 = b4[1] & 0xFF;

    // UTF-16, with BOM
    if (b0 == 0xFE && b1 == 0xFF) {
        // UTF-16, big-endian
        return new EncodingInfo("UTF-16BE", Boolean.TRUE, true);
    }
    if (b0 == 0xFF && b1 == 0xFE) {
        // UTF-16, little-endian
        return new EncodingInfo("UTF-16LE", Boolean.FALSE, true);
    }

    // default to UTF-8 if we don't have enough bytes to make a
    // good determination of the encoding
    if (count < 3) {
        return new EncodingInfo("UTF-8", null);
    }

    // UTF-8 with a BOM
    final int b2 = b4[2] & 0xFF;
    if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
        return new EncodingInfo("UTF-8", null, true);
    }

    // default to UTF-8 if we don't have enough bytes to make a
    // good determination of the encoding
    if (count < 4) {
        return new EncodingInfo("UTF-8", null);
    }

    // Other encodings, no BOM: the patterns below look for '<' (0x3C),
    // optionally followed by '?' (0x3F), in various code-unit widths
    // and byte orders.
    final int b3 = b4[3] & 0xFF;
    if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
        // UCS-4, big endian (1234)
        return new EncodingInfo("ISO-10646-UCS-4", Boolean.TRUE);
    }
    if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
        // UCS-4, little endian (4321)
        return new EncodingInfo("ISO-10646-UCS-4", Boolean.FALSE);
    }
    if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
        // UCS-4, unusual octet order (2143)
        // REVISIT: What should this be? (Currently this would be
        //          an exception :)
        return new EncodingInfo("ISO-10646-UCS-4", null);
    }
    if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
        // UCS-4, unusual octet order (3412)
        // REVISIT: What should this be?
        return new EncodingInfo("ISO-10646-UCS-4", null);
    }
    if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
        // UTF-16, big-endian, no BOM
        // (or could turn out to be UCS-2...)
        // REVISIT: What should this be?
        return new EncodingInfo("UTF-16BE", Boolean.TRUE);
    }
    if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
        // UTF-16, little-endian, no BOM
        // (or could turn out to be UCS-2...)
        return new EncodingInfo("UTF-16LE", Boolean.FALSE);
    }
    if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
        // EBCDIC
        // a la xerces1, return CP037 instead of EBCDIC here
        return new EncodingInfo("CP037", null);
    }

    // default encoding
    return new EncodingInfo("UTF-8", null);
} // END getEncodingName(byte[], int) : EncodingInfo