/*
* EncodingUtil.java
TODO methods for booleans
*
* Copyright (C) 2005-2006 Tommi Laukkanen
* http://www.substanceofcode.com
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*/
// Expand to define test define
@DTESTDEF@
// Expand to define logging define
@DLOGDEF@
package com.substanceofcode.utils;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.Hashtable;
import java.util.Vector;
import com.substanceofcode.utils.CauseException;
//#ifdef DLOGGING
import net.sf.jlogmicro.util.logging.Logger;
import net.sf.jlogmicro.util.logging.Level;
//#endif
/**
* Simple encoding handler to allow handling utf-16 and 1252.
*
* @author Irving Bunton Jr
*/
public class EncodingUtil {
// Flags describing the platform default encoding family, derived from the
// "microedition.encoding" system property.  The encoding name strings are
// probed in the same relative order as before.
final static public boolean m_midpIso =
    midpEncodingStartsWith("iso-8859") ||
    midpEncodingStartsWith("iso8859");
final static public String m_isoEncoding = initIsoEncoding();
final static public boolean m_midpWin =
    midpEncodingStartsWith("cp") ||
    midpEncodingStartsWith("windows");
final static public String m_winEncoding = initWinEncoding();
final static public boolean m_midpUni = midpEncodingStartsWith("utf-8");

/**
 * Test whether the lower-cased "microedition.encoding" system property
 * starts with the given prefix.
 *
 * @param prefix lower case prefix to look for
 * @return true if the property is set and starts with the prefix; false
 *         if it does not, or if the property is not defined at all (so
 *         that class initialization does not fail with a
 *         NullPointerException when the property is missing)
 */
private static boolean midpEncodingStartsWith(final String prefix) {
    final String encoding = System.getProperty("microedition.encoding");
    return (encoding != null) &&
        encoding.toLowerCase().startsWith(prefix);
}
// Entity names for the characters 0xA1 - 0xBF.  Parallel to
// m_isoCommValues: entry i is the name of the character m_isoCommValues[i]
// (the two arrays are paired by initEntVals).
final static String[] m_isoCommonEntities =
{"iexcl", "cent", "pound", "curren", "yen",
"brvbar", "sect", "uml", "copy", "ordf",
"laquo", "not", "shy", "reg", "macr",
"deg", "plusmn", "sup2", "sup3", "acute",
"micro", "para", "middot", "cedil", "sup1",
"ordm", "raquo", "frac14", "frac12", "frac34",
"iquest"};
// Entity names for dashes and quotation marks.  Parallel to
// m_isoSpecialValues (and to the Windows values built in initAlphaCp1252).
final static String[] m_isoSpecialEntities =
{"ndash", // en dash
"mdash", // em dash
"lsquo", // left single quotation mark
"rsquo", // right single quotation mark
"sbquo", // single low-9 quotation mark
"ldquo", // left double quotation mark
"rdquo", // right double quotation mark
"bdquo"}; // double low-9 quotation mark
// Plain ASCII stand-ins used for the special entities in the ISO 8859-1
// table: dashes become '-', single quotes '\'' and double quotes '"'.
final static char[] m_isoSpecialValues =
{'-', // en dash
'-', // em dash
'\'', // left single quotation mark
'\'', // right single quotation mark
'\'', // single low-9 quotation mark
'\"', // left double quotation mark
'\"', // right double quotation mark
'\"'}; // double low-9 quotation mark
// Character values 0xA1 - 0xBF matching m_isoCommonEntities entry for entry.
final static char[] m_isoCommValues =
{0xA1, 0xA2, 0xA3, 0xA4, 0xA5,
0xA6, 0xA7, 0xA8, 0xA9, 0xAA,
0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
0xB0, 0xB1, 0xB2, 0xB3, 0xB4,
0xB5, 0xB6, 0xB7, 0xB8, 0xB9,
0xBA, 0xBB, 0xBC, 0xBD, 0xBE,
0xBF};
// Entity names for the 64 letters and symbols 0xC0 - 0xFF.  The matching
// character values are built locally in initAlphaIso88591 and
// initAlphaCp1252.
final static String[] m_isoLatin1Entities =
{"Agrave", "Aacute", "Acirc", "Atilde", "Auml",
"Aring", "AElig", "Ccedil", "Egrave", "Eacute", "Ecirc", "Euml",
"Igrave", "Iacute", "Icirc", "Iuml", "ETH", "Ntilde", "Ograve",
"Oacute", "Ocirc", "Otilde", "Ouml", "times", "Oslash", "Ugrave",
"Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig", "agrave",
"aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
"egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
"iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
"ouml", "divide", "oslash", "ugrave", "uacute", "ucirc", "uuml",
"yacute", "thorn", "yuml"};
// Replacement table for Windows characters in the 0x80 - 0x9f range,
// indexed by (character - 0x80).  An entry of 0x01 means "no equivalent":
// replaceSpChars drops such characters; any other entry replaces the
// character.
final public static char[] m_winIsoConvx80 = initWinIsoConv();
// Table indexed by (character - 0x80) for the 0x80 - 0x9f range, built by
// initUniWinConvx80: ASCII quote/dash replacements for the Windows
// quote and dash characters, 0x01 for every other position.
final public static char[] m_uniWinConvx80 = initUniWinConvx80();
// True if a Windows cp-1252 encoding name is accepted by the platform.
final public static boolean m_hasWinEncoding = hasWinEncoding();
// True if an ISO 8859-1 encoding name is accepted by the platform.
final public static boolean m_hasIso8859Encoding = hasIso8859Encoding();
final private static String m_xmlEntKeys =
"< > & '"";
final private static String[] m_xmlEntValues =
{"<", ">", " ", "&", "'", "\""};
// Quotation marks and dashes as encoded in cp-1252 (Windows).  All of
// these fall in the 0x80 - 0x9f range; the trailing comments give the
// decimal value as used in numeric entities.
public static final char CWSGL_LOW9_QUOTE = 0x82; // #130;
public static final char CWDBL_LOW9_QUOTE = 0x84; // #132;
public static final char CWLEFT_SGL_QUOTE = 0x91; // #145;
public static final char CWRIGHT_SGL_QUOTE = 0x92; // #146;
// Array and String forms of the Windows right single quote.
public static final char [] CAWRIGHT_SGL_QUOTE = {CWRIGHT_SGL_QUOTE};
public static final String WRIGHT_SGL_QUOTE = new String(CAWRIGHT_SGL_QUOTE);
public static final char CWLEFT_DBL_QUOTE = 0x93; // #147;
public static final char CWRIGHT_DBL_QUOTE = 0x94; // #148;
public static final char CWEN_DASH = 0x96; // #150;
public static final char CWEM_DASH = 0x97; // #151;
// The same quotation marks and dashes as Unicode (utf-16) characters in
// the U+20xx block.
// En dash and em dash.
public static final char CEN_DASH = 0x2013;
public static final char CEM_DASH = 0x2014;
public static final char CLEFT_SGL_QUOTE = 0x2018;
public static final char CRIGHT_SGL_QUOTE = 0x2019;
// Array and String forms of the Unicode right single quote.
public static final char [] CARIGHT_SGL_QUOTE = {CRIGHT_SGL_QUOTE};
public static final String RIGHT_SGL_QUOTE = new String(CARIGHT_SGL_QUOTE);
public static final char CSGL_LOW9_QUOTE = 0x201A;
private static final char CLEFT_DBL_QUOTE = 0x201C;
private static final char CRIGHT_DBL_QUOTE = 0x201D;
public static final char CDBL_LOW9_QUOTE = 0x201E;
// Character 228 (0xE4, named "auml" in m_isoLatin1Entities).
public static final char CA_UMLAUTE = (char)228;
// Character 246 (0xF6, named "ouml" in m_isoLatin1Entities).
private static final char CO_UMLAUTE = (char)246;
// Character 160; replaceSpChars turns it into a plain blank.
public static final char CNON_BREAKING_SP = (char)160;
// Reader that the encoding decisions made by getEncoding are applied to.
private EncodingStreamReader m_encodingStreamReader;
// Entity name to replacement text tables, built once at class load.
final private static Hashtable m_convXmlEntities = initXmlEntities();
final private static Hashtable m_convIso88591 = initAlphaIso88591(false);
final private static Hashtable m_convXmlIso88591 = initAlphaIso88591(true);
final private static Hashtable m_convCp1252 = initAlphaCp1252(false);
final private static Hashtable m_convXmlCp1252 = initAlphaCp1252(true);
// Encoding to use when converting document text; empty when no further
// conversion is wanted.
private String m_docEncoding = "";
private boolean m_utf = false; // Doc is utf.
private boolean m_getPrologue = true;
private boolean m_windows = false; // True if windows code space
// True if decoding cp-1252 bytes yields the Unicode quote characters.
final private static boolean m_convWinUni = initConvWinUni();
// Exceptions encountered by the static initializers.  This field must NOT
// have an explicit "= null" initializer: static initializers run in textual
// order, and initWinEncoding()/hasWinEncoding() (used by fields declared
// above) may already have stored a Vector here.  An explicit initializer
// at this point would throw those recorded exceptions away.
static Vector m_statExcs;
Vector m_excs = null; // Exceptions encountered by this instance
//#ifdef DTEST
final private static boolean m_debugTrace = false; // True if want to trace more
//#endif
//#ifdef DLOGGING
final private Logger logger = Logger.getLogger("EncodingUtil");
final private boolean fineLoggable = logger.isLoggable(Level.FINE);
final private boolean finestLoggable = logger.isLoggable(Level.FINEST);
//#endif
/**
 * Creates a new instance of EncodingUtil.
 *
 * @param inputStream stream passed to the EncodingStreamReader that this
 *                    instance creates and keeps
 */
public EncodingUtil(InputStream inputStream) {
    this.m_encodingStreamReader = new EncodingStreamReader(inputStream);
}
/**
 * Determine the document encoding using the encoding support detected
 * for this platform (m_hasIso8859Encoding, m_isoEncoding,
 * m_hasWinEncoding and m_winEncoding).
 *
 * @param fileEncoding encoding the bytes were already read with
 * @param encoding     encoding declared by the document, may be null
 */
public void getEncoding(final String fileEncoding, final String encoding) {
    this.getEncoding(m_hasIso8859Encoding, m_isoEncoding,
        m_hasWinEncoding, m_winEncoding,
        fileEncoding, encoding);
}
/**
 * Determine the encoding based on what is passed in, and decide if/when
 * strings are to be further encoded and whether the stream reader has to
 * modify the bytes it reads.
 *
 * The result is stored in this instance: the document encoding name
 * (empty if no further conversion is needed or possible), the utf and
 * windows flags, and the modEncoding/modUTF16 settings of the stream
 * reader.  Problems with unsupported encodings are recorded in the
 * instance exception list instead of being thrown.
 *
 * @param hasIso8859Encoding true if the platform supports ISO 8859-1
 * @param isoEncoding        platform name of the ISO 8859-1 encoding
 * @param hasWinEncoding     true if the platform supports cp-1252
 * @param winEncoding        platform name of the cp-1252 encoding
 * @param fileEncoding       encoding the bytes were already read with
 * @param encoding           encoding declared by the document; null is
 *                           treated as UTF-8
 */
public void getEncoding(final boolean hasIso8859Encoding,
        final String isoEncoding, final boolean hasWinEncoding,
        final String winEncoding, final String fileEncoding,
        final String encoding) {
    // A document which does not declare an encoding is taken as UTF-8.
    final String cencoding =
        ((encoding == null) ? "UTF-8" : encoding).toUpperCase();
    boolean modUTF16 = m_encodingStreamReader.isModUTF16();
    boolean modEncoding = m_encodingStreamReader.isModEncoding();
    m_utf = false;
    m_windows = false;
    String docEncoding = fileEncoding;
    // Only need to convert from 2 byte to 1 byte and vice versa.
    if (cencoding.equals("UTF-8") || cencoding.equals("UTF8")) {
        docEncoding = "UTF-8";
        modEncoding = false;
        m_utf = true;
    } else if (cencoding.equals("UTF-16") || cencoding.equals("UTF16")) {
        // If utf-16, don't set doc encoding as the stream reader
        // converts the bytes to single chars itself.
        modUTF16 = true;
        m_utf = true;
        docEncoding = "";
    } else if (cencoding.startsWith("ISO-8859")) {
        if (hasIso8859Encoding) {
            if (isoEncoding.indexOf("-") == -1) {
                // Platform uses the ISO8859_n spelling.
                docEncoding = StringUtil.replace(cencoding, "ISO-",
                        "ISO");
                docEncoding = docEncoding.replace('-', '_');
            } else {
                docEncoding = cencoding;
            }
        } else {
            docEncoding = "";
        }
        modEncoding = false;
    } else if (cencoding.startsWith("ISO8859")) {
        if (hasIso8859Encoding) {
            if (isoEncoding.indexOf("-") >= 0) {
                // Platform uses the ISO-8859-n spelling.
                docEncoding = StringUtil.replace(cencoding, "ISO",
                        "ISO-");
                docEncoding = docEncoding.replace('_', '-');
            } else {
                docEncoding = cencoding;
            }
        } else {
            docEncoding = "";
        }
        modEncoding = false;
    } else if (cencoding.startsWith("WINDOWS-12")) {
        if (hasWinEncoding) {
            if (winEncoding.indexOf("-") == -1) {
                // Platform uses the Cpnnnn spelling.
                docEncoding = StringUtil.replace(cencoding, "WINDOWS-",
                        "Cp");
            } else {
                docEncoding = cencoding;
            }
        } else {
            docEncoding = "";
        }
        modEncoding = false;
        m_windows = true;
    } else if (cencoding.startsWith("CP-")) {
        if (hasWinEncoding) {
            if (winEncoding.indexOf("-") >= 0) {
                docEncoding = StringUtil.replace(cencoding, "CP-",
                        "WINDOWS-");
            } else {
                docEncoding = StringUtil.replace(cencoding, "CP-",
                        "Cp");
            }
        } else {
            docEncoding = "";
        }
        modEncoding = false;
        m_windows = true;
    } else if (cencoding.startsWith("CP")) {
        if (hasWinEncoding) {
            if (winEncoding.indexOf("-") >= 0) {
                docEncoding = StringUtil.replace(cencoding, "CP",
                        "WINDOWS-");
            } else {
                docEncoding = StringUtil.replace(cencoding, "CP", "Cp");
            }
        } else {
            docEncoding = "";
        }
        modEncoding = false;
        m_windows = true;
    }
    // No further conversion is needed if the bytes were already read
    // with the wanted encoding.  docEncoding can only be null here if
    // fileEncoding was null and the declared encoding was not recognized.
    if ((docEncoding == null) || docEncoding.equals(fileEncoding)) {
        m_docEncoding = "";
    } else {
        m_docEncoding = docEncoding;
    }
    if (m_docEncoding.length() != 0) {
        try {
            // Probe only; the converted string itself is not needed.
            new String("a".getBytes(), m_docEncoding);
        } catch (UnsupportedEncodingException e) {
            CauseException ce = new CauseException(
                    "UnsupportedEncodingException while trying to " +
                    "convert doc encoding: " + m_docEncoding, e);
            if (m_excs == null) {
                m_excs = new Vector();
            }
            m_excs.addElement(ce);
//#ifdef DLOGGING
            logger.severe(ce.getMessage(), e);
//#endif
            System.out.println(ce.getMessage());
            // If encoding problem, use the main encoding as it is
            // close enough.
            if (m_windows) {
                m_docEncoding = hasWinEncoding ? winEncoding : "";
            } else if (m_utf) {
                m_docEncoding = "";
            } else {
                m_docEncoding = hasIso8859Encoding ? isoEncoding : "";
            }
            // Only probe a real fallback name.  The empty name means
            // "no conversion" and probing it would only record a
            // spurious second exception.
            if (m_docEncoding.length() != 0) {
                try {
                    new String("a".getBytes(), m_docEncoding);
                } catch (UnsupportedEncodingException e2) {
                    CauseException ce2 = new CauseException(
                            "Second unsupportedEncodingException while " +
                            " trying to convert doc encoding: " +
                            m_docEncoding, e2);
                    m_excs.addElement(ce2);
//#ifdef DLOGGING
                    logger.severe(ce2.getMessage(), e2);
//#endif
                    System.out.println(ce2.getMessage());
                    m_docEncoding = "";
                }
            }
        }
    }
    m_encodingStreamReader.setModEncoding(modEncoding);
    m_encodingStreamReader.setModUTF16(modUTF16);
//#ifdef DLOGGING
    if (fineLoggable) {logger.fine("hasIso8859Encoding=" + hasIso8859Encoding);}
    if (fineLoggable) {logger.fine("isoEncoding=" + isoEncoding);}
    if (fineLoggable) {logger.fine("hasWinEncoding=" + hasWinEncoding);}
    if (fineLoggable) {logger.fine("winEncoding=" + winEncoding);}
    if (fineLoggable) {logger.fine("encoding=" + encoding);}
    if (fineLoggable) {logger.fine("cencoding=" + cencoding);}
    if (fineLoggable) {logger.fine("docEncoding=" + docEncoding);}
    if (fineLoggable) {logger.fine("m_docEncoding=" + m_docEncoding);}
    if (fineLoggable) {logger.fine("fileEncoding=" + fileEncoding);}
    if (fineLoggable) {logger.fine("m_windows=" + m_windows);}
    if (fineLoggable) {logger.fine("m_utf=" + m_utf);}
    if (fineLoggable) {logger.fine("modEncoding=" + modEncoding);}
    if (fineLoggable) {logger.fine("modUTF16=" + modUTF16);}
//#endif
}
/**
 * Replace special characters with valid ones for the specified encoding,
 * using the platform flags m_midpWin and m_midpUni.
 *
 * @param text      text to convert
 * @param isWindows true if the document uses a Windows code page
 * @param isUtf     true if the document is utf
 * @return the converted text
 */
public static String replaceSpChars(String text, boolean isWindows,
        boolean isUtf) {
    return EncodingUtil.replaceSpChars(text, isWindows, isUtf,
        m_midpWin, m_midpUni);
}
/**
 * Replace special characters with valid ones, using the windows/utf
 * flags held by this instance and the platform flags m_midpWin and
 * m_midpUni.  For callers which use an instance of this class.
 *
 * @param text text to convert
 * @return the converted text
 */
public String replaceSpChars(String text) {
    return EncodingUtil.replaceSpChars(text, this.m_windows, this.m_utf,
        m_midpWin, m_midpUni);
}
/**
 * Replace special characters with valid ones for the specified encoding.
 * Any error is reported (logged and/or printed) and the text is returned
 * as far as it was converted.
 *
 * @param text      text to convert
 * @param isWindows true if the document uses a Windows code page
 * @param isUtf     true if the document is utf
 * @param midpWin   true if the platform default encoding is Windows
 * @param midpUni   true if the platform default encoding is utf-8
 * @return the converted text
 */
public static String replaceSpChars(String text, final boolean isWindows,
        final boolean isUtf,
        final boolean midpWin,
        final boolean midpUni) {
    try {
        if (!isWindows) {
            if (isUtf && !midpUni) {
                text = replaceSpUniChars(text);
            }
        } else if (midpWin) {
            if (m_convWinUni) {
                // Returns right away; the non-breaking space
                // replacement below is not applied on this path.
                return replaceSpUniChars(text);
            }
        } else if (m_convWinUni) {
            if (!midpUni) {
                text = replaceSpUniWinChars(text);
            }
        } else {
            /* The windows special characters are control characters in
               other encodings, so replace them through the table, or
               drop them where the table holds 0x01. */
            final char[] source = text.toCharArray();
            final char[] kept = new char[text.length()];
            int used = 0;
            for (int pos = 0; pos < source.length; pos++) {
                final char current = source[pos];
                if (((int)current < 0x80) || ((int)current > 0x9f)) {
                    kept[used++] = current;
//#ifdef DTEST
                    if (m_debugTrace) {System.out.println("cchr,conv=" + current + "," + Integer.toHexString(current) + "," + kept[used - 1] + "," + Integer.toHexString(kept[used - 1]));}
//#endif
                } else if (m_winIsoConvx80[(int)current - 0x80] != 0x01) {
                    kept[used++] = m_winIsoConvx80[(int)current - 0x80];
//#ifdef DTEST
                    if (m_debugTrace) {System.out.println("array cchr,conv=" + current + "," + Integer.toHexString(current) + "," + kept[used - 1] + "," + Integer.toHexString(kept[used - 1]));}
//#endif
                }
            }
            text = new String(kept, 0, used);
//#ifdef DTEST
            if (m_debugTrace) {System.out.println( "text,len=" + text + "," + text.length());}
//#endif
        }
        text = text.replace(CNON_BREAKING_SP, ' ');
    } catch (Throwable t) {
//#ifdef DLOGGING
        Logger logger = Logger.getLogger("EncodingUtil");
        logger.severe("replaceSpChars error ", t);
//#endif
        System.out.println("replaceSpChars error " + t + "," +
                t.getMessage());
    }
    return text;
}
/**
 * Replace Unicode special characters from the U+20xx block.  Dashes and
 * quotation marks become their plain ASCII look-alikes; dagger, double
 * dagger, bullet, ellipsis, per mille, the single angle quotes and the
 * euro sign become the corresponding cp-1252 (Windows) code.  All other
 * characters are copied unchanged.
 *
 * Any error is reported (logged and/or printed) and the unmodified text
 * is returned.
 *
 * @param text text to convert
 * @return the converted text
 */
public static String replaceSpUniWinChars(String text) {
    try {
        final char [] ctext = text.toCharArray();
        final char [] ntext = new char[ctext.length];
        int jc = 0;
        for (int ic = 0; ic < ctext.length; ic++) {
            final char c = ctext[ic];
            char mapped = c;
            // Everything that needs converting lives in U+20xx.
            if ((c & 0xff00) == 0x2000) {
                switch (c) {
                    case CEN_DASH:
                    case CEM_DASH:
                        mapped = '-';
                        break;
                    case CLEFT_SGL_QUOTE:
                    case CRIGHT_SGL_QUOTE:
                    case CSGL_LOW9_QUOTE:
                        mapped = '\'';
                        break;
                    case CLEFT_DBL_QUOTE:
                    case CRIGHT_DBL_QUOTE:
                    case CDBL_LOW9_QUOTE:
                        mapped = '\"';
                        break;
                    case 0x2020: // dagger
                        mapped = 0x86;
                        break;
                    case 0x2021: // double dagger
                        mapped = 0x87;
                        break;
                    case 0x2022: // bullet
                        mapped = 0x95;
                        break;
                    case 0x2026: // horizontal ellipsis
                        mapped = 0x85;
                        break;
                    case 0x2030: // per mille sign
                        mapped = 0x89;
                        break;
                    case 0x2039: // single left-pointing angle quote
                        mapped = 0x8B;
                        break;
                    case 0x203A: // single right-pointing angle quote
                        mapped = 0x9B;
                        break;
                    case 0x20AC: // euro sign
                        mapped = 0x80;
                        // Trace output only when debugging, like the
                        // other traces in this class.
//#ifdef DTEST
                        if (m_debugTrace) {System.out.println("ic,c=" + c + "," + Integer.toHexString(mapped));}
//#endif
                        break;
                    default:
                        break;
                }
            }
            ntext[jc++] = mapped;
        }
        text = new String(ntext, 0, jc);
    } catch (Throwable t) {
//#ifdef DLOGGING
        Logger logger = Logger.getLogger("EncodingUtil");
        logger.severe("replaceSpUniWinChars error ", t);
//#endif
        System.out.println("replaceSpUniWinChars error " + t + "," +
                t.getMessage());
    }
    return text;
}
/**
 * Replace the Unicode quotation marks and dashes with their plain
 * look-alikes: single quotes (low-9, left, right) become '\'', double
 * quotes (left, right, low-9) become '"', en and em dash become '-'.
 *
 * @param text text to convert
 * @return the converted text
 */
public static String replaceSpUniChars(String text) {
    return text
        .replace(CSGL_LOW9_QUOTE, '\'')
        .replace(CLEFT_SGL_QUOTE, '\'')
        .replace(CRIGHT_SGL_QUOTE, '\'')
        .replace(CLEFT_DBL_QUOTE, '\"')
        .replace(CRIGHT_DBL_QUOTE, '\"')
        .replace(CDBL_LOW9_QUOTE, '\"')
        .replace(CEN_DASH, '-')
        .replace(CEM_DASH, '-');
}
/**
 * Replace the Windows (cp-1252) quotation marks and dashes with their
 * plain look-alikes: single quotes (low-9, left, right) become '\'',
 * double quotes (left, right, low-9) become '"', en and em dash
 * become '-'.
 *
 * @param text text to convert
 * @return the converted text
 */
public static String replaceSpWinChars(String text) {
    return text
        .replace(CWSGL_LOW9_QUOTE, '\'')
        .replace(CWLEFT_SGL_QUOTE, '\'')
        .replace(CWRIGHT_SGL_QUOTE, '\'')
        .replace(CWLEFT_DBL_QUOTE, '\"')
        .replace(CWRIGHT_DBL_QUOTE, '\"')
        .replace(CWDBL_LOW9_QUOTE, '\"')
        .replace(CWEN_DASH, '-')
        .replace(CWEM_DASH, '-');
}
/**
 * Replace all numeric character references, decimal (ampersand, '#',
 * digits, ';') or hexadecimal (ampersand, '#', 'x' or 'X', hex digits,
 * ';'), with the character they stand for.
 *
 * A reference which cannot be decoded (no digits, not a number, or a
 * value outside 0 - 0x10FFFF) is left in the text as it is and the scan
 * continues behind it, so one malformed reference no longer prevents
 * later valid references from being replaced.  Values above 0xFFFF are
 * written as a surrogate pair instead of being truncated to 16 bits.
 *
 * @param s String to alter, may be null.
 * @return the altered string; null if s was null
 */
public static String replaceNumEntity( String s) {
    if (s == null) return s;
    String snum = "";
    try {
        int index01 = s.indexOf( "&#" );
        while (index01 != -1) {
            final int index02 = s.indexOf( ';' , index01 );
            if (index02 == -1) {
                // No terminator behind this point, so nothing further
                // can be decoded either.
                break;
            }
            String replacement = null;
            try {
                snum = s.substring(index01 + 2, index02);
                if (snum.length() != 0) {
                    final char first = snum.charAt(0);
                    final int code = ((first == 'x') || (first == 'X')) ?
                        Integer.parseInt(snum.substring(1), 16) :
                        Integer.parseInt(snum);
                    replacement = numEntityToString(code);
                }
            } catch (NumberFormatException e) {
//#ifdef DLOGGING
                Logger logger = Logger.getLogger("EncodingUtil");
                logger.severe("replaceNumEntity NumberFormatException error for " + snum, e);
//#endif
                System.out.println("replaceNumEntity error " + e + "," +
                        e.getMessage());
            }
            if (replacement == null) {
                // Malformed: keep the text and look behind the "&#".
                index01 = s.indexOf( "&#", index01 + 2 );
            } else {
                s = s.substring(0, index01) + replacement +
                    s.substring(index02 + 1);
                // Start one character early so that a "&#" formed by the
                // replacement itself is still found, as it was when the
                // search restarted from the beginning of the string.
                index01 = s.indexOf( "&#", Math.max(index01 - 1, 0) );
            }
        }
    } catch (Throwable t) {
//#ifdef DLOGGING
        Logger logger = Logger.getLogger("EncodingUtil");
        logger.severe("replaceNumEntity error ", t);
//#endif
        System.out.println("replaceNumEntity error " + t + "," +
                t.getMessage());
    }
    return s;
}

/**
 * Convert the value of a numeric character reference into text.
 *
 * @param code value of the reference
 * @return a one character string for values up to 0xFFFF, a surrogate
 *         pair for values up to 0x10FFFF, null for any other value
 */
private static String numEntityToString(final int code) {
    if ((code < 0) || (code > 0x10FFFF)) {
        return null;
    }
    if (code <= 0xFFFF) {
        return new String(new char[] {(char)code});
    }
    final int offset = code - 0x10000;
    return new String(new char[] {
        (char)(0xD800 + (offset >> 10)),
        (char)(0xDC00 + (offset & 0x3FF))});
}
/**
 * Replace named (alphabetic) entities with the text found for them in
 * the entity table.  The Windows (cp-1252) tables are used when the
 * platform default encoding is Windows (m_midpWin), otherwise the
 * ISO 8859-1 tables.  Names which are not in the table are left alone.
 *
 * @param convXmlEnts true to use the table which also holds the common
 *                    XML entities (lt, gt, nbsp, amp, apos, quot)
 * @param text        text to convert
 * @return the converted text
 */
public static String replaceAlphaEntities(final boolean convXmlEnts,
        String text) {
    final Hashtable table;
    if (m_midpWin) {
        table = convXmlEnts ? m_convXmlCp1252 : m_convCp1252;
    } else {
        table = convXmlEnts ? m_convXmlIso88591 : m_convIso88591;
    }
    int from = 0;
    for (;;) {
        final int amp = text.indexOf('&', from);
        if (amp < 0) {
            break;
        }
        final int semi = text.indexOf(';', amp);
        if (semi < 0) {
            break;
        }
        // Another ampersand before the semicolon: this one does not
        // start an entity, carry on from the later ampersand.
        final int nextAmp = text.indexOf('&', amp + 1);
        if ((nextAmp >= 0) && (nextAmp < semi)) {
            from = nextAmp;
            continue;
        }
        // Ampersand directly followed by the semicolon: no name.
        if (semi == (amp + 1)) {
            from = semi + 1;
            continue;
        }
        final Object found = table.get(text.substring(amp + 1, semi));
        if (found == null) {
            from = semi + 1;
        } else {
            text = text.substring(0, amp) + (String)found +
                text.substring(semi + 1);
            // Stay at the same position after a substitution: when the
            // replacement is an ampersand it may form a new entity with
            // the text which follows, and that one is replaced as well.
            from = amp;
        }
    }
    return text;
}
/**
 * Replace the entities listed in m_xmlEntKeys with the matching text from
 * m_xmlEntValues.  The entity, including its ampersand and semicolon, is
 * searched for in m_xmlEntKeys and the offset of the match divided by 6
 * selects the replacement.  Other entities are left alone.
 *
 * @param text text to convert
 * @return the converted text
 */
public static String replaceXmlEntities(String text) {
    int from = 0;
    for (;;) {
        final int amp = text.indexOf('&', from);
        if (amp < 0) {
            break;
        }
        final int semi = text.indexOf(';', amp);
        if (semi < 0) {
            break;
        }
        // Another ampersand before the semicolon: this one does not
        // start an entity, carry on from the later ampersand.
        final int nextAmp = text.indexOf('&', amp + 1);
        if ((nextAmp >= 0) && (nextAmp < semi)) {
            from = nextAmp;
            continue;
        }
        // Ampersand directly followed by the semicolon: no name.
        if (semi == (amp + 1)) {
            from = semi + 1;
            continue;
        }
        final int slot = m_xmlEntKeys.indexOf(text.substring(amp, semi + 1));
        if (slot < 0) {
            from = semi + 1;
        } else {
            text = text.substring(0, amp) + m_xmlEntValues[slot / 6] +
                text.substring(semi + 1);
            // Stay at the same position after a substitution: when the
            // replacement is an ampersand it may form a new entity with
            // the text which follows, and that one is replaced as well.
            from = amp;
        }
    }
    return text;
}
/**
 * Create the table of XML entities (filled by initHtmlCommEnts).
 * Errors while filling the table are logged only; the table is returned
 * with whatever was added before the error.
 *
 * @return table mapping entity names to replacement strings
 */
public static Hashtable initXmlEntities() {
    final Hashtable xmlEntities = new Hashtable();
    try {
        initHtmlCommEnts(xmlEntities);
    } catch (Throwable t) {
//#ifdef DLOGGING
        Logger logger = Logger.getLogger("EncodingUtil");
        logger.severe("initXmlEntities", t);
//#endif
    }
    return xmlEntities;
}
/**
 * Create the table of alpha entities for iso8859-1: the common entities
 * (0xA1 - 0xBF), the Latin-1 letters (0xC0 - 0xFF) and the dash/quote
 * entities mapped to their ASCII stand-ins.  Errors are logged only.
 *
 * @param convXmlEnts true to also add the common XML entities
 * @return table mapping entity names to replacement strings
 */
public static Hashtable initAlphaIso88591(final boolean convXmlEnts) {
//#ifdef DTEST
    System.out.println( "m_midpIso=" + m_midpIso);
//#endif
    // The values for m_isoLatin1Entities are the consecutive characters
    // 0xC0 up to and including 0xFF.
    final char[] latin1Values = new char[0xFF - 0xC0 + 1];
    for (int offset = 0; offset < latin1Values.length; offset++) {
        latin1Values[offset] = (char)(0xC0 + offset);
    }
    final Hashtable table = new Hashtable();
    try {
        initEntVals(table, m_isoCommonEntities, m_isoCommValues);
        initEntVals(table, m_isoLatin1Entities, latin1Values);
        initEntVals(table, m_isoSpecialEntities, m_isoSpecialValues);
        if (convXmlEnts) {
            initHtmlCommEnts(table);
        }
    } catch (Throwable t) {
//#ifdef DLOGGING
        Logger logger = Logger.getLogger("EncodingUtil");
        logger.severe("initAlphaIso88591", t);
//#endif
    }
    return table;
}
/**
 * Create the table of alpha entities for windows 1252: the common
 * entities (0xA1 - 0xBF), the Latin-1 letters (0xC0 - 0xFF) and the
 * dash/quote entities mapped to their cp-1252 codes.  Errors are logged
 * only.
 *
 * @param convXmlEnts true to also add the common XML entities
 * @return table mapping entity names to replacement strings
 */
public static Hashtable initAlphaCp1252(final boolean convXmlEnts) {
//#ifdef DTEST
    System.out.println( "m_midpWin=" + m_midpWin);
//#endif
    // The values for m_isoLatin1Entities are the consecutive characters
    // 0xC0 up to and including 0xFF.
    final char[] latin1Values = new char[0xFF - 0xC0 + 1];
    for (int offset = 0; offset < latin1Values.length; offset++) {
        latin1Values[offset] = (char)(0xC0 + offset);
    }
    // Windows codes in the order of m_isoSpecialEntities.
    final char[] winSpecialValues =
        {CWEN_DASH,          // en dash
         CWEM_DASH,          // em dash
         CWLEFT_SGL_QUOTE,   // left single quotation mark
         CWRIGHT_SGL_QUOTE,  // right single quotation mark
         CWSGL_LOW9_QUOTE,   // single low-9 quotation mark
         CWLEFT_DBL_QUOTE,   // left double quotation mark
         CWRIGHT_DBL_QUOTE,  // right double quotation mark
         CWDBL_LOW9_QUOTE};  // double low-9 quotation mark
    final Hashtable table = new Hashtable();
    try {
        /* ISO common entities have same encodings as Cp1252 */
        initEntVals(table, m_isoCommonEntities, m_isoCommValues);
        initEntVals(table, m_isoLatin1Entities, latin1Values);
        initEntVals(table, m_isoSpecialEntities, winSpecialValues);
        if (convXmlEnts) {
            initHtmlCommEnts(table);
        }
    } catch (Throwable t) {
//#ifdef DLOGGING
        Logger logger = Logger.getLogger("EncodingUtil");
        logger.severe("initAlphaCp1252", t);
//#endif
    }
    return table;
}
/**
 * Add one entry per entity name to the table, mapping the name to a one
 * character string made from the value at the same index.  If the arrays
 * differ in length, only the pairs present in both are added.  Errors
 * are logged only.
 *
 * @param convEntities table to add to
 * @param entities     entity names
 * @param entValues    character for each entity name
 */
public static void initEntVals(Hashtable convEntities, String[] entities, char[] entValues) {
    try {
//#ifdef DTEST
        System.out.println( "Entities, values len=" + entities.length + "," + entValues.length);
//#endif
        final int pairs = Math.min(entities.length, entValues.length);
        for (int idx = 0; idx < pairs; idx++) {
            final char[] single = {entValues[idx]};
            // Sometimes, this can produce an error in some default
            // encodings, so a failure only skips this one entry.
            try {
                convEntities.put(entities[idx], new String(single));
            } catch (Throwable t) {
//#ifdef DLOGGING
                Logger logger = Logger.getLogger("EncodingUtil");
                logger.severe("initEntVals convert error bvalue=" +
                        Integer.toHexString(single[0]), t);
//#endif
            }
        }
    } catch (Throwable t) {
//#ifdef DLOGGING
        Logger logger = Logger.getLogger("EncodingUtil");
        logger.severe("initEntVals", t);
//#endif
    }
}
/**
 * Build the replacement table for the windows (cp-1252) characters 0x80
 * up to and including 0x9f, indexed by (character - 0x80).  An entry of
 * 0x01 means there is no equivalent (the character is to be removed from
 * the string being converted); any other entry replaces the character.
 *
 * @return table with 32 entries
 */
private static char [] initWinIsoConv() {
    final char[] table = {
        0x20AC, // 0x80 EURO SIGN
        0x01,   // 0x81 no equivalent
        '\'',   // 0x82 SINGLE LOW-9 QUOTATION MARK
        0x0192, // 0x83 LATIN SMALL LETTER F WITH HOOK
        '\"',   // 0x84 DOUBLE LOW-9 QUOTATION MARK
        0x2026, // 0x85 HORIZONTAL ELLIPSIS
        0x2020, // 0x86 DAGGER
        0x2021, // 0x87 DOUBLE DAGGER
        0x02C6, // 0x88 MODIFIER LETTER CIRCUMFLEX ACCENT
        0x2030, // 0x89 PER MILLE SIGN
        0x0160, // 0x8A LATIN CAPITAL LETTER S WITH CARON
        0x2039, // 0x8B SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        0x0152, // 0x8C LATIN CAPITAL LIGATURE OE
        0x01,   // 0x8D no equivalent
        0x017D, // 0x8E LATIN CAPITAL LETTER Z WITH CARON
        0x01,   // 0x8F no equivalent
        0x01,   // 0x90 no equivalent
        '\'',   // 0x91 LEFT SINGLE QUOTATION MARK
        '\'',   // 0x92 RIGHT SINGLE QUOTATION MARK
        '\"',   // 0x93 LEFT DOUBLE QUOTATION MARK
        '\"',   // 0x94 RIGHT DOUBLE QUOTATION MARK
        0x2022, // 0x95 BULLET
        '-',    // 0x96 EN DASH
        '-',    // 0x97 EM DASH
        0x02DC, // 0x98 SMALL TILDE
        0x2122, // 0x99 TRADE MARK SIGN
        0x0161, // 0x9A LATIN SMALL LETTER S WITH CARON
        0x203A, // 0x9B SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        0x0153, // 0x9C LATIN SMALL LIGATURE OE
        0x01,   // 0x9D no equivalent
        0x017E, // 0x9E LATIN SMALL LETTER Z WITH CARON
        0x0178  // 0x9F LATIN CAPITAL LETTER Y WITH DIAERESIS
    };
    try {
//#ifdef DTEST
        System.out.println( "convTable.length=" + table.length);
//#endif
    } catch (Throwable t) {
//#ifdef DLOGGING
        Logger logger = Logger.getLogger("EncodingUtil");
        logger.severe("initWinIsoConv", t);
//#endif
    }
    return table;
}
/**
 * Build the table for the characters 0x80 up to and including 0x9f,
 * indexed by (character - 0x80).  The positions of the windows (cp-1252)
 * quotation marks and dashes hold their ASCII look-alike ('\'', '"' or
 * '-'); every other position holds 0x01, meaning there is no equivalent.
 *
 * The right single quote (0x92) is mapped to '\'' like the left single
 * quote; it used to be missing and was therefore marked as having no
 * equivalent.
 *
 * @return table with 32 entries
 */
private static char [] initUniWinConvx80() {
    char [] convTable = new char[0x9f - 0x80 + 1];
    try {
//#ifdef DTEST
        System.out.println( "convTable.length=" + convTable.length);
//#endif
        // Start with "no equivalent" everywhere, then fill in the
        // characters which have an ASCII look-alike.
        for (int ic = 0; ic < convTable.length; ic++) {
            convTable[ic] = 0x01;
        }
        convTable[CWSGL_LOW9_QUOTE - 0x80] = '\'';
        convTable[CWLEFT_SGL_QUOTE - 0x80] = '\'';
        convTable[CWRIGHT_SGL_QUOTE - 0x80] = '\'';
        convTable[CWDBL_LOW9_QUOTE - 0x80] = '\"';
        convTable[CWLEFT_DBL_QUOTE - 0x80] = '\"';
        convTable[CWRIGHT_DBL_QUOTE - 0x80] = '\"';
        convTable[CWEN_DASH - 0x80] = '-';
        convTable[CWEM_DASH - 0x80] = '-';
    } catch (Throwable t) {
//#ifdef DLOGGING
        Logger logger = Logger.getLogger("EncodingUtil");
        logger.severe("initUniWinConvx80", t);
//#endif
    }
    return convTable;
}
/**
 * Add the common XML entities (lt, gt, nbsp, amp, apos, quot) and their
 * replacement characters to the table.
 *
 * @param convEntities table to add to
 */
private static void initHtmlCommEnts(Hashtable convEntities) {
    final String[] names = {"lt", "gt", "nbsp", "amp", "apos", "quot"};
    final char[] replacements = {'<', '>', ' ', '&', '\'', '\"'};
    initEntVals(convEntities, names, replacements);
}
/**
 * Determine if creating a string from windows (cp-1252) bytes converts
 * the windows characters to Unicode.  This is checked with the left
 * single quote: the byte 0x91 must decode to U+2018.  The encoding name
 * "Cp1252" is tried first and, if the platform does not know it,
 * "WINDOWS-1252" is tried as well (the retry used to repeat "Cp1252").
 *
 * @return true if decoding produced the Unicode left single quote; false
 *         if it did not or if neither encoding name is supported
 */
private static boolean initConvWinUni() {
    boolean rtn = false;
    try {
        final byte[] blftSgl = {(byte)CWLEFT_SGL_QUOTE};
        try {
            String convStr = new String(blftSgl, "Cp1252");
            rtn = convStr.charAt(0) == CLEFT_SGL_QUOTE;
        } catch (UnsupportedEncodingException e) {
//#ifdef DTEST
            System.out.println( "Unsupported encoding Cp1252");
//#endif
//#ifdef DLOGGING
            Logger logger = Logger.getLogger("EncodingUtil");
            logger.severe("UnsupportedEncodingException Cp1252", e);
//#endif
            try {
                // Second spelling of the same code page.
                String convStr2 = new String(blftSgl, "WINDOWS-1252");
                rtn = convStr2.charAt(0) == CLEFT_SGL_QUOTE;
            } catch (UnsupportedEncodingException e2) {
//#ifdef DTEST
                System.out.println( "Unsupported encoding WINDOWS-1252");
//#endif
//#ifdef DLOGGING
                logger.severe("UnsupportedEncodingException WINDOWS-1252", e2);
//#endif
            }
        }
//#ifdef DTEST
        System.out.println( "initConvWinUni()=" + rtn);
//#endif
    } catch (Throwable t) {
//#ifdef DLOGGING
        Logger logger = Logger.getLogger("EncodingUtil");
        logger.severe("initConvWinUni", t);
//#endif
    }
    return rtn;
}
/**
 * Determine the ISO 8859-1 encoding name accepted by the platform.
 * "ISO8859_1" is tried first, then "ISO-8859-1".
 *
 * @return the first name which is supported; "ISO8859_1" if neither is
 *         supported or an unexpected error occurred
 */
private static String initIsoEncoding() {
    try {
        try {
            new String("a".getBytes(), "ISO8859_1");
            return "ISO8859_1";
        } catch (UnsupportedEncodingException underscoreMissing) {
//#ifdef DTEST
            System.out.println( "Unsupported encoding ISO8859_1");
//#endif
//#ifdef DLOGGING
            Logger logger = Logger.getLogger("EncodingUtil");
            logger.severe("initIsoEncoding UnsupportedEncodingException ISO8859_1", underscoreMissing);
//#endif
            try {
                new String("a".getBytes(), "ISO-8859-1");
                return "ISO-8859-1";
            } catch (UnsupportedEncodingException dashMissing) {
//#ifdef DTEST
                System.out.println("initIsoEncoding Unsupported encoding ISO-8859-1");
//#endif
//#ifdef DLOGGING
                logger.severe("initIsoEncoding UnsupportedEncodingException ISO-8859-1", dashMissing);
//#endif
            }
        }
    } catch (Throwable t) {
//#ifdef DLOGGING
        Logger logger = Logger.getLogger("EncodingUtil");
        logger.severe("initIsoEncoding initConvWinUni", t);
//#endif
    }
    return "ISO8859_1";
}
/**
 * Determine the windows (cp-1252) encoding name accepted by the
 * platform.  "Cp1252" is tried first, then "WINDOWS-1252".  Each failed
 * attempt is recorded in the static exception list.
 *
 * @return the first name which is supported; "Cp1252" if neither is
 *         supported or an unexpected error occurred
 */
private static String initWinEncoding() {
    try {
        try {
            new String("a".getBytes(), "Cp1252");
            return "Cp1252";
        } catch (UnsupportedEncodingException cpMissing) {
            final CauseException firstFailure = new CauseException(
                    "initWinEncoding UnsupportedEncodingException " +
                    "while trying to convert encoding Cp1252.", cpMissing);
            if (m_statExcs == null) {
                m_statExcs = new Vector();
            }
            m_statExcs.addElement(firstFailure);
//#ifdef DTEST
            System.out.println(firstFailure.getMessage());
//#endif
//#ifdef DLOGGING
            Logger logger = Logger.getLogger("EncodingUtil");
            logger.severe(firstFailure.getMessage(), cpMissing);
//#endif
            try {
                new String("a".getBytes(), "WINDOWS-1252");
                return "WINDOWS-1252";
            } catch (UnsupportedEncodingException winMissing) {
                final CauseException secondFailure = new CauseException(
                        "initWinEncoding second " +
                        "unsupportedEncodingException while " +
                        " trying to convert encoding WINDOWS-1252.", winMissing);
                m_statExcs.addElement(secondFailure);
//#ifdef DTEST
                System.out.println(secondFailure.getMessage());
//#endif
//#ifdef DLOGGING
                logger.severe(secondFailure.getMessage(), winMissing);
//#endif
            }
        }
    } catch (Throwable t) {
//#ifdef DLOGGING
        Logger logger = Logger.getLogger("EncodingUtil");
        logger.severe("initWinEncoding() initConvWinUni", t);
//#endif
    }
    return "Cp1252";
}
/**
 * Determine if the windows (cp-1252) encoding is supported, under the
 * name "Cp1252" or, failing that, "WINDOWS-1252".  Each failed attempt
 * is recorded in the static exception list; the recorded messages name
 * this method (the second one used to name initWinEncoding).
 *
 * @return true if one of the two names is supported
 */
public static boolean hasWinEncoding() {
    try {
        try {
            String convStr = new String("a".getBytes(), "Cp1252");
            return true;
        } catch (UnsupportedEncodingException e) {
            CauseException ce = new CauseException(
                    "hasWinEncoding UnsupportedEncodingException " +
                    "while trying to convert encoding Cp1252.", e);
            if (m_statExcs == null) {
                m_statExcs = new Vector();
            }
            m_statExcs.addElement(ce);
//#ifdef DTEST
            System.out.println(ce.getMessage());
//#endif
//#ifdef DLOGGING
            Logger logger = Logger.getLogger("EncodingUtil");
            logger.severe(ce.getMessage(), e);
//#endif
            try {
                String convStr2 = new String("a".getBytes(), "WINDOWS-1252");
                return true;
            } catch (UnsupportedEncodingException e2) {
                CauseException ce2 = new CauseException(
                        "hasWinEncoding second " +
                        "unsupportedEncodingException while " +
                        " trying to convert encoding WINDOWS-1252.", e2);
                m_statExcs.addElement(ce2);
//#ifdef DTEST
                System.out.println(ce2.getMessage());
//#endif
//#ifdef DLOGGING
                logger.severe(ce2.getMessage(), e2);
//#endif
            }
        }
    } catch (Throwable t) {
//#ifdef DLOGGING
        Logger logger = Logger.getLogger("EncodingUtil");
        logger.severe("hasWinEncoding initConvWinUni", t);
//#endif
    }
    return false;
}
/**
 * Determine if the iso-8859-1 encoding is supported, under the name
 * "ISO8859_1" or, failing that, "ISO-8859-1".  The log message for the
 * second failure names this method (it used to name initIsoEncoding).
 *
 * @return true if one of the two names is supported
 */
private static boolean hasIso8859Encoding() {
    try {
        try {
            String convStr = new String("a".getBytes(), "ISO8859_1");
            return true;
        } catch (UnsupportedEncodingException e) {
//#ifdef DTEST
            System.out.println( "Unsupported encoding ISO8859_1");
//#endif
//#ifdef DLOGGING
            Logger logger = Logger.getLogger("EncodingUtil");
            logger.severe("hasIso8859Encoding UnsupportedEncodingException ISO8859_1", e);
//#endif
            try {
                String convStr2 = new String("a".getBytes(), "ISO-8859-1");
                return true;
            } catch (UnsupportedEncodingException e2) {
//#ifdef DTEST
                System.out.println("hasIso8859Encoding Unsupported encoding ISO-8859-1");
//#endif
//#ifdef DLOGGING
                logger.severe("hasIso8859Encoding UnsupportedEncodingException ISO-8859-1", e2);
//#endif
            }
        }
    } catch (Throwable t) {
//#ifdef DLOGGING
        Logger logger = Logger.getLogger("EncodingUtil");
        logger.severe("hasIso8859Encoding initConvWinUni", t);
//#endif
    }
    return false;
}
/** Set the document encoding name held by this instance. */
public void setDocEncoding(String m_docEncoding) {
    this.m_docEncoding = m_docEncoding;
}

/** @return the document encoding name held by this instance */
public String getDocEncoding() {
    return this.m_docEncoding;
}

/** Replace the stream reader used by this instance. */
public void setEncodingStreamReader(EncodingStreamReader m_encodingStreamReader) {
    this.m_encodingStreamReader = m_encodingStreamReader;
}

/** @return the stream reader used by this instance */
public EncodingStreamReader getEncodingStreamReader() {
    return this.m_encodingStreamReader;
}

/** @return the windows flag set by getEncoding */
public boolean isWindows() {
    return this.m_windows;
}

/** @return the utf flag set by getEncoding */
public boolean isUtf() {
    return this.m_utf;
}
//#ifdef DTEST
/* Accessors which give tests read access to the static tables and
   settings of this class. */

/** @return the names of the common entities */
public static String[] getIsoCommonEntities() {
    return m_isoCommonEntities;
}

/** @return the iso8859-1 entity table without the XML entities */
public static Hashtable getConvIso88591() {
    return m_convIso88591;
}

/** @return the cp-1252 entity table without the XML entities */
public static Hashtable getConvCp1252() {
    return m_convCp1252;
}

/** @return the names of the dash and quote entities */
static public String[] getIsoSpecialEntities() {
    return m_isoSpecialEntities;
}

/** @return the windows encoding name chosen at class load */
static public String getWinEncoding() {
    return m_winEncoding;
}

/** @return the result of initConvWinUni() stored at class load */
public static boolean isConvWinUni() {
    return m_convWinUni;
}

/** @return the result of hasWinEncoding() stored at class load */
public static boolean isHasWinEncoding() {
    return m_hasWinEncoding;
}
//#endif
/** @return the ISO 8859-1 encoding name chosen at class load */
static public String getIsoEncoding() {
    return m_isoEncoding;
}
/**
 * Get the exceptions recorded by this instance.
 *
 * @return the recorded exceptions, or a new empty Vector if none were
 *         recorded; never null
 */
public Vector getExcs() {
    return (m_excs != null) ? m_excs : new Vector();
}
/**
 * Get the exceptions recorded by the static initialization code.
 *
 * @return the recorded exceptions, or a new empty Vector if none were
 *         recorded; never null
 */
public static Vector getStatExcs() {
    return (m_statExcs != null) ? m_statExcs : new Vector();
}
}