// Package org.commoncrawl.util.shared
//
// Source Code of org.commoncrawl.util.shared.CharsetUtils

package org.commoncrawl.util.shared;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.commoncrawl.io.shared.NIOHttpHeaders;
import org.commoncrawl.util.shared.HttpHeaderUtils.ContentTypeAndCharset;
import org.mozilla.intl.chardet.nsDetector;
import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
import org.mozilla.intl.chardet.nsPSMDetector;

import com.google.common.collect.ImmutableMap;

/**
*
* Charset detection and decoding helpers
*
* @author rana
*
*/
public class CharsetUtils {

  /** commons-logging logger for this class; used to report charset lookup and decode failures **/
  public static final Log LOG = LogFactory.getLog(CharsetUtils.class);

  /** check for a normalized (java friendly) alias for the original charset name **/
  public static String aliasCharset(String inputCharset) {
    String alias = aliasTable.get(inputCharset.toLowerCase());
    return (alias != null) ? alias : inputCharset;
  }

  public static String bestEffortDecodeBytes(NIOHttpHeaders headers, byte[] crawlData,int offset,int length)
      throws IOException {

    ContentTypeAndCharset urlMetadata = new ContentTypeAndCharset();

    HttpHeaderUtils.parseContentType(headers, urlMetadata);

    if (urlMetadata._charset != null && crawlData != null) {
      if (urlMetadata._contentType != null
          && urlMetadata._contentType.equalsIgnoreCase("text/html")) {
        // sniff encoding in metadata ...
        String alternateCharset = CharsetUtils
            .sniffCharacterEncoding(crawlData,offset,length);
        if (alternateCharset != null) {
          urlMetadata._charset = alternateCharset;
        }
      }
    }

    // now if charset is REALLY still not available ...
    if (urlMetadata._charset == null && crawlData != null) {

      if (urlMetadata._contentType != null
          && MimeTypeFilter.isTextType(urlMetadata._contentType)) {
        // try to detect the charset from the stream ...
        String detectedCharset = CharsetUtils
            .detectCharacterEncoding(crawlData,offset,length);

        if (detectedCharset != null) {
          urlMetadata._charset = (detectedCharset);
        } else {
          // manually set charset to ASCII :-(
          urlMetadata._charset = ("ASCII");
        }
      }
    }

    // finally if charset is available ... try to alias it ...
    if (urlMetadata._charset != null) {
      String aliasValue = aliasTable.get(urlMetadata._charset.toLowerCase());
      if (aliasValue != null) {
        urlMetadata._charset = aliasValue;
      }
    }

    // now try to find it via java names
    if (urlMetadata._charset != null && crawlData != null) {
      Charset charset = null;
      try {
        charset = Charset.forName(urlMetadata._charset);
      } catch (Exception e) {
        LOG.error(CCStringUtils.stringifyException(e));
        // try to detect the charset from the stream ...
        String detectedCharset = CharsetUtils
            .detectCharacterEncoding(crawlData,offset,length);
        if (detectedCharset != null) {
          try {
            charset = Charset.forName(detectedCharset);
          } catch (Exception e2) {
            LOG.error(CCStringUtils.stringifyException(e));
          }
        }
      }
      if (charset != null) {
        try {
          CharBuffer ucs2Chars = charset.decode(ByteBuffer.wrap(crawlData,offset,length));
          return ucs2Chars.toString();
        } catch (Exception e) {
          LOG.error(CCStringUtils.stringifyException(e));
        }
      }
    } else {
      LOG.error("NULL_CHARSET_PASSED_TO_PARSER");
    }

    return null;
  }

  /** maximum number of leading bytes sniffCharacterEncoding inspects **/
  private static final int CHUNK_SIZE     = 2000;

  /**
   * case-insensitive match for a meta tag whose attributes contain
   * http-equiv=content-type (optionally double quoted); group 1 is the
   * attribute text between "&lt;meta " and the closing "&gt;"
   **/
  private static Pattern   metaPattern    = Pattern
                                              .compile(
                                                  "<meta\\s+([^>]*http-equiv=\"?content-type\"?[^>]*)>",
                                                  Pattern.CASE_INSENSITIVE);
  /**
   * case-insensitive match for "charset=" followed by optional whitespace;
   * group 1 is the charset token (a letter followed by letters, digits, '_'
   * or '-')
   **/
  private static Pattern   charsetPattern = Pattern
                                              .compile(
                                                  "charset=\\s*([a-z][_\\-0-9a-z]*)",
                                                  Pattern.CASE_INSENSITIVE);

  /**
   * Given a <code>byte[]</code> representing an html file of an
   * <em>unknown</em> encoding, read out 'charset' parameter in the meta tag
   * from the first <code>CHUNK_SIZE</code> bytes. If there's no meta tag for
   * Content-Type or no charset is specified, <code>null</code> is returned. <br />
   * FIXME: non-byte oriented character encodings (UTF-16, UTF-32) can't be
   * handled with this. We need to do something similar to what's done by
   * mozilla
   * (http://lxr.mozilla.org/seamonkey/source/parser/htmlparser/src/nsParser
   * .cpp#1993). See also http://www.w3.org/TR/REC-xml/#sec-guessing <br />
   *
   * @param content
   *          <code>byte[]</code> representation of an html file
   */

  public static String sniffCharacterEncoding(byte[] content,int offset,int length) {

    // LOG.info("ENTERING SNIFFCHARENCODING...");
    length = length < CHUNK_SIZE ? length : CHUNK_SIZE;

    // We don't care about non-ASCII parts so that it's sufficient
    // to just inflate each byte to a 16-bit value by padding.
    // For instance, the sequence {0x41, 0x82, 0xb7} will be turned into
    // {U+0041, U+0082, U+00B7}.
    String str = "";
    try {
      str = new String(content, offset, length, Charset.forName("ASCII").toString());
    } catch (UnsupportedEncodingException e) {
      // code should never come here, but just in case...
      return null;
    }

    // LOG.info("RUNNING METAPATTERN MATCHER...");
    Matcher metaMatcher = metaPattern.matcher(str);
    String encoding = null;

    if (metaMatcher.find()) {
      // LOG.info("RUNNING CHARSET PATTERN MATCHER...");
      Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
      if (charsetMatcher.find())
        encoding = new String(charsetMatcher.group(1));
    }

    // LOG.info("ENCODING IS:" + encoding);
    return encoding;
  }

  public static class DetectorState implements nsICharsetDetectionObserver {

    public boolean _done            = false;
    public String  _detectedCharset = null;

    @Override
    public void Notify(String charset) {
      _detectedCharset = charset;
      _done = true;
    }

  }

  /** upper bound on the number of bytes detectCharacterEncoding feeds to the charset detector **/
  private static int MAX_CHARS_TO_DETECT = 16000;

  /** last resort - detect encoding using charset detector **/
  public static String detectCharacterEncoding(byte[] content,int offset,int length) {

    if (content != null && content.length != 0) {

      DetectorState state = new DetectorState();

      nsDetector detector = new nsDetector(nsPSMDetector.ALL);

      detector.Init(state);

      if (offset != 0) {
        byte[] contentCopy = new byte[Math.min(length, MAX_CHARS_TO_DETECT)];
        System.arraycopy(content,offset, contentCopy,0,length);
        content = contentCopy;
      }
     
      boolean isAscii = detector.isAscii(content, content.length);

      if (!isAscii) {
        isAscii = detector.DoIt(content, Math.min(content.length,
            MAX_CHARS_TO_DETECT), false);
      }
      detector.DataEnd();

      if (isAscii) {
        return "ASCII";
      } else if (state._detectedCharset != null) {
        return state._detectedCharset;
      } else {
        String prob[] = detector.getProbableCharsets();
        if (prob != null && prob.length != 0) {
          return prob[0];
        }
      }
    }
    return null;
  }

  /**
   * charset alias table consulted by aliasCharset and bestEffortDecodeBytes,
   * which lower-case the name before the lookup; values are the names later
   * handed to Charset.forName. NOTE(review): presumably assigned from the
   * builder filled in the static initializer that follows - confirm.
   **/
  static final ImmutableMap<String, String> aliasTable;

  static {

    ImmutableMap.Builder<String, String> builder = new ImmutableMap.Builder<String, String>();

    builder.put("ibm-1208", "UTF-8");
    builder.put("ibm-1209", "UTF-8");
    builder.put("ibm-5304", "UTF-8");
    builder.put("ibm-5305", "UTF-8");
    builder.put("ibm-13496", "UTF-8");
    builder.put("ibm-13497", "UTF-8");
    builder.put("ibm-17592", "UTF-8");
    builder.put("ibm-17593", "UTF-8");
    builder.put("windows-65001", "UTF-8");
    builder.put("cp1208", "UTF-8");
    builder.put("iso-10646-ucs-2", "UTF-16");
    builder.put("ibm-1204", "UTF-16");
    builder.put("ibm-1205", "UTF-16");
    builder.put("unicode", "UTF-16");
    builder.put("csunicode", "UTF-16");
    builder.put("ucs-2", "UTF-16");
    builder.put("x-utf-16be", "UTF-16BE");
    builder.put("unicodebigunmarked", "UTF-16BE");
    builder.put("ibm-1200", "UTF-16BE");
    builder.put("ibm-1201", "UTF-16BE");
    builder.put("ibm-13488", "UTF-16BE");
    builder.put("ibm-13489", "UTF-16BE");
    builder.put("ibm-17584", "UTF-16BE");
    builder.put("ibm-17585", "UTF-16BE");
    builder.put("ibm-21680", "UTF-16BE");
    builder.put("ibm-21681", "UTF-16BE");
    builder.put("ibm-25776", "UTF-16BE");
    builder.put("ibm-25777", "UTF-16BE");
    builder.put("ibm-29872", "UTF-16BE");
    builder.put("ibm-29873", "UTF-16BE");
    builder.put("ibm-61955", "UTF-16BE");
    builder.put("ibm-61956", "UTF-16BE");
    builder.put("windows-1201", "UTF-16BE");
    builder.put("cp1200", "UTF-16BE");
    builder.put("cp1201", "UTF-16BE");
    builder.put("utf16_bigendian", "UTF-16BE");
    builder.put("x-utf-16le", "UTF-16LE");
    builder.put("unicodelittleunmarked", "UTF-16LE");
    builder.put("ibm-1202", "UTF-16LE");
    builder.put("ibm-1203", "UTF-16LE");
    builder.put("ibm-13490", "UTF-16LE");
    builder.put("ibm-13491", "UTF-16LE");
    builder.put("ibm-17586", "UTF-16LE");
    builder.put("ibm-17587", "UTF-16LE");
    builder.put("ibm-21682", "UTF-16LE");
    builder.put("ibm-21683", "UTF-16LE");
    builder.put("ibm-25778", "UTF-16LE");
    builder.put("ibm-25779", "UTF-16LE");
    builder.put("ibm-29874", "UTF-16LE");
    builder.put("ibm-29875", "UTF-16LE");
    builder.put("utf16_littleendian", "UTF-16LE");
    builder.put("windows-1200", "UTF-16LE");
    builder.put("ibm-819", "ISO-8859-1");
    builder.put("ibm819", "ISO-8859-1");
    builder.put("cp819", "ISO-8859-1");
    builder.put("latin1", "ISO-8859-1");
    builder.put("8859_1", "ISO-8859-1");
    builder.put("csisolatin1", "ISO-8859-1");
    builder.put("iso-ir-100", "ISO-8859-1");
    builder.put("iso_8859-1:1987", "ISO-8859-1");
    builder.put("l1", "ISO-8859-1");
    builder.put("819", "ISO-8859-1");
    builder.put("ascii", "US-ASCII");
    builder.put("ansi_x3.4-1968", "US-ASCII");
    builder.put("ansi_x3.4-1986", "US-ASCII");
    builder.put("iso_646.irv:1991", "US-ASCII");
    builder.put("iso_646.irv:1983", "US-ASCII");
    builder.put("iso646-us", "US-ASCII");
    builder.put("us", "US-ASCII");
    builder.put("csascii", "US-ASCII");
    builder.put("iso-ir-6", "US-ASCII");
    builder.put("cp367", "US-ASCII");
    builder.put("ascii7", "US-ASCII");
    builder.put("646", "US-ASCII");
    builder.put("windows-20127", "US-ASCII");
    builder.put("ibm-367", "US-ASCII");
    builder.put("ibm367", "US-ASCII");
    builder.put("ibm-912_p100-1995", "ISO-8859-2");
    builder.put("ibm-912", "ISO-8859-2");
    builder.put("iso_8859-2:1987", "ISO-8859-2");
    builder.put("latin2", "ISO-8859-2");
    builder.put("csisolatin2", "ISO-8859-2");
    builder.put("iso-ir-101", "ISO-8859-2");
    builder.put("l2", "ISO-8859-2");
    builder.put("8859_2", "ISO-8859-2");
    builder.put("cp912", "ISO-8859-2");
    builder.put("912", "ISO-8859-2");
    builder.put("windows-28592", "ISO-8859-2");
    builder.put("ibm-913_p100-2000", "ISO-8859-3");
    builder.put("ibm-913", "ISO-8859-3");
    builder.put("iso_8859-3:1988", "ISO-8859-3");
    builder.put("latin3", "ISO-8859-3");
    builder.put("csisolatin3", "ISO-8859-3");
    builder.put("iso-ir-109", "ISO-8859-3");
    builder.put("l3", "ISO-8859-3");
    builder.put("8859_3", "ISO-8859-3");
    builder.put("cp913", "ISO-8859-3");
    builder.put("913", "ISO-8859-3");
    builder.put("windows-28593", "ISO-8859-3");
    builder.put("ibm-914_p100-1995", "ISO-8859-4");
    builder.put("ibm-914", "ISO-8859-4");
    builder.put("latin4", "ISO-8859-4");
    builder.put("csisolatin4", "ISO-8859-4");
    builder.put("iso-ir-110", "ISO-8859-4");
    builder.put("iso_8859-4:1988", "ISO-8859-4");
    builder.put("l4", "ISO-8859-4");
    builder.put("8859_4", "ISO-8859-4");
    builder.put("cp914", "ISO-8859-4");
    builder.put("914", "ISO-8859-4");
    builder.put("windows-28594", "ISO-8859-4");
    builder.put("ibm-915_p100-1995", "ISO-8859-5");
    builder.put("ibm-915", "ISO-8859-5");
    builder.put("cyrillic", "ISO-8859-5");
    builder.put("csisolatincyrillic", "ISO-8859-5");
    builder.put("iso-ir-144", "ISO-8859-5");
    builder.put("iso_8859-5:1988", "ISO-8859-5");
    builder.put("8859_5", "ISO-8859-5");
    builder.put("cp915", "ISO-8859-5");
    builder.put("915", "ISO-8859-5");
    builder.put("windows-28595", "ISO-8859-5");
    builder.put("ibm-1089_p100-1995", "ISO-8859-6");
    builder.put("ibm-1089", "ISO-8859-6");
    builder.put("arabic", "ISO-8859-6");
    builder.put("csisolatinarabic", "ISO-8859-6");
    builder.put("iso-ir-127", "ISO-8859-6");
    builder.put("iso_8859-6:1987", "ISO-8859-6");
    builder.put("ecma-114", "ISO-8859-6");
    builder.put("asmo-708", "ISO-8859-6");
    builder.put("8859_6", "ISO-8859-6");
    builder.put("cp1089", "ISO-8859-6");
    builder.put("1089", "ISO-8859-6");
    builder.put("windows-28596", "ISO-8859-6");
    builder.put("iso-8859-6-i", "ISO-8859-6");
    builder.put("iso-8859-6-e", "ISO-8859-6");
    builder.put("ibm-9005_x110-2007", "ISO-8859-7");
    builder.put("ibm-9005", "ISO-8859-7");
    builder.put("greek", "ISO-8859-7");
    builder.put("greek8", "ISO-8859-7");
    builder.put("elot_928", "ISO-8859-7");
    builder.put("ecma-118", "ISO-8859-7");
    builder.put("csisolatingreek", "ISO-8859-7");
    builder.put("iso-ir-126", "ISO-8859-7");
    builder.put("iso_8859-7:1987", "ISO-8859-7");
    builder.put("windows-28597", "ISO-8859-7");
    builder.put("sun_eu_greek", "ISO-8859-7");
    builder.put("ibm-813_p100-1995", "ISO-8859-7");
    builder.put("ibm-813", "ISO-8859-7");
    builder.put("8859_7", "ISO-8859-7");
    builder.put("cp813", "ISO-8859-7");
    builder.put("813", "ISO-8859-7");
    builder.put("ibm-5012_p100-1999", "ISO-8859-8");
    builder.put("ibm-5012", "ISO-8859-8");
    builder.put("hebrew", "ISO-8859-8");
    builder.put("csisolatinhebrew", "ISO-8859-8");
    builder.put("iso-ir-138", "ISO-8859-8");
    builder.put("iso_8859-8:1988", "ISO-8859-8");
    builder.put("iso-8859-8-i", "ISO-8859-8");
    builder.put("iso-8859-8-e", "ISO-8859-8");
    builder.put("8859_8", "ISO-8859-8");
    builder.put("windows-28598", "ISO-8859-8");
    builder.put("hebrew8", "ISO-8859-8");
    builder.put("ibm-916_p100-1995", "ibm-916");
    builder.put("cp916", "ibm-916");
    builder.put("916", "ibm-916");
    builder.put("ibm-920_p100-1995", "ISO-8859-9");
    builder.put("ibm-920", "ISO-8859-9");
    builder.put("latin5", "ISO-8859-9");
    builder.put("csisolatin5", "ISO-8859-9");
    builder.put("iso-ir-148", "ISO-8859-9");
    builder.put("iso_8859-9:1989", "ISO-8859-9");
    builder.put("l5", "ISO-8859-9");
    builder.put("8859_9", "ISO-8859-9");
    builder.put("cp920", "ISO-8859-9");
    builder.put("920", "ISO-8859-9");
    builder.put("windows-28599", "ISO-8859-9");
    builder.put("ecma-128", "ISO-8859-9");
    builder.put("turkish8", "ISO-8859-9");
    builder.put("turkish", "ISO-8859-9");
    builder.put("ibm-921_p100-1995", "ISO-8859-13");
    builder.put("ibm-921", "ISO-8859-13");
    builder.put("8859_13", "ISO-8859-13");
    builder.put("windows-28603", "ISO-8859-13");
    builder.put("cp921", "ISO-8859-13");
    builder.put("921", "ISO-8859-13");
    builder.put("ibm-923_p100-1998", "ISO-8859-15");
    builder.put("ibm-923", "ISO-8859-15");
    builder.put("latin-9", "ISO-8859-15");
    builder.put("l9", "ISO-8859-15");
    builder.put("8859_15", "ISO-8859-15");
    builder.put("latin0", "ISO-8859-15");
    builder.put("csisolatin0", "ISO-8859-15");
    builder.put("csisolatin9", "ISO-8859-15");
    builder.put("iso8859_15_fdis", "ISO-8859-15");
    builder.put("cp923", "ISO-8859-15");
    builder.put("923", "ISO-8859-15");
    builder.put("windows-28605", "ISO-8859-15");
    builder.put("ibm-943_p15a-2003", "Shift_JIS");
    builder.put("ms_kanji", "Shift_JIS");
    builder.put("csshiftjis", "Shift_JIS");
    builder.put("windows-31j", "Shift_JIS");
    builder.put("cswindows31j", "Shift_JIS");
    builder.put("x-sjis", "Shift_JIS");
    builder.put("x-ms-cp932", "Shift_JIS");
    builder.put("cp932", "Shift_JIS");
    builder.put("windows-932", "Shift_JIS");
    builder.put("cp943c", "Shift_JIS");
    builder.put("ibm-943c", "Shift_JIS");
    builder.put("ms932", "Shift_JIS");
    builder.put("pck", "Shift_JIS");
    builder.put("sjis", "Shift_JIS");
    builder.put("s-jis", "Shift_JIS");
    builder.put("ibm-943_vsub_vpua", "Shift_JIS");
    builder.put("ibm-943_p130-1999", "x-IBM943");
    builder.put("ibm-943", "x-IBM943");
    builder.put("shift_jis", "x-IBM943");
    builder.put("943", "x-IBM943");
    builder.put("ibm-943_vascii_vsub_vpu", "x-IBM943");
    builder.put("cp943", "x-IBM943");
    builder.put("ibm-33722_p120-1999", "x-IBM33722");
    builder.put("ibm-5050", "x-IBM33722");
    builder.put("cp33722", "x-IBM33722");
    builder.put("33722", "x-IBM33722");
    builder.put("ibm-33722_vascii_vpua", "x-IBM33722");
    builder.put("ibm-954_p101-2007", "x-JISAutoDetect");
    builder.put("ibm-954", "x-JISAutoDetect");
    builder.put("euc-jp", "x-JISAutoDetect");
    builder.put("cseucpkdfmtjapanese", "x-JISAutoDetect");
    builder.put("x-euc-jp", "x-JISAutoDetect");
    builder.put("eucjis", "x-JISAutoDetect");
    builder.put("ujis", "x-JISAutoDetect");
    builder.put("windows-950-2000", "Big5");
    builder.put("csbig5", "Big5");
    builder.put("windows-950", "Big5");
    builder.put("x-big5", "Big5");
    builder.put("ibm-950_p110-1999", "x-IBM950");
    builder.put("ibm-950", "x-IBM950");
    builder.put("cp950", "x-IBM950");
    builder.put("950", "x-IBM950");
    builder.put("ibm-1375_p100-2007", "Big5-HKSCS");
    builder.put("ibm-1375", "Big5-HKSCS");
    builder.put("big5-hkscs", "Big5-HKSCS");
    builder.put("big5hk", "Big5-HKSCS");
    builder.put("hkscs-big5", "Big5-HKSCS");
    builder.put("ibm-5471_p100-2006", "x-MS950-HKSCS");
    builder.put("ibm-5471", "x-MS950-HKSCS");
    builder.put("ms950_hkscs", "x-MS950-HKSCS");
    builder.put("hkbig5", "x-MS950-HKSCS");
    builder.put("big5-hkscs:unicode3.0", "x-MS950-HKSCS");
    builder.put("windows-936-2000", "GBK");
    builder.put("cp936", "GBK");
    builder.put("ms936", "GBK");
    builder.put("windows-936", "GBK");
    builder.put("ibm-1383_p110-1999", "GB2312");
    builder.put("ibm-1383", "GB2312");
    builder.put("csgb2312", "GB2312");
    builder.put("cp1383", "GB2312");
    builder.put("1383", "GB2312");
    builder.put("euc-cn", "GB2312");
    builder.put("ibm-euccn", "GB2312");
    builder.put("hp15cn", "GB2312");
    builder.put("ibm-1383_vpua", "GB2312");
    builder.put("ibm-964_p110-1999", "x-IBM964");
    builder.put("ibm-964", "x-IBM964");
    builder.put("euc-tw", "x-IBM964");
    builder.put("ibm-euctw", "x-IBM964");
    builder.put("cns11643", "x-IBM964");
    builder.put("cp964", "x-IBM964");
    builder.put("964", "x-IBM964");
    builder.put("ibm-964_vpua", "x-IBM964");
    builder.put("ibm-949_p110-1999", "x-IBM949");
    builder.put("ibm-949", "x-IBM949");
    builder.put("cp949", "x-IBM949");
    builder.put("949", "x-IBM949");
    builder.put("ibm-949_vascii_vsub_vpua", "x-IBM949");
    builder.put("ibm-970_p110_p110-2006_u2", "EUC-KR");
    builder.put("ibm-970", "EUC-KR");
    builder.put("euc-kr", "EUC-KR");
    builder.put("ks_c_5601-1987", "EUC-KR");
    builder.put("windows-51949", "EUC-KR");
    builder.put("cseuckr", "EUC-KR");
    builder.put("ibm-euckr", "EUC-KR");
    builder.put("ksc_5601", "EUC-KR");
    builder.put("5601", "EUC-KR");
    builder.put("cp970", "EUC-KR");
    builder.put("970", "EUC-KR");
    builder.put("ibm-970_vpua", "EUC-KR");
    builder.put("windows-949-2000", "x-windows-949");
    builder.put("windows-949", "x-windows-949");
    builder.put("ks_c_5601-1989", "x-windows-949");
    builder.put("csksc56011987", "x-windows-949");
    builder.put("korean", "x-windows-949");
    builder.put("iso-ir-149", "x-windows-949");
    builder.put("ms949", "x-windows-949");
    builder.put("windows-874-2000", "x-windows-874");
    builder.put("windows-874", "x-windows-874");
    builder.put("ms874", "x-windows-874");
    builder.put("ibm-874_p100-1995", "x-IBM874");
    builder.put("ibm-874", "x-IBM874");
    builder.put("ibm-9066", "x-IBM874");
    builder.put("cp874", "x-IBM874");
    builder.put("tis-620", "x-IBM874");
    builder.put("tis620.2533", "x-IBM874");
    builder.put("eucth", "x-IBM874");
    builder.put("ibm-437_p100-1995", "IBM437");
    builder.put("ibm437", "IBM437");
    builder.put("cp437", "IBM437");
    builder.put("437", "IBM437");
    builder.put("cspc8codepage437", "IBM437");
    builder.put("windows-437", "IBM437");
    builder.put("ibm-737_p100-1997", "x-IBM737");
    builder.put("ibm-737", "x-IBM737");
    builder.put("ibm737", "x-IBM737");
    builder.put("cp737", "x-IBM737");
    builder.put("windows-737", "x-IBM737");
    builder.put("737", "x-IBM737");
    builder.put("ibm-775_p100-1996", "IBM775");
    builder.put("ibm-775", "IBM775");
    builder.put("ibm775", "IBM775");
    builder.put("cp775", "IBM775");
    builder.put("cspc775baltic", "IBM775");
    builder.put("windows-775", "IBM775");
    builder.put("775", "IBM775");
    builder.put("ibm-850_p100-1995", "IBM850");
    builder.put("ibm-850", "IBM850");
    builder.put("ibm850", "IBM850");
    builder.put("cp850", "IBM850");
    builder.put("850", "IBM850");
    builder.put("cspc850multilingual", "IBM850");
    builder.put("windows-850", "IBM850");
    builder.put("ibm-852_p100-1995", "IBM852");
    builder.put("ibm-852", "IBM852");
    builder.put("ibm852", "IBM852");
    builder.put("cp852", "IBM852");
    builder.put("852", "IBM852");
    builder.put("cspcp852", "IBM852");
    builder.put("windows-852", "IBM852");
    builder.put("ibm-855_p100-1995", "IBM855");
    builder.put("ibm-855", "IBM855");
    builder.put("ibm855", "IBM855");
    builder.put("cp855", "IBM855");
    builder.put("855", "IBM855");
    builder.put("csibm855", "IBM855");
    builder.put("cspcp855", "IBM855");
    builder.put("windows-855", "IBM855");
    builder.put("ibm-856_p100-1995", "x-IBM856");
    builder.put("ibm-856", "x-IBM856");
    builder.put("ibm856", "x-IBM856");
    builder.put("cp856", "x-IBM856");
    builder.put("856", "x-IBM856");
    builder.put("ibm-857_p100-1995", "IBM857");
    builder.put("ibm-857", "IBM857");
    builder.put("cp857", "IBM857");
    builder.put("857", "IBM857");
    builder.put("csibm857", "IBM857");
    builder.put("windows-857", "IBM857");
    builder.put("ibm-858_p100-1997", "IBM00858");
    builder.put("ibm-858", "IBM00858");
    builder.put("ccsid00858", "IBM00858");
    builder.put("cp00858", "IBM00858");
    builder.put("pc-multilingual-850+euro", "IBM00858");
    builder.put("cp858", "IBM00858");
    builder.put("windows-858", "IBM00858");
    builder.put("ibm-860_p100-1995", "IBM860");
    builder.put("ibm-860", "IBM860");
    builder.put("cp860", "IBM860");
    builder.put("860", "IBM860");
    builder.put("csibm860", "IBM860");
    builder.put("ibm-861_p100-1995", "IBM861");
    builder.put("ibm-861", "IBM861");
    builder.put("cp861", "IBM861");
    builder.put("861", "IBM861");
    builder.put("cp-is", "IBM861");
    builder.put("csibm861", "IBM861");
    builder.put("windows-861", "IBM861");
    builder.put("ibm-862_p100-1995", "IBM862");
    builder.put("ibm-862", "IBM862");
    builder.put("cp862", "IBM862");
    builder.put("862", "IBM862");
    builder.put("cspc862latinhebrew", "IBM862");
    builder.put("dos-862", "IBM862");
    builder.put("windows-862", "IBM862");
    builder.put("ibm-863_p100-1995", "");
    builder.put("ibm-863", "IBM863");
    builder.put("cp863", "IBM863");
    builder.put("863", "IBM863");
    builder.put("csibm863", "IBM863");
    builder.put("ibm-864_x110-1999", "IBM864");
    builder.put("ibm-864", "IBM864");
    builder.put("cp864", "IBM864");
    builder.put("csibm864", "IBM864");
    builder.put("ibm-865_p100-1995", "IBM865");
    builder.put("ibm-865", "IBM865");
    builder.put("cp865", "IBM865");
    builder.put("865", "IBM865");
    builder.put("csibm865", "IBM865");
    builder.put("ibm-866_p100-1995", "IBM866");
    builder.put("ibm-866", "IBM866");
    builder.put("cp866", "IBM866");
    builder.put("866", "IBM866");
    builder.put("csibm866", "IBM866");
    builder.put("windows-866", "IBM866");
    builder.put("ibm-868_p100-1995", "IBM868");
    builder.put("ibm-868", "IBM868");
    builder.put("cp868", "IBM868");
    builder.put("868", "IBM868");
    builder.put("csibm868", "IBM868");
    builder.put("cp-ar", "IBM868");
    builder.put("ibm-869_p100-1995", "IBM869");
    builder.put("ibm-869", "IBM869");
    builder.put("cp869", "IBM869");
    builder.put("869", "IBM869");
    builder.put("cp-gr", "IBM869");
    builder.put("csibm869", "IBM869");
    builder.put("windows-869", "IBM869");
    builder.put("ibm-878_p100-1996", "KOI8-R");
    builder.put("ibm-878", "KOI8-R");
    builder.put("koi8-r", "KOI8-R");
    builder.put("koi8", "KOI8-R");
    builder.put("cskoi8r", "KOI8-R");
    builder.put("windows-20866", "KOI8-R");
    builder.put("cp878", "KOI8-R");
    builder.put("ibm-922_p100-1999", "x-IBM922");
    builder.put("ibm-922", "x-IBM922");
    builder.put("ibm922", "x-IBM922");
    builder.put("cp922", "x-IBM922");
    builder.put("922", "x-IBM922");
    builder.put("ibm-5346_p100-1998", "windows-1250");
    builder.put("ibm-5346", "windows-1250");
    builder.put("cp1250", "windows-1250");
    builder.put("ibm-5347_p100-1998", "windows-1251");
    builder.put("ibm-5347", "windows-1251");
    builder.put("cp1251", "windows-1251");
    builder.put("ansi1251", "windows-1251");
    builder.put("ibm-5348_p100-1997", "windows-1252");
    builder.put("ibm-5348", "windows-1252");
    builder.put("cp1252", "windows-1252");
    builder.put("ibm-5349_p100-1998", "windows-1253");
    builder.put("ibm-5349", "windows-1253");
    builder.put("cp1253", "windows-1253");
    builder.put("ibm-5350_p100-1998", "windows-1254");
    builder.put("ibm-5350", "windows-1254");
    builder.put("cp1254", "windows-1254");
    builder.put("ibm-9447_p100-2002", "windows-1255");
    builder.put("ibm-9447", "windows-1255");
    builder.put("cp1255", "windows-1255");
    builder.put("ibm-9448_x100-2005", "windows-1256");
    builder.put("ibm-9448", "windows-1256");
    builder.put("cp1256", "windows-1256");
    builder.put("ibm-9449_p100-2002", "windows-1257");
    builder.put("ibm-9449", "windows-1257");
    builder.put("cp1257", "windows-1257");
    builder.put("ibm-5354_p100-1998", "windows-1258");
    builder.put("ibm-5354", "windows-1258");
    builder.put("cp1258", "windows-1258");
    builder.put("ibm-1006_p100-1995", "x-IBM1006");
    builder.put("ibm-1006", "x-IBM1006");
    builder.put("ibm1006", "x-IBM1006");
    builder.put("cp1006", "x-IBM1006");
    builder.put("1006", "x-IBM1006");
    builder.put("ibm-1098_p100-1995", "x-IBM1006");
    builder.put("ibm-1098", "x-IBM1006");
    builder.put("ibm1098", "x-IBM1006");
    builder.put("cp1098", "x-IBM1006");
    builder.put("1098", "x-IBM1006");
    builder.put("ibm-1124_p100-1996", "x-IBM1124");
    builder.put("ibm-1124", "x-IBM1124");
    builder.put("cp1124", "x-IBM1124");
    builder.put("1124", "x-IBM1124");
    builder.put("ISO_2022,locale=ja,version=0", "ISO-2022-JP");
    builder.put("iso-2022-jp", "ISO-2022-JP");
    builder.put("csiso2022jp", "ISO-2022-JP");
    builder.put("ISO_2022,locale=ko,version=0", "ISO-2022-KR");
    builder.put("iso-2022-kr", "ISO-2022-KR");
    builder.put("csiso2022kr", "ISO-2022-KR");
    builder.put("ISO_2022,locale=zh,version=0", "ISO-2022-CN");
    builder.put("iso-2022-cn", "ISO-2022-CN");
    builder.put("csiso2022cn", "ISO-2022-CN");
    builder.put("ibm-37_p100-1995", "IBM037");
    builder.put("ibm-37", "IBM037");
    builder.put("ibm-037", "IBM037");
    builder.put("ebcdic-cp-us", "IBM037");
    builder.put("ebcdic-cp-ca", "IBM037");
    builder.put("ebcdic-cp-wt", "IBM037");
    builder.put("ebcdic-cp-nl", "IBM037");
    builder.put("csibm037", "IBM037");
    builder.put("cp037", "IBM037");
    builder.put("37", "IBM037");
    builder.put("cpibm37", "IBM037");
    builder.put("cp37", "IBM037");
    builder.put("ibm-273_p100-1995", "IBM273");
    builder.put("ibm-273", "IBM273");
    builder.put("cp273", "IBM273");
    builder.put("csibm273", "IBM273");
    builder.put("ebcdic-de", "IBM273");
    builder.put("273", "IBM273");
    builder.put("ibm-277_p100-1995", "IBM277");
    builder.put("ibm-277", "IBM277");
    builder.put("cp277", "IBM277");
    builder.put("ebcdic-cp-dk", "IBM277");
    builder.put("ebcdic-cp-no", "IBM277");
    builder.put("csibm277", "IBM277");
    builder.put("ebcdic-dk", "IBM277");
    builder.put("277", "IBM277");
    builder.put("ibm-278_p100-1995", "IBM278");
    builder.put("ibm-278", "IBM278");
    builder.put("cp278", "IBM278");
    builder.put("ebcdic-cp-fi", "IBM278");
    builder.put("ebcdic-cp-se", "IBM278");
    builder.put("csibm278", "IBM278");
    builder.put("ebcdic-sv", "IBM278");
    builder.put("278", "IBM278");
    builder.put("ibm-280_p100-1995", "IBM280");
    builder.put("ibm-280", "IBM280");
    builder.put("cp280", "IBM280");
    builder.put("ebcdic-cp-it", "IBM280");
    builder.put("csibm280", "IBM280");
    builder.put("280", "IBM280");
    builder.put("ibm-284_p100-1995", "IBM284");
    builder.put("ibm-284", "IBM284");
    builder.put("cp284", "IBM284");
    builder.put("ebcdic-cp-es", "IBM284");
    builder.put("csibm284", "IBM284");
    builder.put("cpibm284", "IBM284");
    builder.put("284", "IBM284");
    builder.put("ibm-285_p100-1995", "IBM285");
    builder.put("ibm-285", "IBM285");
    builder.put("cp285", "IBM285");
    builder.put("ebcdic-cp-gb", "IBM285");
    builder.put("csibm285", "IBM285");
    builder.put("cpibm285", "IBM285");
    builder.put("ebcdic-gb", "IBM285");
    builder.put("285", "IBM285");
    builder.put("ibm-297_p100-1995", "IBM297");
    builder.put("ibm-297", "IBM297");
    builder.put("cp297", "IBM297");
    builder.put("ebcdic-cp-fr", "IBM297");
    builder.put("csibm297", "IBM297");
    builder.put("cpibm297", "IBM297");
    builder.put("297", "IBM297");
    builder.put("ibm-420_x120-1999", "IBM420");
    builder.put("ibm-420", "IBM420");
    builder.put("ibm420", "IBM420");
    builder.put("cp420", "IBM420");
    builder.put("ebcdic-cp-ar1", "IBM420");
    builder.put("csibm420", "IBM420");
    builder.put("420", "IBM420");
    builder.put("ibm-424_p100-1995", "IBM424");
    builder.put("ibm-424", "IBM424");
    builder.put("cp424", "IBM424");
    builder.put("ebcdic-cp-he", "IBM424");
    builder.put("csibm424", "IBM424");
    builder.put("424", "IBM424");
    builder.put("ibm-500_p100-1995", "IBM500");
    builder.put("ibm-500", "IBM500");
    builder.put("cp500", "IBM500");
    builder.put("ebcdic-cp-be", "IBM500");
    builder.put("csibm500", "IBM500");
    builder.put("ebcdic-cp-ch", "IBM500");
    builder.put("500", "IBM500");
    builder.put("ibm-838_p100-1995", "IBM-Thai");
    builder.put("ibm-838", "IBM-Thai");
    builder.put("ibm838", "IBM-Thai");
    builder.put("csibmthai", "IBM-Thai");
    builder.put("cp838", "IBM-Thai");
    builder.put("838", "IBM-Thai");
    builder.put("ibm-9030", "IBM-Thai");
    builder.put("ibm-870_p100-1995", "IBM870");
    builder.put("ibm-870", "IBM870");
    builder.put("cp870", "IBM870");
    builder.put("ebcdic-cp-roece", "IBM870");
    builder.put("ebcdic-cp-yu", "IBM870");
    builder.put("csibm870", "IBM870");
    builder.put("ibm-871_p100-1995", "IBM871");
    builder.put("ibm-871", "IBM871");
    builder.put("ebcdic-cp-is", "IBM871");
    builder.put("csibm871", "IBM871");
    builder.put("cp871", "IBM871");
    builder.put("ebcdic-is", "IBM871");
    builder.put("871", "IBM871");
    builder.put("ibm-875_p100-1995", "x-IBM875");
    builder.put("ibm-875", "x-IBM875");
    builder.put("ibm875", "x-IBM875");
    builder.put("cp875", "x-IBM875");
    builder.put("875", "x-IBM875");
    builder.put("ibm-918_p100-1995", "IBM918");
    builder.put("ibm-918", "IBM918");
    builder.put("cp918", "IBM918");
    builder.put("ebcdic-cp-ar2", "IBM918");
    builder.put("csibm918", "IBM918");
    builder.put("ibm-930_p120-1999", "x-IBM930");
    builder.put("ibm-930", "x-IBM930");
    builder.put("ibm-5026", "x-IBM930");
    builder.put("ibm930", "x-IBM930");
    builder.put("cp930", "x-IBM930");
    builder.put("930", "x-IBM930");
    builder.put("ibm-933_p110-1995", "x-IBM933");
    builder.put("ibm-933", "x-IBM933");
    builder.put("cp933", "x-IBM933");
    builder.put("933", "x-IBM933");
    builder.put("ibm-935_p110-1999", "x-IBM935");
    builder.put("ibm-935", "x-IBM935");
    builder.put("cp935", "x-IBM935");
    builder.put("935", "x-IBM935");
    builder.put("ibm-937_p110-1999", "x-IBM937");
    builder.put("ibm-937", "x-IBM937");
    builder.put("cp937", "x-IBM937");
    builder.put("937", "x-IBM937");
    builder.put("ibm-939_p120-1999", "x-IBM939");
    builder.put("ibm-939", "x-IBM939");
    builder.put("ibm-931", "x-IBM939");
    builder.put("ibm-5035", "x-IBM939");
    builder.put("ibm939", "x-IBM939");
    builder.put("cp939", "x-IBM939");
    builder.put("939", "x-IBM939");
    builder.put("ibm-1025_p100-1995", "x-IBM1025");
    builder.put("ibm-1025", "x-IBM1025");
    builder.put("cp1025", "x-IBM1025");
    builder.put("1025", "x-IBM1025");
    builder.put("ibm-1026_p100-1995", "IBM1026");
    builder.put("ibm-1026", "IBM1026");
    builder.put("ibm1026", "IBM1026");
    builder.put("cp1026", "IBM1026");
    builder.put("csibm1026", "IBM1026");
    builder.put("1026", "IBM1026");
    builder.put("ibm-1047_p100-1995", "IBM1047");
    builder.put("ibm-1047", "IBM1047");
    builder.put("ibm1047", "IBM1047");
    builder.put("cp1047", "IBM1047");
    builder.put("1047", "IBM1047");
    builder.put("ibm-1097_p100-1995", "x-IBM1097");
    builder.put("ibm-1097", "x-IBM1097");
    builder.put("cp1097", "x-IBM1097");
    builder.put("1097", "x-IBM1097");
    builder.put("ibm-1112_p100-1995", "x-IBM1112");
    builder.put("ibm-1112", "x-IBM1112");
    builder.put("cp1112", "x-IBM1112");
    builder.put("1112", "x-IBM1112");
    builder.put("ibm-1122_p100-1999", "x-IBM1122");
    builder.put("ibm-1122", "x-IBM1122");
    builder.put("cp1122", "x-IBM1122");
    builder.put("1122", "x-IBM1122");
    builder.put("ibm-1123_p100-1995", "x-IBM1123");
    builder.put("ibm-1123", "x-IBM1123");
    builder.put("cp1123", "x-IBM1123");
    builder.put("1123", "x-IBM1123");
    builder.put("ibm-1140_p100-1997", "IBM01140");
    builder.put("ibm-1140", "IBM01140");
    builder.put("ccsid01140", "IBM01140");
    builder.put("cp01140", "IBM01140");
    builder.put("cp1140", "IBM01140");
    builder.put("ebcdic-us-37+euro", "IBM01140");
    builder.put("ibm-1141_p100-1997", "IBM01141");
    builder.put("ibm-1141", "IBM01141");
    builder.put("ccsid01141", "IBM01141");
    builder.put("cp01141", "IBM01141");
    builder.put("cp1141", "IBM01141");
    builder.put("ebcdic-de-273+euro", "IBM01141");
    builder.put("ibm-1142_p100-1997", "IBM01142");
    builder.put("ibm-1142", "IBM01142");
    builder.put("ccsid01142", "IBM01142");
    builder.put("cp01142", "IBM01142");
    builder.put("cp1142", "IBM01142");
    builder.put("ebcdic-dk-277+euro", "IBM01142");
    builder.put("ebcdic-no-277+euro", "IBM01142");
    builder.put("ibm-1143_p100-1997", "IBM01143");
    builder.put("ibm-1143", "IBM01143");
    builder.put("ccsid01143", "IBM01143");
    builder.put("cp01143", "IBM01143");
    builder.put("cp1143", "IBM01143");
    builder.put("ebcdic-fi-278+euro", "IBM01143");
    builder.put("ebcdic-se-278+euro", "IBM01143");
    builder.put("ibm-1144_p100-1997", "IBM01144");
    builder.put("ibm-1144", "IBM01144");
    builder.put("ccsid01144", "IBM01144");
    builder.put("cp01144", "IBM01144");
    builder.put("cp1144", "IBM01144");
    builder.put("ebcdic-it-280+euro", "IBM01144");
    builder.put("ibm-1145_p100-1997", "IBM01145");
    builder.put("ibm-1145", "IBM01145");
    builder.put("ccsid01145", "IBM01145");
    builder.put("cp01145", "IBM01145");
    builder.put("cp1145", "IBM01145");
    builder.put("ebcdic-es-284+euro", "IBM01145");
    builder.put("ibm-1146_p100-1997", "IBM01146");
    builder.put("ibm-1146", "IBM01146");
    builder.put("ccsid01146", "IBM01146");
    builder.put("cp01146", "IBM01146");
    builder.put("cp1146", "IBM01146");
    builder.put("ebcdic-gb-285+euro", "IBM01146");
    builder.put("ibm-1147_p100-1997", "IBM01147");
    builder.put("ibm-1147", "IBM01147");
    builder.put("ccsid01147", "IBM01147");
    builder.put("cp01147", "IBM01147");
    builder.put("cp1147", "IBM01147");
    builder.put("ebcdic-fr-297+euro", "IBM01147");
    builder.put("ibm-1148_p100-1997", "IBM01148");
    builder.put("ibm-1148", "IBM01148");
    builder.put("ccsid01148", "IBM01148");
    builder.put("cp01148", "IBM01148");
    builder.put("cp1148", "IBM01148");
    builder.put("ebcdic-international-500+euro", "IBM01148");
    builder.put("ibm-1149_p100-1997", "IBM01149");
    builder.put("ibm-1149", "IBM01149");
    builder.put("ccsid01149", "IBM01149");
    builder.put("cp01149", "IBM01149");
    builder.put("cp1149", "IBM01149");
    builder.put("ebcdic-is-871+euro", "IBM01149");
    aliasTable = builder.build();
  }

}
TOP

Related Classes of org.commoncrawl.util.shared.CharsetUtils

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.