Source Code of xbird.util.codec.UTF8Codec

/*
 * @(#)$Id: UTF8Converter.java 1001 2006-10-03 09:47:24Z yui $
 *
 * Copyright 2006-2008 Makoto YUI
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 * Contributors:
 *     Makoto YUI - initial implementation
 */
package xbird.util.codec;


import java.io.CharConversionException;
import java.io.IOException;
import java.io.OutputStream;


import xbird.util.collections.ints.IntArrayList;
import xbird.util.io.FastByteArrayOutputStream;


/**
 * This is a utility class encode/decode UTF-8.
 * <DIV lang="en"></DIV>
 * <DIV lang="ja"></DIV>
 * 
 * @author Makoto YUI <yuin405+xbird@gmail.com>
 */
public final class UTF8Codec {


    private static final char EOF = 0xFF; // non valid UTF8 String.


    private static final String ERR_MSG_INVALID_UTF8_ENCODING = "Invalid UTF-8 encoding";


    /** table for hexize. */
    private static final char[] hexSymbol = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
            'A', 'B', 'C', 'D', 'E', 'F' };


    private UTF8Codec() {}


    public static byte[] encode(final int[] cp, final int length) {
        // determine how many bytes are needed for the complete conversion
        final int bytesNeeded = requiredBytes(cp);
        // allocate a byte[] of the necessary size
        final byte[] utf8 = new byte[bytesNeeded];
        // do the conversion from character code points to utf-8
        for(int i = 0, bytes = 0; i < length; i++) {
            if(cp[i] < 0x80) {
                utf8[bytes++] = (byte) cp[i];
            } else if(cp[i] < 0x0800) {
                utf8[bytes++] = (byte) (cp[i] >> 6 | 0xC0);
                utf8[bytes++] = (byte) (cp[i] & 0x3F | 0x80);
            } else if(cp[i] < 0x10000) {
                utf8[bytes++] = (byte) (cp[i] >> 12 | 0xE0);
                utf8[bytes++] = (byte) (cp[i] >> 6 & 0x3F | 0x80);
                utf8[bytes++] = (byte) (cp[i] & 0x3F | 0x80);
            } else if(cp[i] < 0x200000) {
                utf8[bytes++] = (byte) (cp[i] >> 18 | 0xF0);
                utf8[bytes++] = (byte) (cp[i] >> 12 & 0x3F | 0x80);
                utf8[bytes++] = (byte) (cp[i] >> 6 & 0x3F | 0x80);
                utf8[bytes++] = (byte) (cp[i] & 0x3F | 0x80);
            } else if(cp[i] < 0x4000000) {
                utf8[bytes++] = (byte) (cp[i] >> 24 | 0xF8);
                utf8[bytes++] = (byte) ((cp[i] >> 18 & 0x3F) | 0x80);
                utf8[bytes++] = (byte) ((cp[i] >> 12 & 0x3F) | 0x80);
                utf8[bytes++] = (byte) ((cp[i] >> 6 & 0x3F) | 0x80);
                utf8[bytes++] = (byte) ((cp[i] & 0x3F) | 0x80);
            } else { // already verified
                utf8[bytes++] = (byte) (cp[i] >> 30 | 0xFC);
                utf8[bytes++] = (byte) ((cp[i] >> 24 & 0x3F) | 0x80);
                utf8[bytes++] = (byte) ((cp[i] >> 18 & 0x3F) | 0x80);
                utf8[bytes++] = (byte) ((cp[i] >> 12 & 0x3F) | 0x80);
                utf8[bytes++] = (byte) ((cp[i] >> 6 & 0x3F) | 0x80);
                utf8[bytes++] = (byte) ((cp[i] & 0x3F) | 0x80);
            }
        }
        return utf8;
    }


    /**
     * Converts an array of Unicode scalar values (code points) into
     * UTF-8. This algorithm works under the assumption that all
     * surrogate pairs have already been converted into scalar code
     * point values within the argument.
     * 
     * get detail on this page.
     * http://developers.sun.com/dev/gadc/technicalpublications/articles/utf8.html
     * 
     * @param cp an array of Unicode scalar values (code points)
     * @return a byte[] containing the UTF-8 encoded characters
     */
    public static byte[] encode(final int[] cp) {
        return encode(cp, cp.length);
    }


    public static int[] decode(final byte[] binary, final int len) {
        if(len == 0) {
            return null;
        }
        IntArrayList res = new IntArrayList((int) (len * 0.75));
        int code;
        for(int offset = 0; (code = decodeCharacter(binary, offset, 0, 0)) != EOF; offset += characterSize(code)) {
            res.add(code);
        }
        return res.toArray();
    }


    public static int[] decode(final byte[] binary) {
        return decode(binary, binary.length);
    }


    public static int characterAt(final byte[] binary, final int index) {
        if(index < 1) {
            throw new IllegalArgumentException("Index MUST be more than 0, but was " + index);
        }
        int code = -1;
        int i = 0, offset = 0;
        while(i++ < index && (code = decodeCharacter(binary, offset, 0, 0)) != EOF) {
            offset += characterSize(code);
        }
        return code;
    }


    public static int requiredBytes(final int[] cp) {
        int bytesNeeded = 0;
        final int length = cp.length;
        for(int i = 0; i < length; i++) {
            bytesNeeded += characterSize(cp[i]);
        }
        return bytesNeeded;
    }


    public static int characterSize(final int in) {
        if(in < 0) { // sanity check
            // minus value (11111111: -1, 10000000: -128)
            throw new IllegalStateException(ERR_MSG_INVALID_UTF8_ENCODING);
        }
        if(in < 0x80) {
            // in < 128 (1byte) 
            return 1;
        } else if(in < 0x0800) {
            // 128 <= in < 2048 (2bytes).
            return 2;
        } else if(in < 0x10000) {
            // 2048 <= in < 65536 (3bytes)
            return 3;
        } else if(in < 0x200000) {
            // 65536 <= in < 2097152 (4bytes) 
            return 4;
        } else if(in < 0x4000000) {
            // 2097152 <= in < 67108864 (5bytes)
            return 5;
        } else if(in <= 0x7FFFFFFF) {
            // 67108864 <= in < 2147483648 (6bytes)
            return 6;
        } else { // sanity check
            throw new IllegalStateException(ERR_MSG_INVALID_UTF8_ENCODING);
        }
    }


    private static int decodeCharacter(final byte[] binary, final int offset, int outCode, int moreBytes) {
        final int length = binary.length;
        for(int i = offset; i < length; i++) {
            final byte b = binary[i];
            // Decodes UTF-8.
            if(b >= 0 && moreBytes == 0) {
                // 0xxxxxxx
                return b;
            } else if(((b & 0xC0) == 0x80) && (moreBytes != 0)) {
                // 10xxxxxx (continuation byte)
                outCode = (outCode << 6) | (b & 0x3F); // Adds 6 bits to code.
                if(--moreBytes == 0) {
                    return outCode;
                } else if(moreBytes < 0) { // sanity check
                    throw new IllegalStateException(ERR_MSG_INVALID_UTF8_ENCODING);
                } else {
                    return decodeCharacter(binary, i + 1, outCode, moreBytes);
                }
            } else if(((b & 0xE0) == 0xC0) && (moreBytes == 0)) {
                // 110xxxxx
                outCode = b & 0x1F;
                return decodeCharacter(binary, i + 1, outCode, 1);
            } else if(((b & 0xF0) == 0xE0) && (moreBytes == 0)) {
                // 1110xxxx
                outCode = b & 0x0F;
                return decodeCharacter(binary, i + 1, outCode, 2);
            } else if(((b & 0xF8) == 0xF0) && (moreBytes == 0)) {
                // 11110xxx
                outCode = b & 0x07;
                return decodeCharacter(binary, i + 1, outCode, 3);
            } else if(((b & 0xFC) == 0xF8) && (moreBytes == 0)) {
                // 111110xx
                outCode = b & 0x03;
                return decodeCharacter(binary, i + 1, outCode, 4);
            } else if(((b & 0xFE) == 0xFC) && (moreBytes == 0)) {
                // 1111110x
                outCode = b & 0x01;
                return decodeCharacter(binary, i + 1, outCode, 5);
            } else { // sanity check
                throw new IllegalStateException(ERR_MSG_INVALID_UTF8_ENCODING);
            }
        }
        return EOF;
    }


    /**
     * Converts bytea to Hex chars.
     */
    public static char[] toHex(final byte[] b) {
        final int length = b.length;
        final char[] ret = new char[length * 2];
        for(int i = 0; i < length; i++) {
            final int high = ((b[i] & 0xF0) >> 4);
            ret[i] = hexSymbol[high];
            final int low = (b[i] & 0x0F);
            ret[i + 1] = hexSymbol[low];
        }
        return ret;
    }


    public static int toHex(final char b) {
        final int high = ((b & 0xF0) >> 4);
        final int low = (b & 0x0F);
        int ret = hexSymbol[high] << 4;
        ret = (ret & hexSymbol[low]);
        return ret;
    }


    public static void writeUTF8(char c, OutputStream out) throws IOException {
        writeUTF8((int) c, out);
    }


    public static void writeUTF8(char c, FastByteArrayOutputStream out) {
        writeUTF8((int) c, out);
    }


    public static void writeUTF8(int code, OutputStream out) throws IOException {
        if((code & 0xffffff80) == 0) {
            out.write(code);
        } else if((code & 0xfffff800) == 0) { // 2 bytes.
            out.write(0xc0 | (code >> 6));
            out.write(0x80 | (code & 0x3f));
        } else if((code & 0xffff0000) == 0) { // 3 bytes.
            out.write(0xe0 | (code >> 12));
            out.write(0x80 | ((code >> 6) & 0x3f));
            out.write(0x80 | (code & 0x3f));
        } else if((code & 0xff200000) == 0) { // 4 bytes.
            out.write(0xf0 | (code >> 18));
            out.write(0x80 | ((code >> 12) & 0x3f));
            out.write(0x80 | ((code >> 6) & 0x3f));
            out.write(0x80 | (code & 0x3f));
        } else if((code & 0xf4000000) == 0) { // 5 bytes.
            out.write(0xf8 | (code >> 24));
            out.write(0x80 | ((code >> 18) & 0x3f));
            out.write(0x80 | ((code >> 12) & 0x3f));
            out.write(0x80 | ((code >> 6) & 0x3f));
            out.write(0x80 | (code & 0x3f));
        } else if((code & 0x80000000) == 0) { // 6 bytes.
            out.write(0xfc | (code >> 30));
            out.write(0x80 | ((code >> 24) & 0x3f));
            out.write(0x80 | ((code >> 18) & 0x3f));
            out.write(0x80 | ((code >> 12) & 0x3F));
            out.write(0x80 | ((code >> 6) & 0x3F));
            out.write(0x80 | (code & 0x3F));
        } else {
            throw new CharConversionException("Illegal character: " + Integer.toHexString(code));
        }
    }


    public static void writeUTF8(int code, FastByteArrayOutputStream out) {
        if((code & 0xffffff80) == 0) {
            out.write(code);
        } else if((code & 0xfffff800) == 0) { // 2 bytes.
            out.write(0xc0 | (code >> 6));
            out.write(0x80 | (code & 0x3f));
        } else if((code & 0xffff0000) == 0) { // 3 bytes.
            out.write(0xe0 | (code >> 12));
            out.write(0x80 | ((code >> 6) & 0x3f));
            out.write(0x80 | (code & 0x3f));
        } else if((code & 0xff200000) == 0) { // 4 bytes.
            out.write(0xf0 | (code >> 18));
            out.write(0x80 | ((code >> 12) & 0x3f));
            out.write(0x80 | ((code >> 6) & 0x3f));
            out.write(0x80 | (code & 0x3f));
        } else if((code & 0xf4000000) == 0) { // 5 bytes.
            out.write(0xf8 | (code >> 24));
            out.write(0x80 | ((code >> 18) & 0x3f));
            out.write(0x80 | ((code >> 12) & 0x3f));
            out.write(0x80 | ((code >> 6) & 0x3f));
            out.write(0x80 | (code & 0x3f));
        } else if((code & 0x80000000) == 0) { // 6 bytes.
            out.write(0xfc | (code >> 30));
            out.write(0x80 | ((code >> 24) & 0x3f));
            out.write(0x80 | ((code >> 18) & 0x3f));
            out.write(0x80 | ((code >> 12) & 0x3F));
            out.write(0x80 | ((code >> 6) & 0x3F));
            out.write(0x80 | (code & 0x3F));
        } else {
            throw new IllegalArgumentException("Illegal character: " + Integer.toHexString(code));
        }
    }


}
Source Code of xbird.util.codec.UTF8Codec

Related Classes of xbird.util.codec.UTF8Codec