// BlogBridge -- RSS feed reader, manager, and web based service
// Copyright (C) 2002-2006 by R. Pito Salas
//
// This program is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free Software Foundation;
// either version 2 of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
// without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along with this program;
// if not, write to the Free Software Foundation, Inc., 59 Temple Place,
// Suite 330, Boston, MA 02111-1307 USA
//
// Contact: R. Pito Salas
// mailto:pitosalas@users.sourceforge.net
// More information: about BlogBridge
// http://www.blogbridge.com
// http://sourceforge.net/projects/blogbridge
//
// $Id: UTF8Reader.java,v 1.3 2006/01/08 05:00:10 kyank Exp $
//
package com.salas.bb.utils.xml;
import java.io.InputStream;
import java.io.IOException;
import java.io.CharConversionException;
import java.io.Reader;
/**
 * High-speed reader of any UTF-8-like stream. It is capable of reading both valid and
 * invalid streams. When it finds an invalid UTF-8 sequence (a lead byte that is not followed
 * by the expected continuation bytes, a sequence truncated by the end of the stream, or a
 * sequence that decodes to a value above U+10FFFF) it emits the lead byte as an ISO-8859-1
 * character and continues parsing from the byte that follows it. This approach guarantees
 * that <b>any</b> stream will be parsed, but there is no guarantee that the interpretation
 * of invalid sequences is the one the producer of the stream intended.
 *
 * Overlong encodings and encoded surrogates are not rejected; they are decoded as is.
 */
public final class UTF8Reader extends Reader
{
    /** Size of the internal byte buffer. */
    private static final int BUFFER_SIZE = 8192;

    /** Largest value that can be represented in UTF-16. */
    private static final int MAX_CODE_POINT = 0x10FFFF;

    /** Source stream; <code>null</code> once the reader has been closed. */
    private InputStream in;

    /** Raw bytes read from the stream; <code>null</code> once the reader has been closed. */
    private byte[] buffer;

    /** Index of the next byte in {@link #buffer} to be decoded. */
    private int start;

    /** Index one past the last valid byte in {@link #buffer}. */
    private int finish;

    /** Low surrogate that did not fit into the caller's buffer, or 0 if there is none. */
    private char secondHalf;

    /** Bits of the multi-byte character collected so far. */
    private int multibyteChar;

    /** Number of continuation bytes still expected for the current sequence. */
    private int multibyteCharsToGo;

    /** Number of continuation bytes already consumed for the current sequence. */
    private int multibyteCharsRead;

    /**
     * Creates UTF-8 reader-interpreter for the stream.
     *
     * @param stream source stream.
     */
    public UTF8Reader(InputStream stream)
    {
        in = stream;
        buffer = new byte[BUFFER_SIZE];
        start = 0;
        finish = 0;
        resetMultibyte();
    }

    /**
     * Closes the underlying stream and releases the internal buffer. After the reader has
     * been closed {@link #read(char[], int, int)} reports the end of the stream (-1).
     * Closing a previously-closed reader has no effect.
     *
     * @throws IOException If an I/O error occurs
     */
    public void close() throws IOException
    {
        if (in == null) return;

        // Drop all state first, so that the reader is consistently closed even
        // when closing the underlying stream fails.
        InputStream stream = in;
        in = null;
        buffer = null;
        start = 0;
        finish = 0;
        secondHalf = 0;
        resetMultibyte();

        stream.close();
    }

    /**
     * Tell whether this stream is ready to be read.
     *
     * @return TRUE if there is a pending character or undecoded bytes in the buffer, if
     *         the reader is closed, or if the underlying stream reports available bytes;
     *         false otherwise. Note that returning false does not guarantee that the
     *         next read will block.
     *
     * @throws IOException If an I/O error occurs
     */
    public boolean ready()
        throws IOException
    {
        return secondHalf != 0 || finish > start || in == null || in.available() != 0;
    }

    /**
     * Reads maximum <code>len</code> characters from stream into the target buffer starting
     * from specified <code>offset</code>. The method keeps reading from the underlying stream
     * until <code>len</code> characters are decoded or the stream is over. When the end of the
     * stream is reached the underlying stream is closed.
     *
     * @param buf       target buffer.
     * @param offset    offset in buffer.
     * @param len       max characters to read in.
     *
     * @return number of characters read, 0 if <code>len</code> is not positive,
     *         or -1 if the stream is over.
     *
     * @throws IOException in case of I/O error.
     * @throws IndexOutOfBoundsException if <code>len</code> is positive and the range
     *         <code>offset .. offset + len</code> does not fit into <code>buf</code>.
     */
    public int read(char[] buf, int offset, int len)
        throws IOException
    {
        if (len <= 0) return 0;

        // Validate the target range up front, so that no input is consumed (and lost)
        // before the failure is detected.
        if (offset < 0 || offset > buf.length - len)
        {
            throw new IndexOutOfBoundsException("offset=" + offset + ", len=" + len +
                ", buffer length=" + buf.length);
        }

        int index = 0;

        // Low surrogate left over from the previous call goes first
        if (secondHalf != 0)
        {
            buf[offset + index++] = secondHalf;
            secondHalf = 0;
        }

        while (index < len)
        {
            if (finish <= start && !fillBuffer())
            {
                if (multibyteCharsToGo > 0)
                {
                    // Stream finished in the middle of a sequence -- emit the lead byte
                    // as is and re-scan the continuation bytes collected so far
                    index = saveMultiByteStartAndRewind(buf, offset, index);
                    continue;
                }

                // Nothing left to decode -- close and exit
                close();
                break;
            }

            int ch = buffer[start] & 0xFF;

            if (multibyteCharsToGo > 0)
            {
                if ((ch & 0xC0) == 0x80)
                {
                    // valid continuation byte
                    multibyteChar = (multibyteChar << 6) | (ch & 0x3F);
                    multibyteCharsToGo--;
                    multibyteCharsRead++;
                    start++;

                    if (multibyteCharsToGo == 0)
                    {
                        if (multibyteChar > MAX_CODE_POINT)
                        {
                            // Not representable in UTF-16 -- handle as any other
                            // invalid sequence instead of failing the whole stream
                            index = saveMultiByteStartAndRewind(buf, offset, index);
                        } else
                        {
                            index = writeCodePoint(buf, offset, index, len);
                            resetMultibyte();
                        }
                    }
                } else
                {
                    // the sequence got broken -- write first byte as is and rewind to the
                    // first continuation byte
                    index = saveMultiByteStartAndRewind(buf, offset, index);
                }
            } else
            {
                // Find multi-byte sequence start, others - ASCII or ISO-8859-1
                if ((ch & 0xE0) == 0xC0)
                {
                    // 2 bytes (0x0080 - 0x07FF)
                    multibyteChar = ch & 0x1F;
                    multibyteCharsToGo = 1;
                } else if ((ch & 0xF0) == 0xE0)
                {
                    // 3 bytes (0x0800 - 0xFFFF)
                    multibyteChar = ch & 0x0F;
                    multibyteCharsToGo = 2;
                } else if ((ch & 0xF8) == 0xF0)
                {
                    // 4 bytes (0x0001 0000 - 0x001F FFFF)
                    multibyteChar = ch & 0x07;
                    multibyteCharsToGo = 3;
                } else
                {
                    buf[offset + index++] = (char)ch;
                }
                start++;
            }
        }

        // The loop is left with nothing written only when the stream is over
        return (index > 0) ? index : -1;
    }

    /**
     * Refills the internal buffer from the stream. When a multi-byte sequence is in
     * progress its lead byte and the continuation bytes consumed so far are moved to the
     * beginning of the buffer, so that rewinding stays possible after the refill.
     *
     * @return <code>true</code> if at least one new byte is available for decoding,
     *         <code>false</code> if the reader is closed or the stream is over.
     *
     * @throws IOException in case of I/O error.
     */
    private boolean fillBuffer()
        throws IOException
    {
        if (in == null) return false;

        int keep = 0;
        if (multibyteCharsToGo > 0)
        {
            keep = 1 + multibyteCharsRead;
            System.arraycopy(buffer, start - keep, buffer, 0, keep);
        }

        start = keep;
        finish = keep;

        int readCount = in.read(buffer, keep, buffer.length - keep);
        if (readCount <= 0) return false;

        finish = keep + readCount;
        return true;
    }

    /**
     * Writes the completely decoded character to the target buffer. Characters above
     * 0xFFFF are written as a surrogate pair; if there is no room for the low surrogate
     * it is remembered and delivered by the next read.
     *
     * @param buf       target buffer.
     * @param offset    offset in target buffer.
     * @param i         current index in target buffer relative to offset.
     * @param len       number of characters requested by the caller.
     *
     * @return new index value.
     */
    private int writeCodePoint(char[] buf, int offset, int i, int len)
    {
        if (multibyteChar > 0xFFFF)
        {
            // Convert UCS-4 char to UTF-16
            int value = multibyteChar - 0x10000;
            char low = (char)(0xDC00 + (value & 0x03FF));

            buf[offset + i++] = (char)(0xD800 + (value >> 10));
            if (i < len)
            {
                buf[offset + i++] = low;
            } else
            {
                secondHalf = low;
            }
        } else
        {
            buf[offset + i++] = (char)multibyteChar;
        }

        return i;
    }

    /**
     * Saves starting byte of false multi-byte sequence into buffer and rewinds to
     * first continuation byte to start further parsing from (if any continuation bytes
     * were read of course).
     *
     * @param buf       target buffer.
     * @param offset    offset in target buffer.
     * @param i         current index in target buffer relative to offset.
     *
     * @return new index value.
     */
    private int saveMultiByteStartAndRewind(char[] buf, int offset, int i)
    {
        start -= multibyteCharsRead;
        // the lead byte sits right before the first continuation byte
        buf[offset + i++] = (char)(buffer[start - 1] & 0xFF);
        resetMultibyte();

        return i;
    }

    /**
     * Resets all multi-byte properties into initial state.
     */
    private void resetMultibyte()
    {
        multibyteChar = 0;
        multibyteCharsToGo = 0;
        multibyteCharsRead = 0;
    }
}