/*******************************************************************************
* Copyright (c) 2009-2013 CWI
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* * Davy Landman - Davy.Landman@cwi.nl - CWI
*******************************************************************************/
package org.rascalmpl.unicode;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
public class UnicodeInputStreamReader extends Reader {
private Reader wrapped;
private InputStream original;
private String encoding;
public UnicodeInputStreamReader(InputStream in) {
original = in;
}
public UnicodeInputStreamReader(InputStream in, String encoding) {
original = in;
this.encoding = encoding;
}
public UnicodeInputStreamReader(InputStream in, Charset charset) {
this(in, charset == null ? null : charset.name());
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
if (wrapped == null) {
if (encoding != null) {
// we have an encoding, so lets just skip the possible BOM
wrapped = removeBOM(original, encoding);
original = null;
}
else {
// we have to try and detect the decoding
wrapped = detectCharset(original);
original = null;
}
}
return wrapped.read(cbuf, off, len);
}
@Override
public void close() throws IOException {
if (wrapped != null) {
wrapped.close();
}
else {
original.close();
}
}
private static Reader removeBOM(InputStream in, String encoding) throws IOException {
byte[] detectionBuffer = new byte[UnicodeDetector.getMaximumBOMLength()];
int bufferSize = in.read(detectionBuffer);
ByteOrderMarker b = UnicodeDetector.detectBom(detectionBuffer, bufferSize);
if (b != null) {
Charset ref = Charset.forName(encoding);
if (UnicodeDetector.isAmbigiousBOM(b.getCharset(), ref)) {
b = ByteOrderMarker.fromString(encoding);
}
if (b.getCharset().equals(ref) || b.getGroup().equals(ref)) {
InputStream prefix = new ByteArrayInputStream(detectionBuffer, b.getHeaderLength(), bufferSize - b.getHeaderLength());
return new InputStreamReader(new ConcatInputStream(prefix, in), b.getCharset());
}
else {
throw new UnsupportedEncodingException("The requested encoding was " + encoding + " but the file contained a BOM for " + b.getCharset().name() + ".");
}
}
else {
InputStream prefix = new ByteArrayInputStream(detectionBuffer, 0, bufferSize);
return new InputStreamReader(new ConcatInputStream(prefix, in), encoding);
}
}
private static Reader detectCharset(InputStream in) throws IOException {
byte[] detectionBuffer = new byte[UnicodeDetector.getSuggestedDetectionSampleSize()];
int bufferSize = in.read(detectionBuffer);
ByteOrderMarker b =UnicodeDetector.detectBom(detectionBuffer, bufferSize);
if (b != null) {
// we have to remove the BOM from the front
InputStream prefix = new ByteArrayInputStream(detectionBuffer, b.getHeaderLength(), bufferSize - b.getHeaderLength());
return new InputStreamReader(new ConcatInputStream(prefix, in), b.getCharset());
}
Charset cs = UnicodeDetector.detectByContent(detectionBuffer, bufferSize);
if (cs == null) {
cs = Charset.defaultCharset();
}
InputStream prefix = new ByteArrayInputStream(detectionBuffer, 0, bufferSize);
return new InputStreamReader(new ConcatInputStream(prefix, in), cs);
}
}