* and content language ({@link HttpHeaders#CONTENT_LANGUAGE}).
*
* @return a UTF-8 encoded {@link Reader} for the given stream.
*/
public static Reader getUTF8Reader(InputStream stream, Metadata metadata) throws TikaException, IOException{
CharsetDetector detector = new CharsetDetector();
// Use the declared character encoding, if available
String encoding = metadata.get(Metadata.CONTENT_ENCODING);
if (encoding != null) {
detector.setDeclaredEncoding(encoding);
}
// CharsetDetector expects a stream to support marks
if (!stream.markSupported()) {
stream = new BufferedInputStream(stream);
}
detector.setText(stream);
CharsetMatch match = detector.detect();
if (match == null) {
throw new TikaException("Unable to detect character encoding");
}
metadata.set(Metadata.CONTENT_ENCODING, match.getName());