}
}
String encoding = metadata.get(Metadata.CONTENT_ENCODING);
if (encoding == null) {
throw new TikaException(
"Text encoding could not be detected and no encoding"
+ " hint is available in document metadata");
}
// TIKA-341: Only stomp on content-type after we're done trying to
// use it to guess at the charset.
metadata.set(Metadata.CONTENT_TYPE, "text/plain");
try {
Reader reader =
new BufferedReader(new InputStreamReader(stream, encoding));
// TIKA-240: Drop the BOM when extracting plain text
reader.mark(1);
int bom = reader.read();
if (bom != '\ufeff') { // zero-width no-break space
reader.reset();
}
XHTMLContentHandler xhtml =
new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
xhtml.startElement("p");
char[] buffer = new char[4096];
int n = reader.read(buffer);
while (n != -1) {
xhtml.characters(buffer, 0, n);
n = reader.read(buffer);
}
xhtml.endElement("p");
xhtml.endDocument();
} catch (UnsupportedEncodingException e) {
throw new TikaException(
"Unsupported text encoding: " + encoding, e);
}
}