/**
 * Parses an HTML document read from {@code stream} and forwards the
 * resulting SAX events to {@code handler}.
 *
 * <p>The character encoding is detected automatically. The detected
 * charset is written to {@link Metadata#CONTENT_ENCODING}, and it is
 * also folded into {@link Metadata#CONTENT_TYPE} when that entry is
 * missing or already starts with {@code text/html}. Any other existing
 * content type is left as it is.
 *
 * @param stream   source of the raw document bytes
 * @param handler  receiver of the SAX events produced by the parse
 * @param metadata document metadata; read for the current content type
 *                 and updated with the detected type and encoding
 * @param context  parse context consulted for an optional
 *                 {@code ServiceLoader}, {@code HtmlMapper} and
 *                 {@code Schema}; defaults are used when absent
 * @throws IOException   declared by this method; presumably raised when
 *                       reading the stream fails
 * @throws SAXException  declared by this method; presumably raised by
 *                       the underlying parser or the handler chain
 * @throws TikaException declared by this method; presumably raised when
 *                       the document cannot be processed
 */
public void parse(
        InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    // The reader performs the encoding detection. The stream is wrapped
    // in a CloseShieldInputStream, presumably so that closing the reader
    // in the finally block does not close the caller's stream.
    final AutoDetectReader detectingReader = new AutoDetectReader(
            new CloseShieldInputStream(stream), metadata,
            context.get(ServiceLoader.class, LOADER));
    try {
        final Charset encoding = detectingReader.getCharset();

        // Only overwrite the content type when nothing was declared or
        // when the declared type is already an HTML one.
        final String declaredType = metadata.get(Metadata.CONTENT_TYPE);
        final boolean mayOverrideType =
                declaredType == null || declaredType.startsWith("text/html");
        if (mayOverrideType) {
            final MediaType htmlWithCharset =
                    new MediaType(MediaType.TEXT_HTML, encoding);
            metadata.set(Metadata.CONTENT_TYPE, htmlWithCharset.toString());
        }

        // Deprecated metadata entry, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, encoding.name());

        // Caller-supplied mapper and schema win over the defaults
        final HtmlMapper htmlMapper =
                context.get(HtmlMapper.class, new HtmlParserMapper());
        final Schema htmlSchema = context.get(Schema.class, HTML_SCHEMA);

        final org.ccil.cowan.tagsoup.Parser tagSoup =
                new org.ccil.cowan.tagsoup.Parser();

        // TIKA-528: reuse the shared schema to avoid heavy instantiation
        tagSoup.setProperty(
                org.ccil.cowan.tagsoup.Parser.schemaProperty, htmlSchema);

        // TIKA-599: the shared schema is thread-safe only if bogons
        // are ignored
        tagSoup.setFeature(
                org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

        // Handler chain: parser -> XHTMLDowngradeHandler -> HtmlHandler,
        // with HtmlHandler wrapping the caller's handler
        final ContentHandler htmlHandler =
                new HtmlHandler(htmlMapper, handler, metadata);
        final ContentHandler downgrading =
                new XHTMLDowngradeHandler(htmlHandler);
        tagSoup.setContentHandler(downgrading);

        tagSoup.parse(detectingReader.asInputSource());
    } finally {
        detectingReader.close();
    }
}