}
}
// No (valid) charset in a meta http-equiv tag, see if it's in the passed content-encoding
// hint, or the passed content-type hint.
// Resolve a declared-encoding hint for the statistical detector: prefer the
// content-encoding hint, otherwise fall back to the charset parameter of the
// content-type hint.
CharsetDetector detector = new CharsetDetector();
String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
String incomingType = metadata.get(Metadata.CONTENT_TYPE);
if (incomingCharset == null && incomingType != null) {
    // TIKA-341: Use charset in content-type
    MediaType mt = MediaType.parse(incomingType);
    if (mt != null) {
        String charset = mt.getParameters().get("charset");
        if (charset != null) {
            try {
                if (Charset.isSupported(charset)) {
                    incomingCharset = charset;
                }
            } catch (java.nio.charset.IllegalCharsetNameException ignored) {
                // Charset.isSupported throws (rather than returning false) when
                // the name is syntactically illegal. The hint comes from the
                // caller-supplied content type, so a malformed value is simply
                // treated as "no hint" instead of aborting detection.
            }
        }
    }
}
if (incomingCharset != null) {
    // Only a hint: the detector still inspects the bytes themselves.
    detector.setDeclaredEncoding(incomingCharset);
}
// TIKA-341: without enabling input filtering (stripping of tags), the
// short HTML tests don't work well.
detector.enableInputFilter(true);
detector.setText(stream);
for (CharsetMatch match : detector.detectAll()) {
if (Charset.isSupported(match.getName())) {
metadata.set(Metadata.CONTENT_ENCODING, match.getName());
// TIKA-339: Don't set language, as it's typically not a very good
// guess, and it can create ambiguity if another (better) language