// Sample input: plain English text that is encoded as UTF-8 below, so the
// parser has to detect the media type, the encoding and the language itself.
String text =
    "Hello, World! This is simple UTF-8 text content written"
    + " in English to test autodetection of both the character"
    + " encoding and the language of the input stream.";

// Collaborators for the parse call: metadata receives the detected
// properties, writer accumulates the extracted character content.
Metadata metadata = new Metadata();
StringWriter writer = new StringWriter();
ByteArrayInputStream utf8Input = new ByteArrayInputStream(text.getBytes("UTF-8"));
WriteOutContentHandler textHandler = new WriteOutContentHandler(writer);
ParseContext parseContext = new ParseContext();

parser.parse(utf8Input, textHandler, metadata, parseContext);
String content = writer.toString();

// The detected media type and language must be recorded in the metadata.
assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
assertEquals("en", metadata.get(Metadata.LANGUAGE));

// TODO: Metadata.CONTENT_ENCODING is deliberately not asserted for now.
// ICU reports the encoding as ISO-8859-1, although ASCII or UTF-8 would
// be equally valid answers for this input.

// The text written to the handler must include the start of the sample.
assertTrue(content.contains("Hello"));