@Test
public void testParseAscii() throws Exception {
String path = "/test-documents/testHTML.html";
final StringWriter href = new StringWriter();
final StringWriter name = new StringWriter();
ContentHandler body = new BodyContentHandler();
Metadata metadata = new Metadata();
InputStream stream = HtmlParserTest.class.getResourceAsStream(path);
try {
ContentHandler link = new DefaultHandler() {
/**
 * Collects anchor attributes from the SAX event stream.
 *
 * <p>Only elements whose local name is {@code a} are inspected. When such
 * an element has an {@code href} attribute, its value is appended to the
 * captured {@code href} writer; otherwise, when it has a {@code name}
 * attribute, that value is appended to the captured {@code name} writer.
 * An element carrying both attributes contributes only its {@code href}.
 *
 * @param u namespace URI of the element (unused)
 * @param l local name of the element
 * @param n qualified name of the element (unused)
 * @param a attributes attached to the element
 * @throws SAXException declared by the overridden handler method
 */
@Override
public void startElement(
        String u, String l, String n, Attributes a)
        throws SAXException {
    // Anything other than an anchor element is ignored.
    if (!"a".equals(l)) {
        return;
    }

    // href wins over name, so check it first and stop once recorded.
    String linkTarget = a.getValue("href");
    if (linkTarget != null) {
        href.append(linkTarget);
        return;
    }

    String anchorName = a.getValue("name");
    if (anchorName != null) {
        name.append(anchorName);
    }
}
};
new HtmlParser().parse(
stream, new TeeContentHandler(body, link),
metadata, new ParseContext());
} finally {
stream.close();
}
assertEquals(
"Title : Test Indexation Html", metadata.get(TikaCoreProperties.TITLE));
assertEquals("Tika Developers", metadata.get("Author"));
assertEquals("5", metadata.get("refresh"));
assertEquals("51.2312", metadata.get(Geographic.LATITUDE));
assertEquals("-5.1987", metadata.get(Geographic.LONGITUDE));
assertEquals("http://www.apache.org/", href.toString());
assertEquals("test-anchor", name.toString());
String content = body.toString();
assertTrue(
"Did not contain expected text:" + "Test Indexation Html",
content.contains("Test Indexation Html"));
assertTrue(
"Did not contain expected text:" + "Indexation du fichier",