/**
 * Parses the {@code /simple-page.html} classpath resource with {@code SimpleParser} and
 * checks the result: the parsed text is compared against the contents of the
 * {@code /simple-page.txt} resource via {@code compareTermsInStrings}, and the parsed
 * title must equal {@code "TransPac Software"}.
 *
 * @throws Exception if a fixture cannot be located or read, or if parsing fails
 */
public void testHtmlParsing() throws Exception {
    URL path = SimpleParserTest.class.getResource("/simple-page.html");
    // Fail with a clear message rather than a NullPointerException further down.
    Assert.assertNotNull("Missing test resource /simple-page.html", path);

    BaseParser parser = new SimpleParser();
    FetchedDatum content = makeFetchedDatum(path);
    ParsedDatum parse = parser.parse(content);

    String actualString = parse.getParsedText();
    Assert.assertNotNull(actualString);

    // TODO - add back in title text to simple-page, when we generate this
    URL expectedTextUrl = SimpleParserTest.class.getResource("/simple-page.txt");
    Assert.assertNotNull("Missing test resource /simple-page.txt", expectedTextUrl);

    // Convert through a URI instead of URL.getFile(): getFile() leaves percent-escapes
    // (e.g. %20 for a space in the checkout directory) in place, so the resulting path
    // would not exist on disk.
    File parsedTextFile = new File(expectedTextUrl.toURI());
    String expectedString = FileUtils.readFileToString(parsedTextFile, "utf-8");

    // Trim off leading returns so split() doesn't give us an empty term
    // TODO - use our own split that skips leading/trailing separators
    compareTermsInStrings(expectedString, actualString.replaceFirst("^[\\n]+", ""));

    // TODO reenable when Tika bug is fixed re not emitting <img> links.
    // Outlink[] outlinks = parse.getOutlinks();
    // Assert.assertEquals(10, outlinks.length);

    Assert.assertEquals("TransPac Software", parse.getTitle());
}