public void testEnWikipediaParsingFromReader() throws IOException,
        InterruptedException {
    // Walks the sample English Wikipedia dump record by record through a
    // WikipediaRecordReader built directly from the resource URL, and checks
    // for each article its title plus what AnnotatingMarkupParser extracts
    // from the markup: plain text, wiki link annotations, redirect target.
    URL wikiDump = Thread.currentThread().getContextClassLoader().getResource(
            "enwiki-20090902-pages-articles-sample.xml");
    // fail early with a clear assertion if the sample is not on the classpath
    assertNotNull(wikiDump);
    WikipediaRecordReader reader = new WikipediaPageInputFormat.WikipediaRecordReader(
            wikiDump, 0, 100000);
    try {
        // first article: a redirect page, so no text and no link annotation
        assertTrue(reader.nextKeyValue());
        assertEquals(new Text("AccessibleComputing"), reader.getCurrentKey());
        String markup = reader.getCurrentValue().toString();
        assertEquals(
                "#REDIRECT [[Computer accessibility]] {{R from CamelCase}}",
                markup);
        AnnotatingMarkupParser converter = new AnnotatingMarkupParser();
        String simpleText = converter.parse(markup);
        assertEquals("", simpleText);
        assertTrue(converter.getWikiLinkAnnotations().isEmpty());
        assertEquals("http://en.wikipedia.org/wiki/Computer_accessibility",
                converter.getRedirect());

        // second article: a regular page with text and wiki links
        assertTrue(reader.nextKeyValue());
        assertEquals("Anarchism", reader.getCurrentKey().toString());
        markup = reader.getCurrentValue().toString();
        // a fresh parser is used for every article
        converter = new AnnotatingMarkupParser();
        simpleText = converter.parse(markup);
        assertTrue(simpleText.startsWith("\nAnarchism is a political philosophy"
                + " encompassing theories and attitudes"));
        assertEquals(465, converter.getWikiLinkAnnotations().size());
        Annotation firstLink = converter.getWikiLinkAnnotations().get(0);
        assertEquals("political philosophy", firstLink.label);
        assertEquals("http://en.wikipedia.org/wiki/Political_philosophy",
                firstLink.value);
        assertEquals(16, firstLink.begin);
        assertEquals(36, firstLink.end);
        // the annotation offsets must index into the extracted plain text
        assertEquals("political philosophy",
                simpleText.substring(firstLink.begin, firstLink.end));

        // third article: another redirect page
        assertTrue(reader.nextKeyValue());
        assertEquals("AfghanistanHistory", reader.getCurrentKey().toString());
        markup = reader.getCurrentValue().toString();
        converter = new AnnotatingMarkupParser();
        simpleText = converter.parse(markup);
        assertEquals("", simpleText);
        assertEquals(0, converter.getWikiLinkAnnotations().size());
        assertEquals("http://en.wikipedia.org/wiki/History_of_Afghanistan",
                converter.getRedirect());

        // fourth article: a regular page whose first link has a label that
        // differs from the text it covers
        assertTrue(reader.nextKeyValue());
        assertEquals("Autism", reader.getCurrentKey().toString());
        markup = reader.getCurrentValue().toString();
        converter = new AnnotatingMarkupParser();
        simpleText = converter.parse(markup);
        assertTrue(simpleText.contains("Autism is a brain development disorder"
                + " characterized by impaired social interaction and communication"));
        assertEquals(234, converter.getWikiLinkAnnotations().size());
        firstLink = converter.getWikiLinkAnnotations().get(0);
        assertEquals("Neurodevelopmental disorder", firstLink.label);
        assertEquals(
                "http://en.wikipedia.org/wiki/Neurodevelopmental_disorder",
                firstLink.value);
        assertEquals(15, firstLink.begin);
        assertEquals(41, firstLink.end);
        assertEquals("brain development disorder",
                simpleText.substring(firstLink.begin, firstLink.end));
        for (Annotation a: converter.getWikiLinkAnnotations()) {
            // internal anchors are not extracted as links
            assertFalse(a.value.startsWith("#"));
        }

        // there is no fifth article in this test file
        assertFalse(reader.nextKeyValue());
    } finally {
        // Always close the reader, including when an assertion above fails,
        // so that whatever it opened on the dump is released.
        // NOTE(review): relies on WikipediaRecordReader exposing close() as
        // Hadoop's RecordReader contract does - confirm.
        reader.close();
    }
}