Package pignlproc.format.WikipediaPageInputFormat

Examples of pignlproc.format.WikipediaPageInputFormat.WikipediaRecordReader.nextKeyValue()


        assertNotNull(wikiDump);
        WikipediaRecordReader reader = new WikipediaPageInputFormat.WikipediaRecordReader(
                wikiDump, 0, 100000);

        // first article
        assertTrue(reader.nextKeyValue());
        assertEquals(new Text("AccessibleComputing"), reader.getCurrentKey());
        String markup = reader.getCurrentValue().toString();
        assertEquals(
                "#REDIRECT [[Computer accessibility]] {{R from CamelCase}}",
                markup);
View Full Code Here


        assertTrue(converter.getWikiLinkAnnotations().isEmpty());
        assertEquals("http://en.wikipedia.org/wiki/Computer_accessibility",
                converter.getRedirect());

        // second article
        assertTrue(reader.nextKeyValue());
        assertEquals("Anarchism", reader.getCurrentKey().toString());
        markup = reader.getCurrentValue().toString();
        converter = new AnnotatingMarkupParser();
        simpleText = converter.parse(markup);
        assertTrue(simpleText.startsWith("\nAnarchism is a political philosophy"
View Full Code Here

        assertEquals(36, firstLink.end);
        assertEquals("political philosophy",
                simpleText.substring(firstLink.begin, firstLink.end));

        // third article
        assertTrue(reader.nextKeyValue());
        assertEquals("AfghanistanHistory", reader.getCurrentKey().toString());
        markup = reader.getCurrentValue().toString();
        converter = new AnnotatingMarkupParser();
        simpleText = converter.parse(markup);
        assertEquals("", simpleText);
View Full Code Here

        assertEquals(0, converter.getWikiLinkAnnotations().size());
        assertEquals("http://en.wikipedia.org/wiki/History_of_Afghanistan",
                converter.getRedirect());

        // fourth article
        assertTrue(reader.nextKeyValue());
        assertEquals("Autism", reader.getCurrentKey().toString());
        markup = reader.getCurrentValue().toString();
        converter = new AnnotatingMarkupParser();
        simpleText = converter.parse(markup);
        assertTrue(simpleText.contains("Autism is a brain development disorder"
View Full Code Here

            // internal anchors are not extracted as links
            assertFalse(a.value.startsWith("#"));
        }

        // there is no fifth article in this test file
        assertFalse(reader.nextKeyValue());
    }

    @Test
    public void testFrWikipediaParsingFromReader() throws IOException,
            InterruptedException {
View Full Code Here

        assertNotNull(wikiDump);
        WikipediaRecordReader reader = new WikipediaPageInputFormat.WikipediaRecordReader(
                wikiDump, 0, 100000);

        // first article
        assertTrue(reader.nextKeyValue());
        assertEquals("Antoine Meillet", reader.getCurrentKey().toString());
        String markup = reader.getCurrentValue().toString();
        AnnotatingMarkupParser converter = new AnnotatingMarkupParser();
        String simpleText = converter.parse(markup);
        // TODO: handle date templates
View Full Code Here

                paragraphs.get(1));
        assertEquals(15, paragraphs.size());
        assertNull(converter.getRedirect());

        // go to the last article, which is a redirect
        assertTrue(reader.nextKeyValue());
        assertTrue(reader.nextKeyValue());
        assertTrue(reader.nextKeyValue());
        assertTrue(reader.nextKeyValue());
        assertTrue(reader.nextKeyValue());
View Full Code Here

        assertEquals(15, paragraphs.size());
        assertNull(converter.getRedirect());

        // go to the last article, which is a redirect
        assertTrue(reader.nextKeyValue());
        assertTrue(reader.nextKeyValue());
        assertTrue(reader.nextKeyValue());
        assertTrue(reader.nextKeyValue());
        assertTrue(reader.nextKeyValue());

        assertEquals("Amenophis IV", reader.getCurrentKey().toString());
View Full Code Here

        assertNull(converter.getRedirect());

        // go to the last article, which is a redirect
        assertTrue(reader.nextKeyValue());
        assertTrue(reader.nextKeyValue());
        assertTrue(reader.nextKeyValue());
        assertTrue(reader.nextKeyValue());
        assertTrue(reader.nextKeyValue());

        assertEquals("Amenophis IV", reader.getCurrentKey().toString());
        markup = reader.getCurrentValue().toString();
View Full Code Here

        // go to the last article, which is a redirect
        assertTrue(reader.nextKeyValue());
        assertTrue(reader.nextKeyValue());
        assertTrue(reader.nextKeyValue());
        assertTrue(reader.nextKeyValue());
        assertTrue(reader.nextKeyValue());

        assertEquals("Amenophis IV", reader.getCurrentKey().toString());
        markup = reader.getCurrentValue().toString();
        converter = new AnnotatingMarkupParser();
View Full Code Here

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.