Package pignlproc.markup

Examples of pignlproc.markup.AnnotatingMarkupParser


        String markup = reader.getCurrentValue().toString();
        assertEquals(
                "#REDIRECT [[Computer accessibility]] {{R from CamelCase}}",
                markup);

        AnnotatingMarkupParser converter = new AnnotatingMarkupParser();
        String simpleText = converter.parse(markup);
        assertEquals("", simpleText);
        assertTrue(converter.getWikiLinkAnnotations().isEmpty());
        assertEquals("http://en.wikipedia.org/wiki/Computer_accessibility",
                converter.getRedirect());

        // second article
        assertTrue(reader.nextKeyValue());
        assertEquals("Anarchism", reader.getCurrentKey().toString());
        markup = reader.getCurrentValue().toString();
        converter = new AnnotatingMarkupParser();
        simpleText = converter.parse(markup);
        assertTrue(simpleText.startsWith("\nAnarchism is a political philosophy"
                + " encompassing theories and attitudes"));
        assertEquals(465, converter.getWikiLinkAnnotations().size());
        Annotation firstLink = converter.getWikiLinkAnnotations().get(0);
        assertEquals("political philosophy", firstLink.label);
        assertEquals("http://en.wikipedia.org/wiki/Political_philosophy",
                firstLink.value);
        assertEquals(16, firstLink.begin);
        assertEquals(36, firstLink.end);
        assertEquals("political philosophy",
                simpleText.substring(firstLink.begin, firstLink.end));

        // third article
        assertTrue(reader.nextKeyValue());
        assertEquals("AfghanistanHistory", reader.getCurrentKey().toString());
        markup = reader.getCurrentValue().toString();
        converter = new AnnotatingMarkupParser();
        simpleText = converter.parse(markup);
        assertEquals("", simpleText);
        assertEquals(0, converter.getWikiLinkAnnotations().size());
        assertEquals("http://en.wikipedia.org/wiki/History_of_Afghanistan",
                converter.getRedirect());

        // fourth article
        assertTrue(reader.nextKeyValue());
        assertEquals("Autism", reader.getCurrentKey().toString());
        markup = reader.getCurrentValue().toString();
        converter = new AnnotatingMarkupParser();
        simpleText = converter.parse(markup);
        assertTrue(simpleText.contains("Autism is a brain development disorder"
                + " characterized by impaired social interaction and communication"));
        assertEquals(234, converter.getWikiLinkAnnotations().size());
        firstLink = converter.getWikiLinkAnnotations().get(0);
        assertEquals("Neurodevelopmental disorder", firstLink.label);
        assertEquals(
                "http://en.wikipedia.org/wiki/Neurodevelopmental_disorder",
                firstLink.value);
        assertEquals(15, firstLink.begin);
        assertEquals(41, firstLink.end);
        assertEquals("brain development disorder",
                simpleText.substring(firstLink.begin, firstLink.end));

        for (Annotation a: converter.getWikiLinkAnnotations()) {
            // internal anchors are not extracted as links
            assertFalse(a.value.startsWith("#"));
        }

        // there is no fifth article in this test file
View Full Code Here


        // first article
        assertTrue(reader.nextKeyValue());
        assertEquals("Antoine Meillet", reader.getCurrentKey().toString());
        String markup = reader.getCurrentValue().toString();
        AnnotatingMarkupParser converter = new AnnotatingMarkupParser();
        String simpleText = converter.parse(markup);
        // TODO: handle date templates
        assertTrue(simpleText.startsWith("Paul Jules Antoine Meillet, né le  à Moulins,"
                + " Allier, mort le  à Châteaumeillant"));
        assertEquals(48, converter.getWikiLinkAnnotations().size());
        List<String> headers = converter.getHeaders();
        assertEquals("Biographie", headers.get(0));
        assertEquals("Antoine Meillet et les études arméniennes",
                headers.get(1));
        assertEquals("Antoine Meillet et les études homériques", headers.get(2));
        assertEquals(8, headers.size());
        List<String> paragraphs = converter.getParagraphs();
        assertEquals(
                "Paul Jules Antoine Meillet, né le  à Moulins, Allier, mort"
                        + " le  à Châteaumeillant, Cher, est le principal linguiste"
                        + " français des premières décennies du .",
                paragraphs.get(0));
        assertEquals(
                "D'origine bourbonnaise, fils d'un notaire de Châteaumeillant (Cher),"
                        + " il fait ses études secondaires au lycée"
                        + " Théodore-de-Banville de Moulins.\n"
                        + "Étudiant à la faculté des lettres de Paris à partir de 1885,"
                        + " il suit notamment les cours de Louis Havet à la Sorbonne, de"
                        + " Michel Bréal au Collège de France et de Ferdinand de Saussure"
                        + " à l'École pratique des hautes études. Il assure à la suite"
                        + " de Saussure le cours de grammaire comparée, qu'il complète à"
                        + " partir de 1894 par une conférence sur l'iranien.",
                paragraphs.get(1));
        assertEquals(15, paragraphs.size());
        assertNull(converter.getRedirect());

        // go to the last article, which is a redirect
        assertTrue(reader.nextKeyValue());
        assertTrue(reader.nextKeyValue());
        assertTrue(reader.nextKeyValue());
        assertTrue(reader.nextKeyValue());
        assertTrue(reader.nextKeyValue());

        assertEquals("Amenophis IV", reader.getCurrentKey().toString());
        markup = reader.getCurrentValue().toString();
        converter = new AnnotatingMarkupParser();
        assertEquals("", converter.parse(markup));
        assertEquals(0, converter.getWikiLinkAnnotations().size());
        assertEquals("http://en.wikipedia.org/wiki/Akh%C3%A9naton",
                converter.getRedirect());

        // this was the last article
        assertFalse(reader.nextKeyValue());
    }
View Full Code Here

            }
            String title = reader.getCurrentKey().toString();
            String uri = AnnotatingMarkupParser.titleToUri(title, languageCode);
            String rawMarkup = reader.getCurrentValue().toString();

            AnnotatingMarkupParser converter = new AnnotatingMarkupParser(
                    languageCode);
            String text = converter.parse(rawMarkup);
            String redirect = converter.getRedirect();
            DataBag links = bagFactory.newDefaultBag();
            for (Annotation link : converter.getWikiLinkAnnotations()) {
                links.add(tupleFactory.newTupleNoCopy(Arrays.asList(link.value,
                        link.begin, link.end)));
            }
            DataBag headers = bagFactory.newDefaultBag();
            for (Annotation h : converter.getHeaderAnnotations()) {
                headers.add(tupleFactory.newTupleNoCopy(Arrays.asList(h.value,
                        h.begin, h.end)));
            }
            DataBag paragraphs = bagFactory.newDefaultBag();
            for (Annotation p : converter.getParagraphAnnotations()) {
                paragraphs.add(tupleFactory.newTupleNoCopy(Arrays.asList(
                        p.value, p.begin, p.end)));
            }
            return tupleFactory.newTupleNoCopy(Arrays.asList(title, uri, text,
                    redirect, links, headers, paragraphs));
View Full Code Here

TOP

Related Classes of pignlproc.markup.AnnotatingMarkupParser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.