Package it.cnr.isti.hpc.wikipedia.reader

Examples of it.cnr.isti.hpc.wikipedia.reader.WikipediaArticleReader


  public static void main(String[] args) {
    MediawikiToJsonCLI cli = new MediawikiToJsonCLI(args);
    String input = cli.getInput();
    String output = cli.getOutput();
    String lang = cli.getParam("lang");
    WikipediaArticleReader wap = new WikipediaArticleReader(input, output,
        lang);
    try {
      wap.start();
    } catch (Exception e) {
      logger.error("parsing the mediawiki {}", e.toString());
      System.exit(-1);
    }
  }
View Full Code Here


public class WikipediaArticleReaderTest {

  @Test
  public void testParsing() throws UnsupportedEncodingException, FileNotFoundException, IOException, SAXException {
    URL u = this.getClass().getResource("/en/mercedes.xml");
    WikipediaArticleReader wap = new WikipediaArticleReader(u.getFile(),"/tmp/mercedes.json.gz", Language.EN);
    wap.start();
    String json = IOUtils.getFileAsUTF8String("/tmp/mercedes.json.gz");
    Article a = Article.fromJson(json);
    assertTrue(a.getCleanText().startsWith("Mercedes-Benz"));
    assertEquals(15, a.getCategories().size());
   
View Full Code Here

TOP

Related Classes of it.cnr.isti.hpc.wikipedia.reader.WikipediaArticleReader

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.