Package dmir.wikipedia

Source Code of dmir.wikipedia.WikiMain

package dmir.wikipedia;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.InputStreamReader;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import joptsimple.OptionParser;
import joptsimple.OptionSet;

import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.xml.sax.InputSource;

public class WikiMain {

    /**
     *
     * This function receives the following Strings:
     *
     * * Language: English, Spanish, Portuguese
     * * Path to Wikipedia page-articles.xml.bz2 file from http://dumps.wikimedia.org/enwiki/
     * * Path to where the file containing the graph information should be written
     * * Path to where the file containing the content information should be written
     * * Path to where the file containing the categories information should be written
     *
     * Optionally:
     *
     * * Path to where the file containing the links to the English Wikipedia should be written
     * * Path to where the file containing the coordinates information should be written
     * * Path to where the file containing the population information should be written
     * * Path to where the file containing the area information should be written
     *
     * Example:
     * java WikiMain -language=English -wiki-dump=enwiki-20111201-pages-articles.xml.bz2 \
     * -graph=wiki_graph.txt -content=wiki_content.txt \
     * -categories=wiki_categories.txt -en-links=wiki_linksEN.txt
     *
     * Attention:
     * This software ignores articles and links regarding special Wikipedia pages such as
     * those starting with "Template:", "Portal:", "List_of_", or ending with "_(disambiguation).
     * You can override the ignore function defined by AbstractWikiPageCleaner to change the behavior.
     *
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
       
      OptionParser optParser = new OptionParser() {
            {
                accepts( "language" ).withRequiredArg().required();
                accepts( "wiki-dump" ).withRequiredArg().required();
                accepts( "graph" ).withRequiredArg().required();
                accepts( "content" ).withRequiredArg().required();
                accepts( "categories" ).withRequiredArg().required();
                accepts( "en-links" ).withRequiredArg();
                accepts( "coordinates" ).withRequiredArg();
                accepts( "population" ).withRequiredArg();
                accepts( "area" ).withRequiredArg();
            }
        };
      OptionSet options = optParser.parse( args );
       
        String language = (String) options.valueOf( "language" );
        String wikipedia = (String) options.valueOf( "wiki-dump" );
        String graph = (String) options.valueOf( "graph" );
        String content = (String) options.valueOf( "content" );
        String categories = (String) options.valueOf( "categories" );
        //addons
        String linksEN = options.has("en-links") ? (String) options.valueOf( "en-links" ) : null;
        String coordinates = options.has("coordinates") ? (String) options.valueOf( "coordinates" ) : null;
        String population = options.has("population") ? (String) options.valueOf( "population" ) : null;
        String area = options.has("area") ? (String) options.valueOf( "area" ) : null;
       
        SAXParserFactory factory = SAXParserFactory.newInstance();
        SAXParser parser = factory.newSAXParser();
        WikiPageParser k = new WikiPageParser(graph, content, categories, linksEN, coordinates, population, area, language);
       
        BufferedReader br;
        if (wikipedia.endsWith(".bz2")) { //.bz2
            FileInputStream fis = new FileInputStream(wikipedia);
            br = new BufferedReader(new InputStreamReader(new BZip2CompressorInputStream(fis), "UTF-8"));
        } else { //.xml
            br = new BufferedReader(new FileReader(wikipedia));
        }
       
        parser.parse(new InputSource(br), k);
        k.close()
    }
}
TOP

Related Classes of dmir.wikipedia.WikiMain

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.