package dmir.wikipedia;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import joptsimple.OptionParser;
import joptsimple.OptionSet;

import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.xml.sax.InputSource;
public class WikiMain {
    /**
     * Command-line entry point that parses a Wikipedia XML dump and writes out
     * graph, content, and category files (plus optional extras).
     *
     * This function receives the following Strings:
     *
     * * Language: English, Spanish, Portuguese
     * * Path to Wikipedia page-articles.xml.bz2 file from http://dumps.wikimedia.org/enwiki/
     * * Path to where the file containing the graph information should be written
     * * Path to where the file containing the content information should be written
     * * Path to where the file containing the categories information should be written
     *
     * Optionally:
     *
     * * Path to where the file containing the links to the English Wikipedia should be written
     * * Path to where the file containing the coordinates information should be written
     * * Path to where the file containing the population information should be written
     * * Path to where the file containing the area information should be written
     *
     * Example:
     * java WikiMain -language=English -wiki-dump=enwiki-20111201-pages-articles.xml.bz2 \
     *      -graph=wiki_graph.txt -content=wiki_content.txt \
     *      -categories=wiki_categories.txt -en-links=wiki_linksEN.txt
     *
     * Attention:
     * This software ignores articles and links regarding special Wikipedia pages such as
     * those starting with "Template:", "Portal:", "List_of_", or ending with "_(disambiguation)".
     * You can override the ignore function defined by AbstractWikiPageCleaner to change the behavior.
     *
     * @param args command-line options (see above)
     * @throws Exception if option parsing, file access, or XML parsing fails
     */
    public static void main(String[] args) throws Exception {
        OptionParser optParser = new OptionParser() {
            {
                accepts( "language" ).withRequiredArg().required();
                accepts( "wiki-dump" ).withRequiredArg().required();
                accepts( "graph" ).withRequiredArg().required();
                accepts( "content" ).withRequiredArg().required();
                accepts( "categories" ).withRequiredArg().required();
                accepts( "en-links" ).withRequiredArg();
                accepts( "coordinates" ).withRequiredArg();
                accepts( "population" ).withRequiredArg();
                accepts( "area" ).withRequiredArg();
            }
        };
        OptionSet options = optParser.parse( args );
        String language = (String) options.valueOf( "language" );
        String wikipedia = (String) options.valueOf( "wiki-dump" );
        String graph = (String) options.valueOf( "graph" );
        String content = (String) options.valueOf( "content" );
        String categories = (String) options.valueOf( "categories" );
        // Optional add-on outputs: null means "do not produce this file".
        String linksEN = options.has("en-links") ? (String) options.valueOf( "en-links" ) : null;
        String coordinates = options.has("coordinates") ? (String) options.valueOf( "coordinates" ) : null;
        String population = options.has("population") ? (String) options.valueOf( "population" ) : null;
        String area = options.has("area") ? (String) options.valueOf( "area" ) : null;

        SAXParserFactory factory = SAXParserFactory.newInstance();
        // Harden against XXE: the dump is plain element content and needs no
        // external entity resolution.
        factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
        factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        SAXParser parser = factory.newSAXParser();

        WikiPageParser handler = new WikiPageParser(graph, content, categories, linksEN, coordinates, population, area, language);
        // try-with-resources guarantees the dump reader is closed even when
        // parsing throws; the handler's own output files are closed afterwards.
        try (BufferedReader br = openDump(wikipedia)) {
            parser.parse(new InputSource(br), handler);
        } finally {
            handler.close();
        }
    }

    /**
     * Opens the dump file as a UTF-8 reader, transparently decompressing
     * bzip2 input when the path ends with ".bz2".
     *
     * @param path path to a pages-articles dump, either .xml or .xml.bz2
     * @return a buffered UTF-8 reader over the decompressed XML
     * @throws IOException if the file cannot be opened or is not valid bzip2
     */
    private static BufferedReader openDump(String path) throws IOException {
        InputStream in = new FileInputStream(path);
        if (path.endsWith(".bz2")) {
            // decompressConcatenated=true: "multistream" Wikipedia dumps are
            // concatenated bz2 streams; the default would silently stop after
            // the first stream and truncate the dump.
            in = new BZip2CompressorInputStream(in, true);
        }
        // Always decode as UTF-8 — the dumps are UTF-8; relying on the
        // platform default charset (as new FileReader(path) would) corrupts
        // non-ASCII text on non-UTF-8 systems.
        return new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
    }
}