long startTime = System.currentTimeMillis();
String path = cmdline.getOptionValue(INPUT_OPTION);
PrintStream out = new PrintStream(System.out, true, "UTF-8");
WikiClean cleaner = new WikiCleanBuilder().withTitle(true).build();
Directory dir = FSDirectory.open(new File(indexPath));
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, ANALYZER);
config.setOpenMode(OpenMode.CREATE);
IndexWriter writer = new IndexWriter(dir, config);
LOG.info("Creating index at " + indexPath);
LOG.info("Indexing with " + threads + " threads");
try {
WikipediaBz2DumpInputStream stream = new WikipediaBz2DumpInputStream(path);
ExecutorService executor = Executors.newFixedThreadPool(threads);
int cnt = 0;
String page;
while ((page = stream.readNext()) != null) {
String title = cleaner.getTitle(page);
// These are heuristics specifically for filtering out non-articles in enwiki-20120104.
if (title.startsWith("Wikipedia:") || title.startsWith("Portal:") || title.startsWith("File:")) {
continue;
}