IndexWriter writer = new IndexWriter(dir, config);
LOG.info("Creating index at " + indexPath);
LOG.info("Indexing with " + threads + " threads");
try {
WikipediaBz2DumpInputStream stream = new WikipediaBz2DumpInputStream(path);
ExecutorService executor = Executors.newFixedThreadPool(threads);
int cnt = 0;
String page;
while ((page = stream.readNext()) != null) {
String title = cleaner.getTitle(page);
// These are heuristics specifically for filtering out non-articles in enwiki-20120104.
if (title.startsWith("Wikipedia:") || title.startsWith("Portal:") || title.startsWith("File:")) {
continue;