package io.lumify.wikipedia.mapreduce;
import io.lumify.core.util.LumifyLogger;
import io.lumify.core.util.LumifyLoggerFactory;
import io.lumify.wikipedia.RandomAccessFileInputStream;
import org.apache.commons.cli.*;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import java.io.*;
import java.text.DecimalFormat;
public class WikipediaFileToMRFile {
private static final LumifyLogger LOGGER = LumifyLoggerFactory.getLogger(WikipediaFileToMRFile.class);
private static final DecimalFormat NUMBER_FORMATTER = new DecimalFormat("#,###");
public static void main(String[] args) throws ParseException, IOException {
Options options = new Options();
options.addOption(
OptionBuilder
.withLongOpt("in")
.withDescription("Input file name")
.hasArg(true)
.withArgName("file")
.create("i")
);
options.addOption(
OptionBuilder
.withLongOpt("out")
.withDescription("Output file name")
.hasArg(true)
.withArgName("file")
.create("o")
);
options.addOption(
OptionBuilder
.withLongOpt("help")
.withDescription("Prints help")
.hasArg(false)
.create("h")
);
CommandLineParser parser = new GnuParser();
CommandLine cmd = parser.parse(options, args);
if (cmd.hasOption("help")) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("run", options, true);
System.exit(1);
return;
}
if (!cmd.hasOption("in")) {
System.err.println("in is required");
System.exit(1);
return;
}
String in = cmd.getOptionValue("in");
if (!cmd.hasOption("out")) {
System.err.println("out is required");
System.exit(1);
return;
}
String out = cmd.getOptionValue("out");
new WikipediaFileToMRFile().run(in, out);
}
private void run(String inputFileName, String outputFileName) throws IOException {
InputStream in;
RandomAccessFile randomAccessFile = null;
File inputFile = new File(inputFileName);
if (!inputFile.exists()) {
throw new RuntimeException("Could not find " + inputFileName);
}
if (inputFile.getName().endsWith("bz2")) {
FileInputStream fileInputStream = new FileInputStream(inputFile);
in = new BZip2CompressorInputStream(fileInputStream);
} else {
randomAccessFile = new RandomAccessFile(inputFile, "r");
in = new RandomAccessFileInputStream(randomAccessFile);
}
File outputFile = new File(outputFileName);
if (outputFile.exists()) {
throw new RuntimeException("Output file already exists " + outputFileName);
}
OutputStream out = new FileOutputStream(outputFile);
run(randomAccessFile, in, out);
}
private void run(RandomAccessFile randomAccessFile, InputStream in, OutputStream out) throws IOException {
BufferedReader reader = new BufferedReader(new InputStreamReader(in));
try {
String line;
int lineNumber = 0;
int pageCount = 0;
boolean foundStartPage = false;
while ((line = reader.readLine()) != null) {
if ((lineNumber % 100000) == 0) {
LOGGER.info("Processing line " + NUMBER_FORMATTER.format(lineNumber) + (randomAccessFile == null ? "" : " (offset: " + randomAccessFile.getFilePointer() + ")"));
}
if (line.contains("<page>") && line.trim().equals("<page>")) {
foundStartPage = true;
writeLine(line, out);
} else if (line.contains("</page>") && line.trim().equals("</page>")) {
writeLine(line, out);
out.write("\n".getBytes());
pageCount++;
if ((pageCount % 1000) == 0) {
LOGGER.info("Processing page " + NUMBER_FORMATTER.format(pageCount));
}
foundStartPage = false;
} else if (foundStartPage) {
writeLine(line, out);
out.write("\\n".getBytes());
}
lineNumber++;
}
} finally {
reader.close();
}
}
private void writeLine(String line, OutputStream out) throws IOException {
out.write(line.trim().getBytes());
}
}