// Output buffer, seeded with `header` (declared outside this excerpt) before any
// lines from the dump are appended.
StringBuilder content = new StringBuilder();
content.append(header);
// Pattern "0000" zero-pads the integer part to at least four digits (e.g. 7 -> "0007").
// NOTE(review): presumably used to build numbered output names together with
// `filenumber`; the usage is outside this excerpt -- confirm.
NumberFormat decimalFormatter = new DecimalFormat("0000");
File dumpFile = new File(dumpFilePath);
// Line-by-line reader over the dump; the source stream depends on the file extension.
FileLineIterator it;
if (dumpFilePath.endsWith(".bz2")) {
// default compression format from http://download.wikimedia.org
// Wrap the raw file stream in a bzip2 decompression stream before iterating lines.
// NOTE(review): the FileInputStream opened here is not closed anywhere in this
// excerpt -- confirm that the iterator/stream is closed later in the method.
CompressionCodec codec = new BZip2Codec();
it = new FileLineIterator(codec.createInputStream(new FileInputStream(dumpFile)));
} else {
// assume the user has previously de-compressed the dump file
it = new FileLineIterator(dumpFile);
}
// Counter starting at zero.
// NOTE(review): presumably the index of the current output chunk -- confirm against
// the code that follows this excerpt.
int filenumber = 0;
while (it.hasNext()) {
String thisLine = it.next();
if (thisLine.trim().startsWith("<page>")) {
boolean end = false;
while (!thisLine.trim().startsWith("</page>")) {
content.append(thisLine).append('\n');
if (it.hasNext()) {
thisLine = it.next();
} else {
end = true;
break;
}
}