Reader[] cacheReaders = SequenceFileOutputFormat.getReaders(getConf(),
input);
for (Reader current : cacheReaders) {
// read the key + values in that file
Text key = new Text();
BehemothDocument inputDoc = new BehemothDocument();
while (current.next(key, inputDoc)) {
count[0]++;
// filter the doc?
if (!docFilter.keep(inputDoc))
continue;
if (dumpBinary && inputDoc.getContent() == null)
continue;
else if (!dumpBinary && inputDoc.getText() == null)
continue;
String fileName = Integer.toString(count[0]);
String urldoc = inputDoc.getUrl();
if (mode.equals(FileNamingMode.URL) && urldoc != null
&& urldoc.length() > 0) {
fileName = URLEncoder.encode(urldoc, "UTF-8");
} else if (mode.equals(FileNamingMode.UUID) && urldoc != null
&& urldoc.length() > 0) {
fileName = UUID.nameUUIDFromBytes(urldoc.getBytes())
.toString();
} else {
fileName = String.format("%09d", count[0]);
}
if (!dumpBinary)
fileName += ".txt";
byte[] contentBytes;
if (dumpBinary)
contentBytes = inputDoc.getContent();
else
contentBytes = inputDoc.getText().getBytes("UTF-8");
// out.write(contentBytes, 0, contentBytes.length);
addToArchive(fileName, contentBytes, dir);
// add the mapping URL->filename in the index -> archive num
index.writeBytes(urldoc + "\t" + fileName + "\t"