// Skip entries whose file name starts with "_" (presumably job bookkeeping
// output rather than doclength data -- confirm against the producing job).
if (fileStats[i].getPath().getName().startsWith("_")) {
  continue;
}
LOG.info("processing " + fileStats[i].getPath());
FSLineReader reader = new FSLineReader(fileStats[i].getPath(), fs);
try {
  Text line = new Text();
  while (reader.readLine(line) > 0) {
    // Each record is split into two fields on a run of tabs:
    // field 0 is parsed as the docno, field 1 as the document length.
    String[] arr = line.toString().split("\\t+", 2);
    if (arr.length < 2) {
      // Fail with a message that identifies the bad record instead of a
      // bare ArrayIndexOutOfBoundsException on arr[1].
      throw new RuntimeException(
          "Error: malformed doclength record '" + line + "' in "
              + fileStats[i].getPath() + "!");
    }
    int docno = Integer.parseInt(arr[0]);
    int len = Integer.parseInt(arr[1]);
    // Note that because of speculative execution there may be
    // multiple copies of doclength data. Therefore, we can't
    // just count number of doclengths read. Instead, keep track
    // of largest docno encountered.
    if (docno < docnoOffset) {
      throw new RuntimeException(
          "Error: docno " + docno + " < docnoOffset " + docnoOffset + "!");
    }
    // Lengths are stored relative to docnoOffset; a duplicate record for
    // the same docno simply overwrites the earlier value.
    doclengths[docno - docnoOffset] = len;
    if (docno > maxDocno) {
      maxDocno = docno;
    }
    if (docno < minDocno) {
      minDocno = docno;
    }
  }
} finally {
  // Always release the underlying stream, even when a record fails to
  // parse or violates the docnoOffset check above.
  reader.close();
}
// Only reached when the whole file was read and closed successfully.
context.getCounter(DocLengths.Files).increment(1);
}
// Report the docno range tracked in minDocno/maxDocno while reading the
// doclength files above.
LOG.info("min docno: " + minDocno);
LOG.info("max docno: " + maxDocno);