*/
public synchronized void dump(boolean sorted, PrintStream output) throws Exception {
reset();
FetcherOutput fo = new FetcherOutput();
Content co = new Content();
ParseData pd = new ParseData();
ParseText pt = new ParseText();
long recNo = 0L;
if (!sorted) {
while(next(fo, co, pt, pd)) {
output.println("Recno:: " + recNo++);
output.println("FetcherOutput::\n" + fo.toString());
if (contentReader != null)
output.println("Content::\n" + co.toString());
if (parseDataReader != null)
output.println("ParseData::\n" + pd.toString());
if (parseTextReader != null)
output.println("ParseText::\n" + pt.toString());
output.println("");
}
} else {
File unsortedFile = new File(segmentDir, ".unsorted");
File sortedFile = new File(segmentDir, ".sorted");
nfs.delete(unsortedFile);
nfs.delete(sortedFile);
SequenceFile.Writer seqWriter = new SequenceFile.Writer(nfs,
unsortedFile.toString(), UTF8.class, LongWritable.class);
FetchListEntry fle;
LongWritable rec = new LongWritable();
UTF8 url = new UTF8();
String urlString;
while (fetcherReader.next(fo) != null) {
fle = fo.getFetchListEntry();
urlString = fle.getPage().getURL().toString();
rec.set(recNo);
url.set(urlString);
seqWriter.append(url, rec);
recNo++;
}
seqWriter.close();
// sort the SequenceFile
long start = System.currentTimeMillis();
SequenceFile.Sorter sorter = new SequenceFile.Sorter(nfs,
new UTF8.Comparator(), LongWritable.class);
sorter.sort(unsortedFile.toString(), sortedFile.toString());
float localSecs = (System.currentTimeMillis() - start) / 1000.0f;
LOG.info(" - sorted: " + recNo + " entries in " + localSecs + "s, "
+ (recNo/localSecs) + " entries/s");
nfs.delete(unsortedFile);
SequenceFile.Reader seqReader = new SequenceFile.Reader(nfs, sortedFile.toString());
while (seqReader.next(url, rec)) {
recNo = rec.get();
get(recNo, fo, co, pt, pd);
output.println("Recno:: " + recNo++);
output.println("FetcherOutput::\n" + fo.toString());
if (contentReader != null)
output.println("Content::\n" + co.toString());
if (parseDataReader != null)
output.println("ParseData::\n" + pd.toString());
if (parseTextReader != null)
output.println("ParseText::\n" + pt.toString());
output.println("");
}
seqReader.close();