*
* @param args Command line arguments
*/
public static void main(String[] args) throws IOException {
    // Fail with a readable message instead of an ArrayIndexOutOfBoundsException
    // when the input file argument is missing.
    if (args == null || args.length < 1) {
        System.err.println("Missing argument: path of the block-compressed WARC file to read");
        return;
    }
    String filename = args[0];

    // Pass 1: read every record sequentially and remember its start/stop markers.
    Vector<long[]> markers = new Vector<long[]>();
    BlockCompressedInputStream stream =
            new BlockCompressedInputStream(new File(filename));
    try {
        DataInputStream in = new DataInputStream(stream);
        WarcRecord warcRecord;
        while ((warcRecord = readNextWarcRecord(in)) != null) {
            markers.add(new long[]{warcRecord.getStartMarker(), warcRecord.getStopMarker()});
        }
    } finally {
        // Release the file handle even if reading a record throws.
        stream.close();
    }

    // Pass 2: check if we can read the substreams from the markers. Each marker
    // pair gets a freshly opened stream restricted to [start, stop].
    for (long[] marker : markers) {
        BlockCompressedInputStream segmentSource =
                new BlockCompressedInputStream(new File(filename));
        try {
            DataInputStream segmentIn = new DataInputStream(
                    new SegmentedInputStream(segmentSource, marker[0], marker[1]));
            try {
                WarcRecord warcRecord;
                while ((warcRecord = readNextWarcRecord(segmentIn)) != null) {
                    System.out.println(warcRecord.toString());
                    WarcHTMLResponseRecord w = new WarcHTMLResponseRecord(warcRecord);
                    if (w.isHTMLResponse()) {
                        // See how the parsed content looks like: run the HTML through
                        // the parser and print the text collected by the extractor.
                        BulletParser parser = new BulletParser(TRECParsingFactory.INSTANCE);
                        ComposedCallbackBuilder composedBuilder = new ComposedCallbackBuilder();
                        StructuredTextExtractor textExtractor = new StructuredTextExtractor();
                        composedBuilder.add(textExtractor);
                        parser.setCallback(composedBuilder.compose());
                        parser.parse(w.getHTMLContent().toCharArray());
                        System.out.println(textExtractor.getText());
                    }
                }
            } finally {
                segmentIn.close();
            }
        } finally {
            // Closed explicitly (as the original did) in addition to the wrapper,
            // so the file handle is released even when parsing fails mid-segment.
            segmentSource.close();
        }
    }
}