/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.io;
import java.io.IOException;
import java.text.NumberFormat;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
import org.archive.io.warc.WARCReader;
import org.archive.io.warc.WARCReaderFactory;
import org.archive.io.warc.WARCRecord;
import java.util.logging.Logger;
/**
 * Command-line diagnostic tool, useful for determining why ArchiveReader
 * fails for problematic W/ARC files.
 * @author siznax
 *
 */
public class ArchiveTest
{
    /** logger for errors, warnings */
    private static final Logger logger =
        Logger.getLogger(ArchiveTest.class.getName());

    /** main method modes to scan for errors, filter,
     * and emulate wayback use cases
     */
    public static String[] modes = {"index","replay","dump","cdx","filter"};

    /** arbitrary buffer size for replay mode */
    static int BUFFER_SIZE = 1024*16;

    /** some typical mimetypes found in W/ARCs */
    static String[] mimeTypes = {
        "image/gif",
        "image/png",
        "text/css",
        "text/dns",
        "text/html",
        "text/plain"
    };

    /** input W/ARC filename */
    String arcFilename;

    /** one of available modes */
    String mode;

    /** mimetype to select from input */
    String filter;

    /** byte offset into input file */
    long offset;

    /** W/ARC record index to begin output */
    protected int recordStartIndex;

    /** W/ARC record index to end output */
    protected int recordEndIndex;

    /** count of W/ARC records read from input */
    protected int recordCount;

    /** count of selected mimetype found in input */
    protected int filterCount;

    /**
     * Creates an unconfigured instance; use the setters or
     * {@link #instanceMain(String[])} to configure it.
     * @throws IOException declared for compatibility with existing callers
     */
    public ArchiveTest() throws IOException {
    }

    /** @param arcFile input W/ARC filename */
    void setArcFile(String arcFile) {
        this.arcFilename = arcFile;
    }

    /** @param mode one of the available modes, see {@link #modes} */
    public void setMode(String mode) {
        this.mode = mode;
    }

    /** @param filter mimetype to select from input */
    public void setFilter(String filter) {
        this.filter = filter;
    }

    /** @param offset byte offset into input file */
    public void setOffset(long offset) {
        this.offset = offset;
    }

    /**
     * @param start record index at which output begins
     * @param end record index at which output ends (inclusive)
     */
    void setRecordRange(int start, int end) {
        this.recordStartIndex = start;
        this.recordEndIndex = end;
    }

    /**
     * @return true if archive filename ends in ".arc" or ".arc.gz"
     */
    boolean isARCFormat() {
        return this.arcFilename.endsWith(".arc")
            || this.arcFilename.endsWith(".arc.gz");
    }

    /**
     * @return ARCReader if {@link #isARCFormat()}=true, else WARCReader
     * @throws IOException
     */
    ArchiveReader getReader() throws IOException {
        if (this.isARCFormat()) {
            return ARCReaderFactory.get(this.arcFilename);
        }
        return WARCReaderFactory.get(this.arcFilename);
    }

    /**
     * @param index current record index into arc file
     * @return true if current index is in range (both ends inclusive)
     */
    boolean inRecordRange(long index) {
        return index >= this.recordStartIndex
            && index <= this.recordEndIndex;
    }

    /**
     * @param r ArchiveRecord
     * @param filter mimetype string, see mimeTypes
     * @return true if the record's mimetype equals the given filter; false
     * when either the filter or the record's mimetype is null
     */
    boolean filterMimeType(ArchiveRecord r, String filter) {
        // compare from the filter side so a record without a mimetype
        // simply does not match instead of throwing
        return filter != null
            && filter.equals(r.getHeader().getMimetype());
    }

    /**
     * log a warning if the given record reports errors. Only ARCRecord
     * instances are inspected; any other record type is skipped.
     * @param record record just read from the archive
     */
    void logRecordErrors(ArchiveRecord record) {
        if (!(record instanceof ARCRecord)) {
            return;
        }
        ARCRecord arcRecord = (ARCRecord) record;
        if (arcRecord.hasErrors()) {
            ArchiveRecordHeader header = record.getHeader();
            logger.warning("record at offset: " + header.getOffset()
                + " has errors: " + arcRecord.getErrors());
        }
    }

    /** emulate ArchiveRecord.outputCDX for comparison */
    static void outputCdx(ArchiveRecordHeader h) {
        String[] hdr = {
            h.getDate(),
            "-", // Ip
            h.getUrl(),
            h.getMimetype(),
            "-", // status code
            "-", // digest
            String.valueOf(h.getOffset()),
            String.valueOf(h.getLength()),
        };
        StringBuilder line = new StringBuilder();
        for (String fld : hdr) {
            line.append(fld).append(" ");
        }
        System.out.println(line);
    }

    /**
     * print selected metadata of an ARC record
     * @param record ARC record supplying the digest
     * @param header header of the same record
     */
    void printMetadata(ARCRecord record, ArchiveRecordHeader header) {
        ARCRecordMetaData meta = (ARCRecordMetaData) header;
        System.out.print( " Date : " + header.getDate() + "\n"
            + " IP : " + meta.getIp() + "\n"
            + " URL : " + header.getUrl() + "\n"
            + " MIME : " + header.getMimetype() + "\n"
            + " Status: " + meta.getStatusCode() + "\n"
            + " Digest: " + record.getDigestStr() + "\n"
            + " Offset: " + header.getOffset() + "\n"
            + " Length: " + header.getLength() + "\n");
    }

    /**
     * print selected metadata of a WARC record
     * @param record WARC record (unused; selects this overload)
     * @param header header of the same record
     */
    void printMetadata(WARCRecord record, ArchiveRecordHeader header) {
        System.out.print( " Date : " + header.getDate() + "\n"
            + " IP : " + header.getHeaderValue("WARC-IP-Address") + "\n"
            + " URL : " + header.getUrl() + "\n"
            + " MIME : " + header.getMimetype() + "\n"
            + " Status: " + "-" + "\n"
            + " Digest: " + header.getHeaderValue("WARC-Payload-Digest") + "\n"
            + " Offset: " + header.getOffset() + "\n"
            + " Length: " + header.getLength() + "\n");
    }

    /** print the current configuration on STDOUT */
    void printInfo() {
        System.out.println(this.getClass().getName());
        System.out.println(" file: " + this.arcFilename);
        System.out.println(" format: " + this.getFormat());
        System.out.println(" mode: " + this.mode);
        if ("filter".equals(this.mode)) {
            System.out.println(" filter: " + this.filter);
        }
        // offset is only consumed by the single-record modes
        if ("index".equals(this.mode) || "replay".equals(this.mode)) {
            System.out.println(" offset: " + this.offset);
        }
        if ("filter".equals(this.mode)
                || "cdx".equals(this.mode)
                || "dump".equals(this.mode)) {
            System.out.println(" range: " + "[" + this.recordStartIndex
                + "," + this.recordEndIndex + "]");
        }
    }

    /**
     * return W/ARC extension and compression extension
     * @return the filename suffix starting at its dot, e.g. ".warc.gz" or
     * ".arc"; the empty string if the filename contains no dot
     */
    String getFormat() {
        String name = this.arcFilename;
        int searchFrom = name.length() - 1;
        if (name.endsWith(".gz")) {
            // start looking before the trailing ".gz" so the preceding
            // W/ARC extension is included in the result
            searchFrom = name.length() - 4;
        }
        int dot = name.lastIndexOf('.', searchFrom);
        if (dot < 0) {
            // e.g. "x.gz": nothing precedes the compression extension
            dot = name.lastIndexOf('.');
        }
        return (dot < 0) ? "" : name.substring(dot);
    }

    /**
     * process output by selected mode; the reader is closed when done
     * @throws IOException
     */
    void readArchive() throws IOException {
        ArchiveReader reader = this.getReader();
        try {
            if ("index".equals(this.mode)) {
                // parse HTTP header only
                System.out.println("INDEX " + this.getArcType()
                    + " record at offset: " + offset);
                if (this.isARCFormat()) {
                    indexRecord((ARCReader)reader);
                } else {
                    indexRecord((WARCReader)reader);
                }
            } else if ("replay".equals(this.mode)) {
                // skip header and read
                System.out.println("REPLAY " + this.getArcType()
                    + " record at offset: " + offset);
                if (this.isARCFormat()) {
                    this.replayRecord((ARCReader)reader);
                } else {
                    this.replayRecord((WARCReader)reader);
                }
            } else if ("dump".equals(this.mode)) {
                this.dumpArchive(reader);
            } else if ("cdx".equals(this.mode)) {
                this.outputArchiveCDX(reader);
            } else if ("filter".equals(this.mode)) { // filter MIME type
                this.filterArchive(reader);
            } else { // scan; do nothing, but count iterations
                this.scanArchive(reader);
            }
        } finally {
            reader.close();
        }
        if (this.offset == -1) {
            System.out.println("\n========== found: "
                + this.recordCount + " records. ");
        }
        System.out.println("\n========== Done.");
    }

    /**
     * get archive type by file extension
     * @return first component of the file extension, e.g. 'warc' for
     * '.warc.gz'; the empty string if there is no extension
     */
    private String getArcType() {
        // getFormat() starts with a dot, so element 0 of the split is empty
        String[] parts = getFormat().split("\\.");
        return (parts.length > 1) ? parts[1] : "";
    }

    /**
     * scan (read) archive printing "." for each record or errors if they occur
     * and total number of records found
     * @param reader an ArchiveReader instance
     */
    private void scanArchive(ArchiveReader reader) {
        System.out.println();
        for (ArchiveRecord record : reader) {
            this.recordCount++;
            logRecordErrors(record);
            System.out.print(".");
            if ((this.recordCount % 100) == 0) {
                System.out.print("[" + this.recordCount+ "]\n");
            }
        }
    }

    /**
     * filter archive on a mimetype for records in range
     * @param reader an ArchiveReader instance
     */
    private void filterArchive(ArchiveReader reader) {
        for (ArchiveRecord record : reader) {
            recordCount++;
            if (inRecordRange(recordCount)
                    && filterMimeType(record, this.filter)) {
                System.out.print(mode + " [" + recordCount + "] ");
                outputCdx(record.getHeader());
                filterCount++;
            }
            // stop before the iterator fetches a record past the range
            if (recordCount >= this.recordEndIndex) {
                break;
            }
        }
        // avoid NaN when no record was read
        double filterPercent = (recordCount == 0)
            ? 0.0 : (double) filterCount / recordCount;
        NumberFormat filterPercentFmt = NumberFormat.getPercentInstance();
        filterPercentFmt.setMinimumFractionDigits(2);
        System.out.println("\n========== found: "
            + filterCount + "/" + recordCount + " = "
            + filterPercentFmt.format(filterPercent)
            + " mimetype=" + filter
            + " records. ");
    }

    /**
     * output CDX-like output for records in range
     * @param reader an ArchiveReader instance
     */
    private void outputArchiveCDX(ArchiveReader reader) {
        for (ArchiveRecord record : reader) {
            recordCount++;
            if (inRecordRange(recordCount)) {
                System.out.print(mode + " [" + recordCount + "] ");
                logRecordErrors(record);
                outputCdx(record.getHeader());
            }
            // stop before the iterator fetches a record past the range
            if (recordCount >= this.recordEndIndex) {
                break;
            }
        }
    }

    /**
     * write records in range on STDOUT
     * @param reader an ArchiveReader instance
     * @throws IOException
     */
    private void dumpArchive(ArchiveReader reader) throws IOException {
        for (ArchiveRecord record : reader) {
            recordCount++;
            if (inRecordRange(recordCount)) {
                System.out.println("\n********** "
                    + mode + " ["+recordCount+"] "
                    + "**********\n");
                record.dump();
            }
            // stop before the iterator fetches a record past the range
            if (recordCount >= this.recordEndIndex) {
                break;
            }
        }
    }

    /**
     * copy the remaining bytes of a record to STDOUT, or report the
     * available byte count if the record has nothing to read
     * @param record record positioned where output should start
     * @throws IOException
     */
    private static void writeRecordBody(ArchiveRecord record)
            throws IOException {
        if (record.available() > 0) {
            byte[] buffer = new byte[BUFFER_SIZE];
            int n;
            while ((n = record.read(buffer, 0, BUFFER_SIZE)) != -1) {
                System.out.write(buffer, 0, n);
            }
            System.out.flush();
        } else {
            System.out.println("record bytes available: "
                + record.available());
        }
    }

    /**
     * wayback-like replay of ARC record at offset
     * @param arcReader an ARCReader instance
     * @throws IOException
     */
    private void replayRecord(ARCReader arcReader) throws IOException {
        arcReader.setStrict(true);
        ARCRecord arcRecord = (ARCRecord) arcReader.get(this.offset);
        arcRecord.skipHttpHeader();
        if (arcRecord.hasErrors()) {
            logger.warning("record has errors: " + arcRecord.getErrors());
        }
        writeRecordBody(arcRecord);
    }

    /**
     * wayback-like replay of WARC record at offset
     * @param warcReader a WARCReader instance
     * @throws IOException
     */
    private void replayRecord(WARCReader warcReader) throws IOException {
        warcReader.setStrict(true);
        WARCRecord warcRecord = (WARCRecord) warcReader.get(this.offset);
        writeRecordBody(warcRecord);
    }

    /**
     * wayback-like index an ARC record at offset
     * @param arcReader an ARCReader instance
     * @throws IOException
     */
    private void indexRecord(ARCReader arcReader) throws IOException {
        arcReader.setStrict(true);
        arcReader.setParseHttpHeaders(true);
        ARCRecord arcRecord = (ARCRecord) arcReader.get(this.offset);
        ArchiveRecordHeader header = arcRecord.getHeader();
        if (arcRecord.hasErrors()) {
            logger.warning("record has errors: " + arcRecord.getErrors());
        }
        System.out.println("========== dumping HTTP header:");
        arcRecord.dumpHttpHeader();
        System.out.println("========== selected metadata:");
        arcRecord.close(); // must close record to get digest
        printMetadata(arcRecord,header);
        System.out.println("========== getting metadata:");
        System.out.println(arcRecord.getMetaData());
        System.out.println("\n"
            + "record length declared: "
            + header.getLength() + "\n"
            + "header bytes read : "
            + arcRecord.httpHeaderBytesRead);
    }

    /**
     * wayback-like index a WARC record at offset
     * @param warcReader a WARCReader instance
     * @throws IOException
     */
    private void indexRecord(WARCReader warcReader) throws IOException {
        warcReader.setStrict(true);
        WARCRecord warcRecord = (WARCRecord)warcReader.get(this.offset);
        ArchiveRecordHeader header = warcRecord.getHeader();
        System.out.println("========== selected metadata:");
        warcRecord.close(); // must close record to get digest
        printMetadata(warcRecord,header);
        System.out.println("========== header: \n" + header);
    }

    /**
     * test (scan|cdx|index|replay|dump|filter) an archive.
     * some of these modes are use-cases for wayback indexing mentioned in:
     * http://webarchive.jira.com/browse/HER-1568
     * <p>
     * Arguments, in order: arcfile (an ARC or WARC archive, possibly .gz),
     * offset (byte offset into archive), mode (default=scan),
     * record_range_start (default=0), record_range_end (default=100),
     * filter (mimetype, e.g. "text/html").
     * @param args command-line arguments as described above
     * @throws IOException
     */
    public static void main(String[] args) throws IOException {
        new ArchiveTest().instanceMain(args);
    }

    /**
     * configure this instance from command-line arguments and run the
     * selected mode; prints a usage line if fewer than two arguments are
     * given
     * @param args see {@link #main(String[])}
     * @throws IOException
     * @throws NumberFormatException if offset or a range bound is not a
     * number
     */
    public void instanceMain(String[] args) throws IOException {
        if (args.length < 2) {
            String usage = "ArchiveTest arcfile offset "
                + "[ [scan|cdx|index|replay|dump|filter] "
                + "record_range_start record_range_end filter]";
            System.out.println(usage);
            return;
        }
        // parse every numeric argument before touching any state;
        // the offset is a long so archives beyond 2 GiB are addressable
        long offset = Long.parseLong(args[1]);
        String mode = (args.length>2) ? args[2] : "scan";
        int start = (args.length>3) ? Integer.parseInt(args[3]) : 0;
        int end = (args.length>4) ? Integer.parseInt(args[4]) : 100;
        String filter = (args.length>5) ? args[5] : null;
        setArcFile(args[0]);
        setOffset(offset);
        setMode(mode);
        setRecordRange(start,end);
        setFilter(filter);
        printInfo();
        readArchive();
    }
}