/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.io.arc;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.NullInputStream;
import org.apache.commons.io.output.NullOutputStream;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.ReplayInputStream;
import org.archive.io.WriterPoolMember;
import org.archive.io.WriterPoolSettings;
import org.archive.util.ArchiveUtils;
import org.archive.util.TmpDirTestCase;
import com.google.common.io.Closeables;
/**
* Test ARCWriter class.
*
* This code exercises ARCWriter AND ARCReader. First it writes ARCs w/
* ARCWriter. Then it validates what was written w/ ARCReader.
*
* @author stack
*/
public class ARCWriterTest
extends TmpDirTestCase implements ARCConstants {
/**
 * ARCWriter subclass for writing bad ARCs: when junk bytes have been set,
 * they are written from {@link #postWriteRecordTasks()} before the
 * superclass's own post-record tasks run.
 */
public class CorruptibleARCWriter extends ARCWriter {
    /** Junk to emit after each record; {@code null} means emit nothing. */
    byte[] endJunk = null;

    public CorruptibleARCWriter(AtomicInteger serial_no, WriterPoolSettings settings) {
        super(serial_no, settings);
    }

    @Override
    protected void postWriteRecordTasks() throws IOException {
        final byte[] junk = this.endJunk;
        if (junk != null) {
            write(junk);
        }
        super.postWriteRecordTasks();
    }

    /**
     * @param b bytes to write after each subsequent record, or
     *          {@code null} to stop writing junk.
     */
    public void setEndJunk(byte[] b) throws IOException {
        endJunk = b;
    }
}
/**
* Suffix appended to the name template of ARC files made by these JUnit
* tests (see writeRecords and createARCWriter).
*/
private static final String SUFFIX = "JUNIT";
/**
* URL used for the hand-written test records; iterateRecords() asserts that
* every record after the first has a URL starting with this value.
*/
private static final String SOME_URL = "http://www.archive.org/test/";
/**
* Serial-number counter handed to every ARCWriter constructed in this class.
*/
private static final AtomicInteger SERIAL_NO = new AtomicInteger();
/**
 * Delegates straight to the superclass set-up; this test adds no fixture
 * of its own.
 *
 * @see TestCase#setUp()
 */
@Override
protected void setUp() throws Exception {
    super.setUp();
}
/**
 * Delegates straight to the superclass tear-down; this test has nothing
 * extra to release.
 *
 * @see TestCase#tearDown()
 */
@Override
protected void tearDown() throws Exception {
    super.tearDown();
}
/**
 * @return the canned HTTP response used as record content, with the
 *         default page title (no index).
 */
protected static String getContent() {
    final String noIndex = null;
    return getContent(noIndex);
}
/**
 * Builds a small canned HTTP response (status line, one header, HTML body).
 *
 * @param indexStr label put in the page title and body as
 *                 {@code "Page #<indexStr>"}; when {@code null} the text
 *                 {@code "Some Page"} is used instead.
 * @return the response text, headers and body separated by a blank line.
 */
protected static String getContent(String indexStr) {
    final String page;
    if (indexStr == null) {
        page = "Some Page";
    } else {
        page = "Page #" + indexStr;
    }
    final String headers = "HTTP/1.1 200 OK\r\n"
        + "Content-Type: text/html\r\n\r\n";
    final String html = "<html><head><title>" + page + "</title></head>"
        + "<body>" + page + "</body></html>";
    return headers + html;
}
/**
 * Writes one canned HTTP record, labelled with {@code index}, to the
 * given writer.
 *
 * @param arcWriter writer to add the record to.
 * @param index     number used in both the record URL and the page text.
 * @return the record length passed to the writer: content bytes plus one
 *         trailing newline.
 * @throws IOException if the writer fails.
 */
@SuppressWarnings("deprecation")
protected int writeRandomHTTPRecord(ARCWriter arcWriter, int index)
throws IOException {
    final String id = Integer.toString(index);
    final byte[] payload = getContent(id).getBytes();

    final ByteArrayOutputStream body = new ByteArrayOutputStream();
    body.write(payload);
    // Add the newline between records back in.
    body.write("\n".getBytes());
    final int recordLength = payload.length + 1;

    // The record timestamp is the current time as a 14-digit date.
    final long timestamp = Long.parseLong(ArchiveUtils.get14DigitDate());
    arcWriter.write("http://www.one.net/id=" + id, "text/html",
        "0.1.2.3", timestamp, recordLength, body);
    return recordLength;
}
/**
 * Writes {@code recordCount} canned records to a fresh ARC in the tmp dir.
 *
 * @param baseName    prefix for the ARC file name; old files with this
 *                    base name are cleaned up first.
 * @param compress    whether the ARC should be compressed.
 * @param maxSize     maximum ARC size handed to the writer settings.
 * @param recordCount number of records to write.
 * @return the ARC file written (asserted to exist).
 * @throws IOException if writing or closing fails.
 */
private File writeRecords(String baseName, boolean compress,
        long maxSize, int recordCount)
throws IOException {
    cleanUpOldFiles(baseName);
    File [] files = {getTmpDir()};
    ARCWriter arcWriter =
        new ARCWriter(
            SERIAL_NO,
            new WriterPoolSettingsData(
                baseName,
                "${prefix}-"+SUFFIX,
                maxSize,
                compress,
                Arrays.asList(files),
                null));
    // Always release the writer, even when a record write fails. A close
    // failure is only swallowed when another exception is already
    // propagating; on the success path it is still reported.
    boolean threw = true;
    try {
        for (int i = 0; i < recordCount; i++) {
            writeRandomHTTPRecord(arcWriter, i);
        }
        threw = false;
    } finally {
        Closeables.close(arcWriter, threw);
    }
    final File written = arcWriter.getFile();
    assertTrue("Doesn't exist: " + written.getAbsolutePath(),
        written.exists());
    return written;
}
/**
 * Validates an ARC with ARCReader, then re-reads every record by absolute
 * offset and checks each one has a mimetype and a positive length.
 *
 * @param arcFile     ARC to check.
 * @param recordCount expected number of records, or {@code -1} when the
 *                    count is unknown (the count assertion is then skipped).
 * @throws FileNotFoundException if the ARC cannot be opened.
 * @throws IOException           on read failure.
 */
private void validate(File arcFile, int recordCount)
throws FileNotFoundException, IOException {
    List<ArchiveRecordHeader> metaDatas;
    ARCReader reader = ARCReaderFactory.get(arcFile);
    assertNotNull(reader);
    try {
        if (recordCount == -1) {
            metaDatas = reader.validate();
        } else {
            metaDatas = reader.validate(recordCount);
        }
    } finally {
        reader.close();
    }
    // Now, run through each of the records doing absolute get going from
    // the end to start. Reopen the arc each time so no context carries
    // over between reads.
    for (int i = metaDatas.size() - 1; i >= 0; i--) {
        ARCRecordMetaData meta = (ARCRecordMetaData)metaDatas.get(i);
        reader = ARCReaderFactory.get(arcFile);
        try {
            ArchiveRecord r = reader.get(meta.getOffset());
            String mimeType = r.getHeader().getMimetype();
            assertTrue("Record is bogus",
                mimeType != null && mimeType.length() > 0);
        } finally {
            reader.close();
        }
    }
    // -1 means "count unknown"; comparing it to a size could never pass.
    if (recordCount != -1) {
        assertEquals("Metadata count not as expected",
            recordCount, metaDatas.size());
    }
    for (ArchiveRecordHeader header : metaDatas) {
        ARCRecordMetaData r = (ARCRecordMetaData)header;
        assertTrue("Record is empty", r.getLength() > 0);
    }
}
/** Runs the ARC file-size check against an uncompressed ARC. */
public void testCheckARCFileSize()
throws IOException {
    final boolean compress = false;
    runCheckARCFileSizeTest("checkARCFileSize", compress);
}
/** Runs the ARC file-size check against a compressed ARC. */
public void testCheckARCFileSizeCompressed()
throws IOException {
    final boolean compress = true;
    runCheckARCFileSizeTest("checkARCFileSize", compress);
}
/** Writes two records to an uncompressed ARC and validates the result. */
public void testWriteRecord() throws IOException {
    final int recordCount = 2;
    // One extra for the header record.
    final int expectedCount = recordCount + 1;
    final File arcFile = writeRecords("writeRecord", false,
        DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
    validate(arcFile, expectedCount);
}
/**
 * Checks that a reader opened at a record's offset returns that record,
 * and that iterating from that offset sees one record fewer than
 * iterating the whole ARC.
 */
public void testRandomAccess() throws IOException {
    final int recordCount = 3;
    File arcFile = writeRecords("writeRecord", true,
        DEFAULT_MAX_ARC_FILE_SIZE, recordCount);

    // Walk the whole ARC, noting the URL and offset of the second record.
    String url = null;
    long offset = -1;
    long totalRecords = 0;
    ARCReader reader = ARCReaderFactory.get(arcFile);
    try {
        for (final Iterator<ArchiveRecord> i = reader.iterator();
                i.hasNext(); totalRecords++) {
            ARCRecord ar = (ARCRecord)i.next();
            if (totalRecords == 1) {
                url = ar.getMetaData().getUrl();
                offset = ar.getMetaData().getOffset();
            }
        }
    } finally {
        reader.close();
    }
    assertTrue("ARC has no second record to seek to", offset >= 0);

    // Open directly at that offset; the record there must be the same one.
    reader = ARCReaderFactory.get(arcFile, offset);
    try {
        ArchiveRecord ar = reader.get();
        assertEquals(url, ar.getHeader().getUrl());
        ar.close();
    } finally {
        reader.close();
    }

    // Get reader again. See how iterator works with offset.
    int count = 0;
    reader = ARCReaderFactory.get(arcFile, offset);
    try {
        for (final Iterator<ArchiveRecord> i = reader.iterator();
                i.hasNext(); i.next()) {
            count++;
        }
    } finally {
        reader.close();
    }
    assertEquals(totalRecords - 1, count);
}
/** Writes two records to a compressed ARC and validates the result. */
public void testWriteRecordCompressed() throws IOException {
    final int recordCount = 2;
    // One extra for the header record.
    final int expectedCount = recordCount + 1;
    final File arcFile = writeRecords("writeRecordCompressed", true,
        DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
    validate(arcFile, expectedCount);
}
/**
 * Writes a single 3 GB record of null input to a writer whose output is
 * discarded, so no real file of that size is produced.
 */
public void testWriteGiantRecord() throws IOException {
    final PrintStream sink = new PrintStream(new NullOutputStream());
    final WriterPoolSettingsData settings =
        new WriterPoolSettingsData("", "", -1, false, null, null);
    final ARCWriter arcWriter =
        new ARCWriter(SERIAL_NO, sink, new File("dummy"), settings);
    assertNotNull(arcWriter);

    // The record is stamped with the current wall-clock time.
    final long now = System.currentTimeMillis();
    final long recordLength = org.apache.commons.io.FileUtils.ONE_GB * 3;
    final NullInputStream content = new NullInputStream(recordLength);
    arcWriter.write("dummy:uri", "application/octet-stream",
        "0.1.2.3", now, recordLength, content);
    arcWriter.close();
}
/**
 * Writes 15 records with a 1024 maximum ARC size and validates that the
 * file holds them plus the header record.
 *
 * @param baseName prefix for the ARC file name.
 * @param compress whether to write a compressed ARC.
 */
private void runCheckARCFileSizeTest(String baseName, boolean compress)
throws FileNotFoundException, IOException {
    final int recordCount = 15;
    final long maxSize = 1024;
    final File arc = writeRecords(baseName, compress, maxSize, recordCount);
    validate(arc, recordCount + 1);
}
/**
 * Creates a {@link CorruptibleARCWriter} that writes into the tmp dir.
 *
 * @param name     prefix for the ARC file name.
 * @param compress whether the ARC should be compressed.
 * @return a new writer; the caller is responsible for closing it.
 */
protected CorruptibleARCWriter createARCWriter(String name, boolean compress) {
    final List<File> outputDirs = Arrays.asList(getTmpDir());
    final WriterPoolSettingsData settings = new WriterPoolSettingsData(
        name,
        "${prefix}-"+SUFFIX,
        DEFAULT_MAX_ARC_FILE_SIZE,
        compress,
        outputDirs,
        null);
    return new CorruptibleARCWriter(SERIAL_NO, settings);
}
/**
 * @param str text to expose as a stream.
 * @return a stream over the bytes of {@code str} in the platform's
 *         default charset.
 */
protected static ByteArrayInputStream getBais(String str)
throws IOException {
    final byte[] raw = str.getBytes();
    return new ByteArrayInputStream(raw);
}
/**
 * Writes a record, suppressing normal length-checks (so that
 * intentionally malformed records may be written).
 *
 * @param writer writer to add the record to.
 * @param url    record URL.
 * @param type   record mimetype.
 * @param len    length to declare for the record; it is not required to
 *               match what {@code bais} actually holds.
 * @param bais   record content.
 * @throws IOException if the writer fails.
 */
protected static void writeRecord(ARCWriter writer, String url,
        String type, int len, ByteArrayInputStream bais)
throws IOException {
    final String ip = "192.168.1.1";
    final long fetchTime = (new Date()).getTime();
    final boolean enforceLength = false;
    writer.write(url, type, ip, fetchTime, len, bais, enforceLength);
}
/**
 * Iterates over every record in the reader, closing each one, and checks
 * that all records after the first have a URL starting with SOME_URL.
 *
 * @param r reader to iterate.
 * @return number of records seen, the first one included.
 * @throws IOException if closing a record fails.
 */
protected int iterateRecords(ARCReader r)
throws IOException {
    int seen = 0;
    final Iterator<ArchiveRecord> records = r.iterator();
    while (records.hasNext()) {
        final ARCRecord current = (ARCRecord)records.next();
        current.close();
        // The first record is exempt from the URL check.
        if (seen > 0) {
            final String url = current.getMetaData().getUrl();
            assertTrue("Unexpected URL " + url, url.startsWith(SOME_URL));
        }
        seen++;
    }
    return seen;
}
/**
 * Creates a writer and writes one well-formed record to it.
 *
 * @param name       prefix for the ARC file name.
 * @param compressed whether the ARC should be compressed.
 * @return the still-open writer, so callers can add further records.
 * @throws IOException if writing the record fails.
 */
protected CorruptibleARCWriter createArcWithOneRecord(String name,
        boolean compressed)
throws IOException {
    final CorruptibleARCWriter arcWriter = createARCWriter(name, compressed);
    final String body = getContent();
    final int declaredLength = body.length();
    writeRecord(arcWriter, SOME_URL, "text/html",
        declaredLength, getBais(body));
    return arcWriter;
}
/**
 * A record URL containing a space must be rejected with an IOException
 * whose message starts with "Metadata line doesn't match".
 */
public void testSpaceInURL() {
    String eMessage = null;
    try {
        holeyUrl("testSpaceInURL", false, " ");
    } catch (IOException e) {
        eMessage = e.getMessage();
    }
    // Null-check first so a missing exception is reported as an assertion
    // failure with this message rather than as a NullPointerException.
    assertTrue("Didn't get expected exception: " + eMessage,
        eMessage != null
            && eMessage.startsWith("Metadata line doesn't match"));
}
/**
 * A record URL containing a tab must be rejected with an IOException
 * whose message starts with "Metadata line doesn't match".
 */
public void testTabInURL() {
    String eMessage = null;
    try {
        holeyUrl("testTabInURL", false, "\t");
    } catch (IOException e) {
        eMessage = e.getMessage();
    }
    // Null-check first so a missing exception is reported as an assertion
    // failure with this message rather than as a NullPointerException.
    assertTrue("Didn't get expected exception: " + eMessage,
        eMessage != null
            && eMessage.startsWith("Metadata line doesn't match"));
}
/**
 * Writes one good record, then attempts a second whose URL has
 * {@code urlInsert} spliced in between SOME_URL and "/index.html".
 *
 * @param name      prefix for the ARC file name.
 * @param compress  whether the ARC should be compressed.
 * @param urlInsert text to splice into the URL (e.g. a space or tab).
 * @throws IOException if a write fails; an exception raised while
 *                     closing the writer is swallowed.
 */
protected void holeyUrl(String name, boolean compress, String urlInsert)
throws IOException {
    ARCWriter writer = null;
    try {
        writer = createArcWithOneRecord(name, compress);
        // Second record carries the URL with the inserted text.
        final String body = getContent();
        final String holeyRecordUrl = SOME_URL + urlInsert + "/index.html";
        writeRecord(writer, holeyRecordUrl, "text/html",
            body.length(), getBais(body));
    } finally {
        Closeables.close(writer, true);
    }
}
// If uncompressed, length has to be right or parse will fail.
//
// public void testLengthTooShort() throws IOException {
// lengthTooShort("testLengthTooShort-" + PREFIX, false);
// }
/** Too-short declared length, compressed ARC, non-strict reader. */
public void testLengthTooShortCompressed() throws IOException {
    final boolean compress = true;
    final boolean strict = false;
    lengthTooShort("testLengthTooShortCompressed", compress, strict);
}
/**
 * Too-short declared length, compressed ARC, strict reader: iteration is
 * expected to fail with a RuntimeException whose message starts with
 * "java.io.IOException: Record STARTING at".
 */
public void testLengthTooShortCompressedStrict()
throws IOException {
    String eMessage = null;
    try {
        lengthTooShort("testLengthTooShortCompressedStrict",
            true, true);
    } catch (RuntimeException e) {
        eMessage = e.getMessage();
    }
    // Null-check first so a missing exception is reported as an assertion
    // failure with this message rather than as a NullPointerException.
    assertTrue("Didn't get expected exception: " + eMessage,
        eMessage != null
            && eMessage.startsWith("java.io.IOException: Record STARTING at"));
}
/**
 * Writes an ARC containing a record whose declared length is shorter than
 * the content supplied, followed by a record with end junk appended, then
 * reads it back while capturing System.err.
 *
 * Expects four records in total and captured output that starts with
 * "WARNING" and mentions "Record STARTING at".
 *
 * @param name     prefix for the ARC file name.
 * @param compress whether the ARC should be compressed.
 * @param strict   strictness setting applied to the reader.
 */
protected void lengthTooShort(String name, boolean compress, boolean strict)
throws IOException {
    CorruptibleARCWriter writer = null;
    try {
        writer = createArcWithOneRecord(name, compress);
        final String body = getContent();
        // Declared length covers only the body; the stream holds more.
        ByteArrayInputStream oversized = getBais(body+"SOME TRAILING BYTES");
        writeRecord(writer, SOME_URL, "text/html",
            body.length(), oversized);
        // From here on the writer appends junk after each record.
        writer.setEndJunk("SOME TRAILING BYTES".getBytes());
        writeRecord(writer, SOME_URL, "text/html",
            body.length(), getBais(body));
    } finally {
        Closeables.close(writer, true);
    }

    // Catch System.err into a byte stream.
    final ByteArrayOutputStream captured = new ByteArrayOutputStream();
    final PrintStream savedErr = System.err;
    ARCReader reader = null;
    try {
        System.setErr(new PrintStream(captured));
        reader = ARCReaderFactory.get(writer.getFile());
        reader.setStrict(strict);
        final int count = iterateRecords(reader);
        assertTrue("Count wrong " + count, count == 4);
        // Make sure we get the warning string which complains about the
        // trailing bytes.
        final String err = captured.toString();
        assertTrue("No message " + err, err.startsWith("WARNING") &&
            (err.indexOf("Record STARTING at") > 0));
        reader.close();
    } finally {
        Closeables.close(reader, true);
        System.setErr(savedErr);
    }
}
// If uncompressed, length has to be right or parse will fail.
//
// public void testLengthTooLong()
// throws IOException {
// lengthTooLong("testLengthTooLongCompressed-" + PREFIX,
// false, false);
// }
/** Too-long declared length, compressed ARC, non-strict reader. */
public void testLengthTooLongCompressed()
throws IOException {
    final boolean compress = true;
    final boolean strict = false;
    lengthTooLong("testLengthTooLongCompressed", compress, strict);
}
/**
 * Too-long declared length, compressed ARC, strict reader: reading is
 * expected to fail with an IOException whose message starts with
 * "Premature EOF before end-of-record".
 */
public void testLengthTooLongCompressedStrict() {
    String eMessage = null;
    try {
        // ARC name matches this test's name, as the other tests do.
        lengthTooLong("testLengthTooLongCompressedStrict",
            true, true);
    } catch (IOException e) {
        eMessage = e.getMessage();
    }
    // Null-check first so a missing exception is reported as an assertion
    // failure with this message rather than as a NullPointerException.
    assertTrue("Didn't get expected exception: " + eMessage,
        eMessage != null
            && eMessage.startsWith("Premature EOF before end-of-record"));
}
/**
 * Writes an ARC containing a record whose declared length is ten more
 * than its content, followed by a correct record, then reads it back
 * while capturing System.err.
 *
 * Expects four records in total and captured output that starts with
 * "WARNING Premature EOF before end-of-record".
 *
 * @param name     prefix for the ARC file name.
 * @param compress whether the ARC should be compressed.
 * @param strict   strictness setting applied to the reader.
 */
protected void lengthTooLong(String name, boolean compress,
        boolean strict)
throws IOException {
    final ARCWriter writer = createArcWithOneRecord(name, compress);
    final String body = getContent();
    // Add a record with a length that is too long.
    final int overstatedLength = body.length() + 10;
    writeRecord(writer, SOME_URL+"2", "text/html",
        overstatedLength, getBais(body));
    writeRecord(writer, SOME_URL+"3", "text/html",
        body.length(), getBais(body));
    writer.close();

    // Catch System.err.
    final ByteArrayOutputStream captured = new ByteArrayOutputStream();
    final PrintStream savedErr = System.err;
    ARCReader reader = null;
    try {
        System.setErr(new PrintStream(captured));
        reader = ARCReaderFactory.get(writer.getFile());
        reader.setStrict(strict);
        final int count = iterateRecords(reader);
        assertTrue("Count wrong " + count, count == 4);
        // Make sure we get the warning string which complains about the
        // premature end of the record.
        final String err = captured.toString();
        assertTrue("No message " + err,
            err.startsWith("WARNING Premature EOF before end-of-record"));
    } finally {
        Closeables.close(reader, true);
        System.setErr(savedErr);
    }
}
/**
 * Feeds the writer a ReplayInputStream that misreports its remaining
 * length and expects the write to fail with an IOException mentioning
 * "Gap between expected and actual".
 */
public void testGapError() throws IOException {
    final ARCWriter writer = createArcWithOneRecord("testGapError", true);
    final String content = getContent();
    // Make a 'weird' RIS that returns a bad 'remaining' length (-1)
    // when remaining should be 0.
    final ReplayInputStream ris = new ReplayInputStream(content.getBytes(),
            content.length(), null) {
        @Override
        public long remaining() {
            if (super.remaining() == 0) {
                return -1;
            }
            return super.remaining();
        }
    };
    String message = null;
    try {
        final long fetchTime = (new Date()).getTime();
        writer.write(SOME_URL, "text/html", "192.168.1.1",
            fetchTime, content.length(), ris);
    } catch (IOException e) {
        message = e.getMessage();
    } finally {
        IOUtils.closeQuietly(ris);
    }
    writer.close();
    final boolean sawGap = message != null
        && message.indexOf("Gap between expected and actual") >= 0;
    assertTrue("No gap when should be", sawGap);
}
/**
 * Write an arc file holding a single record, for other tests to use.
 *
 * @param arcdir Directory to write to.
 * @param compress True if file should be compressed.
 * @return ARC written.
 * @throws IOException if writing or closing the ARC fails.
 */
public static File createARCFile(File arcdir, boolean compress)
throws IOException {
    final List<File> outputDirs = Arrays.asList(arcdir);
    final WriterPoolSettingsData settings = new WriterPoolSettingsData(
        "",
        "test",
        DEFAULT_MAX_ARC_FILE_SIZE,
        compress,
        outputDirs,
        null);
    final ARCWriter writer = new ARCWriter(SERIAL_NO, settings);
    final String body = getContent();
    writeRecord(writer, SOME_URL, "text/html", body.length(),
        getBais(body));
    writer.close();
    return writer.getFile();
}
// public void testSpeed() throws IOException {
// ARCWriter writer = createArcWithOneRecord("speed", true);
// // Add a record with a length that is too long.
// String content = getContent();
// final int count = 100000;
// logger.info("Starting speed write of " + count + " records.");
// for (int i = 0; i < count; i++) {
// writeRecord(writer, SOME_URL, "text/html", content.length(),
// getBaos(content));
// }
// writer.close();
// logger.info("Finished speed write test.");
// }
/**
 * Passes a well-formed metadata line to validateMetaLine() bare, with
 * LINE_SEPARATOR appended, and with a third suffix; none may throw.
 */
public void testValidateMetaLine() throws Exception {
    final String line = "http://www.aandw.net/images/walden2.png " +
        "128.197.34.86 20060111174224 image/png 2160";
    // NOTE(review): the last suffix is the four literal characters
    // backslash-r-backslash-n, not a CR/LF pair - confirm that is intended.
    final String[] candidates = {
        line,
        line + LINE_SEPARATOR,
        line + "\\r\\n",
    };
    final ARCWriter writer = createARCWriter("testValidateMetaLine", true);
    try {
        for (String candidate : candidates) {
            writer.validateMetaLine(candidate);
        }
    } finally {
        writer.close();
    }
}
/**
 * Reads a few bytes of a record into a buffer at a non-zero offset.
 * This used to fail because of bad math when locating the position in
 * the buffer to read into.
 */
public void testArcRecordOffsetReads() throws Exception {
    ARCReader r = getSingleRecordReader("testArcRecordInBufferStream");
    try {
        ARCRecord ar = getSingleRecord(r);
        final byte[] buffer = new byte[17];
        final int maxRead = 4;
        int totalRead = 0;
        while (totalRead < maxRead) {
            final int read =
                ar.read(buffer, 13 + totalRead, maxRead - totalRead);
            // Check each read on its own: adding a -1 (EOF) to the running
            // total would only shrink it and hide the premature end.
            assertTrue("Expected bytes but read returned " + read,
                read > 0);
            totalRead += read;
        }
    } finally {
        r.close();
    }
}
/**
 * available() should always be >= 0, and extra read()s past the end of
 * the record should all give EOF.
 */
public void testArchiveRecordAvailableConsistent() throws Exception {
    final ARCReader reader =
        getSingleRecordReader("testArchiveRecordAvailableConsistent");
    final ARCRecord record = getSingleRecord(reader);
    // Drain the record a byte at a time via the no-param read().
    while (record.read() >= 0) {
        // keep reading until EOF
    }
    // Consecutive reads after end-of-record should always give -1, and
    // available() must not go negative.
    int attempt = 0;
    while (attempt < 5) {
        assertTrue("available negative:"+record.available(), record.available()>=0);
        assertEquals(-1, record.read());
        attempt++;
    }
    reader.close();
}
/**
 * Repeated array reads past end-of-record should always give -1.
 */
public void testArchiveRecordEORConsistent() throws Exception {
    final ARCReader reader =
        getSingleRecordReader("testArchiveRecordEORConsistent");
    final ARCRecord record = getSingleRecord(reader);
    readToEOS(record);
    // Consecutive reads after end-of-record should always give -1.
    int attempt = 0;
    while (attempt < 5) {
        assertEquals(-1, record.read(new byte[1]));
        attempt++;
    }
    reader.close();
}
// should not throw premature EOF when wrapped with BufferedInputStream
// [HER-1450] showed this was the case using Apache Tika
//
// The test passes as long as the sequence below completes without an
// exception; it makes no explicit assertions.
public void testArchiveRecordMarkSupport() throws Exception {
ARCReader r = getSingleRecordReader("testArchiveRecordMarkSupport");
ARCRecord record = getSingleRecord(r);
// Strict mode is switched on for the record before it is wrapped.
record.setStrict(true);
// ensure mark support
InputStream stream = new BufferedInputStream(record);
if (stream.markSupported()) {
// Three rounds of: drain to end-of-stream, mark, then reset.
for (int i=0; i<3; i++) {
this.readToEOS(stream);
stream.mark(stream.available());
stream.reset();
}
stream.close();
}
r.close();
}
/**
 * Test a particular style of using the reader iterator: fetch each
 * record and close it straight away. (Should possibly be on a
 * reader-centric test class, but the best setup functionality is here.)
 *
 * @throws IOException if writing or reading the ARC fails.
 */
public void testReadIterator() throws IOException {
    final int recordCount = 3;
    final File arcFile = writeRecords("writeRecord", true,
        DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
    final ARCReader reader = ARCReaderFactory.get(arcFile);
    for (Iterator<ArchiveRecord> it = reader.iterator(); it.hasNext();) {
        it.next().close();
    }
    reader.close();
}
/**
 * Reads and discards everything left in the stream, stopping at the
 * first read that reports end-of-stream.
 *
 * @param in stream to drain; it is not closed.
 */
protected void readToEOS(InputStream in) throws Exception {
    final byte[] scratch = new byte[1024];
    while (in.read(scratch) >= 0) {
        // discard and keep going
    }
}
/**
 * Writes a compressed ARC holding one record and opens a reader on it.
 *
 * @param name prefix for the ARC file name.
 * @return a reader on the new ARC; the caller must close it.
 */
protected ARCReader getSingleRecordReader(String name) throws Exception {
    // Get an ARC with one record, and close it so the file is complete.
    final WriterPoolMember arc = createArcWithOneRecord(name, true);
    arc.close();
    // Get reader on said ARC.
    return ARCReaderFactory.get(arc.getFile());
}
/**
 * @param r reader on an ARC made by getSingleRecordReader().
 * @return the record that follows the first (ARC meta) record.
 */
protected ARCRecord getSingleRecord(ARCReader r) {
    final Iterator<ArchiveRecord> records = r.iterator();
    // Skip first ARC meta record.
    records.next();
    // hasNext() is called for its effect on the iterator; the result is
    // deliberately unused.
    records.hasNext();
    // Now we're at first and only record in ARC.
    final ArchiveRecord only = records.next();
    return (ARCRecord) only;
}
}