/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.datamodel;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;
import org.archive.modules.SchedulingConstants;
import org.archive.modules.extractor.LinkContext.SimpleLinkContext;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.TmpDirTestCase;
/**
* Tests related to CrawlURI
*
* @contributor stack
* @contributor gojomo
* @version $Revision$, $Date$
*/
public class CrawlURITest extends TmpDirTestCase {
CrawlURI seed = null;
protected void setUp() throws Exception {
super.setUp();
final String url = "http://www.dh.gov.uk/Home/fs/en";
this.seed = new CrawlURI(UURIFactory.getInstance(url));
this.seed.setSchedulingDirective(SchedulingConstants.MEDIUM);
this.seed.setSeed(true);
// Force caching of string.
this.seed.toString();
// TODO: should this via really be itself?
this.seed.setVia(UURIFactory.getInstance(url));
}
/**
* Test serialization/deserialization works.
*
* @throws IOException
* @throws ClassNotFoundException
*/
final public void testSerialization()
throws IOException, ClassNotFoundException {
File serialize = new File(getTmpDir(),
this.getClass().getName() + ".serialize");
try {
FileOutputStream fos = new FileOutputStream(serialize);
ObjectOutputStream oos = new ObjectOutputStream(fos);
oos.writeObject(this.seed);
oos.reset();
oos.writeObject(this.seed);
oos.reset();
oos.writeObject(this.seed);
oos.close();
// Read in the object.
FileInputStream fis = new FileInputStream(serialize);
ObjectInputStream ois = new ObjectInputStream(fis);
CrawlURI deserializedCuri = (CrawlURI)ois.readObject();
deserializedCuri = (CrawlURI)ois.readObject();
deserializedCuri = (CrawlURI)ois.readObject();
assertEquals("Deserialized not equal to original",
this.seed.toString(), deserializedCuri.toString());
String host = this.seed.getUURI().getHost();
assertTrue("Deserialized host not null",
host != null && host.length() >= 0);
} finally {
serialize.delete();
}
}
public void testCandidateURIWithLoadedAList()
throws URIException {
UURI uuri = UURIFactory.getInstance("http://www.archive.org");
CrawlURI curi = new CrawlURI(uuri);
curi.setSeed(true);
curi.getData().put("key", "value");
assertTrue("Didn't find AList item",
curi.getData().get("key").equals("value"));
}
public void testExtendHopsPath() {
assertEquals("from empty","L",CrawlURI.extendHopsPath("",'L'));
assertEquals("from one","LX",CrawlURI.extendHopsPath("L",'X'));
assertEquals(
"from fortynine",
"LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLX",
CrawlURI.extendHopsPath("LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL",'X'));
assertEquals(
"from fifty",
"1+LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLX",
CrawlURI.extendHopsPath("LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL",'X'));
assertEquals(
"from 149",
"100+LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLX",
CrawlURI.extendHopsPath("99+LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL",'X'));
}
public void testNullPathFromSeed() throws URIException {
// check comparing with null
CrawlURI a = new CrawlURI(
UURIFactory.getInstance("http://example.com/1"), // a == b
null, // a < b
UURIFactory.getInstance("http://example.com/via/1"), // a == b
new SimpleLinkContext("1")); // a == b
assertEquals("", a.getPathFromSeed());
CrawlURI b = new CrawlURI(
UURIFactory.getInstance("http://example.com/1"), // a == b
"", // a < b
UURIFactory.getInstance("http://example.com/via/1"), // a == b
new SimpleLinkContext("1")); // a == b
assertEquals("", b.getPathFromSeed());
assertEquals(0, a.compareTo(b));
assertEquals(0, b.compareTo(a));
}
public void testOrdering() throws URIException {
// check that via is highest precedence
CrawlURI a = new CrawlURI(
UURIFactory.getInstance("http://example.com/2"), // a > b
"2", // a > b
UURIFactory.getInstance("http://example.com/via/1"), // a < b
new SimpleLinkContext("2")); // a > b
CrawlURI b = new CrawlURI(
UURIFactory.getInstance("http://example.com/1"), // a > b
"1", // a > b
UURIFactory.getInstance("http://example.com/via/2"), // a < b
new SimpleLinkContext("1")); // a > b
assertEquals(-1, a.compareTo(b));
assertEquals(1, b.compareTo(a));
// check that uri is next highest
a = new CrawlURI(
UURIFactory.getInstance("http://example.com/1"), // a < b
"2", // a > b
UURIFactory.getInstance("http://example.com/via/1"), // a == b
new SimpleLinkContext("2")); // a > b
b = new CrawlURI(
UURIFactory.getInstance("http://example.com/2"), // a < b
"1", // a > b
UURIFactory.getInstance("http://example.com/via/1"), // a == b
new SimpleLinkContext("1")); // a > b
assertEquals(-1, a.compareTo(b));
assertEquals(1, b.compareTo(a));
// check that via context is next
a = new CrawlURI(
UURIFactory.getInstance("http://example.com/1"), // a == b
"2", // a > b
UURIFactory.getInstance("http://example.com/via/1"), // a == b
new SimpleLinkContext("1")); // a < b
b = new CrawlURI(
UURIFactory.getInstance("http://example.com/1"), // a == b
"1", // a > b
UURIFactory.getInstance("http://example.com/via/1"), // a == b
new SimpleLinkContext("2")); // a < b
assertEquals(-1, a.compareTo(b));
assertEquals(1, b.compareTo(a));
// check that pathFromSeed is next
a = new CrawlURI(
UURIFactory.getInstance("http://example.com/1"), // a == b
"1", // a < b
UURIFactory.getInstance("http://example.com/via/1"), // a == b
new SimpleLinkContext("1")); // a == b
b = new CrawlURI(
UURIFactory.getInstance("http://example.com/1"), // a == b
"2", // a < b
UURIFactory.getInstance("http://example.com/via/1"), // a == b
new SimpleLinkContext("1")); // a == b
assertEquals(-1, a.compareTo(b));
assertEquals(1, b.compareTo(a));
// check equality
a = new CrawlURI(
UURIFactory.getInstance("http://example.com/1"), // a == b
"1", // a == b
UURIFactory.getInstance("http://example.com/via/1"), // a == b
new SimpleLinkContext("1")); // a == b
b = new CrawlURI(
UURIFactory.getInstance("http://example.com/1"), // a == b
"1", // a == b
UURIFactory.getInstance("http://example.com/via/1"), // a == b
new SimpleLinkContext("1")); // a == b
assertEquals(0, a.compareTo(b));
assertEquals(0, b.compareTo(a));
}
}