/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.modules.extractor;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.Recorder;
/**
 * Unit tests for {@link ExtractorPDFContent}.
 *
 * <p>Each test loads a PDF fixture from the classpath, runs it through the
 * extractor created by {@link #makeExtractor()}, and checks that a known set
 * of URIs is contained in the outlinks of the processed {@link CrawlURI}.
 * Additional outlinks beyond the expected ones are tolerated.
 */
public class ExtractorPDFContentTest extends ContentExtractorTestBase {

    protected static final String TEST_RESOURCE_FILE_1 = "ExtractorPDFContentTest1.pdf";
    protected static final String TEST_RESOURCE_FILE_2 = "ExtractorPDFContentTest2.pdf";
    protected static final String TEST_RESOURCE_FILE_3 = "ExtractorPDFContentTest3.pdf";
    protected static final String TEST_RESOURCE_FILE_4 = "ExtractorPDFContentTest4.pdf";

    /** URI assigned to every fixture; the fixtures are not actually fetched from it. */
    private static final String TEST_URL = "http://www.example.com/fake.pdf";

    /** Checks the links expected from the first fixture. */
    public void testA() throws URIException, UnsupportedEncodingException, IOException, InterruptedException {
        assertExtractsLinks(TEST_RESOURCE_FILE_1,
                "http://www.businessdictionary.com/definition/supervisor.html",
                "http://management.about.com/od/policiesandprocedures/g/supervisor1.html");
    }

    /** Checks the links expected from the second fixture. */
    public void testEndingInDot() throws URIException, UnsupportedEncodingException, IOException, InterruptedException {
        assertExtractsLinks(TEST_RESOURCE_FILE_2,
                "http://www.fec.gov/data/CommitteeSummary.do",
                "http://www.opensecrets.org/bigpicture/elec_stats.php",
                "http://www.opensecrets.org/pacs");
    }

    /** Checks that a link whose path contains underscores is extracted from the third fixture. */
    public void testUnderscoreInURL() throws URIException, UnsupportedEncodingException, IOException, InterruptedException {
        assertExtractsLinks(TEST_RESOURCE_FILE_3,
                "http://www.dot.gov/sites/dot.dev/files/docs/2014_February_ATCR.pdf");
    }

    /** Checks the links expected from the fourth fixture, including {@code .../testOpeningParen}. */
    public void testParenthesis() throws URIException, UnsupportedEncodingException, IOException, InterruptedException {
        assertExtractsLinks(TEST_RESOURCE_FILE_4,
                "http://www.unisys.com",
                "http://www.myserver.mycorp.com/images/exttest.jpg",
                "http://www.adobe.com/intro?100,200",
                "http://www.w3.org/1999/xhtml",
                "http://www.xfa.org/schema/xfa-data/1.0",
                "http://www.adobe.com",
                "http://www.adobe.com/getacro.gif",
                "http://www.example.com/testOpeningParen");
    }

    /** Checks the links expected from the fourth fixture, including {@code .../test}. */
    public void testNewlineSeparatedURIs() throws URIException, UnsupportedEncodingException, IOException, InterruptedException {
        assertExtractsLinks(TEST_RESOURCE_FILE_4,
                "http://www.unisys.com",
                "http://www.myserver.mycorp.com/images/exttest.jpg",
                "http://www.example.com/test",
                "http://www.adobe.com/intro?100,200",
                "http://www.w3.org/1999/xhtml",
                "http://www.xfa.org/schema/xfa-data/1.0",
                "http://www.adobe.com",
                "http://www.adobe.com/getacro.gif");
    }

    /**
     * Builds the extractor under test: an {@link ExtractorPDFContent} with a
     * {@link UnitTestUriLoggerModule} installed as its logger module.
     */
    @Override
    protected Extractor makeExtractor() {
        ExtractorPDFContent result = new ExtractorPDFContent();
        UriErrorLoggerModule ulm = new UnitTestUriLoggerModule();
        result.setLoggerModule(ulm);
        return result;
    }

    /**
     * Runs the extractor over the given fixture and fails, naming the absent
     * links, unless every expected URL is among the resulting outlinks.
     *
     * @param resourceFileName classpath name of the PDF fixture
     * @param expectedUrls URLs that must each appear as an outlink
     * @throws IOException if the fixture cannot be found or read, or a URL cannot be parsed
     * @throws InterruptedException if the extractor is interrupted while processing
     */
    private void assertExtractsLinks(String resourceFileName, String... expectedUrls)
            throws IOException, InterruptedException {
        CrawlURI testUri = createTestUri(TEST_URL, resourceFileName);
        extractor.process(testUri);

        // Collect the absent links instead of asserting containsAll() directly,
        // so that a failure reports exactly which URLs were not extracted.
        Set<CrawlURI> missing = new HashSet<CrawlURI>();
        for (CrawlURI expectedLink : makeLinkSet(testUri, expectedUrls)) {
            if (!testUri.getOutLinks().contains(expectedLink)) {
                missing.add(expectedLink);
            }
        }
        assertTrue("Links not extracted from " + resourceFileName + ": " + missing,
                missing.isEmpty());
    }

    /**
     * Creates the set of links expected from {@code sourceUri}, each built via
     * {@code sourceUri.createCrawlURI} with {@code NAVLINK_MISC} context and
     * {@code NAVLINK} hop.
     *
     * @param sourceUri the URI the links are created from
     * @param urlStrs the link targets
     * @return a new mutable set holding one {@link CrawlURI} per distinct link
     * @throws URIException if a URL string cannot be turned into a URI
     */
    private Set<CrawlURI> makeLinkSet(CrawlURI sourceUri, String[] urlStrs) throws URIException {
        HashSet<CrawlURI> linkSet = new HashSet<CrawlURI>();
        for (String urlStr : urlStrs) {
            CrawlURI link = sourceUri.createCrawlURI(urlStr, HTMLLinkContext.NAVLINK_MISC, Hop.NAVLINK);
            linkSet.add(link);
        }
        return linkSet;
    }

    /**
     * Creates a {@link CrawlURI} that looks like a successfully fetched PDF:
     * the fixture's bytes are read through a {@link Recorder}, which is then
     * attached to the URI together with content type
     * {@code application/pdf}, fetch status 200 and the recorded content size.
     *
     * @param urlStr the URI string to assign to the result
     * @param resourceFileName classpath name of the PDF fixture
     * @return the prepared URI, ready to be passed to the extractor
     * @throws IOException if the fixture is not on the classpath or cannot be read
     */
    private CrawlURI createTestUri(String urlStr, String resourceFileName) throws URIException,
            UnsupportedEncodingException, IOException {
        UURI testUuri = UURIFactory.getInstance(urlStr);
        CrawlURI testUri = new CrawlURI(testUuri, null, null, LinkContext.NAVLINK_MISC);

        File temp = File.createTempFile("test", ".tmp");
        // Do not leave a placeholder file behind in the temp directory per run.
        temp.deleteOnExit();

        InputStream resource =
                ExtractorPDFContentTest.class.getClassLoader().getResourceAsStream(resourceFileName);
        if (resource == null) {
            // Fail with the fixture name rather than an obscure error further down.
            throw new IOException("Test resource not found on classpath: " + resourceFileName);
        }

        // The recorder must stay open after this method returns: it is handed
        // to the CrawlURI below for the extractor to use.
        Recorder recorder = new Recorder(temp, 1024, 1024);
        InputStream is = recorder.inputWrap(resource);
        // Marked before any byte is read; presumably this makes the recorder
        // treat the entire fixture as content (no header section) - TODO confirm.
        recorder.markContentBegin();
        try {
            // Drain the stream so that every byte passes through the recorder.
            while (is.read() >= 0) {
                // nothing to do with the bytes themselves
            }
        } finally {
            is.close();
        }

        testUri.setContentType("application/pdf");
        testUri.setFetchStatus(200);
        testUri.setRecorder(recorder);
        testUri.setContentSize(recorder.getResponseContentLength());
        return testUri;
    }
}