/*
* Copyright 2009-2013 Scale Unlimited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package bixo.parser;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import junit.framework.Assert;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.ccil.cowan.tagsoup.Parser;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Node;
import org.dom4j.XPath;
import org.dom4j.io.SAXReader;
import org.hsqldb.lib.StringInputStream;
import org.junit.Test;
import bixo.config.ParserPolicy;
import bixo.datum.ContentBytes;
import bixo.datum.FetchedDatum;
import bixo.datum.HttpHeaders;
import bixo.datum.Outlink;
import bixo.datum.ParsedDatum;
import bixo.fetcher.HttpHeaderNames;
public class SimpleParserTest {
/**
 * Parses the base-url.html fixture and checks the two outlinks that come back.
 * The resolved URL of the first link is deliberately not asserted yet (see the TODO below).
 */
@Test
public void testRelativeLinkWithBaseUrl() throws Exception {
    // Build the fetch result from the fixture in test/resources.
    String pageUrl = "http://olddomain.com/base-url.html";
    String mimeType = "text/html; charset=utf-8";
    String pageHtml = readFromFile("parser-files/base-url.html");

    HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add(HttpHeaderNames.CONTENT_TYPE, mimeType);
    ContentBytes body = new ContentBytes(pageHtml.getBytes("utf-8"));
    long fetchTime = System.currentTimeMillis();
    FetchedDatum fetched = new FetchedDatum(pageUrl, pageUrl, fetchTime, responseHeaders, body, mimeType, 0);

    // Run the parser with its default configuration.
    ParsedDatum parsed = new SimpleParser().parse(fetched);

    // Both links must be reported, in document order.
    Outlink[] links = parsed.getOutlinks();
    Assert.assertEquals(2, links.length);

    // TODO KKr - reenable this test when Tika parser calls my handler with
    // the <base> element, which is needed to correctly resolve relative links.
    // Assert.assertEquals("http://newdomain.com/link", links[0].getToUrl());
    Assert.assertEquals("link1", links[0].getAnchor());

    Assert.assertEquals("http://domain.com/link", links[1].getToUrl());
    Assert.assertEquals("link2", links[1].getAnchor());
}
/**
 * Checks that a link in relative-urls.html is resolved against an absolute
 * Content-Location response header ("http://newdomain.com") instead of the fetched URL.
 */
@Test
public void testRelativeLinkWithLocationUrl() throws Exception {
    // Build the fetch result from the fixture in test/resources.
    String pageUrl = "http://olddomain.com/relative-urls.html";
    String contentLocation = "http://newdomain.com";
    String mimeType = "text/html; charset=utf-8";
    String pageHtml = readFromFile("parser-files/relative-urls.html");

    HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add(HttpHeaderNames.CONTENT_TYPE, mimeType);
    responseHeaders.add(HttpHeaderNames.CONTENT_LOCATION, contentLocation);
    ContentBytes body = new ContentBytes(pageHtml.getBytes("utf-8"));
    long fetchTime = System.currentTimeMillis();
    FetchedDatum fetched = new FetchedDatum(pageUrl, pageUrl, fetchTime, responseHeaders, body, mimeType, 0);

    // Run the parser with its default configuration.
    ParsedDatum parsed = new SimpleParser().parse(fetched);

    Outlink[] links = parsed.getOutlinks();
    Assert.assertEquals(2, links.length);

    // First link: expected on the Content-Location host.
    Assert.assertEquals("http://newdomain.com/link1", links[0].getToUrl());
    Assert.assertEquals("link1", links[0].getAnchor());
    // TODO KKr - reenable this test when Tika changes are submitted:
    // Assert.assertEquals("nofollow", links[0].getRelAttributes());

    // Second link: expected on domain.com regardless of the header.
    Assert.assertEquals("http://domain.com/link2", links[1].getToUrl());
    Assert.assertEquals("link2", links[1].getAnchor());
}
/**
 * Checks link resolution when the Content-Location response header is itself relative
 * ("redirected/"): the first link is expected under http://olddomain.com/redirected/.
 */
@Test
public void testRelativeLinkWithRelativeLocationUrl() throws Exception {
    // Build the fetch result from the fixture in test/resources.
    String pageUrl = "http://olddomain.com/relative-urls.html";
    String contentLocation = "redirected/";
    String mimeType = "text/html; charset=utf-8";
    String pageHtml = readFromFile("parser-files/relative-urls.html");

    HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add(HttpHeaderNames.CONTENT_TYPE, mimeType);
    responseHeaders.add(HttpHeaderNames.CONTENT_LOCATION, contentLocation);
    ContentBytes body = new ContentBytes(pageHtml.getBytes("utf-8"));
    long fetchTime = System.currentTimeMillis();
    FetchedDatum fetched = new FetchedDatum(pageUrl, pageUrl, fetchTime, responseHeaders, body, mimeType, 0);

    // Run the parser with its default configuration.
    ParsedDatum parsed = new SimpleParser().parse(fetched);

    Outlink[] links = parsed.getOutlinks();
    Assert.assertEquals(2, links.length);

    // First link: the relative header is combined with the fetched URL's host.
    Assert.assertEquals("http://olddomain.com/redirected/link1", links[0].getToUrl());
    Assert.assertEquals("link1", links[0].getAnchor());

    // Second link: expected on domain.com regardless of the header.
    Assert.assertEquals("http://domain.com/link2", links[1].getToUrl());
    Assert.assertEquals("link2", links[1].getAnchor());
}
/**
 * Checks link resolution when the FetchedDatum is built with a second URL
 * ("http://newdomain.com") that differs from the first: the first link is expected on
 * the new domain.
 */
@Test
public void testRelativeLinkWithRedirectUrl() throws Exception {
    // Build the fetch result from the fixture in test/resources.
    String originalUrl = "http://olddomain.com/relative-urls.html";
    String finalUrl = "http://newdomain.com";
    String mimeType = "text/html; charset=utf-8";
    String pageHtml = readFromFile("parser-files/relative-urls.html");

    HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add(HttpHeaderNames.CONTENT_TYPE, mimeType);
    ContentBytes body = new ContentBytes(pageHtml.getBytes("utf-8"));
    long fetchTime = System.currentTimeMillis();
    // Unlike the sibling tests, the first two constructor arguments differ here.
    FetchedDatum fetched = new FetchedDatum(originalUrl, finalUrl, fetchTime, responseHeaders, body, mimeType, 0);

    // Run the parser with its default configuration.
    ParsedDatum parsed = new SimpleParser().parse(fetched);

    Outlink[] links = parsed.getOutlinks();
    Assert.assertEquals(2, links.length);

    Assert.assertEquals("http://newdomain.com/link1", links[0].getToUrl());
    Assert.assertEquals("link1", links[0].getAnchor());

    Assert.assertEquals("http://domain.com/link2", links[1].getToUrl());
    Assert.assertEquals("link2", links[1].getAnchor());
}
/**
 * Parses all-link-types.html with a default SimpleParser and expects exactly two
 * outlinks, both carrying anchor text.
 */
@Test
public void testDefaultLinkTypes() throws Exception {
    // Build the fetch result from the fixture in test/resources.
    String pageUrl = "http://domain.com/all-link-types.html";
    String mimeType = "text/html; charset=utf-8";
    String pageHtml = readFromFile("parser-files/all-link-types.html");

    HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add(HttpHeaderNames.CONTENT_TYPE, mimeType);
    ContentBytes body = new ContentBytes(pageHtml.getBytes("utf-8"));
    long fetchTime = System.currentTimeMillis();
    FetchedDatum fetched = new FetchedDatum(pageUrl, pageUrl, fetchTime, responseHeaders, body, mimeType, 0);

    // Run the parser with its default configuration.
    ParsedDatum parsed = new SimpleParser().parse(fetched);

    // Verify outlinks are correct (and we only get the a href ones).
    Outlink[] links = parsed.getOutlinks();
    Assert.assertEquals(2, links.length);

    Assert.assertEquals("http://newdomain.com/link1", links[0].getToUrl());
    Assert.assertEquals("link1", links[0].getAnchor());

    Assert.assertEquals("http://domain.com/link2", links[1].getToUrl());
    Assert.assertEquals("link2", links[1].getAnchor());
}
/**
 * Parses all-link-types.html with a policy built from
 * {@code BaseLinkExtractor.ALL_LINK_TAGS} and
 * {@code BaseLinkExtractor.ALL_LINK_ATTRIBUTE_TYPES}, and expects seven outlinks.
 */
@Test
public void testAllLinkTypes() throws Exception {
    // Build the fetch result from the fixture in test/resources.
    String pageUrl = "http://domain.com/all-link-types.html";
    String mimeType = "text/html; charset=utf-8";
    String pageHtml = readFromFile("parser-files/all-link-types.html");

    HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add(HttpHeaderNames.CONTENT_TYPE, mimeType);
    ContentBytes body = new ContentBytes(pageHtml.getBytes("utf-8"));
    long fetchTime = System.currentTimeMillis();
    FetchedDatum fetched = new FetchedDatum(pageUrl, pageUrl, fetchTime, responseHeaders, body, mimeType, 0);

    // Parse with every link tag and link attribute type enabled.
    ParserPolicy allLinksPolicy = new ParserPolicy(ParserPolicy.DEFAULT_MAX_PARSE_DURATION,
                    BaseLinkExtractor.ALL_LINK_TAGS,
                    BaseLinkExtractor.ALL_LINK_ATTRIBUTE_TYPES);
    ParsedDatum parsed = new SimpleParser(allLinksPolicy).parse(fetched);

    // Verify every extracted link; only links[1] and links[2] are checked for anchor text.
    Outlink[] links = parsed.getOutlinks();
    Assert.assertEquals(7, links.length);

    Assert.assertEquals("http://newdomain.com/favicon.ico", links[0].getToUrl());

    Assert.assertEquals("http://newdomain.com/link1", links[1].getToUrl());
    Assert.assertEquals("link1", links[1].getAnchor());

    Assert.assertEquals("http://domain.com/link2", links[2].getToUrl());
    Assert.assertEquals("link2", links[2].getAnchor());

    Assert.assertEquals("http://newdomain.com/giant-prawn.jpg", links[3].getToUrl());
    Assert.assertEquals("http://en.wikipedia.org/wiki/Australia's_Big_Things",
                    links[4].getToUrl());
    Assert.assertEquals("http://newdomain.com/giant-dog.jpg", links[5].getToUrl());
    Assert.assertEquals("http://www.brucelawson.co.uk/index.php/2005/stupid-stock-photography/",
                    links[6].getToUrl());
}
/**
 * Parses all-link-types.html with a policy restricted to the {@code a}, {@code img}
 * and {@code link} tags and the {@code href} and {@code src} attributes, and expects
 * four outlinks.
 */
@Test
public void testSomeLinkTypes() throws Exception {
    // Read in test data from test/resources
    String html = readFromFile("parser-files/all-link-types.html");

    // Create FetchedDatum using data
    String url = "http://domain.com/all-link-types.html";
    String contentType = "text/html; charset=utf-8";
    HttpHeaders headers = new HttpHeaders();
    headers.add(HttpHeaderNames.CONTENT_TYPE, contentType);
    ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
    FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);

    // Plain HashSets instead of double-brace initialization: the latter creates
    // anonymous HashSet subclasses that keep a reference to this test instance.
    Set<String> linkTags = new HashSet<String>();
    linkTags.add("a");
    linkTags.add("img");
    linkTags.add("link");

    Set<String> linkAttributeTypes = new HashSet<String>();
    linkAttributeTypes.add("href");
    linkAttributeTypes.add("src");

    // Call parser.parse
    ParserPolicy policy = new ParserPolicy(ParserPolicy.DEFAULT_MAX_PARSE_DURATION,
                    linkTags,
                    linkAttributeTypes);
    SimpleParser parser = new SimpleParser(policy);
    ParsedDatum parsedDatum = parser.parse(fetchedDatum);

    // Verify that only links matching the configured tags/attributes are returned.
    Outlink[] outlinks = parsedDatum.getOutlinks();
    Assert.assertEquals(4, outlinks.length);
    Assert.assertEquals("http://newdomain.com/favicon.ico", outlinks[0].getToUrl());
    Assert.assertEquals("http://newdomain.com/link1", outlinks[1].getToUrl());
    Assert.assertEquals("link1", outlinks[1].getAnchor());
    Assert.assertEquals("http://domain.com/link2", outlinks[2].getToUrl());
    Assert.assertEquals("link2", outlinks[2].getAnchor());
    Assert.assertEquals("http://newdomain.com/giant-prawn.jpg", outlinks[3].getToUrl());
}
/**
 * Parses simple-content.html with a default SimpleParser and checks the extracted
 * title and body text.
 */
@Test
public void testContentExtraction() throws Exception {
    // Build the fetch result from the fixture in test/resources.
    String pageUrl = "http://domain.com/simple-content.html";
    String mimeType = "text/html; charset=utf-8";
    String pageHtml = readFromFile("parser-files/simple-content.html");

    HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add(HttpHeaderNames.CONTENT_TYPE, mimeType);
    ContentBytes body = new ContentBytes(pageHtml.getBytes("utf-8"));
    long fetchTime = System.currentTimeMillis();
    FetchedDatum fetched = new FetchedDatum(pageUrl, pageUrl, fetchTime, responseHeaders, body, mimeType, 0);

    // Run the parser with its default configuration.
    ParsedDatum parsed = new SimpleParser().parse(fetched);

    // Title is compared exactly; body text is compared term by term.
    Assert.assertEquals("Simple", parsed.getTitle());
    compareTermsInStrings("Simple Content", parsed.getParsedText());
}
/**
 * Parses the simple-page.html fixture and compares the extracted text, term by term,
 * with the expected text stored in simple-page.txt; also checks the page title.
 */
@Test
public void testHtmlParsing() throws Exception {
    URL pageLocation = SimpleParserTest.class.getResource("/simple-page.html");
    BaseParser parser = new SimpleParser();
    FetchedDatum fetched = makeFetchedDatum(pageLocation);
    ParsedDatum parsed = parser.parse(fetched);

    Assert.assertNotNull(parsed.getParsedText());

    // TODO - add back in title text to simple-page, when we generate this
    URL expectedLocation = SimpleParserTest.class.getResource("/" + "simple-page.txt");
    File expectedFile = new File(expectedLocation.getFile());
    String expectedText = FileUtils.readFileToString(expectedFile, "utf-8");

    // Trim off leading returns so split() doesn't give us an empty term
    // TODO - use our own split that skips leading/trailing separators
    String actualText = parsed.getParsedText().replaceFirst("^[\\n]+", "");
    compareTermsInStrings(expectedText, actualText);

    // TODO reenable when Tika bug is fixed re not emitting <img> links.
    // Outlink[] outlinks = parsed.getOutlinks();
    // Assert.assertEquals(10, outlinks.length);

    Assert.assertEquals("TransPac Software", parsed.getTitle());
}
/**
 * Plugs a stub content extractor (always returning "Custom") and a stub link extractor
 * (always returning no links) into SimpleParser, then checks that the parsed text comes
 * from the stub while the title is still "Simple".
 */
@SuppressWarnings("serial")
@Test
public void testCustomContentExtractor() throws Exception {
    // Build the fetch result from the fixture in test/resources.
    String pageUrl = "http://domain.com/simple-content.html";
    String mimeType = "text/html; charset=utf-8";
    String pageHtml = readFromFile("parser-files/simple-content.html");

    HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add(HttpHeaderNames.CONTENT_TYPE, mimeType);
    ContentBytes body = new ContentBytes(pageHtml.getBytes("utf-8"));
    long fetchTime = System.currentTimeMillis();
    FetchedDatum fetched = new FetchedDatum(pageUrl, pageUrl, fetchTime, responseHeaders, body, mimeType, 0);

    // Stub that ignores the document and reports fixed text.
    BaseContentExtractor fixedContent = new BaseContentExtractor() {
        @Override
        public String getContent() {
            return "Custom";
        }
    };

    // Stub that never reports any links.
    BaseLinkExtractor noLinks = new BaseLinkExtractor() {
        @Override
        public Outlink[] getLinks() {
            return new Outlink[0];
        }
    };

    SimpleParser parser = new SimpleParser(fixedContent, noLinks, new ParserPolicy());
    ParsedDatum parsed = parser.parse(fetched);

    // Title is still "Simple"; the text is whatever the stub returned.
    Assert.assertEquals("Simple", parsed.getTitle());
    compareTermsInStrings("Custom", parsed.getParsedText());
}
/**
 * Parses the meta-nofollow.html fixture and expects no outlinks at all.
 */
@Test
public void testLinkExtractorWithMetaTags() throws Exception {
    // Build the fetch result from the fixture in test/resources.
    String pageUrl = "http://domain.com/meta-nofollow.html";
    String mimeType = "text/html; charset=utf-8";
    String pageHtml = readFromFile("parser-files/meta-nofollow.html");

    HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add(HttpHeaderNames.CONTENT_TYPE, mimeType);
    ContentBytes body = new ContentBytes(pageHtml.getBytes("utf-8"));
    long fetchTime = System.currentTimeMillis();
    FetchedDatum fetched = new FetchedDatum(pageUrl, pageUrl, fetchTime, responseHeaders, body, mimeType, 0);

    // NOTE(review): the single int argument is presumably the max parse duration - confirm.
    ParserPolicy policy = new ParserPolicy(Integer.MAX_VALUE);
    ParsedDatum parsed = new SimpleParser(policy).parse(fetched);

    // Verify we got no URLs
    Assert.assertEquals(0, parsed.getOutlinks().length);
}
/**
 * Sends a Content-Language: en response header along with simple-content.html and
 * expects the parsed datum to report "en" as its language.
 */
@Test
public void testLanguageDetectionHttpHeader() throws Exception {
    // Build the fetch result from the fixture in test/resources.
    String pageUrl = "http://domain.com/simple-content.html";
    String mimeType = "text/html; charset=utf-8";
    String pageHtml = readFromFile("parser-files/simple-content.html");

    HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add(HttpHeaderNames.CONTENT_TYPE, mimeType);
    responseHeaders.add(HttpHeaderNames.CONTENT_LANGUAGE, "en");
    ContentBytes body = new ContentBytes(pageHtml.getBytes("utf-8"));
    long fetchTime = System.currentTimeMillis();
    FetchedDatum fetched = new FetchedDatum(pageUrl, pageUrl, fetchTime, responseHeaders, body, mimeType, 0);

    // Run the parser with its default configuration.
    ParsedDatum parsed = new SimpleParser().parse(fetched);

    // Content first, then the detected language.
    Assert.assertEquals("Simple", parsed.getTitle());
    compareTermsInStrings("Simple Content", parsed.getParsedText());
    Assert.assertEquals("en", parsed.getLanguage());
}
/**
 * Parses lang-dc.html while the response header says Content-Language: en, and expects
 * the reported language to be "ja" instead.
 */
@Test
public void testLanguageDetectionDublinCore() throws Exception {
    // Build the fetch result from the fixture in test/resources.
    String pageUrl = "http://domain.com/lang-dc.html";
    String mimeType = "text/html; charset=utf-8";
    String pageHtml = readFromFile("parser-files/lang-dc.html");

    HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add(HttpHeaderNames.CONTENT_TYPE, mimeType);
    responseHeaders.add(HttpHeaderNames.CONTENT_LANGUAGE, "en");
    ContentBytes body = new ContentBytes(pageHtml.getBytes("utf-8"));
    long fetchTime = System.currentTimeMillis();
    FetchedDatum fetched = new FetchedDatum(pageUrl, pageUrl, fetchTime, responseHeaders, body, mimeType, 0);

    // Run the parser with its default configuration.
    ParsedDatum parsed = new SimpleParser().parse(fetched);

    Assert.assertEquals("DublinCore Language Example", parsed.getTitle());
    compareTermsInStrings("DublinCore Language Example Content", parsed.getParsedText());

    // The header said "en"; presumably "ja" comes from Dublin Core metadata inside the
    // fixture (not visible from this file).
    Assert.assertEquals("ja", parsed.getLanguage());
}
/**
 * Parses lang-http-equiv.html while the response header says Content-Language: en, and
 * expects the reported language to be "ja" instead.
 */
@Test
public void testLanguageDetectionHttpEquiv() throws Exception {
    // Build the fetch result from the fixture in test/resources.
    // NOTE(review): the URL names lang-dc.html although the fixture is
    // lang-http-equiv.html - looks like a copy/paste leftover; confirm before changing.
    String pageUrl = "http://domain.com/lang-dc.html";
    String mimeType = "text/html; charset=utf-8";
    String pageHtml = readFromFile("parser-files/lang-http-equiv.html");

    HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add(HttpHeaderNames.CONTENT_TYPE, mimeType);
    responseHeaders.add(HttpHeaderNames.CONTENT_LANGUAGE, "en");
    ContentBytes body = new ContentBytes(pageHtml.getBytes("utf-8"));
    long fetchTime = System.currentTimeMillis();
    FetchedDatum fetched = new FetchedDatum(pageUrl, pageUrl, fetchTime, responseHeaders, body, mimeType, 0);

    // Run the parser with its default configuration.
    ParsedDatum parsed = new SimpleParser().parse(fetched);

    Assert.assertEquals("SimpleHttpEquiv", parsed.getTitle());
    compareTermsInStrings("SimpleHttpEquiv Content", parsed.getParsedText());

    // The header said "en"; presumably "ja" comes from an http-equiv meta tag inside
    // the fixture (not visible from this file).
    Assert.assertEquals("ja", parsed.getLanguage());
}
/**
 * Feeds an inline page containing a single {@code <object data="...">} element to a
 * parser configured with all link tags/attributes, and expects that one URL back as
 * the only outlink.
 */
@Test
public void testExtractingObjectTag() throws Exception {
    final String html = "<html><head><title>Title</title></head>" +
                    "<body><object data=\"http://domain.com/song.mid\" /></body></html>";

    // Wrap the inline markup in a FetchedDatum.
    String pageUrl = "http://domain.com/music.html";
    String mimeType = "text/html; charset=utf-8";
    HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add(HttpHeaderNames.CONTENT_TYPE, mimeType);
    ContentBytes body = new ContentBytes(html.getBytes("utf-8"));
    long fetchTime = System.currentTimeMillis();
    FetchedDatum fetched = new FetchedDatum(pageUrl, pageUrl, fetchTime, responseHeaders, body, mimeType, 0);

    // No parse time limit; every link tag and link attribute type enabled.
    ParserPolicy allLinksPolicy = new ParserPolicy(ParserPolicy.NO_MAX_PARSE_DURATION,
                    BaseLinkExtractor.ALL_LINK_TAGS,
                    BaseLinkExtractor.ALL_LINK_ATTRIBUTE_TYPES);
    SimpleContentExtractor contentExtractor = new SimpleContentExtractor();
    SimpleLinkExtractor linkExtractor = new SimpleLinkExtractor();
    SimpleParser parser = new SimpleParser(contentExtractor, linkExtractor, allLinksPolicy, true);
    ParsedDatum parsed = parser.parse(fetched);

    // The object's data URL must be the one and only outlink.
    Outlink[] links = parsed.getOutlinks();
    Assert.assertEquals(1, links.length);
    Assert.assertEquals("http://domain.com/song.mid", links[0].getToUrl());
}
/**
 * Parses an inline page with {@code new SimpleParser(new ParserPolicy(), true)}, then
 * re-parses the resulting text as markup with TagSoup/dom4j and checks that the
 * paragraph element and its text are still present.
 */
@Test
public void testHtmlWithTags() throws Exception {
    final String htmlText = "<html><head><title>Title</title></head>" +
                    "<body><p>this is a test</p></body></html>";

    // Wrap the inline markup in a FetchedDatum.
    String pageUrl = "http://domain.com/page.html";
    String mimeType = "text/html; charset=utf-8";
    HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add(HttpHeaderNames.CONTENT_TYPE, mimeType);
    ContentBytes body = new ContentBytes(htmlText.getBytes("utf-8"));
    long fetchTime = System.currentTimeMillis();
    FetchedDatum fetched = new FetchedDatum(pageUrl, pageUrl, fetchTime, responseHeaders, body, mimeType, 0);

    // NOTE(review): the boolean flag presumably asks the parser to keep markup in the
    // parsed text - confirm against SimpleParser.
    SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
    ParsedDatum parsed = parser.parse(fetched);

    // Now take the resulting HTML, process it using Dom4J
    SAXReader saxReader = new SAXReader(new Parser());
    saxReader.setEncoding("UTF-8");
    String markup = parsed.getParsedText();
    // NOTE(review): StringInputStream comes from HSQLDB, an unrelated library - confirm
    // its byte encoding suits the reader, or consider feeding a java.io.StringReader.
    Document dom = saxReader.read(new StringInputStream(markup));

    // We have to do helicopter stunts since HTML has a global namespace on it, set
    // at the <html> element level.
    Map<String, String> prefixToNamespace = new HashMap<String, String>();
    prefixToNamespace.put("xhtml", "http://www.w3.org/1999/xhtml");
    XPath paragraphPath = DocumentHelper.createXPath("/xhtml:html/xhtml:body/xhtml:p");
    paragraphPath.setNamespaceURIs(prefixToNamespace);

    Node paragraph = paragraphPath.selectSingleNode(dom);
    Assert.assertNotNull(paragraph);
    Assert.assertEquals("this is a test", paragraph.getText());
}
/**
 * Loads a classpath resource and returns its content as a string.
 *
 * @param filePath resource path relative to the classpath root, without a leading slash
 * @return the resource content, decoded as UTF-8
 * @throws IOException if the resource does not exist or cannot be read
 */
private static String readFromFile(String filePath) throws IOException {
    InputStream is = SimpleParserTest.class.getResourceAsStream("/" + filePath);
    if (is == null) {
        // getResourceAsStream signals a missing resource with null; fail with a clear message.
        throw new IOException("Test resource not found on classpath: /" + filePath);
    }

    try {
        // Decode explicitly as UTF-8: callers re-encode the text with getBytes("utf-8"),
        // so relying on the platform default charset could corrupt non-ASCII content.
        return IOUtils.toString(is, "UTF-8");
    } finally {
        IOUtils.closeQuietly(is);
    }
}
/**
 * Builds a FetchedDatum whose content is the raw bytes of a local file.
 *
 * @param path URL of a file on the local file system; its external form is used for
 *             both URL arguments of the datum
 * @return a datum with empty headers and a "text/html" content type
 * @throws IOException if the file cannot be opened or fully read
 */
private FetchedDatum makeFetchedDatum(URL path) throws IOException {
    File file = new File(path.getFile());
    byte[] bytes = new byte[(int) file.length()];

    DataInputStream in = new DataInputStream(new FileInputStream(file));
    try {
        in.readFully(bytes);
    } finally {
        // Release the file handle even when the read fails.
        IOUtils.closeQuietly(in);
    }

    String url = path.toExternalForm();
    return new FetchedDatum(url, url, System.currentTimeMillis(), new HttpHeaders(), new ContentBytes(bytes), "text/html", 0);
}
/**
 * Asserts that two strings contain the same whitespace-separated terms in the same
 * order. Leading and trailing whitespace and the amount of whitespace between terms
 * are ignored.
 *
 * @param expected text holding the expected terms
 * @param actual text produced by the code under test
 */
private void compareTermsInStrings(String expected, String actual) {
    String[] expectedTerms = splitIntoTerms(expected);
    String[] actualTerms = splitIntoTerms(actual);

    // Compare the common prefix first so a mismatch reports the offending index,
    // then make sure neither side has extra terms.
    int compLength = Math.min(expectedTerms.length, actualTerms.length);
    for (int i = 0; i < compLength; i++) {
        Assert.assertEquals("Term at index " + i, expectedTerms[i], actualTerms[i]);
    }

    Assert.assertEquals(expectedTerms.length, actualTerms.length);
}

/**
 * Splits text on runs of spaces, tabs, carriage returns and newlines, skipping leading
 * separators so that no empty first term is produced.
 */
private static String[] splitIntoTerms(String text) {
    // String.split() drops trailing empty strings but keeps a leading one, so strip
    // leading separators up front.
    String withoutLeadingSeparators = text.replaceFirst("^[ \\n\\r\\t]+", "");
    if (withoutLeadingSeparators.length() == 0) {
        return new String[0];
    }

    return withoutLeadingSeparators.split("[ \\n\\r\\t]+");
}
}