/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.migrate;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator;
import org.apache.commons.lang.StringUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
* Utility class which takes an H1 order.xml and creates a similar
* H3 job directory, with as many simple settings converted over
* (as top-of-crawler-beans overrides) as possible at this time.
*
* (Future versions will handle more complicated H1 settings
* customizations, such as per-host overrides or choices of
* alternate implementing classes for Scope, Processors, etc.)
*
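* <p>Example invocation (the classpath shown is an assumption; adapt
* it to your H3 installation):</p>
* <pre>
* java -cp 'lib/*' org.archive.crawler.migrate.MigrateH1to3Tool \
*     /h1/jobs/myjob/order.xml /h3/jobs/myjob-migrated
* </pre>
*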
* @contributor siznax
* @contributor gojomo
*/
public class MigrateH1to3Tool {
protected Document sourceOrderXmlDom;
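/** shared parser instance; safe only because this tool is single-threaded */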
protected static DocumentBuilder DOCUMENT_BUILDER;
static {
try {
DOCUMENT_BUILDER = DocumentBuilderFactory.newInstance().newDocumentBuilder();
} catch (ParserConfigurationException e) {
// fail fast: a silently-null builder would only resurface later as an NPE
throw new RuntimeException(e);
}
}

public static void main(String[] args) throws Exception {
new MigrateH1to3Tool().instanceMain(args);
}

public void instanceMain(String[] args) throws Exception {
if(args.length != 2) {
printHelp();
return;
}
String sourceOrderXmlFileArg = args[0];
String destinationH3JobDirArg = args[1];
File sourceOrderXmlFile = new File(sourceOrderXmlFileArg);
if (!sourceOrderXmlFile.isFile()) {
System.err.println("ERROR sourceOrderXmlFileArg is not a file: " + sourceOrderXmlFileArg);
System.exit(1);
}
File destinationH3JobDir = new File(destinationH3JobDirArg);
org.archive.util.FileUtils.ensureWriteableDirectory(destinationH3JobDir);
System.out.println("H1 source: "+sourceOrderXmlFile.getAbsolutePath());
System.out.println("H3 destination: "+destinationH3JobDir.getAbsolutePath());
System.out.print("Migrating settings...");
InputStream inStream = getClass().getResourceAsStream(
"/org/archive/crawler/migrate/migrate-template-crawler-beans.cxml");
String template = IOUtils.toString(inStream, "UTF-8");
inStream.close();
Map<String,String> migrateH1toH3Map = getMigrateMap();
try {
sourceOrderXmlDom = DOCUMENT_BUILDER.parse(sourceOrderXmlFile);
} catch (SAXException e) {
System.err.println("ERROR: could not parse order.xml: "
+ e.getMessage() + "\n");
e.printStackTrace();
// a null DOM would NPE in flattenH1Order below; bail out now
System.exit(1);
}
Map<String,String> h1simpleSettings = flattenH1Order(sourceOrderXmlDom);
List<String> notApplicable = new ArrayList<String>();
List<String> needsAttention = new ArrayList<String>();
int migrated = 0;
StringBuilder sb = new StringBuilder();
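// Map values are H3 bean paths, with an optional leading character
// selecting special handling:
//   '$' -- no H3 equivalent; report the setting as not-applicable
//   '*' -- needs custom conversion (currently only the user-agent split)
//   '^' -- uppercase the H1 value to match H3's enum-style constants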
for(String key : h1simpleSettings.keySet()) {
String beanPath = migrateH1toH3Map.get(key);
String value = h1simpleSettings.get(key);
System.out.print(".");
if(beanPath==null) {
// no equivalence rule
needsAttention.add(key+" "+value);
continue;
}
if (beanPath.startsWith("$")) {
// rule indicates not-available/not-applicable
notApplicable.add(key+" "+value);
continue;
}
if (beanPath.startsWith("*")) {
// TODO: needs special handling
if(beanPath.equals("*metadata.userAgentTemplate")) {
splitH1userAgent(value,sb);
migrated += 2;
} else {
needsAttention.add(key+" "+value);
}
continue;
}
if (beanPath.startsWith("^")) {
// uppercase to new enum-style
value = value.toUpperCase();
beanPath = beanPath.substring(1);
}
sb
.append(beanPath)
.append("=")
.append(value)
.append("\n");
migrated++;
}
System.out.println();
System.out.println();
// patch all overrides derived from H1 into H3 template
String beansCxml = template.replace("###MIGRATE_OVERRIDES###", sb.toString());
File targetBeansXmlFile = new File(destinationH3JobDir,"crawler-beans.cxml");
FileUtils.writeStringToFile(targetBeansXmlFile, beansCxml, "UTF-8");
File sourceSeedsTxtFile = new File(sourceOrderXmlFile.getParentFile(), "seeds.txt");
File destinationSeedsTxtFile = new File(destinationH3JobDir, "seeds.txt");
if (!sourceSeedsTxtFile.isFile()) {
System.err.println("ERROR sourceSeedsTxtFile not found: " + sourceSeedsTxtFile);
System.exit(1);
}
FileUtils.copyFile(sourceSeedsTxtFile,destinationSeedsTxtFile);
System.out.println(notApplicable.size()+" settings skipped as not-applicable");
System.out.println("These are probably harmless, but if the following settings were");
System.out.println("important to your crawl process, investigate other options.");
listProblems(notApplicable);
System.out.println();
System.out.println(needsAttention.size()+" settings may need attention");
System.out.println("Please review your original crawl and the created H3 job, for each");
System.out.println("of the following, and manually update as needed.");
listProblems(needsAttention);
System.out.println();
System.out.println(migrated +" H1 settings successfully migrated to H3 configuration");
System.out.println();
System.out.println("Review your converted crawler-beans.cxml at:");
System.out.println(targetBeansXmlFile.getAbsolutePath());
}

protected void listProblems(List<String> problems) {
for(String problem : problems) {
System.out.println(" "+problem);
}
}

protected void printHelp() {
System.out.println(
"Usage: MigrateH1to3Tool ORDER_XML H3_JOB_DIR\n" +
"Takes two arguments: the path to a Heritrix 1.X order.xml and " +
"the path for a new Heritrix 3.X job directory. Generates a " +
"basic H3 job with as many of the H1 settings replicated as " +
"currently possible.");
}

protected void splitH1userAgent(String userAgent, StringBuilder sb) {
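// H1 user-agents conventionally embed the operator-contact URL after
// a '+' inside parentheses, e.g. (the UA string here is illustrative):
// "Mozilla/5.0 (compatible; heritrix/1.14 +http://example.com/crawl)"
// Pull that URL out, then swap in H3's substitution placeholder.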
String originalUrl = userAgent.replaceAll(
"^.*?\\+(http://[^)]*).*$",
"$1");
String newTemplate = userAgent.replace(originalUrl,"@OPERATOR_CONTACT_URL@");
// TODO: catch, change outdated version info?
sb
.append("metadata.operatorContactUrl=")
.append(originalUrl)
.append("\n")
.append("metadata.userAgentTemplate=")
.append(newTemplate)
.append("\n");
}

protected Map<String, String> getMigrateMap() throws IOException {
Map<String,String> map = new HashMap<String,String>();
InputStream inStream = getClass().getResourceAsStream(
"/org/archive/crawler/migrate/H1toH3.map");
LineIterator iter = IOUtils.lineIterator(inStream, "UTF-8");
while(iter.hasNext()) {
String[] fields = iter.nextLine().split("\\|");
if (fields.length < 2) {
// tolerate blank or malformed lines rather than failing outright
continue;
}
map.put(fields[1], fields[0]);
}
inStream.close();
return map;
}

/**
* Given a Document, return a Map of all non-blank simple text
* nodes, keyed by the pseudo-XPath to their parent element.
*
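* <p>For example, the text inside {@code <crawl-order><meta><name>}
* would be keyed as {@code //meta/name} once the crawl-order root is
* collapsed (element names here are illustrative).</p>
*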
* @param h1order Document to extract the Map from
* @return Map of pseudo-XPath-like keys to non-blank text content
* @throws XPathExpressionException
*/
public static Map<String,String> flattenH1Order(Document h1order) throws XPathExpressionException {
Map<String,String> flattened = new LinkedHashMap<String,String>();
XPathExpression xpath = XPathFactory.newInstance().newXPath().compile("//text()");
NodeList nodes = (NodeList) xpath.evaluate(h1order,XPathConstants.NODESET);
for(int i = 0; i< nodes.getLength(); i++) {
Node node = nodes.item(i);
if(StringUtils.isNotBlank(node.getTextContent())) {
String pseudoXPath = getPseudoXpath(node.getParentNode());
pseudoXPath = pseudoXPath.replaceFirst("/crawl-order", "/");
flattened.put(pseudoXPath, node.getTextContent());
}
}
return flattened;
}

/**
* Given a node, give back an XPath-like string that addresses it.
* (For our constrained order.xml files, it is a valid and unique
* XPath, but the simple approach used here might not generate
* unique XPaths on all XML.)
*
* @param node the node for which to build a pseudo-XPath
* @return String pseudo-XPath
*/
protected static String getPseudoXpath(Node node) {
String pseudoXpath = "";
Node currentNode = node;
while(currentNode.getParentNode()!=null) {
String thisSegment = currentNode.getNodeName();
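// NOTE: concatenating the attribute Node below relies on the DOM
// implementation's toString(); keys in H1toH3.map must match
// whatever form that produces.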
if(currentNode.getAttributes().getNamedItem("name")!=null) {
thisSegment =
"*[@"
+currentNode.getAttributes().getNamedItem("name")
+"]";
}
pseudoXpath = "/" + thisSegment + pseudoXpath;
currentNode = currentNode.getParentNode();
}
return pseudoXpath;
}
}