
/*
*  This file is part of the Heritrix web crawler (crawler.archive.org).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/

package org.archive.crawler.migrate;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator;
import org.apache.commons.lang.StringUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
* Utility class which takes an H1 order.xml and creates a similar
* H3 job directory, with as many simple settings converted over
* (as top-of-crawler-beans overrides) as possible at this time.
*
* (Future versions will handle more complicated H1 settings
* customizations, such as per-host overrides or choices of
* alternate implementing classes for Scope, Processors, etc.)
*
* @contributor siznax
* @contributor gojomo
*/
public class MigrateH1to3Tool {

    protected Document sourceOrderXmlDom;

    protected static DocumentBuilder DOCUMENT_BUILDER;

    static {
        try {
            DOCUMENT_BUILDER = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        } catch (ParserConfigurationException e) {
            // fail fast rather than leaving DOCUMENT_BUILDER null for a later NPE
            throw new RuntimeException(e);
        }
    }

    public static void main(String[] args) throws Exception {
        new MigrateH1to3Tool().instanceMain(args);
    }
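
    // Example invocation (hypothetical paths and classpath, for illustration only):
    //
    //   java -cp heritrix.jar org.archive.crawler.migrate.MigrateH1to3Tool \
    //       /h1/jobs/myjob/order.xml /h3/jobs/myjob-migrated
    //
    // The tool writes a crawler-beans.cxml into the destination directory and
    // copies seeds.txt from alongside the source order.xml.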

    public void instanceMain(String[] args) throws Exception {

        if(args.length != 2) {
            printHelp();
            return;
        }

        String sourceOrderXmlFileArg = args[0];
        String destinationH3JobDirArg = args[1];

        File sourceOrderXmlFile = new File(sourceOrderXmlFileArg);
        if (!sourceOrderXmlFile.isFile()) {
            System.err.println("ERROR sourceOrderXmlFileArg is not a file: " + sourceOrderXmlFileArg);
            System.exit(1);
        }
        File destinationH3JobDir = new File(destinationH3JobDirArg);

        org.archive.util.FileUtils.ensureWriteableDirectory(destinationH3JobDir);

        System.out.println("H1 source: "+sourceOrderXmlFile.getAbsolutePath());
        System.out.println("H3 destination: "+destinationH3JobDir.getAbsolutePath());
       
        System.out.print("Migrating settings...");
       
        InputStream inStream = getClass().getResourceAsStream(
                "/org/archive/crawler/migrate/migrate-template-crawler-beans.cxml");
        String template = IOUtils.toString(inStream);
        inStream.close();
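        // The template just read carries a ###MIGRATE_OVERRIDES### placeholder,
        // replaced below with the generated bean-path override lines.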

        Map<String,String> migrateH1toH3Map = getMigrateMap();

        try {
            sourceOrderXmlDom = DOCUMENT_BUILDER.parse(sourceOrderXmlFile);
        } catch (SAXException e) {
            System.err.println("ERROR caught exception parsing input file: "
                    + e.getMessage() + "\n");
            e.printStackTrace();
            // without a parsed DOM, flattenH1Order below would fail with an NPE
            System.exit(1);
        }

        Map<String,String> h1simpleSettings = flattenH1Order(sourceOrderXmlDom);

        List<String> notApplicable = new ArrayList<String>();
        List<String> needsAttention = new ArrayList<String>();
        int migrated = 0;
        StringBuilder sb = new StringBuilder();
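        // Map values use sigil prefixes, handled case by case below: "$" marks
        // an H1 setting with no H3 equivalent, "*" marks one needing special
        // handling, and "^" means the value is uppercased to match H3's
        // enum-style constants before being emitted as a bean-path override.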
        for(String key : h1simpleSettings.keySet()) {
            String beanPath = migrateH1toH3Map.get(key);
            String value = h1simpleSettings.get(key);
            System.out.print(".");
            if(beanPath==null) {
                // no equivalence rule
                needsAttention.add(key+" "+value);
                continue;
            }
            if (beanPath.startsWith("$")) {
                // rule indicates not-available/not-applicable
                notApplicable.add(key+" "+value);
                continue;
            }
            if (beanPath.startsWith("*")) {
                // TODO: needs special handling
                if(beanPath.equals("*metadata.userAgentTemplate")) {
                    splitH1userAgent(value,sb);
                    migrated += 2;
                } else {
                    needsAttention.add(key+" "+value);
                }
                continue;
            }
            if (beanPath.startsWith("^")) {
                // uppercase to new enum-style
                value = value.toUpperCase();
                beanPath = beanPath.substring(1);
            }
            sb.append(beanPath).append("=").append(value).append("\n");
            migrated++;
        }


        System.out.println();
        System.out.println();

        // patch all overrides derived from H1 into H3 template
        String beansCxml = template.replace("###MIGRATE_OVERRIDES###", sb.toString());

        File targetBeansXmlFile = new File(destinationH3JobDir,"crawler-beans.cxml");
        FileUtils.writeStringToFile(targetBeansXmlFile, beansCxml);

        File sourceSeedsTxtFile = new File(sourceOrderXmlFile.getParentFile(), "seeds.txt");
        File destinationSeedsTxtFile = new File(destinationH3JobDir, "seeds.txt");

        if (!sourceSeedsTxtFile.isFile()) {
            System.err.println("ERROR sourceSeedsTxtFile not found: " + sourceSeedsTxtFile);
            System.exit(1);
        }

        FileUtils.copyFile(sourceSeedsTxtFile,destinationSeedsTxtFile);

        System.out.println(notApplicable.size()+" settings skipped as not-applicable");
        System.out.println("These are probably harmless, but if the following settings were");
        System.out.println("important to your crawl process, investigate other options.");
        listProblems(notApplicable);
        System.out.println();
        System.out.println(needsAttention.size()+" settings may need attention");
        System.out.println("Please review your original crawl and the created H3 job, for each");
        System.out.println("of the following, and manually update as needed.");
        listProblems(needsAttention);
        System.out.println();
        System.out.println(migrated +" H1 settings successfully migrated to H3 configuration");
        System.out.println();
        System.out.println("Review your converted crawler-beans.cxml at:");
        System.out.println(targetBeansXmlFile.getAbsolutePath());

    }

    protected void listProblems(List<String> problems) {
        for(String problem : problems) {
            System.out.println(" "+problem);
        }
    }

    protected void printHelp() {
        System.out.println(
            "Usage: MigrateH1to3Tool ORDER_XML H3_JOB_DIR\n" +
            "Takes two arguments: the path to an existing Heritrix 1.X " +
            "order.xml and the path for a new Heritrix 3.X job directory. " +
            "Generates a basic H3 job with as many of the H1 settings " +
            "replicated as currently possible.");
    }
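
    /**
     * Split the H1 user-agent into the two H3 settings it combines:
     * the operator contact URL and a user-agent template referencing it.
     * Illustrative example (hypothetical values): given
     * "Mozilla/5.0 (compatible; heritrix/1.14 +http://example.com/contact)"
     * this appends to sb:
     *   metadata.operatorContactUrl=http://example.com/contact
     *   metadata.userAgentTemplate=Mozilla/5.0 (compatible; heritrix/1.14 +@OPERATOR_CONTACT_URL@)
     */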

    protected void splitH1userAgent(String userAgent, StringBuilder sb) {
        String originalUrl = userAgent.replaceAll(
                "^.*?\\+(http://[^)]*).*$",
                "$1");
        String newTemplate = userAgent.replace(originalUrl,"@OPERATOR_CONTACT_URL@");
        // TODO: catch, change outdated version info?
        sb
         .append("metadata.operatorContactUrl=")
         .append(originalUrl)
         .append("\n")
         .append("metadata.userAgentTemplate=")
         .append(newTemplate)
         .append("\n");
    }
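
    /**
     * Load the H1-to-H3 settings map from the bundled H1toH3.map resource.
     * Each line is pipe-delimited; judging from the put() call below, the
     * second field (an H1 pseudo-XPath) is the key and the first field
     * (an H3 bean path, possibly sigil-prefixed) is the value.
     */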

    protected Map<String, String> getMigrateMap() throws IOException {
        Map<String,String> map = new HashMap<String,String>();
        InputStream inStream = getClass().getResourceAsStream(
                "/org/archive/crawler/migrate/H1toH3.map");
        LineIterator iter = IOUtils.lineIterator(inStream, "UTF-8");
        while(iter.hasNext()) {
            String[] fields = iter.nextLine().split("\\|");
            map.put(fields[1], fields[0]);
        }
        inStream.close();
        return map;
    }
   
    /**
     * Given a Document, return a Map of all non-blank simple text
     * nodes, keyed by the pseudo-XPath to their parent element.
     *
     * @param h1order Document to extract the Map from
     * @return Map of XPath-like String keys to non-blank text content
     * @throws XPathExpressionException
     */
    public static Map<String,String> flattenH1Order(Document h1order) throws XPathExpressionException {
        Map<String,String> flattened = new LinkedHashMap<String,String>();
        XPathExpression xpath = XPathFactory.newInstance().newXPath().compile("//text()");
        NodeList nodes = (NodeList) xpath.evaluate(h1order,XPathConstants.NODESET);
        for(int i = 0; i< nodes.getLength(); i++) {
            Node node = nodes.item(i);
            if(StringUtils.isNotBlank(node.getTextContent())) {
                String pseudoXPath = getPseudoXpath(node.getParentNode());
                pseudoXPath = pseudoXPath.replaceFirst("/crawl-order", "/");
               
//                System.out.println(
//                        pseudoXPath
//                        +" "+node.getTextContent());
               
                flattened.put(pseudoXPath, node.getTextContent());
            }
        }
//        System.err.println(flattened.size());
//        System.err.println(flattened);
       
        return flattened;
    }
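
    // Illustrative (hypothetical) flattened entry: a text node "0" inside an
    // element carrying name="max-bytes-download" yields a key resembling
    // //controller/*[@name="max-bytes-download"] mapped to "0"; the exact
    // attribute rendering depends on the DOM implementation's Node.toString(),
    // as used in getPseudoXpath below.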

    /**
     * Given a node, return an XPath-like string that addresses it.
     * (For our constrained order.xml files, it is a valid and unique
     * XPath, but the simple approach used here might not generate
     * unique XPaths on all XML.)
     *
     * @param node node to get a pseudo-XPath for
     * @return String pseudo-XPath
     */
    protected static String getPseudoXpath(Node node) {
        String pseudoXpath = "";
        Node currentNode = node;
        while(currentNode.getParentNode()!=null) {
            String thisSegment = currentNode.getNodeName();
            if(currentNode.getAttributes().getNamedItem("name")!=null) {
                thisSegment =
                    "*[@"
                    +currentNode.getAttributes().getNamedItem("name")
                    +"]";
            }
            pseudoXpath = "/" + thisSegment + pseudoXpath;
            currentNode = currentNode.getParentNode();
        }
        return pseudoXpath;
    }
}