/**
* Copyright 2007-2011 Zuse Institute Berlin
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.zib.scalaris.examples.wikipedia.data.xml;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.Calendar;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.zip.GZIPInputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;
import de.zib.scalaris.examples.wikipedia.data.Revision;
import de.zib.scalaris.examples.wikipedia.data.xml.WikiDumpHandler.ReportAtShutDown;
/**
 * Provides abilities to read an xml wiki dump file and write Wiki pages to
 * Scalaris.
 *
 * @author Nico Kruber, kruber@zib.de
 */
public class Main {
    /**
     * Default blacklist - pages with these names are not imported
     */
    public final static Set<String> blacklist = new HashSet<String>();

    /**
     * The main function of the application. Some articles are blacklisted and
     * will not be processed (see implementation for a list of them).
     *
     * Expected arguments: the xml dump file followed by one of the commands
     * <tt>filter</tt>, <tt>import</tt> or <tt>prepare</tt> and its
     * command-specific options (see {@link #doFilter(String, String[])} and
     * {@link #doImport(String, String[], boolean)}).
     *
     * @param args
     *            first argument should be the xml file to convert.
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            // previously this crashed with an ArrayIndexOutOfBoundsException
            System.err.println("usage: <xml-dump-file> [filter|import|prepare] [options...]");
            System.exit(-1);
        }
        try {
            String filename = args[0];
            if (args.length > 1) {
                // pass only the command-specific options on to the commands
                String[] cmdArgs = Arrays.copyOfRange(args, 2, args.length);
                if (args[1].equals("filter")) {
                    doFilter(filename, cmdArgs);
                } else if (args[1].equals("import")) {
                    doImport(filename, cmdArgs, false);
                } else if (args[1].equals("prepare")) {
                    doImport(filename, cmdArgs, true);
                }
                // note: unknown commands are (deliberately) silently ignored
            }
        } catch (SAXException e) {
            System.err.println(e.getMessage());
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Imports all pages in the Wikipedia XML dump from the given file to Scalaris.
     *
     * Optional positional arguments in <tt>args</tt> (each may be empty to use
     * its default): maximum number of revisions per page, minimum revision
     * time (ISO8601), maximum revision time (ISO8601), whitelist file name
     * and - only if <tt>prepare</tt> is set - the (mandatory) name of the
     * SQLite DB file to pre-process the K/V pairs into.
     *
     * @param filename
     *            the name of the xml wiki dump file (or a prepared
     *            <tt>*.db</tt> file when importing without prepare)
     * @param args
     *            command-specific arguments (see above)
     * @param prepare
     *            whether to only prepare the import, i.e. pre-process the
     *            dump into a SQLite DB instead of writing to Scalaris
     *
     * @throws RuntimeException
     * @throws IOException
     * @throws SAXException
     * @throws FileNotFoundException
     */
    private static void doImport(String filename, String[] args, boolean prepare) throws RuntimeException, IOException,
            SAXException, FileNotFoundException {
        int i = 0;
        // maximum number of revisions per page (-1: no limit)
        int maxRevisions = -1;
        if (args.length > i) {
            try {
                maxRevisions = Integer.parseInt(args[i]);
            } catch (NumberFormatException e) {
                System.err.println("no number: " + args[i]);
                System.exit(-1);
            }
        }
        ++i;

        // a timestamp in ISO8601 format (minimum revision time, null: none)
        Calendar minTime = null;
        if (args.length > i && !args[i].isEmpty()) {
            try {
                minTime = Revision.stringToCalendar(args[i]);
            } catch (IllegalArgumentException e) {
                System.err.println("no date in ISO8601: " + args[i]);
                System.exit(-1);
            }
        }
        ++i;

        // a timestamp in ISO8601 format (maximum revision time, null: none)
        Calendar maxTime = null;
        if (args.length > i && !args[i].isEmpty()) {
            try {
                maxTime = Revision.stringToCalendar(args[i]);
            } catch (IllegalArgumentException e) {
                System.err.println("no date in ISO8601: " + args[i]);
                System.exit(-1);
            }
        }
        ++i;

        // optional whitelist of page titles to import (null: import all)
        Set<String> whitelist = null;
        String whitelistFile = "";
        if (args.length > i && !args[i].isEmpty()) {
            whitelistFile = args[i];
            whitelist = new HashSet<String>();
            readLinesInto(whitelistFile, whitelist);
            if (whitelist.isEmpty()) {
                // an empty whitelist means "no restriction"
                whitelist = null;
            }
        }
        ++i;

        if (prepare) {
            // only prepare the import to Scalaris, i.e. pre-process K/V pairs?
            String dbFileName = "";
            if (args.length > i && !args[i].isEmpty()) {
                dbFileName = args[i];
            } else {
                System.err.println("need a DB file name for prepare; arguments given: " + Arrays.toString(args));
                System.exit(-1);
            }
            ++i;
            WikiDumpHandler.println(System.out, "wiki prepare file " + dbFileName);
            WikiDumpHandler.println(System.out, " wiki dump     : " + filename);
            WikiDumpHandler.println(System.out, " white list    : " + whitelistFile);
            WikiDumpHandler.println(System.out, " max revisions : " + maxRevisions);
            WikiDumpHandler.println(System.out, " min time      : " + minTime);
            WikiDumpHandler.println(System.out, " max time      : " + maxTime);
            WikiDumpHandler handler = new WikiDumpPrepareSQLiteForScalarisHandler(
                    blacklist, whitelist, maxRevisions, minTime, maxTime,
                    dbFileName);
            InputSource file = getFileReader(filename);
            runXmlHandler(handler, file);
        } else {
            if (filename.endsWith(".db")) {
                // import of an already prepared SQLite DB (no XML parsing needed)
                WikiDumpHandler.println(System.out, "wiki import from " + filename);
                WikiDumpPreparedSQLiteToScalaris handler =
                        new WikiDumpPreparedSQLiteToScalaris(filename);
                handler.setUp();
                // report statistics even if the import is aborted, e.g. by Ctrl-C
                WikiDumpPreparedSQLiteToScalaris.ReportAtShutDown shutdownHook = handler.new ReportAtShutDown();
                Runtime.getRuntime().addShutdownHook(shutdownHook);
                handler.writeToScalaris();
                handler.tearDown();
                shutdownHook.run();
                Runtime.getRuntime().removeShutdownHook(shutdownHook);
            } else {
                // direct import from the XML dump
                WikiDumpHandler.println(System.out, "wiki import from " + filename);
                WikiDumpHandler.println(System.out, " white list    : " + whitelistFile);
                WikiDumpHandler.println(System.out, " max revisions : " + maxRevisions);
                WikiDumpHandler.println(System.out, " min time      : " + minTime);
                WikiDumpHandler.println(System.out, " max time      : " + maxTime);
                WikiDumpHandler handler = new WikiDumpToScalarisHandler(
                        blacklist, whitelist, maxRevisions, minTime, maxTime);
                InputSource file = getFileReader(filename);
                runXmlHandler(handler, file);
            }
        }
    }

    /**
     * Reads all non-empty lines of the given (page list) file into the given
     * set, closing the file afterwards.
     *
     * NOTE(review): uses {@link FileReader}, i.e. the platform default
     * charset, while the dump itself is read as UTF-8 - confirm whether page
     * list files are expected to be UTF-8 as well.
     *
     * @param fileName
     *            the name of the file to read
     * @param target
     *            the set to add the lines to
     *
     * @throws FileNotFoundException
     *             if the file does not exist
     * @throws IOException
     *             if reading the file fails
     */
    private static void readLinesInto(String fileName, Set<String> target)
            throws FileNotFoundException, IOException {
        BufferedReader br = new BufferedReader(new FileReader(fileName));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                if (!line.isEmpty()) {
                    target.add(line);
                }
            }
        } finally {
            // previously the reader leaked - make sure it is always closed
            br.close();
        }
    }

    /**
     * Runs the given handler on the given XML input, installing a temporary
     * shutdown hook so that statistics are reported even if the JVM is
     * terminated mid-parse.
     *
     * @param handler
     *            the (set-up and torn-down) SAX content handler to use
     * @param file
     *            the XML input to parse
     *
     * @throws SAXException
     * @throws IOException
     */
    static void runXmlHandler(WikiDumpHandler handler, InputSource file)
            throws SAXException, IOException {
        XMLReader reader = XMLReaderFactory.createXMLReader();
        handler.setUp();
        ReportAtShutDown shutdownHook = handler.new ReportAtShutDown();
        Runtime.getRuntime().addShutdownHook(shutdownHook);
        reader.setContentHandler(handler);
        reader.parse(file);
        handler.tearDown();
        // normal completion: report once and drop the hook again
        shutdownHook.run();
        Runtime.getRuntime().removeShutdownHook(shutdownHook);
    }

    /**
     * Filters all pages in the Wikipedia XML dump from the given file and
     * creates a list of page names belonging to certain categories.
     *
     * Optional positional arguments in <tt>args</tt>: recursion level,
     * maximum revision time (ISO8601), the (mandatory) output page list file
     * name, an optional file with explicitly allowed pages and finally any
     * number of root categories.
     *
     * @param filename
     *            the name of the xml wiki dump file
     * @param args
     *            command-specific arguments (see above)
     *
     * @throws RuntimeException
     * @throws IOException
     * @throws SAXException
     * @throws FileNotFoundException
     */
    private static void doFilter(String filename, String[] args) throws RuntimeException, IOException,
            SAXException, FileNotFoundException {
        int i = 0;
        // recursion level for following links from pages in the categories
        int recursionLvl = 1;
        if (args.length > i) {
            try {
                recursionLvl = Integer.parseInt(args[i]);
            } catch (NumberFormatException e) {
                System.err.println("no number: " + args[i]);
                System.exit(-1);
            }
        }
        ++i;

        // a timestamp in ISO8601 format (maximum revision time, null: none)
        Calendar maxTime = null;
        if (args.length > i && !args[i].isEmpty()) {
            try {
                maxTime = Revision.stringToCalendar(args[i]);
            } catch (IllegalArgumentException e) {
                System.err.println("no date in ISO8601: " + args[i]);
                System.exit(-1);
            }
        }
        ++i;

        // output file for the resulting page list (mandatory)
        String pageListFileName = "";
        if (args.length > i && !args[i].isEmpty()) {
            pageListFileName = args[i];
        } else {
            System.err.println("need a pagelist file name for filter; arguments given: " + Arrays.toString(args));
            System.exit(-1);
        }
        ++i;

        // pages that are always allowed, independent of any category
        Set<String> allowedPages = new HashSet<String>();
        allowedPages.add("Main Page");
        allowedPages.add("MediaWiki:Noarticletext");
        String allowedPagesFileName = "";
        if (args.length > i && !args[i].isEmpty()) {
            allowedPagesFileName = args[i];
            readLinesInto(allowedPagesFileName, allowedPages);
        }
        ++i;

        // all remaining arguments are (non-empty) root categories
        LinkedList<String> rootCategories = new LinkedList<String>();
        if (args.length > i) {
            for (String rCat : Arrays.asList(args).subList(i, args.length)) {
                if (!rCat.isEmpty()) {
                    rootCategories.add(rCat);
                }
            }
        }

        WikiDumpHandler.println(System.out, "filtering by categories " + rootCategories.toString() + " ...");
        WikiDumpHandler.println(System.out, " wiki dump     : " + filename);
        WikiDumpHandler.println(System.out, " max time      : " + maxTime);
        WikiDumpHandler.println(System.out, " allowed pages : " + allowedPagesFileName);
        WikiDumpHandler.println(System.out, " recursion lvl : " + recursionLvl);
        SortedSet<String> pages = getPageList(filename, maxTime, allowedPages,
                rootCategories, recursionLvl);

        // write the page list, one (de-normalised) title per line;
        // (replaces a pointless do {...} while(false) and closes the
        // writer even if println fails)
        PrintWriter out = new PrintWriter(new FileWriter(pageListFileName));
        try {
            for (String page : pages) {
                out.println(page);
            }
        } finally {
            out.close();
        }
    }

    /**
     * Extracts all allowed pages in the given root categories as well as those
     * pages explicitly mentioned in a list of allowed pages.
     *
     * Gets the category and template trees from a file, i.e.
     * <tt>filename + "-trees.db"</tt>, or if this does not exist, builds the
     * trees and stores them to this file.
     *
     * @param filename
     *            the name of the xml wiki dump file
     * @param maxTime
     *            the maximum time of a revision to use for category parsing
     * @param allowedPages
     *            all allowed pages (un-normalised page titles)
     * @param rootCategories
     *            all allowed categories (un-normalised page titles)
     * @param recursionLvl
     *            recursion level to include pages linking to
     *
     * @return a set of (de-normalised) page titles
     *
     * @throws RuntimeException
     * @throws FileNotFoundException
     * @throws IOException
     * @throws SAXException
     */
    protected static SortedSet<String> getPageList(String filename, Calendar maxTime,
            Set<String> allowedPages, LinkedList<String> rootCategories,
            int recursionLvl) throws RuntimeException, FileNotFoundException,
            IOException, SAXException {
        Map<String, Set<String>> templateTree = new HashMap<String, Set<String>>();
        Map<String, Set<String>> includeTree = new HashMap<String, Set<String>>();
        Map<String, Set<String>> referenceTree = new HashMap<String, Set<String>>();
        File trees = new File(filename + "-trees.db");
        if (trees.exists()) {
            // read trees from tree file
            WikiDumpHandler.println(System.out, "reading category tree from " + trees.getAbsolutePath() + " ...");
            WikiDumpGetCategoryTreeHandler.readTrees(trees.getAbsolutePath(),
                    templateTree, includeTree, referenceTree);
        } else {
            // build trees from xml file
            // need to get all subcategories recursively, as they must be
            // included as well
            WikiDumpHandler.println(System.out, "building category tree from " + filename + " ...");
            WikiDumpGetCategoryTreeHandler handler = new WikiDumpGetCategoryTreeHandler(
                    blacklist, null, maxTime, trees.getPath());
            InputSource file = getFileReader(filename);
            runXmlHandler(handler, file);
            WikiDumpGetCategoryTreeHandler.readTrees(trees.getAbsolutePath(),
                    templateTree, includeTree, referenceTree);
        }
        WikiDumpHandler.println(System.out, "creating list of pages to import (recursion level: " + recursionLvl + ") ...");
        Set<String> allowedCats = new HashSet<String>(rootCategories);
        return WikiDumpGetCategoryTreeHandler.getPagesInCategories(
                trees.getAbsolutePath(), allowedCats, allowedPages, recursionLvl,
                templateTree, includeTree, referenceTree, System.out, false);
    }

    /**
     * Gets an appropriate file reader for the given file.
     *
     * Supports plain (<tt>*.xml</tt>), gzip (<tt>*.xml.gz</tt>) and bzip2
     * (<tt>*.xml.bz2</tt>) dumps; the content is always decoded as UTF-8.
     * Exits the whole JVM on an unsupported file extension.
     *
     * @param filename
     *            the name of the file
     *
     * @return a file reader
     *
     * @throws FileNotFoundException
     * @throws IOException
     */
    public static InputSource getFileReader(String filename) throws FileNotFoundException, IOException {
        InputStream is;
        if (filename.endsWith(".xml.gz")) {
            is = new GZIPInputStream(new FileInputStream(filename));
        } else if (filename.endsWith(".xml.bz2")) {
            is = new BZip2CompressorInputStream(new FileInputStream(filename));
        } else if (filename.endsWith(".xml")) {
            is = new FileInputStream(filename);
        } else {
            System.err.println("Unsupported file: " + filename + ". Supported: *.xml.gz, *.xml.bz2, *.xml");
            System.exit(-1);
            return null; // will never be reached but is necessary to keep javac happy
        }
        BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
        return new InputSource(br);
    }
}