Package de.zib.scalaris.examples.wikipedia.data.xml

Source Code of de.zib.scalaris.examples.wikipedia.data.xml.WikiDumpPageHandler$byShortRevId

package de.zib.scalaris.examples.wikipedia.data.xml;

import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Set;

import de.zib.scalaris.examples.wikipedia.data.Page;
import de.zib.scalaris.examples.wikipedia.data.Revision;
import de.zib.scalaris.examples.wikipedia.data.ShortRevision;
import de.zib.scalaris.examples.wikipedia.data.SiteInfo;

/**
* Intermediate class between XML processing of a Wiki dump and custom
* exporting of read {@link SiteInfo} and {@link Page} objects, including all
* revisions and the list of short revisions.
*
* @author Nico Kruber, kruber@zib.de
*/
public abstract class WikiDumpPageHandler extends WikiDumpHandler {
    protected static final int UPDATE_PAGELIST_EVERY = 400;
    protected static final int NEW_CATS_HASH_DEF_SIZE = 100;
    protected static final int NEW_TPLS_HASH_DEF_SIZE = 100;
    protected static final int NEW_BLNKS_HASH_DEF_SIZE = 100;
   
    protected ArrayList<String> newPages = new ArrayList<String>(UPDATE_PAGELIST_EVERY);
    protected ArrayList<String> newArticles = new ArrayList<String>(UPDATE_PAGELIST_EVERY);
    protected HashMap<String, List<String>> newCategories = new HashMap<String, List<String>>(NEW_CATS_HASH_DEF_SIZE);
    protected HashMap<String, List<String>> newTemplates = new HashMap<String, List<String>>(NEW_TPLS_HASH_DEF_SIZE);
    protected HashMap<String, List<String>> newBackLinks = new HashMap<String, List<String>>(NEW_BLNKS_HASH_DEF_SIZE);

    /**
     * Sets up a SAX XmlHandler exporting all parsed pages except the ones in a
     * blacklist.
     *
     * @param blacklist
     *            a number of page titles to ignore
     * @param whitelist
     *            only import these pages
     * @param maxRevisions
     *            maximum number of revisions per page (starting with the most
     *            recent) - <tt>-1/tt> imports all revisions
     *            (useful to speed up the import / reduce the DB size)
     * @param minTime
     *            minimum time a revision should have (only one revision older
     *            than this will be imported) - <tt>null/tt> imports all
     *            revisions
     * @param maxTime
     *            maximum time a revision should have (newer revisions are
     *            omitted) - <tt>null/tt> imports all revisions
     *            (useful to create dumps of a wiki at a specific point in time)
     */
    public WikiDumpPageHandler(Set<String> blacklist, Set<String> whitelist,
            int maxRevisions, Calendar minTime, Calendar maxTime) {
        super(blacklist, whitelist, maxRevisions, minTime, maxTime);
    }

    /**
     * Exports the given siteinfo to Scalaris
     *
     * @param revisions
     *            the siteinfo to export
     */
    @Override
    protected void export(XmlSiteInfo siteinfo_xml) {
        doExport(siteinfo_xml.getSiteInfo());
    }

    /**
     * Exports the given page (including all revisions) to Scalaris
     *
     * @param page_xml
     *            the page object extracted from XML
     */
    @Override
    protected void export(XmlPage page_xml) {
        Page page = page_xml.getPage();
        ++pageCount;
       
        if (page.getCurRev() != null) {
            List<Revision> revisions = page_xml.getRevisions();
            List<ShortRevision> revisions_short = ShortRevision.fromRevisions(revisions);
            Collections.sort(revisions, Collections.reverseOrder(new byRevId()));
            Collections.sort(revisions_short, Collections.reverseOrder(new byShortRevId()));
   
            if (!revisions.isEmpty() && wikiModel != null) {
                wikiModel.setUp();
                wikiModel.setPageName(page.getTitle());
                wikiModel.render(null, revisions.get(0).unpackedText());
                for (String cat_raw: wikiModel.getCategories().keySet()) {
                    String category = wikiModel.getCategoryNamespace() + ":" + cat_raw;
                    List<String> catPages = newCategories.get(category);
                    if (catPages == null) {
                        catPages = new ArrayList<String>(UPDATE_PAGELIST_EVERY / 4);
                    }
                    catPages.add(page.getTitle());
                    newCategories.put(category, catPages);
                }
                for (String tpl_raw: wikiModel.getTemplates()) {
                    String template = wikiModel.getTemplateNamespace() + ":" + tpl_raw;
                    List<String> templatePages = newTemplates.get(template);
                    if (templatePages == null) {
                        templatePages = new ArrayList<String>(UPDATE_PAGELIST_EVERY / 4);
                    }
                    templatePages.add(page.getTitle());
                    newTemplates.put(template, templatePages);
                }
                for (String link: wikiModel.getLinks()) {
                    List<String> backLinks = newBackLinks.get(link);
                    if (backLinks == null) {
                        backLinks = new ArrayList<String>(UPDATE_PAGELIST_EVERY / 4);
                    }
                    backLinks.add(page.getTitle());
                    newBackLinks.put(link, backLinks);
                }
                wikiModel.tearDown();
            }
   
            doExport(page, revisions, revisions_short);
        }
        if ((pageCount % UPDATE_PAGELIST_EVERY) == 0) {
            println("processed pages: " + pageCount);
        }
    }

    abstract protected void doExport(SiteInfo siteInfo);

    abstract protected void doExport(Page page, List<Revision> revisions,
            List<ShortRevision> revisions_short);

    /**
     * Provides a comparator for sorting {@link Revision} objects by their IDs.
     *
     * @author Nico Kruber, kruber@zib.de
     */
    protected static class byRevId implements java.util.Comparator<Revision> {
        /**
         * Compares its two arguments for order. Returns a negative integer,
         * zero, or a positive integer as the first argument is less than, equal
         * to, or greater than the second.
         *
         * @param rev1
         *            the first revision to be compared.
         * @param rev2
         *            the second revision to be compared.
         *
         * @return a negative integer, zero, or a positive integer as the first
         *         argument is less than, equal to, or greater than the second.
         */
        @Override
        public int compare(Revision rev1, Revision rev2) {
            return (rev1.getId() - rev2.getId());
        }
    }

    /**
     * Provides a comparator for sorting {@link Revision} objects by their IDs.
     *
     * @author Nico Kruber, kruber@zib.de
     */
    protected static class byShortRevId implements java.util.Comparator<ShortRevision> {
        /**
         * Compares its two arguments for order. Returns a negative integer,
         * zero, or a positive integer as the first argument is less than, equal
         * to, or greater than the second.
         *
         * @param rev1
         *            the first revision to be compared.
         * @param rev2
         *            the second revision to be compared.
         *
         * @return a negative integer, zero, or a positive integer as the first
         *         argument is less than, equal to, or greater than the second.
         */
        @Override
        public int compare(ShortRevision rev1, ShortRevision rev2) {
            return (rev1.getId() - rev2.getId());
        }
    }
}
TOP

Related Classes of de.zib.scalaris.examples.wikipedia.data.xml.WikiDumpPageHandler$byShortRevId

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.