Package com.oltpbenchmark.benchmarks.wikipedia

Source Code of com.oltpbenchmark.benchmarks.wikipedia.WikipediaLoader

package com.oltpbenchmark.benchmarks.wikipedia;

import java.io.File;
import java.io.PrintStream;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.log4j.Logger;

import com.oltpbenchmark.api.Loader;
import com.oltpbenchmark.benchmarks.wikipedia.data.PageHistograms;
import com.oltpbenchmark.benchmarks.wikipedia.data.TextHistograms;
import com.oltpbenchmark.benchmarks.wikipedia.data.UserHistograms;
import com.oltpbenchmark.benchmarks.wikipedia.util.TransactionSelector;
import com.oltpbenchmark.catalog.Table;
import com.oltpbenchmark.types.DatabaseType;
import com.oltpbenchmark.util.Pair;
import com.oltpbenchmark.util.RandomDistribution.Flat;
import com.oltpbenchmark.util.RandomDistribution.FlatHistogram;
import com.oltpbenchmark.util.RandomDistribution.Zipf;
import com.oltpbenchmark.util.SQLUtil;
import com.oltpbenchmark.util.StringUtil;
import com.oltpbenchmark.util.TextGenerator;
import com.oltpbenchmark.util.TimeUtil;

/**
* Synthetic Wikipedia Data Loader
* @author pavlo
* @author djellel
*/
public class WikipediaLoader extends Loader {
    private static final Logger LOG = Logger.getLogger(WikipediaLoader.class);

    private final int num_users;
    private final int num_pages;
   
    /**
     * UserId -> # of Revisions
     */
    private final int user_revision_ctr[];

    /**
     * PageId -> Last Revision Id
     */
    private final int page_last_rev_id[];
   
    /**
     * PageId -> Last Revision Length
     */
    private final int page_last_rev_length[];
   
    /**
     * Pair<PageNamespace, PageTitle>
     */
    private List<Pair<Integer, String>> titles = new ArrayList<Pair<Integer, String>>();

    /**
     * Constructor
     * @param benchmark
     * @param c
     */
    public WikipediaLoader(WikipediaBenchmark benchmark, Connection c) {
        super(benchmark, c);
        this.num_users = (int) Math.round(WikipediaConstants.USERS * this.scaleFactor);
        this.num_pages = (int) Math.round(WikipediaConstants.PAGES * this.scaleFactor);
       
        this.user_revision_ctr = new int[this.num_users];
        Arrays.fill(this.user_revision_ctr, 0);
       
        this.page_last_rev_id = new int[this.num_pages];
        Arrays.fill(this.page_last_rev_id, -1);
        this.page_last_rev_length = new int[this.num_pages];
        Arrays.fill(this.page_last_rev_length, -1);
       
        if (LOG.isDebugEnabled()) {
            LOG.debug("# of USERS:  " + this.num_users);
            LOG.debug("# of PAGES: " + this.num_pages);
        }
    }

    @Override
    public void load() {
        try {
            // Load Data
            this.loadUsers();
            this.loadPages();
            this.loadWatchlist();
            this.loadRevision();

            // Generate Trace File
            this.genTrace();
           
        } catch (SQLException e) {
            e.printStackTrace();
            if (e.getNextException() != null) e = e.getNextException();
            throw new RuntimeException(e);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
   
    private File genTrace() throws Exception {
        WikipediaBenchmark b = (WikipediaBenchmark)this.benchmark;
        File file = b.getTraceOutput();
        if (file == null || b.getTraceSize() == 0) return (null);
       
        assert(this.num_pages == this.titles.size());
        LOG.info(String.format("Generating a %dk traces to '%s'", b.getTraceSize(), file));
       
        Flat z_users = new Flat(rng(), 1, this.num_users);
        Zipf z_pages = new Zipf(rng(), 1, this.num_pages, WikipediaConstants.USER_ID_SIGMA);
       
        PrintStream ps = new PrintStream(file);
        for (int i = 0, cnt = (b.getTraceSize() * 1000); i < cnt; i++) {
            int user_id = -1;
           
            // Check whether this should be an anonymous update
            if (rng().nextInt(100) < WikipediaConstants.ANONYMOUS_PAGE_UPDATE_PROB) {
                user_id = WikipediaConstants.ANONYMOUS_USER_ID;
            }
            // Otherwise figure out what user is updating this page
            else {
                user_id = z_users.nextInt();
            }
            assert(user_id != -1);
           
            // Figure out what page they're going to update
            int page_id = z_pages.nextInt();
            Pair<Integer, String> p = this.titles.get(page_id);
            assert(p != null);
           
            TransactionSelector.writeEntry(ps, user_id, p.getFirst(), p.getSecond());
        } // FOR
        ps.close();
        return (file);
    }
   
    /**
     * USERACCTS
     */
    private void loadUsers() throws SQLException {
        Table catalog_tbl = this.getTableCatalog(WikipediaConstants.TABLENAME_USER);
        assert(catalog_tbl != null);

        String sql = SQLUtil.getInsertSQL(catalog_tbl);
        PreparedStatement userInsert = this.conn.prepareStatement(sql);

        FlatHistogram<Integer> h_nameLength = new FlatHistogram<Integer>(this.rng(), UserHistograms.NAME_LENGTH);
        FlatHistogram<Integer> h_realNameLength = new FlatHistogram<Integer>(this.rng(), UserHistograms.REAL_NAME_LENGTH);
        FlatHistogram<Integer> h_revCount = new FlatHistogram<Integer>(this.rng(), UserHistograms.REVISION_COUNT);

        int types[] = catalog_tbl.getColumnTypes();
        int batchSize = 0;
        int lastPercent = -1;
        for (int i = 1; i <= this.num_users; i++) {
            // The name will be prefixed with their UserId. This increases
            // the likelihood that all of our usernames are going to be unique
            // It's not a guarantee, but it's good enough...
            String name = Integer.toString(i) + TextGenerator.randomStr(rng(), h_nameLength.nextValue().intValue());
            String realName = TextGenerator.randomStr(rng(), h_realNameLength.nextValue().intValue());
            int revCount = h_revCount.nextValue().intValue();
            String password = StringUtil.repeat("*", rng().nextInt(32));
           
            char eChars[] = TextGenerator.randomChars(rng(), rng().nextInt(32) + 5);
            eChars[4 + rng().nextInt(eChars.length-4)] = '@';
            String email = new String(eChars);
           
            String token = TextGenerator.randomStr(rng(), WikipediaConstants.TOKEN_LENGTH);
            String userOptions = "fake_longoptionslist";
            String newPassTime = TimeUtil.getCurrentTimeString14();
            String touched = TimeUtil.getCurrentTimeString14();

            int param = 1;
            userInsert.setInt(param++, i);                // user_id
            userInsert.setString(param++, name);          // user_name
            userInsert.setString(param++, realName);      // user_real_name
            userInsert.setString(param++, password);      // user_password
            userInsert.setString(param++, password);      // user_newpassword
            userInsert.setString(param++, newPassTime);   // user_newpass_time
            userInsert.setString(param++, email);         // user_email
            userInsert.setString(param++, userOptions);   // user_options
            userInsert.setString(param++, touched);       // user_touched
            userInsert.setString(param++, token);         // user_token
            userInsert.setNull(param++, types[param-2]);    // user_email_authenticated
            userInsert.setNull(param++, types[param-2]);    // user_email_token
            userInsert.setNull(param++, types[param-2]);    // user_email_token_expires
            userInsert.setNull(param++, types[param-2]);    // user_registration
            userInsert.setInt(param++, revCount);         // user_editcount
            userInsert.addBatch();

            if (++batchSize % WikipediaConstants.BATCH_SIZE == 0) {
                userInsert.executeBatch();
                this.conn.commit();
                userInsert.clearBatch();
                this.addToTableCount(catalog_tbl.getName(), batchSize);
                batchSize = 0;
                if (LOG.isDebugEnabled()) {
                    int percent = (int) (((double) i / (double) this.num_users) * 100);
                    if (percent != lastPercent) LOG.debug("USERACCT (" + percent + "%)");
                    lastPercent = percent;
                }
            }
        } // FOR
        if (batchSize > 0) {
            this.addToTableCount(catalog_tbl.getName(), batchSize);
            userInsert.executeBatch();
            this.conn.commit();
            userInsert.clearBatch();
        }
        userInsert.close();
        if (this.getDatabaseType() == DatabaseType.POSTGRES) {
            this.updateAutoIncrement(catalog_tbl.getColumn(0), this.num_users);
        }
        if (LOG.isDebugEnabled())
            LOG.debug("Users  % " + this.num_users);
    }

    /**
     * PAGE
     */
    private void loadPages() throws SQLException {
        Table catalog_tbl = this.getTableCatalog(WikipediaConstants.TABLENAME_PAGE);
        assert(catalog_tbl != null);

        String sql = SQLUtil.getInsertSQL(catalog_tbl);
        PreparedStatement pageInsert = this.conn.prepareStatement(sql);
       
        FlatHistogram<Integer> h_titleLength = new FlatHistogram<Integer>(this.rng(), PageHistograms.TITLE_LENGTH);
        FlatHistogram<Integer> h_namespace = new FlatHistogram<Integer>(this.rng(), PageHistograms.NAMESPACE);
        FlatHistogram<String> h_restrictions = new FlatHistogram<String>(this.rng(), PageHistograms.RESTRICTIONS);

        int batchSize = 0;
        int lastPercent = -1;
        for (int i = 1; i <= this.num_pages; i++) {
            String title = TextGenerator.randomStr(rng(), h_titleLength.nextValue().intValue());
            int namespace = h_namespace.nextValue().intValue();
            String restrictions = h_restrictions.nextValue();
            double pageRandom = rng().nextDouble();
            String pageTouched = TimeUtil.getCurrentTimeString14();
           
            int param = 1;
            pageInsert.setInt(param++, i);              // page_id
            pageInsert.setInt(param++, namespace);      // page_namespace
            pageInsert.setString(param++, title);       // page_title
            pageInsert.setString(param++, restrictions);// page_restrictions
            pageInsert.setInt(param++, 0);              // page_counter
            pageInsert.setInt(param++, 0);              // page_is_redirect
            pageInsert.setInt(param++, 0);              // page_is_new
            pageInsert.setDouble(param++, pageRandom)// page_random
            pageInsert.setString(param++, pageTouched); // page_touched
            pageInsert.setInt(param++, 0);              // page_latest
            pageInsert.setInt(param++, 0);              // page_len
            pageInsert.addBatch();
            this.titles.add(Pair.of(namespace, title));

            if (++batchSize % WikipediaConstants.BATCH_SIZE == 0) {
                pageInsert.executeBatch();
                this.conn.commit();
                pageInsert.clearBatch();
                this.addToTableCount(catalog_tbl.getName(), batchSize);
                batchSize = 0;
                if (LOG.isDebugEnabled()) {
                    int percent = (int) (((double) i / (double) this.num_pages) * 100);
                    if (percent != lastPercent) LOG.debug("PAGE (" + percent + "%)");
                    lastPercent = percent;
                }
            }
        } // FOR
        if (batchSize > 0) {
            pageInsert.executeBatch();
            this.conn.commit();
            pageInsert.clearBatch();
            this.addToTableCount(catalog_tbl.getName(), batchSize);
        }
        pageInsert.close();
        if (this.getDatabaseType() == DatabaseType.POSTGRES) {
            this.updateAutoIncrement(catalog_tbl.getColumn(0), this.num_pages);
        }
        if (LOG.isDebugEnabled())
            LOG.debug("Users  % " + this.num_pages);
    }

    /**
     * WATCHLIST
     */
    private void loadWatchlist() throws SQLException {
        Table catalog_tbl = this.getTableCatalog(WikipediaConstants.TABLENAME_WATCHLIST);
        assert(catalog_tbl != null);
       
        String sql = SQLUtil.getInsertSQL(catalog_tbl, 1);
        PreparedStatement watchInsert = this.conn.prepareStatement(sql);
       
        Zipf h_numWatches = new Zipf(rng(), 0, this.num_pages, WikipediaConstants.NUM_WATCHES_PER_USER_SIGMA);
        Zipf h_pageId = new Zipf(rng(), 1, this.num_pages, WikipediaConstants.WATCHLIST_PAGE_SIGMA);

        int batchSize = 0;
        int lastPercent = -1;
        Set<Integer> userPages = new HashSet<Integer>();
        for (int user_id = 1; user_id <= this.num_users; user_id++) {
            int num_watches = h_numWatches.nextInt();
            if (LOG.isTraceEnabled())
                LOG.trace(user_id + " => " + num_watches);
           
            userPages.clear();
            for (int i = 0; i < num_watches; i++) {
                int pageId = h_pageId.nextInt();
                while (userPages.contains(pageId)) {
                    pageId = h_pageId.nextInt();
                } // WHILE
                userPages.add(pageId);
               
                Pair<Integer, String> page = this.titles.get(pageId);
                assert(page != null) : "Invalid PageId " + pageId;
               
                int param = 1;
                watchInsert.setInt(param++, user_id); // wl_user
                watchInsert.setInt(param++, page.getFirst()); // wl_namespace
                watchInsert.setString(param++, page.getSecond()); // wl_title
                watchInsert.setNull(param++, java.sql.Types.VARCHAR); // wl_notificationtimestamp
                watchInsert.addBatch();
                batchSize++;
            } // FOR

            if (batchSize >= WikipediaConstants.BATCH_SIZE) {
                watchInsert.executeBatch();
                this.conn.commit();
                watchInsert.clearBatch();
                this.addToTableCount(catalog_tbl.getName(), batchSize);
                batchSize = 0;
                if (LOG.isDebugEnabled()) {
                    int percent = (int) (((double) user_id / (double) this.num_users) * 100);
                    if (percent != lastPercent) LOG.debug("WATCHLIST (" + percent + "%)");
                    lastPercent = percent;
                }
            }
        } // FOR
        if (batchSize > 0) {
            watchInsert.executeBatch();
            watchInsert.clearBatch();
            this.conn.commit();
            this.addToTableCount(catalog_tbl.getName(), batchSize);
        }
        watchInsert.close();
        if (LOG.isDebugEnabled())
            LOG.debug("Watchlist Loaded");
    }

    /**
     * REVISIONS
     */
    private void loadRevision() throws SQLException {
       
        // TEXT
        Table textTable = this.getTableCatalog(WikipediaConstants.TABLENAME_TEXT);
        String textSQL = SQLUtil.getInsertSQL(textTable);
        PreparedStatement textInsert = this.conn.prepareStatement(textSQL);

        // REVISION
        Table revTable = this.getTableCatalog(WikipediaConstants.TABLENAME_REVISION);
        String revSQL = SQLUtil.getInsertSQL(revTable);
        PreparedStatement revisionInsert = this.conn.prepareStatement(revSQL);

        WikipediaBenchmark b = (WikipediaBenchmark)this.benchmark;
        int batchSize = 1;
        Zipf h_users = new Zipf(this.rng(), 1, this.num_users, WikipediaConstants.REVISION_USER_SIGMA);
        FlatHistogram<Integer> h_textLength = new FlatHistogram<Integer>(this.rng(), TextHistograms.TEXT_LENGTH);
        FlatHistogram<Integer> h_commentLength = b.commentLength;
        FlatHistogram<Integer> h_minorEdit = b.minorEdit;
        FlatHistogram<Integer> h_nameLength = new FlatHistogram<Integer>(this.rng(), UserHistograms.NAME_LENGTH);
        FlatHistogram<Integer> h_numRevisions = new FlatHistogram<Integer>(this.rng(), PageHistograms.REVISIONS_PER_PAGE);
       
        int rev_id = 1;
        int lastPercent = -1;
        for (int page_id = 1; page_id <= this.num_pages; page_id++) {
            // There must be at least one revision per page
            int num_revised = h_numRevisions.nextValue().intValue();
           
            // Generate what the new revision is going to be
            int old_text_length = h_textLength.nextValue().intValue();
            assert(old_text_length > 0);
            char old_text[] = TextGenerator.randomChars(rng(), old_text_length);
           
            for (int i = 0; i < num_revised; i++) {
                // Generate the User who's doing the revision and the Page revised
                // Makes sure that we always update their counter
                int user_id = h_users.nextInt();
                assert(user_id > 0 && user_id <= this.num_users) : "Invalid UserId '" + user_id + "'";
                this.user_revision_ctr[user_id-1]++;
               
                // Generate what the new revision is going to be
                if (i > 0) {
                    old_text = b.generateRevisionText(old_text);
                    old_text_length = old_text.length;
                }
               
                char rev_comment[] = TextGenerator.randomChars(rng(), h_commentLength.nextValue().intValue());

                // The REV_USER_TEXT field is usually the username, but we'll just
                // put in gibberish for now
                char user_text[] = TextGenerator.randomChars(rng(), h_nameLength.nextValue().intValue());
               
                // Insert the text
                int col = 1;
                textInsert.setInt(col++, rev_id); // old_id
                textInsert.setString(col++, new String(old_text)); // old_text
                textInsert.setString(col++, "utf-8"); // old_flags
                textInsert.setInt(col++, page_id); // old_page
                textInsert.addBatch();

                // Insert the revision
                col = 1;
                revisionInsert.setInt(col++, rev_id); // rev_id
                revisionInsert.setInt(col++, page_id); // rev_page
                revisionInsert.setInt(col++, rev_id); // rev_text_id
                revisionInsert.setString(col++, new String(rev_comment)); // rev_comment
                revisionInsert.setInt(col++, user_id); // rev_user
                revisionInsert.setString(col++, new String(user_text)); // rev_user_text
                revisionInsert.setString(col++, TimeUtil.getCurrentTimeString14()); // rev_timestamp
                revisionInsert.setInt(col++, h_minorEdit.nextValue().intValue()); // rev_minor_edit
                revisionInsert.setInt(col++, 0); // rev_deleted
                revisionInsert.setInt(col++, 0); // rev_len
                revisionInsert.setInt(col++, 0); // rev_parent_id
                revisionInsert.addBatch();
               
                // Update Last Revision Stuff
                this.page_last_rev_id[page_id-1] = rev_id;
                this.page_last_rev_length[page_id-1] = old_text_length;
                rev_id++;
                batchSize++;
            } // FOR (revision)
            if (batchSize > WikipediaConstants.BATCH_SIZE) {
                textInsert.executeBatch();
                revisionInsert.executeBatch();
                this.conn.commit();
                this.addToTableCount(textTable.getName(), batchSize);
                this.addToTableCount(revTable.getName(), batchSize);
                batchSize = 0;
               
                if (LOG.isDebugEnabled()) {
                    int percent = (int) (((double) page_id / (double) this.num_pages) * 100);
                    if (percent != lastPercent) LOG.debug("REVISIONS (" + percent + "%)");
                    lastPercent = percent;
                }
            }
        } // FOR (page)
        revisionInsert.close();
        textInsert.close();
        if (this.getDatabaseType() == DatabaseType.POSTGRES) {
            this.updateAutoIncrement(textTable.getColumn(0), rev_id);
            this.updateAutoIncrement(revTable.getColumn(0), rev_id);
        }
       
        // UPDATE USER
        revTable = this.getTableCatalog(WikipediaConstants.TABLENAME_USER);
        String updateUserSql = "UPDATE " + revTable.getEscapedName() +
                               "   SET user_editcount = ?, " +
                               "       user_touched = ? " +
                               " WHERE user_id = ?";
        PreparedStatement userUpdate = this.conn.prepareStatement(updateUserSql);
        batchSize = 0;
        for (int i = 0; i < this.num_users; i++) {
            int col = 1;
            userUpdate.setInt(col++, this.user_revision_ctr[i]);
            userUpdate.setString(col++, TimeUtil.getCurrentTimeString14());
            userUpdate.setInt(col++, i+1); // ids start at 1
            userUpdate.addBatch();
            if ((++batchSize % WikipediaConstants.BATCH_SIZE) == 0) {
                userUpdate.executeBatch();
                this.conn.commit();
                userUpdate.clearBatch();
                batchSize = 0;
            }
        } // FOR
        if (batchSize > 0) {
            userUpdate.executeBatch();
            this.conn.commit();
            userUpdate.clearBatch();
        }
        userUpdate.close();
       
        // UPDATE PAGES
        revTable = this.getTableCatalog(WikipediaConstants.TABLENAME_PAGE);
        String updatePageSql = "UPDATE " + revTable.getEscapedName() +
                               "   SET page_latest = ?, " +
                               "       page_touched = ?, " +
                               "       page_is_new = 0, " +
                               "       page_is_redirect = 0, " +
                               "       page_len = ? " +
                               " WHERE page_id = ?";
        PreparedStatement pageUpdate = this.conn.prepareStatement(updatePageSql);
        batchSize = 0;
        for (int i = 0; i < this.num_pages; i++) {
            if (this.page_last_rev_id[i] == -1) continue;
           
            int col = 1;
            pageUpdate.setInt(col++, this.page_last_rev_id[i]);
            pageUpdate.setString(col++, TimeUtil.getCurrentTimeString14());
            pageUpdate.setInt(col++, this.page_last_rev_length[i]);
            pageUpdate.setInt(col++, i+1); // ids start at 1
            pageUpdate.addBatch();
            if ((++batchSize % WikipediaConstants.BATCH_SIZE) == 0) {
                pageUpdate.executeBatch();
                this.conn.commit();
                pageUpdate.clearBatch();
                batchSize = 0;
            }
        } // FOR
        if (batchSize > 0) {
            pageUpdate.executeBatch();
            this.conn.commit();
            pageUpdate.clearBatch();
        }
        pageUpdate.close();
       
        if (LOG.isDebugEnabled()) {
            LOG.debug("Revision loaded");
        }
    }
}
TOP

Related Classes of com.oltpbenchmark.benchmarks.wikipedia.WikipediaLoader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.