// Package edu.brown.benchmark.wikipedia
//
// Source Code of edu.brown.benchmark.wikipedia.WikipediaLoader

package edu.brown.benchmark.wikipedia;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.log4j.Logger;
import org.voltdb.CatalogContext;
import org.voltdb.VoltTable;
import org.voltdb.catalog.Column;
import org.voltdb.catalog.Database;
import org.voltdb.catalog.Table;
import org.voltdb.client.Client;
import org.voltdb.client.ClientResponse;
import org.voltdb.types.TimestampType;

import edu.brown.api.Loader;
import edu.brown.benchmark.wikipedia.data.PageHistograms;
import edu.brown.benchmark.wikipedia.data.TextHistograms;
import edu.brown.benchmark.wikipedia.data.UserHistograms;
import edu.brown.benchmark.wikipedia.procedures.UpdateRevisionCounters;
import edu.brown.benchmark.wikipedia.util.TextGenerator;
import edu.brown.benchmark.wikipedia.util.WikipediaUtil;
import edu.brown.catalog.CatalogUtil;
import edu.brown.hstore.Hstoreservice.Status;
import edu.brown.logging.LoggerUtil;
import edu.brown.logging.LoggerUtil.LoggerBoolean;
import edu.brown.rand.RandomDistribution.FlatHistogram;
import edu.brown.rand.RandomDistribution.Zipf;
import edu.brown.utils.StringUtil;
import edu.brown.utils.ThreadUtil;

/**
* Synthetic Wikipedia Data Loader
* @author pavlo
* @author djellel
* @author xin
*/
public class WikipediaLoader extends Loader {
    private static final Logger LOG = Logger.getLogger(WikipediaLoader.class);
    private static final LoggerBoolean debug = new LoggerBoolean();
    private static final LoggerBoolean trace = new LoggerBoolean();
    static {
        LoggerUtil.attachObserver(LOG, debug, trace);
    }

    private final Random randGenerator = new Random();
    private final WikipediaUtil util;
   
    /**
     * UserId -> # of Revisions
     */
    private final int user_revision_ctr[];

    /**
     * PageId -> Last Revision Id
     */
    private final int page_last_rev_id[];
   
    /**
     * PageId -> Last Revision Length
     */
    private final int page_last_rev_length[];
   
    private final AtomicInteger page_counter = new AtomicInteger(0);
   
    /**
     * Constructor
     * @param benchmark
     * @param c
     */
    public WikipediaLoader(String[] args) {
        super(args);
        this.util = new WikipediaUtil(this.randGenerator, this.getScaleFactor());
       
        this.user_revision_ctr = new int[this.util.num_users];
        this.page_last_rev_id = new int[this.util.num_pages];
        this.page_last_rev_length = new int[this.util.num_pages];
       
        Arrays.fill(this.page_last_rev_id, 0);
        Arrays.fill(this.user_revision_ctr, 0);
        Arrays.fill(this.page_last_rev_length, 0);
       
        if (debug.val) {
            LOG.debug("# of USERS:  " + util.num_users);
            LOG.debug("# of PAGES: " + util.num_pages);
        }
    }
   
    @Override
    public void load() throws IOException {
        final CatalogContext catalogContext = this.getCatalogContext();
        try {
            // Load Data
            this.loadUsers(catalogContext.database);
            this.loadPages(catalogContext.database);
            this.loadWatchlist(catalogContext.database);
           
            // Multiple Threads
            List<Runnable> runnables = new ArrayList<Runnable>();
            int num_threads = ThreadUtil.availableProcessors();
            int pageId = 1;
            int pagesPerThread = (int)Math.ceil(util.num_pages / (double)num_threads);
            for (int i = 0; i < num_threads; i++) {
                final int firstPageId = pageId;
                final int lastPageId = Math.min(util.num_pages, firstPageId + pagesPerThread);
                Runnable r = new Runnable() {
                    @Override
                    public void run() {
                        WikipediaLoader.this.loadRevision(catalogContext.database, firstPageId, lastPageId);
                    }
                };
                runnables.add(r);
                pageId += pagesPerThread;
            } // FOR
            ThreadUtil.runGlobalPool(runnables);
           
            // Update Counters
            this.updateCounters();
        } catch (Exception e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        }
    }
   
    private void updateCounters() throws Exception {
        // UPDATE USER & UPDATE PAGES
        Client client = this.getClientHandle();
        ClientResponse cr = client.callProcedure(UpdateRevisionCounters.class.getSimpleName(),
                                                 this.user_revision_ctr,
                                                 util.num_pages,
                                                 this.page_last_rev_id,
                                                 this.page_last_rev_length);
        assert(cr != null);
        assert(cr.getStatus() == Status.OK);
        if (debug.val) LOG.debug("Updated page/user revision counters");
    }
   
    /**
     * USERACCTS
     */
    private void loadUsers(Database catalog_db) {
        Table userTable = catalog_db.getTables().getIgnoreCase(WikipediaConstants.TABLENAME_USER);
        assert(userTable != null);
       

        VoltTable vt = CatalogUtil.getVoltTable(userTable);
        int num_cols = userTable.getColumns().size();
        int batchSize = 0;
        int lastPercent = -1;
        for (int userId = 1; userId <= util.num_users; userId++) {
            // The name will be prefixed with their UserId. This increases
            // the likelihood that all of our usernames are going to be unique
            // It's not a guarantee, but it's good enough...
            String name = Integer.toString(userId) + TextGenerator.randomStr(randGenerator, util.h_nameLength.nextValue().intValue());
            String realName = TextGenerator.randomStr(randGenerator, util.h_realNameLength.nextValue().intValue());
            int revCount = util.h_revCount.nextValue().intValue();
            String password = StringUtil.repeat("*", randGenerator.nextInt(32));
           
            char eChars[] = TextGenerator.randomChars(randGenerator, randGenerator.nextInt(32) + 5);
            eChars[4 + randGenerator.nextInt(eChars.length-4)] = '@';
            String email = new String(eChars);
           
            String token = TextGenerator.randomStr(randGenerator, WikipediaConstants.TOKEN_LENGTH);
            String userOptions = "fake_longoptionslist";
            TimestampType newPassTime = new TimestampType();
            TimestampType touched = new TimestampType();

            Object row[] = new Object[num_cols];
            int param = 0;
            row[param++] = userId;      // user_id
            row[param++] = name;        // user_name
            row[param++] = realName;    // user_real_name
            row[param++] = password;    // user_password
            row[param++] = password;    // user_newpassword
            row[param++] = newPassTime; // user_newpass_time
            row[param++] = email;       // user_email
            row[param++] = userOptions; // user_options
            row[param++] = touched;     // user_touched
            row[param++] = token;       // user_token
            row[param++] = null;        // user_email_authenticated
            row[param++] = null;        // user_email_token
            row[param++] = null;        // user_email_token_expires
            row[param++] = null;        // user_registration
            row[param++] = revCount;    // user_editcount
            vt.addRow(row);

            if (++batchSize % WikipediaConstants.BATCH_SIZE == 0) {
                this.loadVoltTable(userTable.getName(), vt);
                vt.clearRowData();
                batchSize = 0;
                if (debug.val) {
                    int percent = (int) (((double) userId / (double) util.num_users) * 100);
                    if (percent != lastPercent) LOG.debug("USERACCT (" + percent + "%)");
                    lastPercent = percent;
                }
            }
        } // FOR
        if (batchSize > 0) {
            this.loadVoltTable(userTable.getName(), vt);
            vt.clearRowData();
        }
       
        if (debug.val) LOG.debug(userTable.getName() + " Loaded");
    }

    /**
     * PAGE
     */
    private void loadPages(Database catalog_db) {
        Table pageTable = catalog_db.getTables().get(WikipediaConstants.TABLENAME_PAGE);
        assert(pageTable != null);

        VoltTable vt = CatalogUtil.getVoltTable(pageTable);
        int num_cols = pageTable.getColumns().size();
        int batchSize = 0;
        int lastPercent = -1;
        for (long pageId = 1; pageId <= util.num_pages; pageId++) {
            String title = TextGenerator.randomStr(this.randGenerator, util.h_titleLength.nextValue().intValue());
            int namespace = util.getPageNameSpace(pageId);
            String restrictions = util.h_restrictions.nextValue();
            double pageRandom = randGenerator.nextDouble();
            TimestampType pageTouched = new TimestampType();
           
            Object row[] = new Object[num_cols];
            int param = 0;
            row[param++] = pageId;          // page_id
            row[param++] = namespace;       // page_namespace
            row[param++] = title;           // page_title
            row[param++] = restrictions;    // page_restrictions
            row[param++] = 0;               // page_counter
            row[param++] = 0;               // page_is_redirect
            row[param++] = 0;               // page_is_new
            row[param++] = pageRandom;      // page_random
            row[param++] = pageTouched;     // page_touched
            row[param++] = 0;               // page_latest
            row[param++] = 0;               // page_len
           
            vt.addRow(row);

            if (++batchSize % WikipediaConstants.BATCH_SIZE == 0) {
                this.loadVoltTable(pageTable.getName(), vt);
                vt.clearRowData();
                batchSize = 0;
                if (debug.val) {
                    int percent = (int) (((double) pageId / (double) util.num_pages) * 100);
                    if (percent != lastPercent) LOG.debug("PAGE (" + percent + "%)");
                    lastPercent = percent;
                }
            }
        } // FOR
        if (batchSize > 0) {
            this.loadVoltTable(pageTable.getName(), vt);
            vt.clearRowData();
        }
        if (debug.val) LOG.debug(pageTable.getName() + " Loaded");
    }

    /**
     * WATCHLIST
     */
    private void loadWatchlist(Database catalog_db) {
        Table watchTable = catalog_db.getTables().get(WikipediaConstants.TABLENAME_WATCHLIST);
        assert(watchTable != null);
       
        VoltTable vt = CatalogUtil.getVoltTable(watchTable);
        int num_cols = watchTable.getColumns().size();
        int batchSize = 0;
        int lastPercent = -1;
        Set<Long> userPages = new HashSet<Long>();
        for (int user_id = 1; user_id <= util.num_users; user_id++) {
            int num_watches = util.h_watchPageCount.nextInt();
            if (trace.val) LOG.trace(user_id + " => " + num_watches);
           
            userPages.clear();
            for (int i = 0; i < num_watches; i++) {
                long pageId = util.h_watchPageId.nextLong();
                while (userPages.contains(pageId)) {
                    pageId = util.h_watchPageId.nextLong();
                } // WHILE
                userPages.add(pageId);
                int nameSpace = util.getPageNameSpace(pageId);
               
                Object row[] = new Object[num_cols];
                int param = 0;
                row[param++] = user_id;     // wl_user
                row[param++] = nameSpace;   // wl_namespace
                row[param++] = pageId;      // wl_page
                row[param++] = null;        // wl_notificationtimestamp
                vt.addRow(row);
                batchSize++;
            } // FOR

            if (batchSize >= WikipediaConstants.BATCH_SIZE) {
                if (trace.val) LOG.trace("watchList(batch):\n" + vt);
                this.loadVoltTable(watchTable.getName(), vt);
                vt.clearRowData();
                batchSize = 0;
                if (debug.val) {
                    int percent = (int) (((double) user_id / (double) util.num_users) * 100);
                    if (percent != lastPercent) LOG.debug("WATCHLIST (" + percent + "%)");
                    lastPercent = percent;
                }
            }
        } // FOR
        if (batchSize > 0) {
            if (trace.val) LOG.trace("watchList(<batch):\n" + vt);
            this.loadVoltTable(watchTable.getName(), vt);
            vt.clearRowData();
        }
        if (debug.val) LOG.debug(watchTable.getName() + " Loaded");
    }

    /**
     * REVISIONS
     */
    private void loadRevision(Database catalog_db, int firstPageId, long lastPageId) {
       
        // TEXT
        Table textTable = catalog_db.getTables().get(WikipediaConstants.TABLENAME_TEXT);
        assert(textTable != null) : "Failed to find " + WikipediaConstants.TABLENAME_TEXT;
        Column textTableColumn = textTable.getColumns().getIgnoreCase("OLD_TEXT");
        assert(textTableColumn != null) : "Failed to find " + WikipediaConstants.TABLENAME_TEXT + ".OLD_TEXT";
        int max_text_length = textTableColumn.getSize();
       
        // REVISION
        Table revTable = catalog_db.getTables().get(WikipediaConstants.TABLENAME_REVISION);
        assert(revTable != null) : "Failed to find " + WikipediaConstants.TABLENAME_REVISION;
       
        VoltTable vtText = CatalogUtil.getVoltTable(textTable);
        VoltTable vtRev = CatalogUtil.getVoltTable(revTable);
        int num_txt_cols = textTable.getColumns().size();
        int num_rev_cols = revTable.getColumns().size();
        int batchSize = 1;
       
        Zipf h_users = new Zipf(this.randGenerator, 1, util.num_users, WikipediaConstants.REVISION_USER_SIGMA);
        FlatHistogram<Integer> h_textLength = new FlatHistogram<Integer>(this.randGenerator, TextHistograms.TEXT_LENGTH);
        FlatHistogram<Integer> h_nameLength = new FlatHistogram<Integer>(this.randGenerator, UserHistograms.NAME_LENGTH);
        FlatHistogram<Integer> h_numRevisions = new FlatHistogram<Integer>(this.randGenerator, PageHistograms.REVISIONS_PER_PAGE);
       
        int lastPercent = -1;
        for (int pageId = firstPageId; pageId <= lastPageId; pageId++) {
            // There must be at least one revision per page
            int num_revised = h_numRevisions.nextValue().intValue();
           
            // Generate what the new revision is going to be
            int old_text_length = h_textLength.nextValue().intValue();
            if (trace.val) LOG.trace("Max length:" + max_text_length + " old_text_length:" + old_text_length);
            assert(old_text_length > 0);
            assert(old_text_length < max_text_length);
            char old_text[] = TextGenerator.randomChars(randGenerator, old_text_length);
            long batchBytes = 0;
           
            for (int i = 0; i < num_revised; i++) {
                // Generate the User who's doing the revision and the Page revised
                // Makes sure that we always update their counter
                int user_id = h_users.nextInt();
                assert(user_id > 0 && user_id <= util.num_users) : "Invalid UserId '" + user_id + "'";
                this.user_revision_ctr[user_id-1]++;
                TimestampType timestamp = new TimestampType();
               
                // Generate what the new revision is going to be
                if (i > 0) {
                    old_text = util.generateRevisionText(old_text);
                    old_text_length = old_text.length;
                }
               
                int rev_id = ++this.page_last_rev_id[pageId-1];
                this.page_last_rev_length[pageId-1] = old_text_length;
               
                // TEXT
                Object row[] = new Object[num_txt_cols];
                int col = 0;
                row[col++] = rev_id;                // old_id
                row[col++] = new String(old_text)// old_text
                row[col++] = "utf-8";               // old_flags
                row[col++] = pageId;                // old_page
                vtText.addRow(row);

                // The REV_USER_TEXT field is usually the username, but we'll just
                // put in gibberish for now
                String user_text = new String(TextGenerator.randomChars(randGenerator, h_nameLength.nextValue().intValue()));
                String rev_comment = new String(TextGenerator.randomChars(randGenerator, util.h_commentLength.nextValue().intValue()));
                int minor_edit = util.h_minorEdit.nextValue().intValue();
               
                // REVISION
                col = 0;
                row = new Object[num_rev_cols];
                row[col++] = rev_id;                // rev_id
                row[col++] = pageId;                // rev_page
                row[col++] = rev_id;                // rev_text_id
                row[col++] = rev_comment;           // rev_comment
                row[col++] = user_id;               // rev_user
                row[col++] = user_text;             // rev_user_text
                row[col++] = timestamp;             // rev_timestamp
                row[col++] = minor_edit;            // rev_minor_edit
                row[col++] = 0;                     // rev_deleted
                row[col++] = old_text.length;       // rev_len
                row[col++] = 0;                     // rev_parent_id
                vtRev.addRow(row);
               
                if (trace.val) LOG.trace(String.format("%s [pageId=%05d / revId=%05d]",
                                                         revTable.getName(), pageId, rev_id));
                batchBytes += old_text.length;
                batchSize++;
               
                if (batchSize > WikipediaConstants.BATCH_SIZE || batchBytes >= 16777216) {
                    this.loadVoltTable(textTable.getName(), vtText);
                    this.loadVoltTable(revTable.getName(), vtRev);
                    vtText.clearRowData();
                    vtRev.clearRowData();
                    batchSize = 0;
                    batchBytes = 0;
                }
            } // FOR (revision)
           
            // XXX: We have to push out the batch for each page, because sometimes we
            // generate a batch that is too large and we lose our connection to the database
            if (batchSize > WikipediaConstants.BATCH_SIZE || batchBytes >= 16777216) {
                this.loadVoltTable(textTable.getName(), vtText);
                this.loadVoltTable(revTable.getName(), vtRev);
                vtText.clearRowData();
                vtRev.clearRowData();
                batchSize = 0;
                batchBytes = 0;
            }
           
            if (debug.val) {
                int percent = (int) (((double) this.page_counter.incrementAndGet() / (double) util.num_pages) * 100);
                if (percent != lastPercent) LOG.debug("REVISIONS (" + percent + "%)");
                lastPercent = percent;
            }
        } // FOR (page)
        if (batchSize > 0) {
            this.loadVoltTable(textTable.getName(), vtText);
            this.loadVoltTable(revTable.getName(), vtRev);
            vtText.clearRowData();
            vtRev.clearRowData();
        }
       
       
        if (debug.val) LOG.debug(textTable.getName() + " Loaded");
        if (debug.val) LOG.debug(revTable.getName() + " Loaded");
    }
  
}
// TOP
//
// Related Classes of edu.brown.benchmark.wikipedia.WikipediaLoader
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.