package edu.brown.benchmark.wikipedia;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.log4j.Logger;
import org.voltdb.CatalogContext;
import org.voltdb.VoltTable;
import org.voltdb.catalog.Column;
import org.voltdb.catalog.Database;
import org.voltdb.catalog.Table;
import org.voltdb.client.Client;
import org.voltdb.client.ClientResponse;
import org.voltdb.types.TimestampType;
import edu.brown.api.Loader;
import edu.brown.benchmark.wikipedia.data.PageHistograms;
import edu.brown.benchmark.wikipedia.data.TextHistograms;
import edu.brown.benchmark.wikipedia.data.UserHistograms;
import edu.brown.benchmark.wikipedia.procedures.UpdateRevisionCounters;
import edu.brown.benchmark.wikipedia.util.TextGenerator;
import edu.brown.benchmark.wikipedia.util.WikipediaUtil;
import edu.brown.catalog.CatalogUtil;
import edu.brown.hstore.Hstoreservice.Status;
import edu.brown.logging.LoggerUtil;
import edu.brown.logging.LoggerUtil.LoggerBoolean;
import edu.brown.rand.RandomDistribution.FlatHistogram;
import edu.brown.rand.RandomDistribution.Zipf;
import edu.brown.utils.StringUtil;
import edu.brown.utils.ThreadUtil;
/**
* Synthetic Wikipedia Data Loader
* @author pavlo
* @author djellel
* @author xin
*/
public class WikipediaLoader extends Loader {
private static final Logger LOG = Logger.getLogger(WikipediaLoader.class);
private static final LoggerBoolean debug = new LoggerBoolean();
private static final LoggerBoolean trace = new LoggerBoolean();
static {
LoggerUtil.attachObserver(LOG, debug, trace);
}
private final Random randGenerator = new Random();
private final WikipediaUtil util;
/**
* UserId -> # of Revisions
*/
private final int user_revision_ctr[];
/**
* PageId -> Last Revision Id
*/
private final int page_last_rev_id[];
/**
* PageId -> Last Revision Length
*/
private final int page_last_rev_length[];
private final AtomicInteger page_counter = new AtomicInteger(0);
/**
* Constructor
* @param args loader arguments passed through to the parent {@link Loader}
*/
public WikipediaLoader(String[] args) {
super(args);
this.util = new WikipediaUtil(this.randGenerator, this.getScaleFactor());
this.user_revision_ctr = new int[this.util.num_users];
this.page_last_rev_id = new int[this.util.num_pages];
this.page_last_rev_length = new int[this.util.num_pages];
Arrays.fill(this.page_last_rev_id, 0);
Arrays.fill(this.user_revision_ctr, 0);
Arrays.fill(this.page_last_rev_length, 0);
if (debug.val) {
LOG.debug("# of USERS: " + util.num_users);
LOG.debug("# of PAGES: " + util.num_pages);
}
}
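/**
 * Load all of the benchmark tables. Users, pages, and the watchlist are
 * loaded serially; the revisions (and their text) are then generated in
 * parallel, with the pages partitioned evenly across the available
 * threads; finally the accumulated revision counters are pushed out.
 */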
@Override
public void load() throws IOException {
final CatalogContext catalogContext = this.getCatalogContext();
try {
// Load Data
this.loadUsers(catalogContext.database);
this.loadPages(catalogContext.database);
this.loadWatchlist(catalogContext.database);
// Partition the pages into contiguous ranges and load their revisions in parallel
List<Runnable> runnables = new ArrayList<Runnable>();
int num_threads = ThreadUtil.availableProcessors();
int pageId = 1;
int pagesPerThread = (int)Math.ceil(util.num_pages / (double)num_threads);
for (int i = 0; i < num_threads; i++) {
final int firstPageId = pageId;
final int lastPageId = Math.min(util.num_pages, firstPageId + pagesPerThread - 1);
Runnable r = new Runnable() {
@Override
public void run() {
WikipediaLoader.this.loadRevision(catalogContext.database, firstPageId, lastPageId);
}
};
runnables.add(r);
pageId += pagesPerThread;
} // FOR
ThreadUtil.runGlobalPool(runnables);
// Update Counters
this.updateCounters();
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException(e);
}
}
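/**
 * Push the per-user revision counts and the per-page last revision
 * id/length gathered during loading to the cluster in a single
 * UpdateRevisionCounters invocation.
 */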
private void updateCounters() throws Exception {
// UPDATE USER & UPDATE PAGES
Client client = this.getClientHandle();
ClientResponse cr = client.callProcedure(UpdateRevisionCounters.class.getSimpleName(),
this.user_revision_ctr,
util.num_pages,
this.page_last_rev_id,
this.page_last_rev_length);
assert(cr != null);
assert(cr.getStatus() == Status.OK);
if (debug.val) LOG.debug("Updated page/user revision counters");
}
/**
* USERACCT: one row per synthetic user
*/
private void loadUsers(Database catalog_db) {
Table userTable = catalog_db.getTables().getIgnoreCase(WikipediaConstants.TABLENAME_USER);
assert(userTable != null);
VoltTable vt = CatalogUtil.getVoltTable(userTable);
int num_cols = userTable.getColumns().size();
int batchSize = 0;
int lastPercent = -1;
for (int userId = 1; userId <= util.num_users; userId++) {
// The name will be prefixed with their UserId. This increases
// the likelihood that all of our usernames are going to be unique
// It's not a guarantee, but it's good enough...
String name = Integer.toString(userId) + TextGenerator.randomStr(randGenerator, util.h_nameLength.nextValue().intValue());
String realName = TextGenerator.randomStr(randGenerator, util.h_realNameLength.nextValue().intValue());
int revCount = util.h_revCount.nextValue().intValue();
String password = StringUtil.repeat("*", randGenerator.nextInt(32));
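// Generate a fake email address: random characters with an '@'
// placed at least four characters from the start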
char eChars[] = TextGenerator.randomChars(randGenerator, randGenerator.nextInt(32) + 5);
eChars[4 + randGenerator.nextInt(eChars.length-4)] = '@';
String email = new String(eChars);
String token = TextGenerator.randomStr(randGenerator, WikipediaConstants.TOKEN_LENGTH);
String userOptions = "fake_longoptionslist";
TimestampType newPassTime = new TimestampType();
TimestampType touched = new TimestampType();
Object row[] = new Object[num_cols];
int param = 0;
row[param++] = userId; // user_id
row[param++] = name; // user_name
row[param++] = realName; // user_real_name
row[param++] = password; // user_password
row[param++] = password; // user_newpassword
row[param++] = newPassTime; // user_newpass_time
row[param++] = email; // user_email
row[param++] = userOptions; // user_options
row[param++] = touched; // user_touched
row[param++] = token; // user_token
row[param++] = null; // user_email_authenticated
row[param++] = null; // user_email_token
row[param++] = null; // user_email_token_expires
row[param++] = null; // user_registration
row[param++] = revCount; // user_editcount
vt.addRow(row);
if (++batchSize % WikipediaConstants.BATCH_SIZE == 0) {
this.loadVoltTable(userTable.getName(), vt);
vt.clearRowData();
batchSize = 0;
if (debug.val) {
int percent = (int) (((double) userId / (double) util.num_users) * 100);
if (percent != lastPercent) LOG.debug("USERACCT (" + percent + "%)");
lastPercent = percent;
}
}
} // FOR
if (batchSize > 0) {
this.loadVoltTable(userTable.getName(), vt);
vt.clearRowData();
}
if (debug.val) LOG.debug(userTable.getName() + " Loaded");
}
/**
* PAGE: one row per synthetic page
*/
private void loadPages(Database catalog_db) {
Table pageTable = catalog_db.getTables().get(WikipediaConstants.TABLENAME_PAGE);
assert(pageTable != null);
VoltTable vt = CatalogUtil.getVoltTable(pageTable);
int num_cols = pageTable.getColumns().size();
int batchSize = 0;
int lastPercent = -1;
for (long pageId = 1; pageId <= util.num_pages; pageId++) {
String title = TextGenerator.randomStr(this.randGenerator, util.h_titleLength.nextValue().intValue());
int namespace = util.getPageNameSpace(pageId);
String restrictions = util.h_restrictions.nextValue();
double pageRandom = randGenerator.nextDouble();
TimestampType pageTouched = new TimestampType();
Object row[] = new Object[num_cols];
int param = 0;
row[param++] = pageId; // page_id
row[param++] = namespace; // page_namespace
row[param++] = title; // page_title
row[param++] = restrictions; // page_restrictions
row[param++] = 0; // page_counter
row[param++] = 0; // page_is_redirect
row[param++] = 0; // page_is_new
row[param++] = pageRandom; // page_random
row[param++] = pageTouched; // page_touched
row[param++] = 0; // page_latest
row[param++] = 0; // page_len
vt.addRow(row);
if (++batchSize % WikipediaConstants.BATCH_SIZE == 0) {
this.loadVoltTable(pageTable.getName(), vt);
vt.clearRowData();
batchSize = 0;
if (debug.val) {
int percent = (int) (((double) pageId / (double) util.num_pages) * 100);
if (percent != lastPercent) LOG.debug("PAGE (" + percent + "%)");
lastPercent = percent;
}
}
} // FOR
if (batchSize > 0) {
this.loadVoltTable(pageTable.getName(), vt);
vt.clearRowData();
}
if (debug.val) LOG.debug(pageTable.getName() + " Loaded");
}
/**
* WATCHLIST: a random set of distinct watched pages per user
*/
private void loadWatchlist(Database catalog_db) {
Table watchTable = catalog_db.getTables().get(WikipediaConstants.TABLENAME_WATCHLIST);
assert(watchTable != null);
VoltTable vt = CatalogUtil.getVoltTable(watchTable);
int num_cols = watchTable.getColumns().size();
int batchSize = 0;
int lastPercent = -1;
Set<Long> userPages = new HashSet<Long>();
for (int user_id = 1; user_id <= util.num_users; user_id++) {
int num_watches = util.h_watchPageCount.nextInt();
if (trace.val) LOG.trace(user_id + " => " + num_watches);
userPages.clear();
for (int i = 0; i < num_watches; i++) {
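// Sample page ids until we find one that this user is not already watching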
long pageId = util.h_watchPageId.nextLong();
while (userPages.contains(pageId)) {
pageId = util.h_watchPageId.nextLong();
} // WHILE
userPages.add(pageId);
int nameSpace = util.getPageNameSpace(pageId);
Object row[] = new Object[num_cols];
int param = 0;
row[param++] = user_id; // wl_user
row[param++] = nameSpace; // wl_namespace
row[param++] = pageId; // wl_page
row[param++] = null; // wl_notificationtimestamp
vt.addRow(row);
batchSize++;
} // FOR
if (batchSize >= WikipediaConstants.BATCH_SIZE) {
if (trace.val) LOG.trace("watchList(batch):\n" + vt);
this.loadVoltTable(watchTable.getName(), vt);
vt.clearRowData();
batchSize = 0;
if (debug.val) {
int percent = (int) (((double) user_id / (double) util.num_users) * 100);
if (percent != lastPercent) LOG.debug("WATCHLIST (" + percent + "%)");
lastPercent = percent;
}
}
} // FOR
if (batchSize > 0) {
if (trace.val) LOG.trace("watchList(<batch):\n" + vt);
this.loadVoltTable(watchTable.getName(), vt);
vt.clearRowData();
}
if (debug.val) LOG.debug(watchTable.getName() + " Loaded");
}
/**
* REVISION/TEXT: the revision history and text for the given inclusive range of page ids
*/
private void loadRevision(Database catalog_db, int firstPageId, int lastPageId) {
// TEXT
Table textTable = catalog_db.getTables().get(WikipediaConstants.TABLENAME_TEXT);
assert(textTable != null) : "Failed to find " + WikipediaConstants.TABLENAME_TEXT;
Column textTableColumn = textTable.getColumns().getIgnoreCase("OLD_TEXT");
assert(textTableColumn != null) : "Failed to find " + WikipediaConstants.TABLENAME_TEXT + ".OLD_TEXT";
int max_text_length = textTableColumn.getSize();
// REVISION
Table revTable = catalog_db.getTables().get(WikipediaConstants.TABLENAME_REVISION);
assert(revTable != null) : "Failed to find " + WikipediaConstants.TABLENAME_REVISION;
VoltTable vtText = CatalogUtil.getVoltTable(textTable);
VoltTable vtRev = CatalogUtil.getVoltTable(revTable);
int num_txt_cols = textTable.getColumns().size();
int num_rev_cols = revTable.getColumns().size();
int batchSize = 0;
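// Skew which users make the revisions: drawing user ids from a Zipfian
// distribution means a small group of editors produces most of the edits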
Zipf h_users = new Zipf(this.randGenerator, 1, util.num_users, WikipediaConstants.REVISION_USER_SIGMA);
FlatHistogram<Integer> h_textLength = new FlatHistogram<Integer>(this.randGenerator, TextHistograms.TEXT_LENGTH);
FlatHistogram<Integer> h_nameLength = new FlatHistogram<Integer>(this.randGenerator, UserHistograms.NAME_LENGTH);
FlatHistogram<Integer> h_numRevisions = new FlatHistogram<Integer>(this.randGenerator, PageHistograms.REVISIONS_PER_PAGE);
int lastPercent = -1;
for (int pageId = firstPageId; pageId <= lastPageId; pageId++) {
// There must be at least one revision per page
int num_revised = h_numRevisions.nextValue().intValue();
// Generate the initial text for this page's first revision
int old_text_length = h_textLength.nextValue().intValue();
if (trace.val) LOG.trace("Max length:" + max_text_length + " old_text_length:" + old_text_length);
assert(old_text_length > 0);
assert(old_text_length < max_text_length);
char old_text[] = TextGenerator.randomChars(randGenerator, old_text_length);
long batchBytes = 0;
for (int i = 0; i < num_revised; i++) {
// Pick the user who is making this revision and make sure that
// we always update their revision counter
int user_id = h_users.nextInt();
assert(user_id > 0 && user_id <= util.num_users) : "Invalid UserId '" + user_id + "'";
this.user_revision_ctr[user_id-1]++;
TimestampType timestamp = new TimestampType();
// Generate what the new revision is going to be
if (i > 0) {
old_text = util.generateRevisionText(old_text);
old_text_length = old_text.length;
}
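// Revision ids are per-page counters; record the latest id and text
// length so that updateCounters() can push them out at the end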
int rev_id = ++this.page_last_rev_id[pageId-1];
this.page_last_rev_length[pageId-1] = old_text_length;
// TEXT
Object row[] = new Object[num_txt_cols];
int col = 0;
row[col++] = rev_id; // old_id
row[col++] = new String(old_text); // old_text
row[col++] = "utf-8"; // old_flags
row[col++] = pageId; // old_page
vtText.addRow(row);
// The REV_USER_TEXT field is usually the username, but we'll just
// put in gibberish for now
String user_text = new String(TextGenerator.randomChars(randGenerator, h_nameLength.nextValue().intValue()));
String rev_comment = new String(TextGenerator.randomChars(randGenerator, util.h_commentLength.nextValue().intValue()));
int minor_edit = util.h_minorEdit.nextValue().intValue();
// REVISION
col = 0;
row = new Object[num_rev_cols];
row[col++] = rev_id; // rev_id
row[col++] = pageId; // rev_page
row[col++] = rev_id; // rev_text_id
row[col++] = rev_comment; // rev_comment
row[col++] = user_id; // rev_user
row[col++] = user_text; // rev_user_text
row[col++] = timestamp; // rev_timestamp
row[col++] = minor_edit; // rev_minor_edit
row[col++] = 0; // rev_deleted
row[col++] = old_text.length; // rev_len
row[col++] = 0; // rev_parent_id
vtRev.addRow(row);
if (trace.val) LOG.trace(String.format("%s [pageId=%05d / revId=%05d]",
revTable.getName(), pageId, rev_id));
batchBytes += old_text.length;
batchSize++;
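// Flush once the batch exceeds the row limit or ~16MB (16777216 bytes) of generated text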
if (batchSize > WikipediaConstants.BATCH_SIZE || batchBytes >= 16777216) {
this.loadVoltTable(textTable.getName(), vtText);
this.loadVoltTable(revTable.getName(), vtRev);
vtText.clearRowData();
vtRev.clearRowData();
batchSize = 0;
batchBytes = 0;
}
} // FOR (revision)
// XXX: We have to push out the batch after each page, because sometimes we
// generate a batch that is too large and we lose our connection to the database
this.loadVoltTable(textTable.getName(), vtText);
this.loadVoltTable(revTable.getName(), vtRev);
vtText.clearRowData();
vtRev.clearRowData();
batchSize = 0;
if (debug.val) {
int percent = (int) (((double) this.page_counter.incrementAndGet() / (double) util.num_pages) * 100);
if (percent != lastPercent) LOG.debug("REVISIONS (" + percent + "%)");
lastPercent = percent;
}
} // FOR (page)
if (batchSize > 0) {
this.loadVoltTable(textTable.getName(), vtText);
this.loadVoltTable(revTable.getName(), vtRev);
vtText.clearRowData();
vtRev.clearRowData();
}
if (debug.val) LOG.debug(textTable.getName() + " Loaded");
if (debug.val) LOG.debug(revTable.getName() + " Loaded");
}
}