/*******************************************************************************
* oltpbenchmark.com
*
* Project Info: http://oltpbenchmark.com
* Project Members: Carlo Curino <carlo.curino@gmail.com>
* Evan Jones <ej@evanjones.ca>
* DIFALLAH Djellel Eddine <djelleleddine.difallah@unifr.ch>
* Andy Pavlo <pavlo@cs.brown.edu>
* CUDRE-MAUROUX Philippe <philippe.cudre-mauroux@unifr.ch>
* Yang Zhang <yaaang@gmail.com>
*
*
* This library is free software; you can redistribute it and/or modify it under the terms
* of the GNU General Public License as published by the Free Software Foundation;
* either version 3.0 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
******************************************************************************/
package edu.brown.benchmark.wikipedia.util;
import java.util.Random;
import edu.brown.benchmark.wikipedia.WikipediaConstants;
import edu.brown.benchmark.wikipedia.data.PageHistograms;
import edu.brown.benchmark.wikipedia.data.RevisionHistograms;
import edu.brown.benchmark.wikipedia.data.UserHistograms;
import edu.brown.rand.RandomDistribution.FlatHistogram;
import edu.brown.rand.RandomDistribution.Zipf;
/**
 * Helper class that contains all of the useful information about the benchmark database:
 * the scaled number of users and pages, plus the random distributions that are sampled
 * when generating users, pages, watchlists and revisions.
 * <p>
 * Every distribution owned by an instance (including the revision-delta histograms used by
 * {@link #generateRevisionText(char[])}) draws from the {@link Random} handed to the
 * constructor, so the output is driven entirely by that generator.
 * @author pavlo
 */
public final class WikipediaUtil {

    /** Random number generator backing every distribution owned by this instance. */
    private final Random rand;

    /** Multiplier applied to the base USERS/PAGES constants. */
    private final double scaleFactor;

    /** WikipediaConstants.USERS scaled by the scale factor and rounded to the nearest int. */
    public final int num_users;

    /** WikipediaConstants.PAGES scaled by the scale factor and rounded to the nearest int. */
    public final int num_pages;

    /** Sampler built from UserHistograms.NAME_LENGTH. */
    public final FlatHistogram<Integer> h_nameLength;

    /** Sampler built from UserHistograms.REAL_NAME_LENGTH. */
    public final FlatHistogram<Integer> h_realNameLength;

    /** Sampler built from UserHistograms.REVISION_COUNT. */
    public final FlatHistogram<Integer> h_revCount;

    /** Sampler built from PageHistograms.TITLE_LENGTH. */
    public final FlatHistogram<Integer> h_titleLength;

    /** Sampler built from PageHistograms.RESTRICTIONS. */
    public final FlatHistogram<String> h_restrictions;

    /** Zipf distribution constructed with range arguments (0, num_pages) and NUM_WATCHES_PER_USER_SIGMA. */
    public final Zipf h_watchPageCount;

    /** Zipf distribution constructed with range arguments (1, num_pages) and WATCHLIST_PAGE_SIGMA. */
    public final Zipf h_watchPageId;

    /** Sampler built from RevisionHistograms.COMMENT_LENGTH. */
    public final FlatHistogram<Integer> h_commentLength;

    /** Sampler built from RevisionHistograms.MINOR_EDIT. */
    public final FlatHistogram<Integer> h_minorEdit;

    /**
     * One delta-size sampler per entry of RevisionHistograms.REVISION_DELTA_SIZES.
     * Built once here instead of on every generateRevisionText() call.
     */
    private final FlatHistogram<Integer>[] h_revisionDeltas;

    /**
     * Constructor
     * @param rand the random number generator shared by all distributions of this instance
     * @param scaleFactor multiplier applied to WikipediaConstants.USERS and WikipediaConstants.PAGES
     */
    public WikipediaUtil(Random rand, double scaleFactor) {
        this.rand = rand;
        this.scaleFactor = scaleFactor;

        this.num_users = (int) Math.round(WikipediaConstants.USERS * this.scaleFactor);
        this.num_pages = (int) Math.round(WikipediaConstants.PAGES * this.scaleFactor);

        this.h_nameLength = new FlatHistogram<Integer>(this.rand, UserHistograms.NAME_LENGTH);
        this.h_realNameLength = new FlatHistogram<Integer>(this.rand, UserHistograms.REAL_NAME_LENGTH);
        this.h_revCount = new FlatHistogram<Integer>(this.rand, UserHistograms.REVISION_COUNT);

        this.h_titleLength = new FlatHistogram<Integer>(this.rand, PageHistograms.TITLE_LENGTH);
        this.h_restrictions = new FlatHistogram<String>(this.rand, PageHistograms.RESTRICTIONS);

        this.h_watchPageCount = new Zipf(this.rand, 0, this.num_pages, WikipediaConstants.NUM_WATCHES_PER_USER_SIGMA);
        this.h_watchPageId = new Zipf(this.rand, 1, this.num_pages, WikipediaConstants.WATCHLIST_PAGE_SIGMA);

        this.h_commentLength = new FlatHistogram<Integer>(this.rand, RevisionHistograms.COMMENT_LENGTH);
        this.h_minorEdit = new FlatHistogram<Integer>(this.rand, RevisionHistograms.MINOR_EDIT);

        this.h_revisionDeltas = buildRevisionDeltas(this.rand);
    }

    /**
     * Build one sampler for each histogram in RevisionHistograms.REVISION_DELTAS, sized by
     * the length of RevisionHistograms.REVISION_DELTA_SIZES.
     * @param rand the random number generator the samplers will draw from
     * @return the array of samplers, index-aligned with REVISION_DELTA_SIZES
     */
    @SuppressWarnings("unchecked") // generic array creation; only FlatHistogram<Integer> is ever stored
    private static FlatHistogram<Integer>[] buildRevisionDeltas(Random rand) {
        FlatHistogram<Integer>[] deltas =
                (FlatHistogram<Integer>[]) new FlatHistogram[RevisionHistograms.REVISION_DELTA_SIZES.length];
        for (int i = 0; i < deltas.length; i++) {
            deltas[i] = new FlatHistogram<Integer>(rand, RevisionHistograms.REVISION_DELTAS[i]);
        } // FOR
        return (deltas);
    }

    /**
     * Return the namespace for the given pageId.
     * This is currently a stub: it returns 0 for every pageId.
     * @param pageId the page identifier (currently ignored)
     * @return always 0
     */
    public int getPageNameSpace(long pageId) {
        return (0);
    }

    /**
     * Generate the text of a new revision from the text of the previous one.
     * A size delta is sampled from the histogram matching the length of the original text,
     * the text is resized by that delta through TextGenerator.resizeText(), and the result
     * is then passed through TextGenerator.permuteText() so that it differs from the
     * previous revision.
     * @param orig_text the text of the previous revision; must not be null
     * @return the array returned by TextGenerator.permuteText(); whether orig_text itself
     *         is modified depends on TextGenerator and cannot be determined from this class
     */
    public char[] generateRevisionText(char orig_text[]) {
        // Pick the histogram for the FIRST size threshold that the original text fits under.
        // The last histogram is the catch-all for texts larger than every checked threshold.
        // NOTE(review): this assumes REVISION_DELTA_SIZES is sorted in ascending order;
        // confirm against RevisionHistograms. Without the break, the last matching
        // threshold won and the histograms for the smaller sizes were never selected.
        FlatHistogram<Integer> h = this.h_revisionDeltas[this.h_revisionDeltas.length - 1];
        for (int i = 0; i < this.h_revisionDeltas.length - 1; i++) {
            if (orig_text.length <= RevisionHistograms.REVISION_DELTA_SIZES[i]) {
                h = this.h_revisionDeltas[i];
                break;
            }
        } // FOR

        // Figure out how much we are going to change.
        // If the sampled delta would shrink the text to nothing (or less), remove roughly
        // two-thirds of the original length instead. For very short texts that rounding
        // would still remove everything, so in that case the delta is halved.
        // There is probably some sort of minimal size that we should adhere to.
        int delta = h.nextValue().intValue();
        if (orig_text.length + delta <= 0) {
            delta = -1 * (int) Math.round(orig_text.length / 1.5);
            if (Math.abs(delta) == orig_text.length && delta < 0) {
                delta /= 2;
            }
        }
        if (delta != 0) {
            orig_text = TextGenerator.resizeText(this.rand, orig_text, delta);
        }

        // And permute it a little bit. This ensures that the text is slightly
        // different than the last revision
        orig_text = TextGenerator.permuteText(this.rand, orig_text);

        return (orig_text);
    }
}