public void generateJson(String prefix, PrintWriter pw,
VWorkspace vWorkspace) {
VWorksheet vWorksheet = vWorkspace.getViewFactory().getVWorksheetByWorksheetId(worksheetId);
Worksheet worksheet = vWorksheet.getWorksheet();
List<HNodePath> columnPaths = worksheet.getHeaders().getAllPaths();
ColumnMetadata colMetadata = worksheet.getMetadataContainer().getColumnMetadata();
List<String> columnsInvoked = new ArrayList<String>();
for (HNodePath path:columnPaths) {
String leafHNodeId = path.getLeaf().getId();
List<Node> nodes = new ArrayList<Node>(Math.max(1000, worksheet.getDataTable().getNumRows()));
worksheet.getDataTable().collectNodes(path, nodes, selection);
final int sampleSize = (nodes.size() > 1000) ? 1000 : nodes.size();
try {
// Invoke the cleaning service only if the column metadata has no histogram data yet, or if updates are forced
if (colMetadata.getColumnHistogramData(leafHNodeId) == null
|| forceUpdates) {
// Prepare the input data for the cleaning service
JSONArray requestJsonArray = new JSONArray();
if (sampleSize == nodes.size()) {
for (Node node : nodes) {
JSONObject jsonRecord = new JSONObject();
jsonRecord.put(, node.getId());
String originalVal = node.getValue().asString();
originalVal = originalVal == null ? "" : originalVal;
jsonRecord.put(, originalVal);
else {
Set<Integer> randomNums = new HashSet<Integer>();
Random gen = new Random();
for (int i = 0; i < sampleSize; i++) {
int r = gen.nextInt(nodes.size());
while (randomNums.contains(r))
r = gen.nextInt(nodes.size());
Node node = nodes.get(r);
JSONObject jsonRecord = new JSONObject();
jsonRecord.put(, node.getId());
String originalVal = node.getValue().asString();
originalVal = originalVal == null ? "" : originalVal;
jsonRecord.put(, originalVal);
//TODO put the estimate back
if (requestJsonArray.length() == 0) {
logger.error("Empty values input for path" + path.toColumnNamePath());
String cleaningServiceURL = ServletContextParameterMap.getParameterValue(
Map<String, String> formParams = new HashMap<String, String>();
formParams.put(, requestJsonArray.toString());
String reqResponse = HTTPUtil.executeHTTPPostRequest(cleaningServiceURL, null,
null, formParams);
// logger.debug("***");
// logger.debug(path.getLeaf().getColumnName());
// logger.debug(reqResponse);
try {
// Test if the output is valid JSON object. Throws exception if not.
JSONObject output = new JSONObject(reqResponse);
long sampleRate = Math.round(nodes.size() * 1.0 / sampleSize);
JSONArray array = new JSONArray(output.getString("histogram"));
for (int i = 0; i < array.length(); i++) {
JSONObject obj = array.getJSONObject(i);
long value = Integer.parseInt(obj.getString("Frequency")) * sampleRate;
obj.put("Frequency", value);
output.put("histogram", array.toString());
// Add to the metadata if valid
colMetadata.addColumnHistogramData(leafHNodeId, output);
// Parse the request response to populate the column metadata for the worksheet
int colLength = getColumnLength(path.getLeaf(), output,
colMetadata.addColumnPreferredLength(leafHNodeId, colLength);
// Add the hNodeId to the list for which we invoked successfully
} catch (JSONException e) {
logger.error("Error occured with cleaning service for HNode: "
+ path.toColumnNamePath(), e);
// Set to a default column word length
colMetadata.addColumnPreferredLength(leafHNodeId, DEFAULT_COLUMN_LENGTH);
} catch (Exception e) {
logger.error("Error while invoking cleaning service", e);
// Prepare the Update that is going to be sent to the browser
JSONObject response = new JSONObject();
try {
response.put(, this.getClass().getSimpleName());
response.put(, worksheetId);
JSONArray chartData = new JSONArray();
for (String hNodeId:columnsInvoked) {
JSONObject columnChartData = new JSONObject();
columnChartData.put(, hNodeId);
try {
} catch (JSONException e) {
logger.error("Error occured with cleaning service for HNode: " + hNodeId, e);