package org.deri.grefine.reconcile.commands;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.deri.grefine.reconcile.sindice.SindiceBroker;
import org.deri.grefine.reconcile.util.GRefineJsonUtilitiesImpl;
import org.json.JSONWriter;
import com.google.refine.commands.Command;
import com.google.refine.expr.ExpressionUtils;
import com.google.refine.model.Column;
import com.google.refine.model.Project;
import com.google.refine.model.Row;
/**
 * Command that suggests candidate domains for a project column by sampling
 * its cell values and asking {@link SindiceBroker} to guess a domain for each
 * sample.
 *
 * <p>The JSON response is an object with a {@code "code"} field that is either
 * {@code "ok"} (accompanied by a {@code "domains"} array) or {@code "error"}
 * (accompanied by a {@code "message"} string).
 */
public class SindiceGuessTypeCommand extends Command {

    /** Maximum number of distinct, non-blank cell values sampled from the column. */
    final static int s_sampleSize = 10;

    /** Limit passed to {@link SindiceBroker#guessDomain} for each sampled value. */
    final static int s_resultsLimit = 3;

    /**
     * Handles the request: reads the {@code columnName} parameter, guesses the
     * domains for that column and writes the JSON result.
     *
     * @param request  the HTTP request; must identify the project and carry
     *                 a {@code columnName} parameter
     * @param response the HTTP response that receives the JSON payload
     * @throws ServletException declared by the servlet contract
     * @throws IOException      declared by the servlet contract
     */
    @Override
    public void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        try {
            Project project = getProject(request);
            String columnName = request.getParameter("columnName");

            response.setCharacterEncoding("UTF-8");
            response.setHeader("Content-Type", "application/json");

            JSONWriter writer = new JSONWriter(response.getWriter());
            writer.object();

            Column column = project.columnModel.getColumnByName(columnName);
            if (column == null) {
                writer.key("code");
                writer.value("error");
                writer.key("message");
                writer.value("No such column");
            } else {
                // Run the lookup BEFORE emitting any status. Writing "code":"ok"
                // first and then "code":"error" from a catch block would put
                // two "code" keys into the same JSON object.
                List<String> domains = null;
                String failure = null;
                try {
                    domains = guessDomain(project, column);
                } catch (Exception e) {
                    failure = (e.getMessage() != null) ? e.getMessage() : e.toString();
                }

                if (domains == null) {
                    writer.key("code");
                    writer.value("error");
                    writer.key("message");
                    writer.value(failure);
                } else {
                    writer.key("code");
                    writer.value("ok");
                    writer.key("domains");
                    writer.array();
                    for (String domain : domains) {
                        writer.value(domain);
                    }
                    writer.endArray();
                }
            }

            writer.endObject();
        } catch (Exception e) {
            respondException(response, e);
        }
    }

    /**
     * Collects up to {@link #s_sampleSize} distinct, trimmed, non-blank values
     * from the column, queries the broker for each one, and ranks the returned
     * domains by how many samples produced them.
     *
     * @param project the project whose rows are scanned
     * @param column  the column to sample
     * @return the distinct domains, most frequently returned first; empty if
     *         no sample yielded a domain
     */
    private List<String> guessDomain(Project project, Column column) {
        int cellIndex = column.getCellIndex();

        // Gather distinct samples in row order; the set only de-duplicates.
        List<String> samples = new ArrayList<String>(s_sampleSize);
        Set<String> seen = new HashSet<String>();
        for (Row row : project.rows) {
            Object value = row.getCellValue(cellIndex);
            if (!ExpressionUtils.isNonBlankData(value)) {
                continue;
            }
            String text = value.toString().trim();
            if (seen.add(text)) {
                samples.add(text);
                if (samples.size() >= s_sampleSize) {
                    break;
                }
            }
        }

        // Tally how often each domain is returned across the samples.
        Map<String, Integer> counts = new HashMap<String, Integer>();
        SindiceBroker service = new SindiceBroker();
        for (String sample : samples) {
            List<String> guessed =
                    service.guessDomain(sample, s_resultsLimit, new GRefineJsonUtilitiesImpl());
            for (String domain : guessed) {
                Integer previous = counts.get(domain);
                // First sighting counts as 1 (it used to be recorded as 0).
                counts.put(domain, previous == null ? 1 : previous.intValue() + 1);
            }
        }

        // Order by descending count.
        List<Entry<String, Integer>> ranked =
                new ArrayList<Entry<String, Integer>>(counts.entrySet());
        Collections.sort(ranked, new Comparator<Entry<String, Integer>>() {
            public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) {
                return o2.getValue().compareTo(o1.getValue());
            }
        });

        List<String> domains = new ArrayList<String>(ranked.size());
        for (Entry<String, Integer> entry : ranked) {
            domains.add(entry.getKey());
        }
        return domains;
    }
}