/**
*
*/
package com.taobao.top.analysis.util.bloom;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import com.taobao.top.analysis.statistics.data.DistinctCountEntryValue;
/**
 * Command-line tool that reads one file (or every regular file directly inside
 * a directory), splits each line on the separator {@code "%!"}, groups the
 * lines by the requested columns and adds the value of one further column to a
 * per-group {@link DistinctCountEntryValue} backed by a {@link ByteBloomFilter}.
 * The per-group result of {@code getCount()} is written to {@code out.txt} in
 * the working directory, one {@code key,count} line per group.
 *
 * <p>Arguments: {@code destFiles groupby distinctcolumn [bloomFileMaxKeys errorRate]}
 * <ul>
 * <li>{@code destFiles} - input file or directory</li>
 * <li>{@code groupby} - comma separated, zero-based column indexes forming the group key</li>
 * <li>{@code distinctcolumn} - zero-based index of the column added to the group entry</li>
 * <li>{@code bloomFileMaxKeys errorRate} - optional pair; 100000 and 0.0001 are
 * used unless both are supplied</li>
 * </ul>
 *
 * @author fangweng
 * email: fangweng@taobao.com
 * 12:19:43 AM
 *
 */
public class DistinctCountTool {

    /** Usage text printed when too few arguments are supplied. */
    private static final String USAGE =
        "DistinctTool command : destFiles groupby distinctcolumn bloomFileMaxKeys errorRate";

    /** Bloom filter key capacity used when not given on the command line. */
    private static final int DEFAULT_MAX_KEYS = 100000;

    /** Bloom filter error rate used when not given on the command line. */
    private static final float DEFAULT_ERROR_RATE = 0.0001F;

    /** Separator between columns of an input line. */
    private static final String FIELD_SEPARATOR = "%!";

    /** Suffix appended after every group-by column when building the group key. */
    private static final String KEY_SEPARATOR = "--";

    /** Name of the result file, created in the current working directory. */
    private static final String OUTPUT_FILE = "out.txt";

    /**
     * Entry point: parses the arguments, processes every input file and writes
     * the result file.
     *
     * @param args {@code destFiles groupby distinctcolumn [bloomFileMaxKeys errorRate]}
     * @throws IOException if an input file cannot be read or the result file cannot be written
     * @throws NumberFormatException if a numeric argument is not a valid number
     */
    public static void main(String[] args) throws IOException
    {
        // The three leading arguments are mandatory; without them there is nothing to do.
        if (args == null || args.length < 3)
        {
            System.out.println(USAGE);
            return;
        }

        long beg = System.currentTimeMillis();
        Map<String,DistinctCountEntryValue> result = new HashMap<String,DistinctCountEntryValue>();

        String destFiles = args[0];
        int distinctColumn = Integer.parseInt(args[2]);

        String[] gby = args[1].split(",");
        Integer[] groupby = new Integer[gby.length];
        for (int i = 0; i < gby.length; i++)
        {
            groupby[i] = Integer.parseInt(gby[i]);
        }

        // The optional bloom filter settings only apply when both are present;
        // they are parsed once here instead of once per input file.
        int maxKeys = DEFAULT_MAX_KEYS;
        float errorRate = DEFAULT_ERROR_RATE;
        if (args.length >= 5)
        {
            maxKeys = Integer.parseInt(args[3]);
            errorRate = Float.parseFloat(args[4]);
        }

        File f = new File(destFiles);
        if (!f.exists())
        {
            System.out.println("desfFiles not exist : " + destFiles);
            return;
        }

        File[] fs;
        if (f.isDirectory())
        {
            fs = f.listFiles();
            // listFiles() returns null when the directory cannot be read.
            if (fs == null)
            {
                System.err.println("can not list files in directory : " + destFiles);
                fs = new File[0];
            }
        }
        else
        {
            fs = new File[]{f};
        }

        for (int i = 0; i < fs.length; i++)
        {
            // Sub-directories cannot be opened by FileReader; skip them instead of aborting.
            if (!fs[i].isFile())
            {
                continue;
            }
            doDistinct(fs[i], maxKeys, errorRate, distinctColumn, groupby, result);
        }

        System.out.println("time consume : " + (System.currentTimeMillis() - beg) + "result size :" + result.size());

        writeResult(result, new File(OUTPUT_FILE));
    }

    /**
     * Reads {@code f} line by line and accumulates the distinct-column value of
     * every usable line into {@code result}. Lines that lack the distinct column
     * or any group-by column are skipped. A failure while processing a single
     * line is reported on {@code System.err} and does not stop the file.
     *
     * @param f input file
     * @param maxKeys first constructor argument of each new {@link ByteBloomFilter}
     * @param errorRate second constructor argument of each new {@link ByteBloomFilter}
     * @param distinctColumn zero-based index of the column passed to {@code add}
     * @param groupby zero-based indexes of the columns forming the group key
     * @param result map from group key to its entry; updated in place
     * @throws IOException if the file cannot be opened or read
     */
    static void doDistinct(File f,int maxKeys,float errorRate,int distinctColumn,Integer[] groupby,Map<String,DistinctCountEntryValue>result) throws IOException
    {
        java.io.BufferedReader br = new java.io.BufferedReader(new java.io.FileReader(f));
        try
        {
            String line;
            while ((line = br.readLine()) != null)
            {
                try
                {
                    String[] contents = StringUtils.splitByWholeSeparator(line, FIELD_SEPARATOR);

                    // contents[distinctColumn] is read below, so the index itself must exist.
                    if (!hasColumn(contents, distinctColumn))
                    {
                        continue;
                    }

                    String key = buildKey(contents, groupby);
                    if (key == null)
                    {
                        continue;
                    }

                    DistinctCountEntryValue distinctEntry = result.get(key);
                    if (distinctEntry == null)
                    {
                        distinctEntry = new DistinctCountEntryValue();
                        // NOTE(review): meaning of the third constructor argument (1) is not
                        // visible in this file - confirm against ByteBloomFilter.
                        distinctEntry.setBloomFilter(new ByteBloomFilter(maxKeys, errorRate, 1));
                        result.put(key, distinctEntry);
                    }

                    distinctEntry.add(contents[distinctColumn]);
                }
                catch (Exception ex)
                {
                    // Best effort per line: report the real exception and keep going.
                    System.err.println("skip line of " + f + " : " + ex);
                }
            }
        }
        finally
        {
            br.close();
        }
    }

    /**
     * Tells whether {@code index} addresses an existing element of {@code contents}.
     */
    private static boolean hasColumn(String[] contents, int index)
    {
        return contents != null && index >= 0 && index < contents.length;
    }

    /**
     * Builds the group key by appending every group-by column followed by
     * {@code "--"}; returns {@code null} when a group-by column is missing.
     */
    private static String buildKey(String[] contents, Integer[] groupby)
    {
        StringBuilder key = new StringBuilder();
        for (Integer k : groupby)
        {
            if (k == null || !hasColumn(contents, k.intValue()))
            {
                return null;
            }
            key.append(contents[k.intValue()]).append(KEY_SEPARATOR);
        }
        return key.toString();
    }

    /**
     * Writes one {@code key,count} line (terminated by CRLF) per map entry to {@code out}.
     */
    private static void writeResult(Map<String,DistinctCountEntryValue> result, File out) throws IOException
    {
        // FileWriter creates the file when it does not exist, no createNewFile() needed.
        java.io.BufferedWriter bw = new java.io.BufferedWriter(new java.io.FileWriter(out));
        try
        {
            for (Map.Entry<String,DistinctCountEntryValue> e : result.entrySet())
            {
                bw.write(new StringBuilder().append(e.getKey()).append(",").append(e.getValue().getCount()).append("\r\n").toString());
            }
        }
        finally
        {
            bw.close();
        }
    }
}