Package com.taobao.top.analysis.util.bloom

Source Code of com.taobao.top.analysis.util.bloom.DistinctCountTool

/**
*
*/
package com.taobao.top.analysis.util.bloom;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang.StringUtils;

import com.taobao.top.analysis.statistics.data.DistinctCountEntryValue;


/**
* @author fangweng
* email: fangweng@taobao.com
* 上午12:19:43
*
*/
public class DistinctCountTool {
 
  public static void main(String[] args) throws IOException
  {
    if (args == null || (args != null && args.length > 3))
      System.out.println("DistinctTool command : destFiles groupby distinctcolumn bloomFileMaxKeys errorRate");
   
    long beg = System.currentTimeMillis();
   
    Map<String,DistinctCountEntryValue> result = new HashMap<String,DistinctCountEntryValue>();
   
    String destFiles = args[0]
    int distinctColumn = Integer.parseInt(args[2]);
    String[] gby = args[1].split(",");
    Integer[] groupby = new Integer[gby.length];
   
    for(int i = 0; i < gby.length; i++)
    {
      groupby[i] = Integer.parseInt(gby[i]);
    }
   
   
    File f = new File(destFiles);
   
    if (f.exists())
    {
      File[] fs;
      if (f.isDirectory())
      {
        fs = f.listFiles();
      }
      else
        fs = new File[]{f};
     
      for(int i = 0 ; i < fs.length; i++)
      {
        if (args.length == 5)
          doDistinct(fs[i],Integer.parseInt(args[3]),Float.parseFloat(args[4]),distinctColumn,groupby,result);
        else
          doDistinct(fs[i],100000,0.0001F,distinctColumn,groupby,result);
      }
     
     
      System.out.println("time consume : " + (System.currentTimeMillis() - beg) + "result size :" + result.size());
     
      new File("out.txt").createNewFile();
      File out = new File("out.txt");
      java.io.BufferedWriter bw = new java.io.BufferedWriter(new java.io.FileWriter(out));
     
      try
      {
        for(Map.Entry<String,DistinctCountEntryValue> e : result.entrySet())
        {
          bw.write(new StringBuilder().append(e.getKey()).append(",").append(e.getValue().getCount()).append("\r\n").toString());
        }
      }
      finally
      {
        if (bw != null)
          bw.close();
      }
     
    }
    else
    {
      System.out.println("desfFiles not exist : " + destFiles);
    }
   
  }
 
  static void doDistinct(File f,int maxKeys,float errorRate,int distinctColumn,Integer[] groupby,Map<String,DistinctCountEntryValue>result) throws IOException
  {
    java.io.BufferedReader br = new java.io.BufferedReader(new java.io.FileReader(f));
   
    try
    {
      String c = null;
     
      while((c = br.readLine() )!= null)
      {
        try
        {
          String[] contents = StringUtils.splitByWholeSeparator(c, "%!");
         
          if (contents.length == 0 || (contents.length < distinctColumn))
            continue;
         
          StringBuilder key = new StringBuilder();
         
          for (Integer k : groupby)
          {
            key.append(contents[k]).append("--");
          }
         
          DistinctCountEntryValue distinctEntry = result.get(key.toString());
         
          if (distinctEntry == null)
          {
            distinctEntry = new DistinctCountEntryValue();
            ByteBloomFilter bloomFilter;
           
            bloomFilter = new ByteBloomFilter(maxKeys,errorRate,1);
            distinctEntry.setBloomFilter(bloomFilter);
            result.put(key.toString(), distinctEntry);
          }
         
          distinctEntry.add(contents[distinctColumn]);
        }
        catch(Exception ex)
        {
          System.out.print(ex.getCause());
        }

      }
    }
    finally
    {
      if (br != null)
        br.close();
    }

  }

}
TOP

Related Classes of com.taobao.top.analysis.util.bloom.DistinctCountTool

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.