/*
* Copyright (C) 2012 Chris Neasbitt
* Author: Chris Neasbitt
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package edu.uga.cs.fluxbuster.features;
import java.io.IOException;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.Formatter;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.TreeMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.joda.time.DateTime;
import org.joda.time.Days;
import edu.uga.cs.fluxbuster.db.DBInterface;
import edu.uga.cs.fluxbuster.db.DBInterfaceFactory;
import edu.uga.cs.fluxbuster.utils.PropertiesUtils;
/**
* This class calculates longitudinal features of each cluster and
* stores them in the database.
*
* @author Chris Neasbitt
*/
public class FeatureCalculator {

    /** Database interface used for all feature queries and updates. */
    private DBInterface dbi = null;

    /** Properties loaded from FeatureCalculator.properties (SQL templates, windows). */
    private Properties properties = null;

    /**
     * Formats run dates as yyyyMMdd for table-name suffixes and query parameters.
     * NOTE(review): SimpleDateFormat is not thread-safe; this class assumes
     * single-threaded use — confirm before sharing an instance across threads.
     */
    private SimpleDateFormat df = null;

    // Cache of the most recent getPrevDates() result. A call with the same
    // run date and an equal-or-smaller window is answered from this buffer
    // instead of re-querying the database for table names.
    private ArrayList<Date> prevDateBuf = null;
    private Date prevDateBufDate = null;
    private int prevDateBufWindow = 0;

    // Keys into FeatureCalculator.properties for externally configured SQL
    // query templates, table-name prefixes, and window sizes.
    private static final String TABLES_QUERY1KEY = "TABLES_QUERY1";
    private static final String DOMAINSPREFIXKEY = "DOMAINS_TABLE_PREFIX";
    private static final String RESIPSPREFIXKEY = "RESIPS_TABLE_PREFIX";
    private static final String NOVELTY_QUERY1_1KEY = "NOVELTY_QUERY1_PART1";
    private static final String NOVELTY_QUERY1_2KEY = "NOVELTY_QUERY1_PART2";
    private static final String NOVELTY_QUERY1_3KEY = "NOVELTY_QUERY1_PART3";
    private static final String NOVELTY_QUERY2KEY = "NOVELTY_QUERY2";
    private static final String NOVELTY_WINDOWSKEY = "NOVELTY_WINDOWS";
    private static final String NOVELTY_WINFIELDSKEY = "NOVELTY_WINDOW_FIELDS";
    private static final String NOVELTY_QUERY3KEY = "NOVELTY_QUERY3";
    private static final String PREVCLUSTER_QUERY1KEY = "PREVCLUSTER_QUERY1";
    private static final String PREVCLUSTER_QUERY2KEY = "PREVCLUSTER_QUERY2";
    private static final String PREVCLUSTER_QUERY3KEY = "PREVCLUSTER_QUERY3";
    private static final String PREVCLUSTER_QUERY4KEY = "PREVCLUSTER_QUERY4";
    private static final String PREVCLUSTER_WINDOWKEY = "PREVCLUSTER_WINDOW";
    private static final String DOMAINSPERNETWORK_WINDOWKEY = "DOMAINSPERNETWORK_WINDOW";
    private static final String DOMAINSPERNETWORK_QUERY1KEY = "DOMAINSPERNETWORK_QUERY1";
    private static final String DOMAINSPERNETWORK_QUERY2KEY = "DOMAINSPERNETWORK_QUERY2";
    private static final String DOMAINSPERNETWORK_QUERY3KEY = "DOMAINSPERNETWORK_QUERY3";

    private static Log log = LogFactory.getLog(FeatureCalculator.class);

    /**
     * Instantiates a new feature calculator.
     *
     * @throws IOException if the FeatureCalculator.properties file
     *      can not be read
     */
    public FeatureCalculator() throws IOException {
        this(DBInterfaceFactory.loadDBInterface());
    }

    /**
     * Instantiates a new feature calculator with a specific database
     * interface.
     *
     * @param dbi the database interface
     * @throws IOException if the FeatureCalculator.properties file
     *      can not be read
     */
    public FeatureCalculator(DBInterface dbi) throws IOException {
        this.dbi = dbi;
        properties = PropertiesUtils.loadProperties(this.getClass());
        df = new SimpleDateFormat("yyyyMMdd");
    }

    /**
     * Calculates the domains per network feature for each cluster generated
     * on a specific run date.
     *
     * @param log_date the run date
     * @param window the number of days previous to use in feature calculation
     * @return a table of values where the keys are cluster ids and the values
     *      are the feature values; empty if there are no previous run dates
     *      or the query fails (errors are logged, not rethrown)
     * @throws SQLException if there is an error calculating the feature values
     */
    public Map<Integer, Double> calculateDomainsPerNetwork(Date log_date,
            int window) throws SQLException {
        HashMap<Integer, Double> retval = new HashMap<Integer, Double>();
        ArrayList<Date> prevDates = getPrevDates(log_date, window);
        if (prevDates.size() > 0) {
            String logDateStr = df.format(log_date);

            // Build one sub-query fragment per previous run date, then splice
            // the concatenation into the outer query template.
            StringBuilder addQuery = new StringBuilder();
            Formatter formatter = new Formatter(addQuery);
            for (Date prevDate : prevDates) {
                String prevDateStr = df.format(prevDate);
                formatter.format(" " + properties.getProperty(DOMAINSPERNETWORK_QUERY1KEY) + " ",
                        logDateStr, prevDateStr, prevDateStr);
            }
            formatter.close();

            StringBuilder querybuf = new StringBuilder();
            formatter = new Formatter(querybuf);
            formatter.format(properties.getProperty(DOMAINSPERNETWORK_QUERY2KEY),
                    logDateStr, logDateStr, logDateStr, addQuery.toString());
            ResultSet rs = null;
            try {
                rs = dbi.executeQueryWithResult(querybuf.toString());
                while (rs.next()) {
                    retval.put(rs.getInt(1), rs.getDouble(2));
                }
            } catch (Exception e) {
                // Best-effort: query failures are logged and a partial (or
                // empty) result map is returned.
                if (log.isErrorEnabled()) {
                    log.error(e);
                }
            } finally {
                if (rs != null && !rs.isClosed()) {
                    rs.close();
                }
                formatter.close();
            }
        }
        return retval;
    }

    /**
     * Calculates the cluster novelty feature for each cluster generated
     * on a specific run date.
     *
     * @param log_date the run date
     * @param window the number of days previous to use in feature calculation
     * @return a table of values where the keys are cluster ids and the values
     *      are the feature values
     * @throws SQLException if there is an error calculating the feature values
     */
    public Map<Integer, Double> calculateNoveltyFeature(Date log_date,
            int window) throws SQLException {
        HashMap<Integer, Double> retval = new HashMap<Integer, Double>();
        ArrayList<Date> prevDates = getPrevDates(log_date, window);
        if (prevDates.size() > 0) {

            // Step 1: count newly-resolved IPs per (cluster, 2LD) by unioning
            // one query fragment per previous run date between the two fixed
            // template parts.
            StringBuilder querybuf = new StringBuilder();
            Formatter formatter = new Formatter(querybuf);
            String curdatestr = df.format(log_date);
            formatter.format(properties.getProperty(NOVELTY_QUERY1_1KEY),
                    curdatestr, curdatestr, curdatestr, curdatestr);
            for (Date prevDate : prevDates) {
                formatter.format(" "
                        + properties.getProperty(NOVELTY_QUERY1_2KEY)
                        + " ", df.format(prevDate));
            }
            formatter.format(properties.getProperty(NOVELTY_QUERY1_3KEY),
                    curdatestr, curdatestr);

            // cluster id -> (second-level domain -> count of new resolved ips)
            Hashtable<Integer, Hashtable<String, Long>> new_resolved_ips
                    = new Hashtable<Integer, Hashtable<String, Long>>();
            ResultSet rs2 = null;
            try {
                rs2 = dbi.executeQueryWithResult(querybuf.toString());
                while (rs2.next()) {
                    int cluster_id = rs2.getInt(2);
                    if (!new_resolved_ips.containsKey(cluster_id)) {
                        new_resolved_ips.put(cluster_id,
                                new Hashtable<String, Long>());
                    }
                    String secondLevelDomainName = rs2.getString(1);
                    long newips = rs2.getLong(3);
                    Hashtable<String, Long> clustertable = new_resolved_ips
                            .get(cluster_id);
                    clustertable.put(secondLevelDomainName, newips);
                }
            } catch (Exception e) {
                if (log.isErrorEnabled()) {
                    log.error(e);
                }
            } finally {
                if (rs2 != null && !rs2.isClosed()) {
                    rs2.close();
                }
                formatter.close();
            }

            // Step 2: for every 2LD, collect its age in days relative to the
            // run date, once per previous run date on which it appeared.
            Hashtable<String, List<Integer>> numDays = new Hashtable<String, List<Integer>>();
            for (Date prevDate : prevDates) {
                String prevDateStr = df.format(prevDate);
                querybuf = new StringBuilder();
                formatter = new Formatter(querybuf);
                formatter.format(properties.getProperty(NOVELTY_QUERY2KEY),
                        curdatestr, prevDateStr, curdatestr, prevDateStr);
                ResultSet rs3 = null;
                try {
                    rs3 = dbi.executeQueryWithResult(querybuf.toString());
                    while (rs3.next()) {
                        String sldn = rs3.getString(1);
                        if (!numDays.containsKey(sldn)) {
                            numDays.put(sldn, new ArrayList<Integer>());
                        }
                        Date pd = rs3.getDate(2);
                        DateTime start = new DateTime(pd.getTime());
                        DateTime end = new DateTime(log_date.getTime());
                        Days d = Days.daysBetween(start, end);
                        int diffDays = d.getDays();
                        numDays.get(sldn).add(diffDays);
                    }
                } catch (Exception e) {
                    if (log.isErrorEnabled()) {
                        log.error(e);
                    }
                } finally {
                    if (rs3 != null && !rs3.isClosed()) {
                        rs3.close();
                    }
                    formatter.close();
                }
            }

            // Step 3: per cluster, score each 2LD as
            // (new ip count) / (maximum observed age in days), then average
            // the per-domain scores into the cluster's novelty value.
            Hashtable<Integer, List<Float>> clusterValues = new Hashtable<Integer, List<Float>>();
            for (int clusterID : new_resolved_ips.keySet()) {
                clusterValues.put(clusterID, new ArrayList<Float>());
                Hashtable<String, Long> sldnValues = new_resolved_ips.get(clusterID);
                for (String sldn : sldnValues.keySet()) {
                    if (numDays.containsKey(sldn)) {
                        long newIPCount = sldnValues.get(sldn);
                        float f = ((float) newIPCount) / Collections.max(numDays.get(sldn));
                        clusterValues.get(clusterID).add(f);
                    }
                }
            }
            for (int clusterID : clusterValues.keySet()) {
                if (clusterValues.get(clusterID) == null) {
                    // NOTE(review): cannot happen — every key was just mapped
                    // to a non-null list above (and Hashtable forbids nulls).
                    retval.put(clusterID, null);
                } else {
                    double sum = 0;
                    for (double d : clusterValues.get(clusterID)) {
                        sum += d;
                    }
                    double val = 0;
                    if (clusterValues.get(clusterID).size() > 0) {
                        val = sum / clusterValues.get(clusterID).size();
                    }
                    retval.put(clusterID, val);
                }
            }
        }
        return retval;
    }

    /**
     * Calculates the previous cluster ratio feature for each cluster generated
     * on a specific run date and within the a specific window
     *
     * @param log_date the run date
     * @param window the number of days previous to use in feature calculation
     * @return a table of results, the keys of the table are cluster ids and the
     *      values are lists of two elements. The first element is the
     *      last_growth_ratio_prev_clusters value and the second element is the
     *      last_growth_prefix_ratio_prev_clusters value
     * @throws SQLException if there is and error calculating the feature
     */
    public Hashtable<Integer, List<Double>> calculatePrevClusterRatios
            (Date log_date, int window) throws SQLException {
        Hashtable<Integer, List<Double>> retval = new Hashtable<Integer, List<Double>>();
        ArrayList<Date> prevDates = getPrevDates(log_date, window);
        String query1 = properties.getProperty(PREVCLUSTER_QUERY1KEY);
        String query2 = properties.getProperty(PREVCLUSTER_QUERY2KEY);
        String logDateStr = df.format(log_date);
        String completequery = "";

        // UNION one query1 instance per previous run date.
        StringBuilder addQueryBuff = new StringBuilder();
        for (int i = 0; i < prevDates.size(); i++) {
            String prevDateStr = df.format(prevDates.get(i));
            StringBuilder querybuf = new StringBuilder();
            Formatter formatter = new Formatter(querybuf);
            formatter.format(query1, logDateStr, logDateStr,
                    prevDateStr, prevDateStr, prevDateStr);
            addQueryBuff.append(querybuf.toString());
            if (i < prevDates.size() - 1) {
                addQueryBuff.append(" UNION ");
            }
            formatter.close();
        }
        if (addQueryBuff.length() > 0) {
            StringBuilder querybuf = new StringBuilder();
            Formatter formatter = new Formatter(querybuf);
            formatter.format(query2, logDateStr, logDateStr,
                    addQueryBuff.toString());
            completequery = querybuf.toString();
            formatter.close();
        }
        if (completequery.length() > 0) {
            ResultSet rs = null;
            try {
                rs = dbi.executeQueryWithResult(completequery);
                while (rs.next()) {
                    ArrayList<Double> temp = new ArrayList<Double>();
                    temp.add(rs.getDouble(3));
                    temp.add(rs.getDouble(4));
                    retval.put(rs.getInt(1), temp);
                }
            } catch (Exception e) {
                if (log.isErrorEnabled()) {
                    log.error(e);
                }
            } finally {
                if (rs != null && !rs.isClosed()) {
                    rs.close();
                }
            }

            // Normalize both ratios by the cluster's queries-per-domain value.
            // NOTE(review): this NPEs if a cluster id is missing from the
            // queries-per-domain table — presumably both queries cover the
            // same cluster set; verify against the SQL templates.
            Hashtable<Integer, Double> queryPerDomain = getQueriesPerDomain(log_date);
            for (Integer clusterid : retval.keySet()) {
                List<Double> values = retval.get(clusterid);
                values.set(0, values.get(0) / queryPerDomain.get(clusterid));
                values.set(1, values.get(1) / queryPerDomain.get(clusterid));
            }
        }
        return retval;
    }

    /**
     * Gets run dates previous to a specific date within a window
     * of days from that date.  Results are cached, so a repeated call with
     * the same date and an equal-or-smaller window avoids hitting the
     * database.
     *
     * @param log_date the run date
     * @param window the number of days previous to the current date
     * @return the list of previous run dates, most recent first
     * @throws SQLException if there is an error retrieving the previous
     *      run dates
     */
    public ArrayList<Date> getPrevDates(Date log_date, int window) throws SQLException {
        ArrayList<Date> prevDates = new ArrayList<Date>();
        if (prevDateBufDate != null && prevDateBuf != null && prevDateBufDate.equals(log_date)
                && prevDateBufWindow >= window) {
            // Pull the dates within the day window from the prevDateBuf cache.
            // The buffer is ordered most recent first, so consecutive entries
            // are a known number of calendar days apart.
            Date pd = null;
            int windowcount = 0;
            for (Date d : prevDateBuf) {
                if (windowcount >= window) {
                    break;
                }
                if (pd == null) {
                    pd = d;
                    windowcount++;
                } else {
                    // d is chronologically older than pd; advance the window
                    // counter by the gap between them.
                    DateTime older = new DateTime(d.getTime());
                    DateTime newer = new DateTime(pd.getTime());
                    Days days = Days.daysBetween(older, newer);
                    windowcount += days.getDays();
                    pd = d;
                }
                prevDates.add(d);
            }
        } else {
            // Collect the names of all existing tables, then probe for the
            // domains/resips table pair of each of the previous 'window' days.
            String domainsprefix = properties.getProperty(DOMAINSPREFIXKEY);
            String resipsprefix = properties.getProperty(RESIPSPREFIXKEY);
            ArrayList<String> tablenames = new ArrayList<String>();
            ResultSet rs1 = null;
            try {
                rs1 = dbi.executeQueryWithResult(properties
                        .getProperty(TABLES_QUERY1KEY));
                while (rs1.next()) {
                    tablenames.add(rs1.getString(1));
                }
            } catch (Exception e) {
                if (log.isErrorEnabled()) {
                    log.error(e);
                }
            } finally {
                if (rs1 != null && !rs1.isClosed()) {
                    rs1.close();
                }
            }
            GregorianCalendar cal = new GregorianCalendar();
            cal.setTime(log_date);
            for (int i = 0; i < window; i++) {
                // BUGFIX: was cal.roll(Calendar.DAY_OF_YEAR, false).  roll()
                // never changes larger fields, so stepping back from Jan 1
                // wrapped to Dec 31 of the SAME year; add() correctly crosses
                // month and year boundaries.
                cal.add(Calendar.DAY_OF_YEAR, -1);
                Date temp = cal.getTime();
                String datestr = df.format(temp);
                // A run date only counts if both of its tables exist.
                if (tablenames.contains(domainsprefix + "_" + datestr)
                        && tablenames.contains(resipsprefix + "_" + datestr)) {
                    prevDates.add(temp);
                }
            }
            // Cache the values for later.
            if (prevDateBuf == null) {
                prevDateBuf = new ArrayList<Date>();
            } else {
                prevDateBuf.clear();
            }
            prevDateBuf.addAll(prevDates);
            prevDateBufDate = log_date;
            prevDateBufWindow = window;
        }
        return prevDates;
    }

    /**
     * Retrieves the number of dns queries per domain for each cluster
     * generated on a specific run date.
     *
     * @param log_date the run date
     * @return a table of values where the keys are cluster ids and the values
     *      are the queries per domain value
     * @throws SQLException if there is an error retrieving the queries
     *      per domain values
     */
    private Hashtable<Integer, Double> getQueriesPerDomain(Date log_date) throws SQLException {
        Hashtable<Integer, Double> retval = new Hashtable<Integer, Double>();
        StringBuilder querybuf = new StringBuilder();
        Formatter formatter = new Formatter(querybuf);
        formatter.format(properties.getProperty(PREVCLUSTER_QUERY3KEY), df.format(log_date));
        ResultSet rs = null;
        try {
            rs = dbi.executeQueryWithResult(querybuf.toString());
            while (rs.next()) {
                retval.put(rs.getInt(1), rs.getDouble(2));
            }
        } catch (Exception e) {
            if (log.isErrorEnabled()) {
                log.error(e);
            }
        } finally {
            if (rs != null && !rs.isClosed()) {
                rs.close();
            }
            formatter.close();
        }
        return retval;
    }

    /**
     * Calculates the domains per network feature for each cluster generated
     * on a specific run date and stores them in the database.
     *
     * @param log_date the run date
     * @throws Exception if there is an error calculating or storing the
     *      feature values
     */
    public void updateDomainsPerNetwork(Date log_date) throws Exception {
        Map<Integer, Double> dpn =
                this.calculateDomainsPerNetwork(log_date,
                        Integer.parseInt(properties.getProperty(DOMAINSPERNETWORK_WINDOWKEY)));
        for (int clusterid : dpn.keySet()) {
            StringBuilder querybuf = new StringBuilder();
            Formatter formatter = new Formatter(querybuf);
            formatter.format(properties.getProperty(DOMAINSPERNETWORK_QUERY3KEY),
                    df.format(log_date), dpn.get(clusterid).toString(), String.valueOf(clusterid));
            dbi.executeQueryNoResult(querybuf.toString());
            formatter.close();
        }
    }

    /**
     * Updates each cluster's longitudinal features for all clusters
     * generated during a specific run date.
     *
     * @param log_date the run date
     * @throws Exception if unable to calculate or store the longitudinal
     *      feature values
     */
    public void updateFeatures(Date log_date) throws Exception {
        String simplename = null;
        if (log.isInfoEnabled()) {
            simplename = this.getClass().getSimpleName();
            log.info(simplename + " Started: "
                    + Calendar.getInstance().getTime());
            log.info("Updating novelty features.");
        }
        dbi.initClusterTables(log_date);
        updateNoveltyFeature(log_date);
        if (log.isInfoEnabled()) {
            log.info("Novelty features updated.");
            log.info("Updating previous cluster ratio features.");
        }
        updatePrevClusterRatios(log_date);
        if (log.isInfoEnabled()) {
            log.info("Previous cluster ratio features updated.");
            log.info("Updating domains per network feature.");
        }
        updateDomainsPerNetwork(log_date);
        if (log.isInfoEnabled()) {
            log.info("Domains per network feature updated.");
            log.info(simplename + " Finished: "
                    + Calendar.getInstance().getTime());
        }
    }

    /**
     * Calculates the cluster novelty feature for each cluster generated
     * on a specific run date and stores them in the database.
     *
     * @param log_date the run date
     * @throws Exception if there is an error calculating or storing the feature
     *      values, or if the configured window sizes and window field names
     *      differ in number
     */
    public void updateNoveltyFeature(Date log_date) throws Exception {
        // window size (days) -> database field the value is stored in
        Map<Integer, String> windowvals = new TreeMap<Integer, String>();
        String[] windowsstr = properties.getProperty(NOVELTY_WINDOWSKEY).split(",");
        String[] windowfields = properties.getProperty(NOVELTY_WINFIELDSKEY).split(",");
        if (windowfields.length != windowsstr.length) {
            throw new Exception("Number of novelty window values and fields do not match.");
        }
        for (int i = 0; i < windowsstr.length; i++) {
            windowvals.put(Integer.parseInt(windowsstr[i]), windowfields[i]);
        }
        // We start from largest window to smallest so we can cache the
        // prevDates results for later use (getPrevDates serves smaller
        // windows from its cache).
        List<Integer> windowkeys = new ArrayList<Integer>(windowvals.keySet());
        Collections.reverse(windowkeys);
        for (int window : windowkeys) {
            Map<Integer, Double> novelty = calculateNoveltyFeature(log_date, window);
            for (int clusterid : novelty.keySet()) {
                StringBuilder querybuf = new StringBuilder();
                Formatter formatter = new Formatter(querybuf);
                formatter.format(properties.getProperty(NOVELTY_QUERY3KEY),
                        df.format(log_date), windowvals.get(window),
                        String.valueOf(novelty.get(clusterid)),
                        String.valueOf(clusterid), df.format(log_date));
                dbi.executeQueryNoResult(querybuf.toString());
                formatter.close();
            }
        }
    }

    /**
     * Calculates the previous cluster ratio feature for each cluster generated
     * on a specific run date and stores them in the database.
     *
     * @param log_date the run date
     * @throws SQLException if the feature values can not be stored in the database
     */
    public void updatePrevClusterRatios(Date log_date) throws SQLException {
        Hashtable<Integer, List<Double>> ratios =
                this.calculatePrevClusterRatios(log_date,
                        Integer.parseInt(properties.getProperty(PREVCLUSTER_WINDOWKEY)));
        for (int clusterid : ratios.keySet()) {
            List<Double> ratiovals = ratios.get(clusterid);
            StringBuilder querybuf = new StringBuilder();
            Formatter formatter = new Formatter(querybuf);
            formatter.format(properties.getProperty(PREVCLUSTER_QUERY4KEY),
                    df.format(log_date), ratiovals.get(0).toString(),
                    ratiovals.get(1).toString(), Integer.toString(clusterid));
            dbi.executeQueryNoResult(querybuf.toString());
            formatter.close();
        }
    }
}