package dmt.clustering;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.Vector;
import com.aliasi.util.Distance;
import dmt.tools.CSVFileReader;
import dmt.tools.SparseVector;
/**
 * One text document, held as an identifier plus a vector of numeric weights
 * (named as bag-of-words TF-IDF weights by the fields of this class).
 *
 * <p>The class also keeps a static registry of loaded instances
 * ({@link #textInstancesSet}) and exposes cosine, Euclidean and Manhattan
 * {@link Distance} implementations over instances.
 *
 * <p>Instances do not override {@code equals}/{@code hashCode}, so the
 * registry distinguishes them by object identity.
 */
public class TextInstance
{
    /** File read by the no-argument {@link #loadTextInstances()}. */
    private static final String DEFAULT_CSV_FILE = "csv_out.csv";

    /** Maximum number of instances loaded by {@link #loadTextInstances()}. */
    private static final int DEFAULT_MAX_INSTANCES = 1000;

    /**
     * Index of the first CSV column parsed as a weight. Columns 0 and 1 are
     * concatenated into the id; columns 2 and 3 are not read.
     */
    private static final int FIRST_FEATURE_COLUMN = 4;

    String id;
    private SparseVector bagOfWordsTfIdf;
    private double norm;

    /** Registry filled by the {@code loadTextInstances} methods. */
    public static Set<TextInstance> textInstancesSet = new HashSet<TextInstance>();

    /**
     * Loads up to 1000 instances from {@code csv_out.csv} into
     * {@link #textInstancesSet}.
     *
     * @throws IOException if the file cannot be opened or read
     */
    public static void loadTextInstances() throws IOException
    {
        loadTextInstances(DEFAULT_CSV_FILE, DEFAULT_MAX_INSTANCES);
    }

    /**
     * Loads instances from a comma-separated file into
     * {@link #textInstancesSet}. The first row is treated as a header and
     * discarded. Rows with fewer than four fields are reported on
     * {@code System.err} and skipped; they do not count towards the limit.
     *
     * @param fileName     path of the CSV file to read
     * @param maxInstances maximum number of instances to add; a value of zero
     *                     or less means no limit
     * @throws IOException if the file cannot be opened or read
     */
    public static void loadTextInstances(String fileName, int maxInstances)
            throws IOException
    {
        CSVFileReader reader = new CSVFileReader(fileName, ',');
        try
        {
            // The first row holds the column headers; read and discard it.
            reader.readFields();

            int loaded = 0;
            Vector<String> fields = reader.readFields();
            while (fields != null)
            {
                if (fields.size() < FIRST_FEATURE_COLUMN)
                {
                    // A short row (e.g. a trailing blank line) cannot supply
                    // an id and a weight range; skip it instead of aborting
                    // the whole load.
                    System.err.println("Skipping row with " + fields.size()
                            + " field(s) in " + fileName);
                }
                else
                {
                    String rowId = fields.get(0) + fields.get(1);
                    Object[] weights = fields.subList(FIRST_FEATURE_COLUMN,
                            fields.size()).toArray();
                    textInstancesSet.add(new TextInstance(weights, rowId));
                    loaded++;
                    if (loaded == maxInstances)
                    {
                        break;
                    }
                }
                fields = reader.readFields();
            }
        }
        finally
        {
            // Release the underlying file even when reading fails.
            reader.close();
        }
    }

    /**
     * Builds an instance from textual weights.
     *
     * <p>Each element is cast to {@code String} and parsed as a double. An
     * element that cannot be parsed or stored is reported on
     * {@code System.err} and left unset in the vector; construction then
     * continues with the next element.
     *
     * @param bagOfWordsTfIdf weights as strings, one per vector index
     * @param id              identifier of this instance
     */
    public TextInstance(Object[] bagOfWordsTfIdf, String id)
    {
        this.id = id;
        this.bagOfWordsTfIdf = new SparseVector(bagOfWordsTfIdf.length);
        for (int i = 0; i < bagOfWordsTfIdf.length; i++)
        {
            try
            {
                this.bagOfWordsTfIdf.put(i,
                        Double.parseDouble((String) bagOfWordsTfIdf[i]));
            }
            catch (Exception e)
            {
                // Best effort: keep going, but say which value was dropped.
                System.err.println("TextInstance " + id
                        + ": could not store value at index " + i + " ("
                        + bagOfWordsTfIdf[i] + "): " + e);
            }
        }
        computeNorm();
    }

    /**
     * @return the identifier given at construction
     */
    public String getId()
    {
        return id;
    }

    /**
     * Recomputes {@link #norm} as the square root of the sum of squared
     * entries at indices {@code 0 .. size() - 1} of the vector.
     */
    private void computeNorm()
    {
        double sumOfSquares = 0;
        for (int i = 0; i < bagOfWordsTfIdf.size(); i++)
        {
            sumOfSquares += Math.pow(bagOfWordsTfIdf.get(i), 2);
        }
        norm = Math.sqrt(sumOfSquares);
    }

    /**
     * @return the norm cached by {@link #computeNorm()}
     */
    private double getNorm()
    {
        return norm;
    }

    /**
     * @return the vector's {@code size()}, widened to double
     */
    private double getLength()
    {
        return bagOfWordsTfIdf.size();
    }

    /**
     * @param i index into the weight vector
     * @return the weight stored at index {@code i}
     */
    private double getTfIdfAt(int i)
    {
        return bagOfWordsTfIdf.get(i);
    }

    /**
     * @return the underlying weight vector (the live object, not a copy)
     */
    public SparseVector getBagOfWordsTfIdf()
    {
        return bagOfWordsTfIdf;
    }

    /**
     * Dot product of two instances, iterating over the indices of
     * {@code instance1}.
     */
    private static double computeProduct(TextInstance instance1,
            TextInstance instance2)
    {
        double product = 0;
        for (int i = 0; i < instance1.getLength(); i++)
        {
            product += instance1.getTfIdfAt(i) * instance2.getTfIdfAt(i);
        }
        return product;
    }

    /**
     * Cosine of the angle between two instances.
     *
     * <p>When the product of the norms is zero (at least one all-zero vector)
     * the quotient would be NaN; in that case 0.0 is returned, i.e. the
     * vectors are treated as having nothing in common.
     */
    private static double computeCosine(TextInstance instance1,
            TextInstance instance2)
    {
        double denominator = instance1.getNorm() * instance2.getNorm();
        if (denominator == 0.0)
        {
            return 0.0;
        }
        return computeProduct(instance1, instance2) / denominator;
    }

    /**
     * Euclidean distance, delegated to
     * {@code SparseVector.euclideanDistance}.
     *
     * @param instance1 first instance
     * @param instance2 second instance
     * @return the distance reported by the vector of {@code instance1}
     */
    public static double computeEuclideanDistance(TextInstance instance1,
            TextInstance instance2)
    {
        return instance1.getBagOfWordsTfIdf().euclideanDistance(
                instance2.getBagOfWordsTfIdf());
    }

    /**
     * Manhattan distance, delegated to
     * {@code SparseVector.manhattanDistance}.
     *
     * @param instance1 first instance
     * @param instance2 second instance
     * @return the distance reported by the vector of {@code instance1}
     */
    public static double computeManhattanDistance(TextInstance instance1,
            TextInstance instance2)
    {
        return instance1.getBagOfWordsTfIdf().manhattanDistance(
                instance2.getBagOfWordsTfIdf());
    }

    /**
     * Cosine distance: {@code 1 - cosine}, clamped to the range [0, 1].
     * An instance with a zero norm is at distance 1.0 from every instance.
     */
    public static final Distance<TextInstance> COSINE_DISTANCE = new Distance<TextInstance>()
    {
        public double distance(TextInstance instance1, TextInstance instance2)
        {
            double oneMinusCosine = 1.0 - TextInstance.computeCosine(instance1,
                    instance2);
            // Clamp: negative cosines and rounding error can leave [0, 1].
            if (oneMinusCosine > 1.0)
            {
                return 1.0;
            }
            if (oneMinusCosine < 0.0)
            {
                return 0.0;
            }
            return oneMinusCosine;
        }
    };

    /** Distance adapter over {@link #computeEuclideanDistance}. */
    static final Distance<TextInstance> EUCLIDEAN_DISTANCE = new Distance<TextInstance>()
    {
        public double distance(TextInstance instance1, TextInstance instance2)
        {
            return TextInstance.computeEuclideanDistance(instance1, instance2);
        }
    };

    /** Distance adapter over {@link #computeManhattanDistance}. */
    static final Distance<TextInstance> MANHATTAN_DISTANCE = new Distance<TextInstance>()
    {
        public double distance(TextInstance instance1, TextInstance instance2)
        {
            return TextInstance.computeManhattanDistance(instance1, instance2);
        }
    };
}