Package dmt.clustering

Source Code of dmt.clustering.TextInstance

package dmt.clustering;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.Vector;

import com.aliasi.util.Distance;

import dmt.tools.CSVFileReader;
import dmt.tools.SparseVector;

public class TextInstance
{
  String id;
  private SparseVector bagOfWordsTfIdf;
  private double norm;
  public static Set<TextInstance> textInstancesSet = new HashSet<TextInstance>();

  public static void loadTextInstances() throws IOException
  {
    CSVFileReader in1 = new CSVFileReader("csv_out.csv", ',');
    // skip the headers
    Vector<String> fields1 = in1.readFields();
    fields1 = in1.readFields();
    int no_instances = 1000;
    int k = 0;
    while (fields1 != null)
    {
      String id = fields1.get(0) + fields1.get(1);
      Object[] bagOfWordsTfIdf1 = fields1.subList(4, fields1.size())
          .toArray();
      TextInstance instance1 = new TextInstance(bagOfWordsTfIdf1, id);
      // instance1.clearBagOfWordsTfIdf();
      textInstancesSet.add(instance1);
      fields1 = in1.readFields();
      k++;
      if (no_instances == k) break;
    }
  }

  public TextInstance(Object[] bagOfWordsTfIdf, String id) {
    this.bagOfWordsTfIdf = new SparseVector(bagOfWordsTfIdf.length);
    for (int i = 0; i < bagOfWordsTfIdf.length; i++)
    {
      try
      {
        this.bagOfWordsTfIdf.put(i, Double
            .parseDouble(((String) bagOfWordsTfIdf[i])));
      } catch (Exception e)
      {
        System.err.println(e.getMessage());
      }
    }
    this.id = id;
    computeNorm();
  }

  public String getId()
  {
    return id;
  }

  private void computeNorm()
  {
    norm = 0;
    for (int i = 0; i < bagOfWordsTfIdf.size(); i++)
    {
      // norm += Math.pow(bagOfWordsTfIdf[i], 2);
      norm += Math.pow(bagOfWordsTfIdf.get(i), 2);
    }
    norm = Math.sqrt(norm);
  }

  private double getNorm()
  {
    return norm;
  }

  private double getLength()
  {
    // return bagOfWordsTfIdf.length;
    return bagOfWordsTfIdf.size();
  }

  private double getTfIdfAt(int i)
  {
    // return bagOfWordsTfIdf[i];
    return bagOfWordsTfIdf.get(i);
  }

  public SparseVector getBagOfWordsTfIdf()
  {
    return bagOfWordsTfIdf;
  }

  private static double computeProduct(TextInstance instance1,
      TextInstance instance2)
  {
    double product = 0;
    for (int i = 0; i < instance1.getLength(); i++)
    {
      product += instance1.getTfIdfAt(i) * instance2.getTfIdfAt(i);
    }
    return product;
  }

  private static double computeCosine(TextInstance instance1,
      TextInstance instance2)
  {
    return computeProduct(instance1, instance2)
        / (instance1.getNorm() * instance2.getNorm());
  }

  public static double computeEuclideanDistance(TextInstance instance1,
      TextInstance instance2)
  {
    return instance1.getBagOfWordsTfIdf().euclideanDistance(
        instance2.getBagOfWordsTfIdf());
  }

  public static double computeManhattanDistance(TextInstance instance1,
      TextInstance instance2)
  {
    return instance1.getBagOfWordsTfIdf().manhattanDistance(
        instance2.getBagOfWordsTfIdf());
  }

  public static final Distance<TextInstance> COSINE_DISTANCE = new Distance<TextInstance>()
  {
    public double distance(TextInstance instance1, TextInstance instance2)
    {
      double oneMinusCosine = 1.0 - TextInstance.computeCosine(instance1,
          instance2);
      if (oneMinusCosine > 1.0)
        return 1.0;
      else if (oneMinusCosine < 0.0)
        return 0.0;
      else
        return oneMinusCosine;
    }
  };

  static final Distance<TextInstance> EUCLIDEAN_DISTANCE = new Distance<TextInstance>()
  {
    public double distance(TextInstance instance1, TextInstance instance2)
    {
      // return
      // euclideanDistances[instance1.getIndex()][instance2.getIndex()];
      return TextInstance.computeEuclideanDistance(instance1, instance2);
    }
  };

  static final Distance<TextInstance> MANHATTAN_DISTANCE = new Distance<TextInstance>()
  {
    public double distance(TextInstance instance1, TextInstance instance2)
    {
      // return
      // manhattanDistances[instance1.getIndex()][instance2.getIndex()];
      return TextInstance.computeManhattanDistance(instance1, instance2);
    }
  };
}
TOP

Related Classes of dmt.clustering.TextInstance

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.