Package cc.mallet.cluster

Examples of cc.mallet.cluster.Clusterings
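All of the excerpts below share the same basic access pattern: deserialize a Clusterings object from disk, then walk its clusterings, clusters, and instances. The following self-contained sketch shows only that pattern; the class name PrintClusterings and the command-line argument handling are ours, not part of MALLET.

    import java.io.FileInputStream;
    import java.io.ObjectInputStream;

    import cc.mallet.cluster.Clustering;
    import cc.mallet.cluster.Clusterings;
    import cc.mallet.types.InstanceList;

    public class PrintClusterings {
      public static void main(String[] args) throws Exception {
        // args[0] is the path to a serialized Clusterings, e.g. one written by the tools below.
        ObjectInputStream ois = new ObjectInputStream(new FileInputStream(args[0]));
        Clusterings clusterings = (Clusterings) ois.readObject();
        ois.close();

        // Walk every clustering, every cluster, and every instance.
        for (int i = 0; i < clusterings.size(); i++) {
          Clustering c = clusterings.get(i);
          for (int j = 0; j < c.getNumClusters(); j++) {
            InstanceList cluster = c.getCluster(j);
            for (int k = 0; k < cluster.size(); k++) {
              System.out.println("clustering " + i + " cluster " + j
                                 + " " + cluster.get(k).getData());
            }
          }
        }
      }
    }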


    CommandOption.setSummary(Clusterings2Clusterings.class,
                             "A tool to manipulate Clusterings.");
    CommandOption.process(Clusterings2Clusterings.class, args);

    Clusterings clusterings = null;
    try {
      ObjectInputStream iis =
          new ObjectInputStream(new FileInputStream(inputFile.value));
      clusterings = (Clusterings) iis.readObject();
    } catch (Exception e) {
      System.err.println("Exception reading clusterings from "
                          + inputFile.value + " " + e);
      e.printStackTrace();
    }

    logger.info("number clusterings=" + clusterings.size());

    // Prune clusters based on size.
    if (minClusterSize.value > 1) {
      for (int i = 0; i < clusterings.size(); i++) {
        Clustering clustering = clusterings.get(i);
        InstanceList oldInstances = clustering.getInstances();
        Alphabet alph = oldInstances.getDataAlphabet();
        LabelAlphabet lalph = (LabelAlphabet) oldInstances.getTargetAlphabet();
        if (alph == null) alph = new Alphabet();
        if (lalph == null) lalph = new LabelAlphabet();
        Pipe noop = new Noop(alph, lalph);
        InstanceList newInstances = new InstanceList(noop);
        for (int j = 0; j < oldInstances.size(); j++) {
          int label = clustering.getLabel(j);
          Instance instance = oldInstances.get(j);
          if (clustering.size(label) >= minClusterSize.value)
            newInstances.add(noop.pipe(new Instance(instance.getData(),
                                                    lalph.lookupLabel(Integer.valueOf(label)),
                                                    instance.getName(),
                                                    instance.getSource())));
        }
        clusterings.set(i, createSmallerClustering(newInstances));
      }
      if (outputPrefixFile.value != null) {
        try {
          ObjectOutputStream oos =
            new ObjectOutputStream(new FileOutputStream(outputPrefixFile.value));
          oos.writeObject(clusterings);
          oos.close();
        } catch (Exception e) {
          logger.warning("Exception writing clustering to file "
                         + outputPrefixFile.value + " " + e);
          e.printStackTrace();
        }
      }
    }
   
   
    // Split into training/testing
    if (trainingProportion.value > 0) {
      if (clusterings.size() > 1)
        throw new IllegalArgumentException("Expect one clustering to do train/test split, not " + clusterings.size());
      Clustering clustering = clusterings.get(0);
      int targetTrainSize = (int)(trainingProportion.value * clustering.getNumInstances());
      TIntHashSet clustersSampled = new TIntHashSet();
      Randoms random = new Randoms(123);
      LabelAlphabet lalph = new LabelAlphabet();
      InstanceList trainingInstances = new InstanceList(new Noop(null, lalph));
      while (trainingInstances.size() < targetTrainSize) {
        int cluster = random.nextInt(clustering.getNumClusters());
        if (!clustersSampled.contains(cluster)) {
          clustersSampled.add(cluster);
          InstanceList instances = clustering.getCluster(cluster);
          for (int i = 0; i < instances.size(); i++) {
            Instance inst = instances.get(i);
            trainingInstances.add(new Instance(inst.getData(),
                                               lalph.lookupLabel(Integer.valueOf(cluster)),
                                               inst.getName(), inst.getSource()));
          }
        }
      }
      trainingInstances.shuffle(random);
      Clustering trainingClustering = createSmallerClustering(trainingInstances);
     
      InstanceList testingInstances = new InstanceList(null, lalph);
      for (int i = 0; i < clustering.getNumClusters(); i++) {
        if (!clustersSampled.contains(i)) {
          InstanceList instances = clustering.getCluster(i);
          for (int j = 0; j < instances.size(); j++) {
            Instance inst = instances.get(j);
            testingInstances.add(new Instance(inst.getData(),
                                              lalph.lookupLabel(Integer.valueOf(i)),
                                              inst.getName(), inst.getSource()));
          }         
        }
      }
      testingInstances.shuffle(random);
      Clustering testingClustering = createSmallerClustering(testingInstances);
      logger.info(outputPrefixFile.value + ".train : " + trainingClustering.getNumClusters() + " objects");
      logger.info(outputPrefixFile.value + ".test : " + testingClustering.getNumClusters() + " objects");
      if (outputPrefixFile.value != null) {
        try {
          ObjectOutputStream oos =
            new ObjectOutputStream(new FileOutputStream(new File(outputPrefixFile.value + ".train")));
          oos.writeObject(new Clusterings(new Clustering[]{trainingClustering}));
          oos.close();
          oos =
            new ObjectOutputStream(new FileOutputStream(new File(outputPrefixFile.value + ".test")));
          oos.writeObject(new Clusterings(new Clustering[]{testingClustering}));
          oos.close();         
        } catch (Exception e) {
          logger.warning("Exception writing clustering to file "
                         + outputPrefixFile.value + " " + e);
          e.printStackTrace();
        }
      }
    }
  }
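The excerpt above calls createSmallerClustering, which is not shown here. Judging from how it is used, it rebuilds a Clustering from an InstanceList whose instance targets are Labels identifying the source cluster. The sketch below is our reconstruction, not MALLET's actual source; it assumes the Clustering(InstanceList, int, int[]) constructor and Label.getIndex() from cc.mallet.types.

    // Reconstruction (not MALLET's source): build a Clustering whose cluster
    // indices are remapped to a contiguous 0..numLabels-1 range.
    private static Clustering createSmallerClustering(InstanceList instances) {
      int[] labels = new int[instances.size()];
      int numLabels = 0;
      java.util.HashMap<Integer, Integer> remap = new java.util.HashMap<Integer, Integer>();
      for (int i = 0; i < instances.size(); i++) {
        // The target was set to lalph.lookupLabel(clusterIndex) when the list was built.
        int original = ((Label) instances.get(i).getTarget()).getIndex();
        Integer mapped = remap.get(original);
        if (mapped == null) {
          mapped = numLabels++;
          remap.put(original, mapped);
        }
        labels[i] = mapped;
      }
      return new Clustering(instances, numLabels, labels);
    }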


    CommandOption.setSummary(Clusterings2Info.class,
                             "A tool to print statistics about a Clusterings.");
    CommandOption.process(Clusterings2Info.class, args);

    Clusterings clusterings = null;
    try {
      ObjectInputStream iis =
          new ObjectInputStream(new FileInputStream(inputFile.value));
      clusterings = (Clusterings) iis.readObject();
    } catch (Exception e) {
      System.err.println("Exception reading clusterings from "
                          + inputFile.value + " " + e);
      e.printStackTrace();
    }

    if (printOption.value) {
      for (int i = 0; i < clusterings.size(); i++) {
        Clustering c = clusterings.get(i);
        for (int j = 0; j < c.getNumClusters(); j++) {
          InstanceList cluster = c.getCluster(j);
          for (int k = 0; k < cluster.size(); k++) {
            System.out.println("clustering " + i + " cluster " + j
                               + " element " + k + " " + cluster.get(k).getData());
          }
          System.out.println();
        }
      }
    }
    logger.info("number clusterings=" + clusterings.size());

    int totalInstances = 0;
    int totalClusters = 0;

    for (int i = 0; i < clusterings.size(); i++) {
      Clustering c = clusterings.get(i);
      totalClusters += c.getNumClusters();
      totalInstances += c.getNumInstances();
    }
    logger.info("total instances=" + totalInstances);
    logger.info("total clusters=" + totalClusters);
    logger.info("instances per clustering=" + (double) totalInstances
                / clusterings.size());
    logger.info("instances per cluster=" + (double) totalInstances
                / totalClusters);
    logger.info("clusters per clustering=" + (double) totalClusters
                / clusterings.size());
  }

    logger.info("\nread " + fi + " objects in " + clusterings.length + " clusterings.");
    try {
      ObjectOutputStream oos =
          new ObjectOutputStream(new FileOutputStream(outputFile.value));
      oos.writeObject(new Clusterings(clusterings));
      oos.close();
    } catch (Exception e) {
      logger.warning("Exception writing clustering to file " + outputFile.value
                      + " " + e);
      e.printStackTrace();
    }
  }

    // TRAIN

    Randoms random = new Randoms(123);
    Clusterer clusterer = null;
    if (!loadClusterer.value.exists()) {
      Clusterings training = readClusterings(trainingFile.value);

      Alphabet fieldAlphabet = ((Record) training.get(0).getInstances()
          .get(0).getData()).fieldAlphabet();

      Pipe pipe = new ClusteringPipe(string2ints(exactMatchFields.value, fieldAlphabet),
                                 string2ints(approxMatchFields.value, fieldAlphabet),
                                 string2ints(substringMatchFields.value, fieldAlphabet));

      InstanceList trainingInstances = new InstanceList(pipe);
      for (int i = 0; i < training.size(); i++) {
        PairSampleIterator iterator = new PairSampleIterator(training
            .get(i), random, 0.5, training.get(i).getNumInstances());
        while(iterator.hasNext()) {
          Instance inst = iterator.next();
          trainingInstances.add(pipe.pipe(inst));
        }
      }
      logger.info("generated " + trainingInstances.size()
          + " training instances");
      Classifier classifier = new MaxEntTrainer().train(trainingInstances);
      logger.info("InfoGain:\n");
      new InfoGain(trainingInstances).printByRank(System.out);
      logger.info("pairwise training accuracy="
          + new Trial(classifier, trainingInstances).getAccuracy());
      NeighborEvaluator neval = new PairwiseEvaluator(classifier, "YES",
          new PairwiseEvaluator.Average(), true);       
      clusterer = new GreedyAgglomerativeByDensity(
          training.get(0).getInstances().getPipe(), neval, 0.5, false,
          random);
      training = null;
      trainingInstances = null;
    } else {
      ObjectInputStream ois = new ObjectInputStream(new FileInputStream(loadClusterer.value));
      clusterer = (Clusterer) ois.readObject();
    }

    // TEST

    Clusterings testing = readClusterings(testingFile.value);
    ClusteringEvaluator evaluator = (ClusteringEvaluator) clusteringEvaluatorOption.value;
    if (evaluator == null)
      evaluator = new ClusteringEvaluators(
          new ClusteringEvaluator[] { new BCubedEvaluator(),
              new PairF1Evaluator(), new MUCEvaluator(), new AccuracyEvaluator() });
    ArrayList<Clustering> predictions = new ArrayList<Clustering>();
    for (int i = 0; i < testing.size(); i++) {
      Clustering clustering = testing.get(i);
      Clustering predicted = clusterer.cluster(clustering.getInstances());
      predictions.add(predicted);
      logger.info(evaluator.evaluate(clustering, predicted));
    }
    logger.info(evaluator.evaluateTotals());
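Two helpers called above are not shown: readClusterings and string2ints. Based on the deserialization pattern in the earlier excerpts and on how the field Alphabet is used, minimal versions presumably look like the following. These are our sketches, not MALLET's implementations; the parameter types (File, String[]) are assumptions about the command-option values.

    // Deserialize a Clusterings from a file, as in the earlier excerpts.
    private static Clusterings readClusterings(File f) throws Exception {
      ObjectInputStream ois = new ObjectInputStream(new FileInputStream(f));
      Clusterings clusterings = (Clusterings) ois.readObject();
      ois.close();
      return clusterings;
    }

    // Map field names to their indices in the record's field Alphabet.
    // Names not present in the alphabet are skipped rather than added.
    private static int[] string2ints(String[] fields, Alphabet alphabet) {
      java.util.ArrayList<Integer> indices = new java.util.ArrayList<Integer>();
      for (String field : fields) {
        int index = alphabet.lookupIndex(field, false);
        if (index >= 0)
          indices.add(index);
      }
      int[] result = new int[indices.size()];
      for (int i = 0; i < result.length; i++)
        result[i] = indices.get(i);
      return result;
    }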
