Examples of cc.mallet.types.Instance

cc.mallet.types.Instance
A machine learning "example" to be used in training, testing or performance of various machine learning algorithms.
An instance contains four generic fields of predefined name: "data", "target", "name", and "source". "Data" holds the data represented by the instance, "target" is often a label associated with the instance, "name" is a short identifying name for the instance (such as a filename), and "source" is human-readable source information (such as the original text).
Each field has no predefined type, and may change type as the instance is processed. For example, the data field may start off being a string that represents a file name and then be processed by a {@link cc.mallet.pipe.Pipe} into a CharSequence representing the contents of the file, and eventually to a feature vector holding indices into an {@link cc.mallet.types.Alphabet} holding words found in the file. It is up to each pipe which fields in the Instance it modifies; the most common case is that the pipe modifies the data field.
Generally speaking, there are two modes of operation for Instances. (1) An instance gets created and passed through a Pipe, and the resulting data/target/name/source fields are used. This is generally done for training instances. (2) An instance gets created with raw values in its slots, then different users of the instance call newPipedCopy() with their respective different pipes. This might be done for test instances at "performance" time.
Rather than store an {@link cc.mallet.types.Alphabet} in the Instance, we obtain it through the Pipe instance variable, because the Pipe also indicates where the data came from and how to interpret the Alphabet.
Instances can be made immutable if locked. Although unlocked Instances are mutable, typically the only code that changes the values in the four slots is inside Pipes.
Note that constructing an instance with a pipe argument means "Construct the instance and then run it through the pipe". {@link cc.mallet.types.InstanceList} uses this method when adding instances through a pipeInputIterator. @see Pipe @see Alphabet @see InstanceList @author Andrew McCallum mccallum@cs.umass.edu

  
  public void evaluateInstanceList (TransducerTrainer tt, InstanceList data, String description)
  {
    int correct = 0;
    for (int i = 0; i < data.size(); i++) {
      Instance instance = data.get(i);
      Sequence input = (Sequence) instance.getData();
      Sequence trueOutput = (Sequence) instance.getTarget();
      assert (input.size() == trueOutput.size());
      Sequence predOutput = tt.getTransducer().transduce (input);
      assert (predOutput.size() == trueOutput.size());
      if (sequencesMatch (trueOutput, predOutput))
        correct++;

View Full Code Here

  }


  private double computeLikelihood(InstanceList trainingSample) {
    double loglik = 0.0;
    for (int i = 0; i < trainingSample.size(); i++) {
      Instance trainingInstance = trainingSample.get(i);
      FeatureVectorSequence fvs = (FeatureVectorSequence) trainingInstance
          .getData();
      Sequence labelSequence = (Sequence) trainingInstance.getTarget();
      loglik += new SumLatticeDefault(crf, fvs, labelSequence, null)
          .getTotalWeight();
      loglik -= new SumLatticeDefault(crf, fvs, null, null)
          .getTotalWeight();
    }

View Full Code Here

  }


  // The PipeInputIterator interface
  public Instance next ()
  {
    Instance inst = subIt.next ();
    inst = pipe.pipe (inst);
    return new Instance (inst.getData (), inst.getTarget (), inst.getName (), inst.getSource ());
  }

View Full Code Here

      throw new UnsupportedOperationException ("Training with multiple sets not supported.");
    }


    trainingGatheredFor = training;
    for (int i = 0; i < training.size(); i++) {
      Instance instance = training.get(i);
      FeatureVectorSequence input = (FeatureVectorSequence) instance.getData();
      FeatureSequence output = (FeatureSequence) instance.getTarget();
      // Do it for the paths consistent with the labels...
      new SumLatticeDefault (memm, input, output, new Transducer.Incrementor() {
        public void incrementFinalState(Transducer.State s, double count) { }
        public void incrementInitialState(Transducer.State s, double count) { }
        public void incrementTransition(Transducer.TransitionIterator ti, double count) {
          MEMM.State source = (MEMM.State) ti.getSourceState();
          if (count != 0) {
            // Create the source state's trainingSet if it doesn't exist yet.
            if (source.trainingSet == null)
              // New InstanceList with a null pipe, because it doesn't do any processing of input.
              source.trainingSet = new InstanceList (null);
            // TODO We should make sure we don't add duplicates (through a second call to setWeightsDimenstion..!
            // TODO Note that when the training data still allows ambiguous outgoing transitions
            // this will add the same FV more than once to the source state's trainingSet, each
            // with >1.0 weight.  Not incorrect, but inefficient.
//            System.out.println ("From: "+source.getName()+" ---> "+getOutput()+" : "+getInput());
            source.trainingSet.add (new Instance(ti.getInput (), ti.getOutput (), null, null), count);
          }
        }
      });
    }
  }

View Full Code Here

      if (training == null) {
        System.out.println ("No data");
        continue;
      }
      for (int j = 0; j < training.size(); j++) {
        Instance inst = training.get (j);
        System.out.println ("From : "+state.getName()+" To : "+inst.getTarget());
        System.out.println ("Instance "+j);
        System.out.println (inst.getTarget());
        System.out.println (inst.getData());
      }
    }
  }

View Full Code Here

          System.out.println ("Empty training set for state "+s.name);
          continue;
        }


        for (int j = 0; j < s.trainingSet.size(); j++) {
          Instance instance = s.trainingSet.get (j);
          double instWeight = s.trainingSet.getInstanceWeight (j);
          FeatureVector fv = (FeatureVector) instance.getData ();
          String labelString = (String) instance.getTarget ();
          TransitionIterator iter = new TransitionIterator (s, fv, gatherConstraints?labelString:null, memm);
          while (iter.hasNext ()) {
            // gsc
            iter.nextState(); // advance the iterator
//            State destination = (MEMM.State) iter.nextState();  // Just to advance the iterator

View Full Code Here


    totalTokens = numCorrectTokens = 0;
    for (int n = 0; n < numTrueSegments.length; n++)
      numTrueSegments[n] = numPredictedSegments[n] = numCorrectSegments[n] = 0;
    for (int i = 0; i < data.size(); i++) {
      Instance instance = data.get(i);
      Sequence input = (Sequence) instance.getData();
      //String tokens = null;
      //if (instance.getSource() != null)
      //tokens = (String) instance.getSource().toString();
      Sequence trueOutput = (Sequence) instance.getTarget();
      assert (input.size() == trueOutput.size());
      Sequence predOutput = model.transduce (input);
      assert (predOutput.size() == trueOutput.size());
      int trueStart, predStart;        // -1 for non-start, otherwise index into segmentStartTag
      for (int j = 0; j < trueOutput.size(); j++) {

View Full Code Here

    for (int n = 0; n < numTrueSegments.length; n++)
      numTrueSegments[n] = numPredictedSegments[n] = numCorrectSegments[n] = 0;
    for (int i = 0; i < data.size(); i++) {
      if (viterbiOutputStream != null)
        viterbiOutputStream.println ("Viterbi path for "+description+" instance #"+i);
      Instance instance = data.get(i);
      Sequence input = (Sequence) instance.getData();      
      //String tokens = null;
      //if (instance.getSource() != null)
      //tokens = (String) instance.getSource().toString();
      Sequence trueOutput = (Sequence) instance.getTarget();
      assert (input.size() == trueOutput.size());
      Sequence predOutput = (Sequence) predictedSequences.get (i);
      if (predOutput == null) // skip this instance
        continue;
      assert (predOutput.size() == trueOutput.size());

View Full Code Here

  }
  
  public static Clustering mergeInstancesWithSameLabel (Clustering clustering) {
    InstanceList list = clustering.getInstances();
    for (int i = 0; i < list.size(); i++) {
      Instance ii = list.get(i);
      int li = clustering.getLabel(i);
      for (int j = i + 1; j < list.size(); j++) {
        Instance ij = list.get(j);
        int lj = clustering.getLabel(j);
        if (li != lj && ii.getLabeling().equals(ij.getLabeling()))
          clustering = ClusterUtils.mergeClusters(clustering, li, lj);
      }
    }  
    return clustering;
  }

View Full Code Here

        .createSingletonClustering(clustering.getInstances());
    double total = 0;
    int count = 0;
    for (AllPairsIterator iter = new AllPairsIterator(singletons); iter
        .hasNext(); count++) {
      Instance instance = (Instance) iter.next();
      AgglomerativeNeighbor neighbor = (AgglomerativeNeighbor) instance
          .getData();
      double score = evaluator.evaluate(neighbor);
      int[][] clusters = neighbor.getOldClusters();
      if (clustering.getLabel(clusters[0][0]) == clustering
          .getLabel(clusters[1][0]))

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of cc.mallet.types.Instance

cc.mallet.classify.BalancedWinnowTrainer

cc.mallet.classify.C45$Node

cc.mallet.classify.Classification

cc.mallet.classify.Classifier

cc.mallet.classify.DecisionTree$Node

cc.mallet.classify.evaluate.ConfusionMatrix

cc.mallet.classify.FeatureConstraintUtil

cc.mallet.classify.MaxEntOptimizableByGE

cc.mallet.classify.MaxEntOptimizableByLabelDistribution

cc.mallet.classify.MaxEntOptimizableByLabelLikelihood

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.