}
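/**
 * Builds the optimizable for training a MaxEnt classifier to match each
 * instance's full target label distribution (a Labeling), rather than a
 * single best label.
 *
 * Typical usage, as a sketch (assumes MALLET's cc.mallet.optimize
 * LimitedMemoryBFGS and a getClassifier() accessor on this class, as in
 * MaxEntOptimizableByLabelLikelihood):
 *
 *   MaxEntOptimizableByLabelDistribution optimizable =
 *       new MaxEntOptimizableByLabelDistribution (trainingSet, null);
 *   Optimizer bfgs = new LimitedMemoryBFGS (optimizable);
 *   bfgs.optimize ();  // maximizes the label-distribution log-likelihood
 *   MaxEnt me = optimizable.getClassifier ();
 *
 * @param trainingSet instances whose targets are Labelings
 * @param initialClassifier an optional warm start; may be null
 */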
public MaxEntOptimizableByLabelDistribution (InstanceList trainingSet, MaxEnt initialClassifier)
{
this.trainingList = trainingSet;
Alphabet fd = trainingSet.getDataAlphabet();
LabelAlphabet ld = (LabelAlphabet) trainingSet.getTargetAlphabet();
// Don't call fd.stopGrowth(), because someone might want to do feature induction later.
ld.stopGrowth();
// Reserve one extra slot per label for the "default feature" (a bias that fires on every instance).
this.numLabels = ld.size();
this.numFeatures = fd.size() + 1;
this.defaultFeatureIndex = numFeatures-1;
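// parameters, constraints, and cachedGradient are flattened
// numLabels x numFeatures matrices: the entry for (label li, feature fi)
// lives at index li*numFeatures + fi, which is the row layout that
// MatrixOps.rowPlusEquals relies on below.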
this.parameters = new double [numLabels * numFeatures];
this.constraints = new double [numLabels * numFeatures];
this.cachedGradient = new double [numLabels * numFeatures];
// (No Arrays.fill needed: new double[] arrays are zero-initialized in Java.)
this.featureSelection = trainingSet.getFeatureSelection();
this.perLabelFeatureSelection = trainingSet.getPerLabelFeatureSelection();
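// A FeatureSelection is a mask over the feature alphabet; the per-label
// variant gives each label its own mask of usable features.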
// Make sure the default (bias) feature is always active, even under feature selection.
if (featureSelection != null)
featureSelection.add (defaultFeatureIndex);
if (perLabelFeatureSelection != null)
for (int i = 0; i < perLabelFeatureSelection.length; i++)
perLabelFeatureSelection[i].add (defaultFeatureIndex);
// xxx Later change this to allow both to be set, but select which one to use by a boolean flag?
assert (featureSelection == null || perLabelFeatureSelection == null);
if (initialClassifier != null) {
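// Warm start: adopt the classifier's state. Note that parameters is
// shared by reference, so updates made here during optimization are
// immediately visible in theClassifier.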
this.theClassifier = initialClassifier;
this.parameters = theClassifier.parameters;
this.featureSelection = theClassifier.featureSelection;
this.perLabelFeatureSelection = theClassifier.perClassFeatureSelection;
this.defaultFeatureIndex = theClassifier.defaultFeatureIndex;
assert (initialClassifier.getInstancePipe() == trainingSet.getPipe());
}
else if (this.theClassifier == null) {
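// No initial classifier: wrap the zero-initialized parameter array in a
// fresh MaxEnt over the training pipe (again shared by reference, not copied).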
this.theClassifier = new MaxEnt (trainingSet.getPipe(), parameters, featureSelection, perLabelFeatureSelection);
}
cachedValueStale = true;
cachedGradientStale = true;
// Initialize the constraints
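// constraints[li*numFeatures + fi] accumulates the empirical
// (instance-weighted) expected count of feature fi under label li;
// maxent training drives the model's expected counts toward these values.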
logger.fine("Number of instances in training list = " + trainingList.size());
for (Instance inst : trainingList) {
double instanceWeight = trainingList.getInstanceWeight(inst);
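// instanceWeight scales this instance's contribution (1.0 unless a
// weight was set on the InstanceList).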
Labeling labeling = inst.getLabeling ();
if (labeling == null)
continue;
//logger.fine ("Instance "+ii+" labeling="+labeling);
FeatureVector fv = (FeatureVector) inst.getData ();
Alphabet fdict = fv.getAlphabet();
assert (fdict == fd);
// Here is the difference between this code and the single-label
// version: rather than picking out only the "best" label index,
// loop over all label indices, weighting each by its probability.
assert(labeling.numLocations() == trainingSet.getTargetAlphabet().size());
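// Add fv into the constraint row of each label, scaled by
// instanceWeight times that label's probability in the target labeling.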
for (int pos = 0; pos < labeling.numLocations(); pos++){
MatrixOps.rowPlusEquals (constraints, numFeatures,
labeling.indexAtLocation(pos),
fv,
instanceWeight*labeling.valueAtLocation(pos));
}
assert(!Double.isNaN(instanceWeight)) : "instanceWeight is NaN";
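// Diagnostic pass only: flag NaN feature values by name in the log.
// (The constraints above have already been updated by this point.)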
boolean hasNaN = false;
for (int i = 0; i < fv.numLocations(); i++) {
if (Double.isNaN(fv.valueAtLocation(i))) {
logger.info("NaN for feature " + fdict.lookupObject(fv.indexAtLocation(i)).toString());
hasNaN = true;
}
}
if (hasNaN)
logger.info("NaN in instance: " + inst.getName());