Package cc.mallet.types

Examples of cc.mallet.types.Alphabet


   * @return ArrayList with the int indices of the selected features.
   */
  public static ArrayList<Integer> selectTopLDAFeatures(int numSelFeatures, ParallelTopicModel lda, Alphabet alphabet) {
    ArrayList<Integer> features = new ArrayList<Integer>();

    Alphabet seqAlphabet = lda.getAlphabet();
   
    int numTopics = lda.getNumTopics();
   
    Object[][] sorted = lda.getTopWords(seqAlphabet.size());

    for (int pos = 0; pos < seqAlphabet.size(); pos++) {
      for (int ti = 0; ti < numTopics; ti++) {
        Object feat = sorted[ti][pos].toString();
        int fi = alphabet.lookupIndex(feat,false);
        if ((fi >=0) && (!features.contains(fi))) {
          logger.info("Selected feature: " + feat);
View Full Code Here


  }

  public MaxEntOptimizableByLabelDistribution (InstanceList trainingSet, MaxEnt initialClassifier)
  {
    this.trainingList = trainingSet;
    Alphabet fd = trainingSet.getDataAlphabet();
    LabelAlphabet ld = (LabelAlphabet) trainingSet.getTargetAlphabet();
    // Don't fd.stopGrowth, because someone might want to do feature induction
    ld.stopGrowth();
    // Add one feature for the "default feature".
    this.numLabels = ld.size();
    this.numFeatures = fd.size() + 1;
    this.defaultFeatureIndex = numFeatures-1;
    this.parameters = new double [numLabels * numFeatures];
    this.constraints = new double [numLabels * numFeatures];
    this.cachedGradient = new double [numLabels * numFeatures];
    Arrays.fill (parameters, 0.0);
    Arrays.fill (constraints, 0.0);
    Arrays.fill (cachedGradient, 0.0);
    this.featureSelection = trainingSet.getFeatureSelection();
    this.perLabelFeatureSelection = trainingSet.getPerLabelFeatureSelection();
    // Add the default feature index to the selection
    if (featureSelection != null)
      featureSelection.add (defaultFeatureIndex);
    if (perLabelFeatureSelection != null)
      for (int i = 0; i < perLabelFeatureSelection.length; i++)
        perLabelFeatureSelection[i].add (defaultFeatureIndex);
    // xxx Later change this to allow both to be set, but select which one to use by a boolean flag?
    assert (featureSelection == null || perLabelFeatureSelection == null);
    if (initialClassifier != null) {
      this.theClassifier = initialClassifier;
      this.parameters = theClassifier.parameters;
      this.featureSelection = theClassifier.featureSelection;
      this.perLabelFeatureSelection = theClassifier.perClassFeatureSelection;
      this.defaultFeatureIndex = theClassifier.defaultFeatureIndex;
      assert (initialClassifier.getInstancePipe() == trainingSet.getPipe());
    }
    else if (this.theClassifier == null) {
      this.theClassifier = new MaxEnt (trainingSet.getPipe(), parameters, featureSelection, perLabelFeatureSelection);
    }
    cachedValueStale = true;
    cachedGradientStale = true;

    // Initialize the constraints
    logger.fine("Number of instances in training list = " + trainingList.size());
    for (Instance inst : trainingList) {
      double instanceWeight = trainingList.getInstanceWeight(inst);
      Labeling labeling = inst.getLabeling ();
      if (labeling == null)
        continue;
      //logger.fine ("Instance "+ii+" labeling="+labeling);
      FeatureVector fv = (FeatureVector) inst.getData ();
      Alphabet fdict = fv.getAlphabet();
      assert (fv.getAlphabet() == fd);

      // Here is the difference between this code and the single label
      //  version: rather than only picking out the "best" index,
      //  loop over all label indices.
     
      assert(labeling.numLocations() == trainingSet.getTargetAlphabet().size());
      for (int pos = 0; pos < labeling.numLocations(); pos++){
        MatrixOps.rowPlusEquals (constraints, numFeatures,
                     labeling.indexAtLocation(pos),
                     fv,
                     instanceWeight*labeling.valueAtLocation(pos));
      }

      assert(!Double.isNaN(instanceWeight)) : "instanceWeight is NaN";

      boolean hasNaN = false;
      for (int i = 0; i < fv.numLocations(); i++) {
        if (Double.isNaN(fv.valueAtLocation(i))) {
          logger.info("NaN for feature " + fdict.lookupObject(fv.indexAtLocation(i)).toString());
          hasNaN = true;
        }
      }
      if (hasNaN)
        logger.info("NaN in instance: " + inst.getName());
View Full Code Here

    /** No-argument constructor; its body is empty, so it performs no initialization itself. */
    public MaximizableTrainer (){}

    public MaximizableTrainer (InstanceList ilist, MCMaxEnt initialClassifier)
    {
      this.trainingList = ilist;
      Alphabet fd = ilist.getDataAlphabet();
      LabelAlphabet ld = (LabelAlphabet) ilist.getTargetAlphabet();
      // Don't fd.stopGrowth, because someone might want to do feature induction
      ld.stopGrowth();
      // Add one feature for the "default feature".
      this.numLabels = ld.size();
      this.numFeatures = fd.size() + 1;
      this.defaultFeatureIndex = numFeatures-1;
      this.parameters = new double [numLabels * numFeatures];
      this.constraints = new double [numLabels * numFeatures];
      this.cachedGradient = new double [numLabels * numFeatures];
      Arrays.fill (parameters, 0.0);
      Arrays.fill (constraints, 0.0);
      Arrays.fill (cachedGradient, 0.0);
      this.featureSelection = ilist.getFeatureSelection();
      this.perLabelFeatureSelection = ilist.getPerLabelFeatureSelection();
      // Add the default feature index to the selection
      if (featureSelection != null)
        featureSelection.add (defaultFeatureIndex);
      if (perLabelFeatureSelection != null)
        for (int i = 0; i < perLabelFeatureSelection.length; i++)
          perLabelFeatureSelection[i].add (defaultFeatureIndex);
      // xxx Later change this to allow both to be set, but select which one to use by a boolean flag?
      assert (featureSelection == null || perLabelFeatureSelection == null);
      if (initialClassifier != null) {

        this.theClassifier = initialClassifier;
        this.parameters = theClassifier.parameters;
        this.featureSelection = theClassifier.featureSelection;
        this.perLabelFeatureSelection = theClassifier.perClassFeatureSelection;
        this.defaultFeatureIndex = theClassifier.defaultFeatureIndex;
        assert (initialClassifier.getInstancePipe() == ilist.getPipe());
      }
      else if (this.theClassifier == null) {
        this.theClassifier = new MCMaxEnt (ilist.getPipe(), parameters, featureSelection, perLabelFeatureSelection);
      }
      cachedValueStale = true;
      cachedGradientStale = true;

      // Initialize the constraints
      logger.fine("Number of instances in training list = " + trainingList.size());
      for (Instance inst : trainingList) {
        double instanceWeight = trainingList.getInstanceWeight(inst);
        Labeling labeling = inst.getLabeling ();
        //logger.fine ("Instance "+ii+" labeling="+labeling);
        FeatureVector fv = (FeatureVector) inst.getData ();
        Alphabet fdict = fv.getAlphabet();
        assert (fv.getAlphabet() == fd);
        int li = labeling.getBestIndex();
        // The "2*" below is because there is one copy for the p(y|x)and another for the p(x|y).
        MatrixOps.rowPlusEquals (constraints, numFeatures, li, fv, 2*instanceWeight);
        // For the default feature, whose weight is 1.0
        assert(!Double.isNaN(instanceWeight)) : "instanceWeight is NaN";
        assert(!Double.isNaN(li)) : "bestIndex is NaN";
        boolean hasNaN = false;
        for(int i = 0; i < fv.numLocations(); i++) {
          if(Double.isNaN(fv.valueAtLocation(i))) {
            logger.info("NaN for feature " + fdict.lookupObject(fv.indexAtLocation(i)).toString());
            hasNaN = true;
          }
        }
        if(hasNaN)
          logger.info("NaN in instance: " + inst.getName());
View Full Code Here

  public Alphabet getAlphabet () { return classCentroidDistribution.getAlphabet(); }

  private static Alphabet dictOfSize (int size)
  {
    Alphabet ret = new Alphabet ();
    for (int i = 0; i < size; i++)
      ret.lookupIndex ("feature"+i);
    return ret;
  }
View Full Code Here

    // Prune clusters based on size.
    if (minClusterSize.value > 1) {
      for (int i = 0; i < clusterings.size(); i++) {
        Clustering clustering = clusterings.get(i);
        InstanceList oldInstances = clustering.getInstances();
        Alphabet alph = oldInstances.getDataAlphabet();
        LabelAlphabet lalph = (LabelAlphabet) oldInstances.getTargetAlphabet();
        if (alph == null) alph = new Alphabet();
        if (lalph == null) lalph = new LabelAlphabet();
        Pipe noop = new Noop(alph, lalph);
        InstanceList newInstances = new InstanceList(noop);
        for (int j = 0; j < oldInstances.size(); j++) {
          int label = clustering.getLabel(j);
View Full Code Here

    }

    Clustering[] clusterings = new Clustering[classDirs.value.length];
    int fi = 0;
    for (int i = 0; i < classDirs.value.length; i++) {
      Alphabet fieldAlph = new Alphabet();
      Alphabet valueAlph = new Alphabet();
      File directory = new File(classDirs.value[i]);
      File[] subdirs = getSubDirs(directory);
      Alphabet clusterAlph = new Alphabet();
      InstanceList instances = new InstanceList(new Noop());
      TIntArrayList labels = new TIntArrayList();
      for (int j = 0; j < subdirs.length; j++) {
        ArrayList<File> records = new FileIterator(subdirs[j]).getFileArray();
        int label = clusterAlph.lookupIndex(subdirs[j].toString());
        for (int k = 0; k < records.size(); k++) {
          if (fi % 100 == 0) System.out.print(fi);
          else if (fi % 10 == 0) System.out.print(".");
          if (fi % 1000 == 0 && fi > 0) System.out.println();
          System.out.flush();
View Full Code Here

  /** No-argument constructor; its body is empty, so it performs no initialization itself. */
  public DMROptimizable () {}

  public DMROptimizable (InstanceList instances, MaxEnt initialClassifier) {

    this.trainingList = instances;
    Alphabet alphabet = instances.getDataAlphabet();
    Alphabet labelAlphabet = instances.getTargetAlphabet();

    this.numLabels = labelAlphabet.size();

    // Add one feature for the "default feature".
    this.numFeatures = alphabet.size() + 1; // add a spot for the intercept term
           
    //System.out.println("num features: " + numFeatures + " numLabels: " + numLabels);
View Full Code Here

  public Alphabet getAlphabet () { return classCentroidDistribution.getAlphabet(); }

  private static Alphabet dictOfSize (int size)
  {
    Alphabet ret = new Alphabet ();
    for (int i = 0; i < size; i++)
      ret.lookupIndex ("feature"+i);
    return ret;
  }
View Full Code Here

    this (dictionary, dictionary);
  }

  /**
   * No-argument constructor: delegates to the single-argument constructor,
   * passing it a newly constructed Alphabet.
   */
  public FeatureTransducer ()
  {
    this (new Alphabet ());
  }
View Full Code Here

  public void testGetSetParameters()
  {
    int inputVocabSize = 100;
    int numStates = 5;
    Alphabet inputAlphabet = new Alphabet();
    for (int i = 0; i < inputVocabSize; i++)
      inputAlphabet.lookupIndex("feature" + i);
    Alphabet outputAlphabet = new Alphabet();
    MEMM memm = new MEMM (inputAlphabet, outputAlphabet);
    String[] stateNames = new String[numStates];
    for (int i = 0; i < numStates; i++)
      stateNames[i] = "state" + i;
    memm.addFullyConnectedStates(stateNames);
View Full Code Here

TOP

Related Classes of cc.mallet.types.Alphabet

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.