Package org.fnlp.ml.feature

Source Code of org.fnlp.ml.feature.FeatureSelect

package org.fnlp.ml.feature;

import gnu.trove.iterator.TIntFloatIterator;

import java.util.ArrayList;
import java.util.Arrays;

import org.fnlp.ml.classifier.bayes.Heap;
import org.fnlp.ml.classifier.bayes.ItemFrequency;
import org.fnlp.ml.types.sv.HashSparseVector;
/**
* 特征选择 保存选择后的特征
* @author sywu
*
*/
public class FeatureSelect {
  private boolean isUseful[];
  int size;
  public FeatureSelect(int size){
    this.size=size;
    isUseful=new boolean[size];
    Arrays.fill(isUseful, true);
  }
  /**
   * 特征选择 卡方法 保留特征为每个类别前percent的最大卡方值特征的并集
   * @param tf 词频类
   * @param percent 保留百分比
   */
  public void fS_CS(ItemFrequency tf,float percent){featureSelectionChiSquare(tf,percent);}
  public void featureSelectionChiSquare(ItemFrequency tf,float percent){
    int feaSize=tf.getFeatureSize();
    int typeSize=tf.getTypeSize();
    isUseful=new boolean[feaSize];
    Arrays.fill(isUseful, false);
    for(int j=0;j<typeSize;j++){
      Heap<Integer> heap=new Heap<Integer>((int)(feaSize*percent),true);
      for(int i=0;i<feaSize;i++){
        double A,B,C,D,AC,AB,N;
        N=tf.getTotal();
        A=tf.getItemFrequency(i, j);
        AB=tf.getFeatureFrequency(i);
        AC=tf.getTypeFrequency(j);
        B=AB-A;
        C=AC-A;
        D=N-AB-C;
        double score=(A*D-B*C)*(A*D-B*C)/AB/(C+D);
        heap.insert(score, i);
      }
      ArrayList<Integer> data=heap.getData();
      for(int i=1;i<data.size();i++)
        isUseful[data.get(i)]=true;
    }
    int total=0;
    for(int i=0;i<feaSize;i++){
      if(isUseful[i])
        total++;
    }
    System.out.println("Feature Selection"+total+"/"+feaSize);
  }
  /**
   * 特征选择 卡方法
   * 对特征对每个类别求卡方,保留最大项,保留结果为前percent的特征
   * @param tf 词频类
   * @param percent 保留百分比
   */
  public void fS_CS_Max(ItemFrequency tf,float percent){featureSelectionChiSquareMax(tf,percent);}
  public void featureSelectionChiSquareMax(ItemFrequency tf,float percent){
    int feaSize=tf.getFeatureSize();
    int typeSize=tf.getTypeSize();
    isUseful=new boolean[feaSize];
    Arrays.fill(isUseful, false);
    Heap<Integer> heap=new Heap<Integer>((int)(feaSize*percent),true);
    for(int i=0;i<feaSize;i++){
      double max=0;
      for(int j=0;j<typeSize;j++){
        double A,B,C,D,AC,AB,N;
        N=tf.getTotal();
        A=tf.getItemFrequency(i, j);
        AB=tf.getFeatureFrequency(i);
        AC=tf.getTypeFrequency(j);
        B=AB-A;
        C=AC-A;
        D=N-AB-C;
        double score=(A*D-B*C)*(A*D-B*C)/AB/(C+D)/AC/(B+D);
        if(score>max)
          max=score;
      }

      heap.insert(max, i);
    }
    ArrayList<Integer> data=heap.getData();
    for(int i=1;i<data.size();i++)
      isUseful[data.get(i)]=true;
    int total=0;
    for(int i=0;i<feaSize;i++){
      if(isUseful[i])
        total++;
    }
    System.out.println("Feature Selection"+total+"/"+feaSize);
 
  /**
   * 特征选择 信息增益
   * @param tf 词频类
   * @param percent 保留百分比
   */
  public void fS_IG(ItemFrequency tf,float percent){featureSelectionInformationGain(tf,percent);}
  public void featureSelectionInformationGain(ItemFrequency tf,float percent){
    int feaSize=tf.getFeatureSize();
    int typeSize=tf.getTypeSize();
    isUseful=new boolean[feaSize];
    Arrays.fill(isUseful, false);
    Heap<Integer> heap=new Heap<Integer>((int)(feaSize*percent),true);
    for(int i=0;i<feaSize;i++){
      double ig=0;
      for(int j=0;j<typeSize;j++){
        double A,B,C,D,AC,AB,N;
        N=tf.getTotal();
        A=tf.getItemFrequency(i, j);
        AB=tf.getFeatureFrequency(i);
        AC=tf.getTypeFrequency(j);
        B=AB-A;
        C=AC-A;
        D=N-AB-C;
        ig+=-AC/N*Math.log(AC/N);
        ig+=AB/N*A/N*Math.log(A/N);
        ig+=(1-AB/N)*C/N*Math.log(C/N);;
      }
      heap.insert(ig, i);
    }
    ArrayList<Integer> data=heap.getData();
    for(int i=1;i<data.size();i++)
      isUseful[data.get(i)]=true;
    int total=0;
    for(int i=0;i<feaSize;i++){
      if(isUseful[i])
        total++;
    }
    System.out.println("Feature Selection"+total+"/"+feaSize);
  }
  public void noFeatureSelection(){
    Arrays.fill(isUseful, true);
  }
  public HashSparseVector select(HashSparseVector vec){
    HashSparseVector sv=new HashSparseVector();   
    TIntFloatIterator it=vec.data.iterator();
    while(it.hasNext()){
      it.advance();
      if(isUseful[it.key()])
        sv.put(it.key(), it.value());
    }
    return sv;
  }
}
TOP

Related Classes of org.fnlp.ml.feature.FeatureSelect

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.