Package edu.illinois.lis.document

Source Code of edu.illinois.lis.document.FeatureVector

package edu.illinois.lis.document;


import java.io.IOException;
import java.io.StringReader;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

import edu.illinois.lis.utils.KeyValuePair;
import edu.illinois.lis.utils.ScorableComparator;
import edu.illinois.lis.utils.Stopper;






/**
* Simple container mapping term->count pairs grabbed from an input text.
*
* @author Miles Efron
*
*/
public class FeatureVector  {
  private static StandardAnalyzer analyzer; 
  private Map<String, Double> features;
  private Stopper stopper;
  private double length = 0.0;


  // CONSTRUCTORS 
  public FeatureVector(String text, Stopper stopper) {
    this.stopper = stopper;
    if(stopper==null || stopper.asSet().size()==0) {
      analyzer = new StandardAnalyzer(Version.LUCENE_41, CharArraySet.EMPTY_SET);
    } else {
      CharArraySet charArraySet = new CharArraySet(Version.LUCENE_41, stopper.asSet(), true);
      analyzer = new StandardAnalyzer(Version.LUCENE_41, charArraySet);
    }
    features = new HashMap<String, Double>();
    List<String> terms = this.analyze(text);
    Iterator<String> termsIt = terms.iterator();
    while(termsIt.hasNext()) {
      String term = termsIt.next();
      length += 1.0;
      Double val = (Double)features.get(term);
      if(val == null) {
        features.put(term, new Double(1.0));
      } else {
        double v = val.doubleValue() + 1.0;
        features.put(term, new Double(v));
      }
    }
  }

  public FeatureVector(Stopper stopper) {
    this.stopper = stopper;
    if(stopper==null || stopper.asSet().size()==0) {
      analyzer = new StandardAnalyzer(Version.LUCENE_41, CharArraySet.EMPTY_SET);
    } else {
      CharArraySet charArraySet = new CharArraySet(Version.LUCENE_41, stopper.asSet(), true);
      analyzer = new StandardAnalyzer(Version.LUCENE_41, charArraySet);
    }
    features = new HashMap<String,Double>();
  }






  // MUTATORS

  /**
   * Add all the terms in a string to this vector
   * @param text a space-delimited string where we want to add each word.
   */
  public void addText(String text) {
    List<String> terms = this.analyze(text);
    Iterator<String> termsIt = terms.iterator();
    while(termsIt.hasNext()) {
      String term = termsIt.next();   
      addTerm(term);
    }
  }

  /**
   * Add a term to this vector.  if it's already here, increment its count.
   * @param term
   */
  public void addTerm(String term) {
    if(stopper != null && stopper.isStopWord(term))
      return;

    Double freq = ((Double)features.get(term));
    if(freq == null) {
      features.put(term, new Double(1.0));
    } else {
      double f = freq.doubleValue();
      features.put(term, new Double(f+1.0));
    }
    length += 1.0;
  }


  /**
   * Add a term to this vector with this weight.  if it's already here, supplement its weight.
   * @param term
   * @param weight
   */
  public void addTerm(String term, double weight) {
    Double w = ((Double)features.get(term));
    if(w == null) {
      features.put(term, new Double(weight));
    } else {
      double f = w.doubleValue();
      features.put(term, new Double(f+weight));
    }
    length += weight;
  }

  /**
   * in case we want to override the derived length.
   * @param length
   */
  public void setLength(double length) {
    this.length = length;
  }

  public void pruneToSize(int k) {
    List<KeyValuePair> kvpList = getOrderedFeatures();

    Iterator<KeyValuePair> it = kvpList.iterator();

    Map<String,Double> newMap = new HashMap<String,Double>(k);
    int i=0;
    while(it.hasNext()) {
      KeyValuePair kvp = it.next();
      newMap.put((String)kvp.getKey(), kvp.getScore());
      if(i++ > k)
        break;
    }

    features = (HashMap<String, Double>) newMap;

  }

  public void normalizeToOne() {
    Map<String,Double> f = new HashMap<String,Double>(features.size());

    Iterator<String> it = features.keySet().iterator();
    while(it.hasNext()) {
      String feature = it.next();
      double obs = features.get(feature);
      f.put(feature, obs/length);
    }

    features = f;
  }


  // ACCESSORS

  public Set<String> getFeatures() {
    return features.keySet();
  }

  public double getLength() {
    return length;
  }

  public int getDimensions() {
    return features.size();
  }

  public double getFeaturetWeight(String feature) {
    Double w = (Double)features.get(feature);
    return (w==null) ? 0.0 : w.doubleValue();
  }

  public Iterator<String> iterator() {
    return features.keySet().iterator();
  }

  public boolean contains(Object key) {
    return features.containsKey(key);
  }

  public double getVectorNorm() {
    double norm = 0.0;
    Iterator<String> it = features.keySet().iterator();
    while(it.hasNext()) {
      norm += Math.pow(features.get(it.next()), 2.0);
    }
    return Math.sqrt(norm);
  }



  // VIEWING

  @Override
  public String toString() {
    return this.toString(features.size());
  }

  private List<KeyValuePair> getOrderedFeatures() {
    List<KeyValuePair> kvpList = new ArrayList<KeyValuePair>(features.size());
    Iterator<String> featureIterator = features.keySet().iterator();
    while(featureIterator.hasNext()) {
      String feature = featureIterator.next();
      double value   = features.get(feature);
      KeyValuePair keyValuePair = new KeyValuePair(feature, value);
      kvpList.add(keyValuePair);
    }
    ScorableComparator comparator = new ScorableComparator(true);
    Collections.sort(kvpList, comparator);

    return kvpList;
  }

  public String toString(int k) {
    DecimalFormat format = new DecimalFormat("#.#########");
    StringBuilder b = new StringBuilder();
    List<KeyValuePair> kvpList = getOrderedFeatures();
    Iterator<KeyValuePair> it = kvpList.iterator();
    int i=0;
    while(it.hasNext() && i++ < k) {
      KeyValuePair pair = it.next();
      b.append(format.format(pair.getScore()) + " " + pair.getKey() + "\n");
    }
    return b.toString();

  }


  // UTILS
  public List<String> analyze(String text) {
    List<String> result = new LinkedList<String>();
    try {
      TokenStream stream = null;
      stream = analyzer.tokenStream("text", new StringReader(text));

      CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while(stream.incrementToken()) {
        String term = charTermAttribute.toString();
        result.add(term);
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
    return result;
  }

  public static FeatureVector interpolate(FeatureVector x, FeatureVector y, double xWeight) {
    FeatureVector z = new FeatureVector(null);
    Set<String> vocab = new HashSet<String>();
    vocab.addAll(x.getFeatures());
    vocab.addAll(y.getFeatures());
    Iterator<String> features = vocab.iterator();
    while(features.hasNext()) {
      String feature = features.next();
      double weight  = xWeight*x.getFeaturetWeight(feature) + (1.0-xWeight)*y.getFeaturetWeight(feature);
      z.addTerm(feature, weight);
    }
    return z;
  }


  public static void main(String[] args) {
    String text = "This. This is NOT a test, nor is it better than 666!";

    Stopper stopper = new Stopper();
    stopper.addStopword("this");
    stopper.addStopword("is");
    stopper.addStopword("better");
   
    FeatureVector featureVector = new FeatureVector(stopper);
    List<String> terms = featureVector.analyze(text);
    Iterator<String> termIterator = terms.iterator();
    while(termIterator.hasNext()) {
      System.out.println(termIterator.next());
    }
  }


}
TOP

Related Classes of edu.illinois.lis.document.FeatureVector

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.