Examples of weka.core.Stopwords

weka.core.Stopwords
Rainbow (http://www.cs.cmu.edu/~mccallum/bow/rainbow/).
Accepts the following parameters:
-i file
loads the stopwords from the given file
-o file
saves the stopwords to the given file
-p
outputs the current stopwords on stdout
Any additional parameters are interpreted as words to test as stopwords.
@author Eibe Frank (eibe@cs.waikato.ac.nz)
@author Ashraf M. Kibriya (amk14@cs.waikato.ac.nz)
@author FracPete (fracpete at waikato dot ac dot nz)
@version $Revision: 1.6 $

  /**
   * determines the dictionary.
   */
  private void determineDictionary() {
    // initialize stopwords
    Stopwords stopwords = new Stopwords();
    if (getUseStoplist()) {
      try {
  if (getStopwords().exists() && !getStopwords().isDirectory())
    stopwords.read(getStopwords());
      }
      catch (Exception e) {
  e.printStackTrace();
      }
    }


    // Operate on a per-class basis if class attribute is set
    int classInd = getInputFormat().classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
      values = getInputFormat().attribute(classInd).numValues();
    }


    //TreeMap dictionaryArr [] = new TreeMap[values];
    TreeMap [] dictionaryArr = new TreeMap[values];
    for (int i = 0; i < values; i++) {
      dictionaryArr[i] = new TreeMap();
    }


    // Make sure we know which fields to convert
    determineSelectedRange();


    // Tokenize all training text into an orderedMap of "words".
    long pruneRate = 
      Math.round((m_PeriodicPruningRate/100.0)*getInputFormat().numInstances());
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      Instance instance = getInputFormat().instance(i);
      int vInd = 0;
      if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
  vInd = (int)instance.classValue();
      }


      // Iterate through all relevant string attributes of the current instance
      Hashtable h = new Hashtable();
      for (int j = 0; j < instance.numAttributes(); j++) { 
  if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {


    // Get tokenizer
    m_Tokenizer.tokenize(instance.stringValue(j));


    // Iterate through tokens, perform stemming, and remove stopwords
    // (if required)
    while (m_Tokenizer.hasMoreElements()) {
      String word = ((String)m_Tokenizer.nextElement()).intern();


      if(this.m_lowerCaseTokens==true)
        word = word.toLowerCase();


      word = m_Stemmer.stem(word);


      if(this.m_useStoplist==true)
        if(stopwords.is(word))
    continue;


      if(!(h.contains(word)))
        h.put(word, new Integer(0));

View Full Code Here

  /**
   * determines the dictionary.
   */
  private void determineDictionary() {
    // initialize stopwords
    Stopwords stopwords = new Stopwords();
    if (getUseStoplist()) {
      try {
  if (getStopwords().exists() && !getStopwords().isDirectory())
    stopwords.read(getStopwords());
      }
      catch (Exception e) {
  e.printStackTrace();
      }
    }


    // Operate on a per-class basis if class attribute is set
    int classInd = getInputFormat().classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
      values = getInputFormat().attribute(classInd).numValues();
    }


    //TreeMap dictionaryArr [] = new TreeMap[values];
    TreeMap [] dictionaryArr = new TreeMap[values];
    for (int i = 0; i < values; i++) {
      dictionaryArr[i] = new TreeMap();
    }


    // Make sure we know which fields to convert
    determineSelectedRange();


    // Tokenize all training text into an orderedMap of "words".
    long pruneRate = 
      Math.round((m_PeriodicPruningRate/100.0)*getInputFormat().numInstances());
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      Instance instance = getInputFormat().instance(i);
      int vInd = 0;
      if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
  vInd = (int)instance.classValue();
      }


      // Iterate through all relevant string attributes of the current instance
      Hashtable h = new Hashtable();
      for (int j = 0; j < instance.numAttributes(); j++) { 
  if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {


    // Get tokenizer
    m_Tokenizer.tokenize(instance.stringValue(j));


    // Iterate through tokens, perform stemming, and remove stopwords
    // (if required)
    while (m_Tokenizer.hasMoreElements()) {
      String word = ((String)m_Tokenizer.nextElement()).intern();


      if(this.m_lowerCaseTokens==true)
        word = word.toLowerCase();


      word = m_Stemmer.stem(word);


      if(this.m_useStoplist==true)
        if(stopwords.is(word))
    continue;


      if(!(h.contains(word)))
        h.put(word, new Integer(0));

View Full Code Here

    } else {
      m_inputVector.clear();
    }
    
    if (m_useStopList && m_stopwords == null) {
      m_stopwords = new Stopwords();
      try {
        if (getStopwords().exists() && !getStopwords().isDirectory()) {
          m_stopwords.read(getStopwords());
        }
      } catch (Exception ex) {

View Full Code Here

  /**
   * determines the dictionary.
   */
  private void determineDictionary() {
    // initialize stopwords
    Stopwords stopwords = new Stopwords();
    if (getUseStoplist()) {
      try {
  if (getStopwords().exists() && !getStopwords().isDirectory())
    stopwords.read(getStopwords());
      }
      catch (Exception e) {
  e.printStackTrace();
      }
    }


    // Operate on a per-class basis if class attribute is set
    int classInd = getInputFormat().classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
      values = getInputFormat().attribute(classInd).numValues();
    }


    //TreeMap dictionaryArr [] = new TreeMap[values];
    TreeMap [] dictionaryArr = new TreeMap[values];
    for (int i = 0; i < values; i++) {
      dictionaryArr[i] = new TreeMap();
    }


    // Make sure we know which fields to convert
    determineSelectedRange();


    // Tokenize all training text into an orderedMap of "words".
    long pruneRate = 
      Math.round((m_PeriodicPruningRate/100.0)*getInputFormat().numInstances());
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      Instance instance = getInputFormat().instance(i);
      int vInd = 0;
      if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
  vInd = (int)instance.classValue();
      }


      // Iterate through all relevant string attributes of the current instance
      Hashtable h = new Hashtable();
      for (int j = 0; j < instance.numAttributes(); j++) { 
  if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {


    // Get tokenizer
    m_Tokenizer.tokenize(instance.stringValue(j));


    // Iterate through tokens, perform stemming, and remove stopwords
    // (if required)
    while (m_Tokenizer.hasMoreElements()) {
      String word = ((String)m_Tokenizer.nextElement()).intern();


      if(this.m_lowerCaseTokens==true)
        word = word.toLowerCase();


      word = m_Stemmer.stem(word);


      if(this.m_useStoplist==true)
        if(stopwords.is(word))
    continue;


      if(!(h.contains(word)))
        h.put(word, new Integer(0));

View Full Code Here

        this.selectedAttributeName = selectedAttributeName;
    }


    private HashSet<String> prepareStopwords() {
        // initialize stopwords
        Stopwords stops = new Stopwords();
        if (getStopList() != null) {
            try {
                if (getStopList().exists() && !getStopList().isDirectory()) {
                    stops.read(getStopList());
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        
        HashSet<String> result = new HashSet<String>();
        Enumeration<String> words = (Enumeration<String>)stops.elements();
        while (words.hasMoreElements()) {
            result.add(words.nextElement());
        }
        return result;
    }

View Full Code Here

TOP

Related Classes of weka.core.Stopwords

etc.aloe.filters.WordFeaturesExtractor

weka.classifiers.functions.SGDText

weka.filters.unsupervised.attribute.StringToWordVector

java.util.HashSet

java.util.Iterator

java.io.FileReader

java.io.BufferedReader

java.util.Vector

java.io.FileWriter

java.util.Date

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact: coftware#gmail.com.