Package weka.core

Examples of weka.core.Stopwords
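
Before the project snippets, a minimal self-contained sketch of how weka.core.Stopwords is typically used. It relies only on the calls that appear in the snippets below (the no-arg constructor, read(File), is(String) and elements()); the stoplist path and sample tokens are illustrative:

  import java.io.File;
  import java.util.Enumeration;
  import weka.core.Stopwords;

  public class StopwordsDemo {
    public static void main(String[] args) throws Exception {
      // Starts out with the built-in default stopword list
      Stopwords stopwords = new Stopwords();

      // Optionally load a custom stoplist file (one word per line);
      // "stoplist.txt" is just an example path
      File stoplist = new File("stoplist.txt");
      if (stoplist.exists() && !stoplist.isDirectory()) {
        stopwords.read(stoplist);
      }

      // Check individual tokens
      for (String token : new String[]{"the", "weka", "and", "classifier"}) {
        System.out.println(token + " -> stopword? " + stopwords.is(token));
      }

      // Enumerate the words currently in the list
      int n = 0;
      Enumeration words = stopwords.elements();
      while (words.hasMoreElements()) {
        words.nextElement();
        n++;
      }
      System.out.println(n + " stopwords loaded");
    }
  }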


  /**
   * Determines the dictionary.
   */
  private void determineDictionary() {
    // Initialize stopwords; a custom stoplist file is only read if it
    // exists and is not a directory.
    Stopwords stopwords = new Stopwords();
    if (getUseStoplist()) {
      try {
        if (getStopwords().exists() && !getStopwords().isDirectory())
          stopwords.read(getStopwords());
      }
      catch (Exception e) {
        e.printStackTrace();
      }
    }

    // Operate on a per-class basis if a class attribute is set
    int classInd = getInputFormat().classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
      values = getInputFormat().attribute(classInd).numValues();
    }

    // One dictionary per class value (a single one otherwise)
    TreeMap[] dictionaryArr = new TreeMap[values];
    for (int i = 0; i < values; i++) {
      dictionaryArr[i] = new TreeMap();
    }

    // Make sure we know which fields to convert
    determineSelectedRange();

    // Tokenize all training text into an ordered map of "words".
    long pruneRate =
      Math.round((m_PeriodicPruningRate / 100.0) * getInputFormat().numInstances());
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      Instance instance = getInputFormat().instance(i);
      int vInd = 0;
      if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
        vInd = (int) instance.classValue();
      }

      // Iterate through all relevant string attributes of the current instance
      Hashtable h = new Hashtable();
      for (int j = 0; j < instance.numAttributes(); j++) {
        if (m_SelectedRange.isInRange(j) && !instance.isMissing(j)) {

          // Tokenize the attribute's string value
          m_Tokenizer.tokenize(instance.stringValue(j));

          // Iterate through tokens, perform stemming, and remove stopwords
          // (if required)
          while (m_Tokenizer.hasMoreElements()) {
            String word = ((String) m_Tokenizer.nextElement()).intern();

            if (m_lowerCaseTokens)
              word = word.toLowerCase();

            word = m_Stemmer.stem(word);

            if (m_useStoplist && stopwords.is(word))
              continue;

            // Record that this word occurs in the current document
            if (!h.containsKey(word))
              h.put(word, new Integer(0));

            // ... (remainder of the method omitted in this snippet)


    // ... (preceding code omitted in this snippet)
    } else {
      m_inputVector.clear();
    }

    // Lazily create the stopwords handler the first time it is needed,
    // reading a custom stoplist file only if it exists and is not a directory
    if (m_useStopList && m_stopwords == null) {
      m_stopwords = new Stopwords();
      try {
        if (getStopwords().exists() && !getStopwords().isDirectory()) {
          m_stopwords.read(getStopwords());
        }
      } catch (Exception ex) {
        ex.printStackTrace();
      }
    }


        this.selectedAttributeName = selectedAttributeName;
    }

    /**
     * Collects the words managed by a Stopwords object into a HashSet.
     * If a stoplist file has been configured, it is read first.
     */
    private HashSet<String> prepareStopwords() {
        // Initialize stopwords, optionally from a custom stoplist file
        Stopwords stops = new Stopwords();
        if (getStopList() != null) {
            try {
                if (getStopList().exists() && !getStopList().isDirectory()) {
                    stops.read(getStopList());
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        // Copy the words into a HashSet for fast lookups
        HashSet<String> result = new HashSet<String>();
        Enumeration<String> words = (Enumeration<String>) stops.elements();
        while (words.hasMoreElements()) {
            result.add(words.nextElement());
        }
        return result;
    }
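
As a usage note: copying the words into a HashSet<String>, as prepareStopwords() above does, makes each subsequent token check a plain contains() lookup. A small, self-contained sketch along those lines; the class name and token list are illustrative:

    import java.util.ArrayList;
    import java.util.Enumeration;
    import java.util.HashSet;
    import java.util.List;
    import weka.core.Stopwords;

    public class StopwordFilterDemo {
        public static void main(String[] args) {
            // Copy the default stopword list into a HashSet for cheap lookups
            HashSet<String> stopSet = new HashSet<String>();
            Enumeration words = new Stopwords().elements();
            while (words.hasMoreElements()) {
                stopSet.add((String) words.nextElement());
            }

            // Filter a made-up token list against the set
            String[] tokens = {"the", "nearest", "neighbour", "of", "an", "instance"};
            List<String> kept = new ArrayList<String>();
            for (String token : tokens) {
                if (!stopSet.contains(token.toLowerCase())) {
                    kept.add(token);
                }
            }
            System.out.println(kept);  // common stopwords such as "the", "of", "an" are dropped
        }
    }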
