Package dmt.features.definitions

Source Code of dmt.features.definitions.NumberOfCharsStemmedFeature

package dmt.features.definitions;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.Vector;
import java.util.regex.Pattern;

import org.tartarus.snowball.SnowballStemmer;

import dmt.config.Configuration;
import dmt.features.Feature;
import dmt.tools.CSVFileReader;
import dmt.tools.CSVFileWriter;

public class NumberOfCharsStemmedFeature extends Feature
{
 
  private static Vector<String> stopWords = null;
 
  private static HashMap<String, Integer> stemWords = new HashMap<String, Integer>();
  private static HashMap<String, Integer> stemWordsTotal = new HashMap<String, Integer>();
  private static HashMap<String, String> documentsWords = new HashMap<String, String>();
 
  @Override
  public String[] getNames()
  {
    return new String[] { "Final Number of Chars" };
  }

  @Override
  public Object[] getValues(Vector<String> fields,
      HashMap<String, Boolean> hashMap)
  {
    String text = fields.get(3).toLowerCase();
    loadStopWords();
    SnowballStemmer stemmer = new org.tartarus.snowball.ext.englishStemmer();
   
    for(int i=0; i < stopWords.size(); i++)
    {
      text = text.replaceAll("[\\s]"+stopWords.get(i)+"[\\s]", " ");
    }
   
    Pattern p = Pattern.compile("[^a-zA-Z]+");
    String [] words = p.split(text);
    text = "";
    for(int i=0; i < words.length; i++)
    {
      if(words[i].length() < 3)
      {
        continue;
      }
     
      stemmer.setCurrent(words[i]);
      stemmer.stem();
      String stemmed = stemmer.getCurrent();
     
      if(VerifyParents.getParent(fields.get(1)) == null)
      {
        if(stemWordsTotal.containsKey(stemmed))
        {
          stemWordsTotal.put(stemmed, stemWordsTotal.get(stemmed) + 1);
        }
        else
        {
          stemWordsTotal.put(stemmed, 1);
        }
      }
     
      /*if(text.indexOf(stemmed) >= 0)
      {
        continue;
      }*/
     
      if(VerifyParents.getParent(fields.get(1)) == null)
      {
        if(documentsWords.containsKey(stemmed))
        {
          if(documentsWords.get(stemmed).indexOf(fields.get(0)) < 0)
          {
            documentsWords.put(stemmed, documentsWords.get(stemmed) + " " + fields.get(0));
            stemWords.put(stemmed, stemWords.get(stemmed) + 1);
          }
        }
        else
        {
          stemWords.put(stemmed, 1);
          documentsWords.put(stemmed, fields.get(0));
        }
       
      }
     
     
      text += stemmed;
    }
    return new Object[] { new Integer(text.length())};
  }
 
  private static void loadStopWords()
  {
    if(stopWords != null)
    {
      return;
    }
   
    CSVFileReader in;
    try
    {
      in = new CSVFileReader(Configuration.STOP_WORDS_LIST_WITH_CONTRACTIONS, ',');
      stopWords = in.readFields();
      in.close();
    } catch (IOException e)
    {
      e.printStackTrace();
    }
  }
 
  public static void printWords() throws IOException
  {
    CSVFileWriter out = new CSVFileWriter(Configuration.OUTPUT_BAG_OF_WORDS_ALL, ',');
    Object[] keys = stemWords.keySet().toArray();
   
    for(int i=0; i< keys.length; i++)
    {
      Vector<String> fields = new Vector<String>();
      fields.add(keys[i].toString());
      fields.add(stemWords.get(keys[i]).toString());
      fields.add(stemWordsTotal.get(keys[i]).toString());
      out.writeFields(fields);
    }
    out.close();
  }

}
TOP

Related Classes of dmt.features.definitions.NumberOfCharsStemmedFeature

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.