// Source code of cc.mallet.pipe.TokenSequenceRemoveStopwords

/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */










package cc.mallet.pipe;




import java.util.HashSet;
import java.util.ArrayList;
import java.io.*;


import cc.mallet.types.FeatureSequenceWithBigrams;
import cc.mallet.types.Instance;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
/**
 * Removes from the token sequence in the data field every token whose text
 * is in the stopword list.
 *
 * @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */


public class TokenSequenceRemoveStopwords extends Pipe implements Serializable
{
  // xxx Use a gnu.trove collection instead
  // Words to drop: pipe() tests each token's text for membership in this set.
  // Every constructor of this class assigns a non-null set.
  HashSet<String> stoplist = null;
  // When false, pipe() lower-cases a token's text before the stoplist lookup.
  // Every constructor of this class overwrites this initial value.
  boolean caseSensitive = true;
  // When true, pipe() stores the text of each removed token as a deletion-mark
  // property on the most recent token that was kept.
  boolean markDeletions = false;


  private HashSet<String> newDefaultStopList ()
  {
    HashSet<String> sl = new HashSet<String>();
    for (int i = 0; i < stopwords.length; i++)
      sl.add (stopwords[i]);
    return sl;
  }




  public TokenSequenceRemoveStopwords (boolean caseSensitive, boolean markDeletions)
  {
    stoplist = newDefaultStopList();
    this.caseSensitive = caseSensitive;
    this.markDeletions = markDeletions;
  }


  public TokenSequenceRemoveStopwords (boolean caseSensitive)
  {
    stoplist = newDefaultStopList();
    this.caseSensitive = caseSensitive;
  }


  public TokenSequenceRemoveStopwords ()
  {
    this (false);
  }


  /**
   *  Load a stoplist from a file.
   *  @param stoplistFile    The file to load
   *  @param encoding        The encoding of the stoplist file (eg UTF-8)
   *  @param includeDefault  Whether to include the standard mallet English stoplist
   */
  public TokenSequenceRemoveStopwords(File stoplistFile, String encoding, boolean includeDefault,
                    boolean caseSensitive, boolean markDeletions) {
    if (! includeDefault) { stoplist = new HashSet<String>(); }
    else { stoplist = newDefaultStopList(); }


    addStopWords (fileToStringArray(stoplistFile, encoding));


    this.caseSensitive = caseSensitive;
        this.markDeletions = markDeletions;
  }


  public TokenSequenceRemoveStopwords setCaseSensitive (boolean flag)
  {
    this.caseSensitive = flag;
    return this;
  }


  public TokenSequenceRemoveStopwords setMarkDeletions (boolean flag)
  {
    this.markDeletions = flag;
    return this;
  }


  public TokenSequenceRemoveStopwords addStopWords (String[] words)
  {
    for (int i = 0; i < words.length; i++)
      stoplist.add (words[i]);
    return this;
  }




  public TokenSequenceRemoveStopwords removeStopWords (String[] words)
  {
    for (int i = 0; i < words.length; i++)
      stoplist.remove (words[i]);
    return this;
  }


  /** Remove whitespace-separated tokens in file "wordlist" to the stoplist. */
  public TokenSequenceRemoveStopwords removeStopWords (File wordlist)
  {
    this.removeStopWords (fileToStringArray(wordlist, null));
    return this;
  }


  /** Add whitespace-separated tokens in file "wordlist" to the stoplist. */
  public TokenSequenceRemoveStopwords addStopWords (File wordlist)
  {
    if (wordlist != null)
      this.addStopWords (fileToStringArray(wordlist, null));
    return this;
  }




  private String[] fileToStringArray (File f, String encoding)
  {
    ArrayList<String> wordarray = new ArrayList<String>();


    try {


      BufferedReader input = null;
      if (encoding == null) {
        input = new BufferedReader (new FileReader (f));
      }
      else {
        input = new BufferedReader( new InputStreamReader( new FileInputStream(f), encoding ));
      }
      String line;


      while (( line = input.readLine()) != null) {
        String[] words = line.split ("\\s+");
        for (int i = 0; i < words.length; i++)
          wordarray.add (words[i]);
      }


    } catch (IOException e) {
      throw new IllegalArgumentException("Trouble reading file "+f);
    }
    return (String[]) wordarray.toArray(new String[]{});
  }
  
  public Instance pipe (Instance carrier)
  {
    TokenSequence ts = (TokenSequence) carrier.getData();
    // xxx This doesn't seem so efficient.  Perhaps have TokenSequence
    // use a LinkedList, and remove Tokens from it? -?
    // But a LinkedList implementation of TokenSequence would be quite inefficient -AKM
    TokenSequence ret = new TokenSequence ();
    Token prevToken = null;
    for (int i = 0; i < ts.size(); i++) {
      Token t = ts.get(i);
      if (! stoplist.contains (caseSensitive ? t.getText() : t.getText().toLowerCase())) {
        // xxx Should we instead make and add a copy of the Token?
        ret.add (t);
        prevToken = t;
      } else if (markDeletions && prevToken != null)
        prevToken.setProperty (FeatureSequenceWithBigrams.deletionMark, t.getText());
    }
    carrier.setData(ret);
    return carrier;
  }


  // Serialization 
  
  // Class-level identifier used by Java serialization.
  private static final long serialVersionUID = 1;
  // Format version that writeObject writes first; readObject uses it to decide
  // which fields are present in the stream (version 2 added the stoplist).
  private static final int CURRENT_SERIAL_VERSION = 2;
  
  /**
   * Write this pipe's state in its custom serial format: the format version,
   * the two boolean options, then the stoplist. readObject consumes the
   * values in this same order, so the order must not change.
   */
  private void writeObject (ObjectOutputStream out) throws IOException {
    out.writeInt (CURRENT_SERIAL_VERSION);
    out.writeBoolean(caseSensitive);
    out.writeBoolean(markDeletions);
    out.writeObject(stoplist); // New as of CURRENT_SERIAL_VERSION 2
  }
  
  private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
    int version = in.readInt ();
    caseSensitive = in.readBoolean();
    if (version > 0)
      markDeletions = in.readBoolean();
    if (version > 1) {
      stoplist = (HashSet<String>) in.readObject();
    }


  }


  
  /**
   * Built-in default stoplist: lower-case English words plus the single
   * letters a-z. newDefaultStopList() copies these entries into a HashSet,
   * so the repeated "would" entry has no effect on the resulting set.
   */
  static final String[] stopwords =
  {
    "a",
    "able",
    "about",
    "above",
    "according",
    "accordingly",
    "across",
    "actually",
    "after",
    "afterwards",
    "again",
    "against",
    "all",
    "allow",
    "allows",
    "almost",
    "alone",
    "along",
    "already",
    "also",
    "although",
    "always",
    "am",
    "among",
    "amongst",
    "an",
    "and",
    "another",
    "any",
    "anybody",
    "anyhow",
    "anyone",
    "anything",
    "anyway",
    "anyways",
    "anywhere",
    "apart",
    "appear",
    "appreciate",
    "appropriate",
    "are",
    "around",
    "as",
    "aside",
    "ask",
    "asking",
    "associated",
    "at",
    "available",
    "away",
    "awfully",
    "b",
    "be",
    "became",
    "because",
    "become",
    "becomes",
    "becoming",
    "been",
    "before",
    "beforehand",
    "behind",
    "being",
    "believe",
    "below",
    "beside",
    "besides",
    "best",
    "better",
    "between",
    "beyond",
    "both",
    "brief",
    "but",
    "by",
    "c",
    "came",
    "can",
    "cannot",
    "cant",
    "cause",
    "causes",
    "certain",
    "certainly",
    "changes",
    "clearly",
    "co",
    "com",
    "come",
    "comes",
    "concerning",
    "consequently",
    "consider",
    "considering",
    "contain",
    "containing",
    "contains",
    "corresponding",
    "could",
    "course",
    "currently",
    "d",
    "definitely",
    "described",
    "despite",
    "did",
    "different",
    "do",
    "does",
    "doing",
    "done",
    "down",
    "downwards",
    "during",
    "e",
    "each",
    "edu",
    "eg",
    "eight",
    "either",
    "else",
    "elsewhere",
    "enough",
    "entirely",
    "especially",
    "et",
    "etc",
    "even",
    "ever",
    "every",
    "everybody",
    "everyone",
    "everything",
    "everywhere",
    "ex",
    "exactly",
    "example",
    "except",
    "f",
    "far",
    "few",
    "fifth",
    "first",
    "five",
    "followed",
    "following",
    "follows",
    "for",
    "former",
    "formerly",
    "forth",
    "four",
    "from",
    "further",
    "furthermore",
    "g",
    "get",
    "gets",
    "getting",
    "given",
    "gives",
    "go",
    "goes",
    "going",
    "gone",
    "got",
    "gotten",
    "greetings",
    "h",
    "had",
    "happens",
    "hardly",
    "has",
    "have",
    "having",
    "he",
    "hello",
    "help",
    "hence",
    "her",
    "here",
    "hereafter",
    "hereby",
    "herein",
    "hereupon",
    "hers",
    "herself",
    "hi",
    "him",
    "himself",
    "his",
    "hither",
    "hopefully",
    "how",
    "howbeit",
    "however",
    "i",
    "ie",
    "if",
    "ignored",
    "immediate",
    "in",
    "inasmuch",
    "inc",
    "indeed",
    "indicate",
    "indicated",
    "indicates",
    "inner",
    "insofar",
    "instead",
    "into",
    "inward",
    "is",
    "it",
    "its",
    "itself",
    "j",
    "just",
    "k",
    "keep",
    "keeps",
    "kept",
    "know",
    "knows",
    "known",
    "l",
    "last",
    "lately",
    "later",
    "latter",
    "latterly",
    "least",
    "less",
    "lest",
    "let",
    "like",
    "liked",
    "likely",
    "little",
    "look",
    "looking",
    "looks",
    "ltd",
    "m",
    "mainly",
    "many",
    "may",
    "maybe",
    "me",
    "mean",
    "meanwhile",
    "merely",
    "might",
    "more",
    "moreover",
    "most",
    "mostly",
    "much",
    "must",
    "my",
    "myself",
    "n",
    "name",
    "namely",
    "nd",
    "near",
    "nearly",
    "necessary",
    "need",
    "needs",
    "neither",
    "never",
    "nevertheless",
    "new",
    "next",
    "nine",
    "no",
    "nobody",
    "non",
    "none",
    "noone",
    "nor",
    "normally",
    "not",
    "nothing",
    "novel",
    "now",
    "nowhere",
    "o",
    "obviously",
    "of",
    "off",
    "often",
    "oh",
    "ok",
    "okay",
    "old",
    "on",
    "once",
    "one",
    "ones",
    "only",
    "onto",
    "or",
    "other",
    "others",
    "otherwise",
    "ought",
    "our",
    "ours",
    "ourselves",
    "out",
    "outside",
    "over",
    "overall",
    "own",
    "p",
    "particular",
    "particularly",
    "per",
    "perhaps",
    "placed",
    "please",
    "plus",
    "possible",
    "presumably",
    "probably",
    "provides",
    "q",
    "que",
    "quite",
    "qv",
    "r",
    "rather",
    "rd",
    "re",
    "really",
    "reasonably",
    "regarding",
    "regardless",
    "regards",
    "relatively",
    "respectively",
    "right",
    "s",
    "said",
    "same",
    "saw",
    "say",
    "saying",
    "says",
    "second",
    "secondly",
    "see",
    "seeing",
    "seem",
    "seemed",
    "seeming",
    "seems",
    "seen",
    "self",
    "selves",
    "sensible",
    "sent",
    "serious",
    "seriously",
    "seven",
    "several",
    "shall",
    "she",
    "should",
    "since",
    "six",
    "so",
    "some",
    "somebody",
    "somehow",
    "someone",
    "something",
    "sometime",
    "sometimes",
    "somewhat",
    "somewhere",
    "soon",
    "sorry",
    "specified",
    "specify",
    "specifying",
    "still",
    "sub",
    "such",
    "sup",
    "sure",
    "t",
    "take",
    "taken",
    "tell",
    "tends",
    "th",
    "than",
    "thank",
    "thanks",
    "thanx",
    "that",
    "thats",
    "the",
    "their",
    "theirs",
    "them",
    "themselves",
    "then",
    "thence",
    "there",
    "thereafter",
    "thereby",
    "therefore",
    "therein",
    "theres",
    "thereupon",
    "these",
    "they",
    "think",
    "third",
    "this",
    "thorough",
    "thoroughly",
    "those",
    "though",
    "three",
    "through",
    "throughout",
    "thru",
    "thus",
    "to",
    "together",
    "too",
    "took",
    "toward",
    "towards",
    "tried",
    "tries",
    "truly",
    "try",
    "trying",
    "twice",
    "two",
    "u",
    "un",
    "under",
    "unfortunately",
    "unless",
    "unlikely",
    "until",
    "unto",
    "up",
    "upon",
    "us",
    "use",
    "used",
    "useful",
    "uses",
    "using",
    "usually",
    "uucp",
    "v",
    "value",
    "various",
    "very",
    "via",
    "viz",
    "vs",
    "w",
    "want",
    "wants",
    "was",
    "way",
    "we",
    "welcome",
    "well",
    "went",
    "were",
    "what",
    "whatever",
    "when",
    "whence",
    "whenever",
    "where",
    "whereafter",
    "whereas",
    "whereby",
    "wherein",
    "whereupon",
    "wherever",
    "whether",
    "which",
    "while",
    "whither",
    "who",
    "whoever",
    "whole",
    "whom",
    "whose",
    "why",
    "will",
    "willing",
    "wish",
    "with",
    "within",
    "without",
    "wonder",
    "would",
    "would",
    "x",
    "y",
    "yes",
    "yet",
    "you",
    "your",
    "yours",
    "yourself",
    "yourselves",
    "z",
    "zero",
    // stop words for paper abstracts
    //    "abstract",
    //"paper",
    //"presents",
    //"discuss",
    //"discusses",
    //"conclude",
    //"concludes",
    //"based",
    //"approach"
  };
    //stopwords for french, added by Limin Yao
  /**
   * French stoplist. Nothing else in this class refers to this array; the
   * default stoplist is built from {@code stopwords} only.
   * NOTE(review): several entries look like fragments of accented words
   * (e.g. "tait", "apr", "sident") - presumably the accented characters were
   * lost when the list was produced; confirm before relying on it.
   * Some entries are capitalized ("S", "Il", "Les", "Le", "R") and some are
   * repeated ("c", "en", "l", "d", "la").
   */
  static final String[] stopwordsFrench = {
    "fut",
    "S",
    "ces",
    "ral",
    "new",
    "tr",
    "arm",
    "y",
    "autres",
    "o",
    "tait",
    "dont",
    "ann",
    "apr",
    "sous",
    "ans",
    "cette",
    "politique",
    "of",
    "c",
    "contre",
    "leur",
    "ville",
    "fait",
    "res",
    "on",
    "deux",
    "cle",
    "v",
    "publique",
    "france",
    "te",
    "guerre",
    "sident",
    "unis",
    "mais",
    "entre",
    "aussi",
    "tat",
    "ais",
    "ses",
    "sa",
    "ont",
    "tre",
    "d",
    "pays",
    "en",
    "Il",
    "tats",
    "comme",
    "am",
    "si",
    "c",
    "fran",
    "pas",
    "g",
    "qu",
    "R",
    "aux",
    "ce",
    "f",
    "p",
    "ne",
    "son",
    "me",
    "avec",
    "l",
    "se",
    "ou",
    "sont",
    "il",
    "Les",
    "re",
    "plus",
    "m",
    "es",
    "pr",
    "la",
    "sur",
    "que",
    "pour",
    "modifier",
    "a",
    "qui",
    "Le",
    "t",
    "n",
    "au",
    "dans",
    "une",
    "par",
    "un",
    "r",
    "est",
    "e",
    "du",
    "s",
    "les",
    "en",
    "des",
    "le",
    "et",
    "l",
    "d",
    "la",
    "de",


  };




}
// Source code of cc.mallet.pipe.TokenSequenceRemoveStopwords
//
// Related classes of cc.mallet.pipe.TokenSequenceRemoveStopwords