Package edu.ucla.sspace.text

Source Code of edu.ucla.sspace.text.TokenFilter

/*
* Copyright 2009 David Jurgens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.ucla.sspace.text;

import edu.ucla.sspace.util.FileResourceFinder;
import edu.ucla.sspace.util.ResourceFinder;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOError;
import java.io.IOException;

import java.util.HashSet;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Set;


/**
* A utility for asserting what tokens are valid and invalid within a stream of
* tokens.  A filter may be either inclusive or exclusive.<p>
*
* An inclusive filter will accept only those tokens with which it was
* initialized.  For an example, an inclusive filter initialized with all of the
* words in the english dictionary would exclude all misspellings or foreign
* words in a token stream.<p>
*
* An exclusive filter will aceept only those tokens that are not in set with
* which it was initialized.  An exclusive filter is often used with a list of
* common words that should be excluded, which is also known as a "stop
* list."<p>
*
* {@code TokenFilter} instances may be combined into a linear chain of filters.
* This allows for a highly configurable filter to be made from mulitple rules.
* Chained filters are created in a linear order and each filter must accept the
* token for the last filter to return {@code}.  If the any of the earlier
* filters return {@code false}, then the token is not accepted.<p>
*
* This class also provides a static utility function {@link
* #loadFromSpecification(String) loadFromSpecification} for initializing a
* chain of filters from a text configuration.  This is intended to facility
* command-line tools that want to provide easily configurable filters.  An
* example configuration might look like:
* <tt>include=top-tokens.txt:test-words.txt,exclude=stop-words.txt</tt>
*
* @see FilteredIterator
*/
public class TokenFilter {

    /**
     * The set of tokens used to filter the output
     */
    private final Set<String> tokens;

    /**
     * {@code true} if the returned tokens must not be in the filter set
     */
    private final boolean excludeTokens;

    /**
     * A filter that is to be applied before this filter when determining
     * whether a token should  be accepted
     */
    private TokenFilter parent;

    /**
     * Constructs a filter that accepts only those tokens present in {@code tokens}.
     */
    public TokenFilter(Set<String> tokens) {
  this(tokens, false, null);
    }

    /**
     * Constructs a filter using {@code tokens} that if {@code excludeTokens} is
     * {@code false} will accept those in {@code tokens}, or if {@code
     * excludeTokens} is {@code true}, will accept those that are <i>not</i> in
     * {@code tokens}.
     *
     * @param tokens the set of tokens to use in filtering the output
     * @param excludeTokens {@code true} if tokens in {@code tokens} should be
     *        excluded, {@code false} if only tokens in {@code tokens} should
     *        be included
     */
    public TokenFilter(Set<String> tokens, boolean excludeTokens) {
  this(tokens, excludeTokens, null);
    }

    /**
     * Constructs a chained filter that accepts the subset of what the parent
     * accepts after applying its own filter to any tokens that the parent
     * accepts.  Note that if the parent does not accept a token, then this
     * filter will not either.
     *
     * @param tokens the set of tokens to use in filtering the output
     * @param excludeTokens {@code true} if tokens in {@code tokens} should be
     *        excluded, {@code false} if only tokens in {@code tokens} should
     *        be included
     * @param parent a filter to be applied before determining whether a token
     *        is to be accepted
     */
    public TokenFilter(Set<String> tokens, boolean excludeTokens,
           TokenFilter parent) {
  this.tokens = tokens;
  this.excludeTokens = excludeTokens;
  this.parent = parent;
    }

    /**
     * Returns {@code true} if the token is valid according to the configuration
     * of this filter.
     *
     * @param token a token to be considered
     *
     * @return {@code true} if this token is valid
     */
    public boolean accept(String token) {
  return (parent == null || parent.accept(token)) &&
    tokens.contains(token) ^ excludeTokens;
    }

    /**
     * Creates a chained filter by accepting the subset of whatever {@code
     * parent} accepts less what tokens this filter rejects. 
     *
     * @param parent a filter to be applied before determining whether a token
     *        is to be accepted
     *
     * @return the previous parent filter or {@code null} if one had not been
     *         assigned
     */
    public TokenFilter combine(TokenFilter parent) {
  TokenFilter oldParent = parent;
  this.parent = parent;
  return oldParent;
    }
   
    /**
     * Loads a series of chained {@code TokenFilter} instances from the
     * specified configuration string.  This method will assume that all
     * specified resources exist on the local file system.<p>
     *
     * A configuration lists sets of files that contain tokens to be included or
     * excluded.  The behavior, {@code include} or {@code exclude} is specified
     * first, followed by one or more file names, each separated by colons.
     * Multiple behaviors may be specified one after the other using a {@code ,}
     * character to separate them.  For example, a typicaly configuration may
     * look like: "include=top-tokens.txt,test-words.txt:exclude=stop-words.txt"
     * <b>Note</b> behaviors are applied in the order they are presented on the
     * command-line.
     *
     * @param configuration a token filter configuration
     *
     * @return the chained TokenFilter instance made of all the specification,
     *         or {@code null} if the configuration did not specify any filters
     *
     * @throws IOError if any error occurs when reading the word list files
     */
    public static TokenFilter loadFromSpecification(String configuration) {
        return loadFromSpecification(configuration, new FileResourceFinder());
    }

    /**
     * Loads a series of chained {@code TokenFilter} instances from the
     * specified configuration string using the provided {@link ResourceFinder}
     * to locate the resources.  This method is provided for applications that
     * need to load resources from a custom environment or file system.<p>
     *
     * A configuration lists sets of files that contain tokens to be included or
     * excluded.  The behavior, {@code include} or {@code exclude} is specified
     * first, followed by one or more file names, each separated by colons.
     * Multiple behaviors may be specified one after the other using a {@code ,}
     * character to separate them.  For example, a typicaly configuration may
     * look like: "include=top-tokens.txt,test-words.txt:exclude=stop-words.txt"
     * <b>Note</b> behaviors are applied in the order they are presented on the
     * command-line.
     *
     * @param configuration a token filter configuration
     * @param finder the {@code ResourceFinder} used to locate the file
     *        resources specified in the configuration string.
     *
     * @return the chained TokenFilter instance made of all the specification,
     *         or {@code null} if the configuration did not specify any filters
     *
     * @throws IOError if any error occurs when reading the word list files
     */   
    public static TokenFilter loadFromSpecification(String configuration,
                                                    ResourceFinder finder) {

  TokenFilter toReturn = null;

  // multiple filter are separated by a ':'
  String[] filters = configuration.split(",");

  for (String s : filters) {
            String[] optionAndFiles = s.split("=");
            if (optionAndFiles.length != 2)
                throw new IllegalArgumentException(
                    "Invalid number of filter parameters: " + s);
           
            String behavior = optionAndFiles[0];
            boolean exclude = behavior.equals("exclude");
            // Sanity check that the behavior was include
            if (!exclude && !behavior.equals("include"))
                throw new IllegalArgumentException(
                    "Invalid filter behavior: " + behavior);
               
            String[] files = optionAndFiles[1].split(":");
           
      // Load the words in the file(s)
      Set<String> words = new HashSet<String>();
      try {
                for (String f : files) {
                    BufferedReader br = finder.open(f);
                    for (String line = null; (line = br.readLine()) != null; )
                        words.add(line);
                    br.close();
                }
      } catch (IOException ioe) {
    // rethrow since filter error is fatal to correct execution
    throw new IOError(ioe);
      }
     
            // Chain the filters on top of each other
      toReturn = new TokenFilter(words, exclude, toReturn);
  }

  return toReturn;
    }
}
TOP

Related Classes of edu.ucla.sspace.text.TokenFilter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.