// Source Code of edu.ucla.sspace.tools.TokenCounter

/*
 * Copyright 2009 David Jurgens 
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */


package edu.ucla.sspace.tools;


import edu.ucla.sspace.common.ArgOptions;

import edu.ucla.sspace.mains.OptionDescriptions;

import edu.ucla.sspace.text.DocumentPreprocessor;
import edu.ucla.sspace.text.IteratorFactory;
import edu.ucla.sspace.text.StringUtils;

import edu.ucla.sspace.util.LoggerUtil;
import edu.ucla.sspace.util.TrieMap;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;

import java.util.Collections;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;

import java.util.logging.Level;
import java.util.logging.Logger;

import java.util.regex.Pattern;


/**
 * A utility class for counting tokens in one or more files.  This class also
 * supports counting compound token instances, as well as counting for only a
 * subset of the unique tokens.  This class is intended for token counting in
 * very large corpora where space-efficiency is important.  The output is
 * equivalent to the command <tt>cat <i>corpus.txt</i> | awk '{ split($0,a); for
 * (i in a) { print a[i]; }}' | uniq -c</tt>.  However, this
 * command is significantly more memory and CPU intensive. 
 *
 * @author David Jurgens
 */
public class TokenCounter {


    /**
     * The number of tokens to process before emitting a verbose message about
     * the counting status.
     */
    private static final int UPDATE_INTERVAL = 10000;


    /**
     * The logger used to emit messages for this class
     */
    private static final Logger LOGGER = 
        Logger.getLogger(TokenCounter.class.getName());


    /**
     * A mapping from token to the number of times it occurred
     */
    private final Map<String,Integer> tokenToCount;


    /**
     * {@code true} if the token counter should lower case all tokens before
     * counting
     */
    private final boolean doLowerCasing;


    /**
     * Creates a new token counter
     */
    public TokenCounter() { 
        this(false);
    }


    /**
     * Creates a new token counter that optionally lower cases tokens
     *
     * @param doLowerCasing {@code true} if the token counter should lower case
     *        all tokens before counting
     */
    public TokenCounter(boolean doLowerCasing) { 
        this.doLowerCasing = doLowerCasing;
        tokenToCount = new TrieMap<Integer>();
    }


    /**
     * Returns a mapping from each seen token to the number of times it occurred
     */
    public Map<String,Integer> getTokenCounts() {
        return Collections.unmodifiableMap(tokenToCount);
    }


    /**
     * Counts all of the tokens in the file with specified name
     */
    public void processFile(String fileName) throws IOException {
        process(new BufferedReader(new FileReader(fileName)));
    }


    /**
     * Counts all of the tokens in the file
     */
    public void processFile(File file) throws IOException {
        process(new BufferedReader(new FileReader(file)));
    }


    /**
     * Counts all of the tokens in the reader
     */
    public void process(BufferedReader br) {
        process(IteratorFactory.tokenize(br));
    }
     
    /**
     * Counts all of the tokens in the string
     */
    public void process(String tokens) {
        process(IteratorFactory.tokenize(tokens));
    }


    /**
     * Counts all of the tokens in the iterator
     */
    private void process(Iterator<String> tokens) {
        // NOTE: this method is intentionally private to ensure that the
        // IteratorFactory.tokenize() tokenization scheme is enforced on the
        // input data
        long numTokens = 0;
        while (tokens.hasNext()) {
            String token = tokens.next();
            if (doLowerCasing)
                token = token.toLowerCase();
            if (token.matches("[0-9]+"))
                token = "<NUM>";
            if (token.matches("[^\\w\\s;:\\(\\)\\[\\]'!/&?\",\\.<>]"))
                continue;


            Integer count = tokenToCount.get(token);
            tokenToCount.put(token, (count == null) ? 1 : 1 + count);
            numTokens++;
            if (numTokens % UPDATE_INTERVAL == 0 
                    && LOGGER.isLoggable(Level.FINE))
                LOGGER.fine("Processed " + numTokens + " tokens.  Currently " 
                            + tokenToCount.size() + " unique tokens");
        }
    }


    public static void main(String[] args) {
        ArgOptions options = new ArgOptions();
        options.addOption('Z', "stemmingAlgorithm",
                          "specifices the stemming algorithm to use on " +
                          "tokens while iterating.  (default: none)",
                          true, "CLASSNAME", "Tokenizing Options");
        options.addOption('F', "tokenFilter", "filters to apply to the input " +
                          "token stream", true, "FILTER_SPEC", 
                          "Tokenizing Options");
        options.addOption('C', "compoundWords", "a file where each line is a " +
                          "recognized compound word", true, "FILE", 
                          "Tokenizing Options");
        options.addOption('L', "lowerCase", "lower-cases each token after " +
                          "all other filtering has been applied", false, null, 
                          "Tokenizing Options");
        options.addOption('z', "wordLimit", "Set the maximum number of words " +
                          "a document can return",
                          true, "INT", "Tokenizing Options");
        options.addOption('v', "verbose",
                          "Print verbose output about counting status",
                          false, null, "Optional");
        options.parseOptions(args);
        if (options.numPositionalArgs() < 2) {
            System.out.println(
                "usage: java TokenCounter" 
                + " [options] <output-file> <input-file> [<input-file>]*\n"
                + options.prettyPrint() 
                + "\n" + OptionDescriptions.COMPOUND_WORDS_DESCRIPTION
                + "\n\n" + OptionDescriptions.TOKEN_FILTER_DESCRIPTION);
            return;
        }


        if (options.hasOption("verbose")) 
            LoggerUtil.setLevel(Level.FINE);




        boolean doLowerCasing = options.hasOption("lowerCase");


        Properties props = System.getProperties();
        // Initialize the IteratorFactory to tokenize the documents according to
        // the specified configuration (e.g. filtering, compound words)
        if (options.hasOption("tokenFilter"))
            props.setProperty(IteratorFactory.TOKEN_FILTER_PROPERTY,
                              options.getStringOption("tokenFilter"));
        // Set any tokenizing options.
        if (options.hasOption("stemmingAlgorithm"))
            props.setProperty(IteratorFactory.STEMMER_PROPERTY,
                              options.getStringOption("stemmingAlgorithm"));
         
        if (options.hasOption("compoundWords")) 
            props.setProperty(IteratorFactory.COMPOUND_TOKENS_FILE_PROPERTY,
                              options.getStringOption("compoundWords"));
        if (options.hasOption("wordLimit"))
            props.setProperty(IteratorFactory.TOKEN_COUNT_LIMIT_PROPERTY,
                              options.getStringOption("wordLimit"));


        IteratorFactory.setProperties(props);


        try {
            TokenCounter counter = new TokenCounter(doLowerCasing);
            // Process each of the input files
            for (int i = 1; i < options.numPositionalArgs(); ++i)
                counter.processFile(options.getPositionalArg(i));
            // Then write the results to disk
            PrintWriter pw = new PrintWriter(options.getPositionalArg(0));
            for (Map.Entry<String,Integer> e 
                     : counter.tokenToCount.entrySet())
                pw.println(e.getKey() + " " + e.getValue());
            pw.close();
        } catch (Throwable t) {
            t.printStackTrace();
        }
    }
}
// Source Code of edu.ucla.sspace.tools.TokenCounter

// Related Classes of edu.ucla.sspace.tools.TokenCounter