Source Code of edu.ucla.sspace.tools.DepSemTokenCounter

/*
 * Copyright 2009 Keith Stevens
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */


package edu.ucla.sspace.tools;


import edu.ucla.sspace.common.ArgOptions;


import edu.ucla.sspace.mains.OptionDescriptions;


import edu.ucla.sspace.dependency.CoNLLDependencyExtractor;
import edu.ucla.sspace.dependency.DependencyExtractor;
import edu.ucla.sspace.dependency.DependencyTreeNode;
import edu.ucla.sspace.dependency.WaCKyDependencyExtractor;


import edu.ucla.sspace.text.DependencyFileDocumentIterator;
import edu.ucla.sspace.text.Document;
import edu.ucla.sspace.text.TokenFilter;
import edu.ucla.sspace.text.Stemmer;


import edu.ucla.sspace.util.LoggerUtil;


import java.io.BufferedReader;
import java.io.IOException;
import java.io.File;
import java.io.FileReader;
import java.io.PrintWriter;


import java.util.Collections;
import java.util.Iterator;
import java.util.HashSet;
import java.util.Set;
import java.util.Properties;


import java.util.logging.Level;
import java.util.logging.Logger;




/**
 * @author Keith Stevens 
 */
public class DepSemTokenCounter {


    /**
     * A mapping from token to the number of times it occurred
     */
    private final Set<String> foundTokens;


    /**
     * The {@link DependencyExtractor} used to extract parse trees.
     */
    private final DependencyExtractor extractor;


    /**
     * Creates a new token counter that optionally lower cases tokens
     *
     * @param doLowerCasing {@code true} if the token counter should lower case
     *        all tokens before counting
     */
    public DepSemTokenCounter(DependencyExtractor extractor) { 
        this.extractor = extractor;
        foundTokens = new HashSet<String>();
    }


    public Set<String> getTokens() {
        return foundTokens;
    }


    /**
     * Counts all of the tokens in the iterator
     */
    private void process(Iterator<Document> docs) throws IOException {
        long numTokens = 0;
        while (docs.hasNext()) {
            Document doc = docs.next();
            BufferedReader br = doc.reader();
            String header = br.readLine();


            DependencyTreeNode[] nodes = extractor.readNextTree(br);
            int index;
            for (index = 0; index < nodes.length; ++index)
                if (nodes[index].lemma().equals(header))
                    break;


            foundTokens.add(nodes[index].word().toLowerCase());
            for (int i = index+1; i < index+10 && i < nodes.length; ++i)
                foundTokens.add(nodes[i].word().toLowerCase());
            for (int i = Math.max(0, index-10); i < index; ++i)
                foundTokens.add(nodes[i].word().toLowerCase());
        }
    }


    public static void main(String[] args) throws Exception {
        // Setup the argument options.
        ArgOptions options = new ArgOptions();
        options.addOption('F', "tokenFilter", "filters to apply to the input " +
                          "token stream", true, "FILTER_SPEC", 
                          "Tokenizing Options");
        options.addOption('v', "verbose",
                          "Print verbose output about counting status",
                          false, null, "Optional");


        // Parse and validate the options.
        options.parseOptions(args);
        if (options.numPositionalArgs() < 2) {
            System.out.println(
                "usage: java DepTokenCounter" 
                + " [options] <output-file> <input-file> [<input-file>]*\n"
                + options.prettyPrint() 
                + "\n\n" + OptionDescriptions.TOKEN_FILTER_DESCRIPTION);
            return;
        }


        // Setup logging.
        if (options.hasOption("verbose")) 
            LoggerUtil.setLevel(Level.FINE);


        TokenFilter filter = (options.hasOption("tokenFilter"))
            ? TokenFilter.loadFromSpecification(options.getStringOption('F'))
            : null;


        // setup the dependency extractor.
        DependencyExtractor e = new CoNLLDependencyExtractor(filter, null);
        DepSemTokenCounter counter = new DepSemTokenCounter(e);


        // Process each of the input files
        for (int i = 1; i < options.numPositionalArgs(); ++i)
            counter.process(new DependencyFileDocumentIterator(
                        options.getPositionalArg(i)));


        // Then write the results to disk
        PrintWriter pw = new PrintWriter(options.getPositionalArg(0));
        for (String term : counter.getTokens()) 
            pw.println(term);
        pw.close();
    }
}
Source Code of edu.ucla.sspace.tools.DepSemTokenCounter

Related Classes of edu.ucla.sspace.tools.DepSemTokenCounter