Examples of edu.ucla.sspace.common.ArgOptions

edu.ucla.sspace.common.ArgOptions
A utility class for parsing command line arguments. Command line arguments can be added with long options, short options, or both. Option keys cannot be repeated. After all desired options have been added, {@code parseOptions} must be called, which will dissect an array of Strings and create values for each option. For all option types, a default value may be specified when requesting the option value. In the case of options corresponding to classnames, the accessor assumes that a no-argument constructor exists.
Below are some common usages of this class:
```
 ArgOptions opts = new ArgOptions(); opts.addOption('c', "numClusters", "the number of clusters", true, "INT", "Required"); opts.addOption('C', "clustering", "the clustering algorithm", true, "CLASSNAME", "Required"); opts.addOption('v', "verbose", "true if logs should be emitted", false, null, "Optional"); opts.parseOptions(args); int numClusters = opts.getIntOption('c'); boolean verbose = opts.hasOption('v'); String clusterName = opts.getStringOption('C'); 
```
@author David Jurgens

     * Adds the default options for running semantic space algorithms from the
     * command line.  Subclasses should override this method and return a
     * different instance if the default options need to be different.
     */
    protected ArgOptions setupOptions() {
        ArgOptions options = new ArgOptions();


        // Add input file options.
        options.addOption('f', "fileList", "a list of document files", 
                          true, "FILE[,FILE...]", "Required (at least one of)");
        options.addOption('d', "docFile", 
                          "a file where each line is a document", true,
                          "FILE[,FILE...]", "Required (at least one of)");
        options.addOption('R', "corpusReader", 
                          "Specifies a CorpusReader which will " +
                          "automatically parse the document files that are " +
                          "not in the formats expected by -f and -d.",
                          true, "CLASSNAME,FILE[,FILE...]",
                          "Required (at least one of)");


        // Add run time options.
        options.addOption('o', "outputFormat", "the .sspace format to use",
                          true, "FORMAT", 
                          "Program Options");
        if (isMultiThreaded) {
            options.addOption('t', "threads", "the number of threads to use",
                              true, "INT", "Program Options");
        }
        options.addOption('w', "overwrite", "specifies whether to " +
                          "overwrite the existing output", true, "BOOL",
                          "Program Options");
        options.addOption('v', "verbose", "prints verbose output",
                          false, null, "Program Options");


        // Add tokenizing options.
        options.addOption('Z', "stemmingAlgorithm",
                          "specifices the stemming algorithm to use on " +
                          "tokens while iterating.  (default: none)",
                          true, "CLASSNAME", "Tokenizing Options");
        options.addOption('F', "tokenFilter", "filters to apply to the input " +
                          "token stream", true, "FILTER_SPEC", 
                          "Tokenizing Options");
        options.addOption('C', "compoundWords", "a file where each line is a " +
                          "recognized compound word", true, "FILE", 
                          "Tokenizing Options");
        options.addOption('z', "wordLimit", "Set the maximum number of words " +
                          "a document can return",
                          true, "INT", "Tokenizing Options");


        addExtraOptions(options);
        return options;

View Full Code Here

    /**
     * Adds the options for running the {@code Grefenstette} algorithm
     */
    @Override
    protected ArgOptions setupOptions() {
        ArgOptions options = new ArgOptions();
        options.addOption('s', "sentenceFile", 
                          "a file where each line is a sentence", true,
                          "FILE[,FILE...]", "Required (at least one of)");


        options.addOption('o', "outputFormat", "the .sspace format to use",
                          true, "{text|binary}", "Program Options");
        // options.addOption('t', "threads", "the number of threads to use",
        //                           true, "INT", "Program Options");
        options.addOption('w', "overwrite", "specifies whether to " +
                          "overwrite the existing output", true, "BOOL",
                          "Program Options");
        options.addOption('t', "threads", "the number of threads to use",
                          true, "INT", "Program Options");
        options.addOption('v', "verbose", "prints verbose output",
                          false, null, "Program Options");
        addExtraOptions(options);
        return options;
    }

View Full Code Here


    /**
     * Adds all of the options to the {@link ArgOptions}.
     */
    protected ArgOptions createOptions() {
        ArgOptions options = new ArgOptions();
        options.addOption('f', "fileList", "a list of document files", 
                          true, "FILE[,FILE...]", "Required (at least one of)");
        options.addOption('d', "docFile", 
                          "a file where each line is a document", true,
                          "FILE[,FILE...]", "Required (at least one of)");


        options.addOption('T', "timespan", "the timespan for each semantic " +
                          "partition", true, "Date String", "Required");


        options.addOption('o', "outputFormat", "the .sspace format to use",
                          true, "{text|binary}", "Program Options");
        options.addOption('t', "threads", "the number of threads to use",
                          true, "INT", "Program Options");
        options.addOption('w', "overwrite", "specifies whether to " +
                          "overwrite the existing output", true, "BOOL",
                          "Program Options");
        options.addOption('v', "verbose", "prints verbose output",
                          false, null, "Program Options");


        // Algorithm Options
        options.addOption('i', "vectorGenerator", "IndexVectorGenerator "
                          + "class to use", true,
                          "CLASSNAME", "Algorithm Options");
        options.addOption('l', "vectorLength", "length of semantic vectors",
                          true, "INT", "Algorithm Options");
        options.addOption('n', "permutationFunction", "permutation function "
                          + "to use", true,
                          "CLASSNAME", "Algorithm Options");
        options.addOption('p', "usePermutations", "whether to permute " +
                          "index vectors based on word order", true,
                          "BOOL", "Algorithm Options");
        options.addOption('r', "useSparseSemantics", "use a sparse encoding of "
                          + "semantics to save memory", true,
                          "BOOL", "Algorithm Options");
        options.addOption('s', "windowSize", "how many words to consider " +
                          "in each direction", true,
                          "INT", "Algorithm Options");
        options.addOption('S', "saveVectors", "save word-to-IndexVector mapping"
                          + " after processing", true,
                          "FILE", "Algorithm Options");
        options.addOption('L', "loadVectors", "load word-to-IndexVector mapping"
                          + " before processing", true,
                          "FILE", "Algorithm Options");


        // Input Options
        options.addOption('F', "tokenFilter", "filters to apply to the input " +
                          "token stream", true, "FILTER_SPEC", 
                          "Tokenizing Options");
        options.addOption('C', "compoundWords", "a file where each line is a " +
                          "recognized compound word", true, "FILE", 
                          "Tokenizing Options");


        options.addOption('W', "semanticFilter", "exclusive list of word",
                          true, "FILE", "Input Options");


        // Output Options
        options.addOption('I', "interestingTokenList", "list of interesting " +
                          "words", true, "FILE", "Output Options");
        options.addOption('K', "printShiftRankings", "print ranked list of " +
                          "semantic shifts for each interesting word", false, 
                          null, "Output Options");
        options.addOption('R', "savePartitions", "write semantic partitions as " +
                          ".sspace files to disk", false, null, 
                          "Output Options");
        options.addOption('P', "printInterestingTokenShifts", "prints the "
                          + "vectors for each interesting word", false, null, 
                          "Output Options");
        options.addOption('N', "printInterestingTokenNeighbors", "prints the "
                          + "nearest neighbors for each interesting word", true,
                          "INT", "Output Options");
        options.addOption('Z', "printInterestingTokenNeighborComparison",
                          "prints the distances between each of the"
                          + "nearest neighbors for each interesting word", 
                          false, null , "Output Options");


        return options;

View Full Code Here


    /**
     * Creates the {@code EvaluatorMain}.
     */
    public EvaluatorMain() {
        argOptions = new ArgOptions();
        addOptions();
    }

View Full Code Here


    /**
     * {@inheritDoc}
     */
    protected SemanticSpace getSpace() {
        ArgOptions options = argOptions;
        // Setup the assignment reporter.  When training, the assignment report
        // will only be used If the evaluation mode will be for pseudoWord.
        AssignmentReporter reporter = null;
        if (options.hasOption('P'))
            reporter = new PseudoWordReporter(System.out);


        int numClusters = options.getIntOption('c', 0);


        // If Wordsi is being used in an evaluation mode, set up word space
        // accordingly.
        if (options.hasOption('e')) {
            // If the evaluation type is not set, report an error and exit.
            if (!options.hasOption('E') && !options.hasOption('P')) {
                usage();
                System.out.println(
                        "An Evaluation Type must be set when evaluating " +
                        " a trained Wordsi model.");
                System.exit(1);
            }


            // Load the semantic space that has the predefined word senses from
            // disk and return an EvaluationWordsi instance.
            try {
                SemanticSpace sspace = SemanticSpaceIO.load(
                        options.getStringOption('e'));
                if (options.hasOption('E'))
                    reporter = new SemEvalReporter(System.out);
                return new EvaluationWordsi(
                        getAcceptedWords(), getExtractor(), sspace, reporter);
            } catch (IOException ioe) {
                throw new IOError(ioe);
            }
        } else if (options.hasOption('s')) {
            // Create a StreamingWordsi instance that uses the specified online 
            // cluster generator.
            System.getProperties().setProperty(
                    OnlineClustering.NUM_CLUSTERS_PROPERTY,
                    options.getStringOption('c'));
            Generator<OnlineClustering<SparseDoubleVector>> clusterGenerator =
                ReflectionUtil.getObjectInstance(options.getStringOption('s'));
            return new StreamingWordsi(getAcceptedWords(), getExtractor(),
                                       clusterGenerator, reporter, numClusters);
        } else if (options.hasOption('b')) {
            // Create a WaitingWordsi instance that uses the specified batch
            // clustering implementation.
            Clustering clustering = 
                ReflectionUtil.getObjectInstance(options.getStringOption('b'));
            return new WaitingWordsi(getAcceptedWords(), getExtractor(), 
                                     clustering, reporter, numClusters);
        } else {
            // None of the required options was provided, report an error and
            // exit.

View Full Code Here

     * Adds the default options for running semantic space algorithms from the
     * command line.  Subclasses should override this method and return a
     * different instance if the default options need to be different.
     */
    protected ArgOptions setupOptions() {
        ArgOptions options = new ArgOptions();
        options.addOption('c', "corpusDir", "the directory of the corpus", 
                          true, "DIR", "Required");
        options.addOption('a', "analogyFile",
                          "the file containing list of word pairs", 
                          true, "FILE", "Required");
        options.addOption('t', "testAnalogies",
                           "the file containing list of analogies",
                           true, "FILE", "Required"); 
        options.addOption('o', "outputFile",
                          "the file containing the cosine similarity output " +
                          "for the analogies from testAnalogies",
                          true, "FILE", "Required"); 
        options.addOption('i', "indexDir",
                          "a Directory for storing or loading "
                          + "the Lucene index", true, "DIR", "Required");
        options.addOption('n', "dimensions", 
                          "the number of dimensions in the semantic space",
                          true, "INT"); 
        options.addOption('r', "readMatrixFile",
                          "file containing projection matrix"
                          , true, "FILE");
        options.addOption('s', "skipIndex",
                          "turn indexing off.  Must specify index directory",
                          false , null);
        options.addOption('v', "verbose",
                          "prints verbose output",
                          false, null, "Program Options");
        options.addOption('w', "writeMatrixFile",
                          "file to write projection matrix to"
                          , true, "FILE");
        return options;
    }

View Full Code Here

     * Adds the default options for running semantic space algorithms from the
     * command line.  Subclasses should override this method and return a
     * different instance if the default options need to be different.
     */
    protected ArgOptions setupOptions() {
        ArgOptions options = new ArgOptions();


        // Add run time options.
        options.addOption('o', "outputFormat", "the .sspace format to use",
                          true, "FORMAT", 
                          "Program Options");
        options.addOption('w', "overwrite", "specifies whether to " +
                          "overwrite the existing output", true, "BOOL",
                          "Program Options");
        options.addOption('v', "verbose", "prints verbose output",
                          false, null, "Program Options");


        // Add tokenizing options.
        
        options.addOption('Z', "stemmingAlgorithm",
                          "specifices the stemming algorithm to use on " +
                          "tokens while iterating.  (default: none)",
                          true, "CLASSNAME", "Tokenizing Options");
        options.addOption('F', "tokenFilter", "filters to apply to the input " +
                          "token stream", true, "FILTER_SPEC", 
                          "Tokenizing Options");
        options.addOption('C', "compoundWords", "a file where each line is a " +
                          "recognized compound word", true, "FILE", 
                          "Tokenizing Options");
        options.addOption('z', "wordLimit", "Set the maximum number of words " +
                          "an document can return",
                          true, "INT", "Tokenizing Options");        


        addExtraOptions(options);
        return options;

View Full Code Here


    private static final Logger LOGGER = 
        Logger.getLogger(FanmodTool.class.getName());


    public static void main(String[] args) {
        ArgOptions opts = new ArgOptions();
        
        opts.addOption('h', "help", "Generates a help message and exits",
                          false, null, "Program Options");
        opts.addOption('v', "verbose", "Turns on verbose output",
                          false, null, "Program Options");
        opts.addOption('V', "verbVerbose", "Turns on very verbose output",
                          false, null, "Program Options");


        opts.addOption('r', "randomGraphs", "The number of random graphs" +
                       " to use for the null model (default: 1000)",
                       true, "INT", "Algorithm Options");
        opts.addOption('z', "motifSize", "The number of vertices in the" +
                       " identified motifs (default: 3)",
                       true, "INT", "Algorithm Options");
        opts.addOption('s', "useSimpleMotifs", "If searching for motifs in a " +
                       "multigraph, counts only simple graphs as motifs",
                       false, null, "Algorithm Options");


        opts.addOption('Z', "minZScore", "The minimum Z-Score for any motif" +
                       " in the original network to be used for computing " +
                       "modularity (default: 1)",
                       true, "DOUBLE", "Algorithm Options");
        opts.addOption('O', "minOccurrences", "The minimum number of occurrences"
                       + " for any motif" +
                       " in the original network to be used for computing " +
                       "modularity (default: 1)",
                       true, "INT", "Algorithm Options");




//         opts.addOption('w', "weighted", "Uses a weighted edge simiarity",
//                           false, null, "Input Options");
        opts.addOption('d', "loadAsDirectedGraph", "Loads the input graph as " +
                       "a directed graph",
                       false, null, "Input Options");
        opts.addOption('m', "loadAsMultigraph", "Loads the input graph as " +
                       "a multigraph",
                       false, null, "Input Options");


        opts.addOption('o', "outputFormat", "The type of format to use " +
                       "when writing the graphs (default: serialized)",
                       true, "FORMAT", "Output Options");        
        opts.addOption('H', "makeHtml", "Generates an HTML rendering" +
                       "of the significant motifs",
                       true, "DIR", "Output Options");


        opts.parseOptions(args);


        if (opts.numPositionalArgs() < 2 || opts.hasOption("help")) {
            usage(opts);
            return;
        }


        // If verbose output is enabled, update all the loggers in the S-Space
        // package logging tree to output at Level.FINE (normally, it is
        // Level.INFO).  This provides a more detailed view of how the execution
        // flow is proceeding.
        if (opts.hasOption('v')) 
            LoggerUtil.setLevel(Level.FINE);
        if (opts.hasOption('V')) 
            LoggerUtil.setLevel(Level.FINER);




        LOGGER.info("Loading graph file");
        Indexer<String> vertexLabels = new ObjectIndexer<String>();
        File f = new File(opts.getPositionalArg(0));


        int motifSize = (opts.hasOption('z')) ? opts.getIntOption('z') : 3;
        int numRandomGraphs = (opts.hasOption('r')) 
            ? opts.getIntOption('r') : 1000;


        double minZScore = opts.hasOption('Z')
            ? opts.getDoubleOption('Z')
            : 1d;
        int minOccurrences = opts.hasOption('O')
            ? opts.getIntOption('O')
            : 1;
        Fanmod.MotifFilter filter = 
            new Fanmod.FrequencyAndZScoreFilter(minOccurrences, minZScore);


        info(LOGGER, "retaining motifs occurring at least %d times with a " +
             "z-score at or above %f", minOccurrences, minZScore);
       
        boolean isMultigraph = opts.hasOption('m');
        boolean isDirected = opts.hasOption('d');
        Fanmod fanmod = new Fanmod();
        
        try {       
            if (isMultigraph && isDirected) {
                DirectedMultigraph<String> dm = 
                    GraphIO.readDirectedMultigraph(f, vertexLabels);
                boolean findSimpleMotifs = opts.hasOption('s');
                Map<Multigraph<String,DirectedTypedEdge<String>>,Fanmod.Result> 
                    motifToZScore = fanmod.findMotifs( 
                      dm, findSimpleMotifs, motifSize, numRandomGraphs, filter);
                info(LOGGER, "found %d motifs with z-score above %f%n", 
                     motifToZScore.size(), minZScore);
                if (opts.hasOption('H')) {
                    File baseDir = new File(opts.getStringOption('H'));
                    // Check that we can create output in that directory
                    if (!baseDir.exists())
                        baseDir.mkdir();
                    DotIO dio = new DotIO();
                    
                    // Generate a consistent set of edge colors to user across
                    // all the motif visualizations
                    Map<String,Color> edgeColors = new HashMap<String,Color>();
                    ColorGenerator cg = new ColorGenerator();
                    for (String type : dm.edgeTypes())
                        edgeColors.put(type, cg.next());


                    PrintWriter pw = new PrintWriter(new File(baseDir, "index.html"));
                    PrintWriter imgScript = new PrintWriter(new File(baseDir, "img-script.sh"));
                    imgScript.println("#!/bin/bash");
                    pw.println("<html>");
                    pw.println("<head><script src=\"http://www.kryogenix.org/code/browser/sorttable/sorttable.js\"></script></head>");
                    // pw.println("<head><script src=\"sorttable.js\"></script></head>");
                    pw.println("<body><table border=\"2\" class=\"sortable\">");
                    pw.println("  <tr>" + 
                               "<td><h1><u>Motif</u></h1></td>" +
                               "<td><h1><u>Count</u></h1></td>" +
                               "<td><h1><u>Z-Score</u></h1></td>" +
                               "<td><h1><u>Mean Count in Random Graphs</u></h1></td>" +
                               "<td><h1><u>StdDev in Random Graphs</u></h1></td>" +
                               "</tr>");
                    int graphNum = 0;
                    for (Map.Entry<Multigraph<String,DirectedTypedEdge<String>>,Fanmod.Result> e :
                             motifToZScore.entrySet()) {
                        File dotFile = new File(baseDir, "graph-" + (graphNum++) + ".dot");
                        dio.writeDirectedMultigraph(e.getKey(), dotFile, edgeColors);
                        String imgFile = dotFile.getName();
                        imgFile = imgFile.substring(0, imgFile.length() - 3) + "gif";
                        imgScript.printf("dot -Tgif %s -o %s%n", dotFile.getName(), imgFile);
                        int count = e.getValue().count;
                        double zScore = e.getValue().statistic;
                        double mean = e.getValue().meanCountInNullModel;
                        double stddev = e.getValue().stddevInNullModel;
                        pw.printf("  <tr><td><img src=\"%s\"></td><td>%d</td><td>%f</td><td>%f</td><td>%f</td></tr>%n",
                                  imgFile, count, zScore, mean, stddev);
                    }
                    pw.println("</table></body></html>");
                    imgScript.close();
                    pw.close();
                }
                
                info(LOGGER, "writing final motifs to %s", 
                     opts.getPositionalArg(1));
                // Write the results to file
                File output = new File(opts.getPositionalArg(1));
                // Copy the motifs to a new HashSet to avoid writing the result
                // as a KeySet, which includes the fanmod result values.
                SerializableUtil.save(
                    new HashSet<Multigraph<String,
                        DirectedTypedEdge<String>>>(motifToZScore.keySet()), output);
            }


            else if (isMultigraph) {
                boolean findSimpleMotifs = opts.hasOption('s');
                UndirectedMultigraph<String> um = 
                    GraphIO.readUndirectedMultigraph(f, vertexLabels);
                Map<Multigraph<String,TypedEdge<String>>,Fanmod.Result> 
                    motifToZScore = fanmod.findMotifs( 
                      um, findSimpleMotifs, motifSize, numRandomGraphs, filter);
                info(LOGGER, "found %d motifs with z-score above %f%n", 
                     motifToZScore.size(), minZScore);
                if (opts.hasOption('H')) {
                    File baseDir = new File(opts.getStringOption('H'));
                    // Check that we can create output in that directory
                    if (!baseDir.exists())
                        baseDir.mkdir();
                    DotIO dio = new DotIO();
                    // Generate a consistent set of edge colors to user across
                    // all the motif visualizations
                    Map<String,Color> edgeColors = new HashMap<String,Color>();
                    ColorGenerator cg = new ColorGenerator();
                    for (String type : um.edgeTypes())
                        edgeColors.put(type, cg.next());


                    PrintWriter pw = new PrintWriter(new File(baseDir, "index.html"));
                    PrintWriter imgScript = new PrintWriter(new File(baseDir, "img-script.sh"));
                    imgScript.println("#!/bin/bash");
                    pw.println("<html>");
                    pw.println("<head><script src=\"http://www.kryogenix.org/code/browser/sorttable/sorttable.js\"></script></head>");
                    // pw.println("<head><script src=\"sorttable.js\"></script></head>");
                    pw.println("<body><table border=\"2\" class=\"sortable\">");
                    pw.println("  <tr>" + 
                               "<td><h1><u>Motif</u></h1></td>" +
                               "<td><h1><u>Count</u></h1></td>" +
                               "<td><h1><u>Z-Score</u></h1></td>" +
                               "<td><h1><u>Mean Count in Random Graphs</u></h1></td>" +
                               "<td><h1><u>StdDev in Random Graphs</u></h1></td>" +
                               "</tr>");
                    int graphNum = 0;
                    for (Map.Entry<Multigraph<String,TypedEdge<String>>,Fanmod.Result> e :
                             motifToZScore.entrySet()) {
                        File dotFile = new File(baseDir, "graph-" + (graphNum++) + ".dot");
                        dio.writeUndirectedMultigraph(e.getKey(), dotFile, edgeColors);
                        String imgFile = dotFile.getName();
                        imgFile = imgFile.substring(0, imgFile.length() - 3) + "gif";
                        imgScript.printf("dot -Tgif %s -o %s%n", dotFile.getName(), imgFile);
                        int count = e.getValue().count;
                        double zScore = e.getValue().statistic;
                        double mean = e.getValue().meanCountInNullModel;
                        double stddev = e.getValue().stddevInNullModel;
                        pw.printf("  <tr><td><img src=\"%s\"></td><td>%d</td><td>%f</td><td>%f</td><td>%f</td></tr>%n",
                                  imgFile, count, zScore, mean, stddev);
                    }
                    pw.println("</table></body></html>");
                    imgScript.close();
                    pw.close();


                    info(LOGGER, "writing final motifs to %s", 
                         opts.getPositionalArg(1));
                    // Write the results to file
                    File output = new File(opts.getPositionalArg(1));
                    // Copy the motifs to a new HashSet to avoid writing the result
                    // as a KeySet, which includes the fanmod result values.
                    SerializableUtil.save(
                        new HashSet<Multigraph<String,TypedEdge<String>>>(
                            motifToZScore.keySet()), output);

View Full Code Here

 * @author Keith Stevens
 */
public class MatrixConverter {


    public static void main(String[] args) throws IOException {
        ArgOptions options = new ArgOptions();
        options.addOption('i', "inputFormat",
                          "the matrix format of the input matrix",
                          true, "STRING", "Required");
        options.addOption('o', "ouputFormat",
                          "the matrix format of the output matrix",
                          true, "STRING", "Required");
        options.parseOptions(args);


        if (options.numPositionalArgs() != 2 ||
            !options.hasOption('i') || !options.hasOption('o')) {
            System.out.println(
               "usage: java MatrixConverter [options] <int.mat> <out.mat>\n" +
               options.prettyPrint());
            System.exit(1);
        }


        File inMatFile = new File(options.getPositionalArg(0));
        File outMatFile = new File(options.getPositionalArg(1));
        Format inMatFormat = Format.valueOf(
                options.getStringOption('i').toUpperCase());
        Format outMatFormat = Format.valueOf(
                options.getStringOption('o').toUpperCase());
        Matrix matrix = MatrixIO.readMatrix(inMatFile, inMatFormat);
        MatrixIO.writeMatrix(matrix, outMatFile, outMatFormat);
    }

View Full Code Here

        }
    }


    public static void main(String[] args) throws Exception {
        // Setup the argument options.
        ArgOptions options = new ArgOptions();
        options.addOption('Z', "stemmingAlgorithm",
                          "specifices the stemming algorithm to use on " +
                          "tokens while iterating.  (default: none)",
                          true, "CLASSNAME", "Tokenizing Options");
        options.addOption('F', "tokenFilter", "filters to apply to the input " +
                          "token stream", true, "FILTER_SPEC", 
                          "Tokenizing Options");
        options.addOption('L', "lowerCase", "lower-cases each token after " +
                          "all other filtering has been applied", false, null, 
                          "Tokenizing Options");
        options.addOption('P', "partOfSpeech",
                          "use part of speech tags for each token.",
                          false, null, "Tokenizing Options");
        options.addOption('H', "discardHeader",
                          "If true, the first line of each dependency " +
                          "document will be discarded.",
                          false, null, "Tokenizing Options");
        options.addOption('v', "verbose",
                          "Print verbose output about counting status",
                          false, null, "Optional");
        options.addOption('D', "dependencyParseFormat",
                          "the name of the dependency parsed format for " +
                          "the corpus (defalt: CoNLL)",
                          true, "STR", 
                          "Advanced Dependency Parsing");


        // Parse and validate the options.
        options.parseOptions(args);
        if (options.numPositionalArgs() < 2) {
            System.out.println(
                "usage: java DepTokenCounter" 
                + " [options] <output-file> <input-file> [<input-file>]*\n"
                + options.prettyPrint() 
                + "\n\n" + OptionDescriptions.TOKEN_FILTER_DESCRIPTION);
            return;
        }


        // Setup logging.
        if (options.hasOption("verbose")) 
            LoggerUtil.setLevel(Level.FINE);


        // Extract key arguments.
        boolean doLowerCasing = options.hasOption("lowerCase");
        boolean doPos = options.hasOption("partOfSpeech");
        boolean discardHeader = options.hasOption('H');


        TokenFilter filter = (options.hasOption("tokenFilter"))
            ? TokenFilter.loadFromSpecification(options.getStringOption('F'))
            : null;


        Stemmer stemmer = options.getObjectOption("stemmingAlgorithm", null);


        String format = options.getStringOption(
                "dependencyParseFormat", "CoNLL");


        // setup the dependency extractor.
        DependencyExtractor e = null;
        if (format.equals("CoNLL"))
            e = new CoNLLDependencyExtractor(filter, stemmer);
        else if (format.equals("WaCKy"))
            e = new WaCKyDependencyExtractor(filter, stemmer);


        DepTokenCounter counter = new DepTokenCounter(doLowerCasing, doPos, e);


        // Process each of the input files
        for (int i = 1; i < options.numPositionalArgs(); ++i)
            counter.process(new DependencyFileDocumentIterator(
                        options.getPositionalArg(i), discardHeader));


        // Then write the results to disk
        PrintWriter pw = new PrintWriter(options.getPositionalArg(0));
        for (Map.Entry<String,Integer> entry 
                 : counter.tokenToCount.entrySet())
            pw.printf("%s %d\n", entry.getKey(), entry.getValue());
        pw.close();
    }

View Full Code Here

0 1 2

TOP

Related Classes of edu.ucla.sspace.common.ArgOptions

edu.ucla.sspace.mains.EvaluatorMain

edu.ucla.sspace.mains.FixedDurationTemporalRandomIndexingMain

edu.ucla.sspace.mains.GenericMain

edu.ucla.sspace.mains.GenericWordsiMain

edu.ucla.sspace.mains.GrefenstetteMain

edu.ucla.sspace.mains.HadoopGenericMain

edu.ucla.sspace.mains.LRAMain

edu.ucla.sspace.tools.BigramExtractor

edu.ucla.sspace.tools.BlogPreProcessor

edu.ucla.sspace.tools.ChildesParser

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.