Package edu.ucla.sspace.common

Examples of edu.ucla.sspace.common.ArgOptions


     * Adds the default options for running semantic space algorithms from the
     * command line.  Subclasses should override this method and return a
     * different instance if the default options need to be different.
     */
    protected ArgOptions setupOptions() {
        ArgOptions options = new ArgOptions();

        // Add input file options.
        options.addOption('f', "fileList", "a list of document files",
                          true, "FILE[,FILE...]", "Required (at least one of)");
        options.addOption('d', "docFile",
                          "a file where each line is a document", true,
                          "FILE[,FILE...]", "Required (at least one of)");
        options.addOption('R', "corpusReader",
                          "Specifies a CorpusReader which will " +
                          "automatically parse the document files that are " +
                          "not in the formats expected by -f and -d.",
                          true, "CLASSNAME,FILE[,FILE...]",
                          "Required (at least one of)");

        // Add run time options.
        options.addOption('o', "outputFormat", "the .sspace format to use",
                          true, "FORMAT",
                          "Program Options");
        if (isMultiThreaded) {
            options.addOption('t', "threads", "the number of threads to use",
                              true, "INT", "Program Options");
        }
        options.addOption('w', "overwrite", "specifies whether to " +
                          "overwrite the existing output", true, "BOOL",
                          "Program Options");
        options.addOption('v', "verbose", "prints verbose output",
                          false, null, "Program Options");

        // Add tokenizing options.
        options.addOption('Z', "stemmingAlgorithm",
                          "specifices the stemming algorithm to use on " +
                          "tokens while iterating.  (default: none)",
                          true, "CLASSNAME", "Tokenizing Options");
        options.addOption('F', "tokenFilter", "filters to apply to the input " +
                          "token stream", true, "FILTER_SPEC",
                          "Tokenizing Options");
        options.addOption('C', "compoundWords", "a file where each line is a " +
                          "recognized compound word", true, "FILE",
                          "Tokenizing Options");
        options.addOption('z', "wordLimit", "Set the maximum number of words " +
                          "a document can return",
                          true, "INT", "Tokenizing Options");

        addExtraOptions(options);
        return options;
View Full Code Here


    /**
     * Adds the options for running the {@code Grefenstette} algorithm
     */
    @Override
    protected ArgOptions setupOptions() {
        ArgOptions options = new ArgOptions();
        options.addOption('s', "sentenceFile",
                          "a file where each line is a sentence", true,
                          "FILE[,FILE...]", "Required (at least one of)");

        options.addOption('o', "outputFormat", "the .sspace format to use",
                          true, "{text|binary}", "Program Options");
        // options.addOption('t', "threads", "the number of threads to use",
        //                           true, "INT", "Program Options");
        options.addOption('w', "overwrite", "specifies whether to " +
                          "overwrite the existing output", true, "BOOL",
                          "Program Options");
        options.addOption('t', "threads", "the number of threads to use",
                          true, "INT", "Program Options");
        options.addOption('v', "verbose", "prints verbose output",
                          false, null, "Program Options");
        addExtraOptions(options);
        return options;
    }
View Full Code Here

    /**
     * Adds all of the options to the {@link ArgOptions}.
     */
    protected ArgOptions createOptions() {
        ArgOptions options = new ArgOptions();
        options.addOption('f', "fileList", "a list of document files",
                          true, "FILE[,FILE...]", "Required (at least one of)");
        options.addOption('d', "docFile",
                          "a file where each line is a document", true,
                          "FILE[,FILE...]", "Required (at least one of)");

        options.addOption('T', "timespan", "the timespan for each semantic " +
                          "partition", true, "Date String", "Required");

        options.addOption('o', "outputFormat", "the .sspace format to use",
                          true, "{text|binary}", "Program Options");
        options.addOption('t', "threads", "the number of threads to use",
                          true, "INT", "Program Options");
        options.addOption('w', "overwrite", "specifies whether to " +
                          "overwrite the existing output", true, "BOOL",
                          "Program Options");
        options.addOption('v', "verbose", "prints verbose output",
                          false, null, "Program Options");

        // Algorithm Options
        options.addOption('i', "vectorGenerator", "IndexVectorGenerator "
                          + "class to use", true,
                          "CLASSNAME", "Algorithm Options");
        options.addOption('l', "vectorLength", "length of semantic vectors",
                          true, "INT", "Algorithm Options");
        options.addOption('n', "permutationFunction", "permutation function "
                          + "to use", true,
                          "CLASSNAME", "Algorithm Options");
        options.addOption('p', "usePermutations", "whether to permute " +
                          "index vectors based on word order", true,
                          "BOOL", "Algorithm Options");
        options.addOption('r', "useSparseSemantics", "use a sparse encoding of "
                          + "semantics to save memory", true,
                          "BOOL", "Algorithm Options");
        options.addOption('s', "windowSize", "how many words to consider " +
                          "in each direction", true,
                          "INT", "Algorithm Options");
        options.addOption('S', "saveVectors", "save word-to-IndexVector mapping"
                          + " after processing", true,
                          "FILE", "Algorithm Options");
        options.addOption('L', "loadVectors", "load word-to-IndexVector mapping"
                          + " before processing", true,
                          "FILE", "Algorithm Options");

        // Input Options
        options.addOption('F', "tokenFilter", "filters to apply to the input " +
                          "token stream", true, "FILTER_SPEC",
                          "Tokenizing Options");
        options.addOption('C', "compoundWords", "a file where each line is a " +
                          "recognized compound word", true, "FILE",
                          "Tokenizing Options");

        options.addOption('W', "semanticFilter", "exclusive list of word",
                          true, "FILE", "Input Options");

        // Output Options
        options.addOption('I', "interestingTokenList", "list of interesting " +
                          "words", true, "FILE", "Output Options");
        options.addOption('K', "printShiftRankings", "print ranked list of " +
                          "semantic shifts for each interesting word", false,
                          null, "Output Options");
        options.addOption('R', "savePartitions", "write semantic partitions as " +
                          ".sspace files to disk", false, null,
                          "Output Options");
        options.addOption('P', "printInterestingTokenShifts", "prints the "
                          + "vectors for each interesting word", false, null,
                          "Output Options");
        options.addOption('N', "printInterestingTokenNeighbors", "prints the "
                          + "nearest neighbors for each interesting word", true,
                          "INT", "Output Options");
        options.addOption('Z', "printInterestingTokenNeighborComparison",
                          "prints the distances between each of the"
                          + "nearest neighbors for each interesting word",
                          false, null , "Output Options");

        return options;
View Full Code Here

    /**
     * Creates the {@code EvaluatorMain}.
     */
    public EvaluatorMain() {
        argOptions = new ArgOptions();
        addOptions();
    }
View Full Code Here

    /**
     * {@inheritDoc}
     */
    protected SemanticSpace getSpace() {
        ArgOptions options = argOptions;
        // Setup the assignment reporter.  When training, the assignment report
        // will only be used If the evaluation mode will be for pseudoWord.
        AssignmentReporter reporter = null;
        if (options.hasOption('P'))
            reporter = new PseudoWordReporter(System.out);

        int numClusters = options.getIntOption('c', 0);

        // If Wordsi is being used in an evaluation mode, set up word space
        // accordingly.
        if (options.hasOption('e')) {
            // If the evaluation type is not set, report an error and exit.
            if (!options.hasOption('E') && !options.hasOption('P')) {
                usage();
                System.out.println(
                        "An Evaluation Type must be set when evaluating " +
                        " a trained Wordsi model.");
                System.exit(1);
            }

            // Load the semantic space that has the predefined word senses from
            // disk and return an EvaluationWordsi instance.
            try {
                SemanticSpace sspace = SemanticSpaceIO.load(
                        options.getStringOption('e'));
                if (options.hasOption('E'))
                    reporter = new SemEvalReporter(System.out);
                return new EvaluationWordsi(
                        getAcceptedWords(), getExtractor(), sspace, reporter);
            } catch (IOException ioe) {
                throw new IOError(ioe);
            }
        } else if (options.hasOption('s')) {
            // Create a StreamingWordsi instance that uses the specified online
            // cluster generator.
            System.getProperties().setProperty(
                    OnlineClustering.NUM_CLUSTERS_PROPERTY,
                    options.getStringOption('c'));
            Generator<OnlineClustering<SparseDoubleVector>> clusterGenerator =
                ReflectionUtil.getObjectInstance(options.getStringOption('s'));
            return new StreamingWordsi(getAcceptedWords(), getExtractor(),
                                       clusterGenerator, reporter, numClusters);
        } else if (options.hasOption('b')) {
            // Create a WaitingWordsi instance that uses the specified batch
            // clustering implementation.
            Clustering clustering =
                ReflectionUtil.getObjectInstance(options.getStringOption('b'));
            return new WaitingWordsi(getAcceptedWords(), getExtractor(),
                                     clustering, reporter, numClusters);
        } else {
            // None of the required options was provided, report an error and
            // exit.
View Full Code Here

     * Adds the default options for running semantic space algorithms from the
     * command line.  Subclasses should override this method and return a
     * different instance if the default options need to be different.
     */
    protected ArgOptions setupOptions() {
        ArgOptions options = new ArgOptions();
        options.addOption('c', "corpusDir", "the directory of the corpus",
                          true, "DIR", "Required");
        options.addOption('a', "analogyFile",
                          "the file containing list of word pairs",
                          true, "FILE", "Required");
        options.addOption('t', "testAnalogies",
                           "the file containing list of analogies",
                           true, "FILE", "Required");
        options.addOption('o', "outputFile",
                          "the file containing the cosine similarity output " +
                          "for the analogies from testAnalogies",
                          true, "FILE", "Required");
        options.addOption('i', "indexDir",
                          "a Directory for storing or loading "
                          + "the Lucene index", true, "DIR", "Required");
        options.addOption('n', "dimensions",
                          "the number of dimensions in the semantic space",
                          true, "INT");
        options.addOption('r', "readMatrixFile",
                          "file containing projection matrix"
                          , true, "FILE");
        options.addOption('s', "skipIndex",
                          "turn indexing off.  Must specify index directory",
                          false , null);
        options.addOption('v', "verbose",
                          "prints verbose output",
                          false, null, "Program Options");
        options.addOption('w', "writeMatrixFile",
                          "file to write projection matrix to"
                          , true, "FILE");
        return options;
    }
View Full Code Here

     * Adds the default options for running semantic space algorithms from the
     * command line.  Subclasses should override this method and return a
     * different instance if the default options need to be different.
     */
    protected ArgOptions setupOptions() {
        ArgOptions options = new ArgOptions();

        // Add run time options.
        options.addOption('o', "outputFormat", "the .sspace format to use",
                          true, "FORMAT",
                          "Program Options");
        options.addOption('w', "overwrite", "specifies whether to " +
                          "overwrite the existing output", true, "BOOL",
                          "Program Options");
        options.addOption('v', "verbose", "prints verbose output",
                          false, null, "Program Options");

        // Add tokenizing options.
       
        options.addOption('Z', "stemmingAlgorithm",
                          "specifices the stemming algorithm to use on " +
                          "tokens while iterating.  (default: none)",
                          true, "CLASSNAME", "Tokenizing Options");
        options.addOption('F', "tokenFilter", "filters to apply to the input " +
                          "token stream", true, "FILTER_SPEC",
                          "Tokenizing Options");
        options.addOption('C', "compoundWords", "a file where each line is a " +
                          "recognized compound word", true, "FILE",
                          "Tokenizing Options");
        options.addOption('z', "wordLimit", "Set the maximum number of words " +
                          "an document can return",
                          true, "INT", "Tokenizing Options");       

        addExtraOptions(options);
        return options;
View Full Code Here

    private static final Logger LOGGER =
        Logger.getLogger(FanmodTool.class.getName());

    public static void main(String[] args) {
        ArgOptions opts = new ArgOptions();
       
        opts.addOption('h', "help", "Generates a help message and exits",
                          false, null, "Program Options");
        opts.addOption('v', "verbose", "Turns on verbose output",
                          false, null, "Program Options");
        opts.addOption('V', "verbVerbose", "Turns on very verbose output",
                          false, null, "Program Options");

        opts.addOption('r', "randomGraphs", "The number of random graphs" +
                       " to use for the null model (default: 1000)",
                       true, "INT", "Algorithm Options");
        opts.addOption('z', "motifSize", "The number of vertices in the" +
                       " identified motifs (default: 3)",
                       true, "INT", "Algorithm Options");
        opts.addOption('s', "useSimpleMotifs", "If searching for motifs in a " +
                       "multigraph, counts only simple graphs as motifs",
                       false, null, "Algorithm Options");

        opts.addOption('Z', "minZScore", "The minimum Z-Score for any motif" +
                       " in the original network to be used for computing " +
                       "modularity (default: 1)",
                       true, "DOUBLE", "Algorithm Options");
        opts.addOption('O', "minOccurrences", "The minimum number of occurrences"
                       + " for any motif" +
                       " in the original network to be used for computing " +
                       "modularity (default: 1)",
                       true, "INT", "Algorithm Options");


//         opts.addOption('w', "weighted", "Uses a weighted edge simiarity",
//                           false, null, "Input Options");
        opts.addOption('d', "loadAsDirectedGraph", "Loads the input graph as " +
                       "a directed graph",
                       false, null, "Input Options");
        opts.addOption('m', "loadAsMultigraph", "Loads the input graph as " +
                       "a multigraph",
                       false, null, "Input Options");

        opts.addOption('o', "outputFormat", "The type of format to use " +
                       "when writing the graphs (default: serialized)",
                       true, "FORMAT", "Output Options");       
        opts.addOption('H', "makeHtml", "Generates an HTML rendering" +
                       "of the significant motifs",
                       true, "DIR", "Output Options");

        opts.parseOptions(args);

        if (opts.numPositionalArgs() < 2 || opts.hasOption("help")) {
            usage(opts);
            return;
        }

        // If verbose output is enabled, update all the loggers in the S-Space
        // package logging tree to output at Level.FINE (normally, it is
        // Level.INFO).  This provides a more detailed view of how the execution
        // flow is proceeding.
        if (opts.hasOption('v'))
            LoggerUtil.setLevel(Level.FINE);
        if (opts.hasOption('V'))
            LoggerUtil.setLevel(Level.FINER);


        LOGGER.info("Loading graph file");
        Indexer<String> vertexLabels = new ObjectIndexer<String>();
        File f = new File(opts.getPositionalArg(0));

        int motifSize = (opts.hasOption('z')) ? opts.getIntOption('z') : 3;
        int numRandomGraphs = (opts.hasOption('r'))
            ? opts.getIntOption('r') : 1000;

        double minZScore = opts.hasOption('Z')
            ? opts.getDoubleOption('Z')
            : 1d;
        int minOccurrences = opts.hasOption('O')
            ? opts.getIntOption('O')
            : 1;
        Fanmod.MotifFilter filter =
            new Fanmod.FrequencyAndZScoreFilter(minOccurrences, minZScore);

        info(LOGGER, "retaining motifs occurring at least %d times with a " +
             "z-score at or above %f", minOccurrences, minZScore);
      
        boolean isMultigraph = opts.hasOption('m');
        boolean isDirected = opts.hasOption('d');
        Fanmod fanmod = new Fanmod();
       
        try {      
            if (isMultigraph && isDirected) {
                DirectedMultigraph<String> dm =
                    GraphIO.readDirectedMultigraph(f, vertexLabels);
                boolean findSimpleMotifs = opts.hasOption('s');
                Map<Multigraph<String,DirectedTypedEdge<String>>,Fanmod.Result>
                    motifToZScore = fanmod.findMotifs(
                      dm, findSimpleMotifs, motifSize, numRandomGraphs, filter);
                info(LOGGER, "found %d motifs with z-score above %f%n",
                     motifToZScore.size(), minZScore);
                if (opts.hasOption('H')) {
                    File baseDir = new File(opts.getStringOption('H'));
                    // Check that we can create output in that directory
                    if (!baseDir.exists())
                        baseDir.mkdir();
                    DotIO dio = new DotIO();
                   
                    // Generate a consistent set of edge colors to user across
                    // all the motif visualizations
                    Map<String,Color> edgeColors = new HashMap<String,Color>();
                    ColorGenerator cg = new ColorGenerator();
                    for (String type : dm.edgeTypes())
                        edgeColors.put(type, cg.next());

                    PrintWriter pw = new PrintWriter(new File(baseDir, "index.html"));
                    PrintWriter imgScript = new PrintWriter(new File(baseDir, "img-script.sh"));
                    imgScript.println("#!/bin/bash");
                    pw.println("<html>");
                    pw.println("<head><script src=\"http://www.kryogenix.org/code/browser/sorttable/sorttable.js\"></script></head>");
                    // pw.println("<head><script src=\"sorttable.js\"></script></head>");
                    pw.println("<body><table border=\"2\" class=\"sortable\">");
                    pw.println("  <tr>" +
                               "<td><h1><u>Motif</u></h1></td>" +
                               "<td><h1><u>Count</u></h1></td>" +
                               "<td><h1><u>Z-Score</u></h1></td>" +
                               "<td><h1><u>Mean Count in Random Graphs</u></h1></td>" +
                               "<td><h1><u>StdDev in Random Graphs</u></h1></td>" +
                               "</tr>");
                    int graphNum = 0;
                    for (Map.Entry<Multigraph<String,DirectedTypedEdge<String>>,Fanmod.Result> e :
                             motifToZScore.entrySet()) {
                        File dotFile = new File(baseDir, "graph-" + (graphNum++) + ".dot");
                        dio.writeDirectedMultigraph(e.getKey(), dotFile, edgeColors);
                        String imgFile = dotFile.getName();
                        imgFile = imgFile.substring(0, imgFile.length() - 3) + "gif";
                        imgScript.printf("dot -Tgif %s -o %s%n", dotFile.getName(), imgFile);
                        int count = e.getValue().count;
                        double zScore = e.getValue().statistic;
                        double mean = e.getValue().meanCountInNullModel;
                        double stddev = e.getValue().stddevInNullModel;
                        pw.printf("  <tr><td><img src=\"%s\"></td><td>%d</td><td>%f</td><td>%f</td><td>%f</td></tr>%n",
                                  imgFile, count, zScore, mean, stddev);
                    }
                    pw.println("</table></body></html>");
                    imgScript.close();
                    pw.close();
                }
               
                info(LOGGER, "writing final motifs to %s",
                     opts.getPositionalArg(1));
                // Write the results to file
                File output = new File(opts.getPositionalArg(1));
                // Copy the motifs to a new HashSet to avoid writing the result
                // as a KeySet, which includes the fanmod result values.
                SerializableUtil.save(
                    new HashSet<Multigraph<String,
                        DirectedTypedEdge<String>>>(motifToZScore.keySet()), output);
            }

            else if (isMultigraph) {
                boolean findSimpleMotifs = opts.hasOption('s');
                UndirectedMultigraph<String> um =
                    GraphIO.readUndirectedMultigraph(f, vertexLabels);
                Map<Multigraph<String,TypedEdge<String>>,Fanmod.Result>
                    motifToZScore = fanmod.findMotifs(
                      um, findSimpleMotifs, motifSize, numRandomGraphs, filter);
                info(LOGGER, "found %d motifs with z-score above %f%n",
                     motifToZScore.size(), minZScore);
                if (opts.hasOption('H')) {
                    File baseDir = new File(opts.getStringOption('H'));
                    // Check that we can create output in that directory
                    if (!baseDir.exists())
                        baseDir.mkdir();
                    DotIO dio = new DotIO();
                    // Generate a consistent set of edge colors to user across
                    // all the motif visualizations
                    Map<String,Color> edgeColors = new HashMap<String,Color>();
                    ColorGenerator cg = new ColorGenerator();
                    for (String type : um.edgeTypes())
                        edgeColors.put(type, cg.next());

                    PrintWriter pw = new PrintWriter(new File(baseDir, "index.html"));
                    PrintWriter imgScript = new PrintWriter(new File(baseDir, "img-script.sh"));
                    imgScript.println("#!/bin/bash");
                    pw.println("<html>");
                    pw.println("<head><script src=\"http://www.kryogenix.org/code/browser/sorttable/sorttable.js\"></script></head>");
                    // pw.println("<head><script src=\"sorttable.js\"></script></head>");
                    pw.println("<body><table border=\"2\" class=\"sortable\">");
                    pw.println("  <tr>" +
                               "<td><h1><u>Motif</u></h1></td>" +
                               "<td><h1><u>Count</u></h1></td>" +
                               "<td><h1><u>Z-Score</u></h1></td>" +
                               "<td><h1><u>Mean Count in Random Graphs</u></h1></td>" +
                               "<td><h1><u>StdDev in Random Graphs</u></h1></td>" +
                               "</tr>");
                    int graphNum = 0;
                    for (Map.Entry<Multigraph<String,TypedEdge<String>>,Fanmod.Result> e :
                             motifToZScore.entrySet()) {
                        File dotFile = new File(baseDir, "graph-" + (graphNum++) + ".dot");
                        dio.writeUndirectedMultigraph(e.getKey(), dotFile, edgeColors);
                        String imgFile = dotFile.getName();
                        imgFile = imgFile.substring(0, imgFile.length() - 3) + "gif";
                        imgScript.printf("dot -Tgif %s -o %s%n", dotFile.getName(), imgFile);
                        int count = e.getValue().count;
                        double zScore = e.getValue().statistic;
                        double mean = e.getValue().meanCountInNullModel;
                        double stddev = e.getValue().stddevInNullModel;
                        pw.printf("  <tr><td><img src=\"%s\"></td><td>%d</td><td>%f</td><td>%f</td><td>%f</td></tr>%n",
                                  imgFile, count, zScore, mean, stddev);
                    }
                    pw.println("</table></body></html>");
                    imgScript.close();
                    pw.close();

                    info(LOGGER, "writing final motifs to %s",
                         opts.getPositionalArg(1));
                    // Write the results to file
                    File output = new File(opts.getPositionalArg(1));
                    // Copy the motifs to a new HashSet to avoid writing the result
                    // as a KeySet, which includes the fanmod result values.
                    SerializableUtil.save(
                        new HashSet<Multigraph<String,TypedEdge<String>>>(
                            motifToZScore.keySet()), output);
View Full Code Here

* @author Keith Stevens
*/
public class MatrixConverter {

    public static void main(String[] args) throws IOException {
        ArgOptions options = new ArgOptions();
        options.addOption('i', "inputFormat",
                          "the matrix format of the input matrix",
                          true, "STRING", "Required");
        options.addOption('o', "ouputFormat",
                          "the matrix format of the output matrix",
                          true, "STRING", "Required");
        options.parseOptions(args);

        if (options.numPositionalArgs() != 2 ||
            !options.hasOption('i') || !options.hasOption('o')) {
            System.out.println(
               "usage: java MatrixConverter [options] <int.mat> <out.mat>\n" +
               options.prettyPrint());
            System.exit(1);
        }

        File inMatFile = new File(options.getPositionalArg(0));
        File outMatFile = new File(options.getPositionalArg(1));
        Format inMatFormat = Format.valueOf(
                options.getStringOption('i').toUpperCase());
        Format outMatFormat = Format.valueOf(
                options.getStringOption('o').toUpperCase());
        Matrix matrix = MatrixIO.readMatrix(inMatFile, inMatFormat);
        MatrixIO.writeMatrix(matrix, outMatFile, outMatFormat);
    }
View Full Code Here

        }
    }

    public static void main(String[] args) throws Exception {
        // Setup the argument options.
        ArgOptions options = new ArgOptions();
        options.addOption('Z', "stemmingAlgorithm",
                          "specifices the stemming algorithm to use on " +
                          "tokens while iterating.  (default: none)",
                          true, "CLASSNAME", "Tokenizing Options");
        options.addOption('F', "tokenFilter", "filters to apply to the input " +
                          "token stream", true, "FILTER_SPEC",
                          "Tokenizing Options");
        options.addOption('L', "lowerCase", "lower-cases each token after " +
                          "all other filtering has been applied", false, null,
                          "Tokenizing Options");
        options.addOption('P', "partOfSpeech",
                          "use part of speech tags for each token.",
                          false, null, "Tokenizing Options");
        options.addOption('H', "discardHeader",
                          "If true, the first line of each dependency " +
                          "document will be discarded.",
                          false, null, "Tokenizing Options");
        options.addOption('v', "verbose",
                          "Print verbose output about counting status",
                          false, null, "Optional");
        options.addOption('D', "dependencyParseFormat",
                          "the name of the dependency parsed format for " +
                          "the corpus (defalt: CoNLL)",
                          true, "STR",
                          "Advanced Dependency Parsing");

        // Parse and validate the options.
        options.parseOptions(args);
        if (options.numPositionalArgs() < 2) {
            System.out.println(
                "usage: java DepTokenCounter"
                + " [options] <output-file> <input-file> [<input-file>]*\n"
                + options.prettyPrint()
                + "\n\n" + OptionDescriptions.TOKEN_FILTER_DESCRIPTION);
            return;
        }

        // Setup logging.
        if (options.hasOption("verbose"))
            LoggerUtil.setLevel(Level.FINE);

        // Extract key arguments.
        boolean doLowerCasing = options.hasOption("lowerCase");
        boolean doPos = options.hasOption("partOfSpeech");
        boolean discardHeader = options.hasOption('H');

        TokenFilter filter = (options.hasOption("tokenFilter"))
            ? TokenFilter.loadFromSpecification(options.getStringOption('F'))
            : null;

        Stemmer stemmer = options.getObjectOption("stemmingAlgorithm", null);

        String format = options.getStringOption(
                "dependencyParseFormat", "CoNLL");

        // setup the dependency extractor.
        DependencyExtractor e = null;
        if (format.equals("CoNLL"))
            e = new CoNLLDependencyExtractor(filter, stemmer);
        else if (format.equals("WaCKy"))
            e = new WaCKyDependencyExtractor(filter, stemmer);

        DepTokenCounter counter = new DepTokenCounter(doLowerCasing, doPos, e);

        // Process each of the input files
        for (int i = 1; i < options.numPositionalArgs(); ++i)
            counter.process(new DependencyFileDocumentIterator(
                        options.getPositionalArg(i), discardHeader));

        // Then write the results to disk
        PrintWriter pw = new PrintWriter(options.getPositionalArg(0));
        for (Map.Entry<String,Integer> entry
                 : counter.tokenToCount.entrySet())
            pw.printf("%s %d\n", entry.getKey(), entry.getValue());
        pw.close();
    }
View Full Code Here

TOP

Related Classes of edu.ucla.sspace.common.ArgOptions

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.