Package edu.ucla.sspace.common

Examples of edu.ucla.sspace.common.ArgOptions


/**
* @author Keith Stevens
*/
public class ClusterSSpace {
    public static void main(String[] args) throws Exception {
        ArgOptions options = new ArgOptions();
        options.addOption('s', "sspace",
                          "The semantic space to be clustered",
                          true, "FILE", "Required");
        options.addOption('a', "clusteringAlgorithm",
                          "The clustering algorithm to use",
                          true, "CLASSNAME", "Required");
        options.addOption('c', "numClusters",
                          "The number of clusters to use",
                          true, "INT", "Optional");
        options.parseOptions(args);

        if (!options.hasOption('a') ||
            !options.hasOption('s')) {
            System.out.println("Usage: ClusterSSpace\n" +
                               options.prettyPrint());
            System.exit(1);
        }

        Clustering clustering = ReflectionUtil.getObjectInstance(
                options.getStringOption('a'));
        SemanticSpace sspace = new StaticSemanticSpace(
                options.getStringOption('s'));
        int numClusters = options.getIntOption('c', 0);

        Set<String> words = sspace.getWords();
        List<DoubleVector> vectors = new ArrayList<DoubleVector>();
        List<SparseDoubleVector> sparseVectors =
            new ArrayList<SparseDoubleVector>();
View Full Code Here


            tokens++;
        return tokens;
    }

    public static void main(String[] args) {
        ArgOptions options = new ArgOptions();
        options.addOption('t', "includeTitles",
                          "Prints article and section titles as a part of " +
                          "the document",
                          false, null, "Document Processing");
        options.addOption('c', "includeCaptions",
                          "Prints image and table captions as a part of " +
                          "the document",
                          false, null, "Document Processing");
        options.addOption('w', "includeLinkText",
                          "Prints text in the Wikipedia links as a part of " +
                          "the document",
                          false, null, "Document Processing");
        options.addOption('F', "tokenFilter",
                          "Specifies a filter to remove or retain certain " +
                          "tokens",
                          true, "FILTER_SPEC", "Filtering");
        options.addOption('M', "minTokens",
                          "Records only those documents with at least the " +
                          "minimum number of tokens",
                          true, "INT", "Filtering");
        options.addOption('P', "applyPreprocessor",
                          "Applies the DocumentPreprocessor to the documents",
                          false, null, "Filtering");
        options.addOption('v', "verbose",
                          "Print verbose output about article cleaning",
                          false, null, "Optional");
        options.addOption('V', "veryVerbose",
                          "Print lots of verbose output about article cleaning",
                          false, null, "Optional");


        options.parseOptions(args);

        if (options.numPositionalArgs() != 2) {
            System.out.println("usage java [OPTIONS] <wikifile> <output-file>\n"
                               + options.prettyPrint());
            return;
        }

        // If verbose output is enabled, update all the loggers in the S-Space
        // package logging tree to output at Level.FINE (normally, it is
        // Level.INFO).  This provides a more detailed view of how the execution
        // flow is proceeding.
        Level logLevel = null;
        if (options.hasOption("verbose"))
            logLevel = Level.FINE;
        else if (options.hasOption("veryVerbose"))
            logLevel = Level.FINER;
        if (logLevel != null)
            LoggerUtil.setLevel(logLevel);
       
        // Set up the options for the cleaner
        Set<CleanerOption> cleanerOptions = EnumSet.noneOf(CleanerOption.class);
        if (options.hasOption("includeTitles"))
            cleanerOptions.add(CleanerOption.INCLUDE_TITLES);
        if (options.hasOption("includeCaptions"))
            cleanerOptions.add(CleanerOption.INCLUDE_CAPTIONS);
        if (options.hasOption("includeLinkText"))
            cleanerOptions.add(CleanerOption.INCLUDE_LINK_TEXT);
        if (options.hasOption("tokenFilter")) {
            // Set up the token filter based on the spec
            Properties props = new Properties();
            props.setProperty(IteratorFactory.TOKEN_FILTER_PROPERTY,
                              options.getStringOption("tokenFilter"));
            IteratorFactory.setProperties(props);
            cleanerOptions.add(CleanerOption.FILTER_TOKENS);
        }
        if (options.hasOption("applyPreprocessor"))
            cleanerOptions.add(CleanerOption.USE_PREPROCESSOR);
           
        int minTokens = (options.hasOption("minTokens"))
            ? options.getIntOption("minTokens")
            : 0;

        try {
            DocumentBufferedQueue docQueue =
                new DocumentBufferedQueue(options.getPositionalArg(0));
           
            String outFileName = options.getPositionalArg(1);
            WikipediaCleaner cleaner =
                new WikipediaCleaner(outFileName, cleanerOptions, minTokens);
           
            while (docQueue.hasNext()) {
                cleaner.processDocument(docQueue.next());
View Full Code Here

                            + tokenToCount.size() + " unique tokens");
        }
    }

    public static void main(String[] args) {
        ArgOptions options = new ArgOptions();
        options.addOption('Z', "stemmingAlgorithm",
                          "specifices the stemming algorithm to use on " +
                          "tokens while iterating.  (default: none)",
                          true, "CLASSNAME", "Tokenizing Options");
        options.addOption('F', "tokenFilter", "filters to apply to the input " +
                          "token stream", true, "FILTER_SPEC",
                          "Tokenizing Options");
        options.addOption('C', "compoundWords", "a file where each line is a " +
                          "recognized compound word", true, "FILE",
                          "Tokenizing Options");
        options.addOption('L', "lowerCase", "lower-cases each token after " +
                          "all other filtering has been applied", false, null,
                          "Tokenizing Options");
        options.addOption('z', "wordLimit", "Set the maximum number of words " +
                          "a document can return",
                          true, "INT", "Tokenizing Options");
        options.addOption('v', "verbose",
                          "Print verbose output about counting status",
                          false, null, "Optional");
        options.parseOptions(args);
        if (options.numPositionalArgs() < 2) {
            System.out.println(
                "usage: java TokenCounter"
                + " [options] <output-file> <input-file> [<input-file>]*\n"
                + options.prettyPrint()
                + "\n" + OptionDescriptions.COMPOUND_WORDS_DESCRIPTION
                + "\n\n" + OptionDescriptions.TOKEN_FILTER_DESCRIPTION);
            return;
        }

        if (options.hasOption("verbose"))
            LoggerUtil.setLevel(Level.FINE);


        boolean doLowerCasing = options.hasOption("lowerCase");

        Properties props = System.getProperties();
        // Initialize the IteratorFactory to tokenize the documents according to
        // the specified configuration (e.g. filtering, compound words)
        if (options.hasOption("tokenFilter"))
            props.setProperty(IteratorFactory.TOKEN_FILTER_PROPERTY,
                              options.getStringOption("tokenFilter"));
        // Set any tokenizing options.
        if (options.hasOption("stemmingAlgorithm"))
            props.setProperty(IteratorFactory.STEMMER_PROPERTY,
                              options.getStringOption("stemmingAlgorithm"));
        
        if (options.hasOption("compoundWords"))
            props.setProperty(IteratorFactory.COMPOUND_TOKENS_FILE_PROPERTY,
                              options.getStringOption("compoundWords"));
        if (options.hasOption("wordLimit"))
            props.setProperty(IteratorFactory.TOKEN_COUNT_LIMIT_PROPERTY,
                              options.getStringOption("wordLimit"));

        IteratorFactory.setProperties(props);

        try {
            TokenCounter counter = new TokenCounter(doLowerCasing);
            // Process each of the input files
            for (int i = 1; i < options.numPositionalArgs(); ++i)
                counter.processFile(options.getPositionalArg(i));
            // Then write the results to disk
            PrintWriter pw = new PrintWriter(options.getPositionalArg(0));
            for (Map.Entry<String,Integer> e
                     : counter.tokenToCount.entrySet())
                pw.println(e.getKey() + " " + e.getValue());
            pw.close();
        } catch (Throwable t) {
View Full Code Here

    private static final Logger LOGGER =
        Logger.getLogger(IterativeBigramExtractor.class.getName());


    public static void main(String[] args) throws Exception {
        ArgOptions options = new ArgOptions();

        options.addOption('f', "fileList", "a list of document files",
                          true, "FILE[,FILE...]", "Required (at least one of)");
        options.addOption('d', "docFile",
                          "a file where each line is a document", true,
                          "FILE[,FILE...]", "Required (at least one of)");

        options.addOption('s', "stopWords", "A file containing a list of stop "+
                          "words that should be encluded from bigrams",
                          true, "FILE", "Program Options");


        options.addOption('h', "help", "Generates a help message and exits",
                          false, null, "Program Options");
        options.addOption('v', "verbose", "Turns on verbose output",
                          false, null, "Program Options");
        options.addOption('V', "veryVerbose", "Turns on *very* verbose output",
                          false, null, "Program Options");

        options.addOption('n', "numberOfTermsPerIteration", "Specifies the " +
                          "number of terms to compute the association between "+
                          "per iteration (default: all)",
                          true, "INT", "Runtime Options");       
        options.addOption('F', "filterAssociationBelow", "Specifies the " +
                          "an association score below which the pair will " +
                          "not be reported",
                          true, "DOUBLE", "Runtime Options");       

        options.parseOptions(args);
       
        // Set the verbosity
        if (options.hasOption('v'))
            LoggerUtil.setLevel(Level.FINE);
        if (options.hasOption('V'))
            LoggerUtil.setLevel(Level.FINER);
        if (options.numPositionalArgs() < 3 || options.hasOption("help")) {
            usage(options);
            return;
        }

        File termsFile = new File(options.getPositionalArg(0));
        String outputPrefix = options.getPositionalArg(1);

        Set<String> terms = StringUtils.loadFileAsSet(termsFile);

        Set<String> stopWords = null;
        if (options.hasOption('s')) {
            stopWords = StringUtils.loadFileAsSet(
                new File(options.getStringOption('s')));
        }

        // A mapping to the minimum weight for a test, or null if all the test's
        // scores should be reported
        Map<SignificanceTest,Double> tests =
            new HashMap<SignificanceTest,Double>();
        Map<SignificanceTest,PrintWriter> testWriters =
            new HashMap<SignificanceTest,PrintWriter>();

        int numArgs = options.numPositionalArgs();
        for (int i = 2; i < numArgs; ++i) {
            String testName = options.getPositionalArg(i);
            SignificanceTest test = getTest(testName);
            Double minWeight = null;
            if (i+1 < numArgs) {
                // This might be a test name
                String weightStr = options.getPositionalArg(i+1);

                try {
                    minWeight = Double.parseDouble(weightStr);
                } catch (NumberFormatException nfe) { }
                i++;
            }
            tests.put(test, minWeight);
            PrintWriter pw = new PrintWriter(outputPrefix + testName + ".txt");
            testWriters.put(test, pw);
        }

        int termsToUsePerIteration = (options.hasOption('n'))
            ? options.getIntOption('n')
            : terms.size();              

        Queue<String> termsToAssociate = new ArrayDeque<String>(terms);
        int round = 0;
        while (termsToAssociate.size() > 0) {
View Full Code Here

        }
    }

    public static void main(String[] args) throws Exception {
        // Setup the argument options.
        ArgOptions options = new ArgOptions();
        options.addOption('F', "tokenFilter", "filters to apply to the input " +
                          "token stream", true, "FILTER_SPEC",
                          "Tokenizing Options");
        options.addOption('v', "verbose",
                          "Print verbose output about counting status",
                          false, null, "Optional");

        // Parse and validate the options.
        options.parseOptions(args);
        if (options.numPositionalArgs() < 2) {
            System.out.println(
                "usage: java DepTokenCounter"
                + " [options] <output-file> <input-file> [<input-file>]*\n"
                + options.prettyPrint()
                + "\n\n" + OptionDescriptions.TOKEN_FILTER_DESCRIPTION);
            return;
        }

        // Setup logging.
        if (options.hasOption("verbose"))
            LoggerUtil.setLevel(Level.FINE);

        TokenFilter filter = (options.hasOption("tokenFilter"))
            ? TokenFilter.loadFromSpecification(options.getStringOption('F'))
            : null;

        // setup the dependency extractor.
        DependencyExtractor e = new CoNLLDependencyExtractor(filter, null);
        DepSemTokenCounter counter = new DepSemTokenCounter(e);

        // Process each of the input files
        for (int i = 1; i < options.numPositionalArgs(); ++i)
            counter.process(new DependencyFileDocumentIterator(
                        options.getPositionalArg(i)));

        // Then write the results to disk
        PrintWriter pw = new PrintWriter(options.getPositionalArg(0));
        for (String term : counter.getTokens())
            pw.println(term);
        pw.close();
    }
View Full Code Here

    }

    public static void main(String[] args) {

        // Add the options.
        ArgOptions options = new ArgOptions();
        options.addOption('p', "partOfSpeechTag",
                          "If set, each token will be appended with it's " +
                          "part of speech tag, such as cat-noun",
                          false, null, "Optional");
        options.addOption('S', "separateByPeriod",
                          "If set, seperates sentences by periods",
                          false, null, "Optional");
        options.addOption('U', "utterancePerDoc",
                          "If set, one utterance is considered a document, " +
                          "otherwise all uterances in a file will be " +
                          "considered a document",
                          false, null, "Optional");
        options.addOption('g', "generateOneDoc",
                          "If set, only one document will be generated for " +
                          "all the text processed",
                          false, null, "Optional");

        options.addOption('A', "augmentedUtterances",
                          "Generates augmented utterances from comments " +
                          "about the utterances", false, null, "Augmented");
        options.addOption('F', "augmentedUtterancesFilter",
                          "Specifes a token filter for which tokens in " +
                          "comments are used to generate augmented utterances",
                          true, "SPEC", "Augmented");

        options.addOption('d', "baseChildesDirectory",
                          "The base childes directory.  XML files will be " +
                          "searched for recursively from this base.  Use of " +
                          "this overrides the fileList option.",
                          true, "DIRECTORY", "Required (At least one of)");
        options.addOption('f', "fileList",
                          "The list of files to process",
                          true, "FILE[,FILE]*", "Required (At least one of)");

        // Process the options and emit errors if any required options are
        // missing.
        options.parseOptions(args);
        if ((!options.hasOption("fileList") &&
             !options.hasOption("baseChildesDirectory")) ||
             options.numPositionalArgs() != 2) {
            System.out.println(
                    "usage: java ChildesParser [options] " +
                    "<outfile> <pos-file>\n" +
                    options.prettyPrint());
            return;
        }

        // The default is to have all utterances from a conversation be in a
        // single document
        boolean utterancePerDoc = false;
        utterancePerDoc = options.hasOption("utterancePerDoc");

        boolean genAugmented = options.hasOption("augmentedUtterances");
        if (genAugmented && options.hasOption("augmentedUtterancesFilter")) {
            String filterConf =
                options.getStringOption("augmentedUtterancesFilter");
            Properties p = System.getProperties();
            p.setProperty(IteratorFactory.TOKEN_FILTER_PROPERTY, filterConf);
            IteratorFactory.setProperties(p);
        }

        ChildesParser parser = new ChildesParser(options.getPositionalArg(0),
                                                 options.getPositionalArg(1),
                                                 genAugmented,
                                                 options.hasOption('S'),
                                                 options.hasOption('p'),
                                                 options.hasOption('g'));

        // Process the given file list, if provided.
        if (options.hasOption("fileList")) {
            String[] files = options.getStringOption("fileList").split(",");
            for (String file : files)
                parser.parseFile(new File(file), utterancePerDoc);
        } else {
            // Otherwise search for xml files to process.
            File baseDir =
                new File(options.getStringOption("baseChildesDirectory"));
            findXmlFiles(parser, utterancePerDoc, baseDir);
        }

        parser.finish();
    }
View Full Code Here

        }
    }

    public static void main(String[] args) throws Exception {
        // Setup the argument options.
        ArgOptions options = new ArgOptions();
        options.addOption('F', "tokenFilter", "filters to apply to the input " +
                          "token stream", true, "FILTER_SPEC",
                          "Tokenizing Options");
        options.addOption('v', "verbose",
                          "Print verbose output about counting status",
                          false, null, "Optional");

        // Parse and validate the options.
        options.parseOptions(args);
        if (options.numPositionalArgs() < 2) {
            System.out.println(
                "usage: java DepTokenCounter"
                + " [options] <output-file> <input-file> [<input-file>]*\n"
                + options.prettyPrint()
                + "\n\n" + OptionDescriptions.TOKEN_FILTER_DESCRIPTION);
            return;
        }

        // Setup logging.
        if (options.hasOption("verbose"))
            LoggerUtil.setLevel(Level.FINE);

        TokenFilter filter = (options.hasOption("tokenFilter"))
            ? TokenFilter.loadFromSpecification(options.getStringOption('F'))
            : null;

        // setup the dependency extractor.
        DependencyExtractor e = new CoNLLDependencyExtractor(filter, null);
        DepPsdTokenCounter counter = new DepPsdTokenCounter(e);

        // Process each of the input files
        for (int i = 1; i < options.numPositionalArgs(); ++i)
            counter.process(new DependencyFileDocumentIterator(
                        options.getPositionalArg(i)));

        // Then write the results to disk
        PrintWriter pw = new PrintWriter(options.getPositionalArg(0));
        for (String term : counter.getTokens())
            pw.println(term);
        pw.close();
    }
View Full Code Here

    private static final Logger LOGGER =
        Logger.getLogger(LinkClusteringTool.class.getName());

    public static void main(String[] args) {
        ArgOptions opts = new ArgOptions();
       
        opts.addOption('h', "help", "Generates a help message and exits",
                          false, null, "Program Options");
        opts.addOption('W', "minWeight", "Loads a weighted graph as " +
                       "unweighted, keeping only those edges with weight "+
                       "at least the specified value",
                       true, "Double", "Input Options");
        opts.addOption('v', "verbose", "Turns on verbose output",
                          false, null, "Program Options");
        opts.addOption('V', "verbVerbose", "Turns on very verbose output",
                          false, null, "Program Options");
       
        ////////
        //
        // OPTIONS TO BE ADDED AT SOME POINT IN THE FUTURE...
        //
        ///////

//         opts.addOption('w', "weighted", "Uses a weighted edge simiarity",
//                        false, null, "Program Options");
//         opts.addOption('k', "kpartite", "Uses the k-partite link clustering " +
//                        "with the provided file that maps a vertex to " +
//                        "its partition (note: not its community)",
//                        true, "FILE", "Program Options");

//         opts.addOption('d', "printDensities", "Prints all the cluster " +
//                        "densities to the specified file", true, "FILE",
//                        "Program Options");
//         opts.addOption('a', "saveAllSolutions", "Saves the communities for all"+
//                        "possible partitionings", true, "FILE_PREFIX",
//                        "Program Options");
//         opts.addOption('n', "saveNthSolutions", "Saves only every nth solution"+
//                        " when -a is used", true, "INT", "Program Options");

        opts.parseOptions(args);

        if (opts.numPositionalArgs() < 2 || opts.hasOption("help")) {
            usage(opts);
            return;
        }

        // If verbose output is enabled, update all the loggers in the S-Space
        // package logging tree to output at Level.FINE (normally, it is
        // Level.INFO).  This provides a more detailed view of how the execution
        // flow is proceeding.
        if (opts.hasOption('v'))
            LoggerUtil.setLevel(Level.FINE);
        if (opts.hasOption('V'))
            LoggerUtil.setLevel(Level.FINER);


        try {
           
            LOGGER.info("Loading graph file");
            Indexer<String> vertexLabels = new ObjectIndexer<String>();
            File f = new File(opts.getPositionalArg(0));
            Graph<Edge> graph = null;
            if (opts.hasOption('W'))
                graph = GraphIO.readUndirectedFromWeighted(
                    f, vertexLabels, opts.getDoubleOption('W'));
            else
                GraphIO.readUndirected(f, vertexLabels);

            LinkClustering lc = new LinkClustering();
            MultiMap<Integer,Integer> clusterToVertices =
                lc.cluster(graph, System.getProperties());

            PrintWriter pw = new PrintWriter(
                new BufferedOutputStream(new FileOutputStream(
                                         opts.getPositionalArg(1))));

            for (Map.Entry<Integer,Set<Integer>> e :
                     clusterToVertices.asMap().entrySet()) {
                Integer clusterId = e.getKey();
                Set<Integer> vertices = e.getValue();
View Full Code Here

                           "Command line options:\n" + options.prettyPrint() +
                           "\n\nExplorer commands:\n" + getCommands());
    }

    public static void main(String[] args) {
        ArgOptions options = new ArgOptions();
        options.addOption('h', "help", "Generates a help message and exits",
                          false, null, "Program Options");
        options.addOption('f', "executeFile", "Executes the commands in the " +
                          "specified file and exits", true, "FILE",
                          "Program Options");
        options.addOption('s', "saveRecord", "Saves a record of all the " +
                          "executed commands to the specfied file", true,
                          "FILE""Program Options");

        options.parseOptions(args);

        if (options.hasOption("help")) {
            usage(options);
            return;
        }

        PrintWriter recordFile = null;
        if (options.hasOption("saveRecord")) {
            try {
                recordFile = new PrintWriter(
                    options.getStringOption("saveRecord"));
            } catch (IOException ioe) {
                System.out.println("Unable to open file for saving commands:\n"
                                   + ioe);
            }
        }
       
        BufferedReader commandsToExecute = null;
        if (options.hasOption("executeFile")) {
            try {
                commandsToExecute = new BufferedReader(new FileReader(
                    options.getStringOption("executeFile")));
            } catch (IOException ioe) {
                System.out.println("unable to open commands file " +
                                   options.getStringOption("executeFile")
                                   + ":\n" + ioe);
                return;
            }
        }
        else {
            commandsToExecute =
                new BufferedReader(new InputStreamReader(System.in));
        }

        boolean suppressPrompt = options.hasOption("executeFile");

        SemanticSpaceExplorer explorer = new SemanticSpaceExplorer();
        Pattern regex = Pattern.compile("[^\\s\"']+|\"([^\"]*)\"|'([^']*)'");
        try {
            if (!suppressPrompt)
View Full Code Here

* @author Keith Stevens
*/
public class ReductionEval {

    public static void main(String[] args) throws Exception {
        ArgOptions options = new ArgOptions();
        options.addOption('w', "wordSpace",
                          "The name of the file to which the reduced " +
                          "word space should be saved",
                          true, "FILE", "Required");
        options.addOption('d', "docSpace",
                          "The name of the file to which the reduced " +
                          "document space should be saved",
                          true, "FILE", "Required");
        options.addOption('r', "dimensions",
                          "The number of reduced dimensions.",
                          true, "INTEGER", "Required");
        options.addOption('a', "reductionAlgorithm",
                          "The reduction algorithm to use, either NMF or SVD",
                          true, "NMF|SVD", "Required");
        options.parseOptions(args);

        LoggerUtil.setLevel(Level.FINE);

        int dimensions = options.getIntOption('r');
        MatrixFactorization reducer = null;
        Format format = null;
        if (options.getStringOption('a').equals("NMF")) {
            reducer = new NonNegativeMatrixFactorizationMultiplicative();
            format = Format.MATLAB_SPARSE;
        } else if (options.getStringOption('a').equals("SVD")) {
            reducer = SVD.getFastestAvailableFactorization();
            format = Format.SVDLIBC_SPARSE_BINARY;
        } else
            System.exit(1);


        MatrixFile mFile = new MatrixFile(new File(options.getPositionalArg(0)),
                                          format);

        reducer.factorize(mFile, dimensions);

        File wordSpaceFile = new File(options.getStringOption('w'));
        MatrixIO.writeMatrix(reducer.dataClasses(), wordSpaceFile,
                             Format.DENSE_TEXT);

        File docSpaceFile = new File(options.getStringOption('d'));
        MatrixIO.writeMatrix(reducer.classFeatures(), docSpaceFile,
                             Format.DENSE_TEXT);
    }
View Full Code Here

TOP

Related Classes of edu.ucla.sspace.common.ArgOptions

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.