Package edu.ucla.sspace.common

Examples of edu.ucla.sspace.common.ArgOptions


        return new int[] { a, b, c, d };
    }


    public static void main(String[] args) {
        ArgOptions options = new ArgOptions();
        options.addOption('F', "tokenFilter", "filters to apply to the input " +
                          "token stream", true, "FILTER_SPEC",
                          "Tokenizing Options");
        options.addOption('M', "minFreq", "minimum frequency of the reported " +
                          "bigrams" , true, "INT",
                          "Bigram Options");
        options.addOption('v', "verbose",
                          "Print verbose output about counting status",
                          false, null, "Program Options");
        options.parseOptions(args);

        if (options.numPositionalArgs() < 3) {
            System.out.println("usage: java BigramExtractor [options] " +
                               "<OutputFile> " +
                               "<SignificanceTest> " +
                               "<InputFile> [<InputFile>...]\n" +
                               " significance test options: " +
                               SignificanceTest.values() + "\n" +
                               options.prettyPrint());
            return;
        }
       
        if (options.hasOption("verbose"))
            LoggerUtil.setLevel(Level.FINE);


        Properties props = System.getProperties();
        // Initialize the IteratorFactory to tokenize the documents according to
        // the specified configuration (e.g. filtering, compound words)
        if (options.hasOption("tokenFilter"))
            props.setProperty(IteratorFactory.TOKEN_FILTER_PROPERTY,
                              options.getStringOption("tokenFilter"));
        IteratorFactory.setProperties(props);
       
        try {
            BigramExtractor be = new BigramExtractor(1000000); // 1M
            String testStr = options.getPositionalArg(1).toUpperCase();
            SignificanceTest test = SignificanceTest.valueOf(testStr);
            PrintWriter output = new PrintWriter(options.getPositionalArg(0));
            int numArgs = options.numPositionalArgs();
            // Process each of the input files
            for (int i = 2; i < numArgs; ++i) {
                String inputFile = options.getPositionalArg(i);
                BufferedReader br = new BufferedReader(
                    new FileReader(inputFile));
           
                int lineNo = 0;
                for (String line = null; (line = br.readLine()) != null; ) {
                    be.process(line);
                    if (++lineNo % 10000 == 0)
                        LOGGER.fine(inputFile +
                                    ": processed document " + lineNo);
                }
                br.close();
            }
            // Write out the bigrams to file
            int minFreq = (options.hasOption("minFreq"))
                ? options.getIntOption("minFreq")
                : 0;
            be.printBigrams(output, test, minFreq);
        }
        catch (Exception e) {
            e.printStackTrace();
View Full Code Here


    /**
     * Runs the program
     */
    public static void main(String[] args) {
        ArgOptions options = new ArgOptions();
       
        options.addOption('h', "help", "Generates a help message and exits",
                          false, null, "Program Options");
        options.addOption('v', "verbose", "Enables verbose reporting",
                          false, null, "Program Options");


        options.addOption('C', "createFinder", "Creates a nearest " +
                          "neighbor finder from the provided .sspace file",
                          true, "FILE", "Program Options");
        options.addOption('L', "loadFinder", "Loads the finder from " +
                          "file", true, "FILE", "Program Options");
        options.addOption('S', "saveFinder", "Saves the loaded or created " +
                          "finder to file", true, "FILE", "Program Options");

        options.addOption('p', "principleVectors", "Specifies the number " +
                          "of principle vectors to create",
                          true, "INT", "Creation Options");

        options.parseOptions(args);

        if (options.hasOption("help") ||
                (!options.hasOption('C') && !options.hasOption('L'))) {
            usage(options);
            return;
        }
       
        if (options.hasOption("verbose"))
            LoggerUtil.setLevel(Level.FINE);

        if (options.hasOption('C') && options.hasOption('L')) {
            System.out.println("Cannot load and create a finder concurrently");
            System.exit(1);
        }
       
        NearestNeighborFinder nnf = null;
        if (options.hasOption('C')) {
            try {
                SemanticSpace sspace =
                    SemanticSpaceIO.load(options.getStringOption('C'));
                int numWords = sspace.getWords().size();
                // See how many principle vectors to create
                int numPrincipleVectors = -1;
                if  (options.hasOption('p')) {
                    numPrincipleVectors = options.getIntOption('p');
                    if (numPrincipleVectors > numWords) {
                        throw new IllegalArgumentException(
                            "Cannot have more principle vectors than " +
                            "word vectors: " + numPrincipleVectors);
                    }
                    else if (numPrincipleVectors < 1) {
                        throw new IllegalArgumentException(
                            "Must have at least one principle vector");
                    }

                }
                else {
                    numPrincipleVectors =
                        Math.min((int)(Math.ceil(Math.log(numWords))), 1000);
                    System.err.printf("Choosing a heuristically selected %d " +
                                      "principle vectors%n",
                                      numPrincipleVectors);
                }
                nnf = new PartitioningNearestNeighborFinder(
                    sspace, numPrincipleVectors);
            } catch (IOException ioe) {
                throw new IOError(ioe);
            }
        }
        else if (options.hasOption('L')) {
            nnf = SerializableUtil.<NearestNeighborFinder>load(
                new File(options.getStringOption('L')));
        }
        else {
            throw new IllegalArgumentException(
                "Must either create or load a NearestNeighborFinder");
        }

        if (options.hasOption('S')) {
            SerializableUtil.save(nnf, new File(options.getStringOption('S')));
        }

        int numWords = options.numPositionalArgs();
        for (int i = 0; i < numWords; ++i) {
            String term = options.getPositionalArg(i);
            long start = System.currentTimeMillis();           
            MultiMap<Double,String> m = nnf.getMostSimilar(term, 10);
            if (m == null) {
                System.out.println(term + " is not in the semantic " +
                                   "space; no neighbors found.");
View Full Code Here


public class SvdTool {

    public static void main(String[] args) throws Exception {
        ArgOptions options = new ArgOptions();
       
        options.addOption('h', "help", "Generates a help message and exits",
                          false, null, "Program Options");

        options.addOption('d', "dimensions", "Desired SVD Triples",
                          true, "INT", "Program Options");

        options.addOption('o', "fileRoot", "Root of files in which to store resulting U,S,V",
                          true, "DIR", "Program Options");
        options.addOption('r', "inputFormat", "Input matrix file format",
                          true, "STRING", "Program Options");
        options.addOption('w', "outputFormat", "Output matrix file format",
                          true, "STRING", "Program Options");

        options.parseOptions(args);

        if (options.numPositionalArgs() == 0 || options.hasOption("help")) {
            usage(options);
            return;
        }

        // Load and sanity check the options prior to computing the SVD
        int dimensions = options.getIntOption('d');
        String matrixFileName = options.getPositionalArg(0);
        File matrixFile = new File(matrixFileName);
        if (!matrixFile.exists())
            throw new IllegalArgumentException(
                "non-existent input matrix file: " + matrixFileName);

        String outputDirName = options.getStringOption('o');
        File outputDir = new File(outputDirName);
        if (!outputDir.exists() || !outputDir.isDirectory())
            throw new IllegalArgumentException(
                "invalid output directory: " + outputDirName);

        Format inputFormat = (options.hasOption('r'))
            ? getFormat(options.getStringOption('r'))
            : Format.SVDLIBC_SPARSE_TEXT;
        Format outputFormat = (options.hasOption('w'))
            ? getFormat(options.getStringOption('w'))
            : Format.SVDLIBC_DENSE_TEXT;

        MatrixFactorization factorizer = SVD.getFastestAvailableFactorization();
        factorizer.factorize(new MatrixFile(matrixFile, inputFormat), dimensions);
        File uFile = new File(outputDir, "U.mat");
View Full Code Here

              options.prettyPrint());
        System.exit(1);
    }

    public static void main(String[] args) throws Exception {
        ArgOptions options = new ArgOptions();
        options.addOption('w', "wordList",
                          "Specifies the wods that should be used in a " +
                          "pseudo word list",
                          true, "FILE", "Required (One of)");
        options.addOption('n', "numberOfPseudoWords",
                          "Specifies the desired number of pseudo words " +
                          "to create. If usePartsOfSpeech is set, this " +
                          "number of words per part of speech will be selected",
                          true, "INT", "Requred (One of)");
        options.addOption('P', "usePartsOfSpeech",
                          "If set, all terms are expected to have their part " +
                          "of speech as a suffix.  Terms should have the " +
                          "form lemma-POS",
                          false, null, "Optional");
        options.addOption('t', "typeOfPseudoWord",
                          "Specifies the specificity of the selected pseudo " +
                          "word confounders.  high will pick the word with " +
                          "the closest score.  med will select a score " +
                          "randomly from the 100 closest scoring words and " +
                          "low will select any confounder at random",
                          true, "high|med|low", "Required");
        options.parseOptions(args);

        if ((!options.hasOption('n') && !options.hasOption('w')) ||
            !options.hasOption('t') ||
            options.numPositionalArgs() != 2)
            usage(options);

        usePos = options.hasOption('P');

        List<Map<String, Double>> wordScores = loadScores(
                options.getPositionalArg(0));
        List<Set<String>> baseWords = (options.hasOption('w'))
            ? extractWordList(options.getStringOption('w'))
            : selectWord(wordScores, options.getIntOption('n'));

        String type = options.getStringOption('t');
        PrintWriter writer = new PrintWriter(options.getPositionalArg(1));
        for (int i = 0; i < baseWords.size(); ++i) {
            Map<String, Double> scores = wordScores.get(i);
            Set<String> keyWords = baseWords.get(i);

            Map<String, String> pseudoWordMap = null;
View Full Code Here

    }
    br.close();
  }

  public static ArgOptions setupOptions() {
    ArgOptions opts = new ArgOptions();
    opts.addOption('d', "docFiles", "location of directory containing only blog files",
                   true, "FILE[,FILE,...]", "Required");
    opts.addOption('w', "wordlist", "Word List for cleaning documents",
                   true, "STRING", "Required");
    opts.addOption('s', "beginTime", "Earliest timestamp for any document",
                   true, "INTEGER", "Optional");
    opts.addOption('e', "endTime", "Latest timestamp for any document",
                   true, "INTEGER", "Optional");
    opts.addOption('h', "threads", "number of threads", true, "INT");
    return opts;
  }
View Full Code Here

    return opts;
  }

  public static void main(String[] args)
      throws IOException, InterruptedException  {
    ArgOptions options = setupOptions();
    options.parseOptions(args);

    if (!options.hasOption("docFiles") ||
        !options.hasOption("wordlist") ||
        options.numPositionalArgs() != 1) {
      System.out.println("usage: java BlogPreProcessor [options] <out_file> \n" +
                         options.prettyPrint());
      System.exit(1);
    }
    // Load up the output file and the wordlist.
    File outFile = new File(options.getPositionalArg(0));
    File wordFile = new File(options.getStringOption("wordlist"));

    // Create the cleaner.
    long startTime = (options.hasOption("beginTime")) ?
      options.getLongOption("beginTime") : 0;
    long endTime = (options.hasOption("endTime")) ?
      options.getLongOption("endTime") : Long.MAX_VALUE;

    final BlogPreProcessor blogCleaner =
      new BlogPreProcessor(wordFile, outFile, startTime, endTime);
    String[] fileNames = options.getStringOption("docFiles").split(",");

  // Load the program-specific options next.
  int numThreads = Runtime.getRuntime().availableProcessors();
  if (options.hasOption("threads"))
      numThreads = options.getIntOption("threads");
   
    Collection<File> blogFiles = new ArrayDeque<File>() ;
    for (String fileName : fileNames) {
      blogFiles.add(new File(fileName));
    }
View Full Code Here

        Logger.getLogger(SimilarityListGenerator.class.getName());

    private final ArgOptions argOptions;
   
    public SimilarityListGenerator() {
        argOptions = new ArgOptions();
        addOptions();
    }
View Full Code Here

TOP

Related Classes of edu.ucla.sspace.common.ArgOptions

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.