Package edu.ucla.sspace.tools

Source Code of edu.ucla.sspace.tools.IterativeBigramExtractor

/*
* Copyright 2012 David Jurgens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.ucla.sspace.tools;

import edu.ucla.sspace.common.ArgOptions;

import edu.ucla.sspace.common.statistics.*;

import edu.ucla.sspace.clustering.Assignment;

import edu.ucla.sspace.graph.Graph;
import edu.ucla.sspace.graph.Graphs;
import edu.ucla.sspace.graph.LinkClustering;
import edu.ucla.sspace.graph.WeightedEdge;
import edu.ucla.sspace.graph.WeightedGraph;
import edu.ucla.sspace.graph.SparseWeightedGraph;
import edu.ucla.sspace.graph.SimpleWeightedEdge;

import edu.ucla.sspace.text.BufferedFileListDocumentIterator;
import edu.ucla.sspace.text.IteratorFactory;
import edu.ucla.sspace.text.StringUtils;
import edu.ucla.sspace.text.TermAssociationFinder;
import edu.ucla.sspace.text.WordIterator;
import edu.ucla.sspace.text.Document;
import edu.ucla.sspace.text.FileListDocumentIterator;
import edu.ucla.sspace.text.IteratorFactory;
import edu.ucla.sspace.text.OneLinePerDocumentIterator;

import edu.ucla.sspace.util.Counter;
import edu.ucla.sspace.util.CombinedIterator;
import edu.ucla.sspace.util.HashIndexer;
import edu.ucla.sspace.util.HashMultiMap;
import edu.ucla.sspace.util.Indexer;
import edu.ucla.sspace.util.LineReader;
import edu.ucla.sspace.util.LoggerUtil;
import edu.ucla.sspace.util.MultiMap;
import edu.ucla.sspace.util.ObjectCounter;
import edu.ucla.sspace.util.ObjectIndexer;
import edu.ucla.sspace.util.Pair;
import edu.ucla.sspace.util.SortedMultiMap;
import edu.ucla.sspace.util.TreeMultiMap;
import edu.ucla.sspace.util.WorkQueue;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;

import java.util.Arrays;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Queue;
import java.util.Set;

import java.util.logging.Logger;
import java.util.logging.Level;

import static edu.ucla.sspace.util.LoggerUtil.info;
import static edu.ucla.sspace.util.LoggerUtil.verbose;
import static edu.ucla.sspace.util.LoggerUtil.veryVerbose;

public class IterativeBigramExtractor {

    private static final Logger LOGGER =
        Logger.getLogger(IterativeBigramExtractor.class.getName());


    public static void main(String[] args) throws Exception {
        ArgOptions options = new ArgOptions();

        options.addOption('f', "fileList", "a list of document files",
                          true, "FILE[,FILE...]", "Required (at least one of)");
        options.addOption('d', "docFile",
                          "a file where each line is a document", true,
                          "FILE[,FILE...]", "Required (at least one of)");

        options.addOption('s', "stopWords", "A file containing a list of stop "+
                          "words that should be encluded from bigrams",
                          true, "FILE", "Program Options");


        options.addOption('h', "help", "Generates a help message and exits",
                          false, null, "Program Options");
        options.addOption('v', "verbose", "Turns on verbose output",
                          false, null, "Program Options");
        options.addOption('V', "veryVerbose", "Turns on *very* verbose output",
                          false, null, "Program Options");

        options.addOption('n', "numberOfTermsPerIteration", "Specifies the " +
                          "number of terms to compute the association between "+
                          "per iteration (default: all)",
                          true, "INT", "Runtime Options");       
        options.addOption('F', "filterAssociationBelow", "Specifies the " +
                          "an association score below which the pair will " +
                          "not be reported",
                          true, "DOUBLE", "Runtime Options");       

        options.parseOptions(args);
       
        // Set the verbosity
        if (options.hasOption('v'))
            LoggerUtil.setLevel(Level.FINE);
        if (options.hasOption('V'))
            LoggerUtil.setLevel(Level.FINER);
        if (options.numPositionalArgs() < 3 || options.hasOption("help")) {
            usage(options);
            return;
        }

        File termsFile = new File(options.getPositionalArg(0));
        String outputPrefix = options.getPositionalArg(1);

        Set<String> terms = StringUtils.loadFileAsSet(termsFile);

        Set<String> stopWords = null;
        if (options.hasOption('s')) {
            stopWords = StringUtils.loadFileAsSet(
                new File(options.getStringOption('s')));
        }

        // A mapping to the minimum weight for a test, or null if all the test's
        // scores should be reported
        Map<SignificanceTest,Double> tests =
            new HashMap<SignificanceTest,Double>();
        Map<SignificanceTest,PrintWriter> testWriters =
            new HashMap<SignificanceTest,PrintWriter>();

        int numArgs = options.numPositionalArgs();
        for (int i = 2; i < numArgs; ++i) {
            String testName = options.getPositionalArg(i);
            SignificanceTest test = getTest(testName);
            Double minWeight = null;
            if (i+1 < numArgs) {
                // This might be a test name
                String weightStr = options.getPositionalArg(i+1);

                try {
                    minWeight = Double.parseDouble(weightStr);
                } catch (NumberFormatException nfe) { }
                i++;
            }
            tests.put(test, minWeight);
            PrintWriter pw = new PrintWriter(outputPrefix + testName + ".txt");
            testWriters.put(test, pw);
        }

        int termsToUsePerIteration = (options.hasOption('n'))
            ? options.getIntOption('n')
            : terms.size();              

        Queue<String> termsToAssociate = new ArrayDeque<String>(terms);
        int round = 0;
        while (termsToAssociate.size() > 0) {
            round++;

            Counter<String> termCounts = new ObjectCounter<String>();
            Counter<Pair<String>> bigramCounts =
                new ObjectCounter<Pair<String>>();
            int allBigramCounts = 0;

            // Load the next set of terms to test for being bigrams
            Set<String> curTerms = new HashSet<String>();
            while (curTerms.size() < termsToUsePerIteration
                   && !termsToAssociate.isEmpty()) {
                curTerms.add(termsToAssociate.poll());
            }

            info(LOGGER, "Finding associations between all %d terms and a %d " +
                 "term subset (%d remain)", terms.size(), curTerms.size(),
                 termsToAssociate.size());
      
            int docNum = 0;
            long startTime = System.currentTimeMillis();
            Iterator<Document> docs = getDocuments(options);
            while (docs.hasNext()) {
                Document doc = docs.next();
                Iterator<String> tokens = new WordIterator(doc.reader());
                String t1 = null;
                while (tokens.hasNext()) {
                    String t2 = tokens.next();
                    // Count the occurrence of this token if we're supposed to
                    // record it
                    if (terms.contains(t2)
                            && (stopWords == null || !stopWords.contains(t2))) {

                        termCounts.count(t2);
                        if (t1 != null) {
                            allBigramCounts++;
                            // See if we are supposed to record this bigram
                            if (curTerms.contains(t1))
                                bigramCounts.count(new Pair<String>(t1, t2));
                        }
                    }
                    t1 = t2;
                }

                // Just some reporting
                if (++docNum % 1000 == 0) {
                    double now = System.currentTimeMillis();
                    double docsSec = docNum / ((now - startTime) / 1000);
                    verbose(LOGGER, "Processed document %d in round %d, " +
                            "docs/sec: %f", docNum, round, docsSec);
                }
            }

            for (Map.Entry<SignificanceTest,Double> e : tests.entrySet()) {
                SignificanceTest test = e.getKey();
                double minWeight = (e.getValue() == null) ? 0 : e.getValue();
                PrintWriter pw = testWriters.get(test);
               
                for (Map.Entry<Pair<String>,Integer> e2 : bigramCounts) {
                    Pair<String> bigram = e2.getKey();
                    int bigramCount = e2.getValue();
                    String t1 = bigram.x;
                    String t2 = bigram.y;
                    int t1Count = termCounts.getCount(t1);
                    int t2Count = termCounts.getCount(t2);
                    int t1butNotT2 = t1Count - bigramCount;
                    int t2butNotT1 = t2Count - bigramCount;
                    int neitherAppeared =
                        allBigramCounts - ((t1Count + t2Count) - bigramCount);

                    double score = test.score(bigramCount, t1butNotT2,
                                              t2butNotT1, neitherAppeared);
                    if (score > minWeight && !Double.isNaN(score)
                            && !Double.isInfinite(score))
                        pw.println(t1 + "\t" + t2 + "\t" + score);
                }
            }

            for (PrintWriter pw : testWriters.values())
                pw.flush();
        }           

        for (PrintWriter pw : testWriters.values())
            pw.close();
    }


    private static Iterator<Document> getDocuments(ArgOptions argOptions)
            throws IOException {
        Collection<Iterator<Document>> docIters =
            new LinkedList<Iterator<Document>>();

        if (argOptions.hasOption('f')) {
            for (String s : argOptions.getStringOption('f').split(","))
                docIters.add(new BufferedFileListDocumentIterator(s));
        }
        if (argOptions.hasOption('d')) {
            for (String s : argOptions.getStringOption('d').split(","))
                docIters.add(new OneLinePerDocumentIterator(s));
        }

         if (docIters.size() == 0)
             throw new IllegalStateException(
                 "Must specify at least one document source");

        // combine all of the document iterators into one iterator.
        Iterator<Document> docIter = new CombinedIterator<Document>(docIters);
       
        return docIter;
    }

    /**
     * Removes all tokens in the document that are not member of either set,
     * returning the cleaned version, or if the sets contain no elements
     * (indicating no filtering is to be done), the original string is returned.
     */
    private static Set<String> clean(String document,
                                     Set<String> validContext) {
        Set<String> tokens = new HashSet<String>();
        String[] arr = document.split("\\s+");
        // If either set does not include tokens, this indicates that the
        // context should include all the possible tokens.  Therefore, just
        // return the string in its original form.
        if (validContext.isEmpty()) {
            tokens.addAll(Arrays.asList(arr));
            return tokens;
        }

        for (String token : arr) {
            if (validContext.contains(token))
                tokens.add(token);
        }
        return tokens;
    }

    private static SignificanceTest getTest(String testName) {
        if (testName.equals("g-test"))
            return new GTest();
        else if (testName.equals("chi-squared"))
            return new ChiSquaredTest();
        else if (testName.equals("pmi"))
            return new PointwiseMutualInformationTest();
        else
            throw new IllegalArgumentException(
                "No such significance test: " + testName);
    }

    /**
     * Prints the options and supported commands used by this program.
     *
     * @param options the options supported by the system
     */
    private static void usage(ArgOptions options) {
        System.out.println(
            "IterativeBigramExtractor version 1.0\n" +
            "usage: java IterativeBigramExtractor [options] " +
            "terms.txt output-prefix test-name [min-weight] " +
            "[test2-name [min-weight]]\n\n"
            + options.prettyPrint());
    }
}
TOP

Related Classes of edu.ucla.sspace.tools.IterativeBigramExtractor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.