Package edu.ucla.sspace.wordsi.psd

Source Code of edu.ucla.sspace.wordsi.psd.PseudoWordReporter

/*
* Copyright 2010 Keith Stevens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.ucla.sspace.wordsi.psd;

import edu.ucla.sspace.wordsi.AssignmentReporter;

import java.io.OutputStream;
import java.io.PrintStream;

import java.util.ArrayList;
import java.util.BitSet;
import java.util.HashMap;
import java.util.List;
import java.util.Map;


/**
* A {@link AssignmentReporter} that creates a PseudoWord answer key .
* This should be used in conjunction with a {@link PseudoWordContextExtractor}.
* When reporting, primary keys are expected to be the pseudo word while
* secondary keys are expected to be the actual word used in a given instance.
* The reporter will record the number of times each secondary key was assigned
* to a particular cluster for each primary key.  When the report is finalized,
* it will generate a lines of the form:
* </li> primaryKey secondaryKey clusterNumber assignmentCount
* This can later be used to determine what words best describe each cluster.
*
* @author Keith Stevens
*/
public class PseudoWordReporter implements AssignmentReporter {

    /**
     * The writer used to output the PseudoWord answer key.
     */
    private PrintStream writer;

    /**
     * A mapping from primary keys to secondary keys to the set of data point
     * ids that were assigned to each secondary key.  We use a {@link BitSet}
     * here since there should only be a few secondary keys and each secondary
     * key may have many context id's associated with it.
     */
    private final Map<String, Map<String, BitSet>> contextAssignments;
    /**
     * A mapping from primary keys to secondary keys to the number of times that
     * each secondary key was assigned to a particular cluster.
     */
    private Map<String, Map<String, List<Integer>>> clusterCounts;

    /**
     * Creates a new {@link PseudoWordReporter}.
     *
     * @param stream The stream to which the answer key should be written.
     */
    public PseudoWordReporter(OutputStream stream) {
        writer = new PrintStream(stream);
        clusterCounts = new HashMap<String, Map<String, List<Integer>>>();
        contextAssignments = new HashMap<String, Map<String, BitSet>>();
    }

    /**
     * {@inheritDoc}
     */
    public synchronized void updateAssignment(String primaryKey,
                                              String secondaryKey,
                                              int clusterId) {
        if (primaryKey.equals(secondaryKey))
            return;

        // Get the mapping from seconday keys to cluster assignment counts.
        Map<String, List<Integer>> secondaryCounts = clusterCounts.get(
                primaryKey);
        if (secondaryCounts == null) {
            secondaryCounts = new HashMap<String, List<Integer>>();
            clusterCounts.put(primaryKey, secondaryCounts);
        }

        // Get the list of cluster assignment counts.
        List<Integer> counts = secondaryCounts.get(secondaryKey);
        if (counts == null) {
            counts = new ArrayList<Integer>(clusterId);
            secondaryCounts.put(secondaryKey, counts);
        }

        // Add zero values for any clusters that are not currently recorded.
        while (clusterId >= counts.size())
            counts.add(0);

        // Make the assignment for the given cluster id.
        counts.set(clusterId, counts.get(clusterId) + 1);
    }

    /**
     * {@inheritDoc}
     */
    public void finalizeReport() {
        // For each sense labeling made, print out a triplet consisting of the
        // primary key, secondary key, the cluster id, and the number of times
        // that the secondary key was assigned to the cluster.
        for (Map.Entry<String, Map<String, List<Integer>>> e :
                 clusterCounts.entrySet()) {
            String firstKey = e.getKey();
            for (Map.Entry<String, List<Integer>> f : e.getValue().entrySet()) {
                String secondKey = f.getKey();
                List<Integer> counts = f.getValue();
                for (int i = 0; i < counts.size(); ++i)
                    if (counts.get(i) > 0)
                        writer.printf("%s %s %d %d\n",
                                      firstKey, secondKey, i, counts.get(i));
            }
        }
        writer.close();
    }

    /**
     * Records an assignment of {@code contextId} to {@code secondaryKey} and
     * {@code primaryKey}.
     */
    public void assignContextToKey(String primaryKey,
                                   String secondaryKey,
                                   int contextId) {
        // Get the mapping from secondary keys to context ids.
        Map<String, BitSet> termContexts = contextAssignments.get(primaryKey);
        if (termContexts == null) {
            synchronized (this) {
                termContexts = contextAssignments.get(primaryKey);
                if (termContexts == null) {
                    termContexts = new HashMap<String, BitSet>();
                    contextAssignments.put(primaryKey, termContexts);
                }
            }
        }

        // Get the set of context id's made to the secondary key.
        BitSet contextIds = termContexts.get(secondaryKey);
        if (contextIds == null) {
            synchronized (this) {
                contextIds = termContexts.get(secondaryKey);
                if (contextIds == null) {
                    contextIds = new BitSet();
                    termContexts.put(secondaryKey, contextIds);
                }
            }
        }

        // Update the set of context ids assigned to the secondary key.
        synchronized (contextIds) {
            contextIds.set(contextId);
        }
    }

    /**
     * Return an array mapping context ids to secondary keys. Returns an empty
     * array if there was no tracking done.
     */
    public String[] contextLabels(String primaryKey) {
        Map<String, BitSet> termContexts = contextAssignments.get(primaryKey);
        if (termContexts == null)
            return new String[0];

        // Compute the total number of assignments made.
        int totalAssignments = 0;
        for (Map.Entry<String, BitSet> entry : termContexts.entrySet())
            totalAssignments = Math.max(
                    totalAssignments, entry.getValue().length());
       
        // Fill in each assignment with the secondary key attached to each
        // context id.
        String[] contextLabels = new String[totalAssignments];
        for (Map.Entry<String, BitSet> entry : termContexts.entrySet()) {
            BitSet contextIds = entry.getValue();
            for (int contextId = contextIds.nextSetBit(0); contextId >= 0;
                     contextId = contextIds.nextSetBit(contextId+1))
                contextLabels[contextId] = entry.getKey();
        }
        return contextLabels;
    }
}
TOP

Related Classes of edu.ucla.sspace.wordsi.psd.PseudoWordReporter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.