Package edu.ucla.sspace.wordsi

Source Code of edu.ucla.sspace.wordsi.WaitingWordsi

/*
* Copyright 2010 Keith Stevens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.ucla.sspace.wordsi;

import edu.ucla.sspace.clustering.Assignment;
import edu.ucla.sspace.clustering.Assignments;
import edu.ucla.sspace.clustering.Clustering;

import edu.ucla.sspace.matrix.SparseMatrix;
import edu.ucla.sspace.matrix.Matrices;

import edu.ucla.sspace.util.WorkQueue;

import edu.ucla.sspace.vector.CompactSparseVector;
import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.SparseDoubleVector;
import edu.ucla.sspace.vector.Vectors;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import java.util.concurrent.ConcurrentHashMap;

import java.util.logging.Logger;



/**
* A {@link Wordsi} implementation that performs batch clustering.  Each context
* vector is stored and later clustered using a {@link Clustering} algorithm.
*
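 * <p>A minimal usage sketch is given below.  The {@link ContextExtractor} and
 * {@link Clustering} instances are assumed to be constructed elsewhere in the
 * pipeline, and the context vector passed to {@code handleContextVector} is
 * normally produced by the extractor rather than built by hand:
 *
 * <pre>{@code
 * ContextExtractor extractor = ...;  // produces context vectors from text
 * Clustering clustering = ...;       // any Clustering implementation
 * WaitingWordsi wordsi =
 *     new WaitingWordsi(null, extractor, clustering, null, 15);
 *
 * // Contexts are gathered as documents are processed.
 * wordsi.handleContextVector("bank", "bank.n", contextVector);
 *
 * // Once every context has been stored, cluster them into senses.
 * wordsi.processSpace(System.getProperties());
 * for (String sense : wordsi.getWords())
 *     System.out.println(sense + " -> " + wordsi.getVector(sense));
 * }</pre>
 *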
* @author Keith Stevens
*/
public class WaitingWordsi extends BaseWordsi {

    /**
     * A logger for recording the progress of {@link Wordsi} processing.
     */
    private static final Logger LOG = Logger.getLogger(
            WaitingWordsi.class.getName());

    /**
     * The {@link Clustering} implementation to use when all data points have
     * been observed.
     */
    private final Clustering clustering;

    /**
     * A mapping from each token to the list of context vectors observed for
     * that token.
     */
    private final Map<String, List<SparseDoubleVector>> dataVectors;

    /**
     * The final word space, which maps from strings to the semantic
     * representation.
     */
    private final Map<String, SparseDoubleVector> wordSpace;

    /**
     * The number of clusters.  This may be used as a theoretical upper bound as
     * opposed to a strict number of clusters.
     */
    private final int numClusters;

    /**
     * The {@link AssignmentReporter} to use for reporting clustering
     * assignments.
     */
    private final AssignmentReporter reporter;

    /**
     * Creates a new {@link WaitingWordsi}.  The number of clusters is left
     * unset, which requires that the {@link Clustering} algorithm be able to
     * decide on an appropriate number of clusters.
     *
     * @param acceptedWords The set of words that {@link Wordsi} should
     *        represent.  This may be {@code null} or empty.
     * @param extractor The {@link ContextExtractor} used to parse documents.
     * @param clustering The {@link Clustering} algorithm to use on each data
     *        set.
     * @param reporter The {@link AssignmentReporter} responsible for
     *        generating a report that details the cluster assignments.  This
     *        may be {@code null}, in which case no report is generated.
     */
    public WaitingWordsi(Set<String> acceptedWords,
                         ContextExtractor extractor,
                         Clustering clustering,
                         AssignmentReporter reporter) {
        this(acceptedWords, extractor, clustering, reporter, 0);
    }

    /**
     * Creates a new {@link WaitingWordsi}.   
     *
     * @param acceptedWords The set of words that {@link Wordsi} should
     *        represent.  This may be {@code null} or empty}.
     * @param extractor The {@link ContextExtractor} used to parse documents.
     * @param clustering The {@link Clustering} algorithm to use on each data
     *        set.
     * @param reporter The {@link AssignmentReporter} responsible for generating
     *        a report that details the cluster assignments.  This may be {@link
     *        null}.  If {@code trackSecondaryKeys} is false, this is not used.
     * @param numClusters Specifies the number of clusters to generate for each
     *        term.
     */
    public WaitingWordsi(Set<String> acceptedWords,
                         ContextExtractor extractor,
                         Clustering clustering,
                         AssignmentReporter reporter,
                         int numClusters) {
        super(acceptedWords, extractor);

        this.clustering = clustering;
        this.reporter = reporter;
        this.numClusters = numClusters;

        // A concurrent map is used because contexts may be added from
        // multiple threads while documents are processed.
        dataVectors = new ConcurrentHashMap<String, List<SparseDoubleVector>>();
        wordSpace = new ConcurrentHashMap<String, SparseDoubleVector>();
    }

    /**
     * {@inheritDoc}
     */
    public Set<String> getWords() {
        return wordSpace.keySet();
    }

    /**
     * {@inheritDoc}
     */
    public SparseDoubleVector getVector(String term) {
        return wordSpace.get(term);
    }

    /**
     * Adds the context vector to the end of the list of context vectors
     * associated with {@code focusKey}.  If an {@link AssignmentReporter} was
     * provided, the association between {@code focusKey}, {@code secondaryKey},
     * and the context's index is also recorded.
     */
    public void handleContextVector(String focusKey,
                                    String secondaryKey,
                                    SparseDoubleVector context) {
        // Get the list of context vectors for the focus key, lazily creating
        // it with double-checked locking so that concurrent calls share a
        // single list per term.
        List<SparseDoubleVector> termContexts = dataVectors.get(focusKey);
        if (termContexts == null) {
            synchronized (this) {
                termContexts = dataVectors.get(focusKey);
                if (termContexts == null) {
                    termContexts = new ArrayList<SparseDoubleVector>();
                    dataVectors.put(focusKey, termContexts);
                }
            }
        }

        // Add the new context vector.
        int contextId = 0;
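        // Synchronize on the per-term list so that reading its size and adding
        // the new context happen atomically, giving each context a unique id.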
        synchronized (termContexts) {
            contextId = termContexts.size();
            termContexts.add(context);
        }

        // Record the association.
        if (reporter != null)
            reporter.assignContextToKey(focusKey, secondaryKey, contextId);
    }

    /**
     * {@inheritDoc}
     */
    public void processSpace(final Properties props) {
        WorkQueue workQueue = WorkQueue.getWorkQueue();

        Object key = workQueue.registerTaskGroup(dataVectors.size());
        // Process each word's context set in a worker thread.
        for (Map.Entry<String, List<SparseDoubleVector>> entry :
                dataVectors.entrySet()) {
            // Get the root word being discriminated and list of observed
            // contexts.
            final String senseName = entry.getKey();

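            // Wrap each stored context in a fixed-length subview so that all
            // vectors report the word space's vector length before being
            // turned into a matrix.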
            List<SparseDoubleVector> contextsWithNoLength = entry.getValue();
            final List<SparseDoubleVector> contextSet =
                new ArrayList<SparseDoubleVector>(contextsWithNoLength.size());
            for (SparseDoubleVector v : contextsWithNoLength)
                contextSet.add(Vectors.subview(v, 0, getVectorLength()));
           
            workQueue.add(key, new Runnable() {
                public void run() {
                    clusterTerm(senseName, contextSet, props);
                }
            });
        }
        workQueue.await(key);
        LOG.info("Finished processing all terms");
    }

    /**
     * Clusters the context vectors associated with {@code senseName}.
     */
    private void clusterTerm(String senseName,
                             List<SparseDoubleVector> contextSet,
                             Properties props) {
        // Convert the data points to a sparse matrix.
        SparseMatrix contexts = Matrices.asSparseMatrix(contextSet);

        // Cluster the context set.
        LOG.info("Clustering term: " + senseName);
        Assignments assignments = (numClusters > 0)
            ? clustering.cluster(contexts, numClusters, props)
            : clustering.cluster(contexts, props);
        LOG.info("Finished clustering term: " + senseName);

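        // Each centroid serves as the prototype vector for one induced sense
        // of the term.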
        SparseDoubleVector[] centroids = assignments.getSparseCentroids();

        // Add the centroids to the word space.  The first centroid keeps the
        // raw term as its key; each later centroid is keyed as term-index.
        for (int index = 0; index < centroids.length; ++index) {
            String sense = (index > 0)
                    ? senseName + "-" + index
                    : senseName;
            wordSpace.put(sense, centroids[index]);
        }

        LOG.info("Finished creating centroids for term: " + senseName);

        // Empty out the stored contexts to free up memory for later processes.
        contextSet.clear();

        // If the reporter is null, avoid making any report.
        if (reporter == null)
            return;

        // Generate the secondary context labels for each data point.
        String[] contextLabels = reporter.contextLabels(senseName);
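        // If the reporter recorded no contexts for this term, there is nothing
        // to report.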
        if (contextLabels.length == 0)
            return;

        LOG.info("Making assignment report: " + senseName);
        // Report the assignments for each clustered data point.  Note that some
        // data points might not have been clustered (Cluto based clustering
        // does this on occasion) so we must check for the number of assignments
        // first.
        for (int i = 0; i < assignments.size(); ++i)
            if (assignments.get(i).assignments().length > 0)
                reporter.updateAssignment(senseName, contextLabels[i],
                                          assignments.get(i).assignments()[0]);
        LOG.info("Finished making assignment report: " + senseName);
    }
}