Package edu.ucla.sspace.tools

Source Code of edu.ucla.sspace.tools.DependencyBasisMaker$OccurrenceCounter

/*
* Copyright 2010 Keith Stevens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/


package edu.ucla.sspace.tools;

import edu.ucla.sspace.basis.BasisMapping;
import edu.ucla.sspace.basis.StringBasisMapping;

import edu.ucla.sspace.common.ArgOptions;
import edu.ucla.sspace.common.SemanticSpace;

import edu.ucla.sspace.dependency.DependencyExtractor;
import edu.ucla.sspace.dependency.DependencyExtractorManager;
import edu.ucla.sspace.dependency.DependencyIterator;
import edu.ucla.sspace.dependency.DependencyPath;
import edu.ucla.sspace.dependency.DependencyPathAcceptor;
import edu.ucla.sspace.dependency.DependencyPathWeight;
import edu.ucla.sspace.dependency.DependencyTreeNode;
import edu.ucla.sspace.dependency.FilteredDependencyIterator;
import edu.ucla.sspace.dependency.FlatPathWeight;
import edu.ucla.sspace.dependency.UniversalPathAcceptor ;

import edu.ucla.sspace.mains.DependencyGenericMain;

import edu.ucla.sspace.matrix.AtomicGrowingSparseHashMatrix;
import edu.ucla.sspace.matrix.NoTransform;
import edu.ucla.sspace.matrix.SparseMatrix;
import edu.ucla.sspace.matrix.Transform;

import edu.ucla.sspace.text.IteratorFactory;

import edu.ucla.sspace.util.BoundedSortedMap;
import edu.ucla.sspace.util.Pair;

import edu.ucla.sspace.vector.DenseVector;
import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.SparseDoubleVector;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOError;
import java.io.IOException;
import java.io.ObjectOutputStream;

import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Set;


/**
* This main creates a {@link BasisMapping} based on the unique terms found in a
* document set and serializes it to disk.
*
* @author Keith Stevens
*/
public class DependencyBasisMaker extends DependencyGenericMain {

    /**
     * {@inheritDoc}
     */
    public void addExtraOptions(ArgOptions options) {
        options.addOption('b', "basisSize",
                          "Specifies the total desired size of the basis " +
                          "(Default: 10000)",
                          true, "INT", "Optional");
        options.addOption('a', "pathAcceptor",
                          "Specifies the dependency path acceptor to use. " +
                          "(Default:    UnivseralPathAcceptor)",
                          true, "CLASSNAME", "Optional");
        options.addOption('w', "pathWeighter",
                          "Specifies the dependency path weighter to use. " +
                          "(Default:    FlatPathWeight)",
                          true, "CLASSNAME", "Optional");
        options.addOption('l', "pathLength",
                          "Specifies the maximum dependency path length. " +
                          "(Default:    5)",
                          true, "INT", "Optional");
    }

    /**
     * {@inheritDoc}
     */
    protected SemanticSpace getSpace() {
        setupDependencyExtractor();

        int bound = argOptions.getIntOption('b', 10000);
        Transform transform = argOptions.getObjectOption(
                'T', new NoTransform());
        DependencyPathAcceptor acceptor = argOptions.getObjectOption(
                'a', new UniversalPathAcceptor());
        DependencyPathWeight weighter = argOptions.getObjectOption(
                'w', new FlatPathWeight());
        int pathLength = argOptions.getIntOption('l', 5);
        return new OccurrenceCounter(
                transform, bound, acceptor, weighter, pathLength);
    }

    /**
     * Saves the {@link BasisMapping} created from the {@link
     * OccurrenceCounter}.
     */
    protected void saveSSpace(SemanticSpace sspace, File outputFile)
            throws IOException{
        BasisMapping<String, String> savedTerms = new StringBasisMapping();
        for (String term : sspace.getWords())
            savedTerms.getDimension(term);

        ObjectOutputStream ouStream = new ObjectOutputStream(
                new FileOutputStream(outputFile));
        ouStream.writeObject(savedTerms);
        ouStream.close();
    }

    /**
     * A simple term {@link SemanticSpace} implementation that counts word
     * co-occurrences, performs a transform, and then scores each recorded basis
     * dimension based on the row summed scores for each word.
     */
    public class OccurrenceCounter implements SemanticSpace {

        /**
         * The matrix used for storing weight co-occurrence statistics of those
         * words that occur both before and after.
         */
        private final AtomicGrowingSparseHashMatrix cooccurrenceMatrix;

        /**
         * The {@link BasisMapping} used to record dimensions.
         */
        private final BasisMapping<String, String> basis;

        /**
         * The final scores for each word in the {@code basis}.
         */
        private final Map<String, Double> wordScores;

        /**
         * The {@link Transform} class used to rescore each word.
         */
        private final Transform transform;

        /**
         * The {@link DependencyPathAcceptor} used to accept or reject
         * dependency paths.
         */
        private final DependencyPathAcceptor acceptor;

        /**
         * The {@link DependencyPathWeight} used to score dependency paths.
         */
        private final DependencyPathWeight weighter;

        /**
         * The maximum path length that is acceptable.
         */
        private final int pathLength;

        /**
         * The {@link DependencyExtractor} used to extract parse trees from each
         * document.
         */
        private final DependencyExtractor extractor;

        /**
         * Creates a new {@link OccurrenceCounter}.
         */
        public OccurrenceCounter(Transform transform,
                                 int bound,
                                 DependencyPathAcceptor acceptor,
                                 DependencyPathWeight weighter,
                                 int pathLength) {
            cooccurrenceMatrix = new AtomicGrowingSparseHashMatrix();
            basis = new StringBasisMapping();
            wordScores = new BoundedSortedMap<String, Double>(bound);
            extractor = DependencyExtractorManager.getDefaultExtractor();

            this.transform = transform;
            this.acceptor = acceptor;
            this.weighter = weighter;
            this.pathLength = pathLength;
        }

        /**
         * {@inheritDoc}
         */
        public void processDocument(BufferedReader document)
                throws IOException {
            // Rather than updating the matrix every time an occurrence is
            // seen, keep a thread-local count of what needs to be modified
            // in the matrix and update after the document has been
            // processed.  This saves potential contention from concurrent
            // writes.
            Map<Pair<Integer>,Double> matrixEntryToCount =
                    new HashMap<Pair<Integer>,Double>();

            // Iterate over all of the parseable dependency parsed sentences in
            // the document.
            for (DependencyTreeNode[] nodes = null;
                    (nodes = extractor.readNextTree(document)) != null; ) {

                // Skip empty documents.
                if (nodes.length == 0)
                    continue;                       

                // Examine the paths for each word in the sentence.
                for (int wordIndex = 0; wordIndex < nodes.length; ++wordIndex) {
                    String focusWord = nodes[wordIndex].word();                           
                    int focusIndex = basis.getDimension(focusWord);

                    // Get all the valid paths starting from this word.    The
                    // acceptor will filter out any paths that don't contain the
                    // semantic connections we're looking for.
                    Iterator<DependencyPath> paths =
                        new FilteredDependencyIterator(
                                nodes[wordIndex], acceptor, pathLength);
                           
                    // For each of the paths rooted at the focus word, update
                    // the co-occurrences of the focus word in the dimension
                    // that the BasisFunction states.
                    while (paths.hasNext()) {
                        DependencyPath path = paths.next();

                        String occurrence = path.last().word();
                        int featureIndex = basis.getDimension(occurrence);

                        double score = weighter.scorePath(path);
                        matrixEntryToCount.put(new Pair<Integer>(
                                    focusIndex, featureIndex), score);
                    }
                }
            }

            // Once the document has been processed, update the co-occurrence
            // matrix accordingly.
            for (Map.Entry<Pair<Integer>,Double> e :
                    matrixEntryToCount.entrySet()){
                    Pair<Integer> p = e.getKey();
                    cooccurrenceMatrix.addAndGet(p.x, p.y, e.getValue());
            }
        }

        /**
         * {@inheritDoc}
         */
        public Set<String> getWords() {
            return Collections.unmodifiableSet(wordScores.keySet());
        }

        /**
         * {@inheritDoc}
         */
        public DoubleVector getVector(String word) {
            Double score = wordScores.get(word);
            return (score == null)
                ? new DenseVector(new double[] {0})
                : new DenseVector(new double[] {score});
        }

        /**
         * {@inheritDoc}
         */
        public int getVectorLength() {
            return 1;
        }

        /**
         * {@inheritDoc}
         */
        public void processSpace(Properties properties) {
            SparseMatrix cleanedMatrix = (SparseMatrix) transform.transform(
                    cooccurrenceMatrix);
            for (String term : basis.keySet()) {
                int index = basis.getDimension(term);
                SparseDoubleVector sdv = cleanedMatrix.getRowVector(index);

                double score = 0;
                for (int i : sdv.getNonZeroIndices())
                    score += sdv.get(i);

                wordScores.put(term, score);
            }
        }

        /**
         * {@inheritDoc}
         */
        public String getSpaceName() {
            return "BasisMaker";
        }
    }
}
TOP

Related Classes of edu.ucla.sspace.tools.DependencyBasisMaker$OccurrenceCounter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.