// Source Code of edu.ucla.sspace.rri.ReflectiveRandomIndexing

/*
 * Copyright 2009 David Jurgens
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */


package edu.ucla.sspace.rri;


import edu.ucla.sspace.common.Filterable;
import edu.ucla.sspace.common.SemanticSpace;


import edu.ucla.sspace.index.PermutationFunction;
import edu.ucla.sspace.index.RandomIndexVectorGenerator;
import edu.ucla.sspace.index.TernaryPermutationFunction;


import edu.ucla.sspace.text.IteratorFactory;


import edu.ucla.sspace.util.WorkerThread;


import edu.ucla.sspace.vector.CompactSparseIntegerVector;
import edu.ucla.sspace.vector.DenseIntVector;
import edu.ucla.sspace.vector.IntegerVector;
import edu.ucla.sspace.vector.TernaryVector;
import edu.ucla.sspace.vector.Vector;
import edu.ucla.sspace.vector.Vectors;
import edu.ucla.sspace.vector.VectorMath;


import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOError;
import java.io.IOException;


import java.lang.reflect.Constructor;


import java.util.ArrayDeque;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Queue;
import java.util.Random;
import java.util.Set;


import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;




import java.util.concurrent.atomic.AtomicInteger;


import java.util.logging.Level;
import java.util.logging.Logger;




/**
 * An implementation of Reflective Random Indexing, which uses two passes
 * through the corpus to build semantic vectors that better approximate indirect
 * co-occurrence.  This implementation is based on the paper: <ul>
 *
 *   <li style="font-family:Garamond, Georgia, serif"></li>
 *
 * </ul>
 *
 * <p>
 *
 * This class defines the following configurable properties that may be set
 * using either the System properties or using the {@link
 * ReflectiveRandomIndexing#ReflectiveRandomIndexing(Properties)} constructor.
 *
 * <dl style="margin-left: 1em">
 *
 * <dt> <i>Property:</i> <code><b>{@value #VECTOR_LENGTH_PROPERTY}
 *      </b></code> <br>
 *      <i>Default:</i> {@value #DEFAULT_VECTOR_LENGTH}
 *
 * <dd style="padding-top: .5em">This property sets the number of dimensions to
 *      be used for the index and semantic vectors. <p>
 *
 * <dt> <i>Property:</i> <code><b>{@value #USE_SPARSE_SEMANTICS_PROPERTY}
 *      </b></code> <br>
 *      <i>Default:</i> {@code true} 
 *
 * <dd style="padding-top: .5em">This property specifies whether to use a sparse
 *       encoding for each word's semantics.  Using a sparse encoding can result
 *       in a large saving in memory, while requiring more time to process each
 *       document.<p>
 *
 * </dl> <p>
 *
 * This class implements {@link Filterable}, which allows for fine-grained
 * control of which semantics are retained.  The {@link #setSemanticFilter(Set)}
 * method can be used to specify which words should have their semantics
 * retained.  Note that, in this implementation, words that are filtered out
 * are skipped entirely while documents are processed, so they do not
 * contribute to the semantics of <i>other</i> words.  This behavior is intended
 * for use with large corpora where retaining the semantics of all words in
 * memory is infeasible.<p>
 *
 * This class is thread-safe for concurrent calls of {@link
 * #processDocument(BufferedReader) processDocument}.  The {@link
 * #getVector(String) getVector} method will only return valid reflective
 * vectors after the call to {@link #processSpace(Properties) processSpace}. <p>
 *
 * @author David Jurgens
 */
public class ReflectiveRandomIndexing implements SemanticSpace, Filterable {


    /**
     * The prefix for naming public properties.
     */
    private static final String PROPERTY_PREFIX = 
        "edu.ucla.sspace.ri.ReflectiveRandomIndexing";


    /**
     * The property to specify the number of dimensions to be used by the index
     * and semantic vectors.
     */
    public static final String VECTOR_LENGTH_PROPERTY = 
        PROPERTY_PREFIX + ".vectorLength";


    /**
     * Specifies whether to use a sparse encoding for each word's semantics,
     * which saves space but requires more computation.
     */
    public static final String USE_SPARSE_SEMANTICS_PROPERTY = 
        PROPERTY_PREFIX + ".sparseSemantics";


    /**
     * The default number of dimensions to be used by the index and semantic
     * vectors.
     */
    public static final int DEFAULT_VECTOR_LENGTH = 4000;


    /**
     * The name returned by {@code getName}.
     */
    private static final String RRI_SSPACE_NAME =
        "reflective-random-indexing";


    /**
     * The internal logger used for tracking processing progress.
     */
    private static final Logger LOGGER = 
        Logger.getLogger(ReflectiveRandomIndexing.class.getName());


    /**
     * A mapping from each document's index to the vector that represents the
     * summation of the index vectors of the terms occurring in that document.
     */
    private final Map<Integer,IntegerVector> docToVector;


    /**
     * A mapping from each word to the vector that represents its semantics
     * after the second pass through the corpus.
     */
    private final Map<String,IntegerVector> termToReflectiveSemantics;


    /**
     * A mapping from each word to its index vector, which is generated the
     * first time the word is seen.
     */
    private final Map<String,TernaryVector> termToIndexVector;


    /**
     * A mapping from each term to its index.
     */
    private final Map<String,Integer> termToIndex;


    /**
     * A counter for the number of documents seen in the corpus.
     */
    private final AtomicInteger documentCounter;


    /**
     * The number of dimensions for the semantic and index vectors.
     */
    private final int vectorLength;


    /**
     * A flag for whether this instance should use {@code SparseIntegerVector}
     * instances for representing a word's semantics, which saves space but
     * requires more computation.
     */
    private final boolean useSparseSemantics;


    /**
     * An optional set of words that restricts the set of semantic vectors that
     * this instance will retain.
     */
    private final Set<String> semanticFilter;


    /**
     * The generator used to create index vectors for each unique term in the
     * corpus.
     */
    private final RandomIndexVectorGenerator indexVectorGenerator;


    /**
     * A compressed version of the corpus that is built as the text version is
     * being processed.  The file contains documents represented as an integer
     * for the number of tokens in that document followed by the indices for all
     * of the tokens in the order that they appeared.
     *
     * @see #processSpace(Properties)
     */
    private File compressedDocuments;


    /**
     * The output stream used to write the {@link #compressedDocuments} file
     * as the text documents are being processed.
     */
    private DataOutputStream compressedDocumentsWriter;


    /**
     * The number that keeps track of the index values of words.
     */
    private int termIndexCounter;


    /**
     * A mapping from each term's index to the term.  This value is not set
     * until {@link #processSpace(Properties)} is called, at which point the
     * final set of terms has been determined
     */
    private String[] indexToTerm;


    /**
     * Creates a new {@code ReflectiveRandomIndexing} instance using the current
     * {@code System} properties for configuration.  This is equivalent to
     * calling {@link #ReflectiveRandomIndexing(Properties)} with the value of
     * {@code System.getProperties()}.
     */
    public ReflectiveRandomIndexing() {
        this(System.getProperties());
    }


    /**
     * Creates a new {@code ReflectiveRandomIndexing} instance using the
     * provided properites for configuration.
     */
   public ReflectiveRandomIndexing(Properties properties) {
        String vectorLengthProp = 
            properties.getProperty(VECTOR_LENGTH_PROPERTY);
        vectorLength = (vectorLengthProp != null)
            ? Integer.parseInt(vectorLengthProp)
            : DEFAULT_VECTOR_LENGTH;
        
        String useSparseProp = 
        properties.getProperty(USE_SPARSE_SEMANTICS_PROPERTY);
        useSparseSemantics = (useSparseProp != null)
            ? Boolean.parseBoolean(useSparseProp)
            : true;


        indexVectorGenerator = 
            new RandomIndexVectorGenerator(vectorLength, properties);


        // The various maps for keeping word and document state during
        // processing
        termToIndexVector = new ConcurrentHashMap<String,TernaryVector>();
        docToVector = new ConcurrentHashMap<Integer,IntegerVector>();
        termToReflectiveSemantics = 
            new ConcurrentHashMap<String,IntegerVector>();
        termToIndex = new ConcurrentHashMap<String,Integer>();


        documentCounter = new AtomicInteger();
        semanticFilter = new HashSet<String>();


        // Last set up the writer that will contain a compressed version of the
        // corpus for use in processSpace()
        try {
            compressedDocuments = 
                File.createTempFile("reflective-ri-documents",".dat");
            compressedDocumentsWriter = new DataOutputStream(
                new BufferedOutputStream(
                    new FileOutputStream(compressedDocuments)));
        } catch (IOException ioe) {
            throw new IOError(ioe);
        }
    }


    /**
     * Returns a vector for representing word or document semantics whose type
     * is based on whether the used specified to use sparse semantics or not.
     */
    private IntegerVector createVector() {
        return (useSparseSemantics)
            ? new CompactSparseIntegerVector(vectorLength)
            : new DenseIntVector(vectorLength);
    }


    /**
     * Returns the index vector for the term, or if creates one if the term to
     * index vector mapping does not yet exist.
     *
     * @param term a word in the semantic space
     *
     * @return the index for the provide term.
     */
    private TernaryVector getTermIndexVector(String term) {
        TernaryVector iv = termToIndexVector.get(term);
        if (iv == null) {
            // lock in case multiple threads attempt to add it at once
            synchronized(this) {
                // recheck in case another thread added it while we were waiting
                // for the lock
                iv = termToIndexVector.get(term);
                if (iv == null) {
                    // since this is a new term, also map it to its index for
                    // later look-up when the integer documents are processed
                    termToIndex.put(term, termIndexCounter++);
                    // next, map it to its reflective vector which will be
                    // filled in process space
                    termToReflectiveSemantics.put(term, createVector());
                    // last, create an index vector for the term
                    iv = indexVectorGenerator.generate();
                    termToIndexVector.put(term, iv);                    
                }
            }
        }
        return iv;
    }


   /**
     * {@inheritDoc}
     */ 
    public IntegerVector getVector(String word) {
        IntegerVector v = termToReflectiveSemantics.get(word);
        if (v == null) {
            return null;
        }
        return Vectors.immutable(v);
    }


    /**
     * {@inheritDoc}
     */ 
    public String getSpaceName() {
        return RRI_SSPACE_NAME + "-" + vectorLength + "v";
    }


    /**
     * {@inheritDoc}
     */
    public int getVectorLength() {
        return vectorLength;
    }


    /**
     * {@inheritDoc}
     */ 
    public Set<String> getWords() {
        return Collections.unmodifiableSet(termToReflectiveSemantics.keySet());
    }
    
    /**
     * Updates the semantic vectors based on the words in the document.
     *
     * @param document {@inheritDoc}
     */
    public void processDocument(BufferedReader document) throws IOException {
        int docIndex = documentCounter.getAndIncrement();


        Iterator<String> documentTokens = 
            IteratorFactory.tokenizeOrdered(document);


        // As we read in the document, generate a compressed version of it,
        // which we will use during the process space method to recompute all of
        // the word vectors' semantics
        ByteArrayOutputStream compressedDocument = 
            new ByteArrayOutputStream(4096);
        DataOutputStream dos = new DataOutputStream(compressedDocument);
        int tokens = 0; // count how many are in this document
        int unfilteredTokens = 0; // how many tokens remained after filtering


        IntegerVector docVector = createVector();
        docToVector.put(docIndex, docVector);


        while (documentTokens.hasNext()) {
            tokens++;
            String focusWord = documentTokens.next();


            // If we are filtering the semantic vectors, check whether this word
            // should have its semantics calculated.  In addition, if there is a
            // filter and it would have excluded the word, do not keep its
            // semantics around
            boolean calculateSemantics =
                semanticFilter.isEmpty() || semanticFilter.contains(focusWord)
                && !focusWord.equals(IteratorFactory.EMPTY_TOKEN);


      // If the filter does not accept this word, skip the semantic
      // processing, continue with the next word
            if (!calculateSemantics) {
                // Do not write out any removed tokens to save space
    continue;
      }


            // Update the occurrences of this token
            unfilteredTokens++;
            add(docVector, getTermIndexVector(focusWord));


            // Update the compress version of the document with the token.
            //
            // NOTE: this call to termToIndex *must* come after the
            // getTermIndexVector() call, which is responsible for adding this
            // mapping if it doesn't already exist.
      int focusIndex = termToIndex.get(focusWord);


            // write the term index into the compressed for the document for
            // later corpus reprocessing
            dos.writeInt(focusIndex);
        }


        document.close();
        
        dos.close();
        byte[] docAsBytes = compressedDocument.toByteArray();


        // Once the document is finished, write the compressed contents to the
        // corpus stream
        synchronized(compressedDocumentsWriter) {
            // Write how many terms were in this document after filtering
            compressedDocumentsWriter.writeInt(unfilteredTokens);
            compressedDocumentsWriter.write(docAsBytes, 0, docAsBytes.length);
        }
    }
    
    /**
     * Computes the reflective semantic vectors for word meanings
     *
     * @param properties {@inheritDoc}
     */
    public void processSpace(Properties properties) {
        try {
            // Wrap the call to avoid having all the code in a try/catch.  This
            // is for improved readability purposes only.
            processSpace();
        } catch (IOException ioe) {
            throw new IOError(ioe);
        }
    }


    /**
     * Computes the reflective semantic vectors for word meanings
     */
    private void processSpace() throws IOException {
        LOGGER.info("generating reflective vectors");
        compressedDocumentsWriter.close();
        int numDocuments = documentCounter.get();
        termToIndexVector.clear();
        indexToTerm = new String[termToIndex.size()];
        for (Map.Entry<String,Integer> e : termToIndex.entrySet())
            indexToTerm[e.getValue()] = e.getKey();


        // Read in the compressed version of the corpus, re-processing each
        // document to build up the document vectors
        DataInputStream corpusReader = new DataInputStream(
            new BufferedInputStream(new FileInputStream(compressedDocuments)));


        // Set up the concurrent data structures so we can reprocess the
        // documents concurrently using a work queue
        final BlockingQueue<Runnable> workQueue =
            new LinkedBlockingQueue<Runnable>();
        for (int i = 0; i < Runtime.getRuntime().availableProcessors(); ++i) {
            Thread t = new WorkerThread(workQueue);
            t.start();
        }
        final Semaphore documentsRerocessed = new Semaphore(0);         


        for (int d = 0; d < numDocuments; ++d) {
            final int docId = d;


            // This value already has any filtered tokens taken into account,
            // i.e. in only counts those tokens that remain after filtering
            int tokensInDoc = corpusReader.readInt();
            // Read in the document
            final int[] doc = new int[tokensInDoc];
            for (int i = 0; i < tokensInDoc; ++i)
                doc[i] = corpusReader.readInt();


            workQueue.offer(new Runnable() {
                    public void run() {
                        // This method creates the document vector and then adds
                        // that document vector with the reflective semantic
                        // vector for each word occurring in the document
                        LOGGER.fine("reprocessing doc #" + docId);
                        processIntDocument(docToVector.get(docId), doc);
                        documentsRerocessed.release();
                    }
                });
        }
        corpusReader.close();


        // Wait until all the documents have been processed
        try {
            documentsRerocessed.acquire(numDocuments);
        } catch (InterruptedException ie) {
            throw new Error("interrupted while waiting for documents to " +
                            "finish reprocessing", ie);
        }        
        LOGGER.fine("finished reprocessing all documents");


    }


    /**
     * Processes the compressed version of a document where each integer
     * indicates that token's index, adding the document's vector to the
     * reflective semantic vector each time a term occurs in the document.
     *
     * @param docVector the vector of the document that is being processed
     * @param document the document to be processed where each {@code int} is a
     *        term index
     *
     * @return the number of contexts present in this document
     */
    private void processIntDocument(IntegerVector docVector, int[] document) {


        // Make one pass through the document to build the document vector.
        for (int termIndex : document) {
            IntegerVector reflectiveVector = 
                termToReflectiveSemantics.get(indexToTerm[termIndex]);
            // Lock on the term's vector to prevent another thread from updating
            // it concurrently
            synchronized(reflectiveVector) {
                VectorMath.add(reflectiveVector, docVector);
            }
        }
    }


    /**
     * {@inheritDoc} Note that all words will still have an index vector
     * assigned to them, which is necessary to properly compute the semantics.
     *
     * @param semanticsToRetain the set of words for which semantics should be
     *        computed.
     */
    public void setSemanticFilter(Set<String> semanticsToRetain) {
        semanticFilter.clear();
        semanticFilter.addAll(semanticsToRetain);
    }


    /**
     * Atomically adds the values of the index vector to the semantic vector.
     * This is a special case addition operation that only iterates over the
     * non-zero values of the index vector.
     */
    private static void add(IntegerVector semantics, TernaryVector index) {
        // Lock on the semantic vector to avoid a race condition with another
        // thread updating its semantics.  Use the vector to avoid a class-level
        // lock, which would limit the concurrency.
        synchronized(semantics) {
            for (int p : index.positiveDimensions())
                semantics.add(p, 1);
            for (int n : index.negativeDimensions())
                semantics.add(n, -1);
        }
    }
}
// Source Code of edu.ucla.sspace.rri.ReflectiveRandomIndexing

// Related Classes of edu.ucla.sspace.rri.ReflectiveRandomIndexing