// Source Code of edu.ucla.sspace.mains.GenericWordsiMain

/*
 * Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at
 * the Lawrence Livermore National Laboratory. Written by Keith Stevens,
 * kstevens@cs.ucla.edu OCEC-10-073 All rights reserved. 
 *
 * This file is part of the C-Cat package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */


package edu.ucla.sspace.mains;


import edu.ucla.sspace.common.ArgOptions;
import edu.ucla.sspace.common.SemanticSpace;
import edu.ucla.sspace.common.SemanticSpaceIO;


import edu.ucla.sspace.clustering.Clustering;
import edu.ucla.sspace.clustering.OnlineClustering;


import edu.ucla.sspace.text.Document;


import edu.ucla.sspace.util.Generator;
import edu.ucla.sspace.util.ReflectionUtil;


import edu.ucla.sspace.vector.SparseDoubleVector;


import edu.ucla.sspace.wordsi.AssignmentReporter;
import edu.ucla.sspace.wordsi.ContextExtractor;
import edu.ucla.sspace.wordsi.ContextGenerator;
import edu.ucla.sspace.wordsi.EvaluationWordsi;
import edu.ucla.sspace.wordsi.GeneralContextExtractor;
import edu.ucla.sspace.wordsi.StreamingWordsi;
import edu.ucla.sspace.wordsi.WaitingWordsi;


import edu.ucla.sspace.wordsi.psd.PseudoWordContextExtractor;
import edu.ucla.sspace.wordsi.psd.PseudoWordReporter;
import edu.ucla.sspace.wordsi.semeval.SemEvalContextExtractor;
import edu.ucla.sspace.wordsi.semeval.SemEvalReporter;


import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOError;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.ObjectInputStream;


import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;




/**
 * A base implementation for {@link Wordsi} executables.  This class provides
 * base arguments that nearly all {@link Wordsi} executables will require, along
 * with basic processing for those arguments.
 *
 * <p>
 *
 * This class provides access to three different word sense modes: online
 * clustering, offline clustering, and an evaluation mode.  For the two
 * clustering modes, word senses are generated by clustering individual context
 * vectors.  The first mode uses {@link StreamingWordsi} and the second mode
 * uses {@link WaitingWordsi}.  The third mode assumes that the word senses have
 * already been learned and are fixed.  Individual contexts are labeled with the
 * most similar word sense.
 *
 * <p>
 *
 * This class provides access to two evaluation modes: Pseudo Word
 * Discrimination and the SenseEval/SemEval evaluation.  When training a {@link
 * Wordsi} model for a pseudo word task, the {@code -e} option should be set
 * with the {@code pseudoWord} argument.  The {@code -P} option should be set so
 * that {@link Wordsi} knows which words form pseudo words.  {@link Wordsi} will
 * generate a report that specifies how many times each core word in a pseudo
 * word was assigned to a word sense for the pseudo word.  When running in
 * evaluation mode, the {@code -e} option must be set.
 *
 * <p>
 *
 * Since {@link Wordsi} instances will need to reuse features during training
 * and testing, the {@code --save} and {@code --load} options are provided.
 * {@code --save} will store any data structures that are required for
 * generating context vectors.  {@code --load} will load these same data
 * structures from disk and re-use them.  In general, {@code --save} should be
 * used during training and {@code --load} should be used during testing.
 * Different {@link Wordsi} executables will serialize different data
 * structures, but these will generally be a mapping from strings to some other
 * data type.
 *
 * <p>
 *
 * {@code GenericMain} provides the core options used by this base executable.
 * This class provides the following additional options:
 *
 * <ul>
 * <li><u>Required (one of)</u>:
 *   <ul>
 *     <li> {@code -s}, {@code --streamingClustering=CLASSNAME} Specifies the
 *     streaming clustering algorithm to use for forming word senses.</li>
 *
 *     <li> {@code -b}, {@code --batchClustering=CLASSNAME} Specifies the batch
 *     clustering algorithm to use for forming word senses.</li>
 *
 *     <li> {@code -e}, {@code --evaluationClustering=FILE} Specifies a trained
 *     Wordsi semantic space to be used for evaluation. When set, one of the
 *     Evaluation Type arguments must be set.</li>
 *   </ul>
 * </li>
 *
 * <li><u>Evaluation Type</u>
 *   <ul>
 *     <li> {@code -P}, {@code --pseudoWordEvaluation=FILENAME} Specifies a
 *     mapping from raw tokens to their pseudo word token.  Only the raw tokens
 *     in this mapping will be represented in the {@link Wordsi} space.  A
 *     {@link PseudoWordReporter} will be generated for these pseudo words.</li>
 *
 *     <li> {@code -E}, {@code --semEvalEvaluation=STRING} Signifies that the
 *     data files are in the SemEval format and that only test instance words
 *     should be represented in the Wordsi space.  Each line must correspond to
 *     an instance context and the focus word must be preceded by the token
 *     given as the argument to this option.</li>
 *   </ul>
 * </li>
 *
 * <li><u>Optional</u>
 *   <ul>
 *     <li> {@code -a}, {@code --acceptedWords=FILENAME} Specifies the set of
 *     words which should be represented by Wordsi. (Default: all words).</li>
 *
 *     <li> {@code -c}, {@code --clusters=INT} Specifies the desired number of
 *     clusters, or word senses.  (Default: 0).</li>
 *
 *     <li> {@code -W}, {@code --windowSize=INT} Specifies the number of words,
 *     in one direction, that form a valid context.  For example, a window size
 *     of 5 means that up to 5 words before and after a focus word are used to
 *     form the context. (Default: 5).</li>
 *
 *     <li> {@code -h}, {@code --useHeaderToken} Signifies that the first token
 *     in a context should be treated as a document header.  This is only used
 *     when {@code -E} and {@code -P} are not used.</li>
 *
 *   </ul>
 * </li>
 *
 * <li><u>Serialization</u>
 *   <ul>
 *     <li> {@code -S}, {@code --save=FILENAME} Specifies a file to which all
 *     files needed to generate context vectors will be serialized.</li>
 *
 *     <li> {@code -L}, {@code --load=FILENAME} Specifies a file from which all
 *     files needed to generate context vectors will be deserialized.</li>
 *   </ul>
 * </li>
 * </ul>
 *
 * @author Keith Stevens
 */
public abstract class GenericWordsiMain extends GenericMain {


    /**
     * The stream used for serializing data to the file named by the {@code -S}
     * option.  It stays {@code null} until {@link #openSaveFile} creates it,
     * and is then reused by later calls.  Nothing in this class closes it.
     */
    private ObjectOutputStream saveStream = null;


    /**
     * The stream used for deserializing data from the file named by the {@code
     * -L} option.  It stays {@code null} until {@link #openLoadFile} creates
     * it, and is then reused by later calls.  Nothing in this class closes it.
     */
    private ObjectInputStream loadStream = null;


    /**
     * {@inheritDoc}
     */
    protected void addExtraOptions(ArgOptions options) {
        // Remove some crufty options.
        options.removeOption('Z');
        options.removeOption('X');
        options.removeOption('o');
        options.removeOption('w');


        // Set the three runtime mode arguments.
        options.addOption('s', "streamingClustering", 
                          "Specifies the streaming clustering algorithm to " +
                          "use for forming word senses",
                          true, "CLASSNAME", "Required (one of)");
        options.addOption('b', "batchClustering", 
                          "Specifies the batch clustering algorithm to " +
                          "use for forming word senses",
                          true, "CLASSNAME", "Required (one of)");
        options.addOption('e', "evaluationClustering", 
                          "Specifies a trained Wordsi semantic space to be " +
                          "used for evaluation.  When set, one of the " +
                          "Evaluation Type arguments must be set",
                          true, "<sspace>", "Required (one of)");


        // Set the evaluation type arguments.
        options.addOption('P', "pseudoWordEvaluation",
                          "Specifies a mapping from raw tokens to their " +
                          "pseudo word token.  Only the raw tokens in this " +
                          "mapping will be represented in the Wordsi space.  " +
                          "A PseudoWordReport will be generated for these " +
                          "pseudo words.  This overrides the -a option",
                          true, "FILENAME", "Evaluation Type");
        options.addOption('E', "semEvalEvaluation",
                          "Signifies that the data files are in the SemEval " +
                          "format and that only test instance words should " +
                          "be represented in the Wordsi space.  Each line " +
                          "must correspond to an instance context and the " +
                          "focus word must be precceded by the token given " +
                          "as the argument to this option.",
                          true, "STRING", "Evaluation Type");
        options.addOption('N', "wordlistEvaluation",
                          "Learned word senses are assumed to be related to " +
                          "the senses in for other words in the " +
                          "acceptedWords list.  This evaluation will track " +
                          "the headers for documents which should mark " +
                          "whether or not the focus words are being used " +
                          "with their common sense.",
                          false, null, "Evaluation Type");


        // Set the optional arguments.
        options.addOption('a', "acceptedWords",
                          "Specifies the set of words which should be " +
                          "represented by Wordsi. (Default: all words)",
                          true, "FILENAME", "Optional");
        options.addOption('c', "clusters",
                          "Specifies the desired number of clusters, or " +
                          "word senses.  (Default: 0)",
                          true, "INT", "Optional");
        options.addOption('W', "windowSize",
                          "Specifies the number of words, in one direction, " +
                          "that form a valid context.  For example, a window " +
                          "size of 5 means that up to 5 words before and " +
                          "after a focus word are used to form the context. " +
                          "(Default: 5)",
                          true, "INT", "Optional");
        options.addOption('h', "useHeaderToken", 
                          "Set to true if the first token in a context " +
                          "should be treated as a document header. Note " +
                          "that this is only used when -E and -P are not " +
                          "used.",
                          false, null, "Optional");


        // Set the serialization arguments.
        options.addOption('S', "save",
                          "Specfies a file to which all files needed to " +
                          "generate context vectors will be serialized",
                          true, "FILENAME", "Serialization");
        options.addOption('L', "load",
                          "Specfies a file from which all files needed to " +
                          "generate context vectors will be deserialized",
                          true, "FILENAME", "Serialization");
    }


    /**
     * Returns a {@link ContextExtractor}, which will be responsible for
     * creating context vectors for documents.
     */
    abstract protected ContextExtractor getExtractor();


    /**
     * Returns a set of strings that the {@link Wordsi} implementations should
     * represent, or {@code null}, which signifies that all words should be
     * represented.
     */
    protected Set<String> getAcceptedWords() {
        if (!argOptions.hasOption('a'))
            return null;


        try {
            Set<String> acceptedWords = new HashSet<String>();
            BufferedReader br = new BufferedReader(new FileReader(
                        argOptions.getStringOption('a')));
            for (String line = null; (line = br.readLine()) != null; )
                acceptedWords.add(line.trim().toLowerCase());
            return acceptedWords;
        } catch (IOException ioe) {
            throw new IOError(ioe);
        }
    }


    /**
     * Returns a mapping from real tokens to their pseudo word tokens, or {@code
     * null} if the {@code -P} option is not specified.
     */
    protected Map<String, String> getPseudoWordMap() {
        if (!argOptions.hasOption('P'))
            return null;


        try {
            Map<String, String> pseudoWordMap = new HashMap<String, String>();
            BufferedReader br = new BufferedReader(new FileReader(
                        argOptions.getStringOption('P')));
            for (String line = null; (line = br.readLine()) != null; ) {
                String[] tokens = line.split("\\s+");
                pseudoWordMap.put(tokens[0].trim(), tokens[1].trim());
            }
            return pseudoWordMap;
        } catch (IOException ioe) {
            throw new IOError(ioe);
        }
    }


    /**
     * Returns a {@link ContextExtractor} that uses the given {@link
     * ContextGenerator} which will process the corpus in the format specified
     * by the command line.  This is just a helper function for sub-classes
     * implementing {@link #getExtractor}.
     */
    protected ContextExtractor contextExtractorFromGenerator(
            ContextGenerator generator) {
        // If experimentation mode is set, mark the generator as read only.
        if (argOptions.hasOption('e'))
            generator.setReadOnly(true);


        // If the evaluation type is for semEval, use a
        // SemEvalContextExtractor.
        if (argOptions.hasOption('E'))
            return new SemEvalContextExtractor(
                    generator, windowSize(), argOptions.getStringOption('E'));


        // If the evaluation type is for pseudoWord, use a
        // PseudoWordContextExtractor.
        if (argOptions.hasOption('P'))
            return new PseudoWordContextExtractor(
                    generator, windowSize(), getPseudoWordMap());


        // Return a standard context extractor
        return new GeneralContextExtractor(generator, windowSize(),
                                           argOptions.hasOption('h'));
    }


    /**
     * Returns the window size used in a sliding context window.
     */
    protected int windowSize() {
        return argOptions.getIntOption('W', 5);
    }


    protected Iterator<Document> getDocumentIterator() throws IOException {
        Iterator<Document> docIter = super.getDocumentIterator();


        // If we are not using the pseudo word evalutor, just return the
        // iterator as normal.  The SemEval corpora already have their contexts
        // shuffled so there is no worry about biasing the results towards a
        // particular sense.
        if (!argOptions.hasOption('P'))
            return docIter;


        // Otherwise, read in all of the documents into a list, shuffle it, and
        // return an iterator over that list.  This is needed to ensure that the
        // ordering does not bias the clustering algorithm.  NOTE that this
        // assumes that the entire corpus can fit into memory.
        List<Document> docList = new LinkedList<Document>();
        while (docIter.hasNext())
            docList.add(docIter.next());
        Collections.shuffle(docList);
        return docList.iterator();
    }


    /**
     * {@inheritDoc}
     */
    protected SemanticSpace getSpace() {
        ArgOptions options = argOptions;
        // Setup the assignment reporter.  When training, the assignment report
        // will only be used If the evaluation mode will be for pseudoWord.
        AssignmentReporter reporter = null;
        if (options.hasOption('P'))
            reporter = new PseudoWordReporter(System.out);


        int numClusters = options.getIntOption('c', 0);


        // If Wordsi is being used in an evaluation mode, set up word space
        // accordingly.
        if (options.hasOption('e')) {
            // If the evaluation type is not set, report an error and exit.
            if (!options.hasOption('E') && !options.hasOption('P')) {
                usage();
                System.out.println(
                        "An Evaluation Type must be set when evaluating " +
                        " a trained Wordsi model.");
                System.exit(1);
            }


            // Load the semantic space that has the predefined word senses from
            // disk and return an EvaluationWordsi instance.
            try {
                SemanticSpace sspace = SemanticSpaceIO.load(
                        options.getStringOption('e'));
                if (options.hasOption('E'))
                    reporter = new SemEvalReporter(System.out);
                return new EvaluationWordsi(
                        getAcceptedWords(), getExtractor(), sspace, reporter);
            } catch (IOException ioe) {
                throw new IOError(ioe);
            }
        } else if (options.hasOption('s')) {
            // Create a StreamingWordsi instance that uses the specified online 
            // cluster generator.
            System.getProperties().setProperty(
                    OnlineClustering.NUM_CLUSTERS_PROPERTY,
                    options.getStringOption('c'));
            Generator<OnlineClustering<SparseDoubleVector>> clusterGenerator =
                ReflectionUtil.getObjectInstance(options.getStringOption('s'));
            return new StreamingWordsi(getAcceptedWords(), getExtractor(),
                                       clusterGenerator, reporter, numClusters);
        } else if (options.hasOption('b')) {
            // Create a WaitingWordsi instance that uses the specified batch
            // clustering implementation.
            Clustering clustering = 
                ReflectionUtil.getObjectInstance(options.getStringOption('b'));
            return new WaitingWordsi(getAcceptedWords(), getExtractor(), 
                                     clustering, reporter, numClusters);
        } else {
            // None of the required options was provided, report an error and
            // exit.
            usage();
            System.out.println("No clustering method was specified.");
            System.exit(1);
            return null;
        }
    }


    /**
     * Returns an {@link ObjectOutputStream} for the file referred to by the
     * {@code --Save} option or {@link null} if the option was not used.
     */
    protected ObjectOutputStream openSaveFile() {
        try {
            if (saveStream == null && argOptions.hasOption('S'))
                saveStream = new ObjectOutputStream(new FileOutputStream(
                            argOptions.getStringOption('S')));
            return saveStream;
        } catch (IOException ioe) {
            throw new IOError(ioe);
        }
    }


    /**
     * Returns an {@link ObjectInputStream} for the file referred to by the
     * {@code
     * --Load} option or {@link null} if the option was not used.
     */
    protected ObjectInputStream openLoadFile() {
        try {
            if (loadStream == null && argOptions.hasOption('L'))
                loadStream = new ObjectInputStream(new FileInputStream(
                            argOptions.getStringOption('L')));
            return loadStream;
        } catch (IOException ioe) {
            throw new IOError(ioe);
        }
    }


    /**
     * Writes the {@code obj} to the given {@link ObjectOutputStream}.
     */
    protected void saveObject(ObjectOutputStream outStream, Object obj) {
        try {
            outStream.writeObject(obj);
        } catch (IOException ioe) {
            throw new IOError(ioe);
        }
    }


    /**
     * Returns an object of type {@code T} from the provided {@link
     * ObjectInputStream}.  This method does the casting, so assignments should
     * be done directly to a pointer and not through a ternary operator,
     * otherwise the cast will need to be done a second time.
     */
    @SuppressWarnings("unchecked")
    protected <T> T loadObject(ObjectInputStream inStream) {
        try {
            return (T) inStream.readObject();
        } catch (IOException ioe) {
            throw new IOError(ioe);
        } catch (ClassNotFoundException cnfe) {
            throw new IOError(cnfe);
        }
    }
}
// Source Code of edu.ucla.sspace.mains.GenericWordsiMain

// Related Classes of edu.ucla.sspace.mains.GenericWordsiMain