// Source Code of edu.ucla.sspace.mains.FixedDurationTemporalRandomIndexingMain

/*
 * Copyright 2009 David Jurgens
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */


package edu.ucla.sspace.mains;


import edu.ucla.sspace.common.ArgOptions;
import edu.ucla.sspace.common.SemanticSpace;
import edu.ucla.sspace.common.SemanticSpaceIO;
import edu.ucla.sspace.common.SemanticSpaceIO.SSpaceFormat;
import edu.ucla.sspace.common.Similarity;
import edu.ucla.sspace.common.Similarity.SimType;


import edu.ucla.sspace.ri.IndexVectorUtil;


import edu.ucla.sspace.temporal.TemporalSemanticSpace;


import edu.ucla.sspace.text.FileListTemporalDocumentIterator;
import edu.ucla.sspace.text.IteratorFactory;
import edu.ucla.sspace.text.OneLinePerTemporalDocumentIterator;
import edu.ucla.sspace.text.TemporalDocument;


import edu.ucla.sspace.tri.FixedDurationTemporalRandomIndexing;
import edu.ucla.sspace.tri.OrderedTemporalRandomIndexing;


import edu.ucla.sspace.util.CombinedIterator;
import edu.ucla.sspace.util.MultiMap;
import edu.ucla.sspace.util.NearestNeighborFinder;
import edu.ucla.sspace.util.SimpleNearestNeighborFinder;
import edu.ucla.sspace.util.SortedMultiMap;
import edu.ucla.sspace.util.TimeSpan;
import edu.ucla.sspace.util.TreeMultiMap;


import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.TernaryVector;
import edu.ucla.sspace.vector.Vector;
import edu.ucla.sspace.vector.Vectors;


import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOError;
import java.io.IOException;
import java.io.PrintWriter;


import java.text.DateFormat;
import java.text.SimpleDateFormat;


import java.util.Calendar;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Properties;
import java.util.Queue;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;


import java.util.concurrent.BrokenBarrierException;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.CyclicBarrier;


import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;


import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;


import java.util.logging.ConsoleHandler;
import java.util.logging.Handler;
import java.util.logging.Level;
import java.util.logging.Logger;


/**
 * The executable class for running {@link FixedDurationTemporalRandomIndexing}
 * from the command-line.
 *
 * @see TemporalRandomIndexing
 * @see RandomIndexing
 * 
 * @author David Jurgens
 */
public class FixedDurationTemporalRandomIndexingMain {


    /**
     * Extension used for all saved semantic space files.
     */
    private static final String EXT = ".sspace";


    /**
     * The logger used for reporting.
     */
    private static final Logger LOGGER = 
        Logger.getLogger(FixedDurationTemporalRandomIndexingMain.class.getName());


    /**
     * The processed argument options available to the main classes.  Built
     * once in the constructor by {@link #createOptions()} and populated when
     * {@link #run(String[])} parses the command line.
     */
    private final ArgOptions argOptions;


    /**
     * A set of words for which the temporal semantics will be calculated.
     * Filled in {@link #run(String[])} from the file named by the {@code
     * interestingTokenList} option; empty if that option is absent.
     */
    private final Set<String> interestingWords;


    /**
     * Whether the nearest neighbors for each interesting word should be
     * compared after processing each partition.  Enabled by the {@code
     * printInterestingTokenNeighborComparison} option.
     */
    private boolean compareNeighbors;


    /**
     * The format in which the .sspace should be saved.  Parsed from the
     * {@code outputFormat} option in {@link #run(String[])}, which falls back
     * to {@code TEXT} when the option is absent.
     */
    private SSpaceFormat format;


    /**
     * How many nearest neighbors of the words in {@code interestingWords} to
     * print for each semantic partition.  If this variable is 0, no neighbors
     * are printed.
     */
    private int interestingWordNeighbors;


    /**
     * The directory in which any serialized .sspace files should be saved.
     * Taken from the first positional command-line argument.
     */
    private File outputDir;


    /**
     * Whether to overwrite existing .sspace files when serializing.  {@link
     * #run(String[])} defaults this to {@code true}; when {@code false}, the
     * space is written to a uniquely named temporary file in {@link
     * #outputDir} instead.
     */
    private boolean overwrite;


    /**
     * Whether to print the semantic shifts and other statistics for the
     * interesting word set for each partition.
     */
    private boolean printInterestingTokenShifts; 


    /**
     * Whether to write the incremental {@code .sspace} files to disk during the
     * processing of each time span.
     */
    private boolean savePartitions;


    /**
     * Whether to print a complete sorted list of all the semantic shifts for
     * each interesting word from the last partition.
     */
    private boolean printShiftRankings;


    /**
     * A mapping from each word to the vectors that account for its temporal
     * semantics according to the specified time span.  The inner map is keyed
     * by the start time of the semantic partition that produced the vector.
     */
    private final Map<String,SortedMap<Long,double[]>> wordToTemporalSemantics;




    private FixedDurationTemporalRandomIndexingMain() {
        argOptions = createOptions();
        interestingWords = new HashSet<String>();
        interestingWordNeighbors = 0;
        compareNeighbors = false;
        wordToTemporalSemantics = 
            new HashMap<String,SortedMap<Long,double[]>>();
        savePartitions = false;
        printShiftRankings = false;
    }


    /**
     * Adds all of the options to the {@link ArgOptions}.
     */
    protected ArgOptions createOptions() {
        ArgOptions options = new ArgOptions();
        options.addOption('f', "fileList", "a list of document files", 
                          true, "FILE[,FILE...]", "Required (at least one of)");
        options.addOption('d', "docFile", 
                          "a file where each line is a document", true,
                          "FILE[,FILE...]", "Required (at least one of)");


        options.addOption('T', "timespan", "the timespan for each semantic " +
                          "partition", true, "Date String", "Required");


        options.addOption('o', "outputFormat", "the .sspace format to use",
                          true, "{text|binary}", "Program Options");
        options.addOption('t', "threads", "the number of threads to use",
                          true, "INT", "Program Options");
        options.addOption('w', "overwrite", "specifies whether to " +
                          "overwrite the existing output", true, "BOOL",
                          "Program Options");
        options.addOption('v', "verbose", "prints verbose output",
                          false, null, "Program Options");


        // Algorithm Options
        options.addOption('i', "vectorGenerator", "IndexVectorGenerator "
                          + "class to use", true,
                          "CLASSNAME", "Algorithm Options");
        options.addOption('l', "vectorLength", "length of semantic vectors",
                          true, "INT", "Algorithm Options");
        options.addOption('n', "permutationFunction", "permutation function "
                          + "to use", true,
                          "CLASSNAME", "Algorithm Options");
        options.addOption('p', "usePermutations", "whether to permute " +
                          "index vectors based on word order", true,
                          "BOOL", "Algorithm Options");
        options.addOption('r', "useSparseSemantics", "use a sparse encoding of "
                          + "semantics to save memory", true,
                          "BOOL", "Algorithm Options");
        options.addOption('s', "windowSize", "how many words to consider " +
                          "in each direction", true,
                          "INT", "Algorithm Options");
        options.addOption('S', "saveVectors", "save word-to-IndexVector mapping"
                          + " after processing", true,
                          "FILE", "Algorithm Options");
        options.addOption('L', "loadVectors", "load word-to-IndexVector mapping"
                          + " before processing", true,
                          "FILE", "Algorithm Options");


        // Input Options
        options.addOption('F', "tokenFilter", "filters to apply to the input " +
                          "token stream", true, "FILTER_SPEC", 
                          "Tokenizing Options");
        options.addOption('C', "compoundWords", "a file where each line is a " +
                          "recognized compound word", true, "FILE", 
                          "Tokenizing Options");


        options.addOption('W', "semanticFilter", "exclusive list of word",
                          true, "FILE", "Input Options");


        // Output Options
        options.addOption('I', "interestingTokenList", "list of interesting " +
                          "words", true, "FILE", "Output Options");
        options.addOption('K', "printShiftRankings", "print ranked list of " +
                          "semantic shifts for each interesting word", false, 
                          null, "Output Options");
        options.addOption('R', "savePartitions", "write semantic partitions as " +
                          ".sspace files to disk", false, null, 
                          "Output Options");
        options.addOption('P', "printInterestingTokenShifts", "prints the "
                          + "vectors for each interesting word", false, null, 
                          "Output Options");
        options.addOption('N', "printInterestingTokenNeighbors", "prints the "
                          + "nearest neighbors for each interesting word", true,
                          "INT", "Output Options");
        options.addOption('Z', "printInterestingTokenNeighborComparison",
                          "prints the distances between each of the"
                          + "nearest neighbors for each interesting word", 
                          false, null , "Output Options");


        return options;
    }


    public static void main(String[] args) {
        try {
            FixedDurationTemporalRandomIndexingMain main = 
                new FixedDurationTemporalRandomIndexingMain();
            main.run(args);
        } catch (Throwable t) {
            t.printStackTrace();
        }
    }


    public void run(String[] args) throws Exception {
        if (args.length == 0) {
            usage();
            System.exit(1);
        }
        argOptions.parseOptions(args);
        
        if (argOptions.numPositionalArgs() == 0) {
            throw new IllegalArgumentException("must specify output directory");
        }


        outputDir = new File(argOptions.getPositionalArg(0));
        if (!outputDir.isDirectory()){
            throw new IllegalArgumentException(
                "output directory is not a directory: " + outputDir);
        }


        if (!argOptions.hasOption("timespan")) {
            throw new IllegalArgumentException(
                "must specify a timespan duration for the semantic partition");
        }


        // Get the time span that will be used to group the documents
        String timespanStr = argOptions.getStringOption("timespan");
        TimeSpan timeSpan = new TimeSpan(timespanStr);


        if (argOptions.hasOption('v') || argOptions.hasOption("verbose")) {
            // Enable all the logging at the FINE level for the application
            Logger appRooLogger = Logger.getLogger("edu.ucla.sspace");
            Handler verboseHandler = new ConsoleHandler();
            verboseHandler.setLevel(Level.FINE);
            appRooLogger.addHandler(verboseHandler);
            appRooLogger.setLevel(Level.FINE);
            appRooLogger.setUseParentHandlers(false);
        }
        
        // all the documents are listed in one file, with one document per line
        Iterator<TemporalDocument> docIter = null;
        String fileList = (argOptions.hasOption("fileList"))
            ? argOptions.getStringOption("fileList")
            : null;


        String docFile = (argOptions.hasOption("docFile"))
            ? argOptions.getStringOption("docFile")
            : null;
        if (fileList == null && docFile == null) {
            throw new Error("must specify document sources");
        }


        // Second, determine where the document input sources will be coming
        // from.
        Collection<Iterator<TemporalDocument>> docIters = 
            new LinkedList<Iterator<TemporalDocument>>();


        if (fileList != null) {
            String[] fileNames = fileList.split(",");
            // we have a file that contains the list of all document files we
            // are to process
            for (String s : fileNames) {
                docIters.add(new FileListTemporalDocumentIterator(s));
            }
        }
        if (docFile != null) {
            String[] fileNames = docFile.split(",");
            // all the documents are listed in one file, with one document per
            // line
            for (String s : fileNames) {
                docIters.add(new OneLinePerTemporalDocumentIterator(s));
            }
        }


        // combine all of the document iterators into one iterator.
        docIter = new CombinedIterator<TemporalDocument>(docIters);
        
        int numThreads = Runtime.getRuntime().availableProcessors();
        if (argOptions.hasOption("threads")) {
            numThreads = argOptions.getIntOption("threads");
        }


        overwrite = true;
        if (argOptions.hasOption("overwrite")) {
            overwrite = argOptions.getBooleanOption("overwrite");
        }


        // If the user specified a list of interesting words, load in the set to
        // filter out which semantics shifts are actually tracked
        if (argOptions.hasOption("interestingTokenList")) {
            String fileName = argOptions.getStringOption("interestingTokenList");
            BufferedReader br = new BufferedReader(new FileReader(fileName));
            for (String line = null; (line = br.readLine()) != null; ) {
                for (String s : line.split("\\s+")) {
                    interestingWords.add(s);
                    wordToTemporalSemantics.put(s,new TreeMap<Long,double[]>());
                }
            }
            LOGGER.info("loaded " + interestingWords.size() + 
                        " interesting words");
        }


        // Check whether the incremental .sspace files should be written to disk
        if (argOptions.hasOption("savePartitions"))
            savePartitions = true;
        
        // Check wether each partition should generate a ranked list of words
        // according to their semantic shift
        if (argOptions.hasOption("printShiftRankings"))
            printShiftRankings = true;


        // if the user did not indicate any interesting words, and the .sspace
        // files are not being written, then the program has no output, which is
        // an error
        else if (interestingWords.isEmpty()) {
            throw new IllegalArgumentException(
                "Must specify some form of output as either a non-empty set" +
                "of interesting words and/or writing the semantic partition .sspace"
                + "files to disk");
        }


        // Check whether any interesting-word-output is enabled
        if (argOptions.hasOption("printInterestingTokenNeighbors")) {
            interestingWordNeighbors = 
                argOptions.getIntOption("printInterestingTokenNeighbors");        
        }
        if (argOptions.hasOption("printInterestingTokenShifts")) {
            printInterestingTokenShifts = true;
                LOGGER.info("Recording interesting token shifts");
        }
        if (argOptions.hasOption("printInterestingTokenNeighborComparison")) {
            compareNeighbors = true;
        }
        
        
        // use the System properties in case the user specified them as
        // -Dprop=<val> to the JVM directly.
        Properties props = setupProperties();


        
        FixedDurationTemporalRandomIndexing fdTri = 
            new FixedDurationTemporalRandomIndexing(props); 


        // The user may also specify a limit to the words for which semantics
        // are computed.  If so, set up Random Indexing to not keep semantics
        // for those words.
        if (argOptions.hasOption("semanticFilter")) {
            String fileName = argOptions.getStringOption("semanticFilter");
            BufferedReader br = new BufferedReader(new FileReader(fileName));
            Set<String> wordsToCompute = new HashSet<String>();
            for (String line = null; (line = br.readLine()) != null; ) {
                for (String s : line.split("\\s+")) {
                    wordsToCompute.add(s);
                }
            }
            LOGGER.info("computing semantics for only " + wordsToCompute.size()
                        + " words");


            fdTri.setSemanticFilter(wordsToCompute);
        }


        // Load the word-to-IndexVector mappings if they were specified.
        if (argOptions.hasOption("loadVectors")) {
            String fileName = argOptions.getStringOption("loadVectors");
            LOGGER.info("loading index vectors from " + fileName);
            Map<String,TernaryVector> wordToIndexVector = 
                IndexVectorUtil.load(new File(fileName));
            fdTri.setWordToIndexVector(wordToIndexVector);
        }
        
        String formatName = (argOptions.hasOption("outputFormat"))
            ? argOptions.getStringOption("outputFormat").toUpperCase()
            : "TEXT";
        
        format = SSpaceFormat.valueOf(formatName.toUpperCase());


        parseDocumentsMultiThreaded(fdTri, docIter, timeSpan, numThreads);


        long startTime = System.currentTimeMillis();
        fdTri.processSpace(props);
        long endTime = System.currentTimeMillis();
        LOGGER.info(String.format("processed space in %.3f seconds%n",
                                   ((endTime - startTime) / 1000d)));
        
        // save the word-to-IndexVector mapping if specified to do so
        if (argOptions.hasOption("saveVectors")) {
            String fileName = argOptions.getStringOption("saveVectors");
            LOGGER.info("saving index vectors to " + fileName);
            IndexVectorUtil.save(fdTri.getWordToIndexVector(), 
                                 new File(fileName));
        }
    }


    /**
     * Prints the semantic space to file, inserting the tag into the .sspace
     * file name
     */
    private void printSpace(SemanticSpace sspace, String tag) {
        try {
            String EXT = ".sspace";
            File output = (overwrite)
                ? new File(outputDir, sspace.getSpaceName() + tag + EXT)
                : File.createTempFile(sspace.getSpaceName() + tag, EXT, 
                                      outputDir);
            
            long startTime = System.currentTimeMillis();
            SemanticSpaceIO.save(sspace, output, format);
            long endTime = System.currentTimeMillis();
            verbose("printed space in %.3f seconds%n",
                    ((endTime - startTime) / 1000d));
        } catch (IOException ioe) {
            throw new IOError(ioe);
        }
    }


    /**
     * Adds the temporal semantics for each interesting word using the provided
     * semantic partition.
     *
     * @param currentSemanticPartitionStartTime the start time of the semantic
     *        partition
     */
    private void updateTemporalSemantics(long currentSemanticPartitionStartTime,
                                         SemanticSpace semanticPartition) {
        
        // Pre-allocate the zero vector so that if multiple interesting words
        // are not present in the space, they all point to the same zero
        // semantics
        double[] zeroVector = new double[semanticPartition.getVectorLength()];


        for (String word : interestingWords) {
            // update the vectors
            SortedMap<Long,double[]> temporalSemantics = 
                wordToTemporalSemantics.get(word);
            Vector v = semanticPartition.getVector(word);
            // If the wor was not present in the current partition, then just
            // use the zero vector.  Otherwise, use it distirbution.
            double[] semantics = (v == null) 
                ? zeroVector
                : Vectors.asDouble(v).toArray();


            temporalSemantics.put(currentSemanticPartitionStartTime,
                                  semantics);
        }
    }


    /**
     * Prints the semantic shifts for all the words in the {@link
     * #wordToTemporalSemantics} map, using the {code dateString} for naming the
     * output file with the date of the last semantic partition.
     *
     * @param dateString the date of the last semantic partition.
     */
    private void printSemanticShifts(String dateString) throws IOException {


        LOGGER.fine("Writing semantic shifts for " + dateString);


        // Once we have all the vectors for each word in each sspace,
        // calculate how much the vector has changed.
        for (Map.Entry<String,SortedMap<Long,double[]>> e : 
                 wordToTemporalSemantics.entrySet()) {
            
            String word = e.getKey();
            SortedMap<Long,double[]> timeStampToSemantics = e.getValue();
            Iterator<Map.Entry<Long,double[]>> it = 
                timeStampToSemantics.entrySet().iterator();
            
            PrintWriter pw = new PrintWriter(new File(outputDir,
                word + "." + dateString + ".temporal-changes.txt"));            
            
            // Write the header so we can keep track of what all the columns
            // mean
            pw.println("#time\ttime-delay\tcosineSim\tcosineAngle\tEuclidean"+
                       "\tchange-in-magnitde\tmagnitde\tprev-magnitude");
            Map.Entry<Long,double[]> last = null;
            while (it.hasNext()) {
                Map.Entry<Long,double[]> cur = it.next();
                if (last != null) {
                    long timeDelay = cur.getKey() - last.getKey();
                    double euclideanDist = Similarity.
                            euclideanDistance(cur.getValue(), last.getValue());
                    double cosineSim = Similarity.
                        cosineSimilarity(cur.getValue(), last.getValue());
                    double cosineAngle = Math.acos(cosineSim);
                    
                    double oldMag = getMagnitude(last.getValue());
                    double newMag = getMagnitude(cur.getValue());
                    
                    pw.println(cur.getKey() + "\t" + timeDelay + "\t" + 
                               cosineSim + "\t" + cosineAngle + "\t" + 
                               euclideanDist + "\t" + (newMag - oldMag) + "\t" 
                               + newMag + "\t" + oldMag);
                }
                last = cur;
            }
            pw.close();
        }
    }


    /**
     * Computes the ranking of which words underwent the most dramatic shifts in
     *  the most recent partition and then prints the ranking list of a file.
     *
     * @param dateString the string to use when indiciation which partition is
     *        having its ranking lists printed.  This string becomes a part of
     *        the file name.
     */
    private void printShiftRankings(String dateString, 
                                    long startOfMostRecentPartition,
                                    TimeSpan partitionDuration) 
            throws IOException {


        SortedMultiMap<Double,String> shiftToWord = 
            new TreeMultiMap<Double,String>();


        // Create a second time span than is twice the duration.  We will use
        // this to check whether two partition's vectors were adjacent in the
        // slice by seeing wether the timestamps fall within this duration
        TimeSpan twoPartitions = new TimeSpan(partitionDuration.getYears() * 2,
                                              partitionDuration.getMonths() * 2,
                                              partitionDuration.getWeeks() * 2,
                                              partitionDuration.getDays() * 2,
                                              partitionDuration.getHours() * 2);
        
        // Once we have all the vectors for each word in each sspace,
        // calculate how much the vector has changed.
        for (Map.Entry<String,SortedMap<Long,double[]>> e : 
                 wordToTemporalSemantics.entrySet()) {
            String word = e.getKey();
            SortedMap<Long,double[]> m = e.getValue();
            
            // Skip computing shifts for words without enough partitions
            if (m.size() < 2)
                continue;


            // Get the timestamps as a navigable map so we can identify the last
            // two keys in it more easly.
            NavigableMap<Long,double[]> timestampToVector = 
                (e instanceof NavigableMap) 
                ? (NavigableMap<Long,double[]>)m
                : new TreeMap<Long,double[]>(m);            


            Map.Entry<Long,double[]> mostRecent = timestampToVector.lastEntry();
            // Skip calculating the shift for words who most recent partition
            // was not the same as the most recent partition for TRI
            if (!mostRecent.getKey().equals(startOfMostRecentPartition))
                continue;
            
            Map.Entry<Long,double[]> secondMostRecent = 
                timestampToVector.lowerEntry(mostRecent.getKey());
            // Skip calculating the shift for words where the two most recent
            // partitoins aren't contiguous.  Check for this using the custom
            // time span that covers two partitions
            if (!twoPartitions.insideRange(secondMostRecent.getKey(),
                                           mostRecent.getKey()))
                continue;


            
            // Compute the semantic shift of the two partitions
            shiftToWord.put(Similarity.cosineSimilarity(
                            secondMostRecent.getValue(),
                            mostRecent.getValue()), word);
        }


        PrintWriter pw = new PrintWriter(new File(outputDir,
            "shift-ranks-for." + dateString + ".txt"));
        for (Map.Entry<Double,String> e : shiftToWord.entrySet()) {
            pw.println(e.getKey() + "\t" + e.getValue());
        }        
        pw.close();
    }


    /**
     * Using the {@link wordToTemporalSemantics} set and input parameters,
     * calculates the shift in each word's semantic vector per recorded time
     * period and also prints out the nearest neighbors to each word for each
     * time period.
     *
     * @param dateString the string that encodes the date of the semantic partition.
     *        This will be used as a part of the file name to indicate when the
     *        shifts occurred.
     * @param semanticPartition the current semantic that will be used to identify
     *        the neighbors of each interesting word
     */
    private void printWordNeighbors(String dateString,
                                    SemanticSpace semanticPartition) 
            throws IOException {
        
        LOGGER.info("printing the most similar words for the semantic partition" +
                    " starting at: " + dateString);


        NearestNeighborFinder nnf = 
            new SimpleNearestNeighborFinder(semanticPartition);


        // generate the similarity lists
        for (String toExamine : interestingWords) {
            SortedMultiMap<Double,String> mostSimilar = 
                nnf.getMostSimilar(toExamine, interestingWordNeighbors);


            if (mostSimilar != null) {
                File neighborFile = 
                    new File(outputDir, toExamine + "-" + dateString + ".txt");
                neighborFile.createNewFile(); // iff it doesn't already exist
                
                File neighborComparisonFile = new File(outputDir,
                    toExamine + "_neighbor-comparisons_" + dateString + ".txt");
                neighborComparisonFile.createNewFile(); // see above comment
                    


                PrintWriter pw = new PrintWriter(neighborFile);
                for (String similar : mostSimilar.values()) {
                    pw.println(similar);
                }
                pw.close();


                if (compareNeighbors) {                                
                    // Print an N x N comparison between all of the most similar
                    // words.  This gives an indication of whether any of the
                    // words might be outliers.
                    writeNeighborComparison(neighborComparisonFile, 
                                            mostSimilar, semanticPartition);
                }
            }
        }    
    }


    /**
     * Write to the {@code neighborFile} the previously computed results of an N
     * x N similarity comparison of all the neighbors for word.
     *
     * @param neighborFile the file to which the results of the comparison
     *        should be written
     * @param mostSimilar a mapping from the similarity value to the neighbors
     *        of an interesting word that have the value.
     * @param sspace the semantic
     */
    private static void 
        writeNeighborComparison(File neighborFile, 
                                MultiMap<Double,String> mostSimilar, 
                                SemanticSpace sspace) throws IOException {        


        PrintWriter pw = new PrintWriter(neighborFile);
        
        // print out the header so we know the comparison order
        StringBuffer sb = new StringBuffer(mostSimilar.size() * 10);
        for (Iterator<String> it = mostSimilar.values().iterator();
             it.hasNext();) {
            sb.append(it.next());
            if (it.hasNext()) 
                sb.append(" ");
        }
        pw.println(sb.toString());
        
        // create an N x N table of how similar all the words are to each
        // other.
        for (String word : mostSimilar.values()) {
            sb = new StringBuffer(mostSimilar.size() * 10);
            sb.append(word).append(" ");
            
            // loop through all of the words
            for (String other : mostSimilar.values()) {
                // determine how similar the two words are
                double similarity = Similarity.cosineSimilarity(
                        sspace.getVector(word),
                        sspace.getVector(other));
                sb.append(similarity).append(" ");
            }
            pw.println(sb.toString());
        }
        
        pw.close();
    }


    /**
     * Returns the {@code Properties} used to set up the semantic space.
     */
    protected Properties setupProperties() {
        Properties props = System.getProperties();


        // Use the command line options to set the desired properites in the
        // constructor.  Use the system properties in case these properties were
        // set using -Dprop=<value>
        if (argOptions.hasOption("usePermutations")) {
            props.setProperty(
                OrderedTemporalRandomIndexing.USE_PERMUTATIONS_PROPERTY,
                              argOptions.getStringOption("usePermutations"));
        }


        if (argOptions.hasOption("permutationFunction")) {
            props.setProperty(
                OrderedTemporalRandomIndexing.PERMUTATION_FUNCTION_PROPERTY,
                             argOptions.getStringOption("permutationFunction"));
        }


        if (argOptions.hasOption("windowSize")) {
            props.setProperty(
                OrderedTemporalRandomIndexing.WINDOW_SIZE_PROPERTY,
                             argOptions.getStringOption("windowSize"));
        }


        if (argOptions.hasOption("vectorLength")) {
            props.setProperty(
                OrderedTemporalRandomIndexing.VECTOR_LENGTH_PROPERTY,
                             argOptions.getStringOption("vectorLength"));
        }
        
        if (argOptions.hasOption("useSparseSemantics")) {
            props.setProperty(
                OrderedTemporalRandomIndexing.USE_SPARSE_SEMANTICS_PROPERTY,
                              argOptions.getStringOption("useSparseSemantics"));
        }


        if (argOptions.hasOption("partitionDuration")) {
            props.setProperty(FixedDurationTemporalRandomIndexing.
                              SEMANTIC_PARTITION_DURATION_PROPERTY,
                              argOptions.getStringOption("partitionDuration"));
        }
        
        // Initialize the IteratorFactory to tokenize the documents according to
        // the specified configuration (e.g. filtering, compound words)
        if (argOptions.hasOption("tokenFilter")) {
            props.setProperty(IteratorFactory.TOKEN_FILTER_PROPERTY,
                              argOptions.getStringOption("tokenFilter"));            
        }


        if (argOptions.hasOption("compoundTokens")) {
            props.setProperty(IteratorFactory.COMPOUND_TOKENS_FILE_PROPERTY,
                              argOptions.getStringOption("compoundTokens"));
        }


        return props;
    }




    private static double getMagnitude(double[] arr) {
        double mag = 0d;
        for (double d : arr) {
            if (d != 0)
                mag += d*d;
        }
        return Math.sqrt(mag);
    }


    /**
     * Processes every document in {@code docIter} by calling {@code
     * processDocument} on {@code fdTri} from {@code numThreads} worker threads,
     * and blocks until all of those threads have finished.
     *
     * <p>The time stamp of the first document counted becomes the start time
     * of the current semantic partition.  Whenever {@code
     * timeSpan.insideRange} reports that a worker's document is not inside the
     * range beginning at the current start time, the worker enqueues that
     * document's time stamp and waits on a barrier shared by all workers.  When
     * the barrier trips, its action writes the outputs that are enabled for
     * the partition and then sets the start time to the smallest enqueued time
     * stamp before the workers are released.
     *
     * @param fdTri the space to build
     * @param docIter an iterator over all the documents to process; it is
     *        accessed from every worker thread
     * @param timeSpan the duration used to decide whether a document still
     *        belongs to the current partition
     * @param numThreads the number of worker threads to use, which is also the
     *        number of parties of the barrier
     *
     * @throws IOException declared by the signature; I/O failures inside the
     *         worker threads and the barrier action are rethrown as {@code
     *         IOError}
     * @throws InterruptedException if the calling thread is interrupted while
     *         waiting for the worker threads to finish
     */
    protected void parseDocumentsMultiThreaded(
        final FixedDurationTemporalRandomIndexing fdTri, 
        final Iterator<TemporalDocument> docIter,
        final TimeSpan timeSpan, int numThreads)
            throws IOException, InterruptedException {


        Collection<Thread> processingThreads = new LinkedList<Thread>();


        // the number of documents handed out so far; also used to number the
        // documents for logging
        final AtomicInteger count = new AtomicInteger(0);


        // the time stamp at which the current semantic partition starts
        final AtomicLong curSSpaceStartTime = new AtomicLong();
        // NOTE(review): calendarLock is not referenced again in this method
        final Object calendarLock = new Object();
        // Used to build the date suffix for this partition's output.
        // NOTE(review): in SimpleDateFormat "hh" is the 12-hour am/pm hour and
        // "HH" the 24-hour one -- confirm that "hh" is intended.
        final DateFormat df = new SimpleDateFormat("yyyy_MM_ww_dd_hh");


        // NOTE(review): lastWriteTime is not referenced again in this method
        final AtomicLong lastWriteTime = new AtomicLong();
        
        // barrier for setting up the initial time stamp based on the first
        // document processed
        final AtomicBoolean startBarrier = new AtomicBoolean(false);


        // Before a Thread blocks waiting for s-space serialization, it enqueues
        // the time for its next document (outside the time-span).  These times
        // are used to select the start time for the next s-space.
        final Queue<Long> futureStartTimes = new ConcurrentLinkedQueue<Long>();


        // final variables necessary due to the anonymous inner class
        final boolean writeSemanticPartitions = savePartitions;
        final boolean writeSemanticShifts = printInterestingTokenShifts;
        // NOTE(review): writeInterestingWordNeighbors is not referenced again
        // in this method; the barrier action below tests
        // interestingWordNeighbors > 0 directly.
        final boolean writeInterestingWordNeighbors = 
            interestingWordNeighbors > 0;
        final boolean writeShiftRankings = printShiftRankings;


        /**
         * A runnable that writes the enabled outputs for the current semantic
         * partition, labelled with the time at which the partition started,
         * and then advances the partition start time.  It is installed below
         * as the action of the barrier on which the workers wait.
         */
        Runnable serializeTimeSpan = new Runnable() {
                public void run() {
                    // format the partition's start time for use in output names
                    Calendar c = Calendar.getInstance();
                    c.setTimeInMillis(curSSpaceStartTime.get());
                    String dateString = df.format(c.getTime());
                                    
                    try {


                        // Save the s-space only when required to.  This
                        // operation can be very slow due to I/O requirements,
                        // and is not mandatory when computing the shifts
                        if (writeSemanticPartitions) {
                            LOGGER.info("writing semantic partition starting " +
                                        "at: " + dateString);
                            // save the current contents of the semantic space
                            printSpace(fdTri, "-" + dateString);
                        }


                        // Add the semantics from the current semantic partition
                        // for each of the interesting words
                        updateTemporalSemantics(curSSpaceStartTime.get(),
                                                fdTri);
                        if (writeSemanticShifts) 
                            printSemanticShifts(dateString);


                        if (writeShiftRankings) 
                            printShiftRankings(dateString, 
                                               curSSpaceStartTime.get(),
                                               timeSpan);


                        // NOTE(review): the original note here was cut off
                        // after "since the FD-TRI implementation resets its
                        // semantics after every"; presumably it explains why
                        // the neighbors are printed for each partition --
                        // confirm.
                        if (interestingWordNeighbors > 0) 
                            printWordNeighbors(dateString, fdTri);


                    } catch (IOException ioe) {
                        // rethrow unchecked, since run() cannot declare a
                        // checked exception
                        throw new IOError(ioe);
                    }


                    // Pick the earliest start time available as the new
                    // starting time for the s-space.  Every waiting worker
                    // enqueued a time before calling await(), so the queue is
                    // expected to be non-empty here.
                    assert futureStartTimes.size() > 0;
                    Long ssStart = new TreeSet<Long>(futureStartTimes).first();
                    futureStartTimes.clear();


                    // lastly, update the partition start with the new time
                    curSSpaceStartTime.set(ssStart);
                }
            };
            
        // Barrier for the document processing threads.  When a thread's next
        // document is outside of the time range, it waits on this barrier; once
        // numThreads threads are waiting, serializeTimeSpan runs before any of
        // them is released.
        final CyclicBarrier exceededTimeSpanBarrier =
            new CyclicBarrier(numThreads, serializeTimeSpan);
                    
        
        for (int i = 0; i < numThreads; ++i) {


            Thread processingThread = new Thread() {
                    
                    public void run() {
                        // repeatedly try to process documents while some still
                        // remain
                        // NOTE(review): hasNext() and next() are separate calls
                        // made from several threads, so this relies on docIter
                        // being safe for concurrent use -- confirm.
                        while (docIter.hasNext()) {
                            
                            TemporalDocument doc = docIter.next();
                            int docNumber = count.incrementAndGet();
                            long docTime = doc.timeStamp();


                            // special case for first document: its time stamp
                            // starts the first partition
                            if (docNumber == 1) {
                                curSSpaceStartTime.set(docTime);
                                startBarrier.set(true);
                            }
                            
                            // Spin until the Thread with the first document
                            // sets the initial starting document time.  Note
                            // that we spin here instead of block, because it
                            // is expected that another thread will immediately
                            // set this and so it will be a quick no-op
                            while (startBarrier.get() == false)
                                ;


                            // Check whether the time for this document would
                            // exceed the maximum time span for any TRI
                            // partition.  This is a loop because the new start
                            // time chosen by the barrier action may come from
                            // another thread's earlier document, in which case
                            // this document can still be outside the new range
                            // and this thread must wait again.
                            while (!timeSpan.insideRange(
                                   curSSpaceStartTime.get(), docTime)) {
                                try {
                                    // notify the barrier that this Thread is
                                    // now processing a document in the next
                                    // time span and so the serialization thread
                                    // should write the .sspace to disk.  In
                                    // addition, enqueue the time for this
                                    // document so the serialization thread can
                                    // reset the correct s-space start time
                                    // NOTE(review): the barrier has numThreads
                                    // parties, but a worker leaves its loop
                                    // once docIter is exhausted; a worker that
                                    // waits here after others have exited would
                                    // appear to wait indefinitely -- confirm
                                    // how the end of the input is handled.
                                    futureStartTimes.offer(docTime);
                                    exceededTimeSpanBarrier.await();
                                } catch (InterruptedException ex) {
                                    // stop this worker; the current document
                                    // is not processed
                                    return;
                                } catch (BrokenBarrierException ex) {
                                    // stop this worker; the current document
                                    // is not processed
                                    return;
                                }
                            }


                            try {
                                fdTri.processDocument(doc.reader());
                            } catch (IOException ioe) {
                                // rethrow unchecked, since run() cannot declare
                                // a checked exception
                                throw new IOError(ioe);
                            }
                            LOGGER.fine("parsed document #" + docNumber);
                        }
                    }
                };
            processingThreads.add(processingThread);
        }


        // wall-clock start, used only for the summary message below
        long threadStart = System.currentTimeMillis();
        
        // start all the threads processing
        for (Thread t : processingThreads)
            t.start();


        verbose("Beginning processing using %d threads", numThreads);


        // wait until all the documents have been parsed
        for (Thread t : processingThreads)
            t.join();


        verbose("parsed %d document in %.3f total seconds)%n",
                count.get(),
                ((System.currentTimeMillis() - threadStart) / 1000d));
    }
        
    /**
     * Prints the instructions on how to execute this program to standard out.
     */
    protected void usage() {
         System.out.println(
             "usage: java FixedDurationTemporalRandomIndexingMain [options] " +
            "<output-dir>\n\n" + 
            argOptions.prettyPrint() +


            "\nFixed-Duration TRI provides four main output options:\n\n" +


            "  1) Outputting each semantic partition as a separate .sspace file.  "+
            "Each file\n     is named using the yyyy_MM_ww_dd_hh format to " +
            "indicate it start date.\n     This is the most expensive of the " +
            "operations due to I/O overhead.\n\n" +
            
            "  The remaining options require the use of the -I " + 
            "--interestingTokenList option to\n  specify a set of word for use"+
            " in tracking temporal changes.\n\n  2) For each of the interesting"
            + "words, -P, --printInterestingTokenShifts will track\n" +
            "     the semantics" +
            " through time and report the semantic shift along with other\n" +
            "     distance statistics.\n\n"  +
            "  3) For each of the interesting words, -N, " +
            "--printInterestingTokenNeighbors\n     will print the nearest " +
            "neighbor for each in the semantic space.  The\n     number " +
            "of neighbors to print should be specified.\n\n" +


            "  4) For each of the interesting words, generate the list of " + 
            "similar\n     neighbors using the --printInterestingTokenNeighbors"
            + " and then compare\n     those neighbors with each other using " +
            "the\n     --printInterestingTokenNeighborComparison option.  " +
            "This creates a file\n     with the pair-wise cosine similarities "+
            "for all neighbors.  Note that this\n     option requires both " +
            "flags to be specified.\n\n" +


            "Semantic filters limit the set of tokens for which the " +
            "semantics are kept.\nThis limits the potential memory overhead " +
            "for calculating semantics for a\nlarge set of words."


            + "\n\n" + OptionDescriptions.COMPOUND_WORDS_DESCRIPTION
            + "\n\n" + OptionDescriptions.TOKEN_FILTER_DESCRIPTION
            + "\n\n" + OptionDescriptions.FILE_FORMAT_DESCRIPTION
            + "\n\n" + OptionDescriptions.HELP_DESCRIPTION);            
    }


    protected void verbose(String msg) {        
        LOGGER.fine(msg);
    }


    protected void verbose(String format, Object... args) {
        if (LOGGER.isLoggable(Level.FINE)) 
            LOGGER.fine(String.format(format, args));
    }
}
Source Code of edu.ucla.sspace.mains.FixedDurationTemporalRandomIndexingMain

Related Classes of edu.ucla.sspace.mains.FixedDurationTemporalRandomIndexingMain