Package edu.ucla.sspace.evaluation

Source Code of edu.ucla.sspace.evaluation.FinkelsteinEtAl353WordSimilarityEvaluation

/*
* Copyright 2009 David Jurgens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.ucla.sspace.evaluation;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOError;
import java.io.IOException;

import java.util.Collection;
import java.util.LinkedList;

import edu.ucla.sspace.common.SemanticSpace;

/**
* A collection of human similarity judgements of word pairs gathered by
* Finkelstein et al.  See <a
* href="http://www.cs.technion.ac.il/~gabr/resources/data/wordsim353/">their
* website</a> for access to the test data.  See the following reference for
* full details on how the data was gathered.
*
* <ul>
*
*   <li style="font-family:Garamond, Georgia, serif"> Lev Finkelstein, Evgeniy
*   Gabrilovich, Yossi Matias, Ehud Rivlin, Zach Solan, Gadi Wolfman, and Eytan
*   Ruppin, "Placing Search in Context: The Concept Revisited", ACM
*   Transactions on Information Systems, 20(1):116-131, January 2002.
*   Available <a
*   href="http://www.cs.technion.ac.il/~gabr/papers/tois_context.pdf">here</a>.
*
* </ul>
*/
public class FinkelsteinEtAl353WordSimilarityEvaluation
    implements WordSimilarityEvaluation {

    /**
     * A collection of human judgements on word relatedness
     */
    private final Collection<WordSimilarity> pairs;

    /**
     * The name of the data file for this test
     */
    private final String dataFileName;

    /**
     * Constructs this word similarity evaluation test using the WS353 data file
     * refered to by the provided name.
     */
    public FinkelsteinEtAl353WordSimilarityEvaluation(String word353fileName) {
        this(new File(word353fileName));
    }

    /**
     * Constructs this word similarity evaluation test using the provide WS353
     * data file.
     */
    public FinkelsteinEtAl353WordSimilarityEvaluation(File word353file) {
        pairs = parse(word353file);
        dataFileName = word353file.getName();
    }

    /**
     * Parses the WordSimilarity353 file and returns the set of judgements.
     */
    private Collection<WordSimilarity> parse(File word353file) {
        // the ws353 data set comes in two formats, a comma-separated format and
        // a tab-separated format.  Support both by checking the file name
        // suffix.
        String delimeter = (word353file.getName().endsWith(".csv"))
            ? "," : "\\s";

        Collection<WordSimilarity> pairs = new LinkedList<WordSimilarity>();
               
        try {
            BufferedReader br = new BufferedReader(new FileReader(word353file));
            // skip the first line
            br.readLine();
            for (String line = null; (line = br.readLine()) != null; ) {

                String[] wordsAndNum = line.split(delimeter);
                if (wordsAndNum.length != 3) {
                    throw new Error("Unexpected line formatting: " + line);
                }
                pairs.add(new SimpleWordSimilarity(
                          wordsAndNum[0], wordsAndNum[1],
                          Double.parseDouble(wordsAndNum[2])));
            }
        } catch (IOException ioe) {
            // rethrow as an IOE is fatal evaluation
            throw new IOError(ioe);
        }
           
        return pairs;
    }

    /**
     * {@inheritDoc}
     */
    public Collection<WordSimilarity> getPairs() {
        return pairs;
    }

    /**
     * {@inheritDoc}
     */
    public double getMostSimilarValue() {
        return 10d;
    }
   
    /**
     * {@inheritDoc}
     */
    public double  getLeastSimilarValue() {
        return 0d;
    }

    public String toString() {
        return "Finkelstein et al. Word Similarity Test [" + dataFileName + "]";
    }

}

TOP

Related Classes of edu.ucla.sspace.evaluation.FinkelsteinEtAl353WordSimilarityEvaluation

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.