Source Code of edu.stanford.nlp.classify.RVFDataset

package edu.stanford.nlp.classify;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Random;
import java.util.Set;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.Datum;
import edu.stanford.nlp.ling.RVFDatum;
import edu.stanford.nlp.math.ArrayMath;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.HashIndex;

/**
* An interfacing class for {@link ClassifierFactory} that incrementally builds
* a more memory-efficient representation of a {@link List} of {@link RVFDatum}
* objects for the purposes of training a {@link Classifier} with a
* {@link ClassifierFactory}.
*
* @author Jenny Finkel (jrfinkel@stanford.edu)
* @author Rajat Raina (added methods to record data sources and ids)
* @author Anna Rafferty (various refactoring with GeneralDataset/Dataset)
* @author Sarah Spikes (sdspikes@cs.stanford.edu) (Templatization)
*
* @param <L> The type of the labels in the Dataset
* @param <F> The type of the features in the Dataset
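 *
 * <p>A minimal usage sketch (illustrative only; the String label/feature
 * types and the classifier factory mirror the {@code main} method below):
 *
 * <pre>{@code
 * RVFDataset<String, String> dataset = new RVFDataset<String, String>();
 * Counter<String> features = new ClassicCounter<String>();
 * features.incrementCount("fever", 3.5);
 * features.incrementCount("cough", 1.1);
 * dataset.add(new RVFDatum<String, String>(features, "cold"));
 * LinearClassifier<String, String> classifier =
 *     new LinearClassifierFactory<String, String>().trainClassifier(dataset);
 * }</pre>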
*/
public class RVFDataset<L, F> extends GeneralDataset<L, F> { // implements Iterable<RVFDatum<L, F>>, Serializable

  private static final long serialVersionUID = -3841757837680266182L;

  private double[][] values;  // [datumIndex][i] values of features listed in int[][] data
  private double[] minValues; // min values of all features, for normalization
  private double[] maxValues; // max values of all features, for normalization
  double[] means;  // means of features, used for Gaussian scaling
  double[] stdevs; // stdevs of features, used for Gaussian scaling

  /*
   * Store source and id of each datum; optional, and not fully supported.
   */
  private ArrayList<Pair<String, String>> sourcesAndIds;

  public RVFDataset() {
    this(10);
  }

  public RVFDataset(int numDatums, Index<F> featureIndex, Index<L> labelIndex) {
    this(numDatums);
    this.labelIndex = labelIndex;
    this.featureIndex = featureIndex;
  }

  public RVFDataset(Index<F> featureIndex, Index<L> labelIndex) {
    this(10);
    this.labelIndex = labelIndex;
    this.featureIndex = featureIndex;
  }

  public RVFDataset(int numDatums) {
    initialize(numDatums);
  }

  /**
   * Constructor that fully specifies a Dataset. Needed for
   * MulticlassDataset.
   */
  public RVFDataset(Index<L> labelIndex, int[] labels, Index<F> featureIndex, int[][] data, double[][] values) {
    this.labelIndex = labelIndex;
    this.labels = labels;
    this.featureIndex = featureIndex;
    this.data = data;
    this.values = values;
    this.size = labels.length;
  }

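  /**
   * Splits this dataset into a dev set, consisting of the first
   * {@code percentDev} fraction of the datums, and a train set holding the
   * remainder. Returns the pair (train, dev).
   */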
  @Override
  public Pair<GeneralDataset<L, F>, GeneralDataset<L, F>> split(double percentDev) {
    int devSize = (int) (percentDev * size());
    int trainSize = size() - devSize;

    int[][] devData = new int[devSize][];
    double[][] devValues = new double[devSize][];
    int[] devLabels = new int[devSize];

    int[][] trainData = new int[trainSize][];
    double[][] trainValues = new double[trainSize][];
    int[] trainLabels = new int[trainSize];

    System.arraycopy(data, 0, devData, 0, devSize);
    System.arraycopy(values, 0, devValues, 0, devSize);
    System.arraycopy(labels, 0, devLabels, 0, devSize);

    System.arraycopy(data, devSize, trainData, 0, trainSize);
    System.arraycopy(values, devSize, trainValues, 0, trainSize);
    System.arraycopy(labels, devSize, trainLabels, 0, trainSize);

    RVFDataset<L, F> dev = new RVFDataset<L, F>(labelIndex, devLabels, featureIndex, devData, devValues);
    RVFDataset<L, F> train = new RVFDataset<L, F>(labelIndex, trainLabels, featureIndex, trainData, trainValues);

    return new Pair<GeneralDataset<L, F>, GeneralDataset<L, F>>(train, dev);

  }

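  /**
   * Standardizes each feature to zero mean and unit sample variance across
   * the dataset ({@code value' = (value - mean) / stdev}). Features absent
   * from a datum are treated as having value 0 when computing the mean and
   * standard deviation; features with zero standard deviation are left
   * unchanged.
   */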
  public void scaleFeaturesGaussian() {
    means = new double[this.numFeatures()];
    Arrays.fill(means, 0);

    for (int i = 0; i < this.size(); i++) {
      for (int j = 0; j < data[i].length; j++)
        means[data[i][j]] += values[i][j];
    }
    ArrayMath.multiplyInPlace(means, 1.0 / this.size());

    stdevs = new double[this.numFeatures()];
    Arrays.fill(stdevs, 0);
    double[] deltaX = new double[this.numFeatures()];

    for (int i = 0; i < this.size(); i++) {
      for (int f = 0; f < this.numFeatures(); f++)
        deltaX[f] = -means[f];
      for (int j = 0; j < data[i].length; j++)
        deltaX[data[i][j]] += values[i][j];
      for (int f = 0; f < this.numFeatures(); f++) {
        stdevs[f] += deltaX[f] * deltaX[f];
      }
    }
    for (int f = 0; f < this.numFeatures(); f++) {
      stdevs[f] /= (this.size() - 1);
      stdevs[f] = Math.sqrt(stdevs[f]);
    }
    for (int i = 0; i < this.size(); i++) {
      for (int j = 0; j < data[i].length; j++) {
        int fID = data[i][j];
        if (stdevs[fID] != 0)
          values[i][j] = (values[i][j] - means[fID]) / stdevs[fID];
      }
    }

  }

  /**
   * Scales feature values linearly so that each feature value lies between
   * 0 and 1.
   */
  public void scaleFeatures() {
    // TODO: should also implement a method that scales the features using the
    // mean and std.
    minValues = new double[featureIndex.size()];
    maxValues = new double[featureIndex.size()];
    Arrays.fill(minValues, Double.POSITIVE_INFINITY);
    Arrays.fill(maxValues, Double.NEGATIVE_INFINITY);

    // first identify the max and min values for each feature.
    // System.out.printf("number of datums: %d dataset size: %d\n",data.length,size());
    for (int i = 0; i < size(); i++) {
      // System.out.printf("datum %d length %d\n", i,data[i].length);
      for (int j = 0; j < data[i].length; j++) {
        int f = data[i][j];
        if (values[i][j] < minValues[f])
          minValues[f] = values[i][j];
        if (values[i][j] > maxValues[f])
          maxValues[f] = values[i][j];
      }
    }

    for (int f = 0; f < featureIndex.size(); f++) {
      if (minValues[f] == Double.POSITIVE_INFINITY)
        throw new RuntimeException("minValue for feature " + f + " not assigned. ");
      if (maxValues[f] == Double.NEGATIVE_INFINITY)
        throw new RuntimeException("maxValue for feature " + f + " not assigned.");
    }

    // now scale each value such that it's between 0 and 1.
    for (int i = 0; i < size(); i++) {
      for (int j = 0; j < data[i].length; j++) {
        int f = data[i][j];
        if (minValues[f] != maxValues[f])// the equality can happen for binary
                                         // features which always take the value
                                         // of 1.0
          values[i][j] = (values[i][j] - minValues[f]) / (maxValues[f] - minValues[f]);
      }
    }

    /*
    for(int f = 0; f < featureIndex.size(); f++){
      if(minValues[f] == maxValues[f])
        throw new RuntimeException("minValue for feature "+f+" is equal to maxValue:"+minValues[f]);
    }
    */
  }

  /**
   * Checks if the dataset has any unbounded values. Always good to use this
   * before training a model on the dataset. This way, one can avoid seeing the
   * infamous 4's that get printed by the QuasiNewton Method when NaNs exist in
   * the data! -Ramesh
   */
  public void ensureRealValues() {
    double[][] values = getValuesArray();
    int[][] data = getDataArray();
    for (int i = 0; i < size(); i++) {
      for (int j = 0; j < values[i].length; j++) {
        if (Double.isNaN(values[i][j])) {
          int fID = data[i][j];
          F feature = featureIndex.get(fID);
          throw new RuntimeException("datum " + i + " has a NaN value for feature:" + feature);
        }
        if (Double.isInfinite(values[i][j])) {
          int fID = data[i][j];
          F feature = featureIndex.get(fID);
          throw new RuntimeException("datum " + i + " has infinite value for feature:" + feature);
        }
      }
    }
  }

  /**
   * Scales the values of each feature in each datum linearly using the min
   * and max values found in the training set. NOTE1: Not guaranteed to be
   * between 0 and 1 for a test datum. NOTE2: Also filters out features from
   * each datum that are not seen at training time.
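   *
   * A typical usage sketch (the dataset names are illustrative; the
   * per-feature min and max values are computed from this training dataset
   * first if they have not been already):
   * <pre>{@code RVFDataset<String, String> scaledTest = trainSet.scaleDataset(testSet);}</pre>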
   *
   * @param dataset The dataset to scale
   * @return A new, scaled dataset
   */
  public RVFDataset<L, F> scaleDataset(RVFDataset<L, F> dataset) {
    RVFDataset<L, F> newDataset = new RVFDataset<L, F>(this.featureIndex, this.labelIndex);
    for (int i = 0; i < dataset.size(); i++) {
      RVFDatum<L, F> datum = dataset.getDatum(i);
      newDataset.add(scaleDatum(datum));
    }
    return newDataset;
  }

  /**
   * Scales the values of each feature linearly using the min and max values
   * found in the training set. NOTE1: Not guaranteed to be between 0 and 1 for
   * a test datum. NOTE2: Also filters out features from the datum that are not
   * seen at training time.
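   *
   * The applied transform, for each feature seen at training time, is a
   * min-max rescaling (features whose min equals max are left unchanged):
   * <pre>{@code value' = (value - min) / (max - min)}</pre>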
   *
   * @param datum The datum to scale
   * @return A new, scaled datum
   */
  public RVFDatum<L, F> scaleDatum(RVFDatum<L, F> datum) {
    // scale this dataset before scaling the datum
    if (minValues == null || maxValues == null)
      scaleFeatures();
    Counter<F> scaledFeatures = new ClassicCounter<F>();
    for (F feature : datum.asFeatures()) {
      int fID = this.featureIndex.indexOf(feature);
      if (fID >= 0) {
        double oldVal = datum.asFeaturesCounter().getCount(feature);
        double newVal;
        if (minValues[fID] != maxValues[fID])
          newVal = (oldVal - minValues[fID]) / (maxValues[fID] - minValues[fID]);
        else
          newVal = oldVal;
        scaledFeatures.incrementCount(feature, newVal);
      }
    }
    return new RVFDatum<L, F>(scaledFeatures, datum.label());
  }

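  /**
   * Gaussian-scales each datum in the given dataset using the feature means
   * and standard deviations computed from this (training) dataset; see
   * {@link #scaleDatumGaussian(RVFDatum)}.
   */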
  public RVFDataset<L, F> scaleDatasetGaussian(RVFDataset<L, F> dataset) {
    RVFDataset<L, F> newDataset = new RVFDataset<L, F>(this.featureIndex, this.labelIndex);
    for (int i = 0; i < dataset.size(); i++) {
      RVFDatum<L, F> datum = dataset.getDatum(i);
      newDataset.add(scaleDatumGaussian(datum));
    }
    return newDataset;
  }

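  /**
   * Gaussian-scales a single datum using this dataset's feature means and
   * standard deviations, computing them first via
   * {@link #scaleFeaturesGaussian()} if necessary. Features not seen at
   * training time are dropped from the returned datum.
   */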
  public RVFDatum<L, F> scaleDatumGaussian(RVFDatum<L, F> datum) {
    // scale this dataset before scaling the datum
    if (means == null || stdevs == null)
      scaleFeaturesGaussian();
    Counter<F> scaledFeatures = new ClassicCounter<F>();
    for (F feature : datum.asFeatures()) {
      int fID = this.featureIndex.indexOf(feature);
      if (fID >= 0) {
        double oldVal = datum.asFeaturesCounter().getCount(feature);
        double newVal;
        if (stdevs[fID] != 0)
          newVal = (oldVal - means[fID]) / stdevs[fID];
        else
          newVal = oldVal;
        scaledFeatures.incrementCount(feature, newVal);
      }
    }
    return new RVFDatum<L, F>(scaledFeatures, datum.label());
  }

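  /**
   * Splits this dataset into a dev set, consisting of the datums in the
   * range {@code [start, end)}, and a train set holding the remainder.
   * Returns the pair (train, dev).
   */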
  @Override
  public Pair<GeneralDataset<L, F>, GeneralDataset<L, F>> split(int start, int end) {
    int devSize = end - start;
    int trainSize = size() - devSize;

    int[][] devData = new int[devSize][];
    double[][] devValues = new double[devSize][];
    int[] devLabels = new int[devSize];

    int[][] trainData = new int[trainSize][];
    double[][] trainValues = new double[trainSize][];
    int[] trainLabels = new int[trainSize];

    System.arraycopy(data, start, devData, 0, devSize);
    System.arraycopy(values, start, devValues, 0, devSize);
    System.arraycopy(labels, start, devLabels, 0, devSize);

    System.arraycopy(data, 0, trainData, 0, start);
    System.arraycopy(data, end, trainData, start, size() - end);
    System.arraycopy(values, 0, trainValues, 0, start);
    System.arraycopy(values, end, trainValues, start, size() - end);
    System.arraycopy(labels, 0, trainLabels, 0, start);
    System.arraycopy(labels, end, trainLabels, start, size() - end);

    GeneralDataset<L, F> dev = new RVFDataset<L, F>(labelIndex, devLabels, featureIndex, devData, devValues);
    GeneralDataset<L, F> train = new RVFDataset<L, F>(labelIndex, trainLabels, featureIndex, trainData, trainValues);

    return new Pair<GeneralDataset<L, F>, GeneralDataset<L, F>>(train, dev);

  }

  // TODO: Check that this does what we want for Datum other than RVFDatum
  @Override
  public void add(Datum<L, F> d) {
    if (d instanceof RVFDatum<?, ?>) {
      addLabel(d.label());
      addFeatures(((RVFDatum<L, F>) d).asFeaturesCounter());
      size++;
    } else {
      addLabel(d.label());
      addFeatures(Counters.asCounter(d.asFeatures()));
      size++;
    }
  }

  public void add(Datum<L, F> d, String src, String id) {
    if (d instanceof RVFDatum<?, ?>) {
      addLabel(d.label());
      addFeatures(((RVFDatum<L, F>) d).asFeaturesCounter());
      addSourceAndId(src, id);
      size++;
    } else {
      addLabel(d.label());
      addFeatures(Counters.asCounter(d.asFeatures()));
      addSourceAndId(src, id);
      size++;
    }
  }

  // TODO shouldn't have both this and getRVFDatum
  @Override
  public RVFDatum<L, F> getDatum(int index) {
    return getRVFDatum(index);
  }

  /**
   * Returns the datum at the given index. Note that this returns a new
   * RVFDatum object, not the original RVFDatum that was added to the dataset.
   *
   * @return The index-th datum
   */
  @Override
  public RVFDatum<L, F> getRVFDatum(int index) {
    ClassicCounter<F> c = new ClassicCounter<F>();
    for (int i = 0; i < data[index].length; i++) {
      c.incrementCount(featureIndex.get(data[index][i]), values[index][i]);
    }
    return new RVFDatum<L, F>(c, labelIndex.get(labels[index]));
  }

  public String getRVFDatumSource(int index) {
    return sourcesAndIds.get(index).first();
  }

  public String getRVFDatumId(int index) {
    return sourcesAndIds.get(index).second();
  }

  private void addSourceAndId(String src, String id) {
    sourcesAndIds.add(new Pair<String, String>(src, id));
  }

  private void addLabel(L label) {
    if (labels.length == size) {
      int[] newLabels = new int[size * 2];
      System.arraycopy(labels, 0, newLabels, 0, size);
      labels = newLabels;
    }
    labels[size] = labelIndex.addToIndex(label);
  }

  private void addFeatures(Counter<F> features) {
    if (data.length == size) {
      int[][] newData = new int[size * 2][];
      double[][] newValues = new double[size * 2][];
      System.arraycopy(data, 0, newData, 0, size);
      System.arraycopy(values, 0, newValues, 0, size);
      data = newData;
      values = newValues;
    }

    final List<F> featureNames = new ArrayList<F>(features.keySet());
    final int nFeatures = featureNames.size();
    data[size] = new int[nFeatures];
    values[size] = new double[nFeatures];
    for (int i = 0; i < nFeatures; ++i) {
      F feature = featureNames.get(i);
      int fID = featureIndex.addToIndex(feature);
      if (fID >= 0) {
        data[size][i] = fID;
        values[size][i] = features.getCount(feature);
      } else {
        // Usually a feature present at test but not training time.
        assert featureIndex.isLocked() : "Could not add feature to index: " + feature;
      }
    }
  }

  /**
   * Resets the Dataset so that it is empty and ready to collect data.
   */
  @Override
  public void clear() {
    clear(10);
  }

  /**
   * Resets the Dataset so that it is empty and ready to collect data.
   */
  @Override
  public void clear(int numDatums) {
    initialize(numDatums);
  }

  @Override
  protected void initialize(int numDatums) {
    labelIndex = new HashIndex<L>();
    featureIndex = new HashIndex<F>();
    labels = new int[numDatums];
    data = new int[numDatums][];
    values = new double[numDatums][];
    sourcesAndIds = new ArrayList<Pair<String, String>>(numDatums);
    size = 0;
  }

  /**
   * Prints some summary statistics to stderr for the Dataset.
   */
  @Override
  public void summaryStatistics() {
    System.err.println("numDatums: " + size);
    System.err.print("numLabels: " + labelIndex.size() + " [");
    Iterator<L> iter = labelIndex.iterator();
    while (iter.hasNext()) {
      System.err.print(iter.next());
      if (iter.hasNext()) {
        System.err.print(", ");
      }
    }
    System.err.println("]");
    System.err.println("numFeatures (Phi(X) types): " + featureIndex.size());
    /*for(int i = 0; i < data.length; i++) {
      for(int j = 0; j < data[i].length; j++) {
      System.out.println(data[i][j]);
      }
      }*/
  }

  /**
   * Prints the full feature matrix in tab-delimited form. These can be BIG
   * matrices, so be careful! [Can also use printFullFeatureMatrixWithValues.]
   */
  public void printFullFeatureMatrix(PrintWriter pw) {
    String sep = "\t";
    for (int i = 0; i < featureIndex.size(); i++) {
      pw.print(sep + featureIndex.get(i));
    }
    pw.println();
    for (int i = 0; i < size; i++) { // use size, not labels.length: the array may have spare capacity
      pw.print(labelIndex.get(labels[i])); // look up the datum's label, as in printFullFeatureMatrixWithValues
      Set<Integer> feats = Generics.newHashSet();
      for (int j = 0; j < data[i].length; j++) {
        int feature = data[i][j];
        feats.add(Integer.valueOf(feature));
      }
      for (int j = 0; j < featureIndex.size(); j++) {
        if (feats.contains(Integer.valueOf(j))) {
          pw.print(sep + "1");
        } else {
          pw.print(sep + "0");
        }
      }
      pw.println();
    }
  }

  /**
   * Modification of printFullFeatureMatrix to correct bugs & print values
   * (Rajat). Prints the full feature matrix in tab-delimited form. These can be
   * BIG matrices, so be careful!
   */
  public void printFullFeatureMatrixWithValues(PrintWriter pw) {
    String sep = "\t";
    for (int i = 0; i < featureIndex.size(); i++) {
      pw.print(sep + featureIndex.get(i));
    }
    pw.println();
    for (int i = 0; i < size; i++) { // changed labels.length to size
      pw.print(labelIndex.get(labels[i])); // changed i to labels[i]
      Map<Integer, Double> feats = Generics.newHashMap();
      for (int j = 0; j < data[i].length; j++) {
        int feature = data[i][j];
        double val = values[i][j];
        feats.put(Integer.valueOf(feature), Double.valueOf(val));
      }
      for (int j = 0; j < featureIndex.size(); j++) {
        if (feats.containsKey(Integer.valueOf(j))) {
          pw.print(sep + feats.get(Integer.valueOf(j)));
        } else {
          pw.print(sep + " ");
        }
      }
      pw.println();
    }
    pw.flush();
  }

  /**
   * Constructs a Dataset by reading in a file in SVM light format.
   *
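   * Each line should look like the following (a trailing {@code #} comment
   * is stripped):
   * <pre>{@code label feature1:value1 feature2:value2 ...}</pre>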
   */
  public static RVFDataset<String, String> readSVMLightFormat(String filename) {
    return readSVMLightFormat(filename, new HashIndex<String>(), new HashIndex<String>());
  }

  /**
   * Constructs a Dataset by reading in a file in SVM light format. The lines
   * parameter is filled with the lines of the file for further processing (if
   * lines is null, it is assumed no line information is desired)
   */
  public static RVFDataset<String, String> readSVMLightFormat(String filename, List<String> lines) {
    return readSVMLightFormat(filename, new HashIndex<String>(), new HashIndex<String>(), lines);
  }

  /**
   * Constructs a Dataset by reading in a file in SVM light format. The
   * created dataset has the same feature and label indexes as those given.
   */
  public static RVFDataset<String, String> readSVMLightFormat(String filename, Index<String> featureIndex, Index<String> labelIndex) {
    return readSVMLightFormat(filename, featureIndex, labelIndex, null);
  }

  /**
   * Removes all features from the dataset that are not in featureSet.
   *
   * @param featureSet The set of features to retain
   */
  public void selectFeaturesFromSet(Set<F> featureSet) {
    HashIndex<F> newFeatureIndex = new HashIndex<F>();
    int[] featMap = new int[featureIndex.size()];
    Arrays.fill(featMap, -1);
    for (F feature : featureSet) {
      int oldID = featureIndex.indexOf(feature);
      if (oldID >= 0) { // it's a valid feature in the index
        int newID = newFeatureIndex.addToIndex(feature);
        featMap[oldID] = newID;
      }
    }
    featureIndex = newFeatureIndex;
    for (int i = 0; i < size; i++) {
      List<Integer> featList = new ArrayList<Integer>(data[i].length);
      List<Double> valueList = new ArrayList<Double>(values[i].length);
      for (int j = 0; j < data[i].length; j++) {
        if (featMap[data[i][j]] >= 0) {
          featList.add(featMap[data[i][j]]);
          valueList.add(values[i][j]);
        }
      }
      data[i] = new int[featList.size()];
      values[i] = new double[valueList.size()];
      for (int j = 0; j < data[i].length; j++) {
        data[i][j] = featList.get(j);
        values[i][j] = valueList.get(j);
      }
    }
  }

  /**
   * Applies a feature count threshold to the RVFDataset. All features that
   * occur fewer than <i>k</i> times are expunged.
   */
  public void applyFeatureCountThreshold(int k) {
    float[] counts = getFeatureCounts();
    HashIndex<F> newFeatureIndex = new HashIndex<F>();

    int[] featMap = new int[featureIndex.size()];
    for (int i = 0; i < featMap.length; i++) {
      F feat = featureIndex.get(i);
      if (counts[i] >= k) {
        int newIndex = newFeatureIndex.size();
        newFeatureIndex.add(feat);
        featMap[i] = newIndex;
      } else {
        featMap[i] = -1;
      }
      // featureIndex.remove(feat);
    }

    featureIndex = newFeatureIndex;
    // counts = null; // This is unnecessary; JVM can clean it up

    for (int i = 0; i < size; i++) {
      List<Integer> featList = new ArrayList<Integer>(data[i].length);
      List<Double> valueList = new ArrayList<Double>(values[i].length);
      for (int j = 0; j < data[i].length; j++) {
        if (featMap[data[i][j]] >= 0) {
          featList.add(featMap[data[i][j]]);
          valueList.add(values[i][j]);
        }
      }
      data[i] = new int[featList.size()];
      values[i] = new double[valueList.size()];
      for (int j = 0; j < data[i].length; j++) {
        data[i][j] = featList.get(j);
        values[i][j] = valueList.get(j);
      }
    }
  }

  /**
   * Applies a feature max count threshold to the RVFDataset. All features
   * that occur more than <i>k</i> times are expunged.
   */
  @Override
  public void applyFeatureMaxCountThreshold(int k) {
    float[] counts = getFeatureCounts();
    HashIndex<F> newFeatureIndex = new HashIndex<F>();

    int[] featMap = new int[featureIndex.size()];
    for (int i = 0; i < featMap.length; i++) {
      F feat = featureIndex.get(i);
      if (counts[i] <= k) {
        int newIndex = newFeatureIndex.size();
        newFeatureIndex.add(feat);
        featMap[i] = newIndex;
      } else {
        featMap[i] = -1;
      }
      // featureIndex.remove(feat);
    }

    featureIndex = newFeatureIndex;
    // counts = null; // This is unnecessary; JVM can clean it up

    for (int i = 0; i < size; i++) {
      List<Integer> featList = new ArrayList<Integer>(data[i].length);
      List<Double> valueList = new ArrayList<Double>(values[i].length);
      for (int j = 0; j < data[i].length; j++) {
        if (featMap[data[i][j]] >= 0) {
          featList.add(featMap[data[i][j]]);
          valueList.add(values[i][j]);
        }
      }
      data[i] = new int[featList.size()];
      values[i] = new double[valueList.size()];
      for (int j = 0; j < data[i].length; j++) {
        data[i][j] = featList.get(j);
        values[i][j] = valueList.get(j);
      }
    }
  }

  private static RVFDataset<String, String> readSVMLightFormat(String filename, Index<String> featureIndex, Index<String> labelIndex, List<String> lines) {
    BufferedReader in = null;
    RVFDataset<String, String> dataset;
    try {
      dataset = new RVFDataset<String, String>(10, featureIndex, labelIndex);
      in = IOUtils.readerFromString(filename);

      // Loop on readLine() rather than in.ready(); ready() only tests whether
      // reading is guaranteed not to block, not whether input remains.
      String line;
      while ((line = in.readLine()) != null) {
        if (lines != null)
          lines.add(line);
        dataset.add(svmLightLineToRVFDatum(line));
      }
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    } finally {
      IOUtils.closeIgnoringExceptions(in);
    }
    return dataset;
  }

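  /**
   * Parses a single SVM-light formatted line into an {@link RVFDatum}. The
   * first whitespace-separated token is the label; every remaining token
   * must be a {@code feature:value} pair. For example (the values here are
   * purely illustrative), the line {@code flu fever:1.5 cough:2.1} yields a
   * datum labeled {@code "flu"} with two real-valued features.
   */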
  public static RVFDatum<String, String> svmLightLineToRVFDatum(String l) {
    l = l.replaceFirst("#.*$", ""); // remove any trailing comments
    String[] line = l.split("\\s+");
    ClassicCounter<String> features = new ClassicCounter<String>();
    for (int i = 1; i < line.length; i++) {
      String[] f = line[i].split(":");
      if (f.length != 2) {
        throw new IllegalArgumentException("Bad data format: " + l);
      }
      double val = Double.parseDouble(f[1]);
      features.incrementCount(f[0], val);
    }
    return new RVFDatum<String, String>(features, line[0]);
  }

  // todo [cdm 2012]: This duplicates the functionality of the methods above. Should be refactored.
  /**
   * Read SVM-light formatted data into this dataset.
   *
   * A strict SVM-light format is expected, where labels and features are both
   * encoded as integers. These integers are converted into the dataset label
   * and feature types using the indexes stored in this dataset.
   *
   * @param file The file from which the data should be read.
   */
  public void readSVMLightFormat(File file) {
    for (String line : IOUtils.readLines(file)) {
      line = line.replaceAll("#.*", ""); // remove any trailing comments
      String[] items = line.split("\\s+");
      Integer label = Integer.parseInt(items[0]);
      Counter<F> features = new ClassicCounter<F>();
      for (int i = 1; i < items.length; i++) {
        String[] featureItems = items[i].split(":");
        int feature = Integer.parseInt(featureItems[0]);
        double value = Double.parseDouble(featureItems[1]);
        features.incrementCount(this.featureIndex.get(feature), value);
      }
      this.add(new RVFDatum<L, F>(features, this.labelIndex.get(label)));
    }
  }

  /**
   * Write the dataset in SVM-light format to the file.
   *
   * A strict SVM-light format will be written, where labels and features are
   * both encoded as integers, using the label and feature indexes of this
   * dataset. Datasets written by this method can be read by
   * {@link #readSVMLightFormat(File)}.
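   *
   * A line of output looks like the following (the integer indices depend on
   * this dataset's label and feature indexes, and values are printed with
   * {@code %f} formatting):
   * <pre>{@code 0 3:3.500000 7:1.100000}</pre>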
   *
   * @param file The location where the dataset should be written.
   */
  public void writeSVMLightFormat(File file) throws FileNotFoundException {
    PrintWriter writer = new PrintWriter(file);
    writeSVMLightFormat(writer);
    writer.close();
  }

  public void writeSVMLightFormat(PrintWriter writer) {
    for (RVFDatum<L, F> datum : this) {
      writer.print(this.labelIndex.indexOf(datum.label()));
      Counter<F> features = datum.asFeaturesCounter();
      for (F feature : features.keySet()) {
        double count = features.getCount(feature);
        writer.format(Locale.ENGLISH, " %s:%f", this.featureIndex.indexOf(feature), count);
      }
      writer.println();
    }
  }

  /**
   * Prints the sparse feature matrix using
   * {@link #printSparseFeatureMatrix(PrintWriter)} to {@link System#out
   * System.out}.
   */
  @Override
  public void printSparseFeatureMatrix() {
    printSparseFeatureMatrix(new PrintWriter(System.out, true));
  }

  /**
   * Prints a sparse feature matrix representation of the Dataset. Prints the
   * actual {@link Object#toString()} representations of features.
   */
  @Override
  public void printSparseFeatureMatrix(PrintWriter pw) {
    String sep = "\t";
    for (int i = 0; i < size; i++) {
      pw.print(labelIndex.get(labels[i]));
      int[] datum = data[i];
      for (int feat : datum) {
        pw.print(sep);
        pw.print(featureIndex.get(feat));
      }
      pw.println();
    }
  }

  /**
   * Prints a sparse feature-value output of the Dataset. Prints the actual
   * {@link Object#toString()} representations of features. This is probably
   * what you want for RVFDataset since the above two methods seem useless and
   * unused.
   */
  public void printSparseFeatureValues(PrintWriter pw) {
    for (int i = 0; i < size; i++) {
      printSparseFeatureValues(i, pw);
    }
  }

  /**
   * Prints a sparse feature-value output of the Dataset. Prints the actual
   * {@link Object#toString()} representations of features. This is probably
   * what you want for RVFDataset since the above two methods seem useless and
   * unused.
   */
  public void printSparseFeatureValues(int datumNo, PrintWriter pw) {
    pw.print(labelIndex.get(labels[datumNo]));
    pw.print('\t');
    pw.println("LABEL");
    int[] datum = data[datumNo];
    double[] vals = values[datumNo];
    assert datum.length == vals.length;
    for (int i = 0; i < datum.length; i++) {
      pw.print(featureIndex.get(datum[i]));
      pw.print('\t');
      pw.println(vals[i]);
    }
    pw.println();
  }

  public static void main(String[] args) {
    RVFDataset<String, String> data = new RVFDataset<String, String>();
    ClassicCounter<String> c1 = new ClassicCounter<String>();
    c1.incrementCount("fever", 3.5);
    c1.incrementCount("cough", 1.1);
    c1.incrementCount("congestion", 4.2);

    ClassicCounter<String> c2 = new ClassicCounter<String>();
    c2.incrementCount("fever", 1.5);
    c2.incrementCount("cough", 2.1);
    c2.incrementCount("nausea", 3.2);

    ClassicCounter<String> c3 = new ClassicCounter<String>();
    c3.incrementCount("cough", 2.5);
    c3.incrementCount("congestion", 3.2);

    data.add(new RVFDatum<String, String>(c1, "cold"));
    data.add(new RVFDatum<String, String>(c2, "flu"));
    data.add(new RVFDatum<String, String>(c3, "cold"));
    data.summaryStatistics();

    LinearClassifierFactory<String, String> factory = new LinearClassifierFactory<String, String>();
    factory.useQuasiNewton();

    LinearClassifier<String, String> c = factory.trainClassifier(data);

    ClassicCounter<String> c4 = new ClassicCounter<String>();
    c4.incrementCount("cough", 2.3);
    c4.incrementCount("fever", 1.3);

    RVFDatum<String, String> datum = new RVFDatum<String, String>(c4);

    c.justificationOf((Datum<String, String>) datum);
  }

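  /**
   * Returns the values array, first trimming the internal data and values
   * arrays to the current size as a side effect.
   */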
  @Override
  public double[][] getValuesArray() {
    if (size == 0) {
      return new double[0][];
    }
    values = trimToSize(values);
    data = trimToSize(data);
    return values;
  }

  @Override
  public String toString() {
    return "Dataset of size " + size;
  }

  public String toSummaryString() {
    StringWriter sw = new StringWriter();
    PrintWriter pw = new PrintWriter(sw);
    pw.println("Number of data points: " + size());

    pw.print("Number of labels: " + labelIndex.size() + " [");
    Iterator<L> iter = labelIndex.iterator();
    while (iter.hasNext()) {
      pw.print(iter.next());
      if (iter.hasNext()) {
        pw.print(", ");
      }
    }
    pw.println("]");
    pw.println("Number of features (Phi(X) types): " + featureIndex.size());
    pw.println("Number of active feature types: " + numFeatureTypes());
    pw.println("Number of active feature tokens: " + numFeatureTokens());

    return sw.toString();
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public Iterator<RVFDatum<L, F>> iterator() {
    return new Iterator<RVFDatum<L, F>>() {
      private int index; // = 0;

      @Override
      public boolean hasNext() {
        return this.index < size;
      }

      @Override
      public RVFDatum<L, F> next() {
        if (index >= size) {
          throw new NoSuchElementException();
        }
        RVFDatum<L, F> next = getRVFDatum(this.index);
        ++this.index;
        return next;
      }

      @Override
      public void remove() {
        throw new UnsupportedOperationException();
      }
    };
  }

  /**
   * Randomizes the data array in place. Needs to be redefined here because we
   * need to randomize the values as well.
   */
  @Override
  public void randomize(long randomSeed) {
    Random rand = new Random(randomSeed);
    for (int j = size - 1; j > 0; j--) {
      int randIndex = rand.nextInt(j);
      int[] tmp = data[randIndex];
      data[randIndex] = data[j];
      data[j] = tmp;

      int tmpl = labels[randIndex];
      labels[randIndex] = labels[j];
      labels[j] = tmpl;

      double[] tmpv = values[randIndex];
      values[randIndex] = values[j];
      values[j] = tmpv;
    }
  }

  /**
   * Randomizes the data array in place, along with a parallel list of side
   * information. Needs to be redefined here because we need to randomize the
   * values as well.
   */
  @Override
  public <E> void shuffleWithSideInformation(long randomSeed, List<E> sideInformation) {
    if (size != sideInformation.size()) {
      throw new IllegalArgumentException("shuffleWithSideInformation: sideInformation not of same size as Dataset");
    }
    Random rand = new Random(randomSeed);
    for (int j = size - 1; j > 0; j--) {
      int randIndex = rand.nextInt(j);

      int[] tmp = data[randIndex];
      data[randIndex] = data[j];
      data[j] = tmp;

      int tmpl = labels[randIndex];
      labels[randIndex] = labels[j];
      labels[j] = tmpl;

      double[] tmpv = values[randIndex];
      values[randIndex] = values[j];
      values[j] = tmpv;

      E tmpE = sideInformation.get(randIndex);
      sideInformation.set(randIndex, sideInformation.get(j));
      sideInformation.set(j, tmpE);
    }
  }

}