Package cc.mallet.pipe

Source Code of cc.mallet.pipe.SvmLight2FeatureVectorAndLabel

/* Copyright (C) 2010 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

package cc.mallet.pipe;

import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.Instance;
import cc.mallet.types.Label;
import cc.mallet.types.LabelAlphabet;

/**
* This Pipe converts a line in SVMLight format to
* a Mallet instance with FeatureVector data and
* Label target.  The expected format is
*
* target feature:value feature:value ...
*
* targets and features can be indices, as in
* SVMLight, or Strings.
*
* Note that if targets and features are indices,
* their indices in the data and target Alphabets
* may be different, though the data will be
* equivalent. 
*
* @author Gregory Druck
*
*/
public class SvmLight2FeatureVectorAndLabel extends Pipe {

  private static final long serialVersionUID = 1L;
 
  public SvmLight2FeatureVectorAndLabel () {
    super (new Alphabet(), new LabelAlphabet());
  }
 
  // There is no guarantee that the feature indices in the text
  // file will be the same as in the pipe.  The data should be
  // exactly the same, however, just permuted. 
  @Override public Instance pipe(Instance carrier) {
    // we expect the data for each instance to be
    // a line from the SVMLight format text file   
    String dataStr = (String)carrier.getData();

    // ignore comments at the end
    if (dataStr.contains("#")) {
      dataStr = dataStr.substring(0, dataStr.indexOf('#'));
    }

    String[] terms = dataStr.split("\\s+");
   
    String classStr = terms[0];
    // In SVMLight +1 and 1 are the same label. 
    // Adding a special case to normalize...
    if (classStr.equals("+1")) {
      classStr = "1";
    }
    Label label = ((LabelAlphabet)getTargetAlphabet()).lookupLabel(classStr, true);
    carrier.setTarget(label);
   
    // the rest are feature-value pairs
    int numFeatures = terms.length - 1;
    int[] indices = new int[numFeatures];
    double[] values = new double[numFeatures];
    for (int termIndex = 1; termIndex < terms.length; termIndex++) {
      if (!terms[termIndex].equals("")) {
        String[] s = terms[termIndex].split(":");
        String feature = s[0];
        indices[termIndex-1] = getDataAlphabet().lookupIndex(feature, true);      
        values[termIndex-1] = Double.parseDouble(s[1]);
      }
    }
   
    FeatureVector fv = new FeatureVector(getDataAlphabet(), indices, values);
    carrier.setData(fv);
    return carrier;
  }
}
TOP

Related Classes of cc.mallet.pipe.SvmLight2FeatureVectorAndLabel

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.