Package cc.mallet.pipe

Source Code of cc.mallet.pipe.Csv2Array

/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */





package cc.mallet.pipe;


import java.util.logging.*;
import java.lang.reflect.Array;

import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.Instance;
import cc.mallet.types.Labeling;
import cc.mallet.util.CharSequenceLexer;
import cc.mallet.util.MalletLogger;

/**

   Converts a string of comma separated values to an array. To be used
   prior to {@link Array2FeatureVector}. Note that this class assumes
   that each location of the line corresponds to a feature index
   (i.e. "dense" representation) eg:

   instance 1: 1,0,0,1,0,0,1  << feature alphabet size = 7
   instance 2: 0,0,1,0,0,0,1  << feature alphabet size = 7

    @author Aron Culotta
*/
public class Csv2Array extends Pipe {

  CharSequenceLexer lexer;
  int numberFeatures = -1;
  private static Logger logger = MalletLogger.getLogger(Csv2Array.class.getName());

  public Csv2Array () {
    this.lexer = new CharSequenceLexer ("([^,]+)");
  }

  public Csv2Array (String regex) {
    this.lexer = new CharSequenceLexer (regex);
  }

  public Csv2Array (CharSequenceLexer l) {
    this.lexer = l;
  }

  /** Convert the data in an <CODE>Instance</CODE> from a CharSequence
   * of comma-separated-values to an array, where each index is the
   * feature name.
   */
  public Instance pipeInstance carrier ) {
   
    CharSequence c = (CharSequence)carrier.getData();
    int nf = countNumberFeatures (c);
    if (numberFeatures == -1) // first instance seen
      numberFeatures = nf;
    else if (numberFeatures != nf)
      throw new IllegalArgumentException ("Instances must have same-length feature vectors. length_i: " + numberFeatures + " length_j: " + nf);
    double[] feats = new double[numberFeatures];
    lexer.setCharSequence (c);
    int i=0;
    while (lexer.hasNext())
      feats[i++] = Double.parseDouble ((String)lexer.next());
    carrier.setData (feats);
    return carrier;
   
  }
        
  private int countNumberFeatures (CharSequence c) {
    String s = c.toString();
    int ret = 0;
    int pos = 0;
    while ((pos = s.indexOf (",", pos) + 1) != 0)
      ret++;
    return ret+1;
  }
}
TOP

Related Classes of cc.mallet.pipe.Csv2Array

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.