Package cc.mallet.pipe

Source Code of cc.mallet.pipe.SimpleTaggerSentence2StringTokenization

/* Copyright (C) 2003 University of Pennsylvania.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org.  For further
information, see the file `LICENSE' included with this distribution. */

/**
   @author Aron Culotta
*/

package cc.mallet.pipe;


import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;

import cc.mallet.extract.StringSpan;
import cc.mallet.extract.StringTokenization;
import cc.mallet.types.*;

/**
* This extends {@link SimpleTaggerSentence2TokenSequence} to produce
* {@link StringTokenization}s for use with the extract package.
*/
public class SimpleTaggerSentence2StringTokenization
  extends SimpleTaggerSentence2TokenSequence{

  /**
   * Creates a new
   * <code>SimpleTaggerSentence2StringTokenization</code> instance.
   * By default we include tokens as features.
   */
  public SimpleTaggerSentence2StringTokenization ()
  {
    super ();
  }

  /**
   * creates a new <code>SimpleTaggerSentence2StringTokenization</code> instance
   * which includes tokens as features iff the supplied argument is true.
   */
  public SimpleTaggerSentence2StringTokenization (boolean inc)
  {
    super (inc);
  }


  /**
   * Takes an instance with data of type String or String[][] and creates
   * an Instance of type StringTokenization.  Each Token in the sequence is
   * gets the test of the line preceding it and once feature of value 1
   * for each "Feature" in the line.  For example, if the String[][] is
   * {{a,b},{c,d,e}} (and target processing is off) then the text would be
   * "a b" for the first token and "c d e" for the second.  Also, the
   * features "a" and "b" would be set for the first token and "c", "d" and
   * "e"  for the second.  The last element in the String[] for the current
   * token is taken as the target (label), so in the previous example "b"
   * would have been the label of the first sequence.
   */
  public Instance pipe(Instance carrier) {
    Object inputData = carrier.getData();
    LabelAlphabet labels;
    LabelSequence target = null;
    String[][] tokens;
    StringBuffer source = new StringBuffer();
    StringTokenization ts = new StringTokenization(source);
    if (inputData instanceof String)
      tokens = parseSentence((String) inputData);
    else if (inputData instanceof String[][])
      tokens = (String[][]) inputData;
    else
      throw new IllegalArgumentException("Not a String; got " + inputData);
    if (isTargetProcessing()) {
      labels = (LabelAlphabet) getTargetAlphabet();
      target = new LabelSequence(labels, tokens.length);
    }
    for (int l = 0; l < tokens.length; l++) {
      int nFeatures;
      if (isTargetProcessing()) {
        if (tokens[l].length < 1)
          throw new IllegalStateException("Missing label at line "
              + l + " instance " + carrier.getName());
        nFeatures = tokens[l].length - 1;
        target.add(tokens[l][nFeatures]);
      } else
        nFeatures = tokens[l].length;
      int start = source.length();
      String word = makeText(tokens[l]);
      source.append(word + " ");
      Token tok = new StringSpan(source, start, source.length() - 1);
      if (setTokensAsFeatures) {
        for (int f = 0; f < nFeatures; f++)
          tok.setFeatureValue(tokens[l][f], 1.0);
      } else {
        for (int f = 1; f < nFeatures; f++)
          tok.setFeatureValue(tokens[l][f], 1.0);
      }
      ts.add(tok);
    }
    carrier.setData(ts);
    if (isTargetProcessing())
      carrier.setTarget(target);
    return carrier;
  }
 
  // Serialization garbage

  private static final long serialVersionUID = 1;
  private static final int CURRENT_SERIAL_VERSION = 1;

  private void writeObject (ObjectOutputStream out) throws IOException
  {
    out.defaultWriteObject ();
    out.writeInt (CURRENT_SERIAL_VERSION);
  }


  private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException
  {
    in.defaultReadObject ();
    int version = in.readInt ();
  }

}
TOP

Related Classes of cc.mallet.pipe.SimpleTaggerSentence2StringTokenization

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.