Package mstparser.io

Source Code of mstparser.io.CONLLReader

///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2007 University of Texas at Austin and (C) 2005
// University of Pennsylvania and Copyright (C) 2002, 2003 University
// of Massachusetts Amherst, Department of Computer Science.
//
// This software is licensed under the terms of the Common Public
// License, Version 1.0 or (at your option) any subsequent version.
//
// The license is approved by the Open Source Initiative, and is
// available from their website at http://www.opensource.org.
///////////////////////////////////////////////////////////////////////////////

package mstparser.io;

import mstparser.DependencyInstance;
import mstparser.RelationalFeature;
import mstparser.Util;

import java.io.*;
import java.util.*;

/**
* A reader for files in CoNLL format.
*
* <p>
* Created: Sat Nov 10 15:25:10 2001
* </p>
*
* @author Jason Baldridge
* @version $Id: CONLLReader.java 112 2007-03-23 19:19:28Z jasonbaldridge $
* @see mstparser.io.DependencyReader
*/
public class CONLLReader extends DependencyReader {

    protected boolean discourseMode = false;

    public CONLLReader (boolean discourseMode) {
  this.discourseMode = discourseMode;
    }

    public DependencyInstance getNext() throws IOException {

  ArrayList<String[]> lineList = new ArrayList<String[]>();

  String line = inputReader.readLine();
  while (line != null && !line.equals("") && !line.startsWith("*")) {
      lineList.add(line.split("\t"));
      line = inputReader.readLine();
      //System.out.println("## "+line);
  }

  int length = lineList.size();

  if(length == 0) {
      inputReader.close();
      return null;
  }

  String[] forms = new String[length+1];
  String[] lemmas = new String[length+1];
  String[] cpos = new String[length+1];
  String[] pos = new String[length+1];
  String[][] feats = new String[length+1][];
  String[] deprels = new String[length+1];
  int[] heads = new int[length+1];

  forms[0] = "<root>";
  lemmas[0] = "<root-LEMMA>";
  cpos[0] = "<root-CPOS>";
  pos[0] = "<root-POS>";
  deprels[0] = "<no-type>";
  heads[0] = -1;

  for(int i = 0; i < length; i++) {
      String[] info = lineList.get(i);
      forms[i+1] = normalize(info[1]);
      lemmas[i+1] = normalize(info[2]);
      cpos[i+1] = info[3];
      pos[i+1] = info[4];
      feats[i+1] = info[5].split("\\|");
      deprels[i+1] = labeled ? info[7] : "<no-type>";
      heads[i+1] = Integer.parseInt(info[6]);
  }
 
  feats[0] = new String[feats[1].length];
  for (int i = 0; i< feats[1].length; i++)
      feats[0][i] = "<root-feat>"+i;
 
  // The following stuff is for discourse and can be safely
  // ignored if you are doing sentential parsing. (In theory it
  // could be useful for sentential parsing.)
  if (discourseMode) {
      String[][] extended_feats = new String[feats[0].length][length+1];
      for (int i=0; i<extended_feats.length; i++) {
    for (int j=0; j<length+1; j++)
        extended_feats[i][j] = feats[j][i];
      }
 
      feats = extended_feats;
  }

  ArrayList<RelationalFeature> rfeats =
      new ArrayList<RelationalFeature>();
 
  while (line != null && !line.equals("")) {
      rfeats.add(new RelationalFeature(length, line, inputReader));
      line = inputReader.readLine();
  }

  RelationalFeature[] rfeatsList = new RelationalFeature[rfeats.size()];
  rfeats.toArray(rfeatsList);

  // End of discourse stuff.

  return new DependencyInstance(forms, lemmas, cpos, pos, feats, deprels, heads, rfeatsList);

    }


    protected boolean fileContainsLabels (String file) throws IOException {
  BufferedReader in = new BufferedReader(new FileReader(file));
  String line = in.readLine();
  in.close();

  //if(line.trim().length() > 0)
        //return true;
  //else
        return false;
    }

}
TOP

Related Classes of mstparser.io.CONLLReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.