Package org.maltparserx.core.syntaxgraph.reader

Source Code of org.maltparserx.core.syntaxgraph.reader.TabReader

package org.maltparserx.core.syntaxgraph.reader;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Iterator;
import java.util.zip.GZIPInputStream;

import org.maltparserx.core.exception.MaltChainedException;
import org.maltparserx.core.io.dataformat.ColumnDescription;
import org.maltparserx.core.io.dataformat.DataFormatException;
import org.maltparserx.core.io.dataformat.DataFormatInstance;
import org.maltparserx.core.syntaxgraph.DependencyStructure;
import org.maltparserx.core.syntaxgraph.Element;
import org.maltparserx.core.syntaxgraph.TokenStructure;
import org.maltparserx.core.syntaxgraph.edge.Edge;
/**
*
*
* @author Johan Hall
*/
public class TabReader implements SyntaxGraphReader {
  private BufferedReader reader;
  private int sentenceCount;
  private final StringBuilder input;
  private DataFormatInstance dataFormatInstance;
  private static final String IGNORE_COLUMN_SIGN = "_";
  private static final char TAB = '\t';
  private static final char NEWLINE = '\n';
  private static final char CARRIAGE_RETURN = '\r';
  private String fileName = null;
  private URL url = null;
  private String charsetName;
  private int nIterations;
  private int cIterations;
  private boolean closeStream = true;
 
  public TabReader() {
    input = new StringBuilder();
    nIterations = 1;
    cIterations = 1;
  }
 
  private void reopen() throws MaltChainedException {
    close();
    if (fileName != null) {
      open(fileName, charsetName);
    } else if (url != null) {
      open(url, charsetName);
    } else {
      throw new DataFormatException("The input stream cannot be reopen. ");
    }
  }
 
  public void open(String fileName, String charsetName) throws MaltChainedException {
    setFileName(fileName);
    setCharsetName(charsetName);
    try {
      // sista: detect .gz files on the fly
      InputStream is = new FileInputStream(fileName);
      if(fileName.endsWith(".gz")) is = new GZIPInputStream(is);
      open(is, charsetName);
    } catch (FileNotFoundException e) {
      throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
    } catch(IOException e) {
      throw new DataFormatException("The input file '"+fileName+"' cannot be gunzipped. ", e);
    }
  }
 
  public void open(URL url, String charsetName) throws MaltChainedException {
    setUrl(url);
    setCharsetName(charsetName);
    if (url == null) {
      throw new DataFormatException("The input file cannot be found. ");
    }
    try {
      open(url.openStream(), charsetName);
    } catch (IOException e) {
      throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
    }
  }
 
  public void open(InputStream is, String charsetName) throws MaltChainedException {
    try {
      if (is == System.in) {
        closeStream = false;
      }
      open(new InputStreamReader(is, charsetName));
    } catch (UnsupportedEncodingException e) {
      throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
    }
  }
 
  private void open(InputStreamReader isr) throws MaltChainedException {
    setReader(new BufferedReader(isr));
    setSentenceCount(0);
  }
 
  public void readProlog() throws MaltChainedException {
   
  }
 
  public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException  {
    if (syntaxGraph == null || dataFormatInstance == null) {
      return false;
    }
   
    Element node = null;
    Edge edge = null;
    input.setLength(0);
    int i = 0;
    int terminalCounter = 0;
    int nNewLines = 0;
    syntaxGraph.clear();
    syntaxGraph.getSymbolTables().cleanUp();
    Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
    while (true) {
      int c;

      try {
        c = reader.read();
      } catch (IOException e) {
        close();
        throw new DataFormatException("Error when reading from the input file. ", e);
      }
      if (c == TAB || c == NEWLINE || c == CARRIAGE_RETURN || c == -1) {
        if (input.length() != 0) {         
          if (i == 0) {
            terminalCounter++;
            node = syntaxGraph.addTokenNode(terminalCounter);
          }
          if (columns.hasNext()) {
            ColumnDescription column = columns.next();
            if (column.getCategory() == ColumnDescription.INPUT && node != null) {
              syntaxGraph.addLabel(node, column.getName(), input.toString());
            } else if (column.getCategory() == ColumnDescription.HEAD) {
              if (syntaxGraph instanceof DependencyStructure) {
                if (column.getCategory() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) {
//                if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix
                //if (!input.toString().equals(IGNORE_COLUMN_SIGN)) {
                  edge = ((DependencyStructure)syntaxGraph).addDependencyEdge(Integer.parseInt(input.toString()), terminalCounter);
                }
              }
              else {
                close();
                throw new DataFormatException("The input graph is not a dependency graph and therefore it is not possible to add dependncy edges. ");
              }
            } else if (column.getCategory() == ColumnDescription.DEPENDENCY_EDGE_LABEL && edge != null) {
              //if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix not working for everybody
                syntaxGraph.addLabel(edge, column.getName(), input.toString());
              //} // bugfix
            }
          }
          input.setLength(0);
          nNewLines = 0;
          i++;
        } else if (c == TAB) {
          throw new MaltChainedException("The input file '"+fileName+"' contains a column where the value is an empty string. Please check your input file. ");
        }
        if (c == NEWLINE) {
          nNewLines++;
          i = 0;
          columns = dataFormatInstance.iterator();
        }
      } else {
        input.append((char)c);
      }
     
      if (nNewLines == 2 && c == NEWLINE) {
        if (syntaxGraph.hasTokens()) {
          sentenceCount++;
        }
        return true;
      } else if (c == -1) {
        if (syntaxGraph.hasTokens()) {
          sentenceCount++;
        }
        if (cIterations < nIterations) {
          cIterations++;
          reopen();
          return true;
        }
       
        return false;         
      }
    }
  }
 
  public void readEpilog() throws MaltChainedException {
   
  }
 
  public BufferedReader getReader() {
    return reader;
  }

  public void setReader(BufferedReader reader) throws MaltChainedException {
    close();
    this.reader = reader;
  }
 
  public DataFormatInstance getDataFormatInstance() {
    return dataFormatInstance;
  }

  public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
    this.dataFormatInstance = dataFormatInstance;
  }

  public int getSentenceCount() throws MaltChainedException {
    return sentenceCount;
  }
 
  public void setSentenceCount(int sentenceCount) {
    this.sentenceCount = sentenceCount;
  }
 
  public String getOptions() {
    return null;
  }
 
  public void setOptions(String optionString) throws MaltChainedException {
   
  }
 
  public String getFileName() {
    return fileName;
  }

  public void setFileName(String fileName) {
    this.fileName = fileName;
  }

  public URL getUrl() {
    return url;
  }

  public void setUrl(URL url) {
    this.url = url;
  }

  public String getCharsetName() {
    return charsetName;
  }

  public void setCharsetName(String charsetName) {
    this.charsetName = charsetName;
  }

  public int getNIterations() {
    return nIterations;
  }

  public void setNIterations(int iterations) {
    nIterations = iterations;
  }

  public int getIterationCounter() {
    return cIterations;
  }

  public void close() throws MaltChainedException {
    try {
      if (reader != null) {
        if (closeStream) {
          reader.close();
        }
        reader = null;
      }
    } catch (IOException e) {
      throw new DataFormatException("Error when closing the input file. ", e);
    }
  }
 
  public void clear() throws MaltChainedException {
    close();
    input.setLength(0);
    dataFormatInstance = null;
    sentenceCount = 0;
  }
}
TOP

Related Classes of org.maltparserx.core.syntaxgraph.reader.TabReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.