package edu.stanford.nlp.tagger.io;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import edu.stanford.nlp.ling.TaggedWord;
/**
 * Reads tagged sentences from a tab-separated file, one token per line.
 * <p>
 * Each non-blank line is split on tab characters; the word is taken from
 * column {@code wordColumn} and the tag from column {@code tagColumn}.
 * A run of consecutive non-blank lines forms one sentence, and sentences
 * are separated by one or more blank (whitespace-only) lines.  End of file
 * also ends the current sentence.
 * <p>
 * The reader is its own iterator, so the file can be traversed only once.
 * The underlying file is closed automatically as soon as the end of the
 * file is reached or a read/parse error occurs.
 */
public class TSVTaggedFileReader implements TaggedFileReader {
  final BufferedReader reader;
  final String filename;
  final int wordColumn, tagColumn;
  /** The sentence that the following call to {@link #next()} will return, or null if there is none. */
  List<TaggedWord> next = null;
  /** Number of lines actually read from the file so far (end of file is not counted). */
  int linesRead = 0;
  /** True once {@link #reader} has been closed; no further reads are attempted after that. */
  private boolean closed = false;

  static final int DEFAULT_WORD_COLUMN = 0;
  static final int DEFAULT_TAG_COLUMN = 1;

  /**
   * Opens the file named by {@code record.file} using {@code record.encoding}
   * and reads the first sentence so that {@link #hasNext()} is immediately
   * meaningful.  Columns left null in the record fall back to
   * {@link #DEFAULT_WORD_COLUMN} and {@link #DEFAULT_TAG_COLUMN}.
   *
   * @param record describes the file to read, its encoding and its columns
   * @throws RuntimeException if the file cannot be opened or read
   * @throws IllegalArgumentException if a line of the first sentence has
   *         too few columns
   */
  public TSVTaggedFileReader(TaggedFileRecord record) {
    filename = record.file;
    FileInputStream stream = null;
    try {
      stream = new FileInputStream(filename);
      reader = new BufferedReader(new InputStreamReader(stream,
                                                        record.encoding));
    } catch (IOException e) {
      // The stream may already be open if only the decoding layer failed
      // (for example an unsupported encoding name); do not leak it.
      if (stream != null) {
        try {
          stream.close();
        } catch (IOException ignored) {
          // best effort: the original failure is the one worth reporting
        }
      }
      throw new RuntimeException(e);
    }
    wordColumn = ((record.wordColumn == null) ?
                  DEFAULT_WORD_COLUMN : record.wordColumn);
    tagColumn = ((record.tagColumn == null) ?
                 DEFAULT_TAG_COLUMN : record.tagColumn);
    primeNext();
  }

  /** Returns this reader itself, which acts as its own single-pass iterator. */
  public Iterator<List<TaggedWord>> iterator() { return this; }

  /** Returns the name of the file being read. */
  public String filename() { return filename; }

  /** Returns true if another sentence is available from {@link #next()}. */
  public boolean hasNext() { return next != null; }

  /**
   * Returns the next sentence and reads ahead to the one after it.
   *
   * @return the tagged words of the next sentence
   * @throws NoSuchElementException if there are no more sentences
   * @throws RuntimeException if reading ahead fails with an I/O error
   * @throws IllegalArgumentException if a line of the following sentence
   *         has too few columns
   */
  public List<TaggedWord> next() {
    if (next == null) {
      throw new NoSuchElementException();
    }
    List<TaggedWord> thisIteration = next;
    primeNext();
    return thisIteration;
  }

  /**
   * Reads the next sentence into {@link #next}, or sets it to null when the
   * file holds no further sentences.  If reading or parsing fails, the
   * reader is closed and {@link #next} stays null, so the iteration ends
   * instead of exposing a stale or half-built sentence.
   */
  void primeNext() {
    next = null;
    boolean succeeded = false;
    try {
      next = readSentence();
      succeeded = true;
    } finally {
      if (!succeeded) {
        closeReader();
      }
    }
  }

  /**
   * Skips blank lines, then collects consecutive non-blank lines into one
   * sentence.  Returns null if end of file is reached before any text.
   */
  private List<TaggedWord> readSentence() {
    // eat all blank lines until we hit the next block of text
    String line = readNextLine();
    while (line != null && line.trim().equals("")) {
      line = readNextLine();
    }
    if (line == null) {
      return null;
    }

    // we hit something with text, so now we read one line at a time
    // until we hit the next blank line.  the next blank line (or EOF)
    // ends the sentence.
    List<TaggedWord> sentence = new ArrayList<TaggedWord>();
    do {
      String[] pieces = line.split("\t");
      if (pieces.length <= wordColumn || pieces.length <= tagColumn) {
        throw new IllegalArgumentException("File " + filename + " line #" +
                                           linesRead + " too short");
      }
      sentence.add(new TaggedWord(pieces[wordColumn], pieces[tagColumn]));
      line = readNextLine();
    } while (line != null && !line.trim().equals(""));
    return sentence;
  }

  /**
   * Reads one line, counting it in {@link #linesRead}.  Returns null at end
   * of file, at which point the reader is closed; later calls keep
   * returning null without touching the closed reader.
   */
  private String readNextLine() {
    if (closed) {
      return null;
    }
    final String line;
    try {
      line = reader.readLine();
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    if (line == null) {
      closeReader();
    } else {
      ++linesRead;
    }
    return line;
  }

  /** Closes the underlying reader once; failures while closing are ignored. */
  private void closeReader() {
    if (closed) {
      return;
    }
    closed = true;
    try {
      reader.close();
    } catch (IOException ignored) {
      // best effort: the file was only read, so nothing is lost here
    }
  }

  /** Not supported; this reader is read-only. */
  public void remove() { throw new UnsupportedOperationException(); }
}