package edu.stanford.nlp.tagger.io;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.StringTokenizer;

import edu.stanford.nlp.ling.TaggedWord;
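
/**
 * Reads tagged sentences from a plain text file.  Each line of the file is
 * one sentence, and each token on the line is a word and its tag joined by
 * the tag separator given in the {@link TaggedFileRecord} (for example
 * {@code dog_NN} when the separator is "_").  Sentences are returned in
 * file order via the Iterator methods below.
 * <p>
 * Rough usage sketch, assuming a {@code TaggedFileRecord} configured
 * elsewhere (its construction is not shown here):
 * <pre>
 *   TextTaggedFileReader reader = new TextTaggedFileReader(record);
 *   while (reader.hasNext()) {
 *     List&lt;TaggedWord&gt; sentence = reader.next();
 *     // ... process the sentence
 *   }
 * </pre>
 */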
public class TextTaggedFileReader implements TaggedFileReader {

  final BufferedReader reader;
  final String tagSeparator;
  final String filename;

  // Number of lines (sentences) read from the file so far.
  int numSentences = 0;
  // The next sentence to be returned, or null once the end of file is reached.
  List<TaggedWord> next;

  public TextTaggedFileReader(TaggedFileRecord record) {
    filename = record.file;
    try {
      // Open the file with the encoding given in the record.
      reader = new BufferedReader(new InputStreamReader(
          new FileInputStream(filename), record.encoding));
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    tagSeparator = record.tagSeparator;
    // Read the first sentence so hasNext() is meaningful right away.
    primeNext();
  }

  public Iterator<List<TaggedWord>> iterator() { return this; }

  public String filename() { return filename; }

  public boolean hasNext() { return next != null; }

  public List<TaggedWord> next() {
    if (next == null) {
      throw new NoSuchElementException();
    }
    // Hand back the sentence that was already read and queue up the next one.
    List<TaggedWord> thisIteration = next;
    primeNext();
    return thisIteration;
  }

  /**
   * Reads the next line of the file and converts it into a sentence of
   * TaggedWords, storing it in {@code next}.  Sets {@code next} to null
   * at end of file.
   */
  void primeNext() {
    String line;
    try {
      line = reader.readLine();
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    if (line == null) {
      // End of file: no more sentences.
      next = null;
      return;
    }
    ++numSentences;
    next = new ArrayList<TaggedWord>();
    // Loop over the words in a single sentence.
    StringTokenizer st = new StringTokenizer(line);
    while (st.hasMoreTokens()) {
      String token = st.nextToken();
      // Split on the last occurrence of the separator, so the word itself
      // may contain the separator character.
      int indexUnd = token.lastIndexOf(tagSeparator);
      if (indexUnd < 0) {
        throw new IllegalArgumentException("Data format error: can't find delimiter \"" +
                                           tagSeparator + "\" in word \"" + token +
                                           "\" (line " + numSentences + " of " + filename + ')');
      }
      // Intern words and tags so repeated strings are shared across sentences.
      String word = token.substring(0, indexUnd).intern();
      String tag = token.substring(indexUnd + 1).intern();
      next.add(new TaggedWord(word, tag));
    }
  }

  // Removal is not supported; this reader only iterates forward over the file.
  public void remove() { throw new UnsupportedOperationException(); }

}