package edu.stanford.nlp.international.french.scripts;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.regex.Pattern;

import edu.stanford.nlp.international.morph.MorphoFeatureSpecification;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeReader;
import edu.stanford.nlp.trees.TreeReaderFactory;
import edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory;
/**
 * Places predicted morphological analyses in the leaves of gold FTB parse trees.
 * <p>
 * The tool reads a tree file and a Morfette output file in parallel. The Morfette file
 * holds one whitespace-separated <code>word lemma tag</code> triple per line, with blank
 * lines separating sentences. The i-th sentence analysis is paired with the i-th tree,
 * every leaf value is rewritten as
 * <code>token + MORPHO_MARK + lemma + LEMMA_MARK + tag</code>, and the modified tree is
 * printed to standard output.
 *
 * @author Spence Green
 *
 */
public final class MungeTreesWithMorfetteAnalyses {

  /**
   * Iterates over the sentences of a Morfette output file. Each element is the list of
   * token analyses (word, lemma, tag) of one sentence. The iterator always holds the
   * next sentence pre-read in {@link #nextList}; a {@code null} value there means that
   * no further sentence is available.
   */
  private static class MorfetteFileIterator implements Iterator<List<CoreLabel>> {

    /** Source of the analyses; stays {@code null} if the file could not be opened. */
    private BufferedReader reader;

    /** The pre-read next sentence, or {@code null} when iteration is over. */
    private List<CoreLabel> nextList;

    /** Zero-based index of the line being processed, used in diagnostics. */
    private int lineId = 0;

    /**
     * Opens the file as UTF-8 and pre-reads the first sentence. A failure to open the
     * file is reported on stderr and leaves the iterator empty.
     *
     * @param filename path of the Morfette output file
     */
    public MorfetteFileIterator(String filename) {
      try {
        reader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "UTF-8"));
        primeNext();
      } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
      } catch (FileNotFoundException e) {
        e.printStackTrace();
      }
    }

    /**
     * Reads lines up to the next sentence boundary and stores the resulting sentence in
     * {@link #nextList}. Sets {@link #nextList} to {@code null} and closes the reader
     * when the end of the file is reached or an I/O error occurs.
     *
     * @throws RuntimeException if a non-blank line does not consist of exactly three
     *         whitespace-separated fields
     */
    private void primeNext() {
      try {
        nextList = new ArrayList<CoreLabel>(40);
        for (String line; (line = reader.readLine()) != null; ++lineId) {
          line = line.trim();
          if (line.equals("")) {
            if (nextList.isEmpty()) {
              // A leading or repeated blank line is not a sentence boundary. Skipping it
              // keeps an empty buffer from being mistaken for the end of the file.
              continue;
            }
            // break bypasses the loop's own increment, so count the blank line here.
            ++lineId;
            break;
          }
          String[] toks = line.split("\\s+");
          if (toks.length != 3) {
            System.err.println(toks.length);
            System.err.println(line);
            System.err.println(lineId);
            throw new RuntimeException(String.format("line %d: Morfette format is |word lemma tag|: |%s|", lineId, line));
          }
          CoreLabel cl = new CoreLabel();
          String word = toks[0];
          String lemma = toks[1];
          String tag = toks[2];
          cl.setWord(word);
          cl.setValue(word);
          cl.setLemma(lemma);
          cl.setTag(tag);
          nextList.add(cl);
        }
        // Blank lines never end an empty sentence, so an empty buffer here can only
        // mean that readLine() returned null: the file is exhausted.
        if (nextList.isEmpty()) {
          reader.close();
          nextList = null;
        }
      } catch (IOException e) {
        System.err.printf("Problem reading file at line %d%n", lineId);
        e.printStackTrace();
        nextList = null;
        close();
      }
    }

    @Override
    public boolean hasNext() {
      return nextList != null;
    }

    /**
     * Returns the pre-read sentence and reads the following one.
     *
     * @throws NoSuchElementException if no sentence is left
     */
    @Override
    public List<CoreLabel> next() {
      if (!hasNext()) {
        throw new NoSuchElementException();
      }
      List<CoreLabel> next = nextList;
      primeNext();
      return next;
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }

    /**
     * Releases the underlying reader. Safe to call when the file was never opened or
     * has already been closed; a failure to close is reported on stderr.
     */
    public void close() {
      if (reader == null) {
        return;
      }
      try {
        reader.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }

  /**
   * Command-line entry point. Prints a usage message and exits with status -1 unless
   * exactly two arguments are given.
   *
   * @param args {@code args[0]} is the tree file, {@code args[1]} the Morfette output file
   * @throws RuntimeException if a tree and its analysis differ in length, or if the
   *         Morfette file contains a malformed line
   */
  public static void main(String[] args) {
    if (args.length != 2) {
      System.err.printf("Usage: java %s tree_file morfette_tnt_file%n", MungeTreesWithMorfetteAnalyses.class.getName());
      System.exit(-1);
    }
    String treeFile = args[0];
    String morfetteFile = args[1];
    TreeReaderFactory trf = new FrenchTreeReaderFactory();
    TreeReader tr = null;
    MorfetteFileIterator morfetteItr = null;
    try {
      tr = trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
      morfetteItr = new MorfetteFileIterator(morfetteFile);

      // Set when a tree has been read for which no analysis is left.
      boolean treeWithoutAnalysis = false;
      int treeId = 0;
      for (Tree tree; (tree = tr.readTree()) != null; ++treeId) {
        if (!morfetteItr.hasNext()) {
          treeWithoutAnalysis = true;
          break;
        }
        List<CoreLabel> analysis = morfetteItr.next();
        List<Label> yield = tree.yield();
        int yieldLen = yield.size();
        // Checked explicitly rather than with assert: assertions are disabled by
        // default, and a misaligned pair would otherwise produce wrong leaves.
        if (analysis.size() != yieldLen) {
          throw new RuntimeException(String.format(
              "Tree %d has %d leaves but its Morfette analysis has %d tokens",
              treeId, yieldLen, analysis.size()));
        }
        for (int i = 0; i < yieldLen; ++i) {
          CoreLabel tokenAnalysis = analysis.get(i);
          Label token = yield.get(i);
          String lemma = getLemma(token.value(), tokenAnalysis.lemma());
          String newLeaf = String.format("%s%s%s%s%s", token.value(),
              MorphoFeatureSpecification.MORPHO_MARK,
              lemma,
              MorphoFeatureSpecification.LEMMA_MARK,
              tokenAnalysis.tag());
          ((CoreLabel) token).setValue(newLeaf);
        }
        System.out.println(tree.toString());
      }

      if (treeWithoutAnalysis || morfetteItr.hasNext()) {
        System.err.println("WARNING: Uneven input files!");
      }
    } catch (UnsupportedEncodingException e) {
      e.printStackTrace();
    } catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      // Release both inputs on every exit path, including the exceptional ones.
      if (morfetteItr != null) {
        morfetteItr.close();
      }
      if (tr != null) {
        try {
          tr.close();
        } catch (IOException e) {
          e.printStackTrace();
        }
      }
    }
  }

  /** Matches tokens that consist solely of (ASCII) punctuation characters. */
  private static final Pattern pIsPunct = Pattern.compile("\\p{Punct}+");

  /**
   * Matches tokens that consist solely of uppercase letters. Uses the Unicode category
   * Lu so that accented capitals are covered, in line with the
   * {@link Character#isUpperCase(char)} test applied to the first character.
   */
  private static final Pattern pAllUpper = Pattern.compile("\\p{Lu}+");

  /**
   * Chooses the lemma to store in a leaf, adapting the predicted lemma to the surface
   * form of the token.
   *
   * @param rawToken the token as it appears in the tree
   * @param lemma the predicted lemma
   * @return {@code rawToken} if it is a bracket symbol ({@code -LRB-}/{@code -RRB-}),
   *         pure punctuation or all uppercase; otherwise {@code lemma}, with its first
   *         character uppercased when {@code rawToken} starts with an uppercase character
   */
  private static String getLemma(String rawToken, String lemma) {
    if (rawToken.isEmpty()) {
      // No surface form to take the casing from.
      return lemma;
    }
    boolean isParen = rawToken.equals("-RRB-") || rawToken.equals("-LRB-");
    boolean isPunc = pIsPunct.matcher(rawToken).matches();
    boolean isAllUpper = pAllUpper.matcher(rawToken).matches();
    if (isParen || isPunc || isAllUpper) {
      return rawToken;
    }
    if (!lemma.isEmpty() && Character.isUpperCase(rawToken.charAt(0))) {
      return Character.toUpperCase(lemma.charAt(0)) + lemma.substring(1);
    }
    return lemma;
  }
}