package org.languagetool.dev.conversion;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.*;
import org.languagetool.dev.conversion.cg.CgCompositeTag;
import org.languagetool.dev.conversion.cg.CgContextualTest;
import org.languagetool.dev.conversion.cg.CgContextualTest.POS;
import org.languagetool.dev.conversion.cg.CgGrammar;
import org.languagetool.dev.conversion.cg.CgRule;
import org.languagetool.dev.conversion.cg.CgSet;
import org.languagetool.dev.conversion.cg.CgTag;
import org.languagetool.dev.conversion.cg.CgTextualParser;
public class CgRuleConverter extends RuleConverter {
private CgGrammar grammar;
private String[] lines;
private static String tagDelimiter = ":";
// basic constructor
public CgRuleConverter() {
super();
}
public CgRuleConverter(String infile, String outfile, String specificFiletype) {
super(infile, outfile, specificFiletype);
}
// Get methods
public CgGrammar getGrammar() {return this.grammar;}
// Set methods
public void setGrammar(CgGrammar grammar) {this.grammar = grammar;}
public void setTagDelimiter(String td) {tagDelimiter = td;}
@Override
public void parseRuleFile() throws IOException {
parseCgFile(); // builds the grammar
List<CgRule> ruleList = new ArrayList<>();
for (CgRule rule : grammar.rule_by_number) {
ruleList.add(rule);
}
ruleObjects = ruleList;
ltRules = new ArrayList<>();
allLtRules = new ArrayList<>();
disambiguationRules = new ArrayList<>();
originalRuleStrings = new ArrayList<>();
warnings = new ArrayList<>();
for (Object ruleObject : ruleObjects) {
CgRule cgrule = (CgRule) ruleObject;
List<String> ruleAsList = ltRuleAsList(cgrule,generateId(ruleObject),generateName(ruleObject),cgrule.type.name());
disambiguationRules.add(ruleAsList);
allLtRules.add(ruleAsList);
originalRuleStrings.add(lines[cgrule.line]);
}
}
@Override
public boolean isDisambiguationRule(Object ruleObject) {
return true; // all cg rules are disambiguation rules
}
public void parseCgFile() throws IOException {
File file = new File(inFileName);
grammar = new CgGrammar();
CgTextualParser parser = new CgTextualParser(grammar, file);
int result = parser.parse_grammar_from_file(inFileName, null, null);
if (result == 0) {
//System.out.println("Successfully parsed constraint grammar file " + inFileName);
} else {
System.err.println("Failed to parse constraint grammar file " + inFileName);
}
getGrammarFileLines(inFileName);
}
/**
* Grabs the original lines of the CG grammar file. Mainly for retrieving the original rule strings to include in comments.
* @param filename
*/
public void getGrammarFileLines(String filename) {
BufferedReader reader = null;
StringBuilder sb = new StringBuilder();
String inArray = "";
// put some buffer space at the beginning, just to be sure
sb.append(" ");
try {
reader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "UTF-8"));
int c = reader.read();
while (c != -1) {
sb.append((char)c);
c = reader.read();
}
inArray = sb.toString();
reader.close();
} catch (IOException e) {
System.err.println("Error opening grammar file");
System.exit(1);
}
String[] lines = inArray.split("\n");
this.lines = lines;
}
/**
* Takes a single {@link CgRule} and converts it into a list of lines of a LT rule.
* Sometimes the CG rule has to be split into several LT rules. In this case, the rule
* is added as a bunch of rules in a rulegroup
*/
@Override
public List<String> ltRuleAsList(Object ruleObject, String id, String name, String type) {
CgRule rule = (CgRule)ruleObject;
type = rule.type.name(); // like K_SELECT or K_REMOVE
List<String> ltRule = new ArrayList<>();
List<String> currentWarnings = new ArrayList<>();
String cgRuleString = lines[rule.line];
ltRule.add("<!-- " + cgRuleString + " -->");
// ArrayList<CgRule> rules = splitUnificationRule(mainRule, grammar);
// for (CgRule rule : rules) {
ArrayList<Token> tokensList = new ArrayList<>();
List<ArrayList<Token>> outerList = new ArrayList<>(); // in case we need to split the rule into several rules
ArrayList<Token[]> processedLists = new ArrayList<>();
CgSet targetSet = expandSetSets(grammar.getSet(rule.target));
Token target = new Token(targetSet,false,0,false,false,new CgSet(),false,0,false);
if (!isOrCompatible(target)) {
System.err.println("Target for rule on line " + rule.line + " cannot be represented as one LT rule. Consider rewriting it.");
return new ArrayList<>();
}
tokensList.add(target);
List<CgContextualTest> sortedTestsHeads = new ArrayList<>();
// puts the parent test at the end, so it's processed last
for (CgContextualTest test : rule.test_heads) {
if (test.isParentTest()) {
sortedTestsHeads.add(test);
} else {
sortedTestsHeads.add(0, test);
}
}
for (CgContextualTest test : sortedTestsHeads) {
if (test.isNormalTest()) {
Token testToken = getTokenFromNormalTest(test);
tokensList.add(testToken);
// only accounts for a single branching (i.e. one parent test)
} else if (test.isParentTest()) {
if (!outerList.isEmpty()) {
System.err.println("Can't have two parent tests in one test on line " + rule.line + "\nTry splitting it up.");
System.exit(1);
}
for (int testInt : test.ors) {
ArrayList<Token> newTokenList = copyTokenList(tokensList);
CgContextualTest childTest = rule.test_map.get(testInt);
if (childTest.isNormalTest()) {
Token childTestToken = getTokenFromNormalTest(childTest);
newTokenList.add(childTestToken);
} else if (childTest.isLinkedTest()) {
ArrayList<CgContextualTest> linkedTests = new ArrayList<>();
CgContextualTest curTest = childTest;
while (curTest.next != 0) { // while there are still more tests to link to
linkedTests.add(curTest);
curTest = rule.test_map.get(curTest.next);
}
linkedTests.add(curTest); // add the last linked test
Token headLinkedToken = getLinkedTokens(linkedTests); // modifies the offsets for the linked tests
newTokenList.add(headLinkedToken);
}
outerList.add(newTokenList);
}
} else if (test.isLinkedTest()) {
// add all the linked tests to a list
ArrayList<CgContextualTest> linkedTests = new ArrayList<>();
CgContextualTest curTest = test;
while (curTest.next != 0) { // while there are still more tests to link to
linkedTests.add(curTest);
curTest = rule.test_map.get(curTest.next);
}
linkedTests.add(curTest); // add the last linked test
Token headLinkedToken = getLinkedTokens(linkedTests); // modifies the offsets for the linked tests
tokensList.add(headLinkedToken);
}
}
// if the outerList is empty, we haven't had a parent test, so we can just add the tokensList to the outerList and process it
if (outerList.isEmpty()) {
outerList.add(tokensList);
}
// pre-process/split all the tests
// they come off outerList and go back onto processedLists
// first split off the special case of the negative backward barrier scan
for (int i=0;i<outerList.size();i++) {
Token[] tokens = outerList.get(i).toArray(new Token[outerList.get(i).size()]);
if (negativeBackwardBarrierScan(tokens)) {
Iterable<List<Token>> split = splitNegativeBackwardBarrierScan(tokens);
outerList.remove(i);
for (List<Token> splitList : split) {
outerList.add(i, new ArrayList<>(splitList));
i++;
}
}
}
for (int i=0;i<outerList.size();i++) {
Token[] tokens = outerList.get(i).toArray(new Token[outerList.get(i).size()]);
Arrays.sort(tokens);
tokens = addGapTokens(tokens);
if (skipSafe(tokens)) {
tokens = addSkipTokens(tokens);
tokens = resolveLinkedTokens(tokens);
if (!singleRuleCompatible(tokens)) {
Iterable<List<Token>> singleRuleCompatibleTokens = splitForSingleRule(tokens);
for (List<Token> srctl : singleRuleCompatibleTokens) {
Token[] srcta = srctl.toArray(new Token[srctl.size()]);
processedLists.add(srcta);
}
} else {
processedLists.add(tokens);
}
}
else {
List<List<Token>> splitTokenLists = getSkipSafeTokens(tokens);
for (int j=0;j<splitTokenLists.size();j++) {
Token[] indSplitTokenList = splitTokenLists.get(j).toArray(new Token[splitTokenLists.get(j).size()]);
indSplitTokenList = addSkipTokens(indSplitTokenList);
indSplitTokenList = resolveLinkedTokens(indSplitTokenList);
Arrays.sort(indSplitTokenList);
indSplitTokenList = addGapTokens(indSplitTokenList);
if (!singleRuleCompatible(indSplitTokenList)) {
Iterable<List<Token>> singleRuleCompatibleTokens = splitForSingleRule(indSplitTokenList);
for (List<Token> srctl : singleRuleCompatibleTokens) {
Token[] srcta = srctl.toArray(new Token[srctl.size()]);
processedLists.add(srcta);
}
} else {
processedLists.add(indSplitTokenList);
}
}
}
}
// the actual rule generation
if (processedLists.size() == 1) {
Token[] tokens = processedLists.get(0);
List<String> ltRule2 = getRuleByType(targetSet, tokens, rule, id, name, type);
ltRule.addAll(ltRule2);
} else {
ltRule.add("<rulegroup name=\"" + generateName(ruleObject) + "\">");
for (Token[] tokens : processedLists) {
List<String> ltRule2 = getRuleByType(targetSet, tokens, rule, null, null, type);
ltRule.addAll(ltRule2);
}
ltRule.add("</rulegroup>");
}
// }
warnings.add(currentWarnings.toArray(new String[currentWarnings.size()]));
return ltRule;
}
// ** METHODS THAT SPLIT A RULE INTO MULTIPLE RULES **
/**
* For if there's multiple surface/base forms and postags in a single token that we can't "or" together
* in one LT token. E.g. if a token includes ("man" or "woman" or NN or NNP), it'd have to be split.
*/
public List<List<Token>> splitForSingleRule(Token[] tokens) {
List<List<Token>> list = new ArrayList<>();
List<Token> tokenList = new ArrayList<>(Arrays.asList(tokens));
list.add(tokenList);
boolean notdone = true;
while (notdone) {
for (int i=0;i<list.size();i++) {
List<Token> insideList = list.get(i);
if (singleRuleCompatible(insideList.toArray(new Token[insideList.size()]))) {
if (i == list.size()-1) notdone = false;
continue;
} else {
list.remove(i);
Iterable<List<Token>> splitTokens = splitListForSingleRule(insideList);
for (List<Token> ind : splitTokens) {
list.add(ind);
}
break;
}
}
}
return list;
}
/**
* Actually performs the splitting for wrapper method splitForSingleRule. Only performs a split for a single token.
*/
public List<List<Token>> splitListForSingleRule(List<Token> tokens) {
ArrayList<List<Token>> list = new ArrayList<>();
final List<Token> firstList = new ArrayList<>();
list.add(firstList);
int i;
for (i=0;i<tokens.size();i++) {
if (isOrCompatible(tokens.get(i))) {
firstList.add(tokens.get(i));
} else {
list.remove(firstList);
Iterable<CgSet> newSets = splitCgSet(tokens.get(i).target);
for (CgSet set : newSets) {
Token newToken = new Token(tokens.get(i));
newToken.target = expandSetSets(set);
newToken.postags = newToken.target.getPostagsString();
newToken.baseforms = newToken.target.getSingleTagBaseformsString();
newToken.surfaceforms = newToken.target.getSingleTagSurfaceformsString();
newToken.compositeTags = newToken.target.getCompositeTags();
// clone the first list
List<Token> newList = new ArrayList<>();
for (Token token : firstList) {
newList.add(new Token(token));
}
// add the new token
newList.add(newToken);
// add the new list
list.add(newList);
}
break;
}
}
// finish up the list
for (int j=i+1;j<tokens.size();j++) {
for (int k=0;k<list.size();k++) {
List<Token> insideList = list.get(k);
insideList.add(tokens.get(j));
list.set(k,insideList);
}
}
return list;
}
/**
* Splits off part of a CgSet that can't be represented in a single LT token
*/
public List<CgSet> splitCgSet(CgSet target) {
// setting up the lists to perform the check
List<CgSet> newSets = new ArrayList<>();
CgTag[] postags = target.getSingleTagPostags();
CgTag[] baseforms = target.getSingleTagBaseforms();
CgTag[] surfaceforms = target.getSingleTagSurfaceforms();
CgCompositeTag[] compositePostags = target.getCompositePostags();
// actually checking and doing the splitting
if (postags.length > 0 && baseforms.length > 0) {
CgSet set1 = new CgSet(target);
CgSet set2 = new CgSet(set1);
set1.single_tags.removeAll(Arrays.asList(postags));
set1.tags.removeAll(Arrays.asList(compositePostags));
set2.single_tags.removeAll(Arrays.asList(baseforms));
newSets.add(set1);
newSets.add(set2);
return newSets;
}
if (postags.length > 0 && surfaceforms.length > 0) {
CgSet set1 = new CgSet(target);
CgSet set2 = new CgSet(target);
set1.single_tags.removeAll(Arrays.asList(postags));
set1.tags.removeAll(Arrays.asList(compositePostags));
set2.single_tags.removeAll(Arrays.asList(surfaceforms));
newSets.add(set1);
newSets.add(set2);
return newSets;
}
if (surfaceforms.length > 0 && baseforms.length > 0) {
CgSet set1 = new CgSet(target);
CgSet set2 = new CgSet(target);
set1.single_tags.removeAll(Arrays.asList(surfaceforms));
set2.single_tags.removeAll(Arrays.asList(baseforms));
newSets.add(set1);
newSets.add(set2);
return newSets;
}
// if we didn't catch the culprit in the single tags, it must be in the composite tags,
// which means that there exists two composite tags that have different types of tags in them.
// I could try to do this in a principled way, or I could just split off each composite tag.
// This seems like the better idea for now.
newSets = groupCompositeTags(target);
/*
for (CgCompositeTag ctag : compositeTags) {
CgSet set1 = new CgSet(target);
CgSet set2 = new CgSet(target);
set1.tags.remove(ctag);
set2.tags.removeAll(Arrays.asList(compositeTags));
set2.tags.removeAll(Arrays.asList(compositePostags));
set2.single_tags = new HashSet<CgTag>();
set2.tags.add(ctag);
twoSets.add(set1);
twoSets.add(set2);
return twoSets;
}
*/
// it should never get to here, because it should never get a set that doesn't need to be split passed to it.
return newSets;
}
/**
* Groups the composite tags along lines that can be represented in a single LT rule
*/
@SuppressWarnings("unchecked")
private List<CgSet> groupCompositeTags(CgSet target) {
HashMap<String,ArrayList<CgCompositeTag>> bf = new HashMap<>();
Map<String, ArrayList<CgCompositeTag>> sf = new HashMap<>();
Map<String, CgCompositeTag> dict = new HashMap<>(); // dictionary of sorts
for (CgCompositeTag ctag : target.tags) {
CgCompositeTag postags = new CgCompositeTag();
CgCompositeTag baseforms = new CgCompositeTag();
CgCompositeTag surfaceforms = new CgCompositeTag();
for (CgTag tag : ctag.tags) {
if (isBaseForm(tag.tag)) {
baseforms.addTag(tag);
} else if (isSurfaceForm(tag.tag)) {
surfaceforms.addTag(tag);
} else if (isPostag(tag.tag)) {
postags.addTag(tag);
}
}
if (!postags.isEmpty()) {
if (!baseforms.isEmpty()) {
bf = (HashMap)smartPut(bf, postags.toString(), baseforms);
}
else if (!surfaceforms.isEmpty()) { // assumes there won't be both sf and bf in the same composite tag
sf = (HashMap)smartPut(sf, postags.toString(), surfaceforms);
}
}
dict.put(postags.toString(), postags);
}
List<CgSet> ret = new ArrayList<>();
for (String postagSet : bf.keySet()) {
CgSet newSet = new CgSet(target);
newSet.tags = new HashSet<>();
Iterable<CgCompositeTag> bfs = bf.get(postagSet);
for (CgCompositeTag singleBf : bfs) {
CgCompositeTag newTotalTag = new CgCompositeTag();
for (CgTag tag : dict.get(postagSet).tags) {
newTotalTag.addTag(tag);
}
for (CgTag tag : singleBf.tags) {
newTotalTag.addTag(tag);
}
newSet.addCompositeTag(newTotalTag);
}
ret.add(newSet);
}
return ret;
}
/**
* returns separate Token arrays, each of which is safe for dealing with scanning tokens
* and each of which will be a separate rule. This applies when we have a scanning token
* before a non-scanning token, offset wise
*/
public List<List<Token>> getSkipSafeTokens(Token[] tokens) {
List<List<Token>> list = new ArrayList<>();
List<Token> tokenList = Arrays.asList(tokens);
list.add(tokenList);
boolean notdone = true;
while (notdone) {
for (int i=0;i<list.size();i++) {
List<Token> insideList = list.get(i);
if (skipSafe(insideList.toArray(new Token[insideList.size()]))) {
if (i == list.size()-1) {
notdone = false;
}
} else {
list.remove(i);
Iterable<List<Token>> splitTokens = splitOutSkipTokens(insideList);
for (List<Token> isl : splitTokens) {
list.add(isl);
}
break;
}
}
}
return list;
}
/**
* Actually does the splitting for wrapper method getSkipSafeTokens
*/
public List<List<Token>> splitOutSkipTokens(List<Token> tokens) {
ArrayList<List<Token>> list = new ArrayList<>();
ArrayList<Token> scanningTokens = new ArrayList<>();
ArrayList<Token> reverseScanningTokens = new ArrayList<>();
ArrayList<Token> normalTokens = new ArrayList<>();
for (Token token : tokens) {
if (token.scanahead) scanningTokens.add(token);
else if (token.scanbehind) reverseScanningTokens.add(token);
else normalTokens.add(token);
}
// forward scans
for (int s=0;s<scanningTokens.size();s++) {
final Token scanning = scanningTokens.get(s);
for (int n=0;n<normalTokens.size();n++) {
final Token normal = normalTokens.get(n);
if (normal.offset >= scanning.offset) {
List<Token> newTokenList1 = new ArrayList<>();
List<Token> newTokenList2 = new ArrayList<>();
for (Token ntoken : normalTokens) {
newTokenList1.add(ntoken);
newTokenList2.add(ntoken);
}
Token newNormalToken = new Token(scanning);
newNormalToken.scanahead = false;
newTokenList1.add(newNormalToken);
scanning.offset++;
newTokenList2.add(scanning);
list.add(newTokenList1);
list.add(newTokenList2);
return list;
}
}
}
// backward scans
for (int s=0;s<reverseScanningTokens.size();s++) {
final Token scanning = reverseScanningTokens.get(s);
for (int n=0;n<normalTokens.size();n++) {
final Token normal = normalTokens.get(n);
if (normal.offset <= scanning.offset) {
List<Token> newTokenList1 = new ArrayList<>();
List<Token> newTokenList2 = new ArrayList<>();
for (Token ntoken : normalTokens) {
newTokenList1.add(ntoken);
newTokenList2.add(ntoken);
}
Token newNormalToken = new Token(scanning);
newNormalToken.scanbehind = false;
newTokenList1.add(newNormalToken);
scanning.offset--;
newTokenList2.add(scanning);
list.add(newTokenList1);
list.add(newTokenList2);
return list;
}
}
}
return null;
}
/**
* Handles special case of negative backwards barrier scan (e.g. (NOT -1* Verb BARRIER CLB));
*/
public List<List<Token>> splitNegativeBackwardBarrierScan(Token[] tokens) {
ArrayList<List<Token>> list = new ArrayList<>();
List<Token> newTokenList1 = new ArrayList<>();
List<Token> newTokenList2 = new ArrayList<>();
int index=0;
for (index = 0;index<tokens.length;index++) {
if (tokens[index].scanbehind && tokens[index].negate && !tokens[index].barrier.isEmpty()) {
Token newToken = new Token(tokens[index]);
newToken.barrier = new CgSet();
newTokenList1.add(tokens[index]);
newTokenList2.add(newToken);
break;
} else {
newTokenList1.add(tokens[index]);
newTokenList2.add(tokens[index]);
}
}
// finish off the rest
for (index=index+1;index<tokens.length;index++) {
newTokenList1.add(tokens[index]);
newTokenList2.add(tokens[index]);
}
list.add(newTokenList1);
list.add(newTokenList2);
return list;
}
// // takes a rule that has unification tags (e.g. $$NUMBER) and splits it into several easier to handle rules
// public ArrayList<CgRule> splitUnificationRule(CgRule rule, CgGrammar grammar) {
// ArrayList<CgRule> rules = new ArrayList<CgRule>();
// // go over all the tests in the test_map, which should contain all tests for the entire rule, and every time you see a
// // unification tag, put in one of its component parts and add that modified rule to the list of new rules.
// HashSet<CgSet> unifyingSets = new HashSet<CgSet>();
// // add all the unifying sets in the rule
// for (Iterator<Integer> iter = rule.test_map.keySet().iterator(); iter.hasNext(); ) {
// CgContextualTest curTest = rule.test_map.get(iter.next());
// CgSet target = grammar.getSet(curTest.target);
// for (Integer setint : target.sets) {
// CgSet set = grammar.getSet(setint);
// if (set.type.contains(ST.ST_TAG_UNIFY.value)) {
// unifyingSets.add(set);
// }
// }
// }
// CgSet targetSet = grammar.getSet(rule.target);
// for (int setint : targetSet.sets) {
// if (grammar.getSet(setint).type.contains(ST.ST_TAG_UNIFY.value)) {
// unifyingSets.add(grammar.getSet(setint));
// }
// }
// // if no unifying sets, just return the rule
// if (unifyingSets.size() == 0) {
// rules.add(rule);
// }
// for (CgSet unifyingSet : unifyingSets) {
// CgSet unifyingSetExpanded = expandSetSets(unifyingSet);
// for (CgTag tag : unifyingSetExpanded.single_tags) {
// CgSet oldTargetSet = new CgSet(grammar.getSet(rule.target));
// CgRule newRule = new CgRule(rule);
// if (oldTargetSet.sets.contains(unifyingSet.hash)) {
// oldTargetSet.sets.remove((Object)unifyingSet.hash);
// oldTargetSet.single_tags.add(tag);
// oldTargetSet.rehash();
// grammar.addSet(oldTargetSet);
// newRule.target = oldTargetSet.hash;
// }
//
// for (Iterator<Integer> iter = newRule.test_map.keySet().iterator(); iter.hasNext();) {
// int testKey = iter.next();
// CgContextualTest test = newRule.test_map.get(testKey);
// CgSet oldTestTargetSet = new CgSet(grammar.getSet(test.target));
// if (oldTestTargetSet.sets.contains(unifyingSet.hash)) {
// oldTestTargetSet.sets.remove(unifyingSet);
// oldTestTargetSet.single_tags.add(tag);
// oldTestTargetSet.rehash();
// grammar.addSet(oldTestTargetSet);
// test.target = oldTestTargetSet.hash;
// }
// newRule.test_map.put(testKey, test);
// }
// rules.add(newRule);
// }
// }
// return rules;
// }
// ** METHODS THAT MODIFY A SINGLE TOKEN LIST **
/**
* Expands the tests in a linked test to multiple tokens
*/
public Token[] resolveLinkedTokens(Token[] tokens) {
List<Token> tokenList = new ArrayList<>(Arrays.asList(tokens));
boolean notdone = true;
while (notdone) {
for (int i=0;i<tokenList.size();i++) {
Token curToken = tokenList.get(i);
if (curToken.nextToken != null) {
Token tempToken = new Token(curToken.nextToken);
tempToken.offset = curToken.offset + tempToken.relativeOffset; // to fix the offsets
tokenList.add(i+1, tempToken);
Token temp2 = new Token(curToken);
temp2.nextToken = null;
tokenList.set(i, temp2);
break;
} else {
if (i == tokenList.size()-1) notdone = false;
}
}
}
return tokenList.toArray(new Token[tokenList.size()]);
}
/**
* contains the list of linked CgContextualTests, to translate into Token format
* returns only the head of the linked tokens, have to iterate through them later when you add them
*/
public Token getLinkedTokens(ArrayList<CgContextualTest> tests) {
ArrayList<Token> tokens = new ArrayList<>();
for (int i=0;i<tests.size();i++) {
if (i == 0) {
// this kind of assumes that it won't be a parent test
tokens.add(getTokenFromNormalTest(tests.get(i)));
} else {
Token token = getTokenFromNormalTest(tests.get(i));
token.relativeOffset = token.offset;
token.offset = token.offset + tokens.get(i-1).offset;
token.prevToken = tokens.get(i-1);
tokens.add(token);
}
}
// sort tokens by offset and rearrange them
Token[] ts = tokens.toArray(new Token[tokens.size()]);
Arrays.sort(ts);
// this assumes there's only one scan in the linked tests
// the rationale here is that if it switches positions of tokens, and the
// one that's a scan token gets pushed further back in the list, its scan flag will go back to the front.
for (int i=0;i<ts.length;i++) {
if (ts[i].scanahead) {
ts[i].scanahead = false;
ts[0].scanahead = true;
break;
}
}
// check to make sure there's not another scan in the linked tests:
for (int i=1;i<ts.length;i++) {
if (ts[i].scanahead) {
System.err.println("Two scan tests in one series of linked tests. This is really hard to represent in LT format. Try to split it into several rules");
System.exit(1);
}
}
ts = addLinkedGapTokens(ts);
for (int i=0;i<ts.length;i++) {
if (i == 0) {
ts[i].relativeOffset = 0;
} else {
ts[i].relativeOffset = 1;
}
if (i != ts.length - 1) ts[i].nextToken = ts[i+1];
if (i != 0) ts[i].prevToken = ts[i-1];
}
return ts[0];
}
/**
* Adds gap tokens in the case of a linked (sub) list of tokens
*/
public Token[] addLinkedGapTokens(Token[] ts) {
ArrayList<Token> tokens = new ArrayList<>(Arrays.asList(ts));
boolean notdone = true;
while (notdone) {
for (int i=0;i<tokens.size();i++) {
if (i==0) continue;
else if (i > 0 && i < tokens.size()) {
if (tokens.get(i).offset == tokens.get(i-1).offset || tokens.get(i).offset == (tokens.get(i-1).offset + 1)) {
if (i == tokens.size() - 1) notdone = false;
continue;
}
else {
Token newToken = new Token(new CgSet(), false, tokens.get(i-1).offset + 1, false, false, new CgSet(), false, 0, false);
newToken.relativeOffset = tokens.get(i-1).relativeOffset + 1;
Token oldToken = tokens.get(i-1);
oldToken.relativeOffset = -1;
tokens.set(i-1, oldToken);
tokens.add(i,newToken);
break;
}
}
if (i == tokens.size() - 1) {
notdone = false;
}
}
}
return tokens.toArray(new Token[tokens.size()]);
}
/**
* only gets called if it's safe to add skip tokens in a straightforward manner, otherwise, we split them out
* if it's a negative scanning string (NOT 1* Verb BARRIER CLB), then the exception string is the target
* if it's a positive scanning string (1* Verb BARRIER CLB), then the exception string is the barrier and the end of sentence
* it always goes in the previous token with scope next
*/
public Token[] addSkipTokens(Token[] tokens) {
ArrayList<Token> tokenList = new ArrayList<>(Arrays.asList(tokens));
for (int i=0;i<tokenList.size();i++) {
// forward scans (1* Verb)
if (tokenList.get(i).scanahead) {
if (i == 0) {
Token newToken = new Token(new CgSet(), false, tokenList.get(i).offset - 1, false, false, new CgSet(), false, -1, false);
if (!tokenList.get(i).barrier.isEmpty() || tokenList.get(i).negate) {
newToken.exceptionString = getBarrierExceptionStringFromToken(tokenList.get(i));
}
Token oldToken = tokenList.get(i);
// if it's a negative scan (NOT 1* Noun), then the target of the next token becomes the barrier + SENT_END
if (oldToken.negate) {
CgSet newTarget = oldToken.barrier;
CgTag sentEndTag = new CgTag();
sentEndTag.tag = SENT_END;
newTarget.single_tags.add(sentEndTag);
oldToken.target = newTarget;
oldToken.postags = oldToken.target.getPostagsString();
oldToken.baseforms = oldToken.target.getSingleTagBaseformsString();
oldToken.surfaceforms = oldToken.target.getSingleTagSurfaceformsString();
oldToken.compositeTags = oldToken.target.getCompositeTags();
tokenList.set(0, oldToken);
}
tokenList.add(0, newToken);
} else {
int index = i-1;
String exceptionString = null;
if (!tokenList.get(i).barrier.isEmpty() || tokenList.get(i).negate) {
exceptionString = getBarrierExceptionStringFromToken(tokenList.get(i));
}
int prevOffset = tokenList.get(index).offset;
while (index >= 0 && tokenList.get(index).offset == prevOffset) {
Token prevToken = tokenList.get(index);
prevToken.skip = -1;
prevToken.exceptionString = exceptionString;
tokenList.set(index, prevToken);
index--;
}
Token oldToken = tokenList.get(i);
if (oldToken.negate) {
CgSet newTarget = oldToken.barrier;
CgTag sentEndTag = new CgTag();
sentEndTag.tag = SENT_END;
newTarget.single_tags.add(sentEndTag);
oldToken.target = newTarget;
oldToken.postags = oldToken.target.getPostagsString();
oldToken.baseforms = oldToken.target.getSingleTagBaseformsString();
oldToken.surfaceforms = oldToken.target.getSingleTagSurfaceformsString();
oldToken.compositeTags = oldToken.target.getCompositeTags();
oldToken.negate = false;
tokenList.set(i,oldToken);
}
}
}
// reverse scans (-1* Verb)
else if (tokenList.get(i).scanbehind) {
Token newToken = new Token(new CgSet(), false, tokenList.get(i).offset - 1, false, false, new CgSet(), false, -1, false);
String exceptionString = null;
if (!tokenList.get(i).barrier.isEmpty() || tokenList.get(i).negate) {
exceptionString = getBarrierExceptionStringFromToken(tokenList.get(i));
}
CgSet newTarget = newToken.target;
CgTag sentStartTag = new CgTag();
sentStartTag.tag = SENT_START;
newTarget.single_tags.add(sentStartTag);
newToken.target = newTarget;
newToken.postags = newToken.target.getPostagsString();
newToken.baseforms = newToken.target.getSingleTagBaseformsString();
newToken.surfaceforms = newToken.target.getSingleTagSurfaceformsString();
newToken.compositeTags = newToken.target.getCompositeTags();
newToken.skip = -1;
Token oldToken = tokenList.get(i);
oldToken.skip = -1;
oldToken.scanbehind = false;
// if there's no barrier
if (oldToken.barrier.isEmpty()) {
if (oldToken.negate) {
newToken.exceptionString = exceptionString;
newToken.offset++;
tokenList.set(i, newToken);
} else {
tokenList.set(i, oldToken);
tokenList.add(i, newToken);
i++;
}
}
// if there IS a barrier
else {
if (oldToken.negate) {
oldToken.target = oldToken.barrier;
oldToken.postags = oldToken.target.getPostagsString();
oldToken.baseforms = oldToken.target.getSingleTagBaseformsString();
oldToken.surfaceforms = oldToken.target.getSingleTagSurfaceformsString();
oldToken.compositeTags = oldToken.target.getCompositeTags();
oldToken.exceptionString = exceptionString;
tokenList.set(i,oldToken);
tokenList.add(i,newToken);
} else {
oldToken.exceptionString = exceptionString;
tokenList.set(i, oldToken);
tokenList.add(i, newToken);
i++;
}
}
}
}
return tokenList.toArray(new Token[tokenList.size()]);
}
/**
* For cases where there needs to be an empty token inserted in order to make a proper LT pattern
* e.g. REMOVE N IF (1 Verb) (3 Det);
*/
public Token[] addGapTokens(Token[] tokens) {
boolean notdone = true;
ArrayList<Token> tokenList = new ArrayList<>(Arrays.asList(tokens));
while (notdone) {
for (int i=0;i<tokenList.size();i++) {
if (i == 0) continue;
else if (i > 0 && i < tokenList.size()){
if (tokenList.get(i).offset == tokenList.get(i-1).offset) {
if (i == tokenList.size()-1) notdone = false;
continue;
}
if (tokenList.get(i).offset != (tokenList.get(i-1).offset + 1) && tokenList.get(i).prevToken == null) {
tokenList.add(i, new Token(new CgSet(), false, tokenList.get(i-1).offset + 1, false, false, new CgSet(), false, 0, false));
break;
}
}
if (i == tokenList.size() - 1) {
notdone = false;
}
}
}
return tokenList.toArray(new Token[tokenList.size()]);
}
public ArrayList<Token> copyTokenList(Iterable<Token> tokens) {
ArrayList<Token> newList = new ArrayList<>();
for (Token token : tokens) {
newList.add(new Token(token));
}
return newList;
}
public ArrayList<Token> removeExtraEmptyTokens(ArrayList<Token> tokens) {
if (tokens.size() == 1) {
return tokens;
} else {
ArrayList<Token> newTokenList = new ArrayList<>();
for (Token token : tokens) {
if (!token.isEmpty()) {
newTokenList.add(token);
}
}
if (newTokenList.isEmpty()) {
newTokenList.add(tokens.get(0));
}
return newTokenList;
}
}
// ** METHODS THAT CHECK THE PROPERTIES OF A RULE **
/**
* returns if the rule is safe to add the "skip" attribute to the previous token, as we do normally;
* otherwise, splits the tokens and returns multiple rules.
*/
public boolean skipSafe(Token[] tokens) {
Collection<Token> scanningTokens = new HashSet<>();
Collection<Token> reverseScanningTokens = new HashSet<>();
Collection<Token> normalTokens = new HashSet<>();
for (Token token : tokens) {
if (token.scanahead) {
scanningTokens.add(token);
} else if (token.scanbehind) {
reverseScanningTokens.add(token);
} else {
normalTokens.add(token);
}
}
for (Token s : scanningTokens) {
for (Token o : normalTokens) {
if (s.offset <= o.offset) return false;
}
}
for (Token s : reverseScanningTokens) {
for (Token o : normalTokens) {
if (s.offset >= o.offset) return false;
}
}
return true;
}
/**
* returns true of the rule contains an example of a negative backwards barrier scan (NOT -1* Adj BARRIER Noun), e.g.
* these rules have to be treated as special cases
*/
public boolean negativeBackwardBarrierScan(Token[] tokens) {
for (Token token : tokens) {
if (token.scanbehind && token.negate && !token.barrier.isEmpty()) {
return true;
}
}
return false;
}
public boolean singleRuleCompatible(Token[] tokens) {
for (Token token : tokens) {
if (!isOrCompatible(token)) return false;
}
return true;
}
/**
* Returns true if the {@link Token} can be represented in a single LT rule, false otherwise.
* Example: "are"|noun cannot be represented as one LT token
*/
public boolean isOrCompatible(Token token) {
if (token.postags.length > 0 && (token.baseforms.length > 0 || token.surfaceforms.length > 0)) {
return false;
}
if (token.baseforms.length > 0 && token.surfaceforms.length > 0) {
return false;
}
if (token.compositeTags.length > 0 && (token.postags.length > 0 || token.baseforms.length > 0 || token.surfaceforms.length > 0)) {
return false;
}
Collection<String> pos = new HashSet<>();
Collection<String> base = new HashSet<>();
Collection<String> surf = new HashSet<>();
for (CgCompositeTag ctag : token.compositeTags) {
CgCompositeTag postagCompile = new CgCompositeTag();
for (CgTag tag : ctag.tags) {
if (isPostag(tag.tag)) {
postagCompile.addTag(tag);
} else if (isSurfaceForm(tag.tag)) {
surf.add(tag.tag);
} else if (isBaseForm(tag.tag)) {
base.add(tag.tag);
}
}
pos.add(postagCompile.toString());
}
if (pos.size() > 1 && (surf.size() > 1 || base.size() > 1)) {
return false;
}
if (surf.size() > 1 && base.size() > 1) {
return false;
}
if (surf.size() > 0 && base.size() > 0 && pos.size() > 0) {
return false;
}
return true;
}
// ** METHODS THAT ACTUALLY GENERATE THE LT RULES **
/**
* Actual LT rule generation
*/
@SuppressWarnings("unchecked")
public List<String> getRuleByType(CgSet target, Token[] tokens, CgRule rule, String id, String name, String type) {
ArrayList<String> ltRule = new ArrayList<>();
TreeMap<Integer,ArrayList<Token>> tokenmap = new TreeMap<>();
for (Token token : tokens) {
tokenmap = (TreeMap)smartPut(tokenmap,token.offset,token);
}
if (name != null || id != null) {
ltRule.add("<rule id=\"" + id + "\" name=\"" + name + "\">");
} else {
ltRule.add("<rule>");
}
int mark = getPositionOfTarget(tokens);
ltRule.add(firstIndent + "<pattern mark=\"" + mark + "\">");
for (Iterator<Integer> iter = tokenmap.keySet().iterator(); iter.hasNext();) {
int key = iter.next();
ArrayList<Token> value = tokenmap.get(key);
// remove duplicates, so we don't have unnecessary "and"s floating around
value = removeExtraEmptyTokens(value);
if (value.size() == 1) {
Token token = value.get(0);
ltRule = addCgToken(ltRule,token,secondIndentInt);
}
// if the number of tokens at the given offset is more than 1, we have to and them together
else {
ltRule.add(secondIndent + "<and>");
for (Token token : value) {
ltRule = addCgToken(ltRule,token,thirdIndentInt);
}
ltRule.add(secondIndent + "</and>");
}
}
ltRule.add(firstIndent + "</pattern>");
// REMOVE
if (type.equals("K_REMOVE")) {
ltRule.add(firstIndent + "<disambig action=\"remove\">" + removeTarget(target) + "</disambig>");
}
// SELECT
else if (type.equals("K_SELECT")) {
ltRule.add(firstIndent + filterTarget(target, mark + 1));
}
// MAP
else if (type.equals("K_MAP")) {
ltRule.add(firstIndent + "<disambig action=\"add\" postag=\"" + addRegexp(rule.maplist) + "\" postag_regexp=\"yes\"/>");
}
ltRule.add("</rule>");
return ltRule;
}
/**
* Helper for getRuleByType
*/
public ArrayList<String> addCgToken(ArrayList<String> ltRule, Token token, int indent) {
String postags = postagsToString(token.postags);
String baseforms = glueWords(cleanForms(token.baseforms));
String surfaceforms = glueWords(cleanForms(token.surfaceforms));
CgCompositeTag[] compositeTags = token.compositeTags;
// should never have both composite tags and any of the postags/baseforms/surfaceforms
// also, if there're composite tags, they should have the same postags component, and not both base and surface forms
if (compositeTags.length != 0) {
ArrayList<String> baseformsList = new ArrayList<>();
ArrayList<String> surfaceformsList = new ArrayList<>();
for (CgCompositeTag ctag : compositeTags) {
CgCompositeTag postagCompiled = new CgCompositeTag();
for (CgTag tag : ctag.tags) {
if (isPostag(tag.tag)) {
postagCompiled.addTag(tag);
} else if (isBaseForm(tag.tag)) {
baseformsList.add(tag.tag);
} else if (isSurfaceForm(tag.tag)) {
surfaceformsList.add(tag.tag);
}
}
postags = compositePostagToString(postagCompiled);
}
baseforms = glueWords(cleanForms(baseformsList.toArray(new String[baseformsList.size()])));
surfaceforms = glueWords(cleanForms(surfaceformsList.toArray(new String[surfaceformsList.size()])));
}
boolean careful = token.careful;
boolean negate = token.negate;
String exceptions = token.exceptionString;
int skip = token.skip;
// the special case of the generic token:
if (postags.equals("") && baseforms.equals("") && surfaceforms.equals("")) {
ltRule = addToken(ltRule,baseforms,postags,exceptions,careful,false,negate,skip,indent);
return ltRule;
}
if (!baseforms.equals("")) {
ltRule = addToken(ltRule, baseforms, postags, exceptions, careful, true, negate, skip, indent);
} else if (!surfaceforms.equals("")) {
ltRule = addToken(ltRule, surfaceforms, postags, exceptions, careful, false, negate, skip, indent);
} else {
ltRule = addToken(ltRule, surfaceforms, postags, exceptions, careful, false, negate, skip, indent);
}
return ltRule;
}
/**
* Helper for getRuleByType
*/
public String getBarrierExceptionStringFromToken(Token token) {
boolean not = token.negate;
boolean inflected = true;
String barrierPos = glueWords(expandSetSets(token.barrier).getPostagsString());
if (token.scanahead) {
barrierPos = barrierPos.concat("|" + SENT_END);
}
String barrierToken = glueWords(token.barrier.getSingleTagBaseformsString());
if (barrierToken.isEmpty()) {
barrierToken = glueWords(token.barrier.getSingleTagSurfaceformsString());
inflected = false;
}
String targetPos = glueWords(token.postags);
String targetToken = glueWords(token.baseforms);
if (targetToken.isEmpty()) {
targetToken = glueWords(token.surfaceforms);
if (not) inflected = false;
}
String postagString = "";
String tokenString = "";
String inflectedString = "";
String regexpString = "";
String postagRegexpString = "";
if (not) {
if (!targetPos.isEmpty()) {
postagString = " postag=\"" + targetPos + "\"";
if (isRegex(targetPos)) {
postagRegexpString = " postag_regexp=\"yes\"";
}
}
if (!targetToken.isEmpty()) {
tokenString = " ".concat(targetToken);
if (isRegex(tokenString)) {
regexpString = " regexp=\"yes\"";
}
}
} else {
if (!barrierPos.isEmpty()) {
postagString = " postag=\"" + barrierPos + "\"";
}
if (isRegex(barrierPos)) {
postagRegexpString = " postag_regexp=\"yes\"";
}
if (!barrierToken.isEmpty()) {
tokenString = barrierToken;
if (isRegex(tokenString)) {
regexpString = " postag_regexp=\"yes\"";
}
}
}
if (inflected) {
inflectedString = " inflected=\"yes\"";
}
String retString = "<exception" + postagString + inflectedString + regexpString + postagRegexpString + " scope=\"next\">" + tokenString + "</exception>";
return retString;
}
/**
* Returns the position of the target token, which is always relative to the furthest back token in the rules
*/
public int getPositionOfTarget(Token[] tokens) {
Token firstToken = tokens[0];
return -1 * firstToken.offset;
}
/**
* Helper that takes a normal contextual test (i.e. not a Parent or a Linked test, e.g. (1 Noun))
* and returns the properly filled-out Token object
*/
public Token getTokenFromNormalTest(CgContextualTest test) {
CgSet testTarget = expandSetSets(grammar.getSet(test.target));
boolean testCareful = test.pos.contains(POS.POS_CAREFUL.value);
int testOffset = test.offset;
boolean testScanAhead = test.pos.contains(POS.POS_SCANFIRST.value) && testOffset >= 0;
boolean testScanBehind = test.pos.contains(POS.POS_SCANFIRST.value) && testOffset < 0;
boolean testNot = test.pos.contains(POS.POS_NOT.value);
CgSet testBarrier = grammar.getSet(test.barrier);
CgSet testCBarrier = grammar.getSet(test.cbarrier);
CgSet barrier = null;
boolean cbarrier = false;
if (testBarrier != null && testCBarrier != null) {
System.err.println("Can't have both a barrier and a careful barrier");
System.exit(1);
}
if (testBarrier != null) {
barrier = testBarrier;
cbarrier = false;
} else if (testCBarrier != null) {
barrier = testCBarrier;
cbarrier = true;
} else {
barrier = new CgSet();
cbarrier = false;
}
if (test.line == 548 && test.offset == 1) {
System.out.println();
}
return new Token(testTarget,testCareful,testOffset,testScanAhead,testScanBehind,barrier,cbarrier,0,testNot);
}
/**
* takes a CgSet and, if it contains nested sets, expands them according to the proper
* set operators (set_ops) and returns the new set.
*/
public CgSet expandSetSets(CgSet set) {
CgSet newSet = new CgSet();
newSet.line = set.line;
newSet.type = set.type;
newSet.name = set.name;
if (set.sets.isEmpty()) {
return set;
}
else if (set.sets.size() > 1 && set.set_ops.isEmpty()) {
System.err.println("Error: something wonky with the set on line " + set.line);
System.exit(1);
}
else if (set.set_ops.isEmpty()) {
CgSet expandedSet = expandSetSets(grammar.getSet(set.sets.get(0)));
for (CgCompositeTag ctag : expandedSet.tags) {
newSet.tags.add(ctag);
}
for (CgTag tag : expandedSet.single_tags) {
newSet.single_tags.add(tag);
}
}
else {
for (int op=0;op<set.set_ops.size();op++) {
CgSet expandedSet1 = expandSetSets(grammar.getSet(set.sets.get(op)));
CgSet expandedSet2 = expandSetSets(grammar.getSet(set.sets.get(op+1)));
// Cartesian set product (+)
if (set.set_ops.get(op) == 4) {
for (CgTag tag : expandedSet1.single_tags) {
for (CgTag tag2 : expandedSet2.single_tags) {
if (tag.tag.equals(tag2.tag)) {
newSet.addTag(tag);
} else {
CgCompositeTag ctag = new CgCompositeTag();
ctag.addTag(tag);
ctag.addTag(tag2);
newSet.addCompositeTag(ctag);
}
}
}
for (CgCompositeTag ctag : expandedSet1.tags) {
for (CgTag tag : expandedSet2.single_tags) {
if (ctag.tags.contains(tag)) {
newSet.addCompositeTag(ctag);
} else {
CgCompositeTag ctagnew = new CgCompositeTag();
for (CgTag tag2 : ctag.tags) {
ctagnew.addTag(tag2);
}
ctagnew.addTag(tag);
newSet.addCompositeTag(ctagnew);
}
}
}
for (CgCompositeTag ctag : expandedSet2.tags) {
for (CgTag tag : expandedSet1.single_tags) {
if (ctag.tags.contains(tag)) {
newSet.addCompositeTag(ctag);
} else {
CgCompositeTag ctagnew = new CgCompositeTag();
for (CgTag tag2 : ctag.tags) {
ctagnew.addTag(tag2);
}
ctagnew.addTag(tag);
newSet.addCompositeTag(ctagnew);
}
}
}
for (CgCompositeTag ctag : expandedSet1.tags) {
for (CgCompositeTag ctag2 : expandedSet2.tags) {
CgCompositeTag ctagnew = new CgCompositeTag();
for (CgTag tag : ctag.tags) {
if (!ctagnew.tags.contains(tag)) {
ctagnew.addTag(tag);
}
}
for (CgTag tag : ctag2.tags) {
if (!ctagnew.tags.contains(tag)) {
ctagnew.addTag(tag);
}
}
newSet.addCompositeTag(ctagnew);
}
}
}
// OR or |
else if (set.set_ops.get(op) == 3) {
for (CgCompositeTag ctag : expandedSet1.tags) {
newSet.addCompositeTag(ctag);
}
for (CgCompositeTag ctag : expandedSet2.tags) {
newSet.addCompositeTag(ctag);
}
for (CgTag tag : expandedSet1.single_tags) {
newSet.addTag(tag);
}
for (CgTag tag : expandedSet2.single_tags) {
newSet.addTag(tag);
}
}
}
}
return newSet;
}
@Override
public String generateName(Object ruleObject) {
CgRule rule = (CgRule) ruleObject;
String name = rule.name;
if (name == null) {
name = "rule_" + nameIndex;
nameIndex++;
}
return name;
}
@Override
public String generateId(Object ruleObject) {
CgRule rule = (CgRule) ruleObject;
String name = rule.name;
if (name == null) {
name = "rule_" + idIndex;
idIndex++;
}
return name;
}
@Override
public String getOriginalRuleString(Object ruleObject) {
CgRule rule = (CgRule) ruleObject;
return lines[rule.line];
}
@Override
public String[] getAcceptableFileTypes() {
String[] ft = {"default"};
return ft;
}
/**
* Token class: contains the elements that will be written into <token> elements in the resulting LT rule
* @author mbryant
*
*/
public class Token implements Comparable<Token> {
// target consists of the expanded CgSet, with no nested sets.
public CgSet target;
public String[] postags;
public String[] surfaceforms;
public String[] baseforms;
public CgCompositeTag[] compositeTags;
public boolean careful;
public int offset;
public boolean scanahead;
public boolean scanbehind;
public CgSet barrier;
public boolean cbarrier;
public int skip;
public boolean negate;
public int relativeOffset;
public Token nextToken;
public Token prevToken;
public String exceptionString;
public Token() {
// nothing
}
// copy constructor
public Token(Token another) {
this.target = new CgSet(another.target);
this.postags = target.getPostagsString();
this.surfaceforms = target.getSingleTagSurfaceformsString();
this.baseforms = target.getSingleTagBaseformsString();
this.compositeTags = target.getCompositeTags();
this.careful = another.careful;
this.offset = another.offset;
this.scanahead = another.scanahead;
this.scanbehind = another.scanbehind;
this.barrier = new CgSet(another.barrier);
this.cbarrier = another.cbarrier;
this.skip = another.skip;
this.negate = another.negate;
this.nextToken = another.nextToken;
this.prevToken = another.prevToken;
this.relativeOffset = another.relativeOffset;
this.exceptionString = another.exceptionString;
}
public Token(CgSet target,
boolean careful, int offset, boolean scanahead, boolean scanbehind,
CgSet barrier, boolean cbarrier, int skip, boolean negate) {
this.target = target;
this.postags = target.getPostagsString();
this.surfaceforms = target.getSingleTagSurfaceformsString();
this.baseforms = target.getSingleTagBaseformsString();
this.compositeTags = target.getCompositeTags();
this.careful = careful;
this.offset = offset;
this.scanahead = scanahead;
this.scanbehind = scanbehind;
this.barrier = barrier;
this.cbarrier = cbarrier;
this.skip = skip;
this.negate = negate;
this.nextToken = null;
this.prevToken = null;
this.relativeOffset = 0;
}
@Override
public int compareTo(Token token) {
if (this.offset < token.offset) {
return -1;
} else if (this.offset == token.offset) {
return 0;
} else {
return 1;
}
}
public boolean isEmpty() {
return (this.postags.length == 0) &&
(this.baseforms.length == 0) &&
(this.surfaceforms.length == 0) &&
(this.compositeTags.length == 0);
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
for (String postag : this.postags) {
sb.append(postag + " ");
}
for (String baseform : this.baseforms) {
sb.append(baseform + " ");
}
for (String surfaceform : this.surfaceforms) {
sb.append(surfaceform + " ");
}
for (CgCompositeTag ctag : this.compositeTags) {
sb.append(ctag.toString() + " ");
}
return sb.toString();
}
}
// ** SOME STATIC STRING-CHECKING METHODS **
public static boolean isPostag(String tag) {
return !(tag.matches("\"\\<.*\\>\"r?i?") || tag.matches("\".*\"r?i?"));
}
public static boolean isSurfaceForm(String form) {
return (form.matches("\"\\<.*\\>\"r?i?") || form.matches("\"\"\\<.*\\>\"r?i?\""));
}
public static boolean isBaseForm(String form) {
return (form.matches("\"[^<]*[^>]\"r?i?") || form.matches("\"\"[^<]*[^>]\"r?i?\""));
}
public static boolean isCompositePostag(CgCompositeTag ctag) {
for (CgTag tag : ctag.tags) {
if (isBaseForm(tag.tag) || isSurfaceForm(tag.tag)) {
return false;
}
}
return true;
}
public static boolean sameStrings(String[] s1, String[] s2) {
if (s1.length != s2.length) {
return false;
} else {
for (String ss1 : s1) {
for (String ss2 : s2) {
if (!ss1.equals(ss2)) return false;
}
}
}
return true;
}
// ** METHODS THAT RETURN WRITABLE FORMS FOR CONSTRAINT GRAMMAR TAGS/SETS **
/**
* Removes the quotation marks, angle brackets, and suffixes from surface/base forms
*/
public static String[] cleanForms(String[] words) {
for (int i=0;i<words.length;i++) {
words[i] = words[i].replaceAll(">\"r?i?", "").replaceAll(">\"", "").replaceAll("\"<","").replaceAll("\"r?i?", "");
}
return words;
}
// this is problematic, in that I don't know how to write these regular expressions when we have
// different morphological tag syntax, as Polish or French.
public static String filterRegexp(CgSet target) {
String[] postags = target.getPostagsString();
String postagString = glueWords(postags);
postagString = "(".concat(postagString).concat(")");
String postagRegexp = toStringRegexpFormat(postagString);
StringBuilder sb = new StringBuilder();
sb.append("(?!");
sb.append(postagRegexp);
sb.append(").*");
return sb.toString();
}
public static String filterTarget(CgSet target, int targetNo) {
// <match no=\"" + (mark + 1) + "\" postag=\"" + replaceRegexp(target) + "\" postag_regexp=\"yes\"/>
StringBuilder sb = new StringBuilder();
String[] lemmas = cleanForms(target.getSingleTagBaseformsString());
String[] postags = cleanForms(target.getPostagsString());
CgCompositeTag[] compositeTags = target.getCompositeTags();
String[] surfaceforms = cleanForms(target.getSingleTagSurfaceformsString());
if (lemmas.length > 0 && (compositeTags.length > 0 || postags.length > 0 || surfaceforms.length > 0)) {
System.err.println("Error: something went wrong here.");
}
// assumes there can't be both lemmas and postags
if (lemmas.length > 0) {
sb.append("<disambig action=\"filter\"><match no=\"" + targetNo + "\">" + glueWords(lemmas) + "</match></disambig>");
}
if (postags.length > 0) {
String postagRegexp = "";
if (isRegex(glueWords(postags))) {
postagRegexp = " postag_regexp=\"yes\"";
}
sb.append("<disambig postag=\"" + glueWords(postags) + "\"" + postagRegexp + "/>");
}
return sb.toString();
}
// formats the target for use with disambiguation action="remove" keyword
// assuming they support regular expressions, which they currently don't but kind of have to in order to work
public static String removeTarget(CgSet target) {
StringBuilder sb = new StringBuilder();
sb.append("<wd ");
// these should always be the correct lengths, because if they weren't, they should have been split earlier.
String[] lemmas = target.getSingleTagBaseformsString();
String[] postags = target.getPostagsString();
CgCompositeTag[] compositeTags = target.getCompositeTags();
String[] surfaceforms = target.getSingleTagSurfaceformsString();
if (lemmas.length > 0 && (compositeTags.length > 0 || postags.length > 0 || surfaceforms.length > 0)) {
System.err.println("Error: something went wrong here.");
}
if (lemmas.length > 0) {
sb.append("lemma=" + glueWords(lemmas));
}
if (postags.length > 0) {
sb.append("pos=\"" + glueWords(postags) + "\"");
}
sb.append("/>");
return sb.toString();
}
public static String replaceRegexp(CgSet target) {
String[] postags = target.getPostagsString();
String postagString = glueWords(postags);
postagString = "(".concat(postagString).concat(")");
String postagRegexp = toStringRegexpFormat(postagString);
return postagRegexp;
}
public static String addRegexp(CgSet target) {
String[] postags = target.getSingleTagPostagsString();
if (postags.length != 1) {
System.err.println("Error: trying to map more than one mapping tag on line " + target.line);
System.exit(1);
}
String postag = postags[0];
return postag;
}
// ** LANGUAGE-DEPENDENT METHODS **
public static String postagsToString(String[] postags) {
if (postags.length == 0) {
return "";
}
/*
StringBuilder sb = new StringBuilder();
sb.append("(.*" + tagDelimiter + ")?");
String postagsGlued = glueWords(postags);
sb.append(postagsGlued);
sb.append("(" + tagDelimiter + ".*)?");
*/
// The simplest possible way to do this
StringBuilder sb = new StringBuilder();
for (String pos : postags) {
sb.append(pos + "|");
}
String ret = sb.toString();
return ret.substring(0,ret.length() - 1);
}
public static String toStringRegexpFormat(String t) {
/*
StringBuilder sb = new StringBuilder();
sb.append("^(.*" + tagDelimiter + ")?");
sb.append(t);
sb.append("(" + tagDelimiter + ".*)?$");
return sb.toString();
*/
return t;
}
//TODO: only a stand-in for now; depends on the language-specific multiple-tag string representation
// some complicated regex stuff going on here.
// only should be applied to composite postags. Composite tags with postags + s/b-forms get split in different ways
public static String compositePostagToString(CgCompositeTag ctag) {
/*
StringBuilder sb = new StringBuilder();
String gluedPostag = "";
int noComponents = 0;
for (CgTag tag : ctag.tags) {
if (isPostag(tag.tag)) {
gluedPostag = gluedPostag.concat(tag.tag).concat("|");
noComponents++;
}
}
gluedPostag = "(".concat(gluedPostag.substring(0, gluedPostag.length() - 1)).concat(")");
sb.append("^(.*" + tagDelimiter + ")?");
for (int i=0;i<noComponents;i++) {
sb.append(gluedPostag);
if (i < noComponents - 1) {
sb.append("(" + tagDelimiter + ".*" + tagDelimiter + "?)");
}
}
sb.append("(" + tagDelimiter + ".*)?$");
*/
// The simplest possible way to represent this
StringBuilder sb = new StringBuilder();
for (CgTag tag : ctag.tags) {
if (isPostag(tag.tag)) {
sb.append(tag.tag);
sb.append(tagDelimiter);
}
}
sb.deleteCharAt(sb.length() - 1);
sb.append(".*");
return sb.toString();
}
// language-specific way of representing tags. Relies on a tagDelimiter, which appears to be different in different languages.
// For example, in French, the tags look like: "N f s" (i.e. Noun feminine singular). But in Polish they look like "N:f:s" (same).
// so this syntax takes care of both of those cases (as long as you change the tagDelimiter), wherever in the postag string
// the tag appears.
public static String tagToString(CgTag tag) {
/*
if (tag.tag.equals(sent_end)) {
return sent_end;
}
if (tag.tag.equals(sent_start)) {
return sent_start;
}
StringBuilder sb = new StringBuilder();
String t = tag.tag;
sb.append("^" + t + "$|"); // the tag is the only tag
sb.append("^" + t + tagDelimiter + ".*|"); // the tag is the first tag
sb.append(".*" + tagDelimiter + t + tagDelimiter + ".*|"); // the tag is in the middle somewhere
sb.append(".*" + tagDelimiter + t + "$"); // the tag is the last tag
return sb.toString();
*/
// The simplest possible way to do this
return tag.tag;
}
/**
* Helper to properly put an item to a map where the values are lists
*/
public static <K,V> Map<K,ArrayList<V>> smartPut(Map<K,ArrayList<V>> map, K key, V value) {
if (map.containsKey(key)) {
ArrayList<V> original = map.get(key);
original.add(value);
map.put(key, original);
} else {
ArrayList<V> newcollection = new ArrayList<>();
newcollection.add(value);
map.put(key, newcollection);
}
return map;
}
}