Source Code of org.languagetool.dev.conversion.CgRuleConverter$Token

package org.languagetool.dev.conversion;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.*;

import org.languagetool.dev.conversion.cg.CgCompositeTag;
import org.languagetool.dev.conversion.cg.CgContextualTest;
import org.languagetool.dev.conversion.cg.CgContextualTest.POS;
import org.languagetool.dev.conversion.cg.CgGrammar;
import org.languagetool.dev.conversion.cg.CgRule;
import org.languagetool.dev.conversion.cg.CgSet;
import org.languagetool.dev.conversion.cg.CgTag;
import org.languagetool.dev.conversion.cg.CgTextualParser;

public class CgRuleConverter extends RuleConverter {

  private CgGrammar grammar;
  private String[] lines;
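  // delimiter between the components of a morphological tag; e.g. Polish tags look like "N:f:s",
  // while French uses "N f s", so set this per language via setTagDelimiter() (see tagToString below)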
  private static String tagDelimiter = ":";
 
  // basic constructor
  public CgRuleConverter() {
    super();
  }
 
  public CgRuleConverter(String infile, String outfile, String specificFiletype) {
    super(infile, outfile, specificFiletype);
  }
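
  // Typical usage (hypothetical file names; "default" is the only accepted file type,
  // see getAcceptableFileTypes):
  //   CgRuleConverter converter = new CgRuleConverter("rules.cg", "disambig.xml", "default");
  //   converter.parseRuleFile();  // fills disambiguationRules, allLtRules, originalRuleStrings, warnings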
 
  // Get methods
  public CgGrammar getGrammar() {return this.grammar;}
 
  // Set methods
  public void setGrammar(CgGrammar grammar) {this.grammar = grammar;}
 
  public void setTagDelimiter(String td) {tagDelimiter = td;}
 
  @Override
  public void parseRuleFile() throws IOException {
    parseCgFile();  // builds the grammar
    List<CgRule> ruleList = new ArrayList<>();
    for (CgRule rule : grammar.rule_by_number) {
      ruleList.add(rule);
    }
    ruleObjects = ruleList;
    ltRules = new ArrayList<>();
    allLtRules = new ArrayList<>();
    disambiguationRules = new ArrayList<>();
    originalRuleStrings = new ArrayList<>();
    warnings = new ArrayList<>();
    for (Object ruleObject : ruleObjects) {
      CgRule cgrule = (CgRule) ruleObject;
      List<String> ruleAsList = ltRuleAsList(cgrule,generateId(ruleObject),generateName(ruleObject),cgrule.type.name());
      disambiguationRules.add(ruleAsList);
      allLtRules.add(ruleAsList);
      originalRuleStrings.add(lines[cgrule.line]);
    }
  }
 
 
  @Override
  public boolean isDisambiguationRule(Object ruleObject) {
    return true;  // all cg rules are disambiguation rules
  }
 
  public void parseCgFile() throws IOException {
    File file = new File(inFileName);
    grammar = new CgGrammar();
    CgTextualParser parser = new CgTextualParser(grammar, file);
    int result = parser.parse_grammar_from_file(inFileName, null, null);
    if (result == 0) {
      //System.out.println("Successfully parsed constraint grammar file " + inFileName);
    } else {
      System.err.println("Failed to parse constraint grammar file " + inFileName);
    }
    getGrammarFileLines(inFileName);
  }
 
  /**
   * Grabs the original lines of the CG grammar file. Mainly for retrieving the original rule strings to include in comments.
   * @param filename path of the CG grammar file
   */
  public void getGrammarFileLines(String filename) {
    StringBuilder sb = new StringBuilder();
    // put some buffer space at the beginning, just to be sure
    sb.append("    ");
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "UTF-8"))) {
      int c = reader.read();
      while (c != -1) {
        sb.append((char)c);
        c = reader.read();
      }
    } catch (IOException e) {
      System.err.println("Error opening grammar file");
      System.exit(1);
    }
    this.lines = sb.toString().split("\n");
  }
 
  /**
   * Takes a single {@link CgRule} and converts it into a list of lines of an LT rule.
   * Sometimes the CG rule has to be split into several LT rules; in that case, the
   * resulting rules are grouped together in a rulegroup.
   */
  @Override
  public List<String> ltRuleAsList(Object ruleObject, String id, String name, String type) {
    CgRule rule = (CgRule)ruleObject;
    type = rule.type.name();  // like K_SELECT or K_REMOVE
    List<String> ltRule = new ArrayList<>();
    List<String> currentWarnings = new ArrayList<>();

    String cgRuleString = lines[rule.line];
    ltRule.add("<!-- " + cgRuleString + " -->");
//    ArrayList<CgRule> rules = splitUnificationRule(mainRule, grammar);
//    for (CgRule rule : rules) {
    ArrayList<Token> tokensList = new ArrayList<>();
    List<ArrayList<Token>> outerList = new ArrayList<>();   // in case we need to split the rule into several rules
    ArrayList<Token[]> processedLists = new ArrayList<>();
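    // pipeline: tokensList collects the target and test tokens; outerList gets one
    // copy of tokensList per OR'd branch of a parent test; processedLists holds the
    // final, normalized Token arrays, one per generated LT rule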
   
    CgSet targetSet = expandSetSets(grammar.getSet(rule.target));
    Token target = new Token(targetSet,false,0,false,false,new CgSet(),false,0,false);
    if (!isOrCompatible(target)) {
      System.err.println("Target for rule on line " + rule.line + " cannot be represented as one LT rule. Consider rewriting it.");
      return new ArrayList<>();
    }
    tokensList.add(target);
    List<CgContextualTest> sortedTestsHeads = new ArrayList<>();
    // puts the parent test at the end, so it's processed last
    for (CgContextualTest test : rule.test_heads) {
      if (test.isParentTest()) {
        sortedTestsHeads.add(test);
      } else {
        sortedTestsHeads.add(0, test);
      }
    }
   
    for (CgContextualTest test : sortedTestsHeads) {
      if (test.isNormalTest()) {
        Token testToken = getTokenFromNormalTest(test);
        tokensList.add(testToken);
        // only accounts for a single branching (i.e. one parent test)
      } else if (test.isParentTest()) {
        if (!outerList.isEmpty()) {
          System.err.println("Can't have two parent tests in one test on line " + rule.line + "\nTry splitting it up.");
          System.exit(1);
        }
        for (int testInt : test.ors) {
          ArrayList<Token> newTokenList = copyTokenList(tokensList);
          CgContextualTest childTest = rule.test_map.get(testInt);
          if (childTest.isNormalTest()) {
            Token childTestToken = getTokenFromNormalTest(childTest);
            newTokenList.add(childTestToken);
          } else if (childTest.isLinkedTest()) {
            ArrayList<CgContextualTest> linkedTests = new ArrayList<>();
            CgContextualTest curTest = childTest;
            while (curTest.next != 0) {  // while there are still more tests to link to
              linkedTests.add(curTest);
              curTest = rule.test_map.get(curTest.next);
            }  
            linkedTests.add(curTest);   // add the last linked test
           
            Token headLinkedToken = getLinkedTokens(linkedTests);  // modifies the offsets for the linked tests
            newTokenList.add(headLinkedToken);
          }
         
          outerList.add(newTokenList);
        }
      } else if (test.isLinkedTest()) {
        // add all the linked tests to a list
        ArrayList<CgContextualTest> linkedTests = new ArrayList<>();
        CgContextualTest curTest = test;
        while (curTest.next != 0) {  // while there are still more tests to link to
          linkedTests.add(curTest);
          curTest = rule.test_map.get(curTest.next);
        }  
        linkedTests.add(curTest);   // add the last linked test
       
        Token headLinkedToken = getLinkedTokens(linkedTests);  // modifies the offsets for the linked tests
        tokensList.add(headLinkedToken);
       
      }
    }
    // if the outerList is empty, we haven't had a parent test, so we can just add the tokensList to the outerList and process it
    if (outerList.isEmpty()) { 
      outerList.add(tokensList);
    }
    // pre-process/split all the tests
    // they come off outerList and go back onto processedLists
    // first split off the special case of the negative backward barrier scan
    for (int i=0;i<outerList.size();i++) {
      Token[] tokens = outerList.get(i).toArray(new Token[outerList.get(i).size()]);
      if (negativeBackwardBarrierScan(tokens)) {
        Iterable<List<Token>> split = splitNegativeBackwardBarrierScan(tokens);
        outerList.remove(i);
        for (List<Token> splitList : split) {
          outerList.add(i, new ArrayList<>(splitList));
          i++;
        }
        i--;  // compensate for the loop increment so the list after the inserted ones isn't skipped
      }
    }
   
    for (int i=0;i<outerList.size();i++) {
      Token[] tokens = outerList.get(i).toArray(new Token[outerList.get(i).size()]);
      Arrays.sort(tokens);
      tokens = addGapTokens(tokens);
      if (skipSafe(tokens)) {
        tokens = addSkipTokens(tokens);
        tokens = resolveLinkedTokens(tokens);
        if (!singleRuleCompatible(tokens)) {
          Iterable<List<Token>> singleRuleCompatibleTokens = splitForSingleRule(tokens);
          for (List<Token> srctl : singleRuleCompatibleTokens) {
            Token[] srcta = srctl.toArray(new Token[srctl.size()]);
            processedLists.add(srcta);
          }
        } else {
          processedLists.add(tokens);
        }
      }
      else {
        List<List<Token>> splitTokenLists = getSkipSafeTokens(tokens);
        for (int j=0;j<splitTokenLists.size();j++) {
          Token[] indSplitTokenList = splitTokenLists.get(j).toArray(new Token[splitTokenLists.get(j).size()]);
          indSplitTokenList = addSkipTokens(indSplitTokenList);
          indSplitTokenList = resolveLinkedTokens(indSplitTokenList);
          Arrays.sort(indSplitTokenList);
          indSplitTokenList = addGapTokens(indSplitTokenList);
          if (!singleRuleCompatible(indSplitTokenList)) {
            Iterable<List<Token>> singleRuleCompatibleTokens = splitForSingleRule(indSplitTokenList);
            for (List<Token> srctl : singleRuleCompatibleTokens) {
              Token[] srcta = srctl.toArray(new Token[srctl.size()]);
              processedLists.add(srcta);
            }
          } else {
            processedLists.add(indSplitTokenList);
          }
        }
      }
    }
   
    // the actual rule generation
    if (processedLists.size() == 1) {
      Token[] tokens = processedLists.get(0);
      List<String> ltRule2 = getRuleByType(targetSet, tokens, rule, id, name, type);
      ltRule.addAll(ltRule2);
    } else {
      ltRule.add("<rulegroup name=\"" + generateName(ruleObject) + "\">");
      for (Token[] tokens : processedLists) {
        List<String> ltRule2 = getRuleByType(targetSet, tokens, rule, null, null, type);
        ltRule.addAll(ltRule2);
      }
      ltRule.add("</rulegroup>");
    }
   
//    }
    warnings.add(currentWarnings.toArray(new String[currentWarnings.size()]));
    return ltRule;
  }
 
  // ** METHODS THAT SPLIT A RULE INTO MULTIPLE RULES **
 
  /**
   * For when a single token contains multiple surface/base forms and postags that we can't "or" together
   * in one LT token. E.g. if a token includes ("man" or "woman" or NN or NNP), it has to be split.
   */
  public List<List<Token>> splitForSingleRule(Token[] tokens) {
    List<List<Token>> list = new ArrayList<>();
    List<Token> tokenList = new ArrayList<>(Arrays.asList(tokens));
    list.add(tokenList);
    boolean notdone = true;
    while (notdone) {
      for (int i=0;i<list.size();i++) {
        List<Token> insideList = list.get(i);
        if (singleRuleCompatible(insideList.toArray(new Token[insideList.size()]))) {
          if (i == list.size()-1) notdone = false;
          continue;
        } else {
          list.remove(i);
          Iterable<List<Token>> splitTokens = splitListForSingleRule(insideList);
          for (List<Token> ind : splitTokens) {
            list.add(ind);
          }
          break;
        }
      }
    }
   
    return list;
  }
 
  /**
   * Actually performs the splitting for wrapper method splitForSingleRule. Only performs a split for a single token.
   */
  public List<List<Token>> splitListForSingleRule(List<Token> tokens) {
    ArrayList<List<Token>> list = new ArrayList<>();
    final List<Token> firstList = new ArrayList<>();
    list.add(firstList);
    int i;
    for (i=0;i<tokens.size();i++) {
      if (isOrCompatible(tokens.get(i))) {
        firstList.add(tokens.get(i));
      } else {
        list.remove(firstList);
        Iterable<CgSet> newSets = splitCgSet(tokens.get(i).target);
        for (CgSet set : newSets) {
          Token newToken = new Token(tokens.get(i));
          newToken.target = expandSetSets(set);
          newToken.postags = newToken.target.getPostagsString();
          newToken.baseforms = newToken.target.getSingleTagBaseformsString();
          newToken.surfaceforms = newToken.target.getSingleTagSurfaceformsString();
          newToken.compositeTags = newToken.target.getCompositeTags();
          // clone the first list
          List<Token> newList = new ArrayList<>();
          for (Token token : firstList) {
            newList.add(new Token(token));
          }
          // add the new token
          newList.add(newToken);
          // add the new list
          list.add(newList);
        }
        break;
      }
    }
    // finish up the list
    for (int j=i+1;j<tokens.size();j++) {
      for (int k=0;k<list.size();k++) {
        List<Token> insideList = list.get(k);
        insideList.add(tokens.get(j));
        list.set(k,insideList);
      }
    }
    return list;
  }
 
  /**
   * Splits off part of a CgSet that can't be represented in a single LT token
   */
  public List<CgSet> splitCgSet(CgSet target) {
    // setting up the lists to perform the check
    List<CgSet> newSets = new ArrayList<>();
    CgTag[] postags = target.getSingleTagPostags();
    CgTag[] baseforms = target.getSingleTagBaseforms();
    CgTag[] surfaceforms = target.getSingleTagSurfaceforms();
    CgCompositeTag[] compositePostags = target.getCompositePostags();
   
    // actually checking and doing the splitting
    if (postags.length > 0 && baseforms.length > 0) {
      CgSet set1 = new CgSet(target);
      CgSet set2 = new CgSet(set1);
      set1.single_tags.removeAll(Arrays.asList(postags));
      set1.tags.removeAll(Arrays.asList(compositePostags));
      set2.single_tags.removeAll(Arrays.asList(baseforms));
      newSets.add(set1);
      newSets.add(set2);
      return newSets;
    }
    if (postags.length > 0 && surfaceforms.length > 0) {
      CgSet set1 = new CgSet(target);
      CgSet set2 = new CgSet(target);
      set1.single_tags.removeAll(Arrays.asList(postags));
      set1.tags.removeAll(Arrays.asList(compositePostags));
      set2.single_tags.removeAll(Arrays.asList(surfaceforms));
      newSets.add(set1);
      newSets.add(set2);
      return newSets;
    }
    if (surfaceforms.length > 0 && baseforms.length > 0) {
      CgSet set1 = new CgSet(target);
      CgSet set2 = new CgSet(target);
      set1.single_tags.removeAll(Arrays.asList(surfaceforms));
      set2.single_tags.removeAll(Arrays.asList(baseforms));
      newSets.add(set1);
      newSets.add(set2);
      return newSets;
    }
    // if we didn't catch the culprit in the single tags, it must be in the composite tags,
    // which means there exist two composite tags that contain different types of tags.
    // I could try to do this in a principled way, or I could just split off each composite tag.
    // This seems like the better idea for now.
    newSets = groupCompositeTags(target);
    /*
    for (CgCompositeTag ctag : compositeTags) {
      CgSet set1 = new CgSet(target);
      CgSet set2 = new CgSet(target);
      set1.tags.remove(ctag);
      set2.tags.removeAll(Arrays.asList(compositeTags));
      set2.tags.removeAll(Arrays.asList(compositePostags));
      set2.single_tags = new HashSet<CgTag>();
      set2.tags.add(ctag);
      twoSets.add(set1);
      twoSets.add(set2);
      return twoSets;
    }
    */
    // we should never get here, because this method is only ever passed sets that actually need to be split.
    return newSets;
  }
 
  /**
   * Groups the composite tags along lines that can be represented in a single LT rule
   */
  private List<CgSet> groupCompositeTags(CgSet target) {
    Map<String, ArrayList<CgCompositeTag>> bf = new HashMap<>();
    Map<String, ArrayList<CgCompositeTag>> sf = new HashMap<>();
    Map<String, CgCompositeTag> dict = new HashMap<>();  // dictionary of sorts
    for (CgCompositeTag ctag : target.tags) {
      CgCompositeTag postags = new CgCompositeTag();
      CgCompositeTag baseforms = new CgCompositeTag();
      CgCompositeTag surfaceforms = new CgCompositeTag();
      for (CgTag tag : ctag.tags) {
        if (isBaseForm(tag.tag)) {
          baseforms.addTag(tag);
        } else if (isSurfaceForm(tag.tag)) {
          surfaceforms.addTag(tag);
        } else if (isPostag(tag.tag)) {
          postags.addTag(tag);
        }
      }
      if (!postags.isEmpty()) {
        if (!baseforms.isEmpty()) {
          smartPut(bf, postags.toString(), baseforms);
        }
        else if (!surfaceforms.isEmpty()) {  // assumes there won't be both sf and bf in the same composite tag
          // note: the surface-form groups are collected here but never read below
          smartPut(sf, postags.toString(), surfaceforms);
        }
      }
      dict.put(postags.toString(), postags);
    }
    List<CgSet> ret = new ArrayList<>();
    for (String postagSet : bf.keySet()) {
      CgSet newSet = new CgSet(target);
      newSet.tags = new HashSet<>();
      Iterable<CgCompositeTag> bfs = bf.get(postagSet);
      for (CgCompositeTag singleBf : bfs) {
        CgCompositeTag newTotalTag = new CgCompositeTag();
        for (CgTag tag : dict.get(postagSet).tags) {
          newTotalTag.addTag(tag);
        }
        for (CgTag tag : singleBf.tags) {
          newTotalTag.addTag(tag);
        }
        newSet.addCompositeTag(newTotalTag);
      }
      ret.add(newSet);
    }
    return ret;
  }
 
  /**
   * Returns separate Token arrays, each of which is safe for dealing with scanning tokens
   * and each of which will become a separate rule. This applies when we have a scanning token
   * before a non-scanning token, offset-wise.
   */
  public List<List<Token>> getSkipSafeTokens(Token[] tokens) {
    List<List<Token>> list = new ArrayList<>();
    List<Token> tokenList = Arrays.asList(tokens);
    list.add(tokenList);
    boolean notdone = true;
    while (notdone) {
      for (int i=0;i<list.size();i++) {
        List<Token> insideList = list.get(i);
        if (skipSafe(insideList.toArray(new Token[insideList.size()]))) {
          if (i == list.size()-1) {
            notdone = false;
          }
        } else {
          list.remove(i);
          Iterable<List<Token>> splitTokens = splitOutSkipTokens(insideList);
          for (List<Token> isl : splitTokens) {
            list.add(isl);
          }
          break;
        }
      }
     
    }
    return list;
  }
 
  /**
   * Actually does the splitting for wrapper method getSkipSafeTokens
   */
  public List<List<Token>> splitOutSkipTokens(List<Token> tokens) {
    ArrayList<List<Token>> list = new ArrayList<>();
    ArrayList<Token> scanningTokens = new ArrayList<>();
    ArrayList<Token> reverseScanningTokens = new ArrayList<>();
    ArrayList<Token> normalTokens = new ArrayList<>();
    for (Token token : tokens) {
      if (token.scanahead) scanningTokens.add(token);
      else if (token.scanbehind) reverseScanningTokens.add(token);
      else normalTokens.add(token);
    }
    // forward scans
    for (int s=0;s<scanningTokens.size();s++) {
      final Token scanning = scanningTokens.get(s);
      for (int n=0;n<normalTokens.size();n++) {
        final Token normal = normalTokens.get(n);
        if (normal.offset >= scanning.offset) {
          List<Token> newTokenList1 = new ArrayList<>();
          List<Token> newTokenList2 = new ArrayList<>();
          for (Token ntoken : normalTokens) {
            newTokenList1.add(ntoken);
            newTokenList2.add(ntoken);
          }
          Token newNormalToken = new Token(scanning);
          newNormalToken.scanahead = false;
          newTokenList1.add(newNormalToken);
          scanning.offset++;
          newTokenList2.add(scanning);
          list.add(newTokenList1);
          list.add(newTokenList2);
          return list;
        }
      }
    }
    // backward scans
    for (int s=0;s<reverseScanningTokens.size();s++) {
      final Token scanning = reverseScanningTokens.get(s);
      for (int n=0;n<normalTokens.size();n++) {
        final Token normal = normalTokens.get(n);
        if (normal.offset <= scanning.offset) {
          List<Token> newTokenList1 = new ArrayList<>();
          List<Token> newTokenList2 = new ArrayList<>();
          for (Token ntoken : normalTokens) {
            newTokenList1.add(ntoken);
            newTokenList2.add(ntoken);
          }
          Token newNormalToken = new Token(scanning);
          newNormalToken.scanbehind = false;
          newTokenList1.add(newNormalToken);
          scanning.offset--;
          newTokenList2.add(scanning);
          list.add(newTokenList1);
          list.add(newTokenList2);
          return list;
        }
      }
    }
   
    // no conflicting scan/normal pair was found; callers (getSkipSafeTokens) only invoke
    // this when skipSafe() returned false, so this point should be unreachable
    return null;
  }
 
  /**
   * Handles special case of negative backwards barrier scan (e.g. (NOT -1* Verb BARRIER CLB));
   */
  public List<List<Token>> splitNegativeBackwardBarrierScan(Token[] tokens) {
    ArrayList<List<Token>> list = new ArrayList<>();
    List<Token> newTokenList1 = new ArrayList<>();
    List<Token> newTokenList2 = new ArrayList<>();
    int index=0;
    for (index = 0;index<tokens.length;index++) {
      if (tokens[index].scanbehind && tokens[index].negate && !tokens[index].barrier.isEmpty()) {
        Token newToken = new Token(tokens[index]);
        newToken.barrier = new CgSet();
        newTokenList1.add(tokens[index]);
        newTokenList2.add(newToken);
        break;
      } else {
        newTokenList1.add(tokens[index]);
        newTokenList2.add(tokens[index]);
      }
    }
    // finish off the rest
    for (index=index+1;index<tokens.length;index++) {
      newTokenList1.add(tokens[index]);
      newTokenList2.add(tokens[index]);
    }
    list.add(newTokenList1);
    list.add(newTokenList2);
    return list;
  }
 
//  // takes a rule that has unification tags (e.g. $$NUMBER) and splits it into several easier to handle rules
//  public ArrayList<CgRule> splitUnificationRule(CgRule rule, CgGrammar grammar) {
//    ArrayList<CgRule> rules = new ArrayList<CgRule>();
//    // go over all the tests in the test_map, which should contain all tests for the entire rule, and every time you see a
//    // unification tag, put in one of its component parts and add that modified rule to the list of new rules.
//    HashSet<CgSet> unifyingSets = new HashSet<CgSet>();
//    // add all the unifying sets in the rule
//    for (Iterator<Integer> iter = rule.test_map.keySet().iterator(); iter.hasNext(); ) {
//      CgContextualTest curTest = rule.test_map.get(iter.next());
//      CgSet target = grammar.getSet(curTest.target);
//      for (Integer setint : target.sets) {
//        CgSet set = grammar.getSet(setint);
//        if (set.type.contains(ST.ST_TAG_UNIFY.value)) {
//          unifyingSets.add(set);
//        }
//      }
//    }
//    CgSet targetSet = grammar.getSet(rule.target);
//    for (int setint : targetSet.sets) {
//      if (grammar.getSet(setint).type.contains(ST.ST_TAG_UNIFY.value)) {
//        unifyingSets.add(grammar.getSet(setint));
//      }
//    }
//    // if no unifying sets, just return the rule
//    if (unifyingSets.size() == 0) {
//      rules.add(rule);
//    }
//    for (CgSet unifyingSet : unifyingSets) {
//      CgSet unifyingSetExpanded = expandSetSets(unifyingSet);
//      for (CgTag tag : unifyingSetExpanded.single_tags) {
//        CgSet oldTargetSet = new CgSet(grammar.getSet(rule.target));
//        CgRule newRule = new CgRule(rule);
//        if (oldTargetSet.sets.contains(unifyingSet.hash)) {
//          oldTargetSet.sets.remove((Object)unifyingSet.hash);
//          oldTargetSet.single_tags.add(tag);
//          oldTargetSet.rehash();
//          grammar.addSet(oldTargetSet);
//          newRule.target = oldTargetSet.hash;
//        }
//       
//        for (Iterator<Integer> iter = newRule.test_map.keySet().iterator(); iter.hasNext();) {
//          int testKey = iter.next();
//          CgContextualTest test = newRule.test_map.get(testKey);
//          CgSet oldTestTargetSet = new CgSet(grammar.getSet(test.target));
//          if (oldTestTargetSet.sets.contains(unifyingSet.hash)) {
//            oldTestTargetSet.sets.remove(unifyingSet);
//            oldTestTargetSet.single_tags.add(tag);
//            oldTestTargetSet.rehash();
//            grammar.addSet(oldTestTargetSet);
//            test.target = oldTestTargetSet.hash;
//          }
//          newRule.test_map.put(testKey, test);
//        }
//        rules.add(newRule);
//      }
//    }
//    return rules;
//  }
 
 
  // ** METHODS THAT MODIFY A SINGLE TOKEN LIST **
 
  /**
   * Expands the tests in a linked test to multiple tokens
   */
  public Token[] resolveLinkedTokens(Token[] tokens) {
    List<Token> tokenList = new ArrayList<>(Arrays.asList(tokens));
    boolean notdone = true;
    while (notdone) {
      for (int i=0;i<tokenList.size();i++) {
        Token curToken = tokenList.get(i);
        if (curToken.nextToken != null) {
          Token tempToken = new Token(curToken.nextToken);
          tempToken.offset = curToken.offset + tempToken.relativeOffset;  // to fix the offsets
          tokenList.add(i+1, tempToken);
          Token temp2 = new Token(curToken);
          temp2.nextToken = null;
          tokenList.set(i, temp2);
          break;
        } else {
          if (i == tokenList.size()-1) notdone = false;
        }
      }
    }
    return tokenList.toArray(new Token[tokenList.size()]);
  }
 
  /**
   * Translates a list of linked CgContextualTests into Token format.
   * Returns only the head of the linked tokens; the rest are reached through nextToken when the tokens are added later.
   */
  public Token getLinkedTokens(ArrayList<CgContextualTest> tests) {
    ArrayList<Token> tokens = new ArrayList<>();
    for (int i=0;i<tests.size();i++) {
      if (i == 0) {
        // this kind of assumes that it won't be a parent test
        tokens.add(getTokenFromNormalTest(tests.get(i)));
      } else {
        Token token = getTokenFromNormalTest(tests.get(i));
        token.relativeOffset = token.offset;
        token.offset = token.offset + tokens.get(i-1).offset;
        token.prevToken = tokens.get(i-1);
        tokens.add(token);
      }
    }
    // sort tokens by offset and rearrange them
    Token[] ts = tokens.toArray(new Token[tokens.size()]);
    Arrays.sort(ts);
    // this assumes there's only one scan in the linked tests.
    // the rationale: if sorting switched the positions of tokens and pushed the scan
    // token further back in the list, its scan flag has to go back to the front token.
    for (int i=0;i<ts.length;i++) {
      if (ts[i].scanahead) {
        ts[i].scanahead = false;
        ts[0].scanahead = true;
        break;
      }
    }
    // check to make sure there's not another scan in the linked tests:
    for (int i=1;i<ts.length;i++) {
      if (ts[i].scanahead) {
        System.err.println("Two scan tests in one series of linked tests. This is really hard to represent in LT format. Try to split it into several rules");
        System.exit(1);
      }
    }
    ts = addLinkedGapTokens(ts);
    for (int i=0;i<ts.length;i++) {
       if (i == 0) {
         ts[i].relativeOffset = 0;
       } else {
         ts[i].relativeOffset = 1;
       }
       if (i != ts.length - 1) ts[i].nextToken = ts[i+1];
       if (i != 0) ts[i].prevToken = ts[i-1];
    }
    return ts[0];
  }
 
  /**
   * Adds gap tokens in the case of a linked (sub) list of tokens
   */
  public Token[] addLinkedGapTokens(Token[] ts) {
    ArrayList<Token> tokens = new ArrayList<>(Arrays.asList(ts));
    boolean notdone = true;
    while (notdone) {
      for (int i=0;i<tokens.size();i++) {
        if (i==0) continue;
        else if (i > 0 && i < tokens.size()) {
          if (tokens.get(i).offset == tokens.get(i-1).offset || tokens.get(i).offset == (tokens.get(i-1).offset + 1)) {
            if (i == tokens.size() - 1) notdone = false;
            continue;
          }
          else {
            Token newToken = new Token(new CgSet(), false, tokens.get(i-1).offset + 1, false, false, new CgSet(), false, 0, false);
            newToken.relativeOffset = tokens.get(i-1).relativeOffset + 1;
            Token oldToken = tokens.get(i-1);
            oldToken.relativeOffset = -1;
            tokens.set(i-1, oldToken);
            tokens.add(i,newToken);
            break;
          }
        }
        if (i == tokens.size() - 1) {
          notdone = false;
        }
      }
    }
    return tokens.toArray(new Token[tokens.size()]);
  }
 
  /**
   * Only gets called if it's safe to add skip tokens in a straightforward manner; otherwise, we split them out.
   * If it's a negative scanning string (NOT 1* Verb BARRIER CLB), the exception string is the target;
   * if it's a positive scanning string (1* Verb BARRIER CLB), the exception string is the barrier plus the end of sentence.
   * The exception always goes in the previous token with scope "next".
   */
  public Token[] addSkipTokens(Token[] tokens) {
    ArrayList<Token> tokenList = new ArrayList<>(Arrays.asList(tokens));
    for (int i=0;i<tokenList.size();i++) {
      // forward scans (1* Verb)
      if (tokenList.get(i).scanahead) {
        if (i == 0) {
          Token newToken = new Token(new CgSet(), false, tokenList.get(i).offset - 1, false, false, new CgSet(), false, -1, false);
          if (!tokenList.get(i).barrier.isEmpty() || tokenList.get(i).negate) {
            newToken.exceptionString = getBarrierExceptionStringFromToken(tokenList.get(i));
          }
          Token oldToken = tokenList.get(i);
          // if it's a negative scan (NOT 1* Noun), then the target of the next token becomes the barrier + SENT_END
          if (oldToken.negate) { 
            CgSet newTarget = oldToken.barrier;
            CgTag sentEndTag = new CgTag();
            sentEndTag.tag = SENT_END;
            newTarget.single_tags.add(sentEndTag);
            oldToken.target = newTarget;
            oldToken.postags = oldToken.target.getPostagsString();
            oldToken.baseforms = oldToken.target.getSingleTagBaseformsString();
            oldToken.surfaceforms = oldToken.target.getSingleTagSurfaceformsString();
            oldToken.compositeTags = oldToken.target.getCompositeTags();
            tokenList.set(0, oldToken);
          }
          tokenList.add(0, newToken);
        } else {
          int index = i-1;
          String exceptionString = null;
          if (!tokenList.get(i).barrier.isEmpty() || tokenList.get(i).negate) {
            exceptionString = getBarrierExceptionStringFromToken(tokenList.get(i));
          }
         
          int prevOffset = tokenList.get(index).offset;
          while (index >= 0 && tokenList.get(index).offset == prevOffset) {
            Token prevToken = tokenList.get(index);
            prevToken.skip = -1;
            prevToken.exceptionString = exceptionString;
            tokenList.set(index, prevToken);
            index--;
          }
          Token oldToken = tokenList.get(i);
          if (oldToken.negate) {
            CgSet newTarget = oldToken.barrier;
            CgTag sentEndTag = new CgTag();
            sentEndTag.tag = SENT_END;
            newTarget.single_tags.add(sentEndTag);
            oldToken.target = newTarget;
            oldToken.postags = oldToken.target.getPostagsString();
            oldToken.baseforms = oldToken.target.getSingleTagBaseformsString();
            oldToken.surfaceforms = oldToken.target.getSingleTagSurfaceformsString();
            oldToken.compositeTags = oldToken.target.getCompositeTags();
            oldToken.negate = false;
            tokenList.set(i,oldToken);
          }
        }
      }
      // reverse scans (-1* Verb)
      else if (tokenList.get(i).scanbehind) {
        Token newToken = new Token(new CgSet(), false, tokenList.get(i).offset - 1, false, false, new CgSet(), false, -1, false);
        String exceptionString = null;
        if (!tokenList.get(i).barrier.isEmpty() || tokenList.get(i).negate) {
          exceptionString = getBarrierExceptionStringFromToken(tokenList.get(i));
        }
        CgSet newTarget = newToken.target;
        CgTag sentStartTag = new CgTag();
        sentStartTag.tag = SENT_START;
        newTarget.single_tags.add(sentStartTag);
        newToken.target = newTarget;
        newToken.postags = newToken.target.getPostagsString();
        newToken.baseforms = newToken.target.getSingleTagBaseformsString();
        newToken.surfaceforms = newToken.target.getSingleTagSurfaceformsString();
        newToken.compositeTags = newToken.target.getCompositeTags();
        newToken.skip = -1;
        Token oldToken = tokenList.get(i);
        oldToken.skip = -1;
        oldToken.scanbehind = false;
        // if there's no barrier
        if (oldToken.barrier.isEmpty()) {
          if (oldToken.negate) {
            newToken.exceptionString = exceptionString;
            newToken.offset++;
            tokenList.set(i, newToken);
          } else {
            tokenList.set(i, oldToken);
            tokenList.add(i, newToken);
            i++;
          }
        }
        // if there IS a barrier
        else {
          if (oldToken.negate) {
            oldToken.target = oldToken.barrier;
            oldToken.postags = oldToken.target.getPostagsString();
            oldToken.baseforms = oldToken.target.getSingleTagBaseformsString();
            oldToken.surfaceforms = oldToken.target.getSingleTagSurfaceformsString();
            oldToken.compositeTags = oldToken.target.getCompositeTags();
            oldToken.exceptionString = exceptionString;
            tokenList.set(i,oldToken);
            tokenList.add(i,newToken);
          } else {
            oldToken.exceptionString = exceptionString;
            tokenList.set(i, oldToken);
            tokenList.add(i, newToken);
            i++;
          }
        }
      }
    }
    return tokenList.toArray(new Token[tokenList.size()]);
  }
 
  /**
   * For cases where there needs to be an empty token inserted in order to make a proper LT pattern
   * e.g. REMOVE N IF (1 Verb) (3 Det);
   */
  public Token[] addGapTokens(Token[] tokens) {
    boolean notdone = true;
    ArrayList<Token> tokenList = new ArrayList<>(Arrays.asList(tokens));
    while (notdone) {
      for (int i=0;i<tokenList.size();i++) {
        if (i == 0) continue;
        else if (i > 0 && i < tokenList.size()){
          if (tokenList.get(i).offset == tokenList.get(i-1).offset) {
            if (i == tokenList.size()-1) notdone = false;
            continue;
          }
          if (tokenList.get(i).offset != (tokenList.get(i-1).offset + 1) && tokenList.get(i).prevToken == null) {
            tokenList.add(i, new Token(new CgSet(), false, tokenList.get(i-1).offset + 1, false, false, new CgSet(), false, 0, false));
            break;
          }
        }
        if (i == tokenList.size() - 1) {
          notdone = false;
        }
      }
    }
    return tokenList.toArray(new Token[tokenList.size()]);
  }
 
  public ArrayList<Token> copyTokenList(Iterable<Token> tokens) {
    ArrayList<Token> newList = new ArrayList<>();
    for (Token token : tokens) {
      newList.add(new Token(token));
    }
    return newList;
  }
 
  public ArrayList<Token> removeExtraEmptyTokens(ArrayList<Token> tokens) {
    if (tokens.size() == 1) {
      return tokens;
    } else {
      ArrayList<Token> newTokenList = new ArrayList<>();
      for (Token token : tokens) {
        if (!token.isEmpty()) {
          newTokenList.add(token);
        }
      }
      if (newTokenList.isEmpty()) {
        newTokenList.add(tokens.get(0));
      }
      return newTokenList;
    }
  }
 
 
  // ** METHODS THAT CHECK THE PROPERTIES OF A RULE **
 
  /**
   * Returns true if it is safe to add the "skip" attribute to the previous token, as we do normally;
   * otherwise the caller splits the tokens into multiple rules.
   */
  public boolean skipSafe(Token[] tokens) {
    Collection<Token> scanningTokens = new HashSet<>();
    Collection<Token> reverseScanningTokens = new HashSet<>();
    Collection<Token> normalTokens = new HashSet<>();
    for (Token token : tokens) {
      if (token.scanahead) {
        scanningTokens.add(token);
      } else if (token.scanbehind) {
        reverseScanningTokens.add(token);
      } else {
        normalTokens.add(token);
      }
    }
    for (Token s : scanningTokens) {
      for (Token o : normalTokens) {
        if (s.offset <= o.offset) return false;
      }
    }
    for (Token s : reverseScanningTokens) {
      for (Token o : normalTokens) {
        if (s.offset >= o.offset) return false;
      }
    }
    return true;
  }
 
  /**
   * Returns true if the rule contains a negative backward barrier scan, e.g. (NOT -1* Adj BARRIER Noun);
   * these rules have to be treated as special cases.
   */
  public boolean negativeBackwardBarrierScan(Token[] tokens) {
    for (Token token : tokens) {
      if (token.scanbehind && token.negate && !token.barrier.isEmpty()) {
        return true;
      }
    }
    return false;
  }
 
  public boolean singleRuleCompatible(Token[] tokens) {
    for (Token token : tokens) {
      if (!isOrCompatible(token)) return false;
    }
    return true;
  }
 
  /**
   * Returns true if the {@link Token} can be represented in a single LT rule, false otherwise.
   * Example: "are"|noun cannot be represented as one LT token
   */
  public boolean isOrCompatible(Token token) {
    if (token.postags.length > 0 && (token.baseforms.length > 0 || token.surfaceforms.length > 0)) {
      return false;
    }
    if (token.baseforms.length > 0 && token.surfaceforms.length > 0) {
      return false;
    }
    if (token.compositeTags.length > 0 && (token.postags.length > 0 || token.baseforms.length > 0 || token.surfaceforms.length > 0)) {
      return false;
    }
    Collection<String> pos = new HashSet<>();
    Collection<String> base = new HashSet<>();
    Collection<String> surf = new HashSet<>();
    for (CgCompositeTag ctag : token.compositeTags) {
      CgCompositeTag postagCompile = new CgCompositeTag();
      for (CgTag tag : ctag.tags) {
        if (isPostag(tag.tag)) {
          postagCompile.addTag(tag);
        } else if (isSurfaceForm(tag.tag)) {
          surf.add(tag.tag);
        } else if (isBaseForm(tag.tag)) {
          base.add(tag.tag);
        }
      }
      pos.add(postagCompile.toString());
    }
    if (pos.size() > 1 && (surf.size() > 1 || base.size() > 1)) {
      return false;
    }
    if (surf.size() > 1 && base.size() > 1) {
      return false;
    }
    if (surf.size() > 0 && base.size() > 0 && pos.size() > 0) {
      return false;
    }
    return true;
  }

 
 
 
  // ** METHODS THAT ACTUALLY GENERATE THE LT RULES **
 
  /**
   * Actual LT rule generation
   */
  public List<String> getRuleByType(CgSet target, Token[] tokens, CgRule rule, String id, String name, String type) {
    ArrayList<String> ltRule = new ArrayList<>();
    TreeMap<Integer,ArrayList<Token>> tokenmap = new TreeMap<>();
    for (Token token : tokens) {
      smartPut(tokenmap, token.offset, token);
    }
   
    if (name != null && id != null) {
      ltRule.add("<rule id=\"" + id + "\" name=\"" + name + "\">");
    } else {
      ltRule.add("<rule>");
    }
   
    int mark = getPositionOfTarget(tokens);
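    // mark = position of the target token (offset 0) counted from the first token in the pattern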
    ltRule.add(firstIndent + "<pattern mark=\"" + mark + "\">");
    for (Map.Entry<Integer, ArrayList<Token>> entry : tokenmap.entrySet()) {
      ArrayList<Token> value = entry.getValue();
      // remove duplicates, so we don't have unnecessary "and"s floating around
      value = removeExtraEmptyTokens(value);
      if (value.size() == 1) {
        Token token = value.get(0);
        ltRule = addCgToken(ltRule,token,secondIndentInt);
      }
      // if the number of tokens at the given offset is more than 1, we have to and them together
      else {
        ltRule.add(secondIndent + "<and>");
       
        for (Token token : value) {
          ltRule = addCgToken(ltRule,token,thirdIndentInt);
        }
        ltRule.add(secondIndent + "</and>");
      }
     
    }
    ltRule.add(firstIndent + "</pattern>");
    // REMOVE
    if (type.equals("K_REMOVE")) {
      ltRule.add(firstIndent + "<disambig action=\"remove\">" + removeTarget(target) + "</disambig>");
    }
    // SELECT
    else if (type.equals("K_SELECT")) {
      ltRule.add(firstIndent + filterTarget(target, mark + 1));
    }
    // MAP
    else if (type.equals("K_MAP")) {
      ltRule.add(firstIndent + "<disambig action=\"add\" postag=\"" + addRegexp(rule.maplist) + "\" postag_regexp=\"yes\"/>");
    }
    ltRule.add("</rule>");
    return ltRule;
  }
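
  // Shape of the generated XML (a hypothetical sketch, e.g. for "REMOVE N IF (1 Verb)"):
  //   <rule id="..." name="...">
  //     <pattern mark="0">
  //       <token postag="N"/>
  //       <token postag="Verb"/>
  //     </pattern>
  //     <disambig action="remove"><wd pos="N"/></disambig>
  //   </rule>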
 
  /**
   * Helper for getRuleByType
   */
  public ArrayList<String> addCgToken(ArrayList<String> ltRule, Token token, int indent) {
    String postags = postagsToString(token.postags);
    String baseforms = glueWords(cleanForms(token.baseforms));
    String surfaceforms = glueWords(cleanForms(token.surfaceforms));
    CgCompositeTag[] compositeTags = token.compositeTags;
    // should never have both composite tags and any of the postags/baseforms/surfaceforms;
    // also, if there are composite tags, they should all share the same postag component, and not have both base and surface forms
    if (compositeTags.length != 0) {
      ArrayList<String> baseformsList = new ArrayList<>();
      ArrayList<String> surfaceformsList = new ArrayList<>();
      for (CgCompositeTag ctag : compositeTags) {
        CgCompositeTag postagCompiled = new CgCompositeTag();
        for (CgTag tag : ctag.tags) {
          if (isPostag(tag.tag)) {
            postagCompiled.addTag(tag);
          } else if (isBaseForm(tag.tag)) {
            baseformsList.add(tag.tag);
          } else if (isSurfaceForm(tag.tag)) {
            surfaceformsList.add(tag.tag);
          }
        }
        postags = compositePostagToString(postagCompiled);
      }
      baseforms = glueWords(cleanForms(baseformsList.toArray(new String[baseformsList.size()])));
      surfaceforms = glueWords(cleanForms(surfaceformsList.toArray(new String[surfaceformsList.size()])));
    }
   
    boolean careful = token.careful;
    boolean negate = token.negate;
    String exceptions = token.exceptionString;
    int skip = token.skip;
   
    // the special case of the generic token:
    if (postags.equals("") && baseforms.equals("") && surfaceforms.equals("")) {
      ltRule = addToken(ltRule,baseforms,postags,exceptions,careful,false,negate,skip,indent);
      return ltRule;
    }
   
    if (!baseforms.equals("")) {
      ltRule = addToken(ltRule, baseforms, postags, exceptions, careful, true, negate, skip, indent);
    } else if (!surfaceforms.equals("")) {
      ltRule = addToken(ltRule, surfaceforms, postags, exceptions, careful, false, negate, skip, indent);
    } else {
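      // at this point only postags are present (baseforms and surfaceforms are both empty)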
      ltRule = addToken(ltRule, surfaceforms, postags, exceptions, careful, false, negate, skip, indent);
    }
    return ltRule;
  }
 
  /**
   * Builds the barrier/target exception string for a scanning token; used by addSkipTokens
   */
  public String getBarrierExceptionStringFromToken(Token token) {
    boolean not = token.negate;
    boolean inflected = true;
    String barrierPos = glueWords(expandSetSets(token.barrier).getPostagsString());
   
    if (token.scanahead) {
      barrierPos = barrierPos.concat("|" + SENT_END);
    }
    String barrierToken = glueWords(token.barrier.getSingleTagBaseformsString());
    if (barrierToken.isEmpty()) {
      barrierToken = glueWords(token.barrier.getSingleTagSurfaceformsString());
      inflected = false;
    }
    String targetPos = glueWords(token.postags);
    String targetToken = glueWords(token.baseforms);
    if (targetToken.isEmpty()) {
      targetToken = glueWords(token.surfaceforms);
      if (not) inflected = false;
    }
    String postagString = "";
    String tokenString = "";
    String inflectedString = "";
    String regexpString = "";
    String postagRegexpString = "";
    if (not) {
      if (!targetPos.isEmpty()) {
        postagString = " postag=\"" + targetPos + "\"";
        if (isRegex(targetPos)) {
          postagRegexpString = " postag_regexp=\"yes\"";
        }
      }
      if (!targetToken.isEmpty()) {
        tokenString = " ".concat(targetToken);
        if (isRegex(tokenString)) {
          regexpString = " regexp=\"yes\"";
        }
      }
    } else {
      if (!barrierPos.isEmpty()) {
        postagString = " postag=\"" + barrierPos + "\"";
      }
      if (isRegex(barrierPos)) {
        postagRegexpString = " postag_regexp=\"yes\"";
      }
      if (!barrierToken.isEmpty()) {
        tokenString = barrierToken;
        if (isRegex(tokenString)) {
          regexpString = " regexp=\"yes\"";
        }
      }
    }
    if (inflected) {
      inflectedString = " inflected=\"yes\"";
    }
    String retString = "<exception" + postagString + inflectedString + regexpString + postagRegexpString + " scope=\"next\">" + tokenString + "</exception>";
    return retString;
  }
 
  /**
   * Returns the position of the target token, which is always relative to the furthest-back token in the rule
   */
  public int getPositionOfTarget(Token[] tokens) {
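    // e.g. with sorted offsets [-2, -1, 0, 1], the first token is at -2, so the target (offset 0) is at position 2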
    Token firstToken = tokens[0];
    return -1 * firstToken.offset;
  }
 
  /**
   * Helper that takes a normal contextual test (i.e. not a Parent or a Linked test, e.g. (1 Noun))
   * and returns the properly filled-out Token object
   */
  public Token getTokenFromNormalTest(CgContextualTest test) {
    CgSet testTarget = expandSetSets(grammar.getSet(test.target));
    boolean testCareful = test.pos.contains(POS.POS_CAREFUL.value);
    int testOffset = test.offset;
    boolean testScanAhead = test.pos.contains(POS.POS_SCANFIRST.value) && testOffset >= 0;
    boolean testScanBehind = test.pos.contains(POS.POS_SCANFIRST.value) && testOffset < 0;
    boolean testNot = test.pos.contains(POS.POS_NOT.value);
    CgSet testBarrier = grammar.getSet(test.barrier);
    CgSet testCBarrier = grammar.getSet(test.cbarrier);
    CgSet barrier = null;
    boolean cbarrier = false;
    if (testBarrier != null && testCBarrier != null) {
      System.err.println("Can't have both a barrier and a careful barrier");
      System.exit(1);
    }
    if (testBarrier != null) {
      barrier = testBarrier;
      cbarrier = false;
    } else if (testCBarrier != null) {
      barrier = testCBarrier;
      cbarrier = true;
    } else {
      barrier = new CgSet();
      cbarrier = false;
    }
    return new Token(testTarget,testCareful,testOffset,testScanAhead,testScanBehind,barrier,cbarrier,0,testNot);
  }
 
  /**
   * takes a CgSet and, if it contains nested sets, expands them according to the proper
   * set operators (set_ops) and returns the new set.
   */
  public CgSet expandSetSets(CgSet set) {
    CgSet newSet = new CgSet();
    newSet.line = set.line;
    newSet.type = set.type;
    newSet.name = set.name;
    if (set.sets.isEmpty()) {
      return set;
    }
    else if (set.sets.size() > 1 && set.set_ops.isEmpty()) {
      System.err.println("Error: something wonky with the set on line " + set.line);
      System.exit(1);
    }
    else if (set.set_ops.isEmpty()) {
      CgSet expandedSet = expandSetSets(grammar.getSet(set.sets.get(0)));
      for (CgCompositeTag ctag : expandedSet.tags) {
        newSet.tags.add(ctag);
      }
      for (CgTag tag : expandedSet.single_tags) {
        newSet.single_tags.add(tag);
      }
    }
    else {
      for (int op=0;op<set.set_ops.size();op++) {
        CgSet expandedSet1 = expandSetSets(grammar.getSet(set.sets.get(op)));
        CgSet expandedSet2 = expandSetSets(grammar.getSet(set.sets.get(op+1)));
        // Cartesian set product (+)
        if (set.set_ops.get(op) == 4) {
          for (CgTag tag : expandedSet1.single_tags) {
            for (CgTag tag2 : expandedSet2.single_tags) {
              if (tag.tag.equals(tag2.tag)) {
                newSet.addTag(tag);
              } else {
                CgCompositeTag ctag = new CgCompositeTag();
                ctag.addTag(tag);
                ctag.addTag(tag2);
                newSet.addCompositeTag(ctag);
              }
            }
          }
          for (CgCompositeTag ctag : expandedSet1.tags) {
            for (CgTag tag : expandedSet2.single_tags) {
              if (ctag.tags.contains(tag)) {
                newSet.addCompositeTag(ctag);
              } else {
                CgCompositeTag ctagnew = new CgCompositeTag();
                for (CgTag tag2 : ctag.tags) {
                  ctagnew.addTag(tag2);
                }
                ctagnew.addTag(tag);
                newSet.addCompositeTag(ctagnew);
              }
            }
          }
          for (CgCompositeTag ctag : expandedSet2.tags) {
            for (CgTag tag : expandedSet1.single_tags) {
              if (ctag.tags.contains(tag)) {
                newSet.addCompositeTag(ctag);
              } else {
                CgCompositeTag ctagnew = new CgCompositeTag();
                for (CgTag tag2 : ctag.tags) {
                  ctagnew.addTag(tag2);
                }
                ctagnew.addTag(tag);
                newSet.addCompositeTag(ctagnew);
              }
            }
          }
          for (CgCompositeTag ctag : expandedSet1.tags) {
            for (CgCompositeTag ctag2 : expandedSet2.tags) {
              CgCompositeTag ctagnew = new CgCompositeTag();
              for (CgTag tag : ctag.tags) {
                if (!ctagnew.tags.contains(tag)) {
                  ctagnew.addTag(tag);
                }
              }
              for (CgTag tag : ctag2.tags) {
                if (!ctagnew.tags.contains(tag)) {
                  ctagnew.addTag(tag);
                }
              }
              newSet.addCompositeTag(ctagnew);
            }
          }
        }
        // OR or |
        else if (set.set_ops.get(op) == 3) {
          for (CgCompositeTag ctag : expandedSet1.tags) {
            newSet.addCompositeTag(ctag);
          }
          for (CgCompositeTag ctag : expandedSet2.tags) {
            newSet.addCompositeTag(ctag);
          }
          for (CgTag tag : expandedSet1.single_tags) {
            newSet.addTag(tag);
          }
          for (CgTag tag : expandedSet2.single_tags) {
            newSet.addTag(tag);
          }
        }
      }
    }
    return newSet;
  }
 
  @Override
  public String generateName(Object ruleObject) {
    CgRule rule = (CgRule) ruleObject;
    String name = rule.name;
    if (name == null) {
      name = "rule_" + nameIndex;
      nameIndex++;
    }
    return name;
  }
 
  @Override
  public String generateId(Object ruleObject) {
    CgRule rule = (CgRule) ruleObject;
    String name = rule.name;
    if (name == null) {
      name = "rule_" + idIndex;
      idIndex++;
    }
    return name;
  }
 
  @Override
  public String getOriginalRuleString(Object ruleObject) {
    CgRule rule = (CgRule) ruleObject;
    return lines[rule.line];
  }
 
  @Override
  public String[] getAcceptableFileTypes() {
    String[] ft = {"default"};
    return ft;
  }
 
 
  /**
   * Token class: contains the elements that will be written into {@code <token>} elements in the resulting LT rule
   * @author mbryant
   */
  public class Token implements Comparable<Token> {

    // target consists of the expanded CgSet, with no nested sets.
    public CgSet target;
    public String[] postags;
    public String[] surfaceforms;
    public String[] baseforms;
    public CgCompositeTag[] compositeTags;
    public boolean careful;
    public int offset;
    public boolean scanahead;
    public boolean scanbehind;
    public CgSet barrier;
    public boolean cbarrier;
    public int skip;
    public boolean negate;
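    // offset relative to the previous token in a linked chain (see getLinkedTokens/resolveLinkedTokens)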
    public int relativeOffset;
    public Token nextToken;
    public Token prevToken;
    public String exceptionString;
   
    public Token() {
      // nothing
    }
   
    // copy constructor
    public Token(Token another) {
      this.target = new CgSet(another.target);
      this.postags = target.getPostagsString();
      this.surfaceforms = target.getSingleTagSurfaceformsString();
      this.baseforms = target.getSingleTagBaseformsString();
      this.compositeTags = target.getCompositeTags();
      this.careful = another.careful;
      this.offset = another.offset;
      this.scanahead = another.scanahead;
      this.scanbehind = another.scanbehind;
      this.barrier = new CgSet(another.barrier);
      this.cbarrier = another.cbarrier;
      this.skip = another.skip;
      this.negate = another.negate;
      this.nextToken = another.nextToken;
      this.prevToken = another.prevToken;
      this.relativeOffset = another.relativeOffset;
      this.exceptionString = another.exceptionString;
    }
   
    public Token(CgSet target,
           boolean careful, int offset, boolean scanahead, boolean scanbehind,
           CgSet barrier, boolean cbarrier, int skip, boolean negate) {
      this.target = target;
      this.postags = target.getPostagsString();
      this.surfaceforms = target.getSingleTagSurfaceformsString();
      this.baseforms = target.getSingleTagBaseformsString();
      this.compositeTags = target.getCompositeTags();
      this.careful = careful;
      this.offset = offset;
      this.scanahead = scanahead;
      this.scanbehind = scanbehind;
      this.barrier = barrier;
      this.cbarrier = cbarrier;
      this.skip = skip;
      this.negate = negate;
      this.nextToken = null;
      this.prevToken = null;
      this.relativeOffset = 0;
    }
   
    @Override
    public int compareTo(Token token) {
      // order tokens by their offset
      return Integer.compare(this.offset, token.offset);
    }
   
    public boolean isEmpty() {
      return (this.postags.length == 0) &&
           (this.baseforms.length == 0) &&
           (this.surfaceforms.length == 0) &&
           (this.compositeTags.length == 0);
    }
   
    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      for (String postag : this.postags) {
        sb.append(postag + " ");
      }
      for (String baseform : this.baseforms) {
        sb.append(baseform + " ");
      }
      for (String surfaceform : this.surfaceforms) {
        sb.append(surfaceform + " ");
      }
      for (CgCompositeTag ctag : this.compositeTags) {
        sb.append(ctag.toString() + " ");
      }
      return sb.toString();
    }
  }
 
 
  // ** SOME STATIC STRING-CHECKING METHODS **
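
  // Examples of the CG tag shapes these matchers distinguish (inferred from the regexes below):
  //   surface form: "<walked>"   base/lemma form: "walk"   postag: anything unquoted, e.g. N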
 
  public static boolean isPostag(String tag) {
    return !(tag.matches("\"\\<.*\\>\"r?i?") || tag.matches("\".*\"r?i?"));
  }
 
  public static boolean isSurfaceForm(String form) {
    return (form.matches("\"\\<.*\\>\"r?i?") || form.matches("\"\"\\<.*\\>\"r?i?\""));
  }
 
  public static boolean isBaseForm(String form) {
    return (form.matches("\"[^<]*[^>]\"r?i?") || form.matches("\"\"[^<]*[^>]\"r?i?\""));
  }
 
  public static boolean isCompositePostag(CgCompositeTag ctag) {
    for (CgTag tag : ctag.tags) {
      if (isBaseForm(tag.tag) || isSurfaceForm(tag.tag)) {
        return false;
      }
    }
    return true;
  }
 
  public static boolean sameStrings(String[] s1, String[] s2) {
    if (s1.length != s2.length) {
      return false;
    }
    // order-insensitive comparison of the two arrays' contents
    return new HashSet<>(Arrays.asList(s1)).equals(new HashSet<>(Arrays.asList(s2)));
  }
 
 
 
  // ** METHODS THAT RETURN WRITABLE FORMS FOR CONSTRAINT GRAMMAR TAGS/SETS **
 
  /**
   * Removes the quotation marks, angle brackets, and suffixes from surface/base forms
   */
  public static String[] cleanForms(String[] words) {
    for (int i=0;i<words.length;i++) {
      words[i] = words[i].replaceAll(">\"r?i?", "").replaceAll(">\"", "").replaceAll("\"<","").replaceAll("\"r?i?", "");
    }
    return words;
  }
 
  // this is problematic, in that I don't know how to write these regular expressions when we have
  // different morphological tag syntax, such as in Polish or French.
  public static String filterRegexp(CgSet target) {
    String[] postags = target.getPostagsString();
    String postagString = glueWords(postags);
    postagString = "(".concat(postagString).concat(")");
    String postagRegexp = toStringRegexpFormat(postagString);
   
    StringBuilder sb = new StringBuilder();
    sb.append("(?!");
    sb.append(postagRegexp);
    sb.append(").*");
    return sb.toString();
  }
 
  public static String filterTarget(CgSet target, int targetNo) {
    // <match no=\"" + (mark + 1) + "\" postag=\"" + replaceRegexp(target) + "\" postag_regexp=\"yes\"/>
    StringBuilder sb = new StringBuilder();
   
    String[] lemmas = cleanForms(target.getSingleTagBaseformsString());
    String[] postags = cleanForms(target.getPostagsString());
    CgCompositeTag[] compositeTags = target.getCompositeTags();
    String[] surfaceforms = cleanForms(target.getSingleTagSurfaceformsString());
   
    if (lemmas.length > 0 && (compositeTags.length > 0 || postags.length > 0 || surfaceforms.length > 0)) {
      System.err.println("Error: something went wrong here.");
    }
   
    // assumes there can't be both lemmas and postags
    if (lemmas.length > 0) {
      sb.append("<disambig action=\"filter\"><match no=\"" + targetNo + "\">" + glueWords(lemmas) + "</match></disambig>");
    }
    if (postags.length > 0) {
      String postagRegexp = "";
      if (isRegex(glueWords(postags))) {
        postagRegexp = " postag_regexp=\"yes\"";
      }
      sb.append("<disambig postag=\"" + glueWords(postags) + "\"" + postagRegexp + "/>");
    }
   
    return sb.toString();
   
  }
 
  // formats the target for use with disambiguation action="remove" keyword
  // assuming they support regular expressions, which they currently don't but kind of have to in order to work
  public static String removeTarget(CgSet target) {
    StringBuilder sb = new StringBuilder();
    sb.append("<wd ");
    // these should always be the correct lengths, because if they weren't, they should have been split earlier.
    String[] lemmas = target.getSingleTagBaseformsString();
    String[] postags = target.getPostagsString();
    CgCompositeTag[] compositeTags = target.getCompositeTags();
    String[] surfaceforms = target.getSingleTagSurfaceformsString();
   
    if (lemmas.length > 0 && (compositeTags.length > 0 || postags.length > 0 || surfaceforms.length > 0)) {
      System.err.println("Error: something went wrong here.");
    }
   
    if (lemmas.length > 0) {
      sb.append("lemma=" + glueWords(lemmas));
    }
    if (postags.length > 0) {
      sb.append("pos=\"" + glueWords(postags) + "\"");
    }
   
    sb.append("/>");
    return sb.toString();
  }
 
  public static String replaceRegexp(CgSet target) {
    String[] postags = target.getPostagsString();
    String postagString = glueWords(postags);
    postagString = "(".concat(postagString).concat(")");
    String postagRegexp = toStringRegexpFormat(postagString);
    return postagRegexp;
   
  }
 
  public static String addRegexp(CgSet target) {
    String[] postags = target.getSingleTagPostagsString();
    if (postags.length != 1) {
      System.err.println("Error: trying to map more than one mapping tag on line " + target.line);
      System.exit(1);
    }
    String postag = postags[0];
    return postag;
  }
 
  // ** LANGUAGE-DEPENDENT METHODS **
 
  public static String postagsToString(String[] postags) {
   
    if (postags.length == 0) {
      return "";
    }
    /*
    StringBuilder sb = new StringBuilder();
    sb.append("(.*" + tagDelimiter + ")?");
    String postagsGlued = glueWords(postags);
    sb.append(postagsGlued);
    sb.append("(" + tagDelimiter + ".*)?");
    */
    // The simplest possible way to do this
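    // e.g. {"N", "V"} -> "N|V"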
    StringBuilder sb = new StringBuilder();
    for (String pos : postags) {
      sb.append(pos + "|");
    }
    String ret = sb.toString();
    return ret.substring(0,ret.length() - 1);
  }
 
  public static String toStringRegexpFormat(String t) {
    /*
    StringBuilder sb = new StringBuilder();
    sb.append("^(.*" + tagDelimiter + ")?");
    sb.append(t);
    sb.append("(" + tagDelimiter + ".*)?$");
    return sb.toString();
    */
    return t;
   
  }
 
  //TODO: only a stand-in for now; depends on the language-specific multiple-tag string representation
  // some complicated regex stuff going on here.
  // only should be applied to composite postags. Composite tags with postags + s/b-forms get split in different ways
  public static String compositePostagToString(CgCompositeTag ctag) {
    /*
    StringBuilder sb = new StringBuilder();
    String gluedPostag = "";
    int noComponents = 0;
    for (CgTag tag : ctag.tags) {
      if (isPostag(tag.tag)) {
        gluedPostag = gluedPostag.concat(tag.tag).concat("|");
        noComponents++;
      }
    }
    gluedPostag = "(".concat(gluedPostag.substring(0, gluedPostag.length() - 1)).concat(")");
    sb.append("^(.*" + tagDelimiter + ")?");
    for (int i=0;i<noComponents;i++) {
      sb.append(gluedPostag);
      if (i < noComponents - 1) {
        sb.append("(" + tagDelimiter + ".*" + tagDelimiter + "?)");
      }     
    }
    sb.append("(" + tagDelimiter + ".*)?$");
    */
    // The simplest possible way to represent this
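    // e.g. a composite tag (N Sg) with tagDelimiter ":" becomes "N:Sg.*"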
    StringBuilder sb = new StringBuilder();
    for (CgTag tag : ctag.tags) {
      if (isPostag(tag.tag)) {
        sb.append(tag.tag);
        sb.append(tagDelimiter);
      }
    }
    sb.deleteCharAt(sb.length() - 1);
    sb.append(".*");
    return sb.toString();
  }
 
  // language-specific way of representing tags. Relies on a tagDelimiter, which appears to be different in different languages.
  // For example, in French, the tags look like: "N f s" (i.e. Noun feminine singular). But in Polish they look like "N:f:s" (same).
  // so this syntax takes care of both of those cases (as long as you change the tagDelimiter), wherever in the postag string
  // the tag appears.
  public static String tagToString(CgTag tag) {
    /*
    if (tag.tag.equals(sent_end)) {
      return sent_end;
    }
    if (tag.tag.equals(sent_start)) {
      return sent_start;
    }
    StringBuilder sb = new StringBuilder();
    String t = tag.tag;
    sb.append("^" + t + "$|");                      // the tag is the only tag
    sb.append("^" + t + tagDelimiter + ".*|");              // the tag is the first tag
    sb.append(".*" + tagDelimiter + t + tagDelimiter + ".*|");      // the tag is in the middle somewhere
    sb.append(".*" + tagDelimiter + t + "$");              // the tag is the last tag
   
    return sb.toString();
    */
    // The simplest possible way to do this
    return tag.tag;
  }
 
  /**
   * Helper to properly put an item to a map where the values are lists
   */
  public static <K,V> Map<K,ArrayList<V>> smartPut(Map<K,ArrayList<V>> map, K key, V value) {
    if (map.containsKey(key)) {
      map.get(key).add(value);
    } else {
      ArrayList<V> newcollection = new ArrayList<>();
      newcollection.add(value);
      map.put(key, newcollection);
    }
    return map;
  }

}