// Source Code of org.apache.uima.ruta.textruler.learner.lp2.NaiveLP2

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */


package org.apache.uima.ruta.textruler.learner.lp2;


import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Set;


import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.ruta.engine.RutaEngine;
import org.apache.uima.ruta.textruler.TextRulerPlugin;
import org.apache.uima.ruta.textruler.core.TextRulerAnnotation;
import org.apache.uima.ruta.textruler.core.TextRulerExample;
import org.apache.uima.ruta.textruler.core.TextRulerRule;
import org.apache.uima.ruta.textruler.core.TextRulerRuleItem;
import org.apache.uima.ruta.textruler.core.TextRulerRulePattern;
import org.apache.uima.ruta.textruler.core.TextRulerTarget;
import org.apache.uima.ruta.textruler.core.TextRulerTarget.MLTargetType;
import org.apache.uima.ruta.textruler.core.TextRulerToolkit;
import org.apache.uima.ruta.textruler.extension.TextRulerLearnerDelegate;
import org.apache.uima.ruta.textruler.learner.lp2.LP2RuleItem.MLLP2ContextConstraint;
import org.apache.uima.ruta.textruler.learner.lp2.LP2RuleItem.MLLP2OtherConstraint;
import org.apache.uima.util.FileUtils;


public class NaiveLP2 extends BasicLP2 {


  /**
   * Creates a naive LP2 learner. Every argument is forwarded unchanged to the {@link BasicLP2}
   * constructor; this class adds no construction logic of its own.
   */
  public NaiveLP2(String inputDir, String prePropTMFile, String tmpDir, String[] slotNames,
          Set<String> filterSet, boolean skip, TextRulerLearnerDelegate delegate) {
    super(inputDir, prePropTMFile, tmpDir, slotNames, filterSet, skip, delegate);
  }


  /**
   * Debug switch: when {@code true} and {@code TextRulerToolkit.DEBUG} is also set, this learner
   * writes the tested generalizations and the best/contextual rule candidates to files below
   * {@code tempDirectory()}.
   */
  public static final boolean SAVE_DEBUG_INFO_IN_TEMPFOLDER = false;


  @Override
  protected void induceRulesFromExample(TextRulerExample e, int roundNumber) {
    LP2Rule baseRule = createInitialRuleForPositiveExample(e);
    List<LP2Rule> genRules = generalizeRule(baseRule);


    if (shouldAbort())
      return;


    List<LP2Rule> test = new ArrayList<LP2Rule>();


    // int i=1;
    // for (LP2Rule newRule : genRules)
    // {
    // if (shouldAbort())
    // return;
    // sendStatusUpdateToDelegate("Round "+roundNumber+" - Testing proposed generalization "+i+"/"+(genRules.size())+
    // "    - uncovered examples: "+
    // (examples.size()-coveredExamples.size() + " / "+examples.size()),
    // TextRulerLearnerState.ML_RUNNING, false);
    // i++;
    // testRuleOnDocumentSet(newRule, exampleDocuments);
    //
    // checkAndHandleNewRule(newRule);
    //
    // if (TextRulerToolkit.DEBUG)
    // test.add(newRule);
    // }
    // new cache and testCAS optimized rule testing:


    sendStatusUpdateToDelegate(
            "Round " + roundNumber + " - Testing " + (genRules.size())
                    + "generalizations... - uncovered examples: "
                    + (examples.size() - coveredExamples.size() + " / " + examples.size()),
            TextRulerLearnerState.ML_RUNNING, false);
    testRulesOnDocumentSet(new ArrayList<TextRulerRule>(genRules), exampleDocuments);


    for (LP2Rule newRule : genRules) {
      checkAndHandleNewRule(newRule);
      if (TextRulerToolkit.DEBUG)
        test.add(newRule);
    }


    if (TextRulerToolkit.DEBUG && SAVE_DEBUG_INFO_IN_TEMPFOLDER) {
      Collections.sort(test, new Comparator<LP2Rule>() {


        public int compare(LP2Rule o1, LP2Rule o2) {
          return o1.getRuleString().compareTo(o2.getRuleString());
        }


      });


      String startend = e.getTarget().type == MLTargetType.SINGLE_LEFT_BOUNDARY ? "left_"
              : "right_";
      File file = new File(tempDirectory() + startend + "generalizations" + roundNumber + RutaEngine.SCRIPT_FILE_EXTENSION);
      StringBuffer str = new StringBuffer();
      for (TextRulerRule rule : test) {
        str.append(rule.getCoveringStatistics() + "\t\t" + rule.getRuleString() + "\n");
      }
      try {
        FileUtils.saveString2File(str.toString(), file);
      } catch (Exception ex) {
        TextRulerPlugin.error(ex);
      }
    }


  }


  protected void checkAndHandleNewRule(LP2Rule rule) {
    boolean tooFewPositives = rule.getCoveringStatistics().getCoveredPositivesCount() < minCoveredPositives;
    boolean tooManyErrors = rule.getErrorRate() > maxErrorThreshold;


    boolean isBestRule = !(tooFewPositives || tooManyErrors);


    if (TextRulerToolkit.DEBUG && SAVE_DEBUG_INFO_IN_TEMPFOLDER)
      TextRulerToolkit.appendStringToFile(tempDirectory() + "bestcandidates"+RutaEngine.SCRIPT_FILE_EXTENSION,
              rule.getRuleString() + "\n");


    if (isBestRule) {
      currentBestRules.add(rule);
      currentBestRules.removeSubsumedRules();
      currentBestRules.cutToMaxSize();
    } else if (!tooFewPositives) {


      // test in context
      // in our TM representation, we simply can add a NEAR condition in
      // the MARKing rule item and retest it on the
      // corpus. we should do that for all kinds of tags we have, but
      // currently we only do it for the corresponding opening/closing
      // tag, since we do not have any information about other slots yet!
      // // TODO use all other slot tags! (see optimized version as well)


      if (true) {
        rule = rule.copy();
        LP2RuleItem item = rule.getMarkingRuleItem();
        // TextRulerToolkit.log("CONTEXTUAL RULE CANDIDATE: "+rule.getRuleString()+"  ;  "+rule.getCoveringStatistics());
        item.setContextConstraint(new MLLP2ContextConstraint(slotMaximumTokenCountMap.get(rule
                .getTarget().getSingleSlotRawTypeName()), rule));
        rule.setIsContextualRule(true);


        rule.setNeedsCompile(true);


        if (TextRulerToolkit.DEBUG && SAVE_DEBUG_INFO_IN_TEMPFOLDER)
          TextRulerToolkit.appendStringToFile(tempDirectory() + "ctxcandidates"+RutaEngine.SCRIPT_FILE_EXTENSION,
                  rule.getRuleString());


        testRuleOnDocumentSet(rule, exampleDocuments); // not very
        // fast... but
        // works!
        boolean ctxTooFewPositives = rule.getCoveringStatistics().getCoveredPositivesCount() < minCoveredPositives;
        boolean ctxTooManyErrors = rule.getErrorRate() > maxErrorThreshold;
        boolean isGoodContextRule = !(ctxTooFewPositives || ctxTooManyErrors);
        if (isGoodContextRule) {
          currentContextualRules.add(rule);
          currentContextualRules.removeSubsumedRules();
          currentContextualRules.cutToMaxSize();
        }
      }


    }
  }


  protected List<LP2Rule> generalizeRule(LP2Rule baseRule) {
    List<LP2Rule> result = new ArrayList<LP2Rule>();
    TextRulerRulePattern rulePattern = new TextRulerRulePattern();
    TextRulerRulePattern prePattern = baseRule.getPreFillerPattern();


    for (int i = prePattern.size() - 1; i >= 0; i--) // we have to reverse
    // the order again!
    {
      rulePattern.add(prePattern.get(i));
    }
    rulePattern.addAll(baseRule.getPostFillerPattern());


    recursiveGeneralizeRule(baseRule, rulePattern, new TextRulerRulePattern(), result);
    TextRulerToolkit.log("GENERALIZATIONS: " + result.size());


    for (LP2Rule r : result)
      removeOutermostWildCardItemsFromRule(r);


    // for (LP2Rule r : result)
    // {
    // TextRulerToolkit.log("NEWRULE = "+r.getRuleString());
    // }


    return result;
  }


  protected LP2Rule createInitialRuleForPositiveExample(TextRulerExample example) {
    TextRulerTarget target = example.getTarget();
    LP2Rule rule = new LP2Rule(this, example.getTarget());
    CAS docCas = example.getDocumentCAS();
    TextRulerAnnotation exampleAnnotation = example.getAnnotation();
    TypeSystem ts = docCas.getTypeSystem();
    Type tokensRootType = ts.getType(TextRulerToolkit.RUTA_ANY_TYPE_NAME);
    int thePosition = target.type == MLTargetType.SINGLE_LEFT_BOUNDARY ? exampleAnnotation
            .getBegin() : exampleAnnotation.getEnd();


    List<AnnotationFS> leftContext = TextRulerToolkit.getAnnotationsBeforePosition(docCas,
            thePosition, windowSize,
            TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokensRootType);
    List<AnnotationFS> rightContext = TextRulerToolkit.getAnnotationsAfterPosition(docCas,
            thePosition, windowSize,
            TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokensRootType);


    // the left context has to be reversed since we get the arrayList from
    // the slot's point of view!
    for (int i = leftContext.size() - 1; i >= 0; i--) {
      TextRulerAnnotation annot = new TextRulerAnnotation(leftContext.get(i), example.getDocument());
      LP2RuleItem item = new LP2RuleItem();
      item.setWordConstraint(annot);
      if (item.getWordConstraint().isRegExpConstraint())
        item.addOtherConstraint(new MLLP2OtherConstraint(annot, annot));
      rule.addPreFillerItem(item);
    }


    for (AnnotationFS afs : rightContext) {
      TextRulerAnnotation annot = new TextRulerAnnotation(afs, example.getDocument());
      LP2RuleItem item = new LP2RuleItem();
      item.setWordConstraint(annot);
      if (item.getWordConstraint().isRegExpConstraint())
        item.addOtherConstraint(new MLLP2OtherConstraint(annot, annot));


      rule.addPostFillerItem(item);
    }
    TextRulerToolkit.log("INITIAL RULE: " + rule.getRuleString());
    return rule;
  }


  protected void recursiveGeneralizeRule(LP2Rule baseRule, TextRulerRulePattern allItems,
          TextRulerRulePattern currentPattern, List<LP2Rule> resultList) {
    if (currentPattern.size() == allItems.size()) {
      // create new Rule
      LP2Rule newRule = new LP2Rule(this, baseRule.getTarget());
      int preCount = baseRule.getPreFillerPattern().size();
      for (int i = 0; i < currentPattern.size(); i++) {
        if (i < preCount)
          newRule.addPreFillerItem(currentPattern.get(i));
        else
          newRule.addPostFillerItem(currentPattern.get(i));
      }
      // TextRulerToolkit.log("GEN: "+newRule.getRuleString());
      if (newRule.totalInnerConstraintCount() > 0) // skip the ANY ANY ANY
        // ANY... rule ! this
        // makes no sense in no
        // application!!
        resultList.add(newRule);
    } else {
      int index = currentPattern.size();
      TextRulerRuleItem baseItem = allItems.get(index);
      List<TextRulerRuleItem> itemGeneralizations = generalizeRuleItem((LP2RuleItem) baseItem);
      for (TextRulerRuleItem newItem : itemGeneralizations) {
        currentPattern.add(newItem);
        recursiveGeneralizeRule(baseRule, allItems, currentPattern, resultList);
        currentPattern.remove(currentPattern.size() - 1);
      }
    }
  }


  protected void recursiveGeneralizeRuleItem(LP2RuleItem baseItem,
          List<MLLP2OtherConstraint> otherConstraints, int currentConstraintIndex,
          List<MLLP2OtherConstraint> currentConstraintTuple, List<TextRulerRuleItem> result) {
    if (currentConstraintIndex > otherConstraints.size() - 1) {
      LP2RuleItem newItem;
      newItem = new LP2RuleItem();
      for (MLLP2OtherConstraint c : currentConstraintTuple)
        newItem.addOtherConstraint(c.copy());
      result.add(newItem);
    } else {
      MLLP2OtherConstraint currentConstraint = otherConstraints.get(currentConstraintIndex);
      // recurse WITH and WITHOUT this key:
      recursiveGeneralizeRuleItem(baseItem, otherConstraints, currentConstraintIndex + 1,
              currentConstraintTuple, result);
      currentConstraintTuple.add(currentConstraint);
      recursiveGeneralizeRuleItem(baseItem, otherConstraints, currentConstraintIndex + 1,
              currentConstraintTuple, result);
      currentConstraintTuple.remove(currentConstraintTuple.size() - 1);
    }
  }


  protected List<TextRulerRuleItem> generalizeRuleItem(LP2RuleItem baseItem) {
    List<TextRulerRuleItem> result = new ArrayList<TextRulerRuleItem>();


    // one with word constraint
    if (baseItem.getWordConstraint() != null) {
      LP2RuleItem newItem = new LP2RuleItem();
      newItem.setWordConstraint(baseItem.getWordConstraint().copy());
      result.add(newItem);
    }


    // all other combinations without word constraint
    // List<String> keys = new
    // ArrayList<String>(baseItem.getOtherConstraints().keySet());
    List<MLLP2OtherConstraint> constraints = baseItem.getOtherConstraints();
    recursiveGeneralizeRuleItem(baseItem, constraints, 0, new ArrayList<MLLP2OtherConstraint>(),
            result);
    return result;
  }


  protected void removeOutermostWildCardItemsFromRule(LP2Rule rule) {
    while (true) {
      LP2RuleItem item = (LP2RuleItem) rule.getOutermostPreFillerItem();
      if (item == null) // no more items left
        break;


      // if this rule is a RIGHT BOUNDARY rule, we must not remove the
      // last remaining pre filler item,
      // since this is used for marking the SLOT END BOUNDARY (= RIGHT
      // BOUNDARY)
      if ((rule.getTarget().type == MLTargetType.SINGLE_RIGHT_BOUNDARY)
              && (rule.getPreFillerPattern().size() == 1))
        break;


      if (item.totalConstraintCount() == 0)
        rule.removeOutermostPreFillerItem();
      else
        break;
    }
    while (true) {
      LP2RuleItem item = (LP2RuleItem) rule.getOutermostPostFillerItem();
      if (item == null) // no more items left
        break;


      // if this rule is a LEFT BOUNDARY rule, we must not remove the last
      // remaining post filler item,
      // since this is used for marking the SLOT START BOUNDARY (= LEFT
      // BOUNDARY)
      if ((rule.getTarget().type == MLTargetType.SINGLE_LEFT_BOUNDARY)
              && (rule.getPostFillerPattern().size() == 1))
        break;


      if (item.totalConstraintCount() == 0)
        rule.removeOutermostPostFillerItem();
      else
        break;
    }
  }


  /**
   * Always returns {@code false} for this learner.
   * <p>
   * NOTE(review): presumably this tells the inherited rule-testing code not to collect the
   * negative covered instances while testing rules - confirm against the base class.
   */
  @Override
  public boolean collectNegativeCoveredInstancesWhenTesting() {
    return false;
  }


}
// Source Code of org.apache.uima.ruta.textruler.learner.lp2.NaiveLP2

// Related Classes of org.apache.uima.ruta.textruler.learner.lp2.NaiveLP2