Package org.apache.uima.ruta.textruler.learner.lp2

Source Code of org.apache.uima.ruta.textruler.learner.lp2.NaiveLP2

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.uima.ruta.textruler.learner.lp2;

import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Set;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.ruta.engine.RutaEngine;
import org.apache.uima.ruta.textruler.TextRulerPlugin;
import org.apache.uima.ruta.textruler.core.TextRulerAnnotation;
import org.apache.uima.ruta.textruler.core.TextRulerExample;
import org.apache.uima.ruta.textruler.core.TextRulerRule;
import org.apache.uima.ruta.textruler.core.TextRulerRuleItem;
import org.apache.uima.ruta.textruler.core.TextRulerRulePattern;
import org.apache.uima.ruta.textruler.core.TextRulerTarget;
import org.apache.uima.ruta.textruler.core.TextRulerTarget.MLTargetType;
import org.apache.uima.ruta.textruler.core.TextRulerToolkit;
import org.apache.uima.ruta.textruler.extension.TextRulerLearnerDelegate;
import org.apache.uima.ruta.textruler.learner.lp2.LP2RuleItem.MLLP2ContextConstraint;
import org.apache.uima.ruta.textruler.learner.lp2.LP2RuleItem.MLLP2OtherConstraint;
import org.apache.uima.util.FileUtils;

public class NaiveLP2 extends BasicLP2 {

  public NaiveLP2(String inputDir, String prePropTMFile, String tmpDir, String[] slotNames,
          Set<String> filterSet, boolean skip, TextRulerLearnerDelegate delegate) {
    super(inputDir, prePropTMFile, tmpDir, slotNames, filterSet, skip, delegate);
  }

  public static final boolean SAVE_DEBUG_INFO_IN_TEMPFOLDER = false;

  @Override
  protected void induceRulesFromExample(TextRulerExample e, int roundNumber) {
    LP2Rule baseRule = createInitialRuleForPositiveExample(e);
    List<LP2Rule> genRules = generalizeRule(baseRule);

    if (shouldAbort())
      return;

    List<LP2Rule> test = new ArrayList<LP2Rule>();

    // int i=1;
    // for (LP2Rule newRule : genRules)
    // {
    // if (shouldAbort())
    // return;
    // sendStatusUpdateToDelegate("Round "+roundNumber+" - Testing proposed generalization "+i+"/"+(genRules.size())+
    // "    - uncovered examples: "+
    // (examples.size()-coveredExamples.size() + " / "+examples.size()),
    // TextRulerLearnerState.ML_RUNNING, false);
    // i++;
    // testRuleOnDocumentSet(newRule, exampleDocuments);
    //
    // checkAndHandleNewRule(newRule);
    //
    // if (TextRulerToolkit.DEBUG)
    // test.add(newRule);
    // }
    // new cache and testCAS optimized rule testing:

    sendStatusUpdateToDelegate(
            "Round " + roundNumber + " - Testing " + (genRules.size())
                    + "generalizations... - uncovered examples: "
                    + (examples.size() - coveredExamples.size() + " / " + examples.size()),
            TextRulerLearnerState.ML_RUNNING, false);
    testRulesOnDocumentSet(new ArrayList<TextRulerRule>(genRules), exampleDocuments);

    for (LP2Rule newRule : genRules) {
      checkAndHandleNewRule(newRule);
      if (TextRulerToolkit.DEBUG)
        test.add(newRule);
    }

    if (TextRulerToolkit.DEBUG && SAVE_DEBUG_INFO_IN_TEMPFOLDER) {
      Collections.sort(test, new Comparator<LP2Rule>() {

        public int compare(LP2Rule o1, LP2Rule o2) {
          return o1.getRuleString().compareTo(o2.getRuleString());
        }

      });

      String startend = e.getTarget().type == MLTargetType.SINGLE_LEFT_BOUNDARY ? "left_"
              : "right_";
      File file = new File(tempDirectory() + startend + "generalizations" + roundNumber + RutaEngine.SCRIPT_FILE_EXTENSION);
      StringBuffer str = new StringBuffer();
      for (TextRulerRule rule : test) {
        str.append(rule.getCoveringStatistics() + "\t\t" + rule.getRuleString() + "\n");
      }
      try {
        FileUtils.saveString2File(str.toString(), file);
      } catch (Exception ex) {
        TextRulerPlugin.error(ex);
      }
    }

  }

  protected void checkAndHandleNewRule(LP2Rule rule) {
    boolean tooFewPositives = rule.getCoveringStatistics().getCoveredPositivesCount() < minCoveredPositives;
    boolean tooManyErrors = rule.getErrorRate() > maxErrorThreshold;

    boolean isBestRule = !(tooFewPositives || tooManyErrors);

    if (TextRulerToolkit.DEBUG && SAVE_DEBUG_INFO_IN_TEMPFOLDER)
      TextRulerToolkit.appendStringToFile(tempDirectory() + "bestcandidates"+RutaEngine.SCRIPT_FILE_EXTENSION,
              rule.getRuleString() + "\n");

    if (isBestRule) {
      currentBestRules.add(rule);
      currentBestRules.removeSubsumedRules();
      currentBestRules.cutToMaxSize();
    } else if (!tooFewPositives) {

      // test in context
      // in our TM representation, we simply can add a NEAR condition in
      // the MARKing rule item and retest it on the
      // corpus. we should do that for all kinds of tags we have, but
      // currently we only do it for the corresponding opening/closing
      // tag, since we do not have any information about other slots yet!
      // // TODO use all other slot tags! (see optimized version as well)

      if (true) {
        rule = rule.copy();
        LP2RuleItem item = rule.getMarkingRuleItem();
        // TextRulerToolkit.log("CONTEXTUAL RULE CANDIDATE: "+rule.getRuleString()+"  ;  "+rule.getCoveringStatistics());
        item.setContextConstraint(new MLLP2ContextConstraint(slotMaximumTokenCountMap.get(rule
                .getTarget().getSingleSlotRawTypeName()), rule));
        rule.setIsContextualRule(true);

        rule.setNeedsCompile(true);

        if (TextRulerToolkit.DEBUG && SAVE_DEBUG_INFO_IN_TEMPFOLDER)
          TextRulerToolkit.appendStringToFile(tempDirectory() + "ctxcandidates"+RutaEngine.SCRIPT_FILE_EXTENSION,
                  rule.getRuleString());

        testRuleOnDocumentSet(rule, exampleDocuments); // not very
        // fast... but
        // works!
        boolean ctxTooFewPositives = rule.getCoveringStatistics().getCoveredPositivesCount() < minCoveredPositives;
        boolean ctxTooManyErrors = rule.getErrorRate() > maxErrorThreshold;
        boolean isGoodContextRule = !(ctxTooFewPositives || ctxTooManyErrors);
        if (isGoodContextRule) {
          currentContextualRules.add(rule);
          currentContextualRules.removeSubsumedRules();
          currentContextualRules.cutToMaxSize();
        }
      }

    }
  }

  protected List<LP2Rule> generalizeRule(LP2Rule baseRule) {
    List<LP2Rule> result = new ArrayList<LP2Rule>();
    TextRulerRulePattern rulePattern = new TextRulerRulePattern();
    TextRulerRulePattern prePattern = baseRule.getPreFillerPattern();

    for (int i = prePattern.size() - 1; i >= 0; i--) // we have to reverse
    // the order again!
    {
      rulePattern.add(prePattern.get(i));
    }
    rulePattern.addAll(baseRule.getPostFillerPattern());

    recursiveGeneralizeRule(baseRule, rulePattern, new TextRulerRulePattern(), result);
    TextRulerToolkit.log("GENERALIZATIONS: " + result.size());

    for (LP2Rule r : result)
      removeOutermostWildCardItemsFromRule(r);

    // for (LP2Rule r : result)
    // {
    // TextRulerToolkit.log("NEWRULE = "+r.getRuleString());
    // }

    return result;
  }

  protected LP2Rule createInitialRuleForPositiveExample(TextRulerExample example) {
    TextRulerTarget target = example.getTarget();
    LP2Rule rule = new LP2Rule(this, example.getTarget());
    CAS docCas = example.getDocumentCAS();
    TextRulerAnnotation exampleAnnotation = example.getAnnotation();
    TypeSystem ts = docCas.getTypeSystem();
    Type tokensRootType = ts.getType(TextRulerToolkit.RUTA_ANY_TYPE_NAME);
    int thePosition = target.type == MLTargetType.SINGLE_LEFT_BOUNDARY ? exampleAnnotation
            .getBegin() : exampleAnnotation.getEnd();

    List<AnnotationFS> leftContext = TextRulerToolkit.getAnnotationsBeforePosition(docCas,
            thePosition, windowSize,
            TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokensRootType);
    List<AnnotationFS> rightContext = TextRulerToolkit.getAnnotationsAfterPosition(docCas,
            thePosition, windowSize,
            TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokensRootType);

    // the left context has to be reversed since we get the arrayList from
    // the slot's point of view!
    for (int i = leftContext.size() - 1; i >= 0; i--) {
      TextRulerAnnotation annot = new TextRulerAnnotation(leftContext.get(i), example.getDocument());
      LP2RuleItem item = new LP2RuleItem();
      item.setWordConstraint(annot);
      if (item.getWordConstraint().isRegExpConstraint())
        item.addOtherConstraint(new MLLP2OtherConstraint(annot, annot));
      rule.addPreFillerItem(item);
    }

    for (AnnotationFS afs : rightContext) {
      TextRulerAnnotation annot = new TextRulerAnnotation(afs, example.getDocument());
      LP2RuleItem item = new LP2RuleItem();
      item.setWordConstraint(annot);
      if (item.getWordConstraint().isRegExpConstraint())
        item.addOtherConstraint(new MLLP2OtherConstraint(annot, annot));

      rule.addPostFillerItem(item);
    }
    TextRulerToolkit.log("INITIAL RULE: " + rule.getRuleString());
    return rule;
  }

  protected void recursiveGeneralizeRule(LP2Rule baseRule, TextRulerRulePattern allItems,
          TextRulerRulePattern currentPattern, List<LP2Rule> resultList) {
    if (currentPattern.size() == allItems.size()) {
      // create new Rule
      LP2Rule newRule = new LP2Rule(this, baseRule.getTarget());
      int preCount = baseRule.getPreFillerPattern().size();
      for (int i = 0; i < currentPattern.size(); i++) {
        if (i < preCount)
          newRule.addPreFillerItem(currentPattern.get(i));
        else
          newRule.addPostFillerItem(currentPattern.get(i));
      }
      // TextRulerToolkit.log("GEN: "+newRule.getRuleString());
      if (newRule.totalInnerConstraintCount() > 0) // skip the ANY ANY ANY
        // ANY... rule ! this
        // makes no sense in no
        // application!!
        resultList.add(newRule);
    } else {
      int index = currentPattern.size();
      TextRulerRuleItem baseItem = allItems.get(index);
      List<TextRulerRuleItem> itemGeneralizations = generalizeRuleItem((LP2RuleItem) baseItem);
      for (TextRulerRuleItem newItem : itemGeneralizations) {
        currentPattern.add(newItem);
        recursiveGeneralizeRule(baseRule, allItems, currentPattern, resultList);
        currentPattern.remove(currentPattern.size() - 1);
      }
    }
  }

  protected void recursiveGeneralizeRuleItem(LP2RuleItem baseItem,
          List<MLLP2OtherConstraint> otherConstraints, int currentConstraintIndex,
          List<MLLP2OtherConstraint> currentConstraintTuple, List<TextRulerRuleItem> result) {
    if (currentConstraintIndex > otherConstraints.size() - 1) {
      LP2RuleItem newItem;
      newItem = new LP2RuleItem();
      for (MLLP2OtherConstraint c : currentConstraintTuple)
        newItem.addOtherConstraint(c.copy());
      result.add(newItem);
    } else {
      MLLP2OtherConstraint currentConstraint = otherConstraints.get(currentConstraintIndex);
      // recurse WITH and WITHOUT this key:
      recursiveGeneralizeRuleItem(baseItem, otherConstraints, currentConstraintIndex + 1,
              currentConstraintTuple, result);
      currentConstraintTuple.add(currentConstraint);
      recursiveGeneralizeRuleItem(baseItem, otherConstraints, currentConstraintIndex + 1,
              currentConstraintTuple, result);
      currentConstraintTuple.remove(currentConstraintTuple.size() - 1);
    }
  }

  protected List<TextRulerRuleItem> generalizeRuleItem(LP2RuleItem baseItem) {
    List<TextRulerRuleItem> result = new ArrayList<TextRulerRuleItem>();

    // one with word constraint
    if (baseItem.getWordConstraint() != null) {
      LP2RuleItem newItem = new LP2RuleItem();
      newItem.setWordConstraint(baseItem.getWordConstraint().copy());
      result.add(newItem);
    }

    // all other combinations without word constraint
    // List<String> keys = new
    // ArrayList<String>(baseItem.getOtherConstraints().keySet());
    List<MLLP2OtherConstraint> constraints = baseItem.getOtherConstraints();
    recursiveGeneralizeRuleItem(baseItem, constraints, 0, new ArrayList<MLLP2OtherConstraint>(),
            result);
    return result;
  }

  protected void removeOutermostWildCardItemsFromRule(LP2Rule rule) {
    while (true) {
      LP2RuleItem item = (LP2RuleItem) rule.getOutermostPreFillerItem();
      if (item == null) // no more items left
        break;

      // if this rule is a RIGHT BOUNDARY rule, we must not remove the
      // last remaining pre filler item,
      // since this is used for marking the SLOT END BOUNDARY (= RIGHT
      // BOUNDARY)
      if ((rule.getTarget().type == MLTargetType.SINGLE_RIGHT_BOUNDARY)
              && (rule.getPreFillerPattern().size() == 1))
        break;

      if (item.totalConstraintCount() == 0)
        rule.removeOutermostPreFillerItem();
      else
        break;
    }
    while (true) {
      LP2RuleItem item = (LP2RuleItem) rule.getOutermostPostFillerItem();
      if (item == null) // no more items left
        break;

      // if this rule is a LEFT BOUNDARY rule, we must not remove the last
      // remaining post filler item,
      // since this is used for marking the SLOT START BOUNDARY (= LEFT
      // BOUNDARY)
      if ((rule.getTarget().type == MLTargetType.SINGLE_LEFT_BOUNDARY)
              && (rule.getPostFillerPattern().size() == 1))
        break;

      if (item.totalConstraintCount() == 0)
        rule.removeOutermostPostFillerItem();
      else
        break;
    }
  }

  @Override
  public boolean collectNegativeCoveredInstancesWhenTesting() {
    return false;
  }

}
TOP

Related Classes of org.apache.uima.ruta.textruler.learner.lp2.NaiveLP2

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.