// Source Code of edu.stanford.nlp.patterns.surface.SurfacePatternFactory

package edu.stanford.nlp.patterns.surface;


import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.util.CollectionUtils;
import edu.stanford.nlp.util.Execution;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.Triple;


import java.nio.charset.Charset;
import java.util.*;


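/**
 * Builds {@link SurfacePattern}s from the tokens surrounding a target token in a sentence.
 * Behaviour is controlled by the static options declared in this class, which
 * <code>setUp(Properties)</code> fills in from a properties object.
 *
 * Created by sonalg on 10/27/14.
 */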
public class SurfacePatternFactory extends PatternFactory{


  /**
   * Use POS tag restriction in the target term: One of this and
   * <code>addPatWithoutPOS</code> has to be true.
   */
  @Execution.Option(name = "usePOS4Pattern")
  public static boolean usePOS4Pattern = true;


  /**
   * Add patterns without POS restriction as well: One of this and
   * <code>usePOS4Pattern</code> has to be true.
   */
  @Execution.Option(name = "addPatWithoutPOS")
  public static boolean addPatWithoutPOS = true;


  /**
   * Consider contexts longer or equal to these many tokens.
   */
  @Execution.Option(name = "minWindow4Pattern")
  public static int minWindow4Pattern = 2;


  /**
   * Consider contexts less than or equal to these many tokens -- total of left
   * and right contexts be can double of this.
   */
  @Execution.Option(name = "maxWindow4Pattern")
  public static int maxWindow4Pattern = 4;


  /**
   * Consider contexts on the left of a token.
   */
  @Execution.Option(name = "usePreviousContext")
  public static boolean usePreviousContext = true;


  /**
   * Consider contexts on the right of a token.
   */
  @Execution.Option(name = "useNextContext")
  public static boolean useNextContext = false;;


  /**
   * If the whole (either left or right) context is just stop words, add the
   * pattern only if number of tokens is equal or more than this. This is get
   * patterns like "I am on X" but ignore "on X".
   */
  @Execution.Option(name = "numMinStopWordsToAdd")
  public static int numMinStopWordsToAdd = 3;




  /**
   * Adds the parent's tag from the parse tree to the target phrase in the patterns
   */
  @Execution.Option(name = "useTargetParserParentRestriction")
  public static boolean useTargetParserParentRestriction = false;


  /**
   * If the NER tag of the context tokens is not the background symbol,
   * generalize the token with the NER tag
   */
  @Execution.Option(name = "useContextNERRestriction")
  public static boolean useContextNERRestriction = false;


  /**
   * Ignore words like "a", "an", "the" when matching a pattern.
   */
  @Execution.Option(name = "useFillerWordsInPat")
  public static boolean useFillerWordsInPat = true;






  public static enum Genre {
    PREV, NEXT, PREVNEXT
  };


  // Shared separator tokens that getContext interleaves into every context it builds.
  // Neither has an initializer here; both are (re)assigned by setUp, so they are null until it runs.
  //   fw: setUp gives it the "$FILLER" binding and setNumOcc(0, 2) when useFillerWordsInPat is true;
  //       otherwise it is left as a freshly constructed Token.
  //   sw: setUp gives it the "$STOPWORD" binding and setNumOcc(0, 2) when useStopWordsBeforeTerm is true;
  //       otherwise it is left as a freshly constructed Token.
  static Token fw, sw;


  public static void setUp(Properties props){
    Execution.fillOptions(SurfacePatternFactory.class, props);
    Execution.fillOptions(SurfacePattern.class, props);
    if (!addPatWithoutPOS && !usePOS4Pattern) {
      throw new RuntimeException(
        "addPatWithoutPOS and usePOS4Pattern both cannot be false ");
    }


    fw = new Token();
    if (useFillerWordsInPat) {
      fw.setEnvBindRestriction("$FILLER");
      fw.setNumOcc(0,2);
    }
    sw = new Token();
    if (useStopWordsBeforeTerm) {
      sw.setEnvBindRestriction("$STOPWORD");
      sw.setNumOcc(0, 2);
    }
  }




  /**
   * Builds the surface patterns whose target is the token at index <code>i</code>
   * of <code>sent</code>.
   *
   * For every window size from 1 to <code>maxWindow4Pattern</code> the method
   * collects up to that many context tokens to the left (when
   * <code>usePreviousContext</code>) and to the right (when
   * <code>useNextContext</code>) of the target, and from those builds
   * <code>Genre.PREV</code>, <code>Genre.NEXT</code> and
   * <code>Genre.PREVNEXT</code> patterns, each in a variant without POS
   * restriction (when <code>addPatWithoutPOS</code>) and with POS restriction
   * (when <code>usePOS4Pattern</code>).
   *
   * Reads the shared tokens <code>fw</code> and <code>sw</code>, which are only
   * assigned in <code>setUp</code>.
   *
   * @param sent the sentence, as a list of tokens
   * @param i index of the target token in <code>sent</code>
   * @return the union of the PREV, NEXT and PREVNEXT patterns collected over all window sizes
   */
  public static Set<SurfacePattern> getContext(List<CoreLabel> sent, int i) {

    Set<SurfacePattern> prevpatterns = new HashSet<SurfacePattern>();
    Set<SurfacePattern> nextpatterns = new HashSet<SurfacePattern>();
    Set<SurfacePattern> prevnextpatterns = new HashSet<SurfacePattern>();
    CoreLabel token = sent.get(i);
    String tag = null;
    if (usePOS4Pattern) {
      // Only the first (at most) two characters of the POS tag are kept.
      // NOTE(review): dereferences token.tag() unconditionally; throws if it returns null.
      String fulltag = token.tag();
      tag = fulltag.substring(0, Math.min(fulltag.length(), 2));
    }
    String nerTag = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
    // One pass per window size; every pass starts from empty context lists.
    for (int maxWin = 1; maxWin <= maxWindow4Pattern; maxWin++) {
      List<Token> previousTokens = new ArrayList<Token>();
      List<String> originalPrev = new ArrayList<String>(), originalNext = new ArrayList<String>();
      List<Token> nextTokens = new ArrayList<Token>();

      int numStopWordsprev = 0, numStopWordsnext = 0;
      int numNonStopWordsNext = 0, numNonStopWordsPrev = 0;
      // Set to true once the corresponding side has produced a usable context.
      boolean useprev = false, usenext = false;

      // Target token without POS restriction (second constructor argument false).
      PatternToken twithoutPOS = null;
      if (addPatWithoutPOS) {
        twithoutPOS = new PatternToken(tag, false,
          numWordsCompound > 1, numWordsCompound,
          nerTag, useTargetNERRestriction, useTargetParserParentRestriction, token.get(CoreAnnotations.GrandparentAnnotation.class));
      }

      // Target token with POS restriction (second constructor argument true).
      PatternToken twithPOS = null;
      if (usePOS4Pattern) {
        twithPOS = new PatternToken(tag, true,
          numWordsCompound > 1, numWordsCompound,
          nerTag, useTargetNERRestriction, useTargetParserParentRestriction, token.get(CoreAnnotations.GrandparentAnnotation.class));
      }

      // ---- Collect the left context, walking leftwards from i - 1 ----
      if (usePreviousContext) {
        int j = i - 1;
        int numTokens = 0;
        while (numTokens < maxWin && j >= 0) {
          CoreLabel tokenj = sent.get(j);

          String tokenjStr;
          if (useLemmaContextTokens)
            tokenjStr = tokenj.lemma();
          else
            tokenjStr = tokenj.word();

          // Filler words are skipped entirely: numTokens is not incremented,
          // so they do not count towards the window size.
          if (useFillerWordsInPat
            && fillerWords.contains(tokenj.word().toLowerCase())) {
            j--;
            continue;
          }

          Triple<Boolean, Token, String> tr = getContextTokenStr(tokenj);
          boolean isLabeledO = tr.first;
          Token strgeneric = tr.second;
          String strOriginal = tr.third;

          if (!isLabeledO) {
            // Token carries a non-background label: use its generalized form.
            // Inserted at the front so the list stays in sentence order.
            previousTokens.add(0, strgeneric);
            originalPrev.add(0, strOriginal);
            numNonStopWordsPrev++;
          } else if (tokenj.word().startsWith("http")) {
            // A word starting with "http" discards the whole left context for this window.
            useprev = false;
            previousTokens.clear();
            originalPrev.clear();
            break;
          } else {
            Token str = SurfacePattern.getContextToken(tokenj);
            previousTokens.add(0, str);
            originalPrev.add(0, tokenjStr);
            if (doNotUse(tokenjStr, ConstantsAndVariables.getStopWords())) {
              numStopWordsprev++;
            } else
              numNonStopWordsPrev++;
          }
          numTokens++;
          j--;
        }
      }

      // ---- Collect the right context, walking rightwards from i + 1 ----
      if (useNextContext) {
        int numTokens = 0;
        int j = i + 1;
        while (numTokens < maxWin && j < sent.size()) {
          CoreLabel tokenj = sent.get(j);

          String tokenjStr;
          if (useLemmaContextTokens)
            tokenjStr = tokenj.lemma();
          else
            tokenjStr = tokenj.word();

          // Filler words are skipped without counting towards the window size.
          if (useFillerWordsInPat
            && fillerWords.contains(tokenj.word().toLowerCase())) {
            j++;
            continue;
          }

          Triple<Boolean, Token, String> tr = getContextTokenStr(tokenj);
          boolean isLabeledO = tr.first;
          Token strgeneric = tr.second;
          String strOriginal = tr.third;

          if (!isLabeledO) {
            // Token carries a non-background label: use its generalized form.
            numNonStopWordsNext++;
            nextTokens.add(strgeneric);
            originalNext.add(strOriginal);
          } else if (tokenj.word().startsWith("http")) {
            // A word starting with "http" discards the whole right context for this window.
            usenext = false;
            nextTokens.clear();
            originalNext.clear();
            break;
          } else {
            Token str = SurfacePattern.getContextToken(tokenj);
            nextTokens.add(str);
            originalNext.add(tokenjStr);
            if (doNotUse(tokenjStr, ConstantsAndVariables.getStopWords())) {
              numStopWordsnext++;
            } else
              numNonStopWordsNext++;
          }
          j++;
          numTokens++;
        }
      }

      // ---- Build PREV patterns ----
      Token[] prevContext = null;
      // The left context is used only if it is long enough and is not made up
      // solely of a small number of stop words.
      // NOTE(review): the stop-word test is a strict '>', whereas the option's
      // documentation speaks of "equal or more"; confirm which is intended.
      if (previousTokens.size() >= minWindow4Pattern
        && (numNonStopWordsPrev > 0 || numStopWordsprev > numMinStopWordsToAdd)) {

        List<Token> prevContextList = new ArrayList<Token>();
        List<String> prevOriginal = new ArrayList<String>();
        // Each context token is followed by the filler token (when fw is non-empty).
        for (Token p : previousTokens) {
          prevContextList.add(p);
          if (!fw.isEmpty())
            prevContextList.add(fw);
        }

        // Mirror the fw and sw placeholders in the string form of the context.
        for (String p : originalPrev) {
          prevOriginal.add(p);
          if (!fw.isEmpty())
            prevOriginal.add(" FW ");
        }

        // The stop-word token goes last, i.e. right before the target.
        if (!sw.isEmpty()) {
          prevContextList.add(sw);
          prevOriginal.add(" SW ");
        }

        // Left contexts containing non-ASCII text are dropped.
        if (isASCII(StringUtils.join(prevOriginal))) {
          prevContext = prevContextList.toArray(new Token[0]);
          // This size check repeats the one in the enclosing condition.
          if (previousTokens.size() >= minWindow4Pattern) {
            if (twithoutPOS != null) {
              SurfacePattern pat = new SurfacePattern(prevContext, twithoutPOS,
                null, Genre.PREV);
              prevpatterns.add(pat);
            }
            if (twithPOS != null) {
              SurfacePattern patPOS = new SurfacePattern(prevContext, twithPOS,
                null, Genre.PREV);
              prevpatterns.add(patPOS);
            }
          }
          useprev = true;
        }
      }

      // ---- Build NEXT patterns ----
      Token[] nextContext = null;
      // Unlike the left side, any non-empty right context enters this branch;
      // the minWindow4Pattern check is applied further down.
      if (nextTokens.size() > 0
        && (numNonStopWordsNext > 0 || numStopWordsnext > numMinStopWordsToAdd)) {
        List<Token> nextContextList = new ArrayList<Token>();

        // NOTE(review): nextOriginal is filled below but never read -- the left
        // side runs isASCII on its counterpart, the right side has no such
        // check. Confirm whether an ASCII check is missing here.
        List<String> nextOriginal = new ArrayList<String>();

        // The stop-word token goes first, i.e. right after the target.
        if (!sw.isEmpty()) {
          nextContextList.add(sw);
          nextOriginal.add(" SW ");
        }

        // Each context token is preceded by the filler token (when fw is non-empty).
        for (Token n : nextTokens) {
          if (!fw.isEmpty())
            nextContextList.add(fw);
          nextContextList.add(n);
        }

        for (String n : originalNext) {
          if (!fw.isEmpty())
            nextOriginal.add(" FW ");
          nextOriginal.add(n);
        }

        if (nextTokens.size() >= minWindow4Pattern) {
          nextContext = nextContextList.toArray(new Token[0]);
          if (twithoutPOS != null) {
            SurfacePattern pat = new SurfacePattern(null, twithoutPOS,
              nextContext, Genre.NEXT);
            nextpatterns.add(pat);
          }
          if (twithPOS != null) {
            SurfacePattern patPOS = new SurfacePattern(null, twithPOS,
              nextContext, Genre.NEXT);
            nextpatterns.add(patPOS);
          }

        }
        // NOTE(review): usenext is set even when nextContext was left null
        // above (right context shorter than minWindow4Pattern), so the
        // PREVNEXT patterns below can be built with a null next context.
        // Confirm that this is intended.
        usenext = true;

      }

      // ---- Build PREVNEXT patterns ----
      if (useprev && usenext) {
        // useprev is only set when the left context alone already has
        // minWindow4Pattern tokens, so this combined-size check then holds.
        if (previousTokens.size() + nextTokens.size() >= minWindow4Pattern) {

          if (twithoutPOS != null) {
            SurfacePattern pat = new SurfacePattern(prevContext, twithoutPOS,
              nextContext, Genre.PREVNEXT);
            prevnextpatterns.add(pat);
          }

          if (twithPOS != null) {
            SurfacePattern patPOS = new SurfacePattern(prevContext, twithPOS,
              nextContext, Genre.PREVNEXT);
            prevnextpatterns.add(patPOS);
          }
        }

      }
    }

    return CollectionUtils.unionAsSet(prevpatterns, nextpatterns, prevnextpatterns);
  }






  static Triple<Boolean, Token, String> getContextTokenStr(CoreLabel tokenj) {
    Token strgeneric = new Token();
    String strOriginal = "";
    boolean isLabeledO = true;
//    for (Entry<String, Class<? extends TypesafeMap.Key<String>>> e : getAnswerClass().entrySet()) {
//      if (!tokenj.get(e.getValue()).equals(backgroundSymbol)) {
//        isLabeledO = false;
//        if (strOriginal.isEmpty()) {
//          strOriginal = e.getKey();
//        } else {
//          strOriginal += "|" + e.getKey();
//        }
//        strgeneric.addRestriction(e.getKey(), e.getKey());
//      }
//    }


    for (Map.Entry<String, Class> e : ConstantsAndVariables.getGeneralizeClasses().entrySet()) {
      if (!tokenj.get(e.getValue()).equals(ConstantsAndVariables.backgroundSymbol)) {
        isLabeledO = false;
        if (strOriginal.isEmpty()) {


          strOriginal = e.getKey();
        } else {


          strOriginal += "|" + e.getKey();
        }
        strgeneric.addORRestriction(e.getValue(), e.getKey());
      }
    }


    if (useContextNERRestriction) {
      String nerTag = tokenj
        .get(CoreAnnotations.NamedEntityTagAnnotation.class);
      if (nerTag != null
        && !nerTag.equals(SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL)) {
        isLabeledO = false;
        if (strOriginal.isEmpty()) {


          strOriginal = nerTag;
        } else {


          strOriginal += "|" + nerTag;
        }
        strgeneric.addORRestriction(CoreAnnotations.NamedEntityTagAnnotation.class, nerTag);
      }
    }


    return new Triple<Boolean, Token, String>(isLabeledO, strgeneric,
      strOriginal);
  }


  public static boolean isASCII(String text) {


    Charset charset = Charset.forName("US-ASCII");
    String checked = new String(text.getBytes(charset), charset);
    return checked.equals(text);// && !text.contains("+") &&
    // !text.contains("*");// && !
    // text.contains("$") && !text.contains("\"");


  }
}
// Source Code of edu.stanford.nlp.patterns.surface.SurfacePatternFactory

// Related Classes of edu.stanford.nlp.patterns.surface.SurfacePatternFactory