Package edu.stanford.nlp.ling.tokensregex

Examples of edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher


        }
      }
    }
    // TODO: Should we allow "," in written out numbers?
    // TODO: Handle "-" that is not with token?
    TokenSequenceMatcher matcher = numberPattern.getMatcher(tokens);
    List<CoreMap> numbers = new ArrayList<CoreMap>();
    while (matcher.find()) {
      @SuppressWarnings("unused")
      List<CoreMap> matchedTokens = matcher.groupNodes();
      int numStart = matcher.start();
      int possibleNumEnd = -1;
      int lastUnitPos = -1;
      int possibleNumStart = -1;
      Number possibleNumEndUnit = null;
      Number lastUnit = null;
      // Check if we need to split matched chunk up more
      for (int i = matcher.start(); i < matcher.end(); i++) {
        CoreLabel token = tokens.get(i);
        CoreLabel prev = (i > matcher.start())? tokens.get(i - 1): null;
        Number num = token.get(CoreAnnotations.NumericValueAnnotation.class);
        Number prevNum = (prev != null)? prev.get(CoreAnnotations.NumericValueAnnotation.class):null;
        String w = token.word();
        w = w.trim().toLowerCase();
        switch (w) {
          case ",":
            if (lastUnit != null && lastUnitPos == i - 1) {
              // OKAY, this may be one big number
              possibleNumEnd = i;
              possibleNumEndUnit = lastUnit;
            } else {
              // Not one big number
              if (numStart < i) {
                numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i));
                numStart = i + 1;
                possibleNumEnd = -1;
                possibleNumEndUnit = null;
                lastUnit = null;
                lastUnitPos = -1;
              }
            }
            if (numStart == i) {
              numStart = i + 1;
            }
            break;
          case "and":
            // Check if number before and was unit
            String prevWord = prev.word();
            if (lastUnitPos == i - 1 || (lastUnitPos == i - 2 && ",".equals(prevWord))) {
              // Okay
            } else {
              // Two separate numbers
              if (numStart < possibleNumEnd) {
                numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, possibleNumEnd));
                if (possibleNumStart >= possibleNumEnd) {
                  numStart = possibleNumStart;
                } else {
                  numStart = i + 1;
                }
              } else if (numStart < i) {
                numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i));
                numStart = i + 1;
              }
              if (lastUnitPos < numStart) {
                lastUnit = null;
                lastUnitPos = -1;
              }
              possibleNumEnd = -1;
              possibleNumEndUnit = null;
            }
            break;
          default:
            // NUMBER or ORDINAL
            String numType = token.get(CoreAnnotations.NumericTypeAnnotation.class);
            if ("UNIT".equals(numType)) {
              // Compare this unit with previous
              if (lastUnit == null || lastUnit.longValue() > num.longValue()) {
                // lastUnit larger than this unit
                // maybe four thousand two hundred?
                // OKAY, probably one big number
              } else {
                if (numStart < possibleNumEnd) {
                  // Units are increasing - check if this unit is >= unit before "," (if so, need to split into chunks)
                  // Not one big number  ( had a comma )
                  if (num.longValue() >= possibleNumEndUnit.longValue()) {
                    numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, possibleNumEnd));
                    if (possibleNumStart >= possibleNumEnd) {
                      numStart = possibleNumStart;
                    } else {
                      numStart = i;
                    }
                    possibleNumEnd = -1;
                    possibleNumEndUnit = null;
                  }
                } else {
                  // unit is increasing - can be okay, maybe five hundred thousand?
                  // what about four hundred five thousand
                  // unit might also be the same, as in thousand thousand,
                  // which we convert to million
                }
              }
              lastUnit = num;
              lastUnitPos = i;
            } else {
              // Normal number
              if (num == null) {
                logger.warning("NO NUMBER: " + token.word());
                continue;
              }
              if (prevNum != null) {
                if (num.doubleValue() > 0) {
                  if (num.doubleValue() < 10) {
                    // This number is a digit
                    // Treat following as two separate numbers
                    //    \d+ [0-9]
                    //    [one to nine]  [0-9]
                    if (NumberNormalizer.numPattern.matcher(prev.word()).matches() ||
                        prevNum.longValue() < 10 || prevNum.longValue() % 10 != 0) {
                      // two separate numbers
                      if (numStart < i) {
                        numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i));
                      }
                      numStart = i;
                      possibleNumEnd = -1;
                      possibleNumEndUnit = null;
                      lastUnit = null;
                      lastUnitPos = -1;
                    }
                  } else {
                    String prevNumType = prev.get(CoreAnnotations.NumericTypeAnnotation.class);
                    if ("UNIT".equals(prevNumType)) {
                      // OKAY
                    } else if (!ordinalUnitPattern.matcher(w).matches()) {
                      // Start of new number
                      if (numStart < i) {
                        numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i));
                      }
                      numStart = i;
                      possibleNumEnd = -1;
                      possibleNumEndUnit = null;
                      lastUnit = null;
                      lastUnitPos = -1;
                    }
                  }
                }
              }
              if ("ORDINAL".equals(numType)) {
                if (possibleNumEnd >= 0) {
                  if (numStart < possibleNumEnd) {
                    numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, possibleNumEnd));
                  }
                  if (possibleNumStart > possibleNumEnd) {
                    numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, possibleNumStart, i + 1));
                  } else {
                    numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, possibleNumEnd + 1, i + 1));
                  }
                } else {
                  if (numStart < i + 1) {
                    numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i + 1));
                  }
                }
                numStart = i + 1;
                possibleNumEnd = -1;
                possibleNumEndUnit = null;
                lastUnit = null;
                lastUnitPos = -1;
              }
              if (possibleNumStart < possibleNumEnd) {
                possibleNumStart = i;
              }
            }
            break;
        }
      }
      if (numStart < matcher.end()) {
        numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, matcher.end()));
      }
    }
    for (CoreMap n:numbers) {
      String exp = n.get(CoreAnnotations.TextAnnotation.class);
      if (exp.trim().equals("")) { continue; }
View Full Code Here


          logger.warning("Error interpreting number range " + w + ": " + ex.getMessage());
        }
      }
    }
    List<CoreMap> numberRanges = new ArrayList<CoreMap>();
    TokenSequenceMatcher matcher = rangePattern.getMatcher(numerizedTokens);
    while (matcher.find()) {
      List<CoreMap> matched = matcher.groupNodes();
      if (matched.size() == 1) {
        numberRanges.add(matched.get(0));
      } else {
        Number v1 = matched.get(0).get(CoreAnnotations.NumericCompositeValueAnnotation.class);
        Number v2 = matched.get(matched.size()-1).get(CoreAnnotations.NumericCompositeValueAnnotation.class);
        if (v2.doubleValue() > v1.doubleValue()) {
          CoreMap newChunk = ChunkAnnotationUtils.getMergedChunk(numerizedTokens,  matcher.start(), matcher.end(),
                  CoreMapAttributeAggregator.getDefaultAggregators());
          newChunk.set(CoreAnnotations.NumericCompositeTypeAnnotation.class, "NUMBER_RANGE");
          Pair<Number,Number> range = new Pair<Number,Number>(v1,v2);
          newChunk.set(CoreAnnotations.NumericCompositeObjectAnnotation.class, range);
          numberRanges.add(newChunk);
View Full Code Here

      for (Entry<TokenSequencePattern, E> pEn : patterns.entrySet()) {

        if (pEn.getKey() == null)
          throw new RuntimeException("why is the pattern " + pEn + " null?");

        TokenSequenceMatcher m = pEn.getKey().getMatcher(sent);

//        //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
//        m.setFindType(SequenceMatcher.FindType.FIND_ALL);

        //A higher branch limit makes matching faster but uses more memory
        m.setBranchLimit(5);

        while (m.find()) {

          int s = m.start("$term");
          int e = m.end("$term");

          String phrase = "";
          String phraseLemma = "";
          boolean useWordNotLabeled = false;
          boolean doNotUse = false;
View Full Code Here

  public VerbSemClassMatch match(String sentence) {

    Annotation annotation = new Annotation(sentence.toLowerCase());
    pipeline.annotate(annotation);
    TokenSequenceMatcher m1 = p1.getMatcher(annotation.get(CoreAnnotations.TokensAnnotation.class));
    if(m1.find())
      return new VerbSemClassMatch(extractMatch(m1.groupNodes(2)), extractMatch(m1.groupNodes(1)), false);
    TokenSequenceMatcher m2 = p2.getMatcher(annotation.get(CoreAnnotations.TokensAnnotation.class));
    if(m2.find())
      return new VerbSemClassMatch(extractMatch(m2.groupNodes(2)), extractMatch(m2.groupNodes(1)), false);
    TokenSequenceMatcher m3 = p3.getMatcher(annotation.get(CoreAnnotations.TokensAnnotation.class));
    if(m3.find())
      return new VerbSemClassMatch(extractMatch(m3.groupNodes(2)), extractMatch(m3.groupNodes(1)), true);
    TokenSequenceMatcher m4 = p3.getMatcher(annotation.get(CoreAnnotations.TokensAnnotation.class));
    if(m4.find())
      return new VerbSemClassMatch(extractMatch(m4.groupNodes(2)), extractMatch(m4.groupNodes(1)), true);
    return null;
  }
View Full Code Here

TOP

Related Classes of edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.