Package com.caucho.quercus.lib.regexp

Source Code of com.caucho.quercus.lib.regexp.Regcomp

/*
* Copyright (c) 1998-2008 Caucho Technology -- all rights reserved
*
* This file is part of Resin(R) Open Source
*
* Each copy or derived work must preserve the copyright notice and this
* notice unmodified.
*
* Resin Open Source is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* Resin Open Source is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty
* of NON-INFRINGEMENT.  See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with Resin Open Source; if not, write to the
*
*   Free Software Foundation, Inc.
*   59 Temple Place, Suite 330
*   Boston, MA 02111-1307  USA
*
* @author Scott Ferguson
*/

/*
* XXX: anchored expressions should have flags for quick matching.
*/

package com.caucho.quercus.lib.regexp;

import java.util.*;
import java.util.concurrent.*;
import java.util.logging.*;

import com.caucho.quercus.env.ConstStringValue;
import com.caucho.quercus.env.StringValue;
import com.caucho.quercus.env.StringBuilderValue;
import com.caucho.util.*;

/**
* Regular expression compilation.
*/
class Regcomp {
  private static final Logger log
    = Logger.getLogger(Regcomp.class.getName());
  // NOTE(review): the localization bundle is created for RegexpNode.class,
  // not Regcomp.class -- presumably so both classes share messages; confirm.
  private static final L10N L = new L10N(RegexpNode.class);

  // #2526, JIT issues with Integer.MAX_VALUE
  // Used as the "no upper bound" maximum for the '*', '+' and '{n,}' loops.
  private static final int INTEGER_MAX = Integer.MAX_VALUE - 1;
 
  // Compilation flags, combined as a bit mask in _flags.

  // '^' and '$' compile to the *_OR_NEWLINE anchors
  static final int MULTILINE = 0x1;
  // '.' compiles to ANY_CHAR instead of DOT
  static final int SINGLE_LINE = 0x2;
  // character-set ranges also get their other-case counterpart (see setRange)
  static final int IGNORE_CASE = 0x4;
  // unescaped whitespace and '#' comments in the pattern are skipped
  static final int IGNORE_WS = 0x8;
  // set by the inline (?g) option; not read in the code visible in this review
  static final int GLOBAL = 0x10;

  // parse() starts the program with ANCHOR_BEGIN_RELATIVE
  static final int ANCHORED = 0x20;
  // '$' compiles to ANCHOR_END_ONLY (when MULTILINE is not set)
  static final int END_ONLY = 0x40;
  // inverts the default greediness of loops (see createLoop)
  static final int UNGREEDY = 0x80;
  // unrecognized backslash escapes are reported as errors
  static final int STRICT = 0x100;
  // turned on by getUnicodeSet() when a named unicode block is used
  static final int UTF8 = 0x200;
 
  // NOTE(review): not referenced in the part of the class visible here;
  // parseCharacterClass() uses RegexpSet.CLASS_MAP instead.
  static final HashMap<String,Integer> _characterClassMap
    = new HashMap<String,Integer>();

  // Cache of unicode block name -> character set, filled by getUnicodeSet().
  static final ConcurrentHashMap<String,RegexpSet> _unicodeBlockMap
    = new ConcurrentHashMap<String,RegexpSet>();
 
  // Index that the next capturing group will receive; parse() resets it to 1.
  int _nGroup;
  // Counter handed out by nextLoopIndex().
  int _nLoop;
  // Raised to _nGroup at the end of parse() if it was smaller.
  int _maxGroup;
  // Active flag bit mask; inline (?...) options modify it while parsing.
  int _flags;

  // Group index -> name for groups declared with (?P<name>...).
  HashMap<Integer,StringValue> _groupNameMap
    = new HashMap<Integer,StringValue>();

  // Group name -> index, used to resolve (?P=name) references.
  HashMap<StringValue,Integer> _groupNameReverseMap
    = new HashMap<StringValue,Integer>();

  // Nodes created for (?R); parse() assigns their top node when it finishes.
  ArrayList<RegexpNode.Recursive> _recursiveList
    = new ArrayList<RegexpNode.Recursive>();

  // Tail node of the group currently being parsed; parseRec() appends it
  // when it reaches '|' or ')'.  Null when no group tail applies.
  RegexpNode _groupTail;
 
  // NOTE(review): neither flag is referenced in the code visible here.
  boolean _isLookbehind;
  boolean _isOr;
 
  Regcomp(int flags)
  {
    _flags = flags;
  }

  boolean isGreedy()
  {
    return (_flags & UNGREEDY) != UNGREEDY;
  }

  boolean isIgnoreCase()
  {
    return (_flags & IGNORE_CASE) == IGNORE_CASE;
  }

  boolean isIgnoreWs()
  {
    return (_flags & IGNORE_WS) == IGNORE_WS;
  }

  boolean isMultiline()
  {
    return (_flags & MULTILINE) == MULTILINE;
  }

  boolean isDollarEndOnly()
  {
    return (_flags & END_ONLY) == END_ONLY;
  }

  int nextLoopIndex()
  {
    return _nLoop++;
  }

  RegexpNode parse(PeekStream pattern) throws IllegalRegexpException
  {
    _nGroup = 1;

    RegexpNode begin = null;

    if ((_flags & ANCHORED) != 0)
      begin = RegexpNode.ANCHOR_BEGIN_RELATIVE;
   
    RegexpNode value = parseRec(pattern, begin);

    while (pattern.read() == '|') {
      value = RegexpNode.Or.create(value, parseRec(pattern, begin));
    }
   
    value = value != null ? value.getHead() : RegexpNode.N_END;

    if (_maxGroup < _nGroup)
      _maxGroup = _nGroup;

    for (RegexpNode.Recursive rec : _recursiveList) {
      RegexpNode top = value;

      if (top instanceof RegexpNode.Concat) {
        RegexpNode.Concat topConcat = (RegexpNode.Concat) top;

        if (topConcat.getConcatHead() instanceof RegexpNode.AnchorBegin
            || topConcat.getConcatHead() instanceof RegexpNode.AnchorBeginRelative) {
          top = topConcat.getConcatNext();
        }
      }
     
      rec.setTop(top);
    }

    if (log.isLoggable(Level.FINEST))
      log.finest("regexp[] " + value);

    return value;
  }
 
  /**
   * Recursively compiles the pattern from the current stream position.
   *
   * Each call reads one construct (quantifier, group, set, escape, anchor
   * or literal run), builds the node for it and recurses to compile the
   * rest, linking the pieces with {@link #concat}.
   *
   * Parsing of a sequence stops at end of input, at '|' and at ')'.  The
   * '|' and ')' characters are pushed back so the caller can read them; if
   * a group is being parsed its tail node (_groupTail) is appended first.
   *
   * @param pattern the pattern characters
   * @param tail the node compiled so far for this sequence, or null when
   *   the sequence is still empty
   * @return the compiled sequence, or null at end of input when tail is null
   * @throws IllegalRegexpException if the pattern is malformed
   */
  private RegexpNode parseRec(PeekStream pattern, RegexpNode tail)
    throws IllegalRegexpException
  {
    int ch = pattern.read();
    RegexpNode next;
    RegexpNode groupTail;

    switch (ch) {
    case -1:
      // end of pattern
      return tail != null ? tail.getHead() : null;

    // Quantifiers wrap the preceding node in a loop and continue parsing
    // after the loop's tail.
    case '?':
      if (tail == null)
        throw error(L.l("'?' requires a preceeding regexp"));

      tail = createLoop(pattern, tail, 0, 1);
     
      return parseRec(pattern, tail.getTail());

    case '*':
      if (tail == null)
        throw error(L.l("'*' requires a preceeding regexp"));

      tail = createLoop(pattern, tail, 0, INTEGER_MAX);
     
      return parseRec(pattern, tail.getTail());

    case '+':
      if (tail == null)
        throw error(L.l("'+' requires a preceeding regexp"));

      tail = createLoop(pattern, tail, 1, INTEGER_MAX);
     
      return parseRec(pattern, tail.getTail());

    case '{':
      // '{' is only a repetition when something precedes it and a digit
      // follows; otherwise it is an ordinary character
      if (tail == null || ! ('0' <= pattern.peek() && pattern.peek() <= '9')) {
        next = parseString('{', pattern);
     
        return concat(tail, parseRec(pattern, next));
      }

      return parseRec(pattern, parseBrace(pattern, tail).getTail());

    case '.':
      if ((_flags & SINGLE_LINE) == 0)
        next = RegexpNode.DOT;
      else
        next = RegexpNode.ANY_CHAR;
       
      return concat(tail, parseRec(pattern, next));

    case '|':
      // end of this alternative; leave the '|' for the caller
      pattern.ungetc(ch);

      if (_groupTail != null)
        return concat(tail, _groupTail);
      else
        // NOTE(review): tail is null when an alternative is empty outside
        // of a group (e.g. the pattern starts with '|'), which would raise
        // a NullPointerException here -- confirm intended handling.
        return tail.getHead();

    case '(':
      {
        switch (pattern.peek()) {
        case '?':
          pattern.read();

          switch (pattern.peek()) {
          case ':':
            // (?:...) non-capturing group, compiled as group 0
            pattern.read();
            return parseGroup(pattern, tail, 0, _flags);
           
          case '#':
            // (?#...) comment, contributes nothing
            parseCommentGroup(pattern);
           
            return parseRec(pattern, tail);
           
          case '(':
            // (?(n)yes|no) conditional
            return parseConditional(pattern, tail);
           
          case '=':
          case '!':
            // (?=...) / (?!...) lookahead
            ch = pattern.read();

            boolean isPositive = (ch == '=');

            // the lookahead body is parsed without a group tail
            groupTail = _groupTail;
            _groupTail = null;

            next = parseRec(pattern, null);
           
            while ((ch = pattern.read()) == '|') {
              RegexpNode nextHead = parseRec(pattern, null);
              next = next.createOr(nextHead);
            }

            if (isPositive)
              next = new RegexpNode.Lookahead(next);
            else
              next = new RegexpNode.NotLookahead(next);

            if (ch != ')')
              throw error(L.l("expected ')' at '{0}'",
                              String.valueOf((char) ch)));

            _groupTail = groupTail;

            return concat(tail, parseRec(pattern, next));
           
          case '<':
            // (?<=...) / (?<!...) lookbehind
            pattern.read();

            switch (pattern.read()) {
            case '=':
              isPositive = true;
              break;
            case '!':
              isPositive = false;
              break;
            default:
              throw error(L.l("expected '=' or '!'"));
            }

            groupTail = _groupTail;
            _groupTail = null;

            next = parseRec(pattern, null);

            // each alternative is wrapped in its own lookbehind node;
            // a null alternative is left unwrapped
            if (next == null) {
            }
            else if (isPositive)
              next = new RegexpNode.Lookbehind(next);
            else
              next = new RegexpNode.NotLookbehind(next);
           
            while ((ch = pattern.read()) == '|') {
              RegexpNode second = parseRec(pattern, null);

              if (second == null) {
              }
              else if (isPositive)
                second = new RegexpNode.Lookbehind(second);
              else
                second = new RegexpNode.NotLookbehind(second);

              if (second != null)
                next = next.createOr(second);
            }

            if (ch != ')')
              throw error(L.l("expected ')' at '{0}'",
                              String.valueOf((char) ch)));

            _groupTail = groupTail;

            return concat(tail, parseRec(pattern, next));
         
          // XXX: once-only subpatterns (mostly an optimization feature)
          case '>':
            // currently compiled exactly like a non-capturing group
            pattern.read();
            return parseGroup(pattern, tail, 0, _flags);

          case 'P':
            // (?P<name>...) or (?P=name)
            pattern.read();
            return parseNamedGroup(pattern, tail);

          case 'R':
            // (?R) recursion; the target is filled in at the end of parse()
            pattern.read();
            RegexpNode.Recursive rec = new RegexpNode.Recursive();
            _recursiveList.add(rec);
            ch = pattern.read();
            if (ch != ')')
              throw error(L.l("expected ')' at '{0}'",
                              String.valueOf((char) ch)));
           
            return concat(tail, parseRec(pattern, rec));

          case 'm': case 's': case 'i': case 'x': case 'g':
          case 'U': case 'X':
            {
              // inline options: (?flags) or (?flags:...)
              int flags = _flags;
             
              while ((ch = pattern.read()) > 0 && ch != ')') {
                switch (ch) {
                case 'm': _flags |= MULTILINE; break;
                case 's': _flags |= SINGLE_LINE; break;
                case 'i': _flags |= IGNORE_CASE; break;
                case 'x': _flags |= IGNORE_WS; break;
                case 'g': _flags |= GLOBAL; break;
                case 'U': _flags |= UNGREEDY; break;
                case 'X': _flags |= STRICT; break;
                case ':':
                  {
                    // (?flags:...) -- parseGroup restores the saved flags
                    // once the group is closed
                    return parseGroup(pattern, tail, 0, flags);
                  }
                default:
                  throw error(L.l("'{0}' is an unknown (? code", String.valueOf((char) ch)));
                }
              }

              if (ch != ')')
                throw error(L.l("expected ')' at '{0}'",
                                String.valueOf((char) ch)));
             
              // (?flags) -- the new flags apply to the rest of the current
              // sequence and are restored after it has been parsed
              RegexpNode node = parseRec(pattern, tail);

              _flags = flags;

              return node;
            }
           
          default:
            throw error(L.l("'{0}' is an unknown (? code", String.valueOf((char) pattern.peek())));
          }
         
        default:
          // plain capturing group
          return parseGroup(pattern, tail, _nGroup++, _flags);
        }
      }

    case ')':
      // end of the enclosing group; leave the ')' for the caller
      pattern.ungetc(ch);

      if (_groupTail != null)
        return concat(tail, _groupTail);
      else
        return tail;

    case '[':
      next = parseSet(pattern);

      return concat(tail, parseRec(pattern, next));
     
    case '\\':
      next = parseSlash(pattern);
     
      return concat(tail, parseRec(pattern, next));
     
    case '^':
      if (isMultiline())
        next = RegexpNode.ANCHOR_BEGIN_OR_NEWLINE;
      else
        next = RegexpNode.ANCHOR_BEGIN;
     
      return concat(tail, parseRec(pattern, next));
     
    case '$':
      if (isMultiline())
        next = RegexpNode.ANCHOR_END_OR_NEWLINE;
      else if (isDollarEndOnly())
        next = RegexpNode.ANCHOR_END_ONLY;
      else
        next = RegexpNode.ANCHOR_END;
     
      return concat(tail, parseRec(pattern, next));

    case ' ': case '\n': case '\t': case '\r':
      if (isIgnoreWs()) {
        // IGNORE_WS: drop the whole run of whitespace
        while (Character.isWhitespace((char) pattern.peek()))
          pattern.read();

        return parseRec(pattern, tail);
      }
      else {
        next = parseString(ch, pattern);
       
        return concat(tail, parseRec(pattern, next));
      }

    case '#':
      if (isIgnoreWs()) {
        // IGNORE_WS: '#' starts a comment that runs to the end of the line
        while ((ch = pattern.read()) > 0 && ch != '\n') {
        }

        return parseRec(pattern, tail);
      }
      else {
        next = parseString(ch, pattern);
       
        return concat(tail, parseRec(pattern, next));
      }
     
    default:
      // ordinary character: start of a literal string
      next = parseString(ch, pattern);
     
      return concat(tail, parseRec(pattern, next));
    }
  }

  private void parseCommentGroup(PeekStream pattern)
  {
    int ch;
   
    // (?#...) Comment
    while ((ch = pattern.read()) >= 0 && ch != ')') {
    }
  }
 
  private RegexpNode parseNamedGroup(PeekStream pattern, RegexpNode tail)
    throws IllegalRegexpException
  {
    int ch = pattern.read();

    if (ch == '=') {
      StringBuilder sb = new StringBuilder();

      while ((ch = pattern.read()) != ')' && ch >= 0) {
        sb.append((char) ch);
      }

      if (ch != ')')
        throw error(L.l("expected ')'"));

      String name = sb.toString();
     
      Integer v = _groupNameReverseMap.get(new ConstStringValue(name));

      if (v != null) {
        RegexpNode next = new RegexpNode.GroupRef(v);
     
        return concat(tail, parseRec(pattern, next));
      }
      else
        throw error(L.l("'{0}' is an unknown regexp group", name));
    }
    else if (ch == '<') {
      StringBuilder sb = new StringBuilder();

      while ((ch = pattern.read()) != '>' && ch >= 0) {
        sb.append((char) ch);
      }

      if (ch != '>')
        throw error(L.l("expected '>'"));

      String name = sb.toString();

      int group = _nGroup++;

      _groupNameMap.put(group, new StringBuilderValue(name));
      _groupNameReverseMap.put(new StringBuilderValue(name), group);

      return parseGroup(pattern, tail, group, _flags);
    }
    else
      throw error(L.l("Expected '(?:P=name' or '(?:P<name' for named group"));
  }

  private RegexpNode parseConditional(PeekStream pattern, RegexpNode tail)
    throws IllegalRegexpException
  {
    int ch = pattern.read();

    if (ch != '(')
      throw error(L.l("expected '('"));
   
    RegexpNode.ConditionalHead groupHead = null;;
    RegexpNode groupTail = null;

    if ('1' <= (ch = pattern.peek()) && ch <= '9') {
      int value = 0;

      while ('0' <= (ch = pattern.read()) && ch <= '9') {
        value = 10 * value + ch - '0';
      }

      if (ch != ')')
        throw error(L.l("expected ')'"));

      if (_nGroup <= value)
        throw error(L.l("conditional value less than number of groups"));

      groupHead = new RegexpNode.ConditionalHead(value);
      groupTail = groupHead.getTail();
    }
    else
      throw error(L.l("conditional requires number"));

    RegexpNode oldTail = _groupTail;

    _groupTail = groupTail;
       
    RegexpNode first = parseRec(pattern, null);
    RegexpNode second = null;

    if ((ch = pattern.read()) == '|') {
      second = parseRec(pattern, null);

      ch = pattern.read();
    }

    if (ch != ')')
      throw error(L.l("expected ')' at '{0}'", String.valueOf((char) ch)));

    _groupTail = oldTail;

    groupHead.setFirst(first);
    groupHead.setSecond(second);
       
    return concat(tail, parseRec(pattern, groupHead));
  }

  private RegexpNode parseGroup(PeekStream pattern, RegexpNode tail,
                                int group, int oldFlags)
    throws IllegalRegexpException
  {
    RegexpNode.GroupHead groupHead = new RegexpNode.GroupHead(group);
    RegexpNode groupTail = groupHead.getTail();

    RegexpNode oldTail = _groupTail;

    _groupTail = groupTail;
       
    RegexpNode body = parseRec(pattern, null);

    int ch;
    while ((ch = pattern.read()) == '|') {
      RegexpNode nextBody = parseRec(pattern, null);
      body = body.createOr(nextBody);
    }

    if (ch != ')')
      throw error(L.l("expected ')'"));

    _flags = oldFlags;

    _groupTail = oldTail;

    groupHead.setNode(body.getHead());
       
    return concat(tail, parseRec(pattern, groupTail).getHead());
  }

  private void expect(char expected, int value)
    throws IllegalRegexpException
  {
    if (expected != value)
      throw error(L.l("expected '{0}'", String.valueOf(expected)));
  }

  private IllegalRegexpException error(String msg)
  {
    return new IllegalRegexpException(msg);
  }

  /**
   *   Parse the repetition construct.
   *
   *   {n}    -- exactly n
   *   {n,}   -- at least n
   *   {n,m}  -- from n to m
   *   {,m}   -- at most m
   */
  private RegexpNode parseBrace(PeekStream pattern, RegexpNode node)
    throws IllegalRegexpException
  {
    int ch;
    int min = 0;
    int max = INTEGER_MAX;

    while ((ch = pattern.read()) >= '0' && ch <= '9') {
      min = 10 * min + ch - '0';
    }

    if (ch == ',') {
      while ('0' <= (ch = pattern.read()) && ch <= '9') {
        if (max == INTEGER_MAX)
          max = 0;
       
        max = 10 * max + ch - '0';
      }
    }
    else
      max = min;

    if (ch != '}')
      throw error(L.l("Expected '}'"));

    return createLoop(pattern, node, min, max);
  }

  private RegexpNode createLoop(PeekStream pattern, RegexpNode node,
                                int min, int max)
  {
    if (pattern.peek() == '+') {
      pattern.read();
     
      return node.createPossessiveLoop(min, max);
    }
    else if (pattern.peek() == '?') {
      pattern.read();

      if (isGreedy())
        return node.createLoopUngreedy(this, min, max);
      else
        return node.createLoop(this, min, max);
    }
    else {
      if (isGreedy())
        return node.createLoop(this, min, max);
      else
        return node.createLoopUngreedy(this, min, max);
    }
  }

  static RegexpNode concat(RegexpNode prev, RegexpNode next)
  {
    if (prev != null) {
      return prev.concat(next).getHead();
    }
    else
      return next;
  }

  private String hex(int value)
  {
    CharBuffer cb = new CharBuffer();

    for (int b = 3; b >= 0; b--) {
      int v = (value >> (4 * b)) & 0xf;
      if (v < 10)
        cb.append((char) (v + '0'));
      else
        cb.append((char) (v - 10 + 'a'));
    }

    return cb.toString();
  }

  private String badChar(int ch)
  {
    if (0x20 <= ch && ch <= 0x7f)
      return "'" + (char) ch + "'";
    else if ((ch & 0xffff) == 0xffff)
      return "end of expression";
    else
      return "'" + (char) ch + "' (\\u" + hex(ch) + ")";
  }

  /**
   *   Collect the characters in a set, e.g. [a-z@@^!"]
   *
   * The opening '[' has already been consumed.  A leading '^' negates the
   * set.  Single characters, ranges, backslash classes and [:name:]
   * classes are merged into one RegexpSet; unicode properties given as
   * \p with a one or two letter code are compiled to separate nodes and
   * OR-ed with the set node at the end.
   *
   * Variables:
   *
   *   last     -- Contains last read character.
   *   lastdash -- Contains character before dash or -1 if not after dash.
   *
   * @param pattern the pattern characters, positioned after the '['
   * @return the node matching the set
   * @throws IllegalRegexpException if the set is malformed or not closed
   */
  private RegexpNode parseSet(PeekStream pattern)
    throws IllegalRegexpException
  {
    int first = pattern.peek();
    boolean isNot = false;

    if (first == '^') {
      pattern.read();
      isNot = true;
    }
   
    RegexpSet set = new RegexpSet();

    int last = -1;
    int lastdash = -1;
    int ch;

    // number of characters read by the loop condition (the '^' is excluded)
    int charRead = 0;
   
    // nodes for unicode properties, allocated on first use
    ArrayList<RegexpNode> nodeList = null;
   
    while ((ch = pattern.read()) >= 0) {
      charRead++;

      // php/4e3o
      // first literal closing bracket need not be escaped
      if (ch == ']') {
        if (charRead == 1) {
          // push the ']' back and pretend a backslash was read, so the
          // escape handling below re-reads it as a literal character
          pattern.ungetc(ch);
          ch = '\\';
        }
        else
          break;
      }
     
      // isChar: ch holds a single literal character after this iteration's
      // decoding; isDash: the raw character read was an unescaped '-'
      boolean isChar = true;
      boolean isDash = ch == '-';

      if (ch == '\\') {
        isChar = false;

        switch ((ch = pattern.read())) {
        case 's':
          set.mergeOr(RegexpSet.SPACE);
          break;

        case 'S':
          set.mergeOrInv(RegexpSet.SPACE);
          break;

        case 'd':
          set.mergeOr(RegexpSet.DIGIT);
          break;

        case 'D':
          set.mergeOrInv(RegexpSet.DIGIT);
          break;

        case 'w':
          set.mergeOr(RegexpSet.WORD);
          break;

        case 'W':
          set.mergeOrInv(RegexpSet.WORD);
          break;

        case 'p':
          int ch2 = pattern.read();
         
          if (ch2 != '{') {
            // \pX: single letter property
            if (nodeList == null)
              nodeList = new ArrayList<RegexpNode>();
           
            nodeList.add(parseUnicodeProperty(ch2, false));
          }
          else {
            // \p{...}: one or two letters name a property, anything
            // longer is looked up as a unicode block name
            StringBuilder sb = new StringBuilder();
           
            int ch3;
           
            while ((ch3 = pattern.read()) >= 0 && ch3 != '}') {
              sb.append((char) ch3);
            }

            String name = sb.toString();

            if (ch3 != '}')
              throw new IllegalRegexpException(L.l("expected '}' at "
                               + badChar(ch3)));
           
            int len = name.length();

            if (len == 1) {
              if (nodeList == null)
                  nodeList = new ArrayList<RegexpNode>();
             
              nodeList.add(parseUnicodeProperty(name.charAt(0), false));
            }
            else if (len == 2) {
              if (nodeList == null)
                  nodeList = new ArrayList<RegexpNode>();
             
              nodeList.add(parseUnicodeProperty(name.charAt(0),
                                                name.charAt(1),
                                                false));
            }
            else {
              set.mergeOr(getUnicodeSet(name));
            }
          }
          break;

        // escapes that decode to a single literal character
        case 'b':
          ch = '\b';
          isChar = true;
          break;
        case 'n':
          ch = '\n';
          isChar = true;
          break;
        case 't':
          ch = '\t';
          isChar = true;
          break;
        case 'r':
          ch = '\r';
          isChar = true;
          break;
        case 'f':
          ch = '\f';
          isChar = true;
          break;

        case 'x':
          ch = parseHex(pattern);
          isChar = true;
          break;

        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
          ch = parseOctal(ch, pattern);
          isChar = true;
          break;

        default:
          // any other escaped character stands for itself
          isChar = true;
        }
      }
      else if (ch == '[') {
        // "[:" starts a POSIX class; a lone '[' stays a literal character
        if (pattern.peek() == ':') {
          isChar = false;
          pattern.read();
         
          set.mergeOr(parseCharacterClass(pattern));
        }
      }

      // Range handling: 'last' is a character not yet added to the set,
      // 'lastdash' is the start of a pending "c1-" range.
      if (isDash && last != -1 && lastdash == -1) {
        // '-' after a pending character: possible start of a range
        lastdash = last;
      }
      // c1-c2
      else if (isChar && lastdash != -1) {
        if (lastdash > ch)
          throw new IllegalRegexpException("expected increasing range at " +
                                           badChar(ch));

        setRange(set, lastdash, ch);

        last = -1;
        lastdash = -1;
      }
      else if (lastdash != -1) {
        // "c1-" followed by a class: both c1 and '-' are literals
        setRange(set, lastdash, lastdash);
        setRange(set, '-', '-');

        last = -1;
        lastdash = -1;
      }
      else if (last != -1) {
        // flush the pending character; the current one becomes pending
        // only if it is a literal
       
        setRange(set, last, last);

        if (isChar)
          last = ch;
      }
      else if (isChar)
        last = ch;
    }

    // Dash at end of set: [a-z1-]
    if (lastdash != -1) {
      setRange(set, lastdash, lastdash);
      setRange(set, '-', '-');
    }
    else if (last != -1) {
      setRange(set, last, last);
    }
   
    // the loop also ends on end of input, in which case ch is negative
    if (ch != ']')
      throw error(L.l("Expected ']'"));

    if (nodeList == null) {
      if (isNot)
        return set.createNotNode();
      else
        return set.createNode();
    }
    else {
      // combine the plain set with the collected property nodes
      RegexpNode setNode = set.createNode();

      for (RegexpNode node : nodeList) {
        setNode = setNode.createOr(node);
      }
       
      if (isNot)
        return setNode.createNot();
      else
        return setNode;
    }
  }

  private void setRange(RegexpSet set, int a, int b)
  {
    set.setRange(a, b);
   
    if (isIgnoreCase()) {
      if (Character.isLowerCase(a) && Character.isLowerCase(b)) {
        set.setRange(Character.toUpperCase(a), Character.toUpperCase(b));
      }
         
      if (Character.isUpperCase(a) && Character.isUpperCase(b)) {
        set.setRange(Character.toLowerCase(a), Character.toLowerCase(b));
      }
    }
  }

  private RegexpSet getUnicodeSet(String name)
    throws IllegalRegexpException
  {
    _flags |= UTF8;

    RegexpSet set = _unicodeBlockMap.get(name);

    if (set == null) {
      Character.UnicodeBlock block = Character.UnicodeBlock.forName(name);

      if (block == null)
        throw new IllegalRegexpException(L.l("'{0}' is an unknown unicode block",
                         name));

      set = new RegexpSet();

      for (int ch = 0; ch < 65536; ch++) {
        if (Character.UnicodeBlock.of(ch) == block) {
          set.setRange(ch, ch);
        }
      }

      _unicodeBlockMap.put(name, set);
    }

    return set;
  }

  /**
   * Returns a node for sequences starting with a backslash.
   */
  private RegexpNode parseSlash(PeekStream pattern)
    throws IllegalRegexpException
  {
    int ch;
    switch (ch = pattern.read()) {
    case 's':
      return RegexpNode.SPACE;

    case 'S':
      return RegexpNode.NOT_SPACE;

    case 'd':
      return RegexpNode.DIGIT;

    case 'D':
      return RegexpNode.NOT_DIGIT;

    case 'w':
      return RegexpNode.S_WORD;

    case 'W':
      return RegexpNode.NOT_S_WORD;

    case 'b':
      return RegexpNode.WORD;

    case 'B':
      return RegexpNode.NOT_WORD;

    case 'A':
      return RegexpNode.STRING_BEGIN;

    case 'z':
      return RegexpNode.STRING_END;
     
    case 'Z':
      return RegexpNode.STRING_NEWLINE;

    case 'G':
      return RegexpNode.STRING_FIRST;

    case 'a':
      return parseString('\u0007', pattern);
   
    case 'c':
      ch = pattern.read();
     
      ch = Character.toUpperCase(ch);
      ch ^= 0x40;

      return parseString(ch, pattern);

    case 'e':
      return parseString('\u001B', pattern, true);
    case 'n':
      return parseString('\n', pattern, true);
    case 'r':
      return parseString('\r', pattern, true);
    case 'f':
      return parseString('\f', pattern, true);
    case 't':
      return parseString('\t', pattern, true);

    case 'x':
      int hex = parseHex(pattern);
      return parseString(hex, pattern, true);
   
    case '0':
      int oct = parseOctal(ch, pattern);
      return parseString(oct, pattern, true);

    case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      return parseBackReference(ch, pattern);

    case 'p':
      return parseUnicodeProperty(pattern, false);
    case 'P':
      return parseUnicodeProperty(pattern, true);
     
    case 'Q':
      throw new UnsupportedOperationException();
      /*
      while ((ch = pattern.read()) >= 0) {
        if (ch == '\\' && pattern.peek() == 'E') {
          pattern.read();
          break;
        }

        last = parseString(ch, pattern);
      }

      return last;
      */
     
    case '#':
      return parseString('#', pattern, true);

    default:
      if ((_flags & STRICT) != 0)
        throw new IllegalRegexpException("unrecognized escape at " +
                                         badChar(ch));
      return parseString(ch, pattern);
    }
  }
 
  /**
   * Returns a node for sequences starting with a '[:'.
   */
  private RegexpSet parseCharacterClass(PeekStream pattern)
    throws IllegalRegexpException
  {
    StringBuilder sb = new StringBuilder();
   
    int ch;
    while ((ch = pattern.read()) != ':' && ch >= 0) {
      sb.append((char)ch);
    }
   
    if (ch != ':') {
      throw new IllegalRegexpException("expected character class closing colon ':' at " + badChar(ch));
   
    
    if ((ch = pattern.read()) != ']') {
      throw new IllegalRegexpException("expected character class closing bracket ']' at " + badChar(ch));
    }

    String name = sb.toString();
   
    RegexpSet set = RegexpSet.CLASS_MAP.get(name);
   
    if (set == null) {
      throw new IllegalRegexpException("unrecognized POSIX character class " +
                                       name);
    }
    return set;
  }

  private int parseHex(PeekStream pattern)
    throws IllegalRegexpException
  {
    int ch = pattern.read();
   
    int hex = 0;
   
    StringBuilder sb = new StringBuilder();
   
    if (ch == '{') {
      while ((ch = pattern.read()) != '}') {
        if (ch < 0)
          throw new IllegalRegexpException("no more input; expected '}'");
       
        sb.append((char)ch);
      }
    }
    else {
      if (ch < 0)
        throw new IllegalRegexpException("expected hex digit at " +
                badChar(ch));
     
      sb.append((char)ch);
      ch = pattern.read();
     
      if (ch < 0) {
        throw new IllegalRegexpException("expected hex digit at " +
                                         badChar(ch));
      }

      sb.append((char)ch);
    }
   
    int len = sb.length();
   
    for (int i = 0; i < len; i++) {
      ch = sb.charAt(i);

      if ('0' <= ch && ch <= '9')
        hex = hex * 16 + ch - '0';
      else if ('a' <= ch && ch <= 'f')
        hex = hex * 16 + ch - 'a' + 10;
      else if ('A' <= ch && ch <= 'F')
        hex = hex * 16 + ch - 'A' + 10;
      else
        throw new IllegalRegexpException("expected hex digit at " +
                                         badChar(ch));
    }
   
    return hex;
  }
 
  /**
   * Compiles a backslash followed by a digit 1-9, which is either a back
   * reference to a group or an octal character escape.
   *
   * @param ch the digit that followed the backslash (already consumed)
   * @param pattern the pattern characters, positioned after that digit
   * @return a group reference node, or a literal string node starting
   *   with the decoded character
   * @throws IllegalRegexpException if a two-digit number names no group
   *   and cannot be read as octal
   */
  private RegexpNode parseBackReference(int ch, PeekStream pattern)
    throws IllegalRegexpException
  {
    int value = ch - '0';
    int ch2 = pattern.peek();
   
    // take a second decimal digit if there is one
    if ('0' <= ch2 && ch2 <= '9') {
      pattern.read();
      value = value * 10 + ch2 - '0';
    }

    int ch3 = pattern.peek();
   
    // Single digits are always group references.  Two-digit values are
    // references only if within the current group count and not followed
    // by another octal digit ('&&' binds tighter than '||').
    if (value < 10 || value <= _nGroup && ! ('0' <= ch3 && ch3 <= '7')) {
      return new RegexpNode.GroupRef(value);
    }
    else if (! ('0' <= ch2 && ch2 <= '7')
             && ! ('0' <= ch3 && ch3 <= '7'))
      throw new IllegalRegexpException("back referencing to a non-existent group: " +
                                       value);
   
    // From here on the digits are treated as an octal escape, so the
    // second digit is pushed back for re-reading.
    // NOTE(review): value is at least 10 at this point, so this test only
    // skips the push-back for value == 10; possibly meant '>=' -- confirm.
    if (value > 10)
      pattern.ungetc(ch2);
   
    if (ch == '8' || ch == '9'
        || '0' <= ch3 && ch3 <= '9' && value * 10 + ch3 - '0' > 0xFF) {
      //out of byte range or not an octal,
      //need to parse backslash as the NULL character
     
      pattern.ungetc(ch);
      return parseString('\u0000', pattern);
    }
   
    int oct = parseOctal(ch, pattern);
   
    return parseString(oct, pattern, true);
  }

  private RegexpNode parseString(int ch,
                                 PeekStream pattern)
    throws IllegalRegexpException
  {
    return parseString(ch, pattern, false);
  }
 
  /**
   * parseString
   */
  private RegexpNode parseString(int ch,
                                 PeekStream pattern,
                                 boolean isEscaped)
    throws IllegalRegexpException
  {
    CharBuffer cb = new CharBuffer();
    cb.append((char) ch);

    for (ch = pattern.read(); ch >= 0; ch = pattern.read()) {
      switch (ch) {
      case ' ': case '\t': case '\n': case '\r':
        if (! isIgnoreWs() || isEscaped)
          cb.append((char) ch);
        break;

      case '#':
        if (! isIgnoreWs() || isEscaped)
          cb.append((char) ch);
        else {
          while ((ch = pattern.read()) != '\n' && ch >= 0) {
          }
        }
        break;

      case '(': case ')': case '[':
      case '+': case '?': case '*': case '.':
      case '$': case '^': case '|':
        pattern.ungetc(ch);
        return createString(cb);

      case '{':
        if ('0' <= pattern.peek() && pattern.peek() <= '9') {
          pattern.ungetc(ch);
          return createString(cb);
        }
        cb.append('{');
        break;
       
      case '\\':
        ch = pattern.read();
       
        switch (ch) {
        case -1:
          cb.append('\\');
          return createString(cb);
         
        case 's': case 'S': case 'd': case 'D':
        case 'w': case 'W': case 'b': case 'B':
        case 'A': case 'z': case 'Z': case 'G':
        case 'p': case 'P':
          pattern.ungetc(ch);
          pattern.ungetc('\\');
          return createString(cb);

        case 'a':
          cb.append('\u0007');
          break;
         
        case 'c':
          ch = pattern.read();
     
          ch = Character.toUpperCase(ch);
          ch ^= 0x40;
         
          cb.append((char) ch);
          break;
        case 'e':
          cb.append('\u001b');
          break;
        case 't':
          cb.append('\t');
          break;
        case 'f':
          cb.append('\f');
          break;
        case 'n':
          cb.append('\n');
          break;
        case 'r':
          cb.append('\r');
          break;

        case 'x':
          int hex = parseHex(pattern);
          cb.append((char) hex);
          break;
     
        case 'Q':
          while ((ch = pattern.read()) >= 0) {
            if (ch == '\\' && pattern.peek() == 'E') {
              pattern.read();
              break;
            }

            cb.append((char) ch);
          }
          break;
   
        case '0':
          int oct = parseOctal(ch, pattern);
          cb.append((char) oct);
          break;

        case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
          if (ch - '0' <= _nGroup) {
            pattern.ungetc(ch);
            pattern.ungetc('\\');
            return createString(cb);
          }
          else {
            oct = parseOctal(ch, pattern);
            cb.append((char) oct);
          }
          break;
        case '#':
          cb.append('#');
          break;

        default:
          if ((_flags & STRICT) != 0)
            throw error(L.l("unrecognized escape at " + badChar(ch)));

          cb.append((char) ch);
          break;
        }
        break;

      default:
        cb.append((char) ch);
      }
    }

    return createString(cb);
  }

  private RegexpNode createString(CharBuffer cb)
  {
    if (isIgnoreCase())
      return new RegexpNode.StringIgnoreCase(cb);
    else
      return new RegexpNode.StringNode(cb);
  }
 
  private int parseOctal(int ch, PeekStream pattern)
    throws IllegalRegexpException
  {
    if ('0' > ch || ch > '7')
      throw new IllegalRegexpException("expected octal digit at " +
                                       badChar(ch));
   
    int oct = ch - '0';
   
    int ch2 = pattern.peek();
   
    if ('0' <= ch2 && ch2 <= '7') {
      pattern.read();
     
      oct = oct * 8 + ch2 - '0';
     
      ch = pattern.peek();
     
      if ('0' <= ch && ch <= '7') {
        pattern.read();
       
        oct = oct * 8 + ch - '0';
      }
    }
   
    return oct;
  }
 
  private RegexpNode parseUnicodeProperty(PeekStream pattern,
                                          boolean isNegated)
    throws IllegalRegexpException
  {
    int ch = pattern.read();

    boolean isBraced = false;

    if (ch == '{') {
      isBraced = true;
      ch = pattern.read();
     
      if (ch == '^') {
        isNegated = ! isNegated;
        ch = pattern.read();
      }
    }
   
    RegexpNode node;
   
    if (isBraced) {
      int ch2 = pattern.read();
     
      if (ch2 == '}')
        node = parseUnicodeProperty(ch, isNegated);
      else {
        node = parseUnicodeProperty(ch, ch2, isNegated);
       
        expect('}', pattern.read());
      }
    }
    else
      node = parseUnicodeProperty(ch, isNegated);
   
    return node;
  }
 
  private RegexpNode parseUnicodeProperty(int ch, int ch2,
                                          boolean isNegated)
    throws IllegalRegexpException
  {
    switch (ch) {
    case 'C':
      switch (ch2) {
      case 'c':
        return isNegated ? RegexpNode.PROP_NOT_Cc : RegexpNode.PROP_Cc;
      case 'f':
        return isNegated ? RegexpNode.PROP_NOT_Cf : RegexpNode.PROP_Cf;
      case 'n':
        return isNegated ? RegexpNode.PROP_NOT_Cn : RegexpNode.PROP_Cn;
      case 'o':
        return isNegated ? RegexpNode.PROP_NOT_Co : RegexpNode.PROP_Co;
      case 's':
        return isNegated ? RegexpNode.PROP_NOT_Cs : RegexpNode.PROP_Cs;
      default:
        throw error(L.l("invalid Unicode category {0}{1}",
                        badChar(ch), badChar(ch2)));
      }

    case 'L':
      switch (ch2) {
      case 'l':
        return isNegated ? RegexpNode.PROP_NOT_Ll : RegexpNode.PROP_Ll;
      case 'm':
        return isNegated ? RegexpNode.PROP_NOT_Lm : RegexpNode.PROP_Lm;
      case 'o':
        return isNegated ? RegexpNode.PROP_NOT_Lo : RegexpNode.PROP_Lo;
      case 't'
        return isNegated ? RegexpNode.PROP_NOT_Lt : RegexpNode.PROP_Lt;
      case 'u':
        return isNegated ? RegexpNode.PROP_NOT_Lu : RegexpNode.PROP_Lu;
       
      case '}':
        return isNegated ? RegexpNode.PROP_NOT_L : RegexpNode.PROP_L;
       
      default:
        throw error(L.l("invalid Unicode category {0}{1}",
                        badChar(ch), badChar(ch2)));
      }
    case 'M':
      switch (ch2) {
      case 'c':
        return isNegated ? RegexpNode.PROP_NOT_Mc : RegexpNode.PROP_Mc;
      case 'e':
        return isNegated ? RegexpNode.PROP_NOT_Me : RegexpNode.PROP_Me;
      case 'n':
        return isNegated ? RegexpNode.PROP_NOT_Mn : RegexpNode.PROP_Mn;
      default:
        throw error(L.l("invalid Unicode category {0}{1}",
                        badChar(ch), badChar(ch2)));
      }

    case 'N':
      switch (ch2) {
      case 'd':
        return isNegated ? RegexpNode.PROP_NOT_Nd : RegexpNode.PROP_Nd;
      case 'l':
        return isNegated ? RegexpNode.PROP_NOT_Nl : RegexpNode.PROP_Nl;
      case 'o':
        return isNegated ? RegexpNode.PROP_NOT_No : RegexpNode.PROP_No;
      default:
        throw error(L.l("invalid Unicode category {0}{1}",
                        badChar(ch), badChar(ch2)));
      }

    case 'P':
      switch (ch2) {
      case 'c':
        return isNegated ? RegexpNode.PROP_NOT_Pc : RegexpNode.PROP_Pc;
      case 'd'
        return isNegated ? RegexpNode.PROP_NOT_Pd : RegexpNode.PROP_Pd;
      case 'e':
        return isNegated ? RegexpNode.PROP_NOT_Pe : RegexpNode.PROP_Pe;
      case 'f':
        return isNegated ? RegexpNode.PROP_NOT_Pf : RegexpNode.PROP_Pf;
      case 'i':    
        return isNegated ? RegexpNode.PROP_NOT_Pi : RegexpNode.PROP_Pi;
      case 'o':   
        return isNegated ? RegexpNode.PROP_NOT_Po : RegexpNode.PROP_Po;
      case 's':  
        return isNegated ? RegexpNode.PROP_NOT_Ps : RegexpNode.PROP_Ps;
      default:
        throw error(L.l("invalid Unicode category {0}{1}",
                        badChar(ch), badChar(ch2)));
      }

    case 'S':
      switch (ch2) {
      case 'c':
        return isNegated ? RegexpNode.PROP_NOT_Sc : RegexpNode.PROP_Sc;
      case 'k':
        return isNegated ? RegexpNode.PROP_NOT_Sk : RegexpNode.PROP_Sk;
      case 'm'
        return isNegated ? RegexpNode.PROP_NOT_Sm : RegexpNode.PROP_Sm;
      case 'o':
        return isNegated ? RegexpNode.PROP_NOT_So : RegexpNode.PROP_So;
      default:
        throw error(L.l("invalid Unicode category {0}{1}",
                        badChar(ch), badChar(ch2)));
      }

    case 'Z':
      switch (ch2) {
      case 'l':
        return isNegated ? RegexpNode.PROP_NOT_Zl : RegexpNode.PROP_Zl;
      case 'p':  
        return isNegated ? RegexpNode.PROP_NOT_Zp : RegexpNode.PROP_Zp;
      case 's':  
        return isNegated ? RegexpNode.PROP_NOT_Zs : RegexpNode.PROP_Zs;
      default:
        throw error(L.l("invalid Unicode category {0}{1}",
                        badChar(ch), badChar(ch2)));
      }
    }

    throw new UnsupportedOperationException();
  }
 
  private RegexpNode parseUnicodeProperty(int ch,
                                          boolean isNegated)
    throws IllegalRegexpException
  {
    switch (ch) {
      case 'C':
        return isNegated ? RegexpNode.PROP_NOT_C : RegexpNode.PROP_C;

      case 'L':
        return isNegated ? RegexpNode.PROP_NOT_L : RegexpNode.PROP_L;
       
      case 'M':
        return isNegated ? RegexpNode.PROP_NOT_M : RegexpNode.PROP_M;
       
      case 'N':
        return isNegated ? RegexpNode.PROP_NOT_N : RegexpNode.PROP_N;

      case 'P':
        return isNegated ? RegexpNode.PROP_NOT_P : RegexpNode.PROP_P;

      case 'S':
        return isNegated ? RegexpNode.PROP_NOT_S : RegexpNode.PROP_S;

      case 'Z':
        return isNegated ? RegexpNode.PROP_NOT_Z : RegexpNode.PROP_Z;
       
      default:
        throw new IllegalRegexpException("invalid Unicode property " +
                badChar(ch));
    }
  }
 
  /*
  static {
    _characterClassMap.put("alnum", RegexpNode.RC_ALNUM);
    _characterClassMap.put("alpha", RegexpNode.RC_ALPHA);
    _characterClassMap.put("blank", RegexpNode.RC_BLANK);
    _characterClassMap.put("cntrl", RegexpNode.RC_CNTRL);
    _characterClassMap.put("digit", RegexpNode.RC_DIGIT);
    _characterClassMap.put("graph", RegexpNode.RC_GRAPH);
    _characterClassMap.put("lower", RegexpNode.RC_LOWER);
    _characterClassMap.put("print", RegexpNode.RC_PRINT);
    _characterClassMap.put("punct", RegexpNode.RC_PUNCT);
    _characterClassMap.put("space", RegexpNode.RC_SPACE);
    _characterClassMap.put("upper", RegexpNode.RC_UPPER);
    _characterClassMap.put("xdigit", RegexpNode.RC_XDIGIT);
  }
  */
TOP

Related Classes of com.caucho.quercus.lib.regexp.Regcomp

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.