Source Code of com.caucho.quercus.lib.regexp.Regcomp

/*
 * Copyright (c) 1998-2008 Caucho Technology -- all rights reserved
 *
 * This file is part of Resin(R) Open Source
 *
 * Each copy or derived work must preserve the copyright notice and this
 * notice unmodified.
 *
 * Resin Open Source is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Resin Open Source is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty
 * of NON-INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Resin Open Source; if not, write to the
 *
 *   Free Software Foundation, Inc.
 *   59 Temple Place, Suite 330
 *   Boston, MA 02111-1307  USA
 *
 * @author Scott Ferguson
 */


/*
 * XXX: anchored expressions should have flags for quick matching.
 */


package com.caucho.quercus.lib.regexp;


import java.util.*;
import java.util.concurrent.*;
import java.util.logging.*;


import com.caucho.quercus.env.ConstStringValue;
import com.caucho.quercus.env.StringValue;
import com.caucho.quercus.env.StringBuilderValue;
import com.caucho.util.*;


/**
 * Regular expression compilation.
 */
class Regcomp {
  private static final Logger log
    = Logger.getLogger(Regcomp.class.getName());
  // NOTE(review): the message bundle is created for RegexpNode rather than
  // Regcomp -- presumably so both classes share one bundle; confirm.
  private static final L10N L = new L10N(RegexpNode.class);


  // #2526, JIT issues with Integer.MAX_VALUE
  // Used as the "no upper bound" loop maximum for '*', '+' and '{n,}'.
  private static final int INTEGER_MAX = Integer.MAX_VALUE - 1;
  
  // Modifier bits stored in _flags.
  // MULTILINE:   '^' and '$' compile to the *_OR_NEWLINE anchors.
  // SINGLE_LINE: '.' compiles to ANY_CHAR instead of DOT.
  // IGNORE_CASE: character-set ranges also add their other-case range.
  // IGNORE_WS:   unescaped whitespace and '#' comments are skipped.
  // GLOBAL:      set by the inline 'g' option; not read in this part of the file.
  static final int MULTILINE = 0x1;
  static final int SINGLE_LINE = 0x2;
  static final int IGNORE_CASE = 0x4;
  static final int IGNORE_WS = 0x8;
  static final int GLOBAL = 0x10;


  // ANCHORED: parse() starts every top-level alternative with
  //           ANCHOR_BEGIN_RELATIVE.
  // END_ONLY: '$' compiles to ANCHOR_END_ONLY (when not MULTILINE).
  // UNGREEDY: inverts the default greediness of quantifiers.
  // STRICT:   an unrecognized backslash escape is an error.
  // UTF8:     set as a side effect of getUnicodeSet().
  static final int ANCHORED = 0x20;
  static final int END_ONLY = 0x40;
  static final int UNGREEDY = 0x80;
  static final int STRICT = 0x100;
  static final int UTF8 = 0x200;
  
  // Not referenced by the methods visible in this part of the file;
  // POSIX classes are looked up in RegexpSet.CLASS_MAP instead.
  static final HashMap<String,Integer> _characterClassMap
    = new HashMap<String,Integer>();


  // Cache of unicode block name -> character set, filled by getUnicodeSet().
  static final ConcurrentHashMap<String,RegexpSet> _unicodeBlockMap
    = new ConcurrentHashMap<String,RegexpSet>();
  
  // Index the next capturing group will receive; parse() resets it to 1.
  int _nGroup;
  // Counter handed out by nextLoopIndex().
  int _nLoop;
  // Raised by parse() so that it is never smaller than _nGroup.
  int _maxGroup;
  // Current modifier bits; inline option groups change and restore it.
  int _flags;


  // Group index -> name, recorded for (?P<name>...) groups.
  HashMap<Integer,StringValue> _groupNameMap
    = new HashMap<Integer,StringValue>();


  // Group name -> index, consulted by (?P=name) references.
  HashMap<StringValue,Integer> _groupNameReverseMap
    = new HashMap<StringValue,Integer>();


  // (?R) nodes collected while parsing; parse() gives each one its target.
  ArrayList<RegexpNode.Recursive> _recursiveList
    = new ArrayList<RegexpNode.Recursive>();


  // Tail node of the group currently being parsed; null at the top level
  // and while parsing a lookahead/lookbehind body.
  RegexpNode _groupTail;
  
  // Neither field is read or written in this part of the file.
  boolean _isLookbehind;
  boolean _isOr;
  
  Regcomp(int flags)
  {
    _flags = flags;
  }


  boolean isGreedy()
  {
    return (_flags & UNGREEDY) != UNGREEDY;
  }


  boolean isIgnoreCase()
  {
    return (_flags & IGNORE_CASE) == IGNORE_CASE;
  }


  boolean isIgnoreWs()
  {
    return (_flags & IGNORE_WS) == IGNORE_WS;
  }


  boolean isMultiline()
  {
    return (_flags & MULTILINE) == MULTILINE;
  }


  boolean isDollarEndOnly()
  {
    return (_flags & END_ONLY) == END_ONLY;
  }


  int nextLoopIndex()
  {
    return _nLoop++;
  }


  RegexpNode parse(PeekStream pattern) throws IllegalRegexpException
  {
    _nGroup = 1;


    RegexpNode begin = null;


    if ((_flags & ANCHORED) != 0)
      begin = RegexpNode.ANCHOR_BEGIN_RELATIVE;
    
    RegexpNode value = parseRec(pattern, begin);


    while (pattern.read() == '|') {
      value = RegexpNode.Or.create(value, parseRec(pattern, begin));
    }
    
    value = value != null ? value.getHead() : RegexpNode.N_END;


    if (_maxGroup < _nGroup)
      _maxGroup = _nGroup;


    for (RegexpNode.Recursive rec : _recursiveList) {
      RegexpNode top = value;


      if (top instanceof RegexpNode.Concat) {
        RegexpNode.Concat topConcat = (RegexpNode.Concat) top;


        if (topConcat.getConcatHead() instanceof RegexpNode.AnchorBegin
            || topConcat.getConcatHead() instanceof RegexpNode.AnchorBeginRelative) {
          top = topConcat.getConcatNext();
        }
      }
      
      rec.setTop(top);
    }


    if (log.isLoggable(Level.FINEST))
      log.finest("regexp[] " + value);


    return value;
  }
  
  /**
   * Recursively compiles one sequence of the pattern.
   *
   * Each call reads a single character, builds the node for the construct
   * it starts, and recurses with that node as the new tail; the result is
   * linked back with concat(tail, rest).  A postfix operator ('?', '*',
   * '+', '{n,m}') instead replaces the tail with a loop around it.
   *
   * The sequence stops at the end of input, at '|' or at ')'.  The '|' and
   * ')' are pushed back so the caller (parse, parseGroup, ...) can read
   * them.
   *
   * @param pattern the pattern source
   * @param tail the node parsed just before the current position, or null
   *   at the start of a sequence
   *
   * @return the linked result for the sequence; null when the input ends
   *   and there is no tail
   */
  private RegexpNode parseRec(PeekStream pattern, RegexpNode tail)
    throws IllegalRegexpException
  {
    int ch = pattern.read();
    RegexpNode next;
    RegexpNode groupTail;


    switch (ch) {
    // end of pattern
    case -1:
      return tail != null ? tail.getHead() : null;


    // postfix quantifiers: wrap the preceding node in a loop
    case '?':
      if (tail == null)
        throw error(L.l("'?' requires a preceeding regexp"));


      tail = createLoop(pattern, tail, 0, 1);
      
      return parseRec(pattern, tail.getTail());


    case '*':
      if (tail == null)
        throw error(L.l("'*' requires a preceeding regexp"));


      tail = createLoop(pattern, tail, 0, INTEGER_MAX);
      
      return parseRec(pattern, tail.getTail());


    case '+':
      if (tail == null)
        throw error(L.l("'+' requires a preceeding regexp"));


      tail = createLoop(pattern, tail, 1, INTEGER_MAX);
      
      return parseRec(pattern, tail.getTail());


    case '{':
      // a '{' with nothing before it, or not followed by a digit, is a
      // literal character rather than a repetition
      if (tail == null || ! ('0' <= pattern.peek() && pattern.peek() <= '9')) {
        next = parseString('{', pattern);
      
        return concat(tail, parseRec(pattern, next));
      }


      return parseRec(pattern, parseBrace(pattern, tail).getTail());


    case '.':
      if ((_flags & SINGLE_LINE) == 0)
        next = RegexpNode.DOT;
      else
        next = RegexpNode.ANY_CHAR;
        
      return concat(tail, parseRec(pattern, next));


    // end of this alternative; the caller reads the '|'
    case '|':
      pattern.ungetc(ch);


      if (_groupTail != null)
        return concat(tail, _groupTail);
      else
        // NOTE(review): tail is dereferenced without a null check; outside
        // a group, an alternative that is empty before the '|' would
        // presumably fail here with a NullPointerException -- confirm.
        return tail.getHead();


    case '(':
      {
        switch (pattern.peek()) {
        case '?':
          pattern.read();


          switch (pattern.peek()) {
          // (?:...) non-capturing group, compiled as group 0
          case ':':
            pattern.read();
            return parseGroup(pattern, tail, 0, _flags);
            
          // (?#...) comment
          case '#':
            parseCommentGroup(pattern);
            
            return parseRec(pattern, tail);
            
          // (?(n)yes|no) conditional
          case '(':
            return parseConditional(pattern, tail);
            
          // (?=...) and (?!...) lookahead
          case '=':
          case '!':
            ch = pattern.read();


            boolean isPositive = (ch == '=');


            // the lookahead body is parsed without a group tail
            groupTail = _groupTail;
            _groupTail = null;


            next = parseRec(pattern, null);
            
            while ((ch = pattern.read()) == '|') {
              RegexpNode nextHead = parseRec(pattern, null);
              next = next.createOr(nextHead);
            }


            if (isPositive)
              next = new RegexpNode.Lookahead(next);
            else
              next = new RegexpNode.NotLookahead(next);


            if (ch != ')')
              throw error(L.l("expected ')' at '{0}'",
                              String.valueOf((char) ch)));


            _groupTail = groupTail;


            return concat(tail, parseRec(pattern, next));
            
          // (?<=...) and (?<!...) lookbehind
          case '<':
            pattern.read();


            switch (pattern.read()) {
            case '=':
              isPositive = true;
              break;
            case '!':
              isPositive = false;
              break;
            default:
              throw error(L.l("expected '=' or '!'"));
            }


            groupTail = _groupTail;
            _groupTail = null;


            next = parseRec(pattern, null);


            // each alternative is wrapped in its own lookbehind node;
            // an empty (null) alternative is left as is
            if (next == null) {
            }
            else if (isPositive)
              next = new RegexpNode.Lookbehind(next);
            else
              next = new RegexpNode.NotLookbehind(next);
            
            while ((ch = pattern.read()) == '|') {
              RegexpNode second = parseRec(pattern, null);


              if (second == null) {
              }
              else if (isPositive)
                second = new RegexpNode.Lookbehind(second);
              else
                second = new RegexpNode.NotLookbehind(second);


              if (second != null)
                next = next.createOr(second);
            }


            if (ch != ')')
              throw error(L.l("expected ')' at '{0}'",
                              String.valueOf((char) ch)));


            _groupTail = groupTail;


            return concat(tail, parseRec(pattern, next));
          
          // XXX: once-only subpatterns (mostly an optimization feature)
          // currently compiled exactly like a non-capturing group
          case '>':
            pattern.read();
            return parseGroup(pattern, tail, 0, _flags);


          // (?P<name>...) and (?P=name)
          case 'P':
            pattern.read();
            return parseNamedGroup(pattern, tail);


          // (?R) recursion; the target is filled in by parse()
          case 'R':
            pattern.read();
            RegexpNode.Recursive rec = new RegexpNode.Recursive();
            _recursiveList.add(rec);
            ch = pattern.read();
            if (ch != ')')
              throw error(L.l("expected ')' at '{0}'",
                              String.valueOf((char) ch)));
            
            return concat(tail, parseRec(pattern, rec));


          // inline options: (?imsx...) or (?imsx...:subpattern)
          case 'm': case 's': case 'i': case 'x': case 'g':
          case 'U': case 'X':
            {
              // remembered so the options can be undone afterwards
              int flags = _flags;
              
              while ((ch = pattern.read()) > 0 && ch != ')') {
                switch (ch) {
                case 'm': _flags |= MULTILINE; break;
                case 's': _flags |= SINGLE_LINE; break;
                case 'i': _flags |= IGNORE_CASE; break;
                case 'x': _flags |= IGNORE_WS; break;
                case 'g': _flags |= GLOBAL; break;
                case 'U': _flags |= UNGREEDY; break;
                case 'X': _flags |= STRICT; break;
                case ':':
                  {
                    // parseGroup restores the saved flags at the group's ')'
                    return parseGroup(pattern, tail, 0, flags);
                  }
                default:
                  throw error(L.l("'{0}' is an unknown (? code", String.valueOf((char) ch)));
                }
              }


              if (ch != ')')
                throw error(L.l("expected ')' at '{0}'",
                                String.valueOf((char) ch)));
              
              // the options stay in effect for the rest of this sequence
              RegexpNode node = parseRec(pattern, tail);


              _flags = flags;


              return node;
            }
            
          default:
            throw error(L.l("'{0}' is an unknown (? code", String.valueOf((char) pattern.peek())));
          }
          
        // plain capturing group
        default:
          return parseGroup(pattern, tail, _nGroup++, _flags);
        }
      }


    // end of the enclosing group; the caller reads the ')'
    case ')':
      pattern.ungetc(ch);


      if (_groupTail != null)
        return concat(tail, _groupTail);
      else
        return tail;


    case '[':
      next = parseSet(pattern);


      return concat(tail, parseRec(pattern, next));
      
    case '\\':
      next = parseSlash(pattern);
      
      return concat(tail, parseRec(pattern, next));
      
    case '^':
      if (isMultiline())
        next = RegexpNode.ANCHOR_BEGIN_OR_NEWLINE;
      else
        next = RegexpNode.ANCHOR_BEGIN;
      
      return concat(tail, parseRec(pattern, next));
      
    case '$':
      if (isMultiline())
        next = RegexpNode.ANCHOR_END_OR_NEWLINE;
      else if (isDollarEndOnly())
        next = RegexpNode.ANCHOR_END_ONLY;
      else
        next = RegexpNode.ANCHOR_END;
      
      return concat(tail, parseRec(pattern, next));


    // whitespace is skipped with IGNORE_WS, otherwise it is literal text
    case ' ': case '\n': case '\t': case '\r':
      if (isIgnoreWs()) {
        while (Character.isWhitespace((char) pattern.peek()))
          pattern.read();


        return parseRec(pattern, tail);
      }
      else {
        next = parseString(ch, pattern);
        
        return concat(tail, parseRec(pattern, next));
      }


    // with IGNORE_WS a '#' starts a comment that runs to the end of line
    case '#':
      if (isIgnoreWs()) {
        while ((ch = pattern.read()) > 0 && ch != '\n') {
        }


        return parseRec(pattern, tail);
      }
      else {
        next = parseString(ch, pattern);
        
        return concat(tail, parseRec(pattern, next));
      }
      
    // any other character starts a literal string
    default:
      next = parseString(ch, pattern);
      
      return concat(tail, parseRec(pattern, next));
    }
  }


  private void parseCommentGroup(PeekStream pattern)
  {
    int ch;
    
    // (?#...) Comment
    while ((ch = pattern.read()) >= 0 && ch != ')') {
    }
  }
  
  private RegexpNode parseNamedGroup(PeekStream pattern, RegexpNode tail)
    throws IllegalRegexpException
  {
    int ch = pattern.read();


    if (ch == '=') {
      StringBuilder sb = new StringBuilder();


      while ((ch = pattern.read()) != ')' && ch >= 0) {
        sb.append((char) ch);
      }


      if (ch != ')')
        throw error(L.l("expected ')'"));


      String name = sb.toString();
      
      Integer v = _groupNameReverseMap.get(new ConstStringValue(name));


      if (v != null) {
        RegexpNode next = new RegexpNode.GroupRef(v);
      
        return concat(tail, parseRec(pattern, next));
      }
      else
        throw error(L.l("'{0}' is an unknown regexp group", name));
    }
    else if (ch == '<') {
      StringBuilder sb = new StringBuilder();


      while ((ch = pattern.read()) != '>' && ch >= 0) {
        sb.append((char) ch);
      }


      if (ch != '>')
        throw error(L.l("expected '>'"));


      String name = sb.toString();


      int group = _nGroup++;


      _groupNameMap.put(group, new StringBuilderValue(name));
      _groupNameReverseMap.put(new StringBuilderValue(name), group);


      return parseGroup(pattern, tail, group, _flags);
    }
    else
      throw error(L.l("Expected '(?:P=name' or '(?:P<name' for named group"));
  }


  private RegexpNode parseConditional(PeekStream pattern, RegexpNode tail)
    throws IllegalRegexpException
  {
    int ch = pattern.read();


    if (ch != '(')
      throw error(L.l("expected '('"));
    
    RegexpNode.ConditionalHead groupHead = null;;
    RegexpNode groupTail = null;


    if ('1' <= (ch = pattern.peek()) && ch <= '9') {
      int value = 0;


      while ('0' <= (ch = pattern.read()) && ch <= '9') {
        value = 10 * value + ch - '0';
      }


      if (ch != ')')
        throw error(L.l("expected ')'"));


      if (_nGroup <= value)
        throw error(L.l("conditional value less than number of groups"));


      groupHead = new RegexpNode.ConditionalHead(value);
      groupTail = groupHead.getTail();
    }
    else
      throw error(L.l("conditional requires number"));


    RegexpNode oldTail = _groupTail;


    _groupTail = groupTail;
        
    RegexpNode first = parseRec(pattern, null);
    RegexpNode second = null;


    if ((ch = pattern.read()) == '|') {
      second = parseRec(pattern, null);


      ch = pattern.read();
    }


    if (ch != ')')
      throw error(L.l("expected ')' at '{0}'", String.valueOf((char) ch)));


    _groupTail = oldTail;


    groupHead.setFirst(first);
    groupHead.setSecond(second);
        
    return concat(tail, parseRec(pattern, groupHead));
  }


  private RegexpNode parseGroup(PeekStream pattern, RegexpNode tail,
                                int group, int oldFlags)
    throws IllegalRegexpException
  {
    RegexpNode.GroupHead groupHead = new RegexpNode.GroupHead(group);
    RegexpNode groupTail = groupHead.getTail();


    RegexpNode oldTail = _groupTail;


    _groupTail = groupTail;
        
    RegexpNode body = parseRec(pattern, null);


    int ch;
    while ((ch = pattern.read()) == '|') {
      RegexpNode nextBody = parseRec(pattern, null);
      body = body.createOr(nextBody);
    }


    if (ch != ')')
      throw error(L.l("expected ')'"));


    _flags = oldFlags;


    _groupTail = oldTail;


    groupHead.setNode(body.getHead());
        
    return concat(tail, parseRec(pattern, groupTail).getHead());
  }


  private void expect(char expected, int value)
    throws IllegalRegexpException
  {
    if (expected != value)
      throw error(L.l("expected '{0}'", String.valueOf(expected)));
  }


  private IllegalRegexpException error(String msg)
  {
    return new IllegalRegexpException(msg);
  }


  /**
   *   Parse the repetition construct.
   *
   *   {n}    -- exactly n
   *   {n,}   -- at least n
   *   {n,m}  -- from n to m
   *   {,m}   -- at most m
   */
  private RegexpNode parseBrace(PeekStream pattern, RegexpNode node)
    throws IllegalRegexpException
  {
    int ch;
    int min = 0;
    int max = INTEGER_MAX;


    while ((ch = pattern.read()) >= '0' && ch <= '9') {
      min = 10 * min + ch - '0';
    }


    if (ch == ',') {
      while ('0' <= (ch = pattern.read()) && ch <= '9') {
        if (max == INTEGER_MAX)
          max = 0;
        
        max = 10 * max + ch - '0';
      }
    }
    else
      max = min;


    if (ch != '}')
      throw error(L.l("Expected '}'"));


    return createLoop(pattern, node, min, max);
  }


  private RegexpNode createLoop(PeekStream pattern, RegexpNode node,
                                int min, int max)
  {
    if (pattern.peek() == '+') {
      pattern.read();
      
      return node.createPossessiveLoop(min, max);
    }
    else if (pattern.peek() == '?') {
      pattern.read();


      if (isGreedy())
        return node.createLoopUngreedy(this, min, max);
      else
        return node.createLoop(this, min, max);
    }
    else {
      if (isGreedy())
        return node.createLoop(this, min, max);
      else
        return node.createLoopUngreedy(this, min, max);
    }
  }


  static RegexpNode concat(RegexpNode prev, RegexpNode next)
  {
    if (prev != null) {
      return prev.concat(next).getHead();
    }
    else
      return next;
  }


  private String hex(int value)
  {
    CharBuffer cb = new CharBuffer();


    for (int b = 3; b >= 0; b--) {
      int v = (value >> (4 * b)) & 0xf;
      if (v < 10)
        cb.append((char) (v + '0'));
      else
        cb.append((char) (v - 10 + 'a'));
    }


    return cb.toString();
  }


  private String badChar(int ch)
  {
    if (0x20 <= ch && ch <= 0x7f)
      return "'" + (char) ch + "'";
    else if ((ch & 0xffff) == 0xffff)
      return "end of expression";
    else
      return "'" + (char) ch + "' (\\u" + hex(ch) + ")";
  }


  /**
   *   Collect the characters in a set, e.g. [a-z@@^!"]
   *
   * The opening '[' has already been consumed by the caller.  Plain
   * characters, ranges, backslash classes, POSIX [:name:] classes and
   * named unicode blocks are merged into one RegexpSet; unicode property
   * escapes are kept as separate nodes and or-ed with the set at the end.
   *
   * Variables:
   *
   *   last     -- Contains last read character.
   *   lastdash -- Contains character before dash or -1 if not after dash.
   *
   * @param pattern the pattern source, positioned just after the '['
   *
   * @return the node matching the set, negated when the set starts with '^'
   *
   * @throws IllegalRegexpException if the set is malformed or unterminated
   */
  private RegexpNode parseSet(PeekStream pattern) 
    throws IllegalRegexpException
  {
    int first = pattern.peek();
    boolean isNot = false;


    if (first == '^') {
      pattern.read();
      isNot = true;
    }
    
    RegexpSet set = new RegexpSet();


    int last = -1;
    int lastdash = -1;
    int ch;


    // number of characters read after the optional '^'
    int charRead = 0;
    
    // unicode property nodes that cannot be merged into the set itself
    ArrayList<RegexpNode> nodeList = null;
    
    while ((ch = pattern.read()) >= 0) {
      charRead++;


      // php/4e3o
      // first literal closing bracket need not be escaped
      if (ch == ']') {
        if (charRead == 1) {
          // push the ']' back and pretend a backslash was read, so the
          // escape handling below picks it up as a literal character
          pattern.ungetc(ch);
          ch = '\\';
        }
        else
          break;
      }
      
      // isChar: ch holds a single literal character after this iteration
      // isDash: the character read was an unescaped '-'
      boolean isChar = true;
      boolean isDash = ch == '-';


      if (ch == '\\') {
        isChar = false;


        switch ((ch = pattern.read())) {
        case 's':
          set.mergeOr(RegexpSet.SPACE);
          break;


        case 'S':
          set.mergeOrInv(RegexpSet.SPACE);
          break;


        case 'd':
          set.mergeOr(RegexpSet.DIGIT);
          break;


        case 'D':
          set.mergeOrInv(RegexpSet.DIGIT);
          break;


        case 'w':
          set.mergeOr(RegexpSet.WORD);
          break;


        case 'W':
          set.mergeOrInv(RegexpSet.WORD);
          break;


        // \pX or \p{...}
        case 'p':
          int ch2 = pattern.read();
          
          if (ch2 != '{') {
            if (nodeList == null)
              nodeList = new ArrayList<RegexpNode>();
            
            nodeList.add(parseUnicodeProperty(ch2, false));
          }
          else {
            StringBuilder sb = new StringBuilder();
            
            int ch3;
            
            while ((ch3 = pattern.read()) >= 0 && ch3 != '}') {
              sb.append((char) ch3);
            }


            String name = sb.toString();


            if (ch3 != '}')
              throw new IllegalRegexpException(L.l("expected '}' at "
                               + badChar(ch3)));
            
            int len = name.length();


            // one- and two-letter names are unicode properties; longer
            // names are looked up as unicode block names
            if (len == 1) {
              if (nodeList == null)
                  nodeList = new ArrayList<RegexpNode>();
              
              nodeList.add(parseUnicodeProperty(name.charAt(0), false));
            }
            else if (len == 2) {
              if (nodeList == null)
                  nodeList = new ArrayList<RegexpNode>();
              
              nodeList.add(parseUnicodeProperty(name.charAt(0),
                                                name.charAt(1),
                                                false));
            }
            else {
              set.mergeOr(getUnicodeSet(name));
            }
          }
          break;


        // inside a set \b is the backspace character
        case 'b':
          ch = '\b';
          isChar = true;
          break;
        case 'n':
          ch = '\n';
          isChar = true;
          break;
        case 't':
          ch = '\t';
          isChar = true;
          break;
        case 'r':
          ch = '\r';
          isChar = true;
          break;
        case 'f':
          ch = '\f';
          isChar = true;
          break;


        case 'x':
          ch = parseHex(pattern);
          isChar = true;
          break;


        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
          ch = parseOctal(ch, pattern);
          isChar = true;
          break;


        // any other escaped character stands for itself
        default:
          isChar = true;
        }
      }
      else if (ch == '[') {
        // "[:" opens a POSIX class such as [:alpha:]; a lone '[' is literal
        if (pattern.peek() == ':') {
          isChar = false;
          pattern.read();
          
          set.mergeOr(parseCharacterClass(pattern));
        }
      }


      // a dash directly after a pending character starts a range
      if (isDash && last != -1 && lastdash == -1) {
        lastdash = last;
      }
      // c1-c2
      else if (isChar && lastdash != -1) {
        if (lastdash > ch)
          throw new IllegalRegexpException("expected increasing range at " +
                                           badChar(ch));


        setRange(set, lastdash, ch);


        last = -1;
        lastdash = -1;
      }
      // "c-" followed by a class: both the character and the dash are literal
      else if (lastdash != -1) {
        setRange(set, lastdash, lastdash);
        setRange(set, '-', '-');


        last = -1;
        lastdash = -1;
      }
      // no range in progress: flush the pending character
      else if (last != -1) {
        
        setRange(set, last, last);


        if (isChar)
          last = ch;
      }
      else if (isChar)
        last = ch;
    }


    // Dash at end of set: [a-z1-]
    if (lastdash != -1) {
      setRange(set, lastdash, lastdash);
      setRange(set, '-', '-');
    }
    else if (last != -1) {
      setRange(set, last, last);
    }
    
    // the loop only ends without ']' when the pattern ran out
    if (ch != ']')
      throw error(L.l("Expected ']'"));


    if (nodeList == null) {
      if (isNot)
        return set.createNotNode();
      else
        return set.createNode();
    }
    else {
      // or the plain set with every collected unicode property node
      RegexpNode setNode = set.createNode();


      for (RegexpNode node : nodeList) {
        setNode = setNode.createOr(node);
      }
        
      if (isNot)
        return setNode.createNot();
      else
        return setNode;
    }
  }


  private void setRange(RegexpSet set, int a, int b)
  {
    set.setRange(a, b);
    
    if (isIgnoreCase()) {
      if (Character.isLowerCase(a) && Character.isLowerCase(b)) {
        set.setRange(Character.toUpperCase(a), Character.toUpperCase(b));
      }
          
      if (Character.isUpperCase(a) && Character.isUpperCase(b)) {
        set.setRange(Character.toLowerCase(a), Character.toLowerCase(b));
      }
    }
  }


  private RegexpSet getUnicodeSet(String name)
    throws IllegalRegexpException
  {
    _flags |= UTF8;


    RegexpSet set = _unicodeBlockMap.get(name);


    if (set == null) {
      Character.UnicodeBlock block = Character.UnicodeBlock.forName(name);


      if (block == null)
        throw new IllegalRegexpException(L.l("'{0}' is an unknown unicode block",
                         name));


      set = new RegexpSet();


      for (int ch = 0; ch < 65536; ch++) {
        if (Character.UnicodeBlock.of(ch) == block) {
          set.setRange(ch, ch);
        }
      }


      _unicodeBlockMap.put(name, set);
    }


    return set;
  }


  /**
   * Returns a node for sequences starting with a backslash.
   */
  private RegexpNode parseSlash(PeekStream pattern)
    throws IllegalRegexpException
  {
    int ch;
    switch (ch = pattern.read()) {
    case 's':
      return RegexpNode.SPACE;


    case 'S':
      return RegexpNode.NOT_SPACE;


    case 'd':
      return RegexpNode.DIGIT;


    case 'D':
      return RegexpNode.NOT_DIGIT;


    case 'w':
      return RegexpNode.S_WORD;


    case 'W':
      return RegexpNode.NOT_S_WORD;


    case 'b':
      return RegexpNode.WORD;


    case 'B':
      return RegexpNode.NOT_WORD;


    case 'A':
      return RegexpNode.STRING_BEGIN;


    case 'z':
      return RegexpNode.STRING_END;
      
    case 'Z':
      return RegexpNode.STRING_NEWLINE;


    case 'G':
      return RegexpNode.STRING_FIRST;


    case 'a':
      return parseString('\u0007', pattern);
    
    case 'c':
      ch = pattern.read();
      
      ch = Character.toUpperCase(ch);
      ch ^= 0x40;


      return parseString(ch, pattern);


    case 'e':
      return parseString('\u001B', pattern, true);
    case 'n':
      return parseString('\n', pattern, true);
    case 'r':
      return parseString('\r', pattern, true);
    case 'f':
      return parseString('\f', pattern, true);
    case 't':
      return parseString('\t', pattern, true);


    case 'x':
      int hex = parseHex(pattern);
      return parseString(hex, pattern, true);
    
    case '0':
      int oct = parseOctal(ch, pattern);
      return parseString(oct, pattern, true);


    case '1': case '2': case '3': case '4': 
    case '5': case '6': case '7': case '8': case '9':
      return parseBackReference(ch, pattern);


    case 'p':
      return parseUnicodeProperty(pattern, false);
    case 'P':
      return parseUnicodeProperty(pattern, true);
      
    case 'Q':
      throw new UnsupportedOperationException();
      /*
      while ((ch = pattern.read()) >= 0) {
        if (ch == '\\' && pattern.peek() == 'E') {
          pattern.read();
          break;
        }


        last = parseString(ch, pattern);
      }


      return last;
      */
      
    case '#':
      return parseString('#', pattern, true);


    default:
      if ((_flags & STRICT) != 0)
        throw new IllegalRegexpException("unrecognized escape at " +
                                         badChar(ch));
      return parseString(ch, pattern);
    }
  }
  
  /**
   * Returns a node for sequences starting with a '[:'.
   */
  private RegexpSet parseCharacterClass(PeekStream pattern)
    throws IllegalRegexpException
  {
    StringBuilder sb = new StringBuilder();
    
    int ch;
    while ((ch = pattern.read()) != ':' && ch >= 0) {
      sb.append((char)ch);
    }
    
    if (ch != ':') {
      throw new IllegalRegexpException("expected character class closing colon ':' at " + badChar(ch));
    }  
     
    if ((ch = pattern.read()) != ']') {
      throw new IllegalRegexpException("expected character class closing bracket ']' at " + badChar(ch));
    }


    String name = sb.toString();
    
    RegexpSet set = RegexpSet.CLASS_MAP.get(name);
    
    if (set == null) {
      throw new IllegalRegexpException("unrecognized POSIX character class " +
                                       name);
    }
 
    return set;
  }


  private int parseHex(PeekStream pattern)
    throws IllegalRegexpException
  {
    int ch = pattern.read();
    
    int hex = 0;
    
    StringBuilder sb = new StringBuilder();
    
    if (ch == '{') {
      while ((ch = pattern.read()) != '}') {
        if (ch < 0)
          throw new IllegalRegexpException("no more input; expected '}'");
        
        sb.append((char)ch);
      }
    }
    else {
      if (ch < 0)
        throw new IllegalRegexpException("expected hex digit at " +
                badChar(ch));
      
      sb.append((char)ch);
      ch = pattern.read();
      
      if (ch < 0) {
        throw new IllegalRegexpException("expected hex digit at " +
                                         badChar(ch));
      }


      sb.append((char)ch);
    }
    
    int len = sb.length();
    
    for (int i = 0; i < len; i++) {
      ch = sb.charAt(i);


      if ('0' <= ch && ch <= '9')
        hex = hex * 16 + ch - '0';
      else if ('a' <= ch && ch <= 'f')
        hex = hex * 16 + ch - 'a' + 10;
      else if ('A' <= ch && ch <= 'F')
        hex = hex * 16 + ch - 'A' + 10;
      else
        throw new IllegalRegexpException("expected hex digit at " +
                                         badChar(ch));
    }
    
    return hex;
  }
  
  private RegexpNode parseBackReference(int ch, PeekStream pattern)
    throws IllegalRegexpException
  {
    int value = ch - '0';
    int ch2 = pattern.peek();
    
    if ('0' <= ch2 && ch2 <= '9') {
      pattern.read();
      value = value * 10 + ch2 - '0';
    }


    int ch3 = pattern.peek();
    
    if (value < 10 || value <= _nGroup && ! ('0' <= ch3 && ch3 <= '7')) {
      return new RegexpNode.GroupRef(value);
    }
    else if (! ('0' <= ch2 && ch2 <= '7')
             && ! ('0' <= ch3 && ch3 <= '7'))
      throw new IllegalRegexpException("back referencing to a non-existent group: " +
                                       value);
    
    if (value > 10)
      pattern.ungetc(ch2);
    
    if (ch == '8' || ch == '9'
        || '0' <= ch3 && ch3 <= '9' && value * 10 + ch3 - '0' > 0xFF) {
      //out of byte range or not an octal,
      //need to parse backslash as the NULL character
      
      pattern.ungetc(ch);
      return parseString('\u0000', pattern);
    }
    
    int oct = parseOctal(ch, pattern);
    
    return parseString(oct, pattern, true);
  }


  private RegexpNode parseString(int ch,
                                 PeekStream pattern)
    throws IllegalRegexpException
  {
    return parseString(ch, pattern, false);
  }
  
  /**
   * parseString
   */
  private RegexpNode parseString(int ch,
                                 PeekStream pattern,
                                 boolean isEscaped)
    throws IllegalRegexpException
  {
    CharBuffer cb = new CharBuffer();
    cb.append((char) ch);


    for (ch = pattern.read(); ch >= 0; ch = pattern.read()) {
      switch (ch) {
      case ' ': case '\t': case '\n': case '\r':
        if (! isIgnoreWs() || isEscaped)
          cb.append((char) ch);
        break;


      case '#':
        if (! isIgnoreWs() || isEscaped)
          cb.append((char) ch);
        else {
          while ((ch = pattern.read()) != '\n' && ch >= 0) {
          }
        }
        break;


      case '(': case ')': case '[':
      case '+': case '?': case '*': case '.':
      case '$': case '^': case '|':
        pattern.ungetc(ch);
        return createString(cb);


      case '{':
        if ('0' <= pattern.peek() && pattern.peek() <= '9') {
          pattern.ungetc(ch);
          return createString(cb);
        }
        cb.append('{');
        break;
        
      case '\\':
        ch = pattern.read();
        
        switch (ch) {
        case -1:
          cb.append('\\');
          return createString(cb);
          
        case 's': case 'S': case 'd': case 'D':
        case 'w': case 'W': case 'b': case 'B':
        case 'A': case 'z': case 'Z': case 'G':
        case 'p': case 'P':
          pattern.ungetc(ch);
          pattern.ungetc('\\');
          return createString(cb);


        case 'a':
          cb.append('\u0007');
          break;
          
        case 'c':
          ch = pattern.read();
      
          ch = Character.toUpperCase(ch);
          ch ^= 0x40;
          
          cb.append((char) ch);
          break;
        case 'e':
          cb.append('\u001b');
          break;
        case 't':
          cb.append('\t');
          break;
        case 'f':
          cb.append('\f');
          break;
        case 'n':
          cb.append('\n');
          break;
        case 'r':
          cb.append('\r');
          break;


        case 'x':
          int hex = parseHex(pattern);
          cb.append((char) hex);
          break;
      
        case 'Q':
          while ((ch = pattern.read()) >= 0) {
            if (ch == '\\' && pattern.peek() == 'E') {
              pattern.read();
              break;
            }


            cb.append((char) ch);
          }
          break;
    
        case '0':
          int oct = parseOctal(ch, pattern);
          cb.append((char) oct);
          break;


        case '1': case '2': case '3': case '4': 
        case '5': case '6': case '7': case '8': case '9':
          if (ch - '0' <= _nGroup) {
            pattern.ungetc(ch);
            pattern.ungetc('\\');
            return createString(cb);
          }
          else {
            oct = parseOctal(ch, pattern);
            cb.append((char) oct);
          }
          break;
        case '#':
          cb.append('#');
          break;


        default:
          if ((_flags & STRICT) != 0)
            throw error(L.l("unrecognized escape at " + badChar(ch)));


          cb.append((char) ch);
          break;
        }
        break;


      default:
        cb.append((char) ch);
      }
    }


    return createString(cb);
  }


  private RegexpNode createString(CharBuffer cb)
  {
    if (isIgnoreCase())
      return new RegexpNode.StringIgnoreCase(cb);
    else
      return new RegexpNode.StringNode(cb);
  }
  
  private int parseOctal(int ch, PeekStream pattern)
    throws IllegalRegexpException
  {
    if ('0' > ch || ch > '7')
      throw new IllegalRegexpException("expected octal digit at " +
                                       badChar(ch));
    
    int oct = ch - '0';
    
    int ch2 = pattern.peek();
    
    if ('0' <= ch2 && ch2 <= '7') {
      pattern.read();
      
      oct = oct * 8 + ch2 - '0';
      
      ch = pattern.peek();
      
      if ('0' <= ch && ch <= '7') {
        pattern.read();
        
        oct = oct * 8 + ch - '0';
      }
    }
    
    return oct;
  }
  
  private RegexpNode parseUnicodeProperty(PeekStream pattern,
                                          boolean isNegated)
    throws IllegalRegexpException
  {
    int ch = pattern.read();


    boolean isBraced = false;


    if (ch == '{') {
      isBraced = true;
      ch = pattern.read();
      
      if (ch == '^') {
        isNegated = ! isNegated;
        ch = pattern.read();
      }
    }
    
    RegexpNode node;
    
    if (isBraced) {
      int ch2 = pattern.read();
      
      if (ch2 == '}')
        node = parseUnicodeProperty(ch, isNegated);
      else {
        node = parseUnicodeProperty(ch, ch2, isNegated);
        
        expect('}', pattern.read());
      }
    }
    else
      node = parseUnicodeProperty(ch, isNegated);
    
    return node;
  }
  
  private RegexpNode parseUnicodeProperty(int ch, int ch2,
                                          boolean isNegated)
    throws IllegalRegexpException
  {
    switch (ch) {
    case 'C':
      switch (ch2) {
      case 'c':
        return isNegated ? RegexpNode.PROP_NOT_Cc : RegexpNode.PROP_Cc;
      case 'f':
        return isNegated ? RegexpNode.PROP_NOT_Cf : RegexpNode.PROP_Cf;
      case 'n':
        return isNegated ? RegexpNode.PROP_NOT_Cn : RegexpNode.PROP_Cn;
      case 'o':
        return isNegated ? RegexpNode.PROP_NOT_Co : RegexpNode.PROP_Co;
      case 's':
        return isNegated ? RegexpNode.PROP_NOT_Cs : RegexpNode.PROP_Cs;
      default:
        throw error(L.l("invalid Unicode category {0}{1}",
                        badChar(ch), badChar(ch2)));
      }


    case 'L':
      switch (ch2) {
      case 'l':
        return isNegated ? RegexpNode.PROP_NOT_Ll : RegexpNode.PROP_Ll;
      case 'm':
        return isNegated ? RegexpNode.PROP_NOT_Lm : RegexpNode.PROP_Lm;
      case 'o':
        return isNegated ? RegexpNode.PROP_NOT_Lo : RegexpNode.PROP_Lo;
      case 't':  
        return isNegated ? RegexpNode.PROP_NOT_Lt : RegexpNode.PROP_Lt;
      case 'u': 
        return isNegated ? RegexpNode.PROP_NOT_Lu : RegexpNode.PROP_Lu;
        
      case '}':
        return isNegated ? RegexpNode.PROP_NOT_L : RegexpNode.PROP_L;
        
      default:
        throw error(L.l("invalid Unicode category {0}{1}",
                        badChar(ch), badChar(ch2)));
      }
    case 'M':
      switch (ch2) {
      case 'c':
        return isNegated ? RegexpNode.PROP_NOT_Mc : RegexpNode.PROP_Mc;
      case 'e':
        return isNegated ? RegexpNode.PROP_NOT_Me : RegexpNode.PROP_Me;
      case 'n':
        return isNegated ? RegexpNode.PROP_NOT_Mn : RegexpNode.PROP_Mn;
      default:
        throw error(L.l("invalid Unicode category {0}{1}",
                        badChar(ch), badChar(ch2)));
      }


    case 'N':
      switch (ch2) {
      case 'd':
        return isNegated ? RegexpNode.PROP_NOT_Nd : RegexpNode.PROP_Nd;
      case 'l':
        return isNegated ? RegexpNode.PROP_NOT_Nl : RegexpNode.PROP_Nl;
      case 'o':
        return isNegated ? RegexpNode.PROP_NOT_No : RegexpNode.PROP_No;
      default:
        throw error(L.l("invalid Unicode category {0}{1}",
                        badChar(ch), badChar(ch2)));
      }


    case 'P':
      switch (ch2) {
      case 'c':
        return isNegated ? RegexpNode.PROP_NOT_Pc : RegexpNode.PROP_Pc;
      case 'd':  
        return isNegated ? RegexpNode.PROP_NOT_Pd : RegexpNode.PROP_Pd;
      case 'e':
        return isNegated ? RegexpNode.PROP_NOT_Pe : RegexpNode.PROP_Pe;
      case 'f':
        return isNegated ? RegexpNode.PROP_NOT_Pf : RegexpNode.PROP_Pf;
      case 'i':     
        return isNegated ? RegexpNode.PROP_NOT_Pi : RegexpNode.PROP_Pi;
      case 'o':    
        return isNegated ? RegexpNode.PROP_NOT_Po : RegexpNode.PROP_Po;
      case 's':   
        return isNegated ? RegexpNode.PROP_NOT_Ps : RegexpNode.PROP_Ps;
      default:
        throw error(L.l("invalid Unicode category {0}{1}",
                        badChar(ch), badChar(ch2)));
      }


    case 'S':
      switch (ch2) {
      case 'c': 
        return isNegated ? RegexpNode.PROP_NOT_Sc : RegexpNode.PROP_Sc;
      case 'k':
        return isNegated ? RegexpNode.PROP_NOT_Sk : RegexpNode.PROP_Sk;
      case 'm':  
        return isNegated ? RegexpNode.PROP_NOT_Sm : RegexpNode.PROP_Sm;
      case 'o':
        return isNegated ? RegexpNode.PROP_NOT_So : RegexpNode.PROP_So;
      default:
        throw error(L.l("invalid Unicode category {0}{1}",
                        badChar(ch), badChar(ch2)));
      }


    case 'Z':
      switch (ch2) {
      case 'l':
        return isNegated ? RegexpNode.PROP_NOT_Zl : RegexpNode.PROP_Zl;
      case 'p':   
        return isNegated ? RegexpNode.PROP_NOT_Zp : RegexpNode.PROP_Zp;
      case 's':   
        return isNegated ? RegexpNode.PROP_NOT_Zs : RegexpNode.PROP_Zs;
      default:
        throw error(L.l("invalid Unicode category {0}{1}",
                        badChar(ch), badChar(ch2)));
      }
    }


    throw new UnsupportedOperationException();
  }
  
  private RegexpNode parseUnicodeProperty(int ch,
                                          boolean isNegated)
    throws IllegalRegexpException
  {
    switch (ch) {
      case 'C':
        return isNegated ? RegexpNode.PROP_NOT_C : RegexpNode.PROP_C;


      case 'L':
        return isNegated ? RegexpNode.PROP_NOT_L : RegexpNode.PROP_L;
        
      case 'M':
        return isNegated ? RegexpNode.PROP_NOT_M : RegexpNode.PROP_M;
        
      case 'N':
        return isNegated ? RegexpNode.PROP_NOT_N : RegexpNode.PROP_N;


      case 'P':
        return isNegated ? RegexpNode.PROP_NOT_P : RegexpNode.PROP_P;


      case 'S':
        return isNegated ? RegexpNode.PROP_NOT_S : RegexpNode.PROP_S;


      case 'Z':
        return isNegated ? RegexpNode.PROP_NOT_Z : RegexpNode.PROP_Z;
        
      default:
        throw new IllegalRegexpException("invalid Unicode property " +
                badChar(ch));
    }
  }
  
  /*
  static {
    _characterClassMap.put("alnum", RegexpNode.RC_ALNUM);
    _characterClassMap.put("alpha", RegexpNode.RC_ALPHA);
    _characterClassMap.put("blank", RegexpNode.RC_BLANK);
    _characterClassMap.put("cntrl", RegexpNode.RC_CNTRL);
    _characterClassMap.put("digit", RegexpNode.RC_DIGIT);
    _characterClassMap.put("graph", RegexpNode.RC_GRAPH);
    _characterClassMap.put("lower", RegexpNode.RC_LOWER);
    _characterClassMap.put("print", RegexpNode.RC_PRINT);
    _characterClassMap.put("punct", RegexpNode.RC_PUNCT);
    _characterClassMap.put("space", RegexpNode.RC_SPACE);
    _characterClassMap.put("upper", RegexpNode.RC_UPPER);
    _characterClassMap.put("xdigit", RegexpNode.RC_XDIGIT);
  }
  */
}
// Source Code of com.caucho.quercus.lib.regexp.Regcomp

// Related Classes of com.caucho.quercus.lib.regexp.Regcomp