// Source Code of client.net.sf.saxon.ce.regex.ARegexIterator

package client.net.sf.saxon.ce.regex;


import client.net.sf.saxon.ce.event.Receiver;
import client.net.sf.saxon.ce.expr.XPathContext;
import client.net.sf.saxon.ce.expr.z.IntHashMap;
import client.net.sf.saxon.ce.expr.z.IntToIntHashMap;
import client.net.sf.saxon.ce.om.Item;
import client.net.sf.saxon.ce.om.SequenceIterator;
import client.net.sf.saxon.ce.trans.XPathException;
import client.net.sf.saxon.ce.tree.iter.ArrayIterator;
import client.net.sf.saxon.ce.tree.iter.EmptyIterator;
import client.net.sf.saxon.ce.tree.util.FastStringBuffer;
import client.net.sf.saxon.ce.value.StringValue;


import java.util.ArrayList;
import java.util.List;


/**
 * Class ARegexIterator - provides an iterator over matched and unmatched substrings.
 * This implementation of RegexIterator uses the modified Jakarta regular expression engine.
*/


public class ARegexIterator implements RegexIterator {


    private UnicodeString theString;   // the input string being matched
    private UnicodeString regex;
    private REMatcher matcher;    // the Matcher object that does the matching, and holds the state
    private UnicodeString current;     // the string most recently returned by the iterator
    private UnicodeString next;        // if the last string was a matching string, null; otherwise the next substring
                                //        matched by the regex
    private int position = 0;   // the value of XPath position()
    private int prevEnd = 0;    // the position in the input string of the end of the last match or non-match
    private IntToIntHashMap nestingTable = null;
                                // evaluated on demand: a table that indicates for each captured group,
                                // what its immediately-containing captured group is.


    /**
    * Construct a RegexIterator. Note that the underlying matcher.find() method is called once
    * to obtain each matching substring. But the iterator also returns non-matching substrings
    * if these appear between the matching substrings.
    * @param string the string to be analysed
    * @param matcher a matcher for the regular expression
    */


    public ARegexIterator(UnicodeString string, UnicodeString regex, REMatcher matcher) {
        theString = string;
        this.regex = regex;
        this.matcher = matcher;
        next = null;
    }


    /**
    * Get the next item in the sequence
    * @return the next item in the sequence
    */


    public Item next() {
        if (next == null && prevEnd >= 0) {
            // we've returned a match (or we're at the start), so find the next match
            if (matcher.match(theString, prevEnd)) {
                int start = matcher.getParenStart(0);
                int end = matcher.getParenEnd(0);
                if (prevEnd == start) {
                    // there's no intervening non-matching string to return
                    next = null;
                    current = theString.substring(start, end);
                    prevEnd = end;
                } else {
                    // return the non-matching substring first
                    current = theString.substring(prevEnd, start);
                    next = theString.substring(start, end);
                }
            } else {
                // there are no more regex matches, we must return the final non-matching text if any
                if (prevEnd < theString.length()) {
                    current = theString.substring(prevEnd, theString.length());
                    next = null;
                } else {
                    // this really is the end...
                    current = null;
                    position = -1;
                    prevEnd = -1;
                    return null;
                }
                prevEnd = -1;
            }
        } else {
            // we've returned a non-match, so now return the match that follows it, if there is one
            if (prevEnd >= 0) {
                current = next;
                next = null;
                prevEnd = matcher.getParenEnd(0);
            } else {
                current = null;
                position = -1;
                return null;
            }
        }
        position++;
        return currentStringValue();
    }


    private StringValue currentStringValue() {
        if (current instanceof BMPString) {
            return StringValue.makeStringValue(((BMPString)current).getCharSequence());
        } else {
            return StringValue.makeStringValue(current.toString());
        }
    }


    /**
    * Get the current item in the sequence
    * @return the item most recently returned by next()
    */


    public Item current() {
        return currentStringValue();
    }


    /**
    * Get the position of the current item in the sequence
    * @return the position of the item most recently returned by next(), starting at 1
    */


    public int position() {
        return position;
    }


    public void close() {
    }


    /**
    * Get another iterator over the same items
    * @return a new iterator, positioned before the first item
    */


    /*@NotNull*/
    public SequenceIterator getAnother() {
        return new ARegexIterator(theString, regex, new REMatcher(matcher.getProgram()));
    }


    /**
     * Get properties of this iterator, as a bit-significant integer.
     *
     * @return the properties of this iterator. It is always
     *         acceptable to return the value zero, indicating that there are no known special properties.
     *         It is acceptable for the properties of the iterator to change depending on its state.
     */


    public int getProperties() {
        return 0;
    }


    /**
    * Determine whether the current item is a matching item or a non-matching item
    * @return true if the current item (the one most recently returned by next()) is
    * an item that matches the regular expression, or false if it is an item that
    * does not match
    */


    public boolean isMatching() {
        return next == null && prevEnd >= 0;
    }


    /**
    * Get a substring that matches a parenthesised group within the regular expression
    * @param number    the number of the group to be obtained
    * @return the substring of the current item that matches the n'th parenthesized group
    * within the regular expression
    */


    public String getRegexGroup(int number) {
        if (!isMatching()) {
            return null;
        }
        if (number >= matcher.getParenCount() || number < 0) return "";
        UnicodeString us = matcher.getParen(number);
        return (us == null ? "" : us.toString());
    }


    /**
     * Get a sequence containing all the regex groups (except group 0, because we want to use indexing from 1).
     * This is used by the saxon:analyze-string() higher-order extension function.
     */


    public SequenceIterator getRegexGroupIterator() {
        int c = matcher.getParenCount() - 1;
        if (c == 0) {
            return EmptyIterator.getInstance();
        } else {
            StringValue[] groups = new StringValue[c];
            for (int i=1; i<=groups.length; i++) {
                groups[i-1] = StringValue.makeStringValue(matcher.getParen(i).toString());
            }
            return new ArrayIterator(groups);
        }
    }


    /**
     * Process a matching substring, performing specified actions at the start and end of each captured
     * subgroup. This method will always be called when operating in "push" mode; it writes its
     * result to context.getReceiver(). The matching substring text is all written to the receiver,
     * interspersed with calls to the {@link RegexIterator.OnGroup} methods onGroupStart() and onGroupEnd().
     * @param context the dynamic evaluation context
     * @param action defines the processing to be performed at the start and end of a group
     */


    public void processMatchingSubstring(XPathContext context, OnGroup action) throws XPathException {
        Receiver out = context.getReceiver();
        int c = matcher.getParenCount()-1;
        if (c == 0) {
            out.characters(current.toString());
        } else {
            // Create a map from positions in the string to lists of actions.
            // The "actions" in each list are: +N: start group N; -N: end group N.
            IntHashMap<List<Integer>> actions = new IntHashMap<List<Integer>>(c);
            for (int i=1; i<=c; i++) {
                int start = matcher.getParenStart(i) - matcher.getParenStart(0);
                if (start != -1) {
                    int end = matcher.getParenEnd(i) - matcher.getParenStart(0);
                    if (start < end) {
                        // Add the start action after all other actions on the list for the same position
                        List<Integer> s = actions.get(start);
                        if (s == null) {
                            s = new ArrayList<Integer>(4);
                            actions.put(start, s);
                        }
                        s.add(i);
                        // Add the end action before all other actions on the list for the same position
                        List<Integer> e = actions.get(end);
                        if (e == null) {
                            e = new ArrayList<Integer>(4);
                            actions.put(end, e);
                        }
                        e.add(0, -i);
                    } else {
                        // zero-length group (start==end). The problem here is that the information available
                        // from Java isn't sufficient to determine the nesting of groups: match("a", "(a(b?))")
                        // and match("a", "(a)(b?)") will both give the same result for group 2 (start=1, end=1).
                        // So we need to go back to the original regex to determine the group nesting
                        if (nestingTable == null) {
                            computeNestingTable();
                        }
                        int parentGroup = nestingTable.get(i);
                        // insert the start and end events immediately before the end event for the parent group,
                        // if present; otherwise after all existing events for this position
                        List<Integer> s = actions.get(start);
                        if (s == null) {
                            s = new ArrayList<Integer>(4);
                            actions.put(start, s);
                            s.add(i);
                            s.add(-i);
                        } else {
                            int pos = s.size();
                            for (int e=0; e<s.size(); e++) {
                                if (s.get(e) == -parentGroup) {
                                    pos = e;
                                    break;
                                }
                            }
                            s.add(pos, -i);
                            s.add(pos, i);
                        }


                    }
                }


            }
            FastStringBuffer buff = new FastStringBuffer(current.length());
            for (int i=0; i < current.length()+1; i++) {
                List<Integer> events = actions.get(i);
                if (events != null) {
                    if (buff.length() > 0) {
                        out.characters(buff);
                        buff.setLength(0);
                    }
                    for (Integer group : events) {
                        if (group > 0) {
                            action.onGroupStart(context, group);
                        } else {
                            action.onGroupEnd(context, -group);
                        }
                    }
                }
                if (i < current.length()) {
                    buff.appendWideChar(current.charAt(i));
                }
            }
            if (buff.length() > 0) {
                out.characters(buff);
            }
        }


    }


    /**
     * Compute a table showing for each captured group number (opening paren in the regex),
     * the number of its parent group. This is done by reparsing the source of the regular
     * expression. This is needed when the result of a match includes an empty group, to determine
     * its position relative to other groups finishing at the same character position.
     */


    private void computeNestingTable() {
        nestingTable = new IntToIntHashMap(16);
        UnicodeString s = regex;
        int[] stack = new int[s.length()];
        int tos = 0;
        int group = 1;
        int inBrackets = 0;
        stack[tos++] = 0;
        for (int i=0; i<s.length(); i++) {
            int ch = s.charAt(i);
            if (ch == '\'') {
                i++;
            } else if (ch == '[') {
                inBrackets++;
            } else if (ch == ']') {
                inBrackets--;
            } else if (ch == '(' && s.charAt(i+1) != '?' && inBrackets == 0) {
                nestingTable.put(group, stack[tos-1]);
                stack[tos++] = group++;
            } else if (ch == ')' && inBrackets == 0) {
                tos--;
            }
        }
    }




}


// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. 
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is “Incompatible With Secondary Licenses”, as defined by the Mozilla Public License, v. 2.0.
// Source Code of client.net.sf.saxon.ce.regex.ARegexIterator

// Related Classes of client.net.sf.saxon.ce.regex.ARegexIterator