Package org.jkff.ire.regex

Source Code of org.jkff.ire.regex.RegexCompiler

package org.jkff.ire.regex;

import org.jkff.ire.DFARopePatternSet;
import org.jkff.ire.PatternSet;
import org.jkff.ire.fa.*;
import org.jkff.ire.util.CoarsestPartition;
import org.jkff.ire.util.WrappedBitSet;
import org.jkff.ire.util.Pair;

import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;

import static org.jkff.ire.util.CollectionFactory.*;

/**
* Created on: 01.09.2010 23:43:07
*/
public class RegexCompiler {
    public static PatternSet compile(List<RxNode> roots) {
        return new DFARopePatternSet(compileToBiDFA(roots));
    }

    public static PatternSet compile(String... regexes) {
        List<RxNode> roots = newArrayList();
        for(String regex : regexes) {
            roots.add(RegexParser.parse(regex));
        }
        return compile(roots);
    }

    static BiDFA<Character, PowerIntState> compileToBiDFA(List<RxNode> roots) {
        List<RxNode> rootsAnywhere = newArrayList();
        List<RxNode> reversedRoots = newArrayList();
        for(RxNode root : roots) {
            Alternative dotStar = new Alternative(new Empty(), new OnceOrMore(CharacterClass.ANY_CHAR));
            rootsAnywhere.add(new Sequence(dotStar, new Sequence(root, dotStar)));
            reversedRoots.add(new Sequence(reverse(root), dotStar));
        }
        return new BiDFA<Character, PowerIntState>(compileToDFA(rootsAnywhere), compileToDFA(reversedRoots));
    }

    static DFA<Character, PowerIntState> compileToDFA(List<RxNode> rxNodes) {
        if(rxNodes.isEmpty()) {
            throw new IllegalArgumentException("Pattern list can't be empty");
        }
        List<RxNode> labeled = newArrayList();
        for(int i = 0; i < rxNodes.size(); ++i) {
            labeled.add(new Labeled(rxNodes.get(i), i));
        }
        RxNode alt = labeled.get(0);
        for(int i = 1; i < rxNodes.size(); ++i) {
            alt = new Alternative(alt, labeled.get(i));
        }

        return toDFA(reduceNFA(toNFA(alt)), rxNodes.size());
    }

    /**
     * Converts an NFA into a DFA whose states are {@link PowerIntState}s, i.e.
     * bit sets over the nodes returned by {@link #optimize(NFA)}.
     * <p>
     * Transition functions are computed lazily, one per input character, and
     * cached for subsequent lookups.
     *
     * @param nfa         automaton to convert
     * @param numPatterns number of patterns; used to size the per-state
     *                    "terminated patterns" bit sets
     * @return the resulting DFA, starting in the state that contains only the
     *         optimized automaton's initial node
     */
    static DFA<Character, PowerIntState> toDFA(NFA nfa, int numPatterns) {
        Pair<Set<NFA.Node>, NFA.Node> opt = optimize(nfa);

        Set<NFA.Node> allNodes = opt.first;
        NFA.Node newInitial = opt.second;

        final int numStates = allNodes.size();

        // Number the nodes: a node's id is its index in id2node.
        final Map<NFA.Node, Integer> node2id = newLinkedHashMap();
        final NFA.Node[] id2node = allNodes.toArray(new NFA.Node[allNodes.size()]);

        for(int i = 0; i < id2node.length; ++i) {
            node2id.put(id2node[i], i);
        }

        // One basis state per node, carrying the set of pattern ids attached to that node.
        final State[] basis = new State[numStates];
        for(int i = 0; i < numStates; ++i) {
            WrappedBitSet terminatedPatterns = new WrappedBitSet(numPatterns);
            for(int pat : id2node[i].patternIds) {
                terminatedPatterns.set(pat);
            }
            basis[i] = new IntState(i, terminatedPatterns);
        }

        TransferTable<Character,PowerIntState> transfer = new TransferTable<Character, PowerIntState>() {
            // Cache of transfer functions indexed by char value; entries are filled on first use.
            private TransferFunction[] transfer = new TransferFunction[Character.MAX_VALUE+1];

            // Returns the cached transfer function for the token, computing it on a cache miss.
            public TransferFunction<PowerIntState> forToken(Character token) {
                char t = token;
                TransferFunction<PowerIntState> f = transfer[t];
                if(f == null) {
                    transfer[t] = f = computeTransferFor(t);
                }
                return f;
            }

            // For every node i, collects the ids of all nodes reachable from i over
            // an edge whose character class accepts the token.
            private TransferFunction<PowerIntState> computeTransferFor(char token) {
                WrappedBitSet[] state2next = new WrappedBitSet[numStates];
                for(int i = 0; i < numStates; ++i) {
                    WrappedBitSet res = new WrappedBitSet(numStates);
                    NFA.Node node = id2node[i];
                    for (Pair<CharacterClass, NFA.Node> out : node.out) {
                        if(out.first.acceptsChar(token)) {
                            res.set(node2id.get(out.second));
                        }
                    }
                    state2next[i] = res;
                }
                return new PowerIntTable(state2next);
            }
        };

        // The initial DFA state contains only the initial node.
        final WrappedBitSet justInitial = new WrappedBitSet(numStates);
        justInitial.set(node2id.get(newInitial));
        PowerIntState initial = new PowerIntState(basis, justInitial);

        // Disabled debugging aid: prints the transition graph for the single
        // character 't' in Graphviz "dot" format.
//        StringBuilder dot = new StringBuilder();
//        dot.append("digraph g {\n");
//        for(int i = 0; i < numStates; ++i) {
//            WrappedBitSet justThis = new WrappedBitSet(numStates);
//            justThis.set(i);
//            PowerIntState state = new PowerIntState(basis, justThis);
//            dot.append(i + " [shape=" + (state.getTerminatedPatterns().isEmpty() ? "circle" : "square") + "]\n");
//        }
//        for(int i = 0; i < numStates; ++i) {
//            WrappedBitSet justThis = new WrappedBitSet(numStates);
//            justThis.set(i);
//            PowerIntState state = new PowerIntState(basis, justThis);
//            PowerIntState nextState = transfer.forToken('t').next(state);
//            WrappedBitSet next = nextState.getSubset();
//            for(int bit = next.nextSetBit(0); bit != -1; bit = next.nextSetBit(bit+1)) {
//                dot.append(i + " -> " + bit + "\n");
//            }
//        }
//        dot.append("}\n");
//        System.out.println(dot);

        return new DFA<Character, PowerIntState>(transfer, initial, PowerIntTable.REDUCER) {
            // Produces a copy of the state without any substate that terminates
            // the given pattern, and with the initial substate switched on.
            @Override
            public PowerIntState resetTerminatedPattern(PowerIntState state, int pattern) {
                WrappedBitSet reset = new WrappedBitSet(basis.length);
                reset.or(state.getSubset());
                for(int substate = reset.nextSetBit(0); substate != -1; substate = reset.nextSetBit(substate + 1)) {
                    if(basis[substate].getTerminatedPatterns().get(pattern)) {
                        reset.clear(substate);
                    }
                }
                reset.or(justInitial);
                return new PowerIntState(basis, reset);
            }
        };
    }

    private static Pair<Set<NFA.Node>, NFA.Node> optimize(NFA nfa) {
        Pair<Set<NFA.Node>, NFA.Node> eClosure = computeEClosure(nfa);
        Pair<Set<NFA.Node>, NFA.Node> groupedLeft = groupEquivalentStates(eClosure, true);
        Pair<Set<NFA.Node>, NFA.Node> groupedRight = groupEquivalentStates(groupedLeft, false);

        return groupedRight;
    }

    /**
     * Merges states of an epsilon-free automaton that fall into the same block
     * of a partition refined by {@link CoarsestPartition#coarsestStablePartition}.
     * <p>
     * The initial partition groups nodes by their set of pattern ids. It is then
     * refined once per distinct edge label, over and over, until a full sweep
     * over the labels leaves it unchanged. Finally one new node is created per
     * block and the edges are redirected to the new nodes.
     *
     * @param nfa          node set and initial node of the automaton to reduce
     * @param leftNotRight if {@code true}, edges are handed to the partitioner as
     *                     {source, destination}; otherwise as {destination, source}
     * @return node set and initial node of the reduced automaton
     */
    private static Pair<Set<NFA.Node>, NFA.Node> groupEquivalentStates(
            Pair<Set<NFA.Node>, NFA.Node> nfa, boolean leftNotRight)
    {
        // See paper "On NFA reductions".
        Set<NFA.Node> nodes = nfa.first;
        NFA.Node initial = nfa.second;

        // Nodes terminating different patterns are different.
        // Blocks are numbered in order of first appearance of each pattern-id set.
        Map<Set<Integer>, Integer> patIds2block = newLinkedHashMap();
        Map<NFA.Node, Integer> node2block = newLinkedHashMap();
        for(NFA.Node node : nodes) {
            Integer block = patIds2block.get(node.patternIds);
            if(block == null) {
                patIds2block.put(node.patternIds, block = patIds2block.size());
            }
            node2block.put(node, block);
        }

        // Number the nodes: a node's id is its index in id2node.
        NFA.Node[] id2node = nodes.toArray(new NFA.Node[nodes.size()]);
        Map<NFA.Node, Integer> node2id = newLinkedHashMap();
        for(int i = 0; i < id2node.length; ++i) {
            node2id.put(id2node[i], i);
        }

        // p[i] is the block of node i in the current partition.
        int[] p = new int[id2node.length];
        for(int i = 0; i < id2node.length; ++i) {
            p[i] = node2block.get(id2node[i]);
        }

        // Instead of iterating over the whole unicode alphabet,
        // let us iterate over the distinct labels of the automaton.
        Set<CharacterClass> alphabet = newLinkedHashSet();
        for(NFA.Node node : nodes) {
            for (Pair<CharacterClass, NFA.Node> out : node.out) {
                alphabet.add(out.first);
            }
        }

        // Refine the partition label by label until a whole sweep changes nothing.
        boolean anythingChanged;
        do {
            anythingChanged = false;
            for(CharacterClass c : alphabet) {
                List<int[]> edges = newArrayList();
                for(int i = 0; i < id2node.length; ++i) {
                    NFA.Node node = id2node[i];
                    for (Pair<CharacterClass, NFA.Node> out : node.out) {
                        // When splitting by a particular label, say, [agc],
                        // we should take into account all edges that might
                        // be triggered by any of the characters accepted
                        // by this label. For example, a "." edge should be used.
                        if(out.first.intersects(c)) {
                            int j = node2id.get(out.second);
                            // The edge direction is flipped for the "right" pass.
                            edges.add(leftNotRight ? new int[] {i,j} : new int[] {j, i});
                        }
                    }
                }
                // NOTE(review): presumably returns a refinement of p that is stable
                // with respect to the given edges - confirm against CoarsestPartition.
                int[] newP = CoarsestPartition.coarsestStablePartition(p, edges.toArray(new int[edges.size()][]));
                if(!Arrays.equals(p, newP)) {
                    anythingChanged = true;
                    p = newP;
                }
            }
        } while(anythingChanged);

        // Group nodes of the nfa according to 'p'.
        Map<Integer, NFA.Node> block2newNode = newLinkedHashMap();
        Map<Integer,List<Integer>> block2oldNodeIds = newLinkedHashMap();
        for (int i = 0; i < p.length; i++) {
            int b = p[i];
            if (!block2newNode.containsKey(b)) {
                NFA.Node newNode = new NFA.Node();
                block2newNode.put(b, newNode);
                block2oldNodeIds.put(b, new ArrayList<Integer>());
            }
            block2oldNodeIds.get(b).add(i);
        }
        // Each new node inherits the pattern ids and the outgoing edges of all
        // old nodes in its block, with destinations mapped to their blocks' new nodes.
        for(int b : block2newNode.keySet()) {
            NFA.Node newNode = block2newNode.get(b);
            for(int oldNodeId : block2oldNodeIds.get(b)) {
                NFA.Node oldNode = id2node[oldNodeId];
                newNode.patternIds.addAll(oldNode.patternIds);
                for (Pair<CharacterClass, NFA.Node> out : oldNode.out) {
                    CharacterClass cc = out.first;
                    NFA.Node dest = out.second;
                    NFA.Node newDest = block2newNode.get(p[node2id.get(dest)]);
                    Pair<CharacterClass, NFA.Node> edge = Pair.of(cc, newDest);
                    // Skip edges that are already present on the merged node.
                    if(!newNode.out.contains(edge))
                        newNode.out.add(edge);
                }
            }
        }
        // The new initial node is the one that represents the old initial node's block.
        NFA.Node newInitial = block2newNode.get(p[node2id.get(initial)]);
        Set<NFA.Node> newNodes = new HashSet<NFA.Node>(block2newNode.values());
        return Pair.of(newNodes, newInitial);
    }

    private static Pair<Set<NFA.Node>, NFA.Node> computeEClosure(NFA nfa) {
        final Map<NFA.Node, Set<NFA.Node>> node2closure = newLinkedHashMap();

        Set<NFA.Node> allOldNodes = dfs(nfa.begin, true);
        for(NFA.Node node : allOldNodes) {
            node2closure.put(node, dfs(node, false));
        }

        final Map<NFA.Node, Set<NFA.Node>> newNode2contents = newLinkedHashMap();
        final Map<Set<NFA.Node>, NFA.Node> contents2newNode = newLinkedHashMap();
        Set<NFA.Node> newNodesToVisit = newLinkedHashSet();
        Set<NFA.Node> initialEC = node2closure.get(nfa.begin);
        NFA.Node newInitial = new NFA.Node();
        for(NFA.Node subNode : initialEC) {
            newInitial.patternIds.addAll(subNode.patternIds);
        }
        newNodesToVisit.add(newInitial);
        newNode2contents.put(newInitial, initialEC);
        contents2newNode.put(initialEC, newInitial);
        while(!newNodesToVisit.isEmpty()) {
            NFA.Node newNode = newNodesToVisit.iterator().next();
            newNodesToVisit.remove(newNode);
            Map<CharacterClass, Set<NFA.Node>> class2dest = newLinkedHashMap();
            for(NFA.Node subNode : newNode2contents.get(newNode)) {
                for(Pair<CharacterClass, NFA.Node> out : subNode.out) {
                    if(out.first == null) {
                        // Skip epsilon transitions: we're operating on epsilon closures
                        continue;
                    }
                    Set<NFA.Node> dest = class2dest.get(out.first);
                    if(dest == null) {
                        class2dest.put(out.first, dest = newLinkedHashSet());
                    }
                    dest.addAll(node2closure.get(out.second));
                }
            }
            for(CharacterClass cc : class2dest.keySet()) {
                Set<NFA.Node> dest = class2dest.get(cc);
                NFA.Node newDest = contents2newNode.get(dest);
                if(newDest == null) {
                    newDest = new NFA.Node();
                    for(NFA.Node subNode : dest) {
                        newDest.patternIds.addAll(subNode.patternIds);
                    }
                    newNode2contents.put(newDest, dest);
                    contents2newNode.put(dest, newDest);
                    newNodesToVisit.add(newDest);
                }
                newNode.transition(cc, newDest);
            }
        }

        return Pair.of(newNode2contents.keySet(), newInitial);
    }

    static Set<NFA.Node> dfs(NFA.Node origin, boolean acceptNonEps) {
        Set<NFA.Node> res = newLinkedHashSet();
        Stack<NFA.Node> toVisit = new Stack<NFA.Node>();
        toVisit.add(origin);
        while(!toVisit.isEmpty()) {
            NFA.Node node = toVisit.pop();
            if(!res.add(node))
                continue;
            for (Pair<CharacterClass, NFA.Node> out : node.out) {
                if(out.first == null || acceptNonEps) {
                    toVisit.push(out.second);
                }
            }
        }
        return res;
    }

    /**
     * Placeholder for an NFA reduction step; currently hands back
     * {@code nfa} unchanged.
     *
     * @param nfa automaton to reduce
     * @return the very same {@code nfa} instance
     */
    static NFA reduceNFA(NFA nfa) {
        // TODO: no reduction is implemented yet; the input is passed through as is.
        return nfa;
    }

    static NFA toNFA(RxNode rxNode) {
        if(rxNode instanceof Alternative) {
            Alternative x = (Alternative) rxNode;
            NFA res = new NFA(new NFA.Node(), new NFA.Node());
            NFA a = toNFA(x.a), b = toNFA(x.b);
            res.begin.transition(null, a.begin);
            res.begin.transition(null, b.begin);
            a.end.transition(null, res.end);
            b.end.transition(null, res.end);
            return res;
        } else if(rxNode instanceof CharacterClass) {
            NFA res = new NFA(new NFA.Node(), new NFA.Node());
            res.begin.transition((CharacterClass) rxNode, res.end);
            return res;
        } else if(rxNode instanceof Empty) {
            NFA res = new NFA(new NFA.Node(), new NFA.Node());
            res.begin.transition(null, res.end);
            return res;
        } else if(rxNode instanceof OnceOrMore) {
            OnceOrMore x = (OnceOrMore) rxNode;
            NFA res = toNFA(x.a);
            res.end.transition(null, res.begin);
            return res;
        } else if(rxNode instanceof Sequence) {
            Sequence x = (Sequence) rxNode;
            NFA a = toNFA(x.a), b = toNFA(x.b);
            NFA res = new NFA(a.begin, b.end);
            a.end.transition(null, b.begin);
            return res;
        } else if(rxNode instanceof Labeled) {
            Labeled x = (Labeled) rxNode;
            NFA a = toNFA(x.a);
            a.end.patternIds.add(x.patternId);
            return a;
        } else {
            throw new UnsupportedOperationException("Unsupported node type " + rxNode.getClass());
        }
    }

    static class NFA {
        final Node begin, end;

        public NFA(Node begin, Node end) {
            this.begin = begin;
            this.end = end;
        }

        static class Node {
            static AtomicInteger nextId = new AtomicInteger(0);

            final List<Pair<CharacterClass, Node>> out = newArrayList();
            final Set<Integer> patternIds = newLinkedHashSet();
            final int id = nextId.incrementAndGet();

            void transition(CharacterClass cc, Node dest) {
                out.add(Pair.<CharacterClass, Node>of(cc, dest));
            }

            public String toString() {
                return ""+id;
            }

            public boolean equals(Object o) {
                return id == ((Node) o).id;
            }
            public int hashCode() {
                return id;
            }
        }
    }

    private static RxNode reverse(RxNode rxNode) {
        if(rxNode instanceof Alternative) {
            Alternative x = (Alternative) rxNode;
            return new Alternative(reverse(x.a), reverse(x.b));
        } else if(rxNode instanceof CharacterClass) {
            return rxNode;
        } else if(rxNode instanceof Empty) {
            return rxNode;
        } else if(rxNode instanceof OnceOrMore) {
            OnceOrMore x = (OnceOrMore) rxNode;
            return new OnceOrMore(reverse(x.a));
        } else if(rxNode instanceof Sequence) {
            Sequence x = (Sequence) rxNode;
            // The only interesting case.
            return new Sequence(reverse(x.b), reverse(x.a));
        } else if(rxNode instanceof Labeled) {
            Labeled x = (Labeled) rxNode;
            return new Labeled(reverse(x.a), x.patternId);
        } else {
            throw new UnsupportedOperationException("Unsupported node type " + rxNode.getClass());
        }
    }
}
TOP

Related Classes of org.jkff.ire.regex.RegexCompiler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.