// Package KFM
//
// Source Code of KFM.RegExp

/*
*  This software and supporting documentation were developed by
*
*    Siemens Corporate Technology
*    Competence Center Knowledge Management and Business Transformation
*    D-81730 Munich, Germany
*
*    Authors (representing a really great team ;-) )
*            Stefan B. Augustin, Thorbjørn Hansen, Manfred Langen
*
*  This software is Open Source under GNU General Public License (GPL).
*  Read the text of this license in LICENSE.TXT
*  or look at www.opensource.org/licenses/
*
*  Once more we emphasize, that:
*  THIS SOFTWARE IS MADE AVAILABLE,  AS IS,  WITHOUT ANY WARRANTY
*  REGARDING  THE  SOFTWARE,  ITS  PERFORMANCE OR
*  FITNESS FOR ANY PARTICULAR USE, FREEDOM FROM ANY COMPUTER DISEASES OR
*  ITS CONFORMITY TO ANY SPECIFICATION. THE ENTIRE RISK AS TO QUALITY AND
*  PERFORMANCE OF THE SOFTWARE IS WITH THE USER.
*
*/


// RegExp

// ************ package ******************************************************
// Up till 01.04.99 was package KFM.GUI.Templates;
package KFM;

// ************ imports ******************************************************

// KFM packages
import KFM.Language;
import KFM.Exceptions.ProgrammerException;

import com.oroinc.text.regex.*;

import java.io.*;
import java.util.Enumeration;
import java.util.NoSuchElementException;
import java.util.Hashtable;
import java.util.Properties;

/** **************************************************************************
*
* Simple interface to regular expressions à la Perl5.
*
* <P>Provides the static function `match', which offers a simple interface to Perl5-style regular
* expressions. That's all you need, and it's simple.</P>
*
* <pre>
* ====================
* History:
*    creation date: sometime in 03.99
* @author ThH
* @version 0.1
* --------------------
* Changes:
* 01.04.99 ThH: Move from package visibility in KFM.GUI.Templates to public visibility in KFM.
*
*/
public class RegExp {

    private static Hashtable mPatternMap = new Hashtable(); // Pattern -> compiled Pattern

    /**
     * A function with a simple interface that matches a Perl5 RegExp against text and returns information on
     * the first match and the matches of its parenthesized pairs.
     *
     *@param patternString  A string containing a Perl5 Regexp.
     *@param text           A text to be matched.
     *@param caseSensitive  If and only if true, match is case sensitive, that is "A"!="a" etc..
     *                      Default is `false� (see the function below).
     *@param aIsSingleLine  Here the regular expression metacharacter '.' matches
     *                      everything, even new lines ('\n'). See below.
     *
     *@return  When no match found returns `null�, else returns a `MatchResult� describing the first match. Let
     *         `mr� be the returnd `MatchResult�, then you can access the following information:
     *
     *         Choose i=0 for the whole match, and i=>1 for a parenthesized pair:
     *            mr.group(i)        text
     *            mr.begin(i)        start position counting from beginning of match,
     *                               that is, from `mr.beginOffset(0)�.
     *            mr.end(i)          end position counting from beginning of match.
     *            mr.beginOffset(i)  start position counting from beginning of string
     *            mr.endOffset(i)    end position counting from beginning of string
     *
     * See com.oroinc.text.regex.* and its documentation for information on syntax of regexps and `MatchResult�.
     */
    public static MatchResult match(
        String patternString, String text, boolean caseSensitive,
        boolean aIsSingleLine)
    {
        int groups;
        PatternMatcher matcher;
        PatternCompiler compiler;
        Pattern pattern = null;
        PatternMatcherInput input;

        matcher  = new Perl5Matcher();

        pattern = (Pattern) mPatternMap.get(patternString);
        if(pattern == null) {
            // Create Perl5Compiler and Perl5Matcher instances.
            compiler = new Perl5Compiler();

            // Attempt to compile the pattern. If the pattern is not valid, report the error and exit.
            try {
                if(caseSensitive) {
                    pattern = compiler.compile(patternString);
                } else {
                    pattern = compiler.compile(patternString,
                        Perl5Compiler.CASE_INSENSITIVE_MASK |
                        (aIsSingleLine ? Perl5Compiler.SINGLELINE_MASK : 0));
                }
            } catch(MalformedPatternException e) {
                System.err.println("RegExp.match: Bad pattern: `" + e.getMessage() + "�.");
                //@@@ System.exit(1);
                //@@@ Make this cleaner some day.
                throw new ProgrammerException("RegExp.match: Bad pattern: `" + e.getMessage() + "�.");
            }
            mPatternMap.put(patternString, pattern);
        }

        // Create a PatternMatcherInput instance to keep track of the position
        // where the last match finished, so that the next match search will
        // start from there.  You always create a PatternMatcherInput instance
        // when you want to search a string for all of the matches it contains,
        // and not just the first one.
        //
        // ThH: I do not know whether we need this. Check it some time.
        input = new PatternMatcherInput(text);

        if(matcher.contains(input, pattern)) {
            return matcher.getMatch();
        } else {
            return null;
        }
    }

    public static MatchResult match(
        String patternString, String text, boolean caseSensitive)
    {
        return match(patternString, text, caseSensitive, /* isSingleLine */ false);
    }

    /**
    * Method wich matches a patternstring with a contentstring.
    * Here the regular expression metacharacter '.' matches
    * everything, even new lines ('\n'). See code below:
    *
    * For more information about the implementation of regular expression ba OROMatcher see:
    * $/KFM/www-docs/protected/developer/external-docu/OROMatcher-1.0.7/doc/index.html
    *
    * @return MatchResult null if no match was found.
    */
    //@ Use `KFM.RegExp� or enhance it so that you can use it.
    //@ Note that you do not need to compile the pattern each time, it is constant.
    //@ Can you refactor that out?
    public static MatchResult match (
        String patternString,
        String text,
        boolean caseSensitive,
        boolean isSingleLineMask,
        int Offset)
    {
        int groups;
        PatternMatcher matcher = new Perl5Matcher();
        PatternCompiler compiler = new Perl5Compiler();
        Pattern pattern = null;
        PatternMatcherInput input;
        MatchResult result;

        // Wenn you set the Perl5Compiler.SINGLELINE_MASK option
        // the contentstring is treated singleline, even if there
        // are some '\n' in it.
        try {

            if(caseSensitive) {
                if (isSingleLineMask){

                    pattern = compiler.compile(patternString, Perl5Compiler.SINGLELINE_MASK);
                }
                else {
                    pattern = compiler.compile(patternString);
                }
            } else {
                if (isSingleLineMask){
                    pattern = compiler.compile(
                        patternString,
                        Perl5Compiler.SINGLELINE_MASK |
                        Perl5Compiler.CASE_INSENSITIVE_MASK);
                }
                else {
                    pattern = compiler.compile(patternString, Perl5Compiler.CASE_INSENSITIVE_MASK);
                }
            }
        } catch(MalformedPatternException e) {
            //@@@ System.exit(1);
            //@@@ Make this cleaner some day.
            throw new ProgrammerException("LinkSearch.HtmlParser.match: Bad pattern: `" + e.getMessage() + "�.");
        }
        input = new PatternMatcherInput(text);
        // For debugging purposes.
        //  KFMSystem.log.debug( text);

        // set the current Offset to prevent
        // that the matcher starts again from the beginning of the string
        input.setCurrentOffset( Offset );

        if(matcher.contains(input, pattern)) {
            result = matcher.getMatch();
        } else {
            result = null;
        }
        return result;
    }


    /**
     * Match *case insensitive* Perl5 RegExp against text and return information on the first match and the
     * matches of its parenthesized pairs.
     *
     * @see #match(String, String, boolean)
     */
    public static MatchResult match(String patternString, String text)
    {
        return match(patternString, text, false /* not case sensitive */);
    }

    /**
     * Quote all regexp metacharacters in a string.
     *
     * Given a character string, returns a Perl5 expression that interprets each character of the original
     * string literally. In other words, all special metacharacters are quoted/escaped. This method is useful
     * for converting user input meant for literal interpretation into a safe regular expression representing
     * the literal input.
     *
     * Note: All this method does is call com.oroinc.text.regex.Perl5Compiler.quotemeta(String),
     * it is repeated here for easy reference.
     */
    public static final String quotemeta (String expression)
    {
        return com.oroinc.text.regex.Perl5Compiler.quotemeta(expression);
    }

    // ************************************************************
    // Debugging stuff
    // ************************************************************

    /**
     * Demo of RegExp package for testing purposes only.
     *
     * Reports all the matches of the Perl5 regular expression <code>patternP</code> on the string <code>text</code>.
     *
     * Note: Rename to <code>main</code> to run.
     *
     * Possible bug: Apparently only reports first match.
     *
     * @param args[]  Ignored.
     */
    public static final void mainTestRegExp(String args[])
    {
        String text = "Na <KFM A=1 B=\"b\" C>";

        // `patternP� is the probably a correct regexp for a HTML command, but I did not get it to work.
        //
        // The problem is that I can't extract the attribute/value pairs from `attrvaluesP� directly from `patternP�
        // because the regexp package does only provide access to the last value of `(...)*�, not to all values.
        //
        // I tried matching `attrvaluesP� again, but then ATTR=VALUE produced a match for the substring `ATTR�, which is
        // in fact a correct match but not the one I was hoping for. Ahh! Now I know! I could use the pattern
        //
        //   attrvalueP + attrvaluesP
        //
        // and move upwards. That would actually work.
        //
        // I could also try to get a better implementation of regexps.
        //
        // Note:
        // - \s: whitespace
        // - \w: word
        //
        final String tagP   = "\\w+";
        final String attrP  = "([-_a-zA-Z0-9]+)";
        final String valueP = "([-_a-zA-Z0-9.%]+|\"[^\"]*\")";
                                             // Note: Parenthesis group includes quotes.
                                             // That might be a bad idea, I don't know yet.
        final String attrWithValueP = attrP + "\\s*=\\s*" + valueP;
        final String attrWithOptValueP = attrP + "(\\s*=\\s*" + valueP + "|)";
        final String attrsP = "(?:\\s+" + attrWithOptValueP + ")*";
        final String patternP = "\\<\\s*(" + tagP + ")(" + attrsP + ")\\s*\\>";

        int groups;
        PatternMatcher matcher;
        PatternCompiler compiler;
        Pattern pattern = null;
        PatternMatcherInput input;
        MatchResult result;

        // Create Perl5Compiler and Perl5Matcher instances.
        compiler = new Perl5Compiler();
        matcher  = new Perl5Matcher();

        // Attempt to compile the pattern.  If the pattern is not valid,
        // report the error and exit.
        try {
            pattern = compiler.compile(patternP);
        } catch(MalformedPatternException e) {
            System.err.println("RegExp.main...: Bad pattern: `" + e.getMessage() + "�.");
            //@@@ System.exit(1);
            //@@@ Make this cleaner some day.
            throw new ProgrammerException("RegExp.main...: Bad pattern: `" + e.getMessage() + "�.");
        }

        // Create a PatternMatcherInput instance to keep track of the position
        // where the last match finished, so that the next match search will
        // start from there.  You always create a PatternMatcherInput instance
        // when you want to search a string for all of the matches it contains,
        // and not just the first one.
        input   = new PatternMatcherInput(text);

        // Loop until there are no more matches left.
        while(matcher.contains(input, pattern)) {
            // Since we're still in the loop, fetch match that was found.
            result = matcher.getMatch();

            // Perform whatever processing on the result you want.
            // Here we just print out all its elements to show how the
            // MatchResult methods are used.

            // The toString() method is provided as a convenience method.
            // It returns the entire match.  The following are all equivalent:
            //     System.out.println("Match: " + result);
            //     System.out.println("Match: " + result.toString());
            //     System.out.println("Match: " + result.group(0));
            System.out.println("Match: " + result.toString());

            // Print the length of the match.  The length() method is another
            // convenience method.  The lengths of subgroups can be obtained
            // by first retrieving the subgroup and then calling the string's
            // length() method.
            System.out.println("Length: " + result.length());

            // Retrieve the number of matched groups.  A group corresponds to
            // a parenthesized set in a pattern.
            groups = result.groups();
            System.out.println("Groups: " + groups);

            // Print the offset into the input of the beginning and end of the
            // match.  The beinOffset() and endOffset() methods return the
            // offsets of a group relative to the beginning of the input.  The
            // begin() and end() methods return the offsets of a group relative
            // the to the beginning of a match.
            System.out.println("Begin offset: " + result.beginOffset(0));
            System.out.println("End offset: " + result.endOffset(0));
            System.out.println("Groups: ");

            // Print the contents of each matched subgroup along with their
            // offsets relative to the beginning of the entire match.

            // Start at 1 because we just printed out group 0
            for(int group = 1; group < groups; group++) {
                System.out.println(group + ": " + result.group(group));
                System.out.println("Begin: " + result.begin(group));
                System.out.println("End: " + result.end(group));
            }
        }
    }
}
// TOP
//
// Related Classes of KFM.RegExp
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.