/*
* This software and supporting documentation were developed by
*
* Siemens Corporate Technology
* Competence Center Knowledge Management and Business Transformation
* D-81730 Munich, Germany
*
* Authors (representing a really great team ;-) )
* Stefan B. Augustin, Thorbjørn Hansen, Manfred Langen
*
* This software is Open Source under GNU General Public License (GPL).
* Read the text of this license in LICENSE.TXT
* or look at www.opensource.org/licenses/
*
* Once more we emphasize, that:
* THIS SOFTWARE IS MADE AVAILABLE, AS IS, WITHOUT ANY WARRANTY
* REGARDING THE SOFTWARE, ITS PERFORMANCE OR
* FITNESS FOR ANY PARTICULAR USE, FREEDOM FROM ANY COMPUTER DISEASES OR
* ITS CONFORMITY TO ANY SPECIFICATION. THE ENTIRE RISK AS TO QUALITY AND
* PERFORMANCE OF THE SOFTWARE IS WITH THE USER.
*
*/
// RegExp
// ************ package ******************************************************
// Up till 01.04.99 was package KFM.GUI.Templates;
package KFM;
// ************ imports ******************************************************
// KFM packages
import KFM.Language;
import KFM.Exceptions.ProgrammerException;
import com.oroinc.text.regex.*;
import java.io.*;
import java.util.Enumeration;
import java.util.NoSuchElementException;
import java.util.Hashtable;
import java.util.Properties;
/** **************************************************************************
*
* Simple interface to RegExp à la Perl5.
*
* <P>Provides the static function <code>match</code> that has a simple interface to RegExps à la Perl5.
* That's all you need, and it's simple.</P>
*
* <pre>
* ====================
* History:
* creation date: sometime in 03.99
* @author ThH
* @version 0.1
* --------------------
* Changes:
* 01.04.99 ThH: Move from package visibility in KFM.GUI.Templates to public visibility in KFM.
* </pre>
*/
public class RegExp {

    /**
     * Cache of compiled patterns.
     *
     * Key: compile options (as decimal int) + ":" + pattern string. Value: the compiled Pattern.
     * The options are part of the key because the same pattern text yields a different compiled
     * pattern depending on case sensitivity and single-line mode.
     *
     * NOTE(review): entries are never removed, so the cache grows with the number of distinct
     * (pattern, options) combinations that are used.
     */
    private static Hashtable mPatternMap = new Hashtable();

    /**
     * Match a Perl5 RegExp against a text and return information on the first match and on the
     * matches of its parenthesized groups.
     *
     * The compiled pattern is cached, so repeated calls with the same pattern and flags do not
     * compile it again.
     *
     * @param patternString A string containing a Perl5 RegExp.
     * @param text          The text to be searched.
     * @param caseSensitive If and only if true, the match is case sensitive, that is "A"!="a" etc.
     * @param aIsSingleLine If true, the regular expression metacharacter '.' matches everything,
     *                      even new lines ('\n'). This is honoured regardless of `caseSensitive`.
     *
     * @return `null` when no match is found, else a `MatchResult` describing the first match.
     *   Let `mr` be the returned `MatchResult`, then you can access the following information
     *   (choose i=0 for the whole match, and i>=1 for a parenthesized group):
     *
     *     mr.group(i)        text
     *     mr.begin(i)        start position counting from the beginning of the match,
     *                        that is, from `mr.beginOffset(0)`.
     *     mr.end(i)          end position counting from the beginning of the match.
     *     mr.beginOffset(i)  start position counting from the beginning of the string
     *     mr.endOffset(i)    end position counting from the beginning of the string
     *
     * @throws ProgrammerException if `patternString` is not a valid Perl5 RegExp.
     *
     * See com.oroinc.text.regex.* and its documentation for the RegExp syntax and `MatchResult`.
     */
    public static MatchResult match(
        String patternString, String text, boolean caseSensitive,
        boolean aIsSingleLine)
    {
        Pattern pattern = compilePattern(patternString, caseSensitive, aIsSingleLine);
        return firstMatch(pattern, new PatternMatcherInput(text));
    }

    /**
     * Match a Perl5 RegExp against a text, not in single-line mode.
     *
     * @see #match(String, String, boolean, boolean)
     */
    public static MatchResult match(
        String patternString, String text, boolean caseSensitive)
    {
        return match(patternString, text, caseSensitive, /* isSingleLine */ false);
    }

    /**
     * Match a Perl5 RegExp against a text, starting the search at a given offset.
     *
     * For more information about the implementation of regular expressions by OROMatcher see:
     * $/KFM/www-docs/protected/developer/external-docu/OROMatcher-1.0.7/doc/index.html
     *
     * @param patternString    A string containing a Perl5 RegExp.
     * @param text             The text to be searched.
     * @param caseSensitive    If and only if true, the match is case sensitive.
     * @param isSingleLineMask If true, the metacharacter '.' matches everything, even new lines
     *                         ('\n'), i.e. the text is treated as a single line.
     * @param Offset           Passed to `PatternMatcherInput.setCurrentOffset` before searching, so
     *                         that the matcher does not start again from the beginning of the string.
     *
     * @return The first match found, or `null` if no match was found.
     *
     * @throws ProgrammerException if `patternString` is not a valid Perl5 RegExp.
     */
    public static MatchResult match (
        String patternString,
        String text,
        boolean caseSensitive,
        boolean isSingleLineMask,
        int Offset)
    {
        // The pattern comes from the shared cache instead of being recompiled on every call.
        Pattern pattern = compilePattern(patternString, caseSensitive, isSingleLineMask);

        PatternMatcherInput input = new PatternMatcherInput(text);
        input.setCurrentOffset(Offset);
        return firstMatch(pattern, input);
    }

    /**
     * Match *case insensitive* Perl5 RegExp against text and return information on the first match
     * and the matches of its parenthesized groups.
     *
     * @see #match(String, String, boolean)
     */
    public static MatchResult match(String patternString, String text)
    {
        return match(patternString, text, false /* not case sensitive */);
    }

    /**
     * Return the compiled form of a pattern, compiling and caching it on first use.
     *
     * The cache key contains the compile options, so the same pattern text requested with
     * different flags is compiled (and cached) separately.
     *
     * @throws ProgrammerException if `patternString` is not a valid Perl5 RegExp.
     */
    private static Pattern compilePattern(
        String patternString, boolean caseSensitive, boolean singleLine)
    {
        int options = 0;
        if(!caseSensitive) {
            options |= Perl5Compiler.CASE_INSENSITIVE_MASK;
        }
        if(singleLine) {
            options |= Perl5Compiler.SINGLELINE_MASK;
        }

        // The part before the first ':' is always a plain integer, so keys cannot collide.
        String cacheKey = options + ":" + patternString;
        Pattern pattern = (Pattern) mPatternMap.get(cacheKey);
        if(pattern != null) {
            return pattern;
        }

        PatternCompiler compiler = new Perl5Compiler();
        try {
            if(options == 0) {
                pattern = compiler.compile(patternString);
            } else {
                pattern = compiler.compile(patternString, options);
            }
        } catch(MalformedPatternException e) {
            // The closing quote is written as a unicode escape so that the message does not
            // depend on the encoding this source file is read with.
            String message = "RegExp.match: Bad pattern: `" + e.getMessage() + "\u00B4.";
            System.err.println(message);
            //@@@ Make this cleaner some day.
            throw new ProgrammerException(message);
        }
        mPatternMap.put(cacheKey, pattern);
        return pattern;
    }

    /**
     * Search `input` for `pattern`, starting at the input's current offset.
     *
     * @return The first match found, or `null` if there is none.
     */
    private static MatchResult firstMatch(Pattern pattern, PatternMatcherInput input)
    {
        // A fresh matcher per call: the match is read back from the matcher after the search.
        PatternMatcher matcher = new Perl5Matcher();
        if(matcher.contains(input, pattern)) {
            return matcher.getMatch();
        }
        return null;
    }

    /**
     * Quote all regexp metacharacters in a string.
     *
     * Given a character string, returns a Perl5 expression that interprets each character of the
     * original string literally. In other words, all special metacharacters are quoted/escaped.
     * This method is useful for converting user input meant for literal interpretation into a safe
     * regular expression representing the literal input.
     *
     * Note: All this method does is call com.oroinc.text.regex.Perl5Compiler.quotemeta(String),
     * it is repeated here for easy reference.
     */
    public static final String quotemeta (String expression)
    {
        return com.oroinc.text.regex.Perl5Compiler.quotemeta(expression);
    }

    // ************************************************************
    // Debugging stuff
    // ************************************************************

    /**
     * Demo of RegExp package for testing purposes only.
     *
     * Reports all the matches of the Perl5 regular expression <code>patternP</code> on the string
     * <code>text</code>, printing the match, its length, its offsets and all of its groups to
     * standard output.
     *
     * Note: Rename to <code>main</code> to run.
     *
     * Possible bug: Apparently only reports first match.
     *
     * @param args Ignored.
     *
     * @throws ProgrammerException if the built-in demo pattern does not compile.
     */
    public static final void mainTestRegExp(String args[])
    {
        String text = "Na <KFM A=1 B=\"b\" C>";

        // `patternP` is probably a correct regexp for an HTML command, but I did not get it to work.
        //
        // The problem is that I can't extract the attribute/value pairs directly from `patternP`,
        // because the regexp package only provides access to the last value of `(...)*`, not to
        // all values.
        //
        // I tried matching the attribute pattern again, but then ATTR=VALUE produced a match for
        // the substring `ATTR`, which is in fact a correct match but not the one I was hoping for.
        // Idea: use the pattern
        //
        //     attrvalueP + attrvaluesP
        //
        // and move upwards. That would actually work.
        //
        // I could also try to get a better implementation of regexps.
        //
        // Note:
        // - \s: whitespace
        // - \w: word
        //
        final String tagP = "\\w+";
        final String attrP = "([-_a-zA-Z0-9]+)";
        // Note: Parenthesis group includes quotes.
        // That might be a bad idea, I don't know yet.
        final String valueP = "([-_a-zA-Z0-9.%]+|\"[^\"]*\")";
        final String attrWithValueP = attrP + "\\s*=\\s*" + valueP;
        final String attrWithOptValueP = attrP + "(\\s*=\\s*" + valueP + "|)";
        final String attrsP = "(?:\\s+" + attrWithOptValueP + ")*";
        final String patternP = "\\<\\s*(" + tagP + ")(" + attrsP + ")\\s*\\>";

        // Create Perl5Compiler and Perl5Matcher instances.
        PatternCompiler compiler = new Perl5Compiler();
        PatternMatcher matcher = new Perl5Matcher();

        // Attempt to compile the pattern. If the pattern is not valid, report the error and give up.
        Pattern pattern;
        try {
            pattern = compiler.compile(patternP);
        } catch(MalformedPatternException e) {
            // Closing quote written as a unicode escape, independent of the source file encoding.
            String message = "RegExp.main...: Bad pattern: `" + e.getMessage() + "\u00B4.";
            System.err.println(message);
            //@@@ Make this cleaner some day.
            throw new ProgrammerException(message);
        }

        // Create a PatternMatcherInput instance to keep track of the position
        // where the last match finished, so that the next match search will
        // start from there. You always create a PatternMatcherInput instance
        // when you want to search a string for all of the matches it contains,
        // and not just the first one.
        PatternMatcherInput input = new PatternMatcherInput(text);

        // Loop until there are no more matches left.
        while(matcher.contains(input, pattern)) {
            // Since we're still in the loop, fetch the match that was found.
            MatchResult result = matcher.getMatch();

            // The toString() method is provided as a convenience method.
            // It returns the entire match. The following are all equivalent:
            //     System.out.println("Match: " + result);
            //     System.out.println("Match: " + result.toString());
            //     System.out.println("Match: " + result.group(0));
            System.out.println("Match: " + result.toString());

            // Print the length of the match. The length() method is another
            // convenience method. The lengths of subgroups can be obtained
            // by first retrieving the subgroup and then calling the string's
            // length() method.
            System.out.println("Length: " + result.length());

            // Retrieve the number of matched groups. A group corresponds to
            // a parenthesized set in a pattern.
            int groups = result.groups();
            System.out.println("Groups: " + groups);

            // Print the offset into the input of the beginning and end of the
            // match. The beginOffset() and endOffset() methods return the
            // offsets of a group relative to the beginning of the input. The
            // begin() and end() methods return the offsets of a group relative
            // to the beginning of a match.
            System.out.println("Begin offset: " + result.beginOffset(0));
            System.out.println("End offset: " + result.endOffset(0));
            System.out.println("Groups: ");

            // Print the contents of each matched subgroup along with their
            // offsets relative to the beginning of the entire match.
            // Start at 1 because we just printed out group 0.
            for(int group = 1; group < groups; group++) {
                System.out.println(group + ": " + result.group(group));
                System.out.println("Begin: " + result.begin(group));
                System.out.println("End: " + result.end(group));
            }
        }
    }
}