Package org.exist.xquery.functions.text

Source Code of org.exist.xquery.functions.text.RegexpFilter

/*
*  eXist Open Source Native XML Database
*  Copyright (C) 2001-09 Wolfgang M. Meier
*  wolfgang@exist-db.org
*  http://exist.sourceforge.net
*
*  This program is free software; you can redistribute it and/or
*  modify it under the terms of the GNU Lesser General Public License
*  as published by the Free Software Foundation; either version 2
*  of the License, or (at your option) any later version.
*
*  This program is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU Lesser General Public License for more details.
*
*  You should have received a copy of the GNU Lesser General Public License
*  along with this program; if not, write to the Free Software
*  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
*  $Id$
*/

package org.exist.xquery.functions.text;

import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.exist.dom.QName;
import org.exist.xquery.BasicFunction;
import org.exist.xquery.Cardinality;
import org.exist.xquery.FunctionSignature;
import org.exist.xquery.XPathException;
import org.exist.xquery.XQueryContext;
import org.exist.xquery.regex.JDK15RegexTranslator;
import org.exist.xquery.regex.RegexSyntaxException;
import org.exist.xquery.value.FunctionParameterSequenceType;
import org.exist.xquery.value.FunctionReturnSequenceType;
import org.exist.xquery.value.Sequence;
import org.exist.xquery.value.SequenceType;
import org.exist.xquery.value.StringValue;
import org.exist.xquery.value.Type;
import org.exist.xquery.value.ValueSequence;

/**
*   xQuery function for filtering strings from text that match the specified
* pattern. E.g.  AABBBBCBBC and BB.*BB results in BBBBCBB
*
* @author dizzzz
*/
public class RegexpFilter extends BasicFunction {
   
    protected static final FunctionParameterSequenceType FLAGS_PARAM = new FunctionParameterSequenceType("flags", Type.STRING, Cardinality.EXACTLY_ONE, "The flags");

  protected static final FunctionParameterSequenceType REGEX_PARAM = new FunctionParameterSequenceType("regularexpression", Type.STRING, Cardinality.EXACTLY_ONE, "The regular expression to perform against the text");

  protected static final FunctionParameterSequenceType TEXT_PARAM = new FunctionParameterSequenceType("text", Type.STRING, Cardinality.EXACTLY_ONE, "The text to filter");

  // Setup function signature
    public final static FunctionSignature signatures[] = {
    new FunctionSignature(
            new QName("filter", TextModule.NAMESPACE_URI, TextModule.PREFIX),
            "Filter substrings that match the regular expression in the text.",
            new SequenceType[]{
                TEXT_PARAM,
                REGEX_PARAM
            },
            new FunctionReturnSequenceType(Type.STRING, Cardinality.ZERO_OR_MORE, "the substrings")
      ),
    new FunctionSignature(
      new QName("groups", TextModule.NAMESPACE_URI, TextModule.PREFIX),
      "Tries to match the string in $text to the regular expression. " +
      "Returns an empty sequence if the string does not match, or a sequence whose " +
      "first item is the entire string, and whose following items are the matched groups.",
      new SequenceType[] {
                TEXT_PARAM,
                REGEX_PARAM
        },
      new FunctionReturnSequenceType(Type.STRING, Cardinality.ZERO_OR_MORE, "an empty sequence if the string does not match, or a sequence whose " +
      "first item is the entire string, and whose following items are the matched groups.")
    ),
    new FunctionSignature(
      new QName("groups", TextModule.NAMESPACE_URI, TextModule.PREFIX),
      "Tries to match the string in $text to the regular expression, using " +
      "the flags specified. Returns an empty sequence if the string does "+
      "not match, or a sequence whose first item is the entire string, and whose " +
      "following items are the matched groups.",
      new SequenceType[] {
                TEXT_PARAM,
                REGEX_PARAM,
        FLAGS_PARAM,
        },
      new FunctionReturnSequenceType(Type.STRING, Cardinality.ZERO_OR_MORE, "an empty sequence if the string does "+
      "not match, or a sequence whose first item is the entire string, and whose " +
      "following items are the matched groups.")
    ),
    new FunctionSignature(
      new QName("groups-regex", TextModule.NAMESPACE_URI, TextModule.PREFIX),
      "Tries to match the string in $text to the regular expression. " +
      "Returns an empty sequence if the string does not match, or a sequence whose " +
      "first item is the entire string, and whose following items are the matched groups. " +
      "Note:\n\n" +
      "The groups-regex() variants of the groups() functions are identical except that they avoid the translation of the specified regular expression from XPath2 to Java syntax. " +
      "That is, the regular expression is evaluated as is, and must be valid according to Java regular expression syntax, rather than the more restrictive XPath2 syntax.",
      new SequenceType[] {
                TEXT_PARAM,
                REGEX_PARAM
        },
      new FunctionReturnSequenceType(Type.STRING, Cardinality.ZERO_OR_MORE, "an empty sequence if the string does not match, or a sequence whose " +
      "first item is the entire string, and whose following items are the matched groups.")
    ),
    new FunctionSignature(
      new QName("groups-regex", TextModule.NAMESPACE_URI, TextModule.PREFIX),
      "Tries to match the string in $text to the regular expression, using " +
      "the flags specified. Returns an empty sequence if the string does "+
      "not match, or a sequence whose first item is the entire string, and whose " +
      "following items are the matched groups. " +
      "Note:\n\n" +
      "The groups-regex() variants of the groups() functions are identical except that they avoid the translation of the specified regular expression from XPath2 to Java syntax. " +
      "That is, the regular expression is evaluated as is, and must be valid according to Java regular expression syntax, rather than the more restrictive XPath2 syntax.",
      new SequenceType[] {
                TEXT_PARAM,
                REGEX_PARAM,
        FLAGS_PARAM,
        },
      new FunctionReturnSequenceType(Type.STRING, Cardinality.ZERO_OR_MORE, "an empty sequence if the string does "+
      "not match, or a sequence whose first item is the entire string, and whose " +
      "following items are the matched groups.")
    )
    };
           
    // Very Small cache
    private String  cachedRegexp = "";
    private Pattern cachedPattern = null;
   
   
    /** Creates a new instance of RegexpMatcher */
    public RegexpFilter(XQueryContext context, FunctionSignature signature) {
        super(context, signature);
    }
   
    /* (non-Javadoc)
     * @see org.exist.xquery.BasicFunction#eval(org.exist.xquery.value.Sequence[], org.exist.xquery.value.Sequence)
     */
    public Sequence eval(Sequence[] args, Sequence contextSequence) throws org.exist.xquery.XPathException {
       
      if(this.isCalledAs("filter")) {
        return filter(args);
      } else {
        return groups(args);
      }
    }
   
    public Sequence filter(Sequence[] args) throws org.exist.xquery.XPathException {
        // Check input parameters
        if(args.length != 2){
            return Sequence.EMPTY_SEQUENCE;
        }
       
        // Get regular expression
        Pattern pattern = null;
        String regexp = args[1].getStringValue();
        if(cachedRegexp.equals(regexp) && cachedPattern!=null){
            // Cached compiled pattern is available!
            pattern = cachedPattern;
           
        } else {
            // Compile new pattern
            pattern = Pattern.compile( regexp );
           
            // Cache new pattern
            cachedPattern = pattern;
            cachedRegexp = regexp;
        }

        // Match pattern on string
        final Matcher matcher = pattern.matcher( args[0].getStringValue() );
       
        // Create response
        final Sequence result = new ValueSequence();
       
        // Add each match to response sequence
        while( matcher.find() ){
            result.add( new StringValue(matcher.group()) );
        }
       
        return result;
    }
   
  public Sequence groups(Sequence[] args) throws XPathException {
        Sequence result;
    final Sequence input = args[0];
    if (input.isEmpty()) {
            result = Sequence.EMPTY_SEQUENCE;       
         } else {
            result = evalGeneric(args, input);
        }
       
        return result;         
  }

  /**
   * Translates the regular expression from XPath2 syntax to java regex
   * syntax.
   *
   * @param pattern XPath regexp expression
   * @return Java regexp expression
   * @throws XPathException
   */
  protected String translateRegexp(String pattern) throws XPathException {
    // convert pattern to Java regex syntax
        try {
          final int xmlVersion = 11;
          final boolean ignoreWhitespace = false;
          final boolean caseBlind = false;
      pattern = JDK15RegexTranslator.translate(pattern, xmlVersion, true, ignoreWhitespace, caseBlind);
    } catch (final RegexSyntaxException e) {
      throw new XPathException(this, "Conversion from XPath2 to Java regular expression " +
          "syntax failed: " + e.getMessage(), e);
    }
    return pattern;
  }

    private Sequence evalGeneric(Sequence[] args, Sequence stringArg) throws XPathException {
        final String string = stringArg.getStringValue();
   
    String pattern;
   
    if( isCalledAs( "groups-regex" ) ) {
      pattern = args[1].getStringValue();
    } else {
      pattern = translateRegexp(args[1].getStringValue());
    }
       
    int flags = 0;
        if(args.length==3)
            {flags = parseFlags(args[2].getStringValue());}
       
    return match(string, pattern, flags);
    }

    /**
     * @param string
     * @param pattern
     * @param flags
     * @throws XPathException
     */
    private Sequence match(String string, String pattern, int flags) throws XPathException {
        try {
          Matcher matcher;
      if(cachedRegexp == null || (!cachedRegexp.equals(pattern)) || flags != cachedPattern.flags()) {
        matcher = Pattern.compile(pattern, flags).matcher(string);
        cachedPattern = matcher.pattern();
        cachedRegexp = string;
            } else {
              matcher = cachedPattern.matcher(string);
            }
           
      if(!matcher.find()) {
        return Sequence.EMPTY_SEQUENCE;
      } else {
        final int items = matcher.groupCount() + 1;
        final Sequence seq = new ValueSequence();
        seq.add(new StringValue(string));
        for(int i=1;i<items;i++) {
          String val = matcher.group(i);
          if(val==null) {
            val="";
          }
          seq.add(new StringValue(val));
        }
        return seq;
      }
    } catch (final PatternSyntaxException e) {
      throw new XPathException(this, "err:FORX0001: Invalid regular expression: " + e.getMessage(), e);
    }
    }

    protected final static int parseFlags(String s) throws XPathException {
    int flags = 0;
    for(int i = 0; i < s.length(); i++) {
      final char ch = s.charAt(i);
      switch(ch) {
        case 'm':
          flags |= Pattern.MULTILINE;
          break;
        case 'i':
          flags = flags | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;
          break;
                case 'x':
                    flags |= Pattern.COMMENTS;
                    break;
                case 's':
                    flags |= Pattern.DOTALL;
                    break;
        default:
          throw new XPathException("err:FORX0001: Invalid regular expression flag: " + ch);
      }
    }
    return flags;
  }
   
}
TOP

Related Classes of org.exist.xquery.functions.text.RegexpFilter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.