// Source Code of client.net.sf.saxon.ce.functions.EscapeURI

package client.net.sf.saxon.ce.functions;


import client.net.sf.saxon.ce.Configuration;
import client.net.sf.saxon.ce.expr.XPathContext;
import client.net.sf.saxon.ce.functions.codenorm.Normalizer;
import client.net.sf.saxon.ce.om.Item;
import client.net.sf.saxon.ce.trans.Err;
import client.net.sf.saxon.ce.trans.XPathException;
import client.net.sf.saxon.ce.tree.util.FastStringBuffer;
import client.net.sf.saxon.ce.tree.util.UTF8CharacterSet;
import client.net.sf.saxon.ce.value.StringValue;


import java.util.Arrays;


/**
 * This class supports the functions encode-for-uri() and iri-to-uri()
 */


public class EscapeURI extends SystemFunction {


    public EscapeURI(int operation) {
        this.operation = operation;
    }


    public EscapeURI newInstance() {
        return new EscapeURI(operation);
    }


    public static final int ENCODE_FOR_URI = 1;
    public static final int IRI_TO_URI = 2;
    public static final int HTML_URI = 3;


    public static boolean[] allowedASCII = new boolean[128];


    static {
        Arrays.fill(allowedASCII, 0, 32, false);
        Arrays.fill(allowedASCII, 33, 127, true);
        allowedASCII[(int)'"'] = false;
        allowedASCII[(int)'<'] = false;
        allowedASCII[(int)'>'] = false;
        allowedASCII[(int)'\\'] = false;
        allowedASCII[(int)'^'] = false;
        allowedASCII[(int)'`'] = false;
        allowedASCII[(int)'{'] = false;
        allowedASCII[(int)'|'] = false;
        allowedASCII[(int)'}'] = false;
    }


    /**
    * Evaluate the function
    */


    public Item evaluateItem(XPathContext c) throws XPathException {
        Item item = argument[0].evaluateItem(c);
        if (item == null) {
            return StringValue.EMPTY_STRING;
        }
        final CharSequence s = item.getStringValueCS();
        switch (operation) {
            case ENCODE_FOR_URI:
                return StringValue.makeStringValue(escape(s, "-_.~"));
            case IRI_TO_URI:
                return StringValue.makeStringValue(iriToUri(s));
            case HTML_URI:
                return StringValue.makeStringValue(escapeHtmlURL(s, false, c.getConfiguration()));
            default:
                throw new UnsupportedOperationException("Unknown escape operation");
        }
    }


    /**
     * Escape special characters in a URI. The characters that are %HH-encoded are
     * all non-ASCII characters
     * @param s the URI to be escaped
     * @return the %HH-encoded string
     */


    public static CharSequence iriToUri(CharSequence s) {
        // NOTE: implements a late spec change which says that characters that are illegal in an IRI,
        // for example "\", must be %-encoded.
        if (allAllowedAscii(s)) {
            // it's worth doing a prescan to avoid the cost of copying in the common all-ASCII case
            return s;
        }
        FastStringBuffer sb = new FastStringBuffer(s.length()+20);
        for (int i=0; i<s.length(); i++) {
            final char c = s.charAt(i);
            if (c>=0x7f || !allowedASCII[(int)c]) {
                escapeChar(c, ((i+1)<s.length() ? s.charAt(i+1) : ' '), sb);
            } else {
                sb.append(c);
            }
        }
        return sb;
    }


    private static boolean allAllowedAscii(CharSequence s) {
        for (int i=0; i<s.length(); i++) {
            final char c = s.charAt(i);
            if (c>=0x7f || !allowedASCII[(int)c]) {
                return false;
            }
        }
        return true;
    }




    /**
     * Escape special characters in a URI. The characters that are %HH-encoded are
     * all non-ASCII characters, plus all ASCII characters except (a) letter A-Z
     * and a-z, (b) digits 0-9, and (c) characters listed in the allowedPunctuation
     * argument
     * @param s the URI to be escaped
     * @param allowedPunctuation ASCII characters other than letters and digits that
     * should NOT be %HH-encoded
     * @return the %HH-encoded string
     */


    public static CharSequence escape(CharSequence s, String allowedPunctuation) {
        FastStringBuffer sb = new FastStringBuffer(s.length());
        for (int i=0; i<s.length(); i++) {
            char c = s.charAt(i);
            if ((c>='a' && c<='z') || (c>='A' && c<='Z') || (c>='0' && c<='9')) {
                sb.append(c);
            } else if (c<=0x20 || c>=0x7f) {
                escapeChar(c, ((i+1)<s.length() ? s.charAt(i+1) : ' '), sb);
            } else if (allowedPunctuation.indexOf(c) >= 0) {
                sb.append(c);
            } else {
                escapeChar(c, ' ', sb);
            }


        }
        return sb;
    }


    private static final String hex = "0123456789ABCDEF";


    /**
     * Escape a single character in %HH representation, or a pair of two chars representing
     * a surrogate pair
     * @param c the character to be escaped, or the first character of a surrogate pair
     * @param c2 the second character of a surrogate pair
     * @param sb the buffer to contain the escaped result
     */


    private static void escapeChar(char c, char c2, FastStringBuffer sb) {
        byte[] array = new byte[4];
        int used = UTF8CharacterSet.getUTF8Encoding(c, c2, array);
        for (int b=0; b<used; b++) {
            int v = (int)array[b] & 0xff;
            sb.append('%');
            sb.append(hex.charAt(v/16));
            sb.append(hex.charAt(v%16));
        }
    }


    /**
     * Check that any percent-encoding within a URI is well-formed. The method assumes that a percent
     * sign followed by two hex digits represents an octet of the UTF-8 representation of a character;
     * any other percent sign is assumed to represent itself.
     * @param uri the string to be checked for validity
     * @throws XPathException if the string is not validly percent-encoded
     */


    public static void checkPercentEncoding(String uri) throws XPathException {
        for (int i=0; i<uri.length();) {
            char c = uri.charAt(i);
            byte[] bytes;
            // Note: we're translating the UTF-8 byte sequence but then not using the value
            int expectedOctets;
            if (c == '%') {
                if (i+2 >= uri.length()) {
                    throw new XPathException("% sign in URI must be followed by two hex digits" +
                                    Err.wrap(uri));
                }
                int h1 = hexDigits.indexOf(uri.charAt(i+1));
                if (h1 > 15) {
                    h1 -= 6;
                }


                int h2 = hexDigits.indexOf(uri.charAt(i+2));
                if (h2 > 15) {
                    h2 -= 6;
                }
                if (h1 >= 0 && h2 >= 0) {
                    int b = h1<<4 | h2;
                    expectedOctets = UTF8RepresentationLength[h1];
                    if (expectedOctets == -1) {
                        throw new XPathException("First %-encoded octet in URI is not valid as the start of a UTF-8 " +
                                "character: first two bits must not be '10'" +
                                    Err.wrap(uri));
                    }
                    bytes = new byte[expectedOctets];
                    bytes[0] = (byte)b;
                    i+=3;
                    for (int q=1; q<expectedOctets; q++) {
                        if (i+2 > uri.length() || uri.charAt(i) != '%') {
                            throw new XPathException("Incomplete %-encoded UTF-8 octet sequence in URI " +
                                    Err.wrap(uri));
                        }
                        h1 = hexDigits.indexOf(uri.charAt(i+1));
                        if (h1 > 15) {
                            h1 -= 6;
                        }


                        h2 = hexDigits.indexOf(uri.charAt(i+2));
                        if (h2 > 15) {
                            h2 -= 6;
                        }
                        if (h1 < 0 || h2 < 0) {
                            throw new XPathException("Invalid %-encoded UTF-8 octet sequence in URI" +
                                    Err.wrap(uri));
                        }
                        if (UTF8RepresentationLength[h1] != -1) {
                            throw new XPathException("In a URI, a %-encoded UTF-8 octet after the first " +
                                    "must have '10' as the first two bits" +
                                    Err.wrap(uri));
                        }
                        b = h1<<4 | h2;
                        bytes[q] = (byte)b;
                        i += 3;
                    }
                } else {
                    throw new XPathException("% sign in URI must be followed by two hex digits" +
                                    Err.wrap(uri));
                }
            } else {
                i++;
            }


        }


    }


    /**
     * Escape a URI according to the HTML rules: that is, a non-ASCII character (specifically,
     * a character outside the range 32 - 126) is replaced by the %HH encoding of the octets in
     * its UTF-8 representation
     * @param url the URI to be escaped
     * @param normalize
     * @return the URI after escaping non-ASCII characters
     */


    public static CharSequence escapeHtmlURL(CharSequence url, boolean normalize, Configuration config) throws XPathException {
        // optimize for the common case where the string is all ASCII characters
        for (int i=url.length()-1; i>=0; i--) {
            char ch = url.charAt(i);
            if (ch<32 || ch>126) {
                if (normalize) {
                    CharSequence normalized = new Normalizer(Normalizer.C, config).normalize(url);
                    return reallyEscapeURL(normalized);
                } else {
                    return reallyEscapeURL(url);
                }
            }
        }
        return url;
    }


    private static CharSequence reallyEscapeURL(CharSequence url) {
        FastStringBuffer sb = new FastStringBuffer(url.length() + 20);
        final String hex = "0123456789ABCDEF";
        byte[] array = new byte[4];


        for (int i=0; i<url.length(); i++) {
            char ch = url.charAt(i);
            if (ch<32 || ch>126) {
                int used = UTF8CharacterSet.getUTF8Encoding(ch,
                                                 (i+1 < url.length() ? url.charAt(i+1): ' '), array);
                for (int b=0; b<used; b++) {
                    //int v = (array[b]>=0 ? array[b] : 256 + array[b]);
                    int v = ((int)array[b]) & 0xff;
                    sb.append('%');
                    sb.append(hex.charAt(v/16));
                    sb.append(hex.charAt(v%16));
                }


            } else {
                sb.append(ch);
            }
        }
        return sb;
    }






    private static String hexDigits = "0123456789abcdefABCDEF";


    // Length of a UTF8 byte sequence, as a function of the first nibble
    private static int[] UTF8RepresentationLength = {1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, 2, 2, 3, 4};
}






// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. 
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is “Incompatible With Secondary Licenses”, as defined by the Mozilla Public License, v. 2.0.
// Source Code of client.net.sf.saxon.ce.functions.EscapeURI

// Related Classes of client.net.sf.saxon.ce.functions.EscapeURI