Package org.cyberneko.html.filters

Source Code of org.cyberneko.html.filters.Purifier

/*
* (C) Copyright 2004, Andy Clark.  All rights reserved.
*
* This file is distributed under an Apache style license. Please
* refer to the LICENSE file for specific details.
*/

package org.cyberneko.html.filters;

import org.cyberneko.html.HTMLEventInfo;

import java.lang.reflect.Method;
import java.lang.reflect.InvocationTargetException;

import org.apache.xerces.util.AugmentationsImpl;
import org.apache.xerces.util.XMLChar;
import org.apache.xerces.util.XMLStringBuffer;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.NamespaceContext;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLLocator;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.XNIException;
import org.apache.xerces.xni.parser.XMLComponentManager;
import org.apache.xerces.xni.parser.XMLConfigurationException;

/**
* This filter purifies the HTML input to ensure XML well-formedness.
* The purification process includes:
* <ul>
* <li>fixing illegal characters in the document, including
*   <ul>
*   <li>element and attribute names,
*   <li>processing instruction target and data,
*   <li>document text;
*   </ul>
* <li>ensuring the string "--" does not appear in the content of
*     a comment; and
* <li>ensuring the string "]]>" does not appear in the content of
*     a CDATA section.
* </ul>
* <p>
* Illegal characters in XML names are converted to the character
* sequence "_u####_", where "####" is the value of the Unicode
* character represented in hexadecimal, whereas illegal characters
* appearing in document content are converted to the character
* sequence "\\u####".
* <p>
* In comments, the character '-' is replaced by the character
* sequence "- " to prevent "--" from ever appearing in the comment
* content. For CDATA sections, the character ']' is replaced by
* the character sequence "] " to prevent "]]" from appearing.
*
* @author Andy Clark
*
* @version $Id$
*/
public class Purifier
    extends DefaultFilter {

    //
    // Constants
    //

    /**
     * Namespaces feature identifier. Its value is read from the component
     * manager in reset(XMLComponentManager) and stored in fNamespaces.
     */
    protected static final String NAMESPACES = "http://xml.org/sax/features/namespaces";

    /**
     * Include infoset augmentations feature identifier. Its value is read in
     * reset(XMLComponentManager); the same string is also used as the key
     * under which SYNTHESIZED_ITEM is stored by synthesizedAugs().
     */
    protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";

    /**
     * Recognized features.
     * NOTE(review): nothing in this class references this array -- confirm
     * whether it was meant to be exposed to the parser configuration.
     */
    private static final String[] RECOGNIZED_FEATURES = {
        NAMESPACES,
        AUGMENTATIONS,
    };

    /**
     * Recognized features defaults, positionally parallel to
     * RECOGNIZED_FEATURES; both entries are null.
     * NOTE(review): nothing in this class references this array either.
     */
    private static final Boolean[] RECOGNIZED_FEATURES_DEFAULTS = {
        null,
        null,
    };

    // static vars

    /**
     * Synthesized event info item, attached by synthesizedAugs() to the
     * augmentations of events this filter generates itself.
     */
    protected static final HTMLEventInfo SYNTHESIZED_ITEM =
        new HTMLEventInfo.SynthesizedItem();

    //
    // Data
    //

    // features

    /**
     * Namespaces feature value, assigned in reset(XMLComponentManager).
     * When true, purifyName escapes surplus colons in names.
     */
    protected boolean fNamespaces;

    /**
     * Augmentations feature value, assigned in reset(XMLComponentManager).
     * When false, synthesizedAugs() returns null.
     */
    protected boolean fAugmentations;

    // state

    /**
     * True if the doctype declaration was seen. Cleared at start of document
     * and set by doctypeDecl.
     */
    protected boolean fSeenDoctype;

    /**
     * True if root element was seen. Cleared at start of document and set by
     * handleStartElement after the first element has been processed.
     */
    protected boolean fSeenRootElement;

    /**
     * True if inside a CDATA section, i.e. between startCDATA and endCDATA.
     * While set, characters() pads every ']' with a trailing space.
     */
    protected boolean fInCDATASection;

    // doctype declaration info

    /** Public identifier of doctype declaration, as recorded by doctypeDecl. */
    protected String fPublicId;

    /**
     * System identifier of doctype declaration, as recorded by doctypeDecl;
     * replaced by the empty string when a public identifier was given
     * without a system identifier.
     */
    protected String fSystemId;

    // temp vars

    /** Qualified name; scratch object reused for each attribute name in handleStartElement. */
    private QName fQName = new QName();

    /** Augmentations; instance reused by synthesizedAugs() for synthesized events. */
    private final Augmentations fInfosetAugs = new AugmentationsImpl();

    /**
     * String buffer; shared scratch buffer that purifyText, comment and
     * characters fill and hand on, so its content is only valid until the
     * next call to one of them.
     */
    private final XMLStringBuffer fStringBuffer = new XMLStringBuffer();

    //
    // XMLComponent methods
    //

    public void reset(XMLComponentManager manager)
        throws XMLConfigurationException {

        // state
        fInCDATASection = false;

        // features
        fNamespaces = manager.getFeature(NAMESPACES);
        fAugmentations = manager.getFeature(AUGMENTATIONS);

    } // reset(XMLComponentManager)

    //
    // XMLDocumentHandler methods
    //

    /** Start document. */
    public void startDocument(XMLLocator locator, String encoding,
                              Augmentations augs) throws XNIException {
        handleStartDocument();
        super.startDocument(locator, encoding, augs);
    } // startDocument(XMLLocator,String,Augmentations)

    /** Start document. */
    public void startDocument(XMLLocator locator, String encoding,
                              NamespaceContext nscontext, Augmentations augs)
        throws XNIException {
        handleStartDocument();
        super.startDocument(locator, encoding, nscontext, augs);
    } // startDocument(XMLLocator,NamespaceContext,String,Augmentations)

    /** Comment. */
    public void comment(XMLString text, Augmentations augs)
        throws XNIException {
        StringBuffer str = new StringBuffer(purifyText(text).toString());
        int length = str.length();
        for (int i = length-1; i >= 0; i--) {
            char c = str.charAt(i);
            if (c == '-') {
                str.insert(i + 1, ' ');
            }
        }
        fStringBuffer.length = 0;
        fStringBuffer.append(str.toString());
        text = fStringBuffer;
        super.comment(text, augs);
    } // comment(XMLString,Augmentations)

    /** Processing instruction. */
    public void processingInstruction(String target, XMLString data,
                                      Augmentations augs)
        throws XNIException {
        target = purifyName(target, true);
        data = purifyText(data);
        super.processingInstruction(target, data, augs);
    } // processingInstruction(String,XMLString,Augmentations)

    /** Doctype declaration. */
    public void doctypeDecl(String root, String pubid, String sysid,
                            Augmentations augs) throws XNIException {
        fSeenDoctype = true;
        // NOTE: It doesn't matter what the root element name is because
        //       it must match the root element. -Ac
        fPublicId = pubid;
        fSystemId = sysid;
        // NOTE: If the public identifier is specified, then a system
        //       identifier must also be specified. -Ac
        if (fPublicId != null && fSystemId == null) {
            fSystemId = "";
        }
        // NOTE: Can't save the augmentations because the object state
        //       is transient. -Ac
    } // doctypeDecl(String,String,String,Augmentations)

    /** Start element. */
    public void startElement(QName element, XMLAttributes attrs,
                             Augmentations augs) throws XNIException {
        handleStartElement(element, attrs);
        super.startElement(element, attrs, augs);
    } // startElement(QName,XMLAttributes,Augmentations)

    /** Empty element. */
    public void emptyElement(QName element, XMLAttributes attrs,
                             Augmentations augs) throws XNIException {
        handleStartElement(element, attrs);
        super.emptyElement(element, attrs, augs);
    } // emptyElement(QName,XMLAttributes,Augmentations)

    /** Start CDATA section. */
    public void startCDATA(Augmentations augs) throws XNIException {
        fInCDATASection = true;
        super.startCDATA(augs);
    } // startCDATA(Augmentations)

    /** End CDATA section. */
    public void endCDATA(Augmentations augs) throws XNIException {
        fInCDATASection = false;
        super.endCDATA(augs);
    } // endCDATA(Augmentations)

    /** Characters. */
    public void characters(XMLString text, Augmentations augs)
        throws XNIException {
        text = purifyText(text);
        if (fInCDATASection) {
            StringBuffer str = new StringBuffer(text.toString());
            int length = str.length();
            for (int i = length-1; i >= 0; i--) {
                char c = str.charAt(i);
                if (c == ']') {
                    str.insert(i + 1, ' ');
                }
            }
            fStringBuffer.length = 0;
            fStringBuffer.append(str.toString());
            text = fStringBuffer;
        }
        super.characters(text,augs);
    } // characters(XMLString,Augmentations)

    /** End element. */
    public void endElement(QName element, Augmentations augs)
        throws XNIException {
        element = purifyQName(element);
        super.endElement(element, augs);
    } // endElement(QName,Augmentations)

    //
    // Protected methods
    //

    /** Handle start document. */
    protected void handleStartDocument() {
        fSeenDoctype = false;
        fSeenRootElement = false;
    } // handleStartDocument()

    /** Handle start element. */
    protected void handleStartElement(QName element, XMLAttributes attrs) {

        // handle element and attributes
        element = purifyQName(element);
        int attrCount = attrs != null ? attrs.getLength() : 0;
        for (int i = 0; i < attrCount; i++) {
            attrs.getName(i, fQName);
            attrs.setName(i, purifyQName(fQName));
        }

        // synthesize doctype declaration
        if (!fSeenRootElement && fSeenDoctype) {
            Augmentations augs = synthesizedAugs();
            super.doctypeDecl(element.rawname, fPublicId, fSystemId, augs);
        }

        // mark start element as seen
        fSeenRootElement = true;

    } // handleStartElement(QName,XMLAttributes)

    /** Returns an augmentations object with a synthesized item added. */
    protected final Augmentations synthesizedAugs() {
        Augmentations augs = null;
        if (fAugmentations) {
            augs = fInfosetAugs;
            Class cls = augs.getClass();
            Method method = null;
            try {
                method = cls.getMethod("clear", null);
            }
            catch (NoSuchMethodException e) {
                try {
                    method = cls.getMethod("removeAllItems", null);
                }
                catch (NoSuchMethodException e2) {
                    // NOTE: This should not happen! -Ac
                    augs = new AugmentationsImpl();
                }
            }
            if (method != null) {
                try {
                    method.invoke(augs, null);
                }
                catch (IllegalAccessException e) {
                    // NOTE: This should not happen! -Ac
                    augs = new AugmentationsImpl();
                }
                catch (InvocationTargetException e) {
                    // NOTE: This should not happen! -Ac
                    augs = new AugmentationsImpl();
                }
            }
            augs.putItem(AUGMENTATIONS, SYNTHESIZED_ITEM);
        }
        return augs;
    } // synthesizedAugs():Augmentations

    //
    // Protected methods
    //

    /** Purify qualified name. */
    protected QName purifyQName(QName qname) {
        qname.prefix = purifyName(qname.prefix, true);
        qname.localpart = purifyName(qname.localpart, true);
        qname.rawname = purifyName(qname.rawname, false);
        return qname;
    } // purifyQName(QName):QName

    /** Purify name. */
    protected String purifyName(String name, boolean localpart) {
        if (name == null) {
            return name;
        }
        StringBuffer str = new StringBuffer();
        int length = name.length();
        boolean seenColon = localpart;
        for (int i = 0; i < length; i++) {
            char c = name.charAt(i);
            if (i == 0) {
                if (!XMLChar.isNameStart(c)) {
                    str.append("_u"+toHexString(c,4)+"_");
                }
                else {
                    str.append(c);
                }
            }
            else {
                if ((fNamespaces && c == ':' && seenColon) || !XMLChar.isName(c)) {
                    str.append("_u"+toHexString(c,4)+"_");
                }
                else {
                    str.append(c);
                }
                seenColon = seenColon || c == ':';
            }
        }
        return str.toString();
    } // purifyName(String):String

    /** Purify content. */
    protected XMLString purifyText(XMLString text) {
        fStringBuffer.length = 0;
        for (int i = 0; i < text.length; i++) {
            char c = text.ch[text.offset+i];
            if (XMLChar.isInvalid(c)) {
                fStringBuffer.append("\\u"+toHexString(c,4));
            }
            else {
                fStringBuffer.append(c);
            }
        }
        return fStringBuffer;
    } // purifyText(XMLString):XMLString

    //
    // Protected static methods
    //

    /** Returns a padded hexadecimal string for the given value. */
    protected static String toHexString(int c, int padlen) {
        StringBuffer str = new StringBuffer(padlen);
        str.append(Integer.toHexString(c));
        int len = padlen - str.length();
        for (int i = 0; i < len; i++) {
            str.insert(0, '0');
        }
        return str.toString().toUpperCase();
    } // toHexString(int,int):String

} // class Purifier
TOP

Related Classes of org.cyberneko.html.filters.Purifier

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.