Source Code of org.cyberneko.html.filters.Purifier

/* 
 * (C) Copyright 2004, Andy Clark.  All rights reserved.
 *
 * This file is distributed under an Apache style license. Please
 * refer to the LICENSE file for specific details.
 */


package org.cyberneko.html.filters;


import org.cyberneko.html.HTMLEventInfo;


import java.lang.reflect.Method;
import java.lang.reflect.InvocationTargetException;


import org.apache.xerces.util.AugmentationsImpl;
import org.apache.xerces.util.XMLChar;
import org.apache.xerces.util.XMLStringBuffer;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.NamespaceContext;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLLocator;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.XNIException;
import org.apache.xerces.xni.parser.XMLComponentManager;
import org.apache.xerces.xni.parser.XMLConfigurationException;


/**
 * This filter purifies the HTML input to ensure XML well-formedness.
 * The purification process includes:
 * <ul>
 * <li>fixing illegal characters in the document, including
 *  <ul>
 *  <li>element and attribute names,
 *  <li>processing instruction target and data,
 *  <li>document text;
 *  </ul>
 * <li>ensuring the string "--" does not appear in the content of
 *     a comment; and
 * <li>ensuring the string "]]>" does not appear in the content of
 *     a CDATA section.
 * </ul>
 * <p>
 * Illegal characters in XML names are converted to the character 
 * sequence "_u####_" where "####" is the value of the Unicode 
 * character represented in hexadecimal. Whereas illegal characters
 * appearing in document content is converted to the character
 * sequence "\\u####".
 * <p>
 * In comments, the character '-' is replaced by the character
 * sequence "- " to prevent "--" from ever appearing in the comment
 * content. For CDATA sections, the character ']' is replaced by
 * the character sequence "] " to prevent "]]" from appearing.
 * 
 * @author Andy Clark
 * 
 * @version $Id$
 */
public class Purifier
    extends DefaultFilter {


    //
    // Constants
    //


    /** Namespaces. */
    protected static final String NAMESPACES = "http://xml.org/sax/features/namespaces";


    /** Include infoset augmentations. */
    protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";


    /** Recognized features. */
    private static final String[] RECOGNIZED_FEATURES = {
        NAMESPACES,
        AUGMENTATIONS,
    };


    /** Recognized features defaults. */
    private static final Boolean[] RECOGNIZED_FEATURES_DEFAULTS = {
        null,
        null,
    };


    // static vars


    /** Synthesized event info item. */
    protected static final HTMLEventInfo SYNTHESIZED_ITEM = 
        new HTMLEventInfo.SynthesizedItem();


    //
    // Data
    //


    // features


    /** Namespaces. */
    protected boolean fNamespaces;


    /** Augmentations. */
    protected boolean fAugmentations;


    // state


    /** True if the doctype declaration was seen. */
    protected boolean fSeenDoctype;


    /** True if root element was seen. */
    protected boolean fSeenRootElement;


    /** True if inside a CDATA section. */
    protected boolean fInCDATASection;


    // doctype declaration info


    /** Public identifier of doctype declaration. */
    protected String fPublicId;


    /** System identifier of doctype declaration. */
    protected String fSystemId;


    // temp vars


    /** Qualified name. */
    private QName fQName = new QName();


    /** Augmentations. */
    private final Augmentations fInfosetAugs = new AugmentationsImpl();


    /** String buffer. */
    private final XMLStringBuffer fStringBuffer = new XMLStringBuffer();


    //
    // XMLComponent methods
    //


    public void reset(XMLComponentManager manager) 
        throws XMLConfigurationException {


        // state
        fInCDATASection = false;


        // features
        fNamespaces = manager.getFeature(NAMESPACES);
        fAugmentations = manager.getFeature(AUGMENTATIONS);


    } // reset(XMLComponentManager)


    //
    // XMLDocumentHandler methods
    //


    /** Start document. */
    public void startDocument(XMLLocator locator, String encoding,
                              Augmentations augs) throws XNIException {
        handleStartDocument();
        super.startDocument(locator, encoding, augs);
    } // startDocument(XMLLocator,String,Augmentations)


    /** Start document. */
    public void startDocument(XMLLocator locator, String encoding,
                              NamespaceContext nscontext, Augmentations augs)
        throws XNIException {
        handleStartDocument();
        super.startDocument(locator, encoding, nscontext, augs);
    } // startDocument(XMLLocator,NamespaceContext,String,Augmentations)


    /** Comment. */
    public void comment(XMLString text, Augmentations augs)
        throws XNIException {
        StringBuffer str = new StringBuffer(purifyText(text).toString());
        int length = str.length();
        for (int i = length-1; i >= 0; i--) {
            char c = str.charAt(i);
            if (c == '-') {
                str.insert(i + 1, ' ');
            }
        }
        fStringBuffer.length = 0;
        fStringBuffer.append(str.toString());
        text = fStringBuffer;
        super.comment(text, augs);
    } // comment(XMLString,Augmentations)


    /** Processing instruction. */
    public void processingInstruction(String target, XMLString data,
                                      Augmentations augs)
        throws XNIException {
        target = purifyName(target, true);
        data = purifyText(data);
        super.processingInstruction(target, data, augs);
    } // processingInstruction(String,XMLString,Augmentations)


    /** Doctype declaration. */
    public void doctypeDecl(String root, String pubid, String sysid,
                            Augmentations augs) throws XNIException {
        fSeenDoctype = true;
        // NOTE: It doesn't matter what the root element name is because
        //       it must match the root element. -Ac
        fPublicId = pubid;
        fSystemId = sysid;
        // NOTE: If the public identifier is specified, then a system
        //       identifier must also be specified. -Ac
        if (fPublicId != null && fSystemId == null) {
            fSystemId = "";
        }
        // NOTE: Can't save the augmentations because the object state
        //       is transient. -Ac
    } // doctypeDecl(String,String,String,Augmentations)


    /** Start element. */
    public void startElement(QName element, XMLAttributes attrs,
                             Augmentations augs) throws XNIException {
        handleStartElement(element, attrs);
        super.startElement(element, attrs, augs);
    } // startElement(QName,XMLAttributes,Augmentations)


    /** Empty element. */
    public void emptyElement(QName element, XMLAttributes attrs,
                             Augmentations augs) throws XNIException {
        handleStartElement(element, attrs);
        super.emptyElement(element, attrs, augs);
    } // emptyElement(QName,XMLAttributes,Augmentations)


    /** Start CDATA section. */
    public void startCDATA(Augmentations augs) throws XNIException {
        fInCDATASection = true;
        super.startCDATA(augs);
    } // startCDATA(Augmentations)


    /** End CDATA section. */
    public void endCDATA(Augmentations augs) throws XNIException {
        fInCDATASection = false;
        super.endCDATA(augs);
    } // endCDATA(Augmentations)


    /** Characters. */
    public void characters(XMLString text, Augmentations augs)
        throws XNIException {
        text = purifyText(text);
        if (fInCDATASection) {
            StringBuffer str = new StringBuffer(text.toString());
            int length = str.length();
            for (int i = length-1; i >= 0; i--) {
                char c = str.charAt(i);
                if (c == ']') {
                    str.insert(i + 1, ' ');
                }
            }
            fStringBuffer.length = 0;
            fStringBuffer.append(str.toString());
            text = fStringBuffer;
        }
        super.characters(text,augs);
    } // characters(XMLString,Augmentations)


    /** End element. */
    public void endElement(QName element, Augmentations augs)
        throws XNIException {
        element = purifyQName(element);
        super.endElement(element, augs);
    } // endElement(QName,Augmentations)


    //
    // Protected methods
    //


    /** Handle start document. */
    protected void handleStartDocument() {
        fSeenDoctype = false;
        fSeenRootElement = false;
    } // handleStartDocument()


    /** Handle start element. */
    protected void handleStartElement(QName element, XMLAttributes attrs) {


        // handle element and attributes
        element = purifyQName(element);
        int attrCount = attrs != null ? attrs.getLength() : 0;
        for (int i = 0; i < attrCount; i++) {
            attrs.getName(i, fQName);
            attrs.setName(i, purifyQName(fQName));
        }


        // synthesize doctype declaration
        if (!fSeenRootElement && fSeenDoctype) {
            Augmentations augs = synthesizedAugs();
            super.doctypeDecl(element.rawname, fPublicId, fSystemId, augs);
        }


        // mark start element as seen
        fSeenRootElement = true;


    } // handleStartElement(QName,XMLAttributes)


    /** Returns an augmentations object with a synthesized item added. */
    protected final Augmentations synthesizedAugs() {
        Augmentations augs = null;
        if (fAugmentations) {
            augs = fInfosetAugs;
            Class cls = augs.getClass();
            Method method = null;
            try {
                method = cls.getMethod("clear", null);
            }
            catch (NoSuchMethodException e) {
                try {
                    method = cls.getMethod("removeAllItems", null);
                }
                catch (NoSuchMethodException e2) {
                    // NOTE: This should not happen! -Ac
                    augs = new AugmentationsImpl();
                }
            }
            if (method != null) {
                try {
                    method.invoke(augs, null);
                }
                catch (IllegalAccessException e) {
                    // NOTE: This should not happen! -Ac
                    augs = new AugmentationsImpl();
                } 
                catch (InvocationTargetException e) {
                    // NOTE: This should not happen! -Ac
                    augs = new AugmentationsImpl();
                } 
            }
            augs.putItem(AUGMENTATIONS, SYNTHESIZED_ITEM);
        }
        return augs;
    } // synthesizedAugs():Augmentations


    //
    // Protected methods
    //


    /** Purify qualified name. */
    protected QName purifyQName(QName qname) {
        qname.prefix = purifyName(qname.prefix, true);
        qname.localpart = purifyName(qname.localpart, true);
        qname.rawname = purifyName(qname.rawname, false);
        return qname;
    } // purifyQName(QName):QName


    /** Purify name. */
    protected String purifyName(String name, boolean localpart) {
        if (name == null) {
            return name;
        }
        StringBuffer str = new StringBuffer();
        int length = name.length();
        boolean seenColon = localpart;
        for (int i = 0; i < length; i++) {
            char c = name.charAt(i);
            if (i == 0) {
                if (!XMLChar.isNameStart(c)) {
                    str.append("_u"+toHexString(c,4)+"_");
                }
                else {
                    str.append(c);
                }
            }
            else {
                if ((fNamespaces && c == ':' && seenColon) || !XMLChar.isName(c)) {
                    str.append("_u"+toHexString(c,4)+"_");
                }
                else {
                    str.append(c);
                }
                seenColon = seenColon || c == ':';
            }
        }
        return str.toString();
    } // purifyName(String):String


    /** Purify content. */
    protected XMLString purifyText(XMLString text) {
        fStringBuffer.length = 0;
        for (int i = 0; i < text.length; i++) {
            char c = text.ch[text.offset+i];
            if (XMLChar.isInvalid(c)) {
                fStringBuffer.append("\\u"+toHexString(c,4));
            }
            else {
                fStringBuffer.append(c);
            }
        }
        return fStringBuffer;
    } // purifyText(XMLString):XMLString


    //
    // Protected static methods
    //


    /** Returns a padded hexadecimal string for the given value. */
    protected static String toHexString(int c, int padlen) {
        StringBuffer str = new StringBuffer(padlen);
        str.append(Integer.toHexString(c));
        int len = padlen - str.length();
        for (int i = 0; i < len; i++) {
            str.insert(0, '0');
        }
        return str.toString().toUpperCase();
    } // toHexString(int,int):String


} // class Purifier
Source Code of org.cyberneko.html.filters.Purifier

Related Classes of org.cyberneko.html.filters.Purifier