Package nu.validator.htmlparser.gwt

Source Code of nu.validator.htmlparser.gwt.HtmlParser

/*
* Copyright (c) 2007 Henri Sivonen
* Copyright (c) 2007-2008 Mozilla Foundation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/

package nu.validator.htmlparser.gwt;

import java.util.LinkedList;

import nu.validator.htmlparser.common.XmlViolationPolicy;
import nu.validator.htmlparser.impl.ErrorReportingTokenizer;
import nu.validator.htmlparser.impl.Tokenizer;
import nu.validator.htmlparser.impl.UTF16Buffer;

import org.xml.sax.ErrorHandler;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

import com.google.gwt.core.client.JavaScriptObject;
import com.google.gwt.user.client.Timer;

/**
* This class implements an HTML5 parser that exposes data through the DOM
* interface.
*
* <p>By default, when using the constructor without arguments, the
* this parser treats XML 1.0-incompatible infosets as fatal errors.
* This corresponds to
* <code>FATAL</code> as the general XML violation policy. To make the parser
* support non-conforming HTML fully per the HTML 5 spec while on the other
* hand potentially violating the DOM API contract, set the general XML
* violation policy to <code>ALLOW</code>. This does not work with a standard
* DOM implementation. Handling all input without fatal errors and without
* violating the DOM API contract is possible by setting
* the general XML violation policy to <code>ALTER_INFOSET</code>. <em>This
* makes the parser non-conforming</em> but is probably the most useful
* setting for most applications.
*
* <p>The doctype is not represented in the tree.
*
* <p>The document mode is represented as user data <code>DocumentMode</code>
* object with the key <code>nu.validator.document-mode</code> on the document
* node.
*
* <p>The form pointer is also stored as user data with the key
* <code>nu.validator.form-pointer</code>.
*
* @version $Id: HtmlDocumentBuilder.java 255 2008-05-29 08:57:38Z hsivonen $
* @author hsivonen
*/
public class HtmlParser {

    private static final int CHUNK_SIZE = 512;
   
    private final Tokenizer tokenizer;

    private final BrowserTreeBuilder domTreeBuilder;

    private final StringBuilder documentWriteBuffer = new StringBuilder();

    private ErrorHandler errorHandler;

    private UTF16Buffer stream;

    private int streamLength;

    private boolean lastWasCR;

    private boolean ending;

    private ParseEndListener parseEndListener;

    private final LinkedList<UTF16Buffer> bufferStack = new LinkedList<UTF16Buffer>();

    /**
     * Instantiates the parser
     *
     * @param implementation
     *            the DOM implementation
     *            @param xmlPolicy the policy
     */
    public HtmlParser(JavaScriptObject document) {
        this.domTreeBuilder = new BrowserTreeBuilder(document);
        this.tokenizer = new ErrorReportingTokenizer(domTreeBuilder);
        this.domTreeBuilder.setNamePolicy(XmlViolationPolicy.ALTER_INFOSET);
        this.tokenizer.setCommentPolicy(XmlViolationPolicy.ALTER_INFOSET);
        this.tokenizer.setContentNonXmlCharPolicy(XmlViolationPolicy.ALTER_INFOSET);
        this.tokenizer.setContentSpacePolicy(XmlViolationPolicy.ALTER_INFOSET);
        this.tokenizer.setNamePolicy(XmlViolationPolicy.ALTER_INFOSET);
        this.tokenizer.setXmlnsPolicy(XmlViolationPolicy.ALTER_INFOSET);
    }

    /**
     * Parses a document from a SAX <code>InputSource</code>.
     * @param is the source
     * @return the doc
     * @see javax.xml.parsers.DocumentBuilder#parse(org.xml.sax.InputSource)
     */
    public void parse(String source, boolean useSetTimeouts, ParseEndListener callback) throws SAXException {
        parseEndListener = callback;
        domTreeBuilder.setFragmentContext(null);
        tokenize(source, useSetTimeouts, null);  
    }
    /**
     * @param is
     * @throws SAXException
     * @throws IOException
     * @throws MalformedURLException
     */
    private void tokenize(String source, final boolean useSetTimeouts, String context) throws SAXException {
       
                lastWasCR = false;
                ending = false;
                documentWriteBuffer.setLength(0);
                streamLength = source.length();
                stream = new UTF16Buffer(source.toCharArray(), 0,
                        (streamLength < CHUNK_SIZE ? streamLength : CHUNK_SIZE));
                bufferStack.clear();
                push(stream);
                domTreeBuilder.setFragmentContext(context == null ? null : context.intern());
               
                tokenizer.start();
                pump(useSetTimeouts);
    }

    private void pump(boolean useSetTimeouts) throws SAXException {

       
        if (ending) {
            tokenizer.end();
            domTreeBuilder.getDocument(); // drops the internal reference
            parseEndListener.parseComplete();
            // Don't schedule timeout
            return;
        }

        int docWriteLen = documentWriteBuffer.length();
        if (docWriteLen > 0) {
            char[] newBuf = new char[docWriteLen];
            documentWriteBuffer.getChars(0, docWriteLen, newBuf, 0);
            push(new UTF16Buffer(newBuf, 0, docWriteLen));
            documentWriteBuffer.setLength(0);
        }

        for (;;) {
            UTF16Buffer buffer = peek();
            if (!buffer.hasMore()) {
                if (buffer == stream) {
                    if (buffer.getEnd() == streamLength) {
                        // Stop parsing
                        tokenizer.eof();
                        ending = true;
                        break;
                    } else {
                        int newEnd = buffer.getStart() + CHUNK_SIZE;
                        buffer.setEnd(newEnd < streamLength ? newEnd
                                : streamLength);
                        continue;
                    }
                } else {
                    pop();
                    continue;
                }
            }
            // now we have a non-empty buffer
            buffer.adjust(lastWasCR);
            lastWasCR = false;
            if (buffer.hasMore()) {
                lastWasCR = tokenizer.tokenizeBuffer(buffer);
                domTreeBuilder.maybeRunScript();
                break;
            } else {
                continue;
            }
        }

        if(useSetTimeouts){
            // schedule
            Timer timer = new Timer() {
   
                @Override public void run() {
                    try {
                        pump(true);
                    } catch (SAXException e) {
                        ending = true;
                        if (errorHandler != null) {
                            try {
                                errorHandler.fatalError(new SAXParseException(
                                        e.getMessage(), null, null, -1, -1, e));
                            } catch (SAXException e1) {
                            }
                        }
                    }
                }
   
            };
            timer.schedule(1);
        }else{
            try {
                pump(false);
            } catch (SAXException e) {
                ending = true;
                if (errorHandler != null) {
                    try {
                        errorHandler.fatalError(new SAXParseException(
                                e.getMessage(), null, null, -1, -1, e));
                    } catch (SAXException e1) {
                    }
                }
            }
        }
    }

    private void push(UTF16Buffer buffer) {
        bufferStack.addLast(buffer);
    }

    private UTF16Buffer peek() {
        return bufferStack.getLast();
    }

    private void pop() {
        bufferStack.removeLast();
    }

    public void documentWrite(final String text) throws SAXException {

      UTF16Buffer buffer = new UTF16Buffer(text.toCharArray(), 0, text.length());
      while (buffer.hasMore()) {
          buffer.adjust(lastWasCR);
          lastWasCR = false;
          if (buffer.hasMore()) {
              lastWasCR = tokenizer.tokenizeBuffer(buffer);           
              domTreeBuilder.maybeRunScript();
          }
      }
    }

    /**
     * @see javax.xml.parsers.DocumentBuilder#setErrorHandler(org.xml.sax.ErrorHandler)
     */
    public void setErrorHandler(ErrorHandler errorHandler) {
        this.errorHandler = errorHandler;
        domTreeBuilder.setErrorHandler(errorHandler);
        tokenizer.setErrorHandler(errorHandler);
    }

    /**
     * Sets whether comment nodes appear in the tree.
     * @param ignoreComments <code>true</code> to ignore comments
     * @see nu.validator.htmlparser.impl.TreeBuilder#setIgnoringComments(boolean)
     */
    public void setIgnoringComments(boolean ignoreComments) {
        domTreeBuilder.setIgnoringComments(ignoreComments);
    }

    /**
     * Sets whether the parser considers scripting to be enabled for noscript treatment.
     * @param scriptingEnabled <code>true</code> to enable
     * @see nu.validator.htmlparser.impl.TreeBuilder#setScriptingEnabled(boolean)
     */
    public void setScriptingEnabled(boolean scriptingEnabled) {
        domTreeBuilder.setScriptingEnabled(scriptingEnabled);
    }

}
TOP

Related Classes of nu.validator.htmlparser.gwt.HtmlParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.