Package org.apache.shindig.gadgets.parse.nekohtml

Source Code of org.apache.shindig.gadgets.parse.nekohtml.NekoHtmlParser

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
*/
package org.apache.shindig.gadgets.parse.nekohtml;

import org.apache.shindig.gadgets.GadgetException;
import org.apache.shindig.gadgets.parse.GadgetHtmlParser;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import java.io.IOException;
import java.io.StringReader;

import com.google.inject.Inject;
import com.google.inject.Singleton;

/**
* Parser that uses the NekoHtml parser and produces an un-abridged DOM
*
* TODO: Create a reusable instance in ThreadLocal
*/
@Singleton
public class NekoHtmlParser extends GadgetHtmlParser {

  private final DOMImplementation documentProvider;

  @Inject
  public NekoHtmlParser(DOMImplementation documentProvider) {
    this.documentProvider = documentProvider;
  }

  @Override
  public Document parseDomImpl(String source) throws GadgetException {
    try {
      return parseDomInternal(source);
    } catch (Exception e) {
      throw new GadgetException(GadgetException.Code.HTML_PARSE_ERROR, e);
    }
  }

  private Document parseDomInternal(String source) throws SAXException, IOException, GadgetException {
    if (attemptFullDocParseFirst(source)) {
      InputSource input = new InputSource(new StringReader(source));
      DOMParser parser = new DOMParser();
      // Force parser not to use HTMLDocumentImpl as document implementation otherwise
      // it forces all element names to uppercase.
      parser.setProperty("http://apache.org/xml/properties/dom/document-class-name",
          "org.apache.xerces.dom.DocumentImpl");
      // Dont convert element names to upper/lowercase
      parser.setProperty("http://cyberneko.org/html/properties/names/elems", "default");
      // Preserve case of attributes
      parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "no-change");
      // Record entity references
      parser.setFeature("http://apache.org/xml/features/scanner/notify-char-refs", true);
      parser.setFeature("http://cyberneko.org/html/features/scanner/notify-builtin-refs", true);
      // No need to defer as full DOM is walked later
      parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
      parser.parse(input);
      return parser.getDocument();
    } else {
      DocumentFragment fragment = parseFragmentImpl(source);
      normalizeFragment(fragment.getOwnerDocument(), fragment);
      return fragment.getOwnerDocument();
    }
  }
 
  @Override
  protected DocumentFragment parseFragmentImpl(String source) throws GadgetException {
    try {
      Document htmlDoc = documentProvider.createDocument(null, null, null);
      // Workaround for error check failure adding text node to entity ref as a child
      htmlDoc.setStrictErrorChecking(false);
      DocumentFragment fragment = htmlDoc.createDocumentFragment();
      InputSource input = new InputSource(new StringReader(source));
      DOMFragmentParser parser = new DOMFragmentParser();
      parser.setProperty("http://cyberneko.org/html/properties/names/elems", "default");
      parser.setFeature("http://cyberneko.org/html/features/document-fragment", true);
      parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "no-change");
      parser.setFeature("http://apache.org/xml/features/scanner/notify-char-refs", true);
      parser.setFeature("http://cyberneko.org/html/features/scanner/notify-builtin-refs", true);
      parser.parse(input, fragment);
      return fragment;
    } catch (Exception e) {
      throw new GadgetException(GadgetException.Code.HTML_PARSE_ERROR, e);
    }
  }
 
}
TOP

Related Classes of org.apache.shindig.gadgets.parse.nekohtml.NekoHtmlParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.