Package org.olat.core.util.filter.impl

Source Code of org.olat.core.util.filter.impl.NekoHTMLFilter

/**
* OLAT - Online Learning and Training<br>
* http://www.olat.org
* <p>
* Licensed under the Apache License, Version 2.0 (the "License"); <br>
* you may not use this file except in compliance with the License.<br>
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing,<br>
* software distributed under the License is distributed on an "AS IS" BASIS, <br>
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br>
* See the License for the specific language governing permissions and <br>
* limitations under the License.
* <p>
* Copyright (c) frentix GmbH<br>
* http://www.frentix.com<br>
* <p>
*/
package org.olat.core.util.filter.impl;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;

import org.cyberneko.html.parsers.SAXParser;
import org.olat.core.logging.LogDelegator;
import org.olat.core.util.filter.Filter;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
* Description:<br>
* Filters HTML code using the Neko SAX parser and extracts the text content.
* Neko also parses the HTML entities and delivers cleaned text.
*
* <P>
* Initial Date:  2 dec. 2009 <br>
* @author srosse
*/
public class NekoHTMLFilter extends LogDelegator implements Filter {

  @Override
  public String filter(String original) {
    try {
      SAXParser parser = new SAXParser();
      HTMLHandler contentHandler = new HTMLHandler((int)((float)original.length() * 0.66f));
      parser.setContentHandler(contentHandler);
      parser.parse(new InputSource(new StringReader(original)));
      return contentHandler.toString();
    } catch (SAXException e) {
      logError("", e);
      return null;
    } catch (IOException e) {
      logError("", e);
      return null;
    } catch (Exception e) {
      logError("", e);
      return null;
    }
  }

  public String filter(InputStream in) {
    try {
      SAXParser parser = new SAXParser();
      HTMLHandler contentHandler = new HTMLHandler((int)(1000 * 0.66f));
      parser.setContentHandler(contentHandler);
      parser.parse(new InputSource(in));
      return contentHandler.toString();
    } catch (SAXException e) {
      logError("", e);
      return null;
    } catch (IOException e) {
      logError("", e);
      return null;
    } catch (Exception e) {
      logError("", e);
      return null;
    }
  }
 
  private class HTMLHandler extends DefaultHandler {
    private boolean collect = true;
    private final StringBuilder sb;
   
    public HTMLHandler(int size) {
      sb = new StringBuilder(size);
    }

    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) {
      if("script".equals(localName.toLowerCase())) {
        collect = false;
      }
    }
   
    @Override
    public void characters(char[] chars, int offset, int length) {
      if(collect) {
        sb.append(chars, offset, length);
      }
    }

    @Override
    public void endElement(String uri, String localName, String qName) {
      if("script".equals(localName.toLowerCase())) {
        collect = true;
      }
      if (sb.length() > 0 && sb.charAt(sb.length() - 1) != ' ') {
        sb.append(' ');
      }
    }

    public String toString() {
      return sb.toString();
    }
  }
}
TOP

Related Classes of org.olat.core.util.filter.impl.NekoHTMLFilter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.