/**
* OLAT - Online Learning and Training<br>
* http://www.olat.org
* <p>
* Licensed under the Apache License, Version 2.0 (the "License"); <br>
* you may not use this file except in compliance with the License.<br>
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing,<br>
* software distributed under the License is distributed on an "AS IS" BASIS, <br>
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br>
* See the License for the specific language governing permissions and <br>
* limitations under the License.
* <p>
* Copyright (c) frentix GmbH<br>
* http://www.frentix.com<br>
* <p>
*/
package org.olat.core.util.filter.impl;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import org.cyberneko.html.parsers.SAXParser;
import org.olat.core.logging.LogDelegator;
import org.olat.core.util.filter.Filter;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
* Description:<br>
* Filters HTML code using the Neko SAX parser and extracts the text content.
* Neko also parses the HTML entities and delivers cleaned text.
*
* <P>
* Initial Date: 2 dec. 2009 <br>
* @author srosse
*/
public class NekoHTMLFilter extends LogDelegator implements Filter {

	/** Fraction of the input length used as the initial capacity of the text buffer. */
	private static final float TEXT_RATIO = 0.66f;

	/** Input length assumed for sizing the buffer when the real length is unknown (stream input). */
	private static final int ASSUMED_STREAM_LENGTH = 1000;

	/**
	 * Extracts the character content of an HTML string. The content of
	 * <code>script</code> elements is dropped, and a single space is inserted
	 * after each closed element unless the text already ends with a space.
	 *
	 * @param original the HTML to filter; may be <code>null</code>
	 * @return the extracted text, or <code>null</code> if <code>original</code>
	 *         is <code>null</code> or the parsing failed (the failure is logged)
	 */
	@Override
	public String filter(String original) {
		if (original == null) {
			// Explicit guard instead of relying on a caught NullPointerException.
			return null;
		}
		int capacity = (int) (original.length() * TEXT_RATIO);
		return extractText(new InputSource(new StringReader(original)), capacity);
	}

	/**
	 * Extracts the character content of an HTML document read from a stream.
	 * Same filtering rules as {@link #filter(String)}. The stream is not closed
	 * by this method.
	 *
	 * @param in the stream delivering the HTML; may be <code>null</code>
	 * @return the extracted text, or <code>null</code> if <code>in</code> is
	 *         <code>null</code> or the parsing failed (the failure is logged)
	 */
	public String filter(InputStream in) {
		if (in == null) {
			return null;
		}
		int capacity = (int) (ASSUMED_STREAM_LENGTH * TEXT_RATIO);
		return extractText(new InputSource(in), capacity);
	}

	/**
	 * Parses the source with a fresh parser and collects its text.
	 *
	 * @param source the HTML input
	 * @param capacity initial capacity of the text buffer
	 * @return the collected text, or <code>null</code> if any exception occurred
	 */
	private String extractText(InputSource source, int capacity) {
		try {
			SAXParser parser = new SAXParser();
			HTMLHandler contentHandler = new HTMLHandler(capacity);
			parser.setContentHandler(contentHandler);
			parser.parse(source);
			return contentHandler.toString();
		} catch (Exception e) {
			// Covers SAXException, IOException and runtime failures alike:
			// every failure is logged and reported to the caller as null.
			logError("Cannot extract text from HTML content", e);
			return null;
		}
	}

	/**
	 * SAX handler which accumulates character data, skipping everything
	 * reported between the start and the end of a <code>script</code> element.
	 * Static because it needs no access to the enclosing filter.
	 */
	private static final class HTMLHandler extends DefaultHandler {

		private static final String SCRIPT = "script";

		/** <code>false</code> while inside a script element. */
		private boolean collect = true;
		private final StringBuilder sb;

		public HTMLHandler(int size) {
			sb = new StringBuilder(size);
		}

		@Override
		public void startElement(String uri, String localName, String qName, Attributes attributes) {
			// equalsIgnoreCase is locale-independent; toLowerCase() with the
			// default locale turns "SCRIPT" into "scrıpt" under a Turkish
			// locale and the element would not be recognized.
			if (SCRIPT.equalsIgnoreCase(localName)) {
				collect = false;
			}
		}

		@Override
		public void characters(char[] chars, int offset, int length) {
			if (collect) {
				sb.append(chars, offset, length);
			}
		}

		@Override
		public void endElement(String uri, String localName, String qName) {
			if (SCRIPT.equalsIgnoreCase(localName)) {
				collect = true;
			}
			// Separate the text of adjacent elements with exactly one space.
			if (sb.length() > 0 && sb.charAt(sb.length() - 1) != ' ') {
				sb.append(' ');
			}
		}

		/**
		 * @return the text collected so far
		 */
		@Override
		public String toString() {
			return sb.toString();
		}
	}
}