/*
* JBoss, Home of Professional Open Source
* Copyright 2012 Red Hat Inc. and/or its affiliates and other contributors
* as indicated by the @authors tag. All rights reserved.
*/
package org.jboss.elasticsearch.tools.content;
import java.util.List;
import java.util.Map;
import org.elasticsearch.common.settings.SettingsException;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
/**
* Content preprocessor which takes a String value from the source field, strips HTML tags from it, unescapes HTML
* entities (<code>&lt;</code>, <code>&gt;</code>, <code>&amp;</code> etc.) and stores the result to another or the
* same target field. Example of configuration for this preprocessor:
*
* <pre>
* {
* "name" : "HTML content to text description convertor",
* "class" : "org.jboss.elasticsearch.tools.content.StripHtmlPreprocessor",
* "settings" : {
* "source_field" : "content",
* "target_field" : "description"
* }
* }
* </pre>
*
* Options are:
* <ul>
* <li><code>source_field</code> - source field in input data. Dot notation for nested values can be used here (see
* {@link XContentMapValues#extractValue(String, Map)}).
* <li><code>target_field</code> - target field in data to store mapped value into. Can be same as input field. Dot
* notation can be used here for structure nesting.
* <li><code>source_bases</code> - list of fields in source data which are used as bases for stripping. If defined then
* stripping is performed for each of these fields; <code>source_field</code> and <code>target_field</code> are
* resolved relative to this base. Base must provide an object or a list of objects.
* </ul>
*
* @author Vlastimil Elias (velias at redhat dot com)
* @see StructuredContentPreprocessorFactory
*/
public class StripHtmlPreprocessor extends StructuredContentPreprocessorWithSourceBasesBase<Object> {

  /** Name of the configuration setting holding the source field name. */
  protected static final String CFG_SOURCE_FIELD = "source_field";

  /** Name of the configuration setting holding the target field name. */
  protected static final String CFG_TARGET_FIELD = "target_field";

  /** Field the HTML value is read from; set in {@link #init(Map)}. */
  protected String fieldSource;

  /** Field the stripped text is written to; set in {@link #init(Map)}. */
  protected String fieldTarget;

  /**
   * Reads and validates the <code>source_field</code> and <code>target_field</code> settings after delegating to the
   * superclass initialization.
   *
   * @param settings configuration of this preprocessor
   * @throws SettingsException declared by this method; presumably raised by the validation calls when a required
   *           setting is missing or empty (validation is implemented outside of this class)
   */
  @Override
  public void init(Map<String, Object> settings) throws SettingsException {
    super.init(settings);
    fieldSource = XContentMapValues.nodeStringValue(settings.get(CFG_SOURCE_FIELD), null);
    validateConfigurationStringNotEmpty(fieldSource, CFG_SOURCE_FIELD);
    fieldTarget = XContentMapValues.nodeStringValue(settings.get(CFG_TARGET_FIELD), null);
    validateConfigurationStringNotEmpty(fieldTarget, CFG_TARGET_FIELD);
  }

  /**
   * This preprocessor keeps no per-call context.
   *
   * @param data data to be processed (unused)
   * @return always <code>null</code>
   */
  @Override
  protected Object createContext(Map<String, Object> data) {
    return null;
  }

  /**
   * Reads the source field from <code>data</code>, strips HTML from it and stores the result into the target field of
   * the same <code>data</code> structure. Nothing is done when the source value is <code>null</code>. When the source
   * value is not a String, a data warning is added, the same message is logged on debug level and no value is stored.
   *
   * @param data structure to read the source value from and to store the result into
   * @param context unused by this preprocessor
   * @param base name of the base being processed, used only to build the field name in the warning message
   * @param chainContext context the data warning is reported to
   */
  @Override
  protected void processOneSourceValue(Map<String, Object> data, Object context, String base,
      PreprocessChainContext chainContext) {
    Object v;
    if (fieldSource.contains(".")) {
      // dot notation means nested value
      v = XContentMapValues.extractValue(fieldSource, data);
    } else {
      v = data.get(fieldSource);
    }
    if (v == null) {
      return;
    }
    if (!(v instanceof String)) {
      String msg = "Value for field '" + getFullFieldName(base, fieldSource)
          + "' is not String, so can't be processed";
      addDataWarning(chainContext, msg);
      logger.debug(msg);
      return;
    }
    StructureUtils.putValueIntoMapOfMaps(data, fieldTarget, stripHtml((String) v));
  }

  /**
   * Converts HTML content into plain text. The value is first cleaned by jsoup using the relaxed whitelist, then
   * parsed, and the text of the resulting document body is extracted by {@link #convertNodeToText(Element)}.
   *
   * @param value HTML content to process
   * @return plain text; the input itself (unchanged) when it is <code>null</code> or contains only whitespace
   */
  protected String stripHtml(String value) {
    if (value == null || value.trim().isEmpty()) {
      return value;
    }
    Document doc = Jsoup.parse(Jsoup.clean(value, Whitelist.relaxed()));
    return convertNodeToText(doc.body());
  }

  /**
   * Collects text of all {@link TextNode}s found while traversing the element. In each piece of text the non breaking
   * space (<code>\u00A0</code>) is replaced by a common space and the text is trimmed; pieces which are empty after
   * that are skipped. Remaining pieces are joined together by one space.
   *
   * @param element to get text from
   * @return collected text, empty String if element is <code>null</code>
   */
  protected String convertNodeToText(Element element) {
    if (element == null) {
      return "";
    }
    final StringBuilder buffer = new StringBuilder();
    new NodeTraversor(new NodeVisitor() {
      @Override
      public void head(Node node, int depth) {
        if (!(node instanceof TextNode)) {
          return;
        }
        String text = ((TextNode) node).text().replace('\u00A0', ' ').trim(); // non breaking space
        if (text.isEmpty()) {
          return;
        }
        // Text is trimmed so it never ends with a space itself. Separator is written before every piece except of
        // the first one, so there is no trailing space to be removed at the end.
        if (buffer.length() > 0) {
          buffer.append(' ');
        }
        buffer.append(text);
      }

      @Override
      public void tail(Node node, int depth) {
        // nothing to do when leaving the node
      }
    }).traverse(element);
    return buffer.toString();
  }

  /**
   * @return name of the field the source value is read from
   */
  public String getFieldSource() {
    return fieldSource;
  }

  /**
   * @return name of the field the stripped value is stored into
   */
  public String getFieldTarget() {
    return fieldTarget;
  }

  /**
   * @return the <code>sourceBases</code> list (not declared in this class, presumably inherited from the superclass);
   *         the same instance is returned, no copy is made
   */
  public List<String> getSourceBases() {
    return sourceBases;
  }
}