Package org.jboss.elasticsearch.tools.content

Source Code of org.jboss.elasticsearch.tools.content.StripHtmlPreprocessor

/*
* JBoss, Home of Professional Open Source
* Copyright 2012 Red Hat Inc. and/or its affiliates and other contributors
* as indicated by the @authors tag. All rights reserved.
*/
package org.jboss.elasticsearch.tools.content;

import java.util.List;
import java.util.Map;

import org.elasticsearch.common.settings.SettingsException;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;

/**
* Content preprocessor which takes String value from source field, strip html tags from it, unescape html entities (
* <code>&amp;lt;</code>, <code>&amp;gt;</code>, <code>&amp;amp;</code> atd) and store result to another or same target
* field. Example of configuration for this preprocessor:
*
* <pre>
* {
*     "name"     : "HTML content to text description convertor",
*     "class"    : "org.jboss.elasticsearch.tools.content.StripHtmlPreprocessor",
*     "settings" : {
*         "source_field"  : "content",
*         "target_field"  : "description"
*     }
* }
* </pre>
*
* Options are:
* <ul>
* <li><code>source_field</code> - source field in input data. Dot notation for nested values can be used here (see
* {@link XContentMapValues#extractValue(String, Map)}).
* <li><code>target_field</code> - target field in data to store mapped value into. Can be same as input field. Dot
* notation can be used here for structure nesting.
* <li><code>source_bases</code> - list of fields in source data which are used as bases for stripping. If defined then
* stripping is performed for each of this fields, <code>source_field</code> and <code>target_field</code> are resolved
* relatively against this base. Base must provide object or list of objects.
* </ul>
*
* @author Vlastimil Elias (velias at redhat dot com)
* @see StructuredContentPreprocessorFactory
*/
public class StripHtmlPreprocessor extends StructuredContentPreprocessorWithSourceBasesBase<Object> {

  protected static final String CFG_SOURCE_FIELD = "source_field";
  protected static final String CFG_TARGET_FIELD = "target_field";

  protected String fieldSource;
  protected String fieldTarget;

  @Override
  public void init(Map<String, Object> settings) throws SettingsException {
    super.init(settings);
    fieldSource = XContentMapValues.nodeStringValue(settings.get(CFG_SOURCE_FIELD), null);
    validateConfigurationStringNotEmpty(fieldSource, CFG_SOURCE_FIELD);
    fieldTarget = XContentMapValues.nodeStringValue(settings.get(CFG_TARGET_FIELD), null);
    validateConfigurationStringNotEmpty(fieldTarget, CFG_TARGET_FIELD);
  }

  @Override
  protected Object createContext(Map<String, Object> data) {
    return null;
  }

  @Override
  protected void processOneSourceValue(Map<String, Object> data, Object context, String base,
      PreprocessChainContext chainContext) {
    Object v = null;
    if (fieldSource.contains(".")) {
      v = XContentMapValues.extractValue(fieldSource, data);
    } else {
      v = data.get(fieldSource);
    }

    if (v != null) {
      if (!(v instanceof String)) {
        String msg = "Value for field '" + getFullFieldName(base, fieldSource)
            + "' is not String, so can't be processed";
        addDataWarning(chainContext, msg);
        logger.debug(msg);
      } else {
        String value = stripHtml(v.toString());
        StructureUtils.putValueIntoMapOfMaps(data, fieldTarget, value);
      }
    }
  }

  protected String stripHtml(String value) {
    if (value == null || value.trim().isEmpty())
      return value;
    Document doc = Jsoup.parse(Jsoup.clean(value, Whitelist.relaxed()));
    return convertNodeToText(doc.body());
  }

  protected String convertNodeToText(Element element) {
    if (element == null)
      return "";
    final StringBuilder buffer = new StringBuilder();
    new NodeTraversor(new NodeVisitor() {
      @Override
      public void head(Node node, int depth) {
        if (node instanceof TextNode) {
          TextNode textNode = (TextNode) node;
          String text = textNode.text().replace('\u00A0', ' ').trim(); // non breaking space
          if (!text.isEmpty()) {
            buffer.append(text);
            if (!text.endsWith(" ")) {
              buffer.append(" "); // the last text gets appended the extra space too but we remove it later
            }
          }
        }
      }

      @Override
      public void tail(Node node, int depth) {
      }
    }).traverse(element);
    String output = buffer.toString();
    if (output.endsWith(" ")) { // removal of the last extra space
      output = output.substring(0, output.length() - 1);
    }
    return output;
  }

  public String getFieldSource() {
    return fieldSource;
  }

  public String getFieldTarget() {
    return fieldTarget;
  }

  public List<String> getSourceBases() {
    return sourceBases;
  }

}
TOP

Related Classes of org.jboss.elasticsearch.tools.content.StripHtmlPreprocessor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.