Package info.bliki.htmlcleaner.util

Source Code of info.bliki.htmlcleaner.util.AbstractHtmlExtractor

package info.bliki.htmlcleaner.util;

import info.bliki.htmlcleaner.BaseToken;
import info.bliki.htmlcleaner.HtmlCleaner;
import info.bliki.htmlcleaner.TagNode;

import java.io.IOException;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;


public abstract class AbstractHtmlExtractor<T> {
  final T fResultObject;

  public AbstractHtmlExtractor(T resultObject) {
    super();
    this.fResultObject = resultObject;
  }

  /**
   * Append the content of the nodes to the given result object.
   *
   * @param nodes
   */
  protected abstract void appendContent(List<Object> nodes);

  /**
   * Append the content of the given <code>TagNode</code> to the resultObject
   *
   * @param tagNode
   * @return <code>true</code> if <code>appendContent()</code> should be
   *         called.
   */
  protected abstract boolean isFound(TagNode tagNode);

  protected T getResultObject() {
    return fResultObject;
  }

  protected void visitTokenList(List<Object> nodes) {
    if (nodes != null && !nodes.isEmpty()) {
      for (Object item : nodes) {
        if (item != null) {
          if (item instanceof List) {
            @SuppressWarnings("unchecked")
            final List<Object> list = (List<Object>) item;
            visitTokenList(list);
          } else if (item instanceof BaseToken) {
            visitBaseToken((BaseToken) item);
          }
        }
      }
    }
  }

  protected void visitBaseToken(BaseToken node) {
    if (node instanceof TagNode) {
      TagNode tagNode = (TagNode) node;
      if (isFound(tagNode)) {
        appendContent(tagNode.getChildren());
      } else {
        List<Object> children = tagNode.getChildren();
        if (children.size() != 0) {
          visitTokenList(children);
        }
      }
    }
  }

  /**
   * Extract the information from the given html text.
   *
   * @param html
   */
  public void extractContent(String html) {
    HtmlCleaner cleaner = null;
    try {
      cleaner = new HtmlCleaner(html);
      cleaner.clean();
      TagNode body = cleaner.getBodyNode();
      visitBaseToken(body);
    } catch (IOException e) {
    }
  }
}
TOP

Related Classes of info.bliki.htmlcleaner.util.AbstractHtmlExtractor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.