// Source code of info.bliki.htmlcleaner.util.AbstractHtmlExtractor

package info.bliki.htmlcleaner.util;


import info.bliki.htmlcleaner.BaseToken;
import info.bliki.htmlcleaner.HtmlCleaner;
import info.bliki.htmlcleaner.TagNode;

import java.io.IOException;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;




public abstract class AbstractHtmlExtractor<T> {
  final T fResultObject;


  public AbstractHtmlExtractor(T resultObject) {
    super();
    this.fResultObject = resultObject;
  }


  /**
   * Append the content of the nodes to the given result object.
   * 
   * @param nodes
   */
  protected abstract void appendContent(List<Object> nodes);


  /**
   * Append the content of the given <code>TagNode</code> to the resultObject
   * 
   * @param tagNode
   * @return <code>true</code> if <code>appendContent()</code> should be
   *         called.
   */
  protected abstract boolean isFound(TagNode tagNode);


  protected T getResultObject() {
    return fResultObject;
  }


  protected void visitTokenList(List<Object> nodes) {
    if (nodes != null && !nodes.isEmpty()) {
      for (Object item : nodes) {
        if (item != null) {
          if (item instanceof List) {
            @SuppressWarnings("unchecked")
            final List<Object> list = (List<Object>) item;
            visitTokenList(list);
          } else if (item instanceof BaseToken) {
            visitBaseToken((BaseToken) item);
          }
        }
      }
    }
  }


  protected void visitBaseToken(BaseToken node) {
    if (node instanceof TagNode) {
      TagNode tagNode = (TagNode) node;
      if (isFound(tagNode)) {
        appendContent(tagNode.getChildren());
      } else {
        List<Object> children = tagNode.getChildren();
        if (children.size() != 0) {
          visitTokenList(children);
        }
      }
    }
  }


  /**
   * Extract the information from the given html text.
   * 
   * @param html
   */
  public void extractContent(String html) {
    HtmlCleaner cleaner = null;
    try {
      cleaner = new HtmlCleaner(html);
      cleaner.clean();
      TagNode body = cleaner.getBodyNode();
      visitBaseToken(body);
    } catch (IOException e) {
    }
  }
}
// Source code of info.bliki.htmlcleaner.util.AbstractHtmlExtractor

// Related classes of info.bliki.htmlcleaner.util.AbstractHtmlExtractor