Examples of HtmlCleaner

com.qkyrie.markdown2pdf.internal.converting.HtmlCleaner
User: Quinten Date: 31-3-2014 Time: 09:44 @author Quinten De Swaef
info.bliki.htmlcleaner.HtmlCleaner
of few constructors cleaner.setXXX(...) // optionally, set cleaner's behaviour cleaner.clean(); // calls cleaning process cleaner.writeXmlXXX(...) // writes resulting XML to string, file or any output stream Created by: Vladimir Nikic
Date: November, 2006
no.priv.garshol.duke.cleaners.HTMLCleaner
A cleaner that removes HTML-style entity references, such as &amp;THORN; and &amp;mdash;. @since 1.3
org.apache.wookie.util.html.HtmlCleaner
An HTML processor implemented using HtmlCleaner
org.htmlcleaner.HtmlCleaner
// create an instance of HtmlCleaner HtmlCleaner cleaner = new HtmlCleaner(); // take default cleaner properties CleanerProperties props = cleaner.getProperties(); // customize cleaner's behaviour with property setters props.setXXX(...); // Clean HTML taken from simple string, file, URL, input stream, // input source or reader. Result is root node of created // tree-like structure. Single cleaner instance may be safely used // multiple times. TagNode node = cleaner.clean(...); // optionally find parts of the DOM or modify some nodes TagNode[] myNodes = node.getElementsByXXX(...); // and/or Object[] myNodes = node.evaluateXPath(xPathExpression); // and/or aNode.removeFromTree(); // and/or aNode.addAttribute(attName, attValue); // and/or aNode.removeAttribute(attName, attValue); // and/or cleaner.setInnerHtml(aNode, htmlContent); // and/or do some other tree manipulation/traversal // serialize a node to a file, output stream, DOM, JDom... new XXXSerializer(props).writeXmlXXX(aNode, ...); myJDom = new JDomSerializer(props, true).createJDom(aNode); myDom = new DomSerializer(props, true).createDOM(aNode);
org.outerj.daisy.diff.HtmlCleaner
org.outerj.daisy.htmlcleaner.HtmlCleaner

Examples of org.htmlcleaner.HtmlCleaner

    return getError(response);
  }
  
  private static String getError(String response) throws IOException{
    String error = null;
    HtmlCleaner cleaner = new HtmlCleaner();
    TagNode html = cleaner.clean(response);
    TagNode errortag = html.findElementByAttValue("id", "error", true, true);
    if (errortag != null){
      error = errortag.getAttributeByName("title");
    }
    return error;

View Full Code Here

Examples of org.htmlcleaner.HtmlCleaner

    private CleanerProperties parserProps;
    private DomSerializer2 domCreator;




    public HtmlParser() {
        this.htmlToXmlParser = new HtmlCleaner();
        this.parserProps = this.htmlToXmlParser.getProperties();
        this.parserProps.setRecognizeUnicodeChars(true);
        this.parserProps.setUseEmptyElementTags(true);
        this.parserProps.setAdvancedXmlEscape(true);
        this.parserProps.setTranslateSpecialEntities(true);

View Full Code Here

Examples of org.htmlcleaner.HtmlCleaner

      String charset = get.getRequestCharSet();


      //
      // Check for charset overrides in the HTML start page
      //
      HtmlCleaner cleaner = new HtmlCleaner();
 
      TagNode httpEquivNode = cleaner.clean(get.getResponseBodyAsStream()).findElementByAttValue("http-equiv", "content-type", true, false);
      if (httpEquivNode != null && httpEquivNode.hasAttribute("content")){
        String value = httpEquivNode.getAttributeByName("content");
        int offset = value.indexOf("charset=");
        if (offset >= -1){
            charset = value.substring(offset+8).toUpperCase();

View Full Code Here

Examples of org.htmlcleaner.HtmlCleaner

    }


    @Override
    public String select(String text) {
        try {
            HtmlCleaner htmlCleaner = new HtmlCleaner();
            TagNode tagNode = htmlCleaner.clean(text);
            Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
            Object result;
            try {
                result = xPathExpression.evaluate(document, XPathConstants.NODESET);
            } catch (XPathExpressionException e) {

View Full Code Here

Examples of org.htmlcleaner.HtmlCleaner


    @Override
    public List<String> selectList(String text) {
        List<String> results = new ArrayList<String>();
        try {
            HtmlCleaner htmlCleaner = new HtmlCleaner();
            TagNode tagNode = htmlCleaner.clean(text);
            Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
            Object result;
            try {
                result = xPathExpression.evaluate(document, XPathConstants.NODESET);
            } catch (XPathExpressionException e) {

View Full Code Here

Examples of org.htmlcleaner.HtmlCleaner

    @Ignore("take long time")
    @Test
    public void parserPerformanceTest() throws XPatherException {
        System.out.println(html.length());


        HtmlCleaner htmlCleaner = new HtmlCleaner();
        TagNode tagNode = htmlCleaner.clean(html);
        Document document = Jsoup.parse(html);


        long time =System.currentTimeMillis();
        for (int i = 0; i < 2000; i++) {
            htmlCleaner.clean(html);
        }
        System.out.println(System.currentTimeMillis()-time);


        time =System.currentTimeMillis();
        for (int i = 0; i < 2000; i++) {
            tagNode.evaluateXPath("//a");
        }
        System.out.println(System.currentTimeMillis()-time);


        System.out.println("=============");


        time =System.currentTimeMillis();
        for (int i = 0; i < 2000; i++) {
            Jsoup.parse(html);
        }
        System.out.println(System.currentTimeMillis()-time);


        time =System.currentTimeMillis();
        for (int i = 0; i < 2000; i++) {
            document.select("a");
        }
        System.out.println(System.currentTimeMillis()-time);


        System.out.println("=============");


        time =System.currentTimeMillis();
        for (int i = 0; i < 2000; i++) {
            htmlCleaner.clean(html);
        }
        System.out.println(System.currentTimeMillis()-time);


        time =System.currentTimeMillis();
        for (int i = 0; i < 2000; i++) {

View Full Code Here

Examples of org.htmlcleaner.HtmlCleaner

  public HtmlCleanerBookProcessor() {
    this.htmlCleaner = createHtmlCleaner();
  }


  private static HtmlCleaner createHtmlCleaner() {
    HtmlCleaner result = new HtmlCleaner();
    CleanerProperties cleanerProperties = result.getProperties();
    cleanerProperties.setOmitXmlDeclaration(true);
    cleanerProperties.setOmitDoctypeDeclaration(false);
    cleanerProperties.setRecognizeUnicodeChars(true);
    cleanerProperties.setTranslateSpecialEntities(false);
    cleanerProperties.setIgnoreQuestAndExclam(true);

View Full Code Here

Examples of org.htmlcleaner.HtmlCleaner

public class HHCParser {


  public static final String DEFAULT_HTML_INPUT_ENCODING = "Windows-1251";
  
  public static List<TOCReference> parseHhc(InputStream hhcFile, Resources resources) throws IOException, ParserConfigurationException,  XPathExpressionException {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    TagNode node = htmlCleaner.clean(hhcFile);
    Document hhcDocument = new DomSerializer(props).createDOM(node);
    XPath xpath = XPathFactory.newInstance().newXPath();
    Node ulNode = (Node) xpath.evaluate("body/ul", hhcDocument
        .getDocumentElement(), XPathConstants.NODE);
    List<TOCReference> sections = processUlNode(ulNode, resources);

View Full Code Here

Examples of org.htmlcleaner.HtmlCleaner

  // Utility to minimise number of times the cleaner is created
  
  private void createHtmlCleanerIfNeeded()
  {
    if (null == cleaner) {
      cleaner = new HtmlCleaner();
      CleanerProperties props = cleaner.getProperties();
      props.setAllowHtmlInsideAttributes(true);
      props.setAllowMultiWordAttributes(true);
      props.setRecognizeUnicodeChars(true);
      props.setOmitComments(true);

View Full Code Here

Examples of org.htmlcleaner.HtmlCleaner

  }


  private void init() {
    
    // Initialize HTMLCleaner
    cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();
    props.setAllowHtmlInsideAttributes(true);
    props.setAllowMultiWordAttributes(true);
    props.setRecognizeUnicodeChars(true);
    props.setOmitComments(true);

View Full Code Here

0 1 2 3 4 5

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.