Package fr.eolya.simplepipeline.stage

Source Code of fr.eolya.simplepipeline.stage.ActoniaMeta

package fr.eolya.simplepipeline.stage;

import org.apache.commons.lang3.StringEscapeUtils;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;

import fr.eolya.simplepipeline.config.PipelineConfig;
import fr.eolya.simplepipeline.document.Doc;

/*
* Configuration snippet sample :
*
*  <stage classname="fr.eolya.simplepipeline.stage.ActoniaMeta">
*      <param name="onaction">add</param>
*      <param name="contenttype">item_contenttype</param>
*      <param name="contentcharset">item_charset</param>
*      <param name="source">content</param>
*  </stage>
*/

public class ActoniaMeta extends Stage {

    private boolean stopPipelineOnError = false;
    private String contentTypeElement = null;
    private String contentCharsetElement = null;
    private String sourceElement = null;

  /**
   * Perform initialization.
   */
    public void initialize() {
        super.initialize();
       
        stopPipelineOnError = PipelineConfig.isEnabled(props.getProperty("stoppipelineonerror"));
        contentTypeElement = props.getProperty("contenttype");
        contentCharsetElement = props.getProperty("contentcharset");
        sourceElement = props.getProperty("source");
    }
 
  @Override
  public void processDoc(Doc doc) throws Exception {

    // Check onaction
    if (!doProcess(doc)) {
      if (nextStage != null)
        nextStage.processDoc(doc)
      return;
    }

    java.util.Date startTime = new java.util.Date();
   
    if (logger!=null) logger.log("    extract meta (actonia)");

        String contentType = "";
        String contentCharset = "";
        String source = "";

        if (contentTypeElement != null && !"".equals(contentTypeElement)) {
        contentType = doc.getElementText("//" + contentTypeElement);
    }
   
    if (contentCharsetElement != null && !"".equals(contentCharsetElement)) {
        contentCharset = doc.getElementText("//" + contentCharsetElement);
    }

        if (sourceElement != null && !"".equals(sourceElement)) {
            source = doc.getElementText("//" + sourceElement);
            if (source == null)
                source = "";
        }
        String rawData = StringEscapeUtils.unescapeHtml4(source);
           
        HtmlCleaner cleaner = new HtmlCleaner();
        //CleanerProperties props = cleaner.getProperties();        
        //props.setXXX(...);
        TagNode node = cleaner.clean(rawData);
        TagNode[] myNodes;
       
        myNodes = node.getElementsByName("a", true);
        for (int i=0;i<myNodes.length;i++)
        {
            String href = myNodes[i].getAttributeByName("href");
            String anchorText = myNodes[i].getText().toString();
            doc.addElement("/job", "actonia_link", href + "|" + anchorText);
        }
        doc.addElement("/job", "actonia_link_count", String.valueOf(myNodes.length));

        myNodes = node.getElementsByName("h1", true);
        for (int i=0;i<myNodes.length;i++)
        {
            doc.addElement("/job", "actonia_h1", myNodes[i].getText().toString());
        }

        myNodes = node.getElementsByName("h2", true);
        for (int i=0;i<myNodes.length;i++)
        {
            doc.addElement("/job", "actonia_h2", myNodes[i].getText().toString());
        }

        myNodes = node.getElementsByName("h3", true);
        for (int i=0;i<myNodes.length;i++)
        {
            doc.addElement("/job", "actonia_h3", myNodes[i].getText().toString());
        }

        myNodes = node.getElementsByName("h4", true);
        for (int i=0;i<myNodes.length;i++)
        {
            doc.addElement("/job", "actonia_h4", myNodes[i].getText().toString());
        }

        java.util.Date endTime = new java.util.Date();
    processingTime += (endTime.getTime() - startTime.getTime());

    if (nextStage != null) {
      nextStage.processDoc(doc);
    }   
  }

  @Override
  public String getName() {
    // TODO Auto-generated method stub
    return null;
  }

  @Override
  public String getDescription() {
    // TODO Auto-generated method stub
    return null;
  }

}
TOP

Related Classes of fr.eolya.simplepipeline.stage.ActoniaMeta

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.