Package spiderman.plugin.util

Source Code of spiderman.plugin.util.Util

package spiderman.plugin.util;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.eweb4j.spiderman.task.Task;
import org.eweb4j.spiderman.url.UrlRuleChecker;
import org.eweb4j.spiderman.xml.Rules;
import org.eweb4j.spiderman.xml.Target;
import org.eweb4j.util.CommonUtil;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;

public class Util {
 
  public static Target isTargetUrl(Task task) throws Exception{
    for (Target target : task.site.getTargets().getTarget()){
      Rules rules = target.getUrlRules();
      if (UrlRuleChecker.check(task.url, rules.getRule(), rules.getPolicy())){
        if (task.target == null)
          task.target = target;
        return target;
      }
    }
   
    return null;
  }
 
  public static Collection<String> findAllLinkHref(String html, String hostUrl) throws Exception{
    Collection<String> urls = new ArrayList<String>();
   
    HtmlCleaner cleaner = new HtmlCleaner();
    TagNode node = cleaner.clean(html);
    Object[] ns = node.evaluateXPath("//a[@href]");
    for (Object object : ns) {
      TagNode node2=(TagNode) object;
      String href = node2.getAttributeByName("href");
      if (href == null || href.trim().length() == 0)
        continue;
     
      if (!href.startsWith("https://") && !href.startsWith("http://")){
        StringBuilder sb = new StringBuilder("http://").append(new URL(hostUrl).getHost());
        if (!href.startsWith("/"))
          sb.append("/");
        href = sb.append(href).toString();
      }
//      href = URLCanonicalizer.getCanonicalURL(href);
      if (href == null)
        continue;
      if (href.startsWith("mailto:"))
        continue;
     
      urls.add(href);
    }
   
    return urls;
  }
 
  public static void main(String[] args){
    String html = getHtml("http://www.groupon.my/all-deals/klang");
    List<String> rs = CommonUtil.findByRegex(html, "(?<=(\"dealPermaLink\":\")).[^\"]*");
    System.out.println(rs);
  }
 
  public static String getHtml(String urlString) { 
      try
        StringBuffer html = new StringBuffer()
        URL url = new URL(urlString)
        HttpURLConnection conn = (HttpURLConnection) url.openConnection()
        InputStreamReader isr = new InputStreamReader(conn.getInputStream())
        BufferedReader br = new BufferedReader(isr)
        String temp; 
        while ((temp = br.readLine()) != null) { 
          html.append(temp).append("\n")
       
        br.close()
        isr.close()
        return html.toString()
      } catch (Exception e) { 
        e.printStackTrace()
        return null
     
  }
}
TOP

Related Classes of spiderman.plugin.util.Util

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.