Package fr.eolya.utils.http

Source Code of fr.eolya.utils.http.HttpUtils

package fr.eolya.utils.http;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.MasonTagTypes;
import net.htmlparser.jericho.MicrosoftConditionalCommentTagTypes;
import net.htmlparser.jericho.PHPTagTypes;
import net.htmlparser.jericho.Source;

import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpVersion;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.CookieStore;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.client.protocol.ClientContext;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;

import com.sun.syndication.feed.synd.SyndEntryImpl;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.SyndFeedInput;
import com.sun.syndication.io.XmlReader;

public class HttpUtils {

  public static String urlNormalize(String url, String preferedHost) {

    String ret_url = url.trim();

    // Perform some url nomalizations described here : http://en.wikipedia.org/wiki/URL_normalization

    try {
      // Remove last "/" - NO !!!
      //if (ret_url.lastIndexOf("/") == ret_url.length()-1)
      //  ret_url = ret_url.substring(0, ret_url.length()-1);

      // Remove final "?" if unique in url -     http://www.example.com/display? -> http://www.example.com/display
      if (ret_url.lastIndexOf("?") == ret_url.length()-1)
        ret_url = ret_url.substring(0, ret_url.length()-1);

      // Fix "?&"
      int index = ret_url.indexOf("?&");
      //int l = ret_url.length()-2;
      if (index != -1) {
        if (index!=ret_url.length()-2) {
          ret_url = ret_url.substring(0, index+1) + ret_url.substring(index+2);
        }
        else {
          ret_url = ret_url.substring(0, ret_url.length()-2);
        }
      }

      // Replace "&" by "&"
      ret_url = StringEscapeUtils.unescapeHtml4(ret_url);

      // Replace " " by "%20"
      ret_url = ret_url.replace(" ", "%20");

      // Replace "'" by "%27"
      ret_url = ret_url.replace("'", "%27");

      // Replace "%5F" by "_"
      ret_url = ret_url.replace("%5f", "_");
      ret_url = ret_url.replace("%5F", "_");

      // Remove dot-segments.
      // http://www.example.com/../a/b/../c/./d.html => http://www.example.com/a/c/d.html        
      URI uri = new URI(ret_url);
      uri = uri.normalize();
      ret_url = uri.toURL().toExternalForm();

      // Remove dot-segments at the beginning of the path
      // http://www.example.com/../a/d.html => http://www.example.com/a/d.html        
      URL tempUrl = new URL(ret_url);
      String path = tempUrl.getFile();
      String pattern = "";
      while (path.startsWith("/../")) {
        path = path.substring(3);
        pattern += "/..";
      }
      if (!pattern.equals("")) {
        index = ret_url.indexOf(pattern);
        ret_url = ret_url.substring(0, index) + ret_url.substring(index + pattern.length());
      }

      // Remove default port
      if (ret_url.indexOf("http://"+uri.getHost() + ":80")!=-1) {
        ret_url = ret_url.replace("//"+uri.getHost() + ":80", "//"+uri.getHost());
      }
      if (ret_url.indexOf("https://"+uri.getHost() + ":443")!=-1) {
        ret_url = ret_url.replace("//"+uri.getHost() + ":443", "//"+uri.getHost());
      }

      // translate to prefered host (www.site.com vs site.com)
      if (preferedHost!=null && !"".equals(preferedHost)) {
        if (uri.getHost().equals("www." + preferedHost) || ("www." + uri.getHost()).equals(preferedHost)) {
          ret_url = ret_url.replace("//"+uri.getHost(), "//"+preferedHost);
        }
      }

      // Remove the fragment.
      // http://www.example.com/bar.html#section1 => http://www.example.com/bar.html
      if (ret_url.indexOf("#")!=-1)
        ret_url = ret_url.substring(0, ret_url.indexOf("#"));

      // Reorder parameters in query string
      //ret_url = urlReorderParameters (ret_url);

      return ret_url;
    }
    catch (Exception e){}

    return ret_url;
  }

  public static String urlRemoveParameters (String url, String paramsToRemove)
  {
    if (paramsToRemove==null || "".equals(paramsToRemove)) return url;

    try {
      URL u = new URL(url);
      if (u.getQuery()==null && u.getPath().indexOf(";jsessionid=")==-1) return url;
    } catch (MalformedURLException e1) {
      e1.printStackTrace();
      return null;
    }

    try {

      url = url.replace("?&", "?");

      if ("*".equals(paramsToRemove)) {
        int offset = url.lastIndexOf("?");
        if (offset!=-1) return url.substring(0, offset);
      }

      paramsToRemove = paramsToRemove.replaceAll(" ", "").replaceAll(";", ",");
      String[] aToRemove = paramsToRemove.split(",");
      String tempUrl = url;
      for (int i=0; i<aToRemove.length; i++) {
        boolean found = true;
        while (found) {
          found = false;
          String re = "[?&;]" + aToRemove[i].toLowerCase() + "[=&]";
          Pattern p = Pattern.compile(re);
          Matcher m = p.matcher(tempUrl.toLowerCase());
          if (m.find()) {
            found = true;
            int start = m.start();
            int stop = start;
            if ("jsessionid".equals(aToRemove[i].toLowerCase())) {
              stop = tempUrl.indexOf("?", start+1);
              if (stop==-1) stop = tempUrl.indexOf("&", start+1);
            }
            else{
              stop = tempUrl.indexOf("&", start+1);
            }
            if (stop==-1) {
              tempUrl = tempUrl.substring(0, start);
            }
            else {
              String ope = tempUrl.substring(start, start+1);
              if (";".equals(ope)) ope = "?";
              tempUrl = tempUrl.substring(0, start) + ope + tempUrl.substring(stop+1);
            }
          }
          re = "[?&;]" + aToRemove[i].toLowerCase() + "$";
          p = Pattern.compile(re);
          m = p.matcher(tempUrl.toLowerCase());
          if (m.find()) {
            found = true;
            int start = m.start();
            int stop = start;
            if ("jsessionid".equals(aToRemove[i].toLowerCase())) {
              stop = tempUrl.indexOf("?", start+1);
              if (stop==-1) stop = tempUrl.indexOf("&", start+1);
            }
            else{
              stop = tempUrl.indexOf("&", start+1);
            }
            if (stop==-1) {
              tempUrl = tempUrl.substring(0, start);
            }
            else {
              String ope = tempUrl.substring(start, start+1);
              if (";".equals(ope)) ope = "?";
              tempUrl = tempUrl.substring(0, start) + ope + tempUrl.substring(stop+1);
            }
          }
        }
      }
      return tempUrl;
    }
    catch (Exception e) {
      e.printStackTrace();
    }
    return "";
 

  public static Map<String, String> getAuthCookies(int authMode, String authLogin, String authPasswd, String authParam, String proxyHost, String proxyPort, String proxyExclude, String proxyUser, String proxyPassword) {

    if (authMode == 0) return null;

    Map<String, String> authCookies = null;
    String[] aAuthParam = authParam.split("\\|");

    // http://www.java-tips.org/other-api-tips/httpclient/how-to-use-http-cookies.html
    DefaultHttpClient httpclient = new DefaultHttpClient();

    HttpPost httppost = new HttpPost(aAuthParam[0]);
    //httpclient.getParams().setParameter("http.useragent", "Custom Browser");
    httpclient.getParams().setParameter(CoreProtocolPNames.PROTOCOL_VERSION, HttpVersion.HTTP_1_1);
    httpclient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY);

    CookieStore cookieStore = new BasicCookieStore();
    HttpContext localContext = new BasicHttpContext();
    localContext.setAttribute(ClientContext.COOKIE_STORE, cookieStore);

    try
    {
      List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>(2);
      for (int i = 1; i < aAuthParam.length; i++) {
        String[] aPair = aAuthParam[i].split("=");
        aPair[1] = aPair[1].replaceAll("\\$\\$auth_login\\$\\$", authLogin);
        aPair[1] = aPair[1].replaceAll("\\$\\$auth_passwd\\$\\$", authPasswd);
        nameValuePairs.add(new BasicNameValuePair(aPair[0], aPair[1]));
      }
      httppost.setEntity(new UrlEncodedFormEntity(nameValuePairs));
      httppost.setHeader("ContentType", "application/x-www-form-urlencoded");
      HttpResponse response = httpclient.execute(httppost, localContext);
      HttpEntity entity = response.getEntity();
      if (entity != null)
      {
        entity.consumeContent();
      }

      List<Cookie> cookies = httpclient.getCookieStore().getCookies();
      if (!cookies.isEmpty()) {
        authCookies = new HashMap<String, String>();
        for (Cookie c : cookies)
        {
          // TODO: What about the path, the domain ???
          authCookies.put(c.getName(), c.getValue());
        }
      }   
      httppost.abort();
    }
    catch (ClientProtocolException e)
    {
      return null;
    }
    catch (IOException e)
    {
      return null;
    }   
    return authCookies;
  }

  //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  public static boolean urlBelongSameHost(String urlReferer, String urlHref, List<String> hostAliases) {
    if (urlReferer!=null && urlBelongSameHost(urlReferer,urlHref)) return true;
    if (hostAliases!=null) {
      for (int i=0; i<hostAliases.size(); i++) {
        hostAliases.set(i,hostAliases.get(i).trim());
        if (hostAliases.get(i).indexOf("*")==-1) {
          if (urlBelongSameHost(hostAliases.get(i), urlHref)) return true;         
        } else {
          String alias = hostAliases.get(i).replace("*", "");
          if (hostAliases.get(i).indexOf("*")==0) {
            if (urlHref.endsWith(alias)) return true;
          }
          if (hostAliases.get(i).indexOf("*")==hostAliases.get(i).length()-1) {
            if (urlHref.startsWith(alias)) return true;           
          }
        }
      }
    }
    return false;
  }

  //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  public static boolean urlBelongSameHost(String urlReferer, String urlHref) {
    return areSameHosts(getUrlHost(urlReferer), getUrlHost(urlHref));
  }

  //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  private static boolean areSameHosts(String hostR, String hostH) {

    String host1 = hostR.toLowerCase().trim();
    String host2 = hostH.toLowerCase().trim();

    if (host1.startsWith("www.") && !host2.startsWith("www."))
      host2 = "www." + host2;

    if (!host1.startsWith("www.") && host2.startsWith("www."))
      host1 = "www." + host1;

    return host1.equals(host2);
  }

  //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  private static String getUrlHost(String url) {
    try {
      if (!url.startsWith("http://") && !url.startsWith("https://")) {
        url = "http://" + url;
      }
      URL u = new URL(url);
      return u.getHost();
    }
    catch (Exception e) {
      return "";
    }   
  }

  /**
   * Encode url
   *
   * @param url url to be encoded
   * @return
   */
  public static String urlEncode (String url)
  {
    try {
      URL u = new URL(url);
      String host = u.getHost();
      int indexFile = url.indexOf("/", url.indexOf(host));
      if (indexFile==-1) return url;

      String urlFile = u.getFile();
      urlFile = URLDecoder.decode(urlFile, "UTF-8");

      String protocol = u.getProtocol();
      int port = u.getPort();
      if (port!=-1 && port!=80 && "http".equals(protocol))
        host += ":" .concat(String.valueOf(port));
      if (port!=-1 && port!=443 && "https".equals(protocol))
        host += ":" .concat(String.valueOf(port));

      URI uri = new URI(u.getProtocol(), host, urlFile, null);
      String ret = uri.toASCIIString();
      ret = ret.replaceAll("%3F", "?");
      return ret;      
    }
    catch (Exception e) {
      e.printStackTrace();
    }
    return "";
  }

  public static HashMap<String, String> extractMetas(String rawPage) throws IOException {

    final HashMap<String, String> m = new HashMap<String, String>();

    HtmlCleaner cleaner = new HtmlCleaner();
    //CleanerProperties props = cleaner.getProperties();    
    //props.setXXX(...);
    TagNode node = cleaner.clean(rawPage);
    TagNode[] myNodes;

    // <meta name="..." content="..." />
    // <meta http-equiv="refresh" content=
    myNodes = node.getElementsByName("meta", true);
    for (int i=0;i<myNodes.length;i++)
    {
      String name = myNodes[i].getAttributeByName("name");
      if (name!=null)
      {
        String scheme = myNodes[i].getAttributeByName("scheme");
        if (scheme!=null)
          name += "_" + scheme;

        String content = myNodes[i].getAttributeByName("content");
        if (content!=null && !"".equals(content))
        {
          m.put("meta_" + name.toLowerCase().replaceAll("\\-", "_"), content);
        }
      }

      String equiv = myNodes[i].getAttributeByName("http-equiv");
      if (equiv!=null)
      {
        String content = myNodes[i].getAttributeByName("content");
        if (content!=null && !"".equals(content))
        {
          m.put("meta_equiv_" + equiv.toLowerCase().replaceAll("\\-", "_"), content);
        }
      }
    }

    // <link ... />
    myNodes = node.getElementsByName("link", true);
    for (int i=0;i<myNodes.length;i++)
    {
      String href = myNodes[i].getAttributeByName("href");
      String rel = myNodes[i].getAttributeByName("rel");
      if (href!=null && rel!=null && "canonical".equals(rel))
      {
        m.put("meta_link_canonical", href);
      }           
    }
    return m;
  }

  //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  public static boolean isRelativeURL(String urlHref) {
    if (urlHref.equals("")) return false;
   
    // Case 1 : urlHref starts with "http://"
    if (urlHref.startsWith("http://") || urlHref.startsWith("https://")) {
      return false;
    }
   
    // Case 2 : urlHref looks like "?..."
    if (urlHref.startsWith("?")) {
      return false;
    }

    // Case 3 : urlHref looks like "/path/file.html..."
    if (urlHref.startsWith("/")) {
      return false;
    }

    return true;
  }

  //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  public static String urlGetAbsoluteURL(String urlReferer, String urlHref) {
    try {
      if (urlHref.equals(""))
        return "";

      // Case 1 : urlHref starts with "http://"
      if (urlHref.startsWith("http://") || urlHref.startsWith("https://")) {
        return urlHref;
      }

      URL url = new URL(urlReferer);

      // Case 1.1 : urlHref starts with "//"
      if (urlHref.startsWith("//")) {
        return url.getProtocol() + ":" + urlHref;
      }

      String urlRefererHost = url.getProtocol() + "://" + url.getHost();
      if (url.getPort() != -1) {
        urlRefererHost = urlRefererHost + ":" + String.valueOf(url.getPort());
      }

      // Case 2 : urlHref looks like "?..."
      if (urlHref.startsWith("?")) {
        // find "?" in urlReferer
        /*
        if (urlReferer.indexOf("?")!=-1)
          return urlReferer.substring(0,urlReferer.indexOf("?")) + urlHref;
        else
          return urlReferer + urlHref;
         */ 
        return urlRefererHost + "/" + url.getPath() + urlHref;
      }

      // Case 3 : urlHref looks like "/path/file.html..."
      if (urlHref.startsWith("/")) {
        return urlRefererHost + urlHref;
      }

      // Case 4 : urlHref looks like "path/file.html..."
      String urlRefererPath = url.getPath();
      if ("".equals(urlRefererPath)) urlRefererPath = "/";

      //if (urlRefererPath.indexOf(".")==-1 && urlRefererPath.lastIndexOf("/") != urlRefererPath.length()-1)
      //  urlRefererPath = urlRefererPath + "/";

      int offset = urlRefererPath.lastIndexOf("/");
      /*
      if (offset <= 0) {
        urlRefererPath = "";
      } else {
        urlRefererPath = urlRefererPath.substring(0, offset);
      }
       */
      urlRefererPath = urlRefererPath.substring(0, offset);

      return urlRefererHost + urlRefererPath + "/" + urlHref;

    }
    catch (Exception e) {
      //e.printStackTrace ();
    }
    return "";
  }

  //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  private static String fixUpUrl(String url)
  {
    String ret = url;
    if ("".equals(ret))
      ret = "/";
    else
    {
      if (ret.indexOf(".")>0)
      {
        ret = ret.substring(0,ret.lastIndexOf("/")+1);
      }
      else
      {
        if (!ret.endsWith("/"))
          ret += "/";   
      }       
    }
    return ret;
  }

  //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  public static boolean isChildOf(URL urlChild, URL urlFather)
  {
    String urlChildPath = fixUpUrl(urlChild.getPath().toLowerCase());
    String urlFatherPath = fixUpUrl(urlFather.getPath().toLowerCase());

    return urlChildPath.startsWith(urlFatherPath);
  }

  //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  public static String getHtmlDeclaredLanguage(String rawData)
  {
    if (rawData==null || "".equals(rawData)) return "";

    Hashtable<String,Integer> langFreq = new Hashtable<String,Integer>();
    BufferedReader in = new BufferedReader(new StringReader(rawData));
    String line;
    try {
      while ((line = in.readLine()) != null)
      {
        line = line.toLowerCase();

        //<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="fr-fr">
        if (line.indexOf("<html")>=0 && line.toLowerCase().indexOf(" xml:lang")>=0)
        {
          String lang = parseAttributeValue(line, "xml:lang=");
          if (lang!=null && lang.length()>=2)
          {
            lang = lang.substring(0,2);

            if (langFreq.containsKey(lang))
              langFreq.put(lang, langFreq.get(lang) + 1);
            else
              langFreq.put(lang, 1);
          }
        }

        //<html lang="fr">
        if (line.indexOf("<html")>=0 && line.toLowerCase().indexOf(" lang")>=0)
        {
          String lang = parseAttributeValue(line, "lang=");
          if (lang!=null && lang.length()>=2)
          {
            lang = lang.substring(0,2);

            if (langFreq.containsKey(lang))
              langFreq.put(lang, langFreq.get(lang) + 1);
            else
              langFreq.put(lang, 1);
          }
        }

        //<meta http-equiv="content-language" content="fr-fr" />
        if (line.indexOf("<meta")>=0 && line.toLowerCase().indexOf(" http-equiv")>=0 && line.toLowerCase().indexOf("content-language")>=0 )
        { 
          String lang = parseAttributeValue(line, "content=");
          if (lang!=null && lang.length()>=2)
          {
            lang = lang.substring(0,2);

            if (langFreq.containsKey(lang))
              langFreq.put(lang, langFreq.get(lang) + 1);
            else
              langFreq.put(lang, 1);
          }
        }

        //<meta name="language" content="fr-fr" />
        if (line.indexOf("<meta")>=0 && line.toLowerCase().indexOf(" name")>=0 && line.toLowerCase().indexOf("language")>=0 && line.toLowerCase().indexOf(" content")>=0)
        { 
          String lang = parseAttributeValue(line, "content=");
          if (lang!=null && lang.length()>=2)
          {
            lang = lang.substring(0,2);

            if (langFreq.containsKey(lang))
              langFreq.put(lang, langFreq.get(lang) + 1);
            else
              langFreq.put(lang, 1);
          }
        }

        //<meta name="content-language" content="fr-fr" />
        if (line.indexOf("<meta")>=0 && line.toLowerCase().indexOf(" name")>=0 && line.toLowerCase().indexOf("content-language")>=0 && line.toLowerCase().indexOf(" content")>=0)
        {  
          String lang = parseAttributeValue(line, "content=");
          if (lang!=null && lang.length()>=2)
          {
            lang = lang.substring(0,2);

            if (langFreq.containsKey(lang))
              langFreq.put(lang, langFreq.get(lang) + 1);
            else
              langFreq.put(lang, 1);
          }
        }   
      }

      // Get the best candidate
      Vector<String> v = new Vector<String>(langFreq.keySet());
      Iterator<String> it = v.iterator();
      int max = 0;
      String lang = "";
      while (it.hasNext()) {
        String element =  (String)it.next();
        //System.out.println( element + " " + encodingFreq.get(element));
        if (langFreq.get(element)>max)
        {
          max = langFreq.get(element);
          lang = element;
        }
      }

      return lang;
    } catch (IOException e) {
      e.printStackTrace();
    }
    return "";
  }

  /**
   * Parse the character encoding from the specified content type header.
   * If the content type is null, or there is no explicit character encoding,
   * <code>null</code> is returned.
   * <br />
   * This method was copied from org.apache.catalina.util.RequestUtil,
   * which is licensed under the Apache License, Version 2.0 (the "License").
   *
   * @param contentType a content type header
   */
  public static String parseCharacterEncoding(String contentType) {

    if (contentType == null)
      return (null);

    String value = "";

    int start = contentType.indexOf("charset='");
    if (start >= 0)
    {
      value = contentType.substring(start + 9);
    }
    else
    {
      start = contentType.indexOf("charset=\"");
      if (start >= 0)
      {
        value = contentType.substring(start + 9);
      }
      else
      {
        start = contentType.indexOf("charset=");
        if (start < 0)
          return (null);
        value = contentType.substring(start + 8);
      }
    }   

    int end = value.indexOf(';');
    if (end >= 0)
      value = value.substring(0, end);

    end = value.indexOf('"');
    if (end >= 0)
      value = value.substring(0, end);

    end = value.indexOf('\'');
    if (end >= 0)
      value = value.substring(0, end);

    end = value.indexOf('/');
    if (end >= 0)
      value = value.substring(0, end);

    end = value.indexOf('>');
    if (end >= 0)
      value = value.substring(0, end);

    value = value.replaceAll("\"","");
    value = value.replaceAll("'","");

    return (value.trim());
  }

  public static String parseAttributeValue(String line, String attName) {
    if (line == null)
      return (null);
    int start = line.indexOf(attName);
    if (start < 0)
      return (null);
    String value = line.substring(start + attName.length());
    value = value.trim();

    if (value.charAt(0)=='"' || value.charAt(0)=='\'')
      value = value.substring(1);

    int end = value.indexOf(';');
    if (end >= 0)
      value = value.substring(0, end);

    end = value.indexOf('"');
    if (end >= 0)
      value = value.substring(0, end);

    end = value.indexOf('\'');
    if (end >= 0)
      value = value.substring(0, end);

    end = value.indexOf('/');
    if (end >= 0)
      value = value.substring(0, end);

    end = value.indexOf('>');
    if (end >= 0)
      value = value.substring(0, end);

    value = value.replaceAll("\"","");
    value = value.replaceAll("'","");

    return (value.trim());
 

  public static String filtreEncoding(String encoding)
  {
    encoding = encoding.toLowerCase();
    if (encoding.startsWith("utf") && !"utf-8".equals(encoding))
      return "";

    return encoding;
  }

  public static List<String> extractLinksFromFeed(String rawPage)
  {
    final ArrayList<String> list = new ArrayList<String>();

    try
    {
      XmlReader xmlReader = new XmlReader(new ByteArrayInputStream(rawPage.getBytes()));
      SyndFeedInput input = new SyndFeedInput();
      SyndFeed feed = input.build(xmlReader);

      Object[] arraySyndEntry = feed.getEntries().toArray();
      for (int k=arraySyndEntry.length-1; k>=0; k--)
      {
        SyndEntryImpl syndEntry = (SyndEntryImpl)arraySyndEntry[k];
        String link = strLinkCleanup(syndEntry.getLink());
        if (!list.contains(link))
          list.add(link);
      }
      return list;
    }
    catch (Exception e)
    {
      e.printStackTrace();
      return null;
    }
  }

  static private String strLinkCleanup(String str) {
    if (str == null) return "";

    // line start and end
    str = str.replaceAll("^[\\n\\t\\s]*", "");
    str = str.replaceAll("[\\n\\t\\s]*$", "");

    // some unicode chars
    str = str.replaceAll("\\u0091", "'");
    str = str.replaceAll("\\u0092", "'");
    str = str.replaceAll("\\u0093", "\"");
    str = str.replaceAll("\\u0094", "\"");

    return str;
  }

  public static List<String> extractAbsoluteLinks(String rawPage, String urlPage, int depth) throws IOException {

    List<String> links = extractLinks(rawPage, depth);
    String baseHref = null;

    for (int i=0; i<links.size(); i++) {
      try {
        String url = null;
        if (baseHref==null && isRelativeURL(links.get(i).trim())) baseHref = getBaseHref(rawPage);
        if (baseHref!=null && isRelativeURL(links.get(i).trim())) {
          url = urlGetAbsoluteURL(baseHref, links.get(i).trim());
        } else {
          url = urlGetAbsoluteURL(urlPage, links.get(i).trim());
        }
        links.set(i, url);
      }
      catch (Exception e) {
        e.printStackTrace();
      }
    }
    return links;
  }

  public static String getBaseHref(String rawPage) throws IOException {
    if (rawPage==null || !StringUtils.containsIgnoreCase(rawPage, "<base")) return null;

    HtmlCleaner cleaner = new HtmlCleaner();
    //CleanerProperties props = cleaner.getProperties();    
    //props.setXXX(...);
    TagNode node = cleaner.clean(rawPage);
    TagNode[] myNodes = node.getElementsByName("base", true);
    if (myNodes==null || myNodes.length==0) return null;
    String href = myNodes[0].getAttributeByName("href");
    if (href!=null) return href;
    return null;
  }
  /**
   * Extract link in html string according to depth parameter
   * if depth = 0 : extract only redirection or iframe or framset urls links
   * if depth = 1 : extract only standard urls links (<a href='..'>
   * if depth = 2 : extract all links
   *
   * @param rawPage the input html string
   * @param depth the type of links to be extracted
   * @return the extracted urls in a String List
   * @throws IOException
   */
  public static List<String> extractLinks(String rawPage, int depth) throws IOException {

    final ArrayList<String> list = new ArrayList<String>();

    HtmlCleaner cleaner = new HtmlCleaner();
    //CleanerProperties props = cleaner.getProperties();    
    //props.setXXX(...);
    TagNode node = cleaner.clean(rawPage);

    TagNode[] myNodes;

    if (depth==1 || depth==2) {
      // <a href=
      myNodes = node.getElementsByName("a", true);
      for (int i=0;i<myNodes.length;i++)
      {
        String link = myNodes[i].getAttributeByName("href");

        if (link!=null) {
          link = link.trim();

          if (link!=null && !"".equals(link))
          {
            if (isValidUrl(link))
              if (!list.contains(link))
                list.add(link);
          }
        }
      }

      // <area href=
      myNodes = node.getElementsByName("area", true);
      for (int i=0;i<myNodes.length;i++)
      {
        String link = myNodes[i].getAttributeByName("href");
        if (link!=null && !"".equals(link))
          if (isValidUrl(link))
            if (!list.contains(link))
              list.add(link);
      }
    }

    if (depth==0 || depth==2) {
      // <frame src=
      myNodes = node.getElementsByName("frame", true);
      for (int i=0;i<myNodes.length;i++)
      {
        String link = myNodes[i].getAttributeByName("src");
        if (link!=null && !"".equals(link))
          if (isValidUrl(link))
            if (!list.contains(link))
              list.add(link);
      }

      // <iframe src=
      myNodes = node.getElementsByName("iframe", true);
      for (int i=0;i<myNodes.length;i++)
      {
        String link = myNodes[i].getAttributeByName("src");
        if (link!=null && !"".equals(link))
          if (isValidUrl(link))
            if (!list.contains(link))
              list.add(link);
      }

      // <meta http-equiv="refresh" content=
      myNodes = node.getElementsByName("meta", true);
      for (int i=0;i<myNodes.length;i++)
      {
        String equiv = myNodes[i].getAttributeByName("http-equiv");
        if ((equiv!=null) && (equiv.equalsIgnoreCase("refresh")))
        {
          String link = myNodes[i].getAttributeByName("content");
          if (link!=null && !"".equals(link))
          {
            if (link.indexOf("=")>0)
            {
              link = link.substring(link.indexOf("=")+1);
              if (!list.contains(link))
                list.add(link);
            }
          }
        }
      }

      // Look for embeded flash
      // <param name="movie" value="..."
      myNodes = node.getElementsByName("param", true);
      for (int i=0;i<myNodes.length;i++)
      {
        String name = myNodes[i].getAttributeByName("name");
        if ("movie".equals(name))
        {
          String link = myNodes[i].getAttributeByName("value");
          if (!list.contains(link))
            list.add(link);
        }
      }
    }

    // <frame src= (par Jericho parser car HTML Cleaner echoue)
    MicrosoftConditionalCommentTagTypes.register();
    PHPTagTypes.register();
    PHPTagTypes.PHP_SHORT.deregister(); // remove PHP short tags for this example otherwise they override processing instructions
    MasonTagTypes.register();
    Source source=new Source(rawPage);
    source.fullSequentialParse();

    if (depth==0 || depth==2) {
      List<Element> linkElements=source.getAllElements(HTMLElementName.FRAME);
      for (Element linkElement : linkElements) {
        String link=linkElement.getAttributeValue("src");
        if (link!=null && !"".equals(link))
          if (isValidUrl(link))
            if (!list.contains(link))
              list.add(link);
      }
    }
    if (depth==1 || depth==2) {

      List<Element> linkElements=source.getAllElements(HTMLElementName.A);
      for (Element linkElement : linkElements) {
        String link=linkElement.getAttributeValue("href");
        if (link!=null && !"".equals(link))
          if (isValidUrl(link))
            if (!list.contains(link))
              list.add(link);
        /*
      if (href==null) continue;
      // A element can contain other tags so need to extract the text from it:
      String label=linkElement.getContent().getTextExtractor().toString();
      System.out.println(label+" <"+href+'>');
         */
      }
    }

    String strPattern = "location[.]href=['\"](.*)['\"]";
    Pattern pattern = Pattern.compile(strPattern);
    Matcher matcher = pattern.matcher(rawPage);
    while (matcher.find()) {
      try{
        String url = matcher.group(1);
        if (url.indexOf("'")!=-1)
          url = url.substring(0, url.indexOf("'"));
        if (url.indexOf('"')!=-1)
          url = url.substring(0, url.indexOf('"'));
        if (!list.contains(url))
          list.add(url);           
      }
      catch (Exception e){}
    }

    // Look for location.href='...'
    //    strPattern = "href=['\"](.*)['\"]";
    //    pattern = Pattern.compile(strPattern);
    //    matcher = pattern.matcher(rawPage);
    //    while (matcher.find()) {
    //      try{
    //        String url = matcher.group(1);
    //        if (url.indexOf("'")!=-1)
    //          url = url.substring(0, url.indexOf("'"));
    //        if (url.indexOf('"')!=-1)
    //          url = url.substring(0, url.indexOf('"'));
    //        if (!list.contains(url))
    //          list.add(url);           
    //      }
    //      catch (Exception e){}
    //    }

    if (depth==0 || depth==2) {
      // Look for location.replace("...")
      strPattern = "location[.]replace\\(['\"](.*)['\"]\\)";
      pattern = Pattern.compile(strPattern);
      matcher = pattern.matcher(rawPage);
      while (matcher.find()) {
        try{
          String url = matcher.group(1);
          if (url.indexOf("'")!=-1)
            url = url.substring(0, url.indexOf("'"));
          if (url.indexOf('"')!=-1)
            url = url.substring(0, url.indexOf('"'));
          if (!list.contains(url))
            list.add(url);           
        }
        catch (Exception e){}
      }

      // Look for window.location='...'
      strPattern = "window[.]location=['\"](.*)['\"]";
      pattern = Pattern.compile(strPattern);
      matcher = pattern.matcher(rawPage);
      while (matcher.find()) {
        try{
          String url = matcher.group(1);
          if (url.indexOf("'")!=-1)
            url = url.substring(0, url.indexOf("'"));
          if (url.indexOf('"')!=-1)
            url = url.substring(0, url.indexOf('"'));
          if (!list.contains(url))
            list.add(url);           
        }
        catch (Exception e){}
      }
    }

    return list;
  }

  private static boolean isValidUrl(String url) {
    String temp = url.toLowerCase();
    if (!temp.startsWith("mailto:") && !temp.startsWith("javascript:") && !temp.startsWith("#") && !temp.startsWith("\\") && !temp.startsWith("'") && !temp.startsWith("\"")) {
      if (temp.startsWith("http")) {
        try {
          if (temp.startsWith("http:/") && !temp.startsWith("http://")) return false;
          @SuppressWarnings("unused")
          URL u = new URL(temp);
          return true;
        }
        catch (Exception e){
          return false;
        }
      }
      return true;
    }
    else return false;
  }
 
    public static String urlGetFileName(String url) {
        try
        {   
            URL u = new URL(url);
            String name = u.getPath();
            if (name.lastIndexOf("/")!=-1 && name.lastIndexOf("/") < name.length())
                name = name.substring(name.lastIndexOf("/")+1);
            return name;
        }
        catch (Exception e) {}
        return "";
    }
   
    public static String urlAddBasicAuthentication(String url, String login, String password) {
    url = url.replace("http://", "http://"+login+":"+password+"@");
    url = url.replace("https://", "https://"+login+":"+password+"@");
    return url;
    }

}
TOP

Related Classes of fr.eolya.utils.http.HttpUtils

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.