Source Code of org.apache.nutch.util.URLUtil

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.apache.nutch.util;


import java.net.MalformedURLException;
import java.net.*;
import java.util.regex.Pattern;


import org.apache.nutch.util.domain.DomainSuffix;
import org.apache.nutch.util.domain.DomainSuffixes;


/** Utility class for URL analysis */
public class URLUtil {
  
  /**
   * Resolve relative URL-s and fix a few java.net.URL errors
   * in handling of URLs with embedded params and pure query
   * targets.
   * @param base base url
   * @param target target url (may be relative)
   * @return resolved absolute url.
   * @throws MalformedURLException
   */
  public static URL resolveURL(URL base, String target)
          throws MalformedURLException {
    target = target.trim();


    /* this is probably not needed anymore - see NUTCH-797.
    // handle params that are embedded into the base url - move them to target
    // so URL class constructs the new url class properly
    if (base.toString().indexOf(';') > 0)
      return fixEmbeddedParams(base, target);
    */
    
    // handle the case that there is a target that is a pure query,
    // for example
    // http://careers3.accenture.com/Careers/ASPX/Search.aspx?co=0&sk=0
    // It has urls in the page of the form href="?co=0&sk=0&pg=1", and by
    // default
    // URL constructs the base+target combo as
    // http://careers3.accenture.com/Careers/ASPX/?co=0&sk=0&pg=1, incorrectly
    // dropping the Search.aspx target
    //
    // Browsers handle these just fine, they must have an exception similar to
    // this
    if (target.startsWith("?")) {
      return fixPureQueryTargets(base, target);
    }


    return new URL(base, target);
  }


  /** Handle the case in RFC3986 section 5.4.1 example 7, and similar. */
   static URL fixPureQueryTargets(URL base, String target)
          throws MalformedURLException {
    if (!target.startsWith("?")) return new URL(base, target);


    String basePath = base.getPath();
    String baseRightMost = "";
    int baseRightMostIdx = basePath.lastIndexOf("/");
    if (baseRightMostIdx != -1) {
      baseRightMost = basePath.substring(baseRightMostIdx + 1);
    }


    if (target.startsWith("?")) target = baseRightMost + target;


    return new URL(base, target);
  }


  /**
   * Handles cases where the url param information is encoded into the base url
   * as opposed to the target.
   * <p>
   * If the taget contains params (i.e. ';xxxx') information then the target
   * params information is assumed to be correct and any base params information
   * is ignored. If the base contains params information but the tareget does
   * not, then the params information is moved to the target allowing it to be
   * correctly determined by the java.net.URL class.
   * 
   * @param base
   *          The base URL.
   * @param target
   *          The target path from the base URL.
   * 
   * @return URL A URL with the params information correctly encoded.
   * 
   * @throws MalformedURLException
   *           If the url is not a well formed URL.
   */
  private static URL fixEmbeddedParams(URL base, String target)
          throws MalformedURLException {


    // the target contains params information or the base doesn't then no
    // conversion necessary, return regular URL
    if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
      return new URL(base, target);
    }


    // get the base url and it params information
    String baseURL = base.toString();
    int startParams = baseURL.indexOf(';');
    String params = baseURL.substring(startParams);


    // if the target has a query string then put the params information after
    // any path but before the query string, otherwise just append to the path
    int startQS = target.indexOf('?');
    if (startQS >= 0) {
      target = target.substring(0, startQS) + params
              + target.substring(startQS);
    } else {
      target += params;
    }


    return new URL(base, target);
  }


  private static Pattern IP_PATTERN = Pattern.compile("(\\d{1,3}\\.){3}(\\d{1,3})");


  /** Returns the domain name of the url. The domain name of a url is
   *  the substring of the url's hostname, w/o subdomain names. As an
   *  example <br><code>
   *  getDomainName(conf, new URL(http://lucene.apache.org/))
   *  </code><br>
   *  will return <br><code> apache.org</code>
   *   */
  public static String getDomainName(URL url) {
    DomainSuffixes tlds = DomainSuffixes.getInstance();
    String host = url.getHost();
    //it seems that java returns hostnames ending with .
    if(host.endsWith("."))
      host = host.substring(0, host.length() - 1);
    if(IP_PATTERN.matcher(host).matches())
      return host;
    
    int index = 0;
    String candidate = host;
    for(;index >= 0;) {
      index = candidate.indexOf('.');
      String subCandidate = candidate.substring(index+1); 
      if(tlds.isDomainSuffix(subCandidate)) {
        return candidate; 
      }
      candidate = subCandidate;
    }
    return candidate;
  }


  /** Returns the domain name of the url. The domain name of a url is
   *  the substring of the url's hostname, w/o subdomain names. As an
   *  example <br><code>
   *  getDomainName(conf, new http://lucene.apache.org/)
   *  </code><br>
   *  will return <br><code> apache.org</code>
   * @throws MalformedURLException
   */
  public static String getDomainName(String url) throws MalformedURLException {
    return getDomainName(new URL(url));
  }


  /** Returns the top level domain name of the url. The top level domain name
   *  of a url is the substring of the url's hostname, w/o subdomain names.
   *  As an example <br><code>
   *  getTopLevelDomainName(conf, new http://lucene.apache.org/)
   *  </code><br>
   *  will return <br><code> org</code>
   * @throws MalformedURLException
   */
  public static String getTopLevelDomainName(URL url) throws MalformedURLException {
    String suffix = getDomainSuffix(url).toString();
    int idx = suffix.lastIndexOf(".");
    if (idx != -1) {
      return suffix.substring(idx + 1);
    } else {
      return suffix;
    }
  }


  /** Returns the top level domain name of the url. The top level domain name
   *  of a url is the substring of the url's hostname, w/o subdomain names.
   *  As an example <br><code>
   *  getTopLevelDomainName(conf, new http://lucene.apache.org/)
   *  </code><br>
   *  will return <br><code> org</code>
   * @throws MalformedURLException
   */
  public static String getTopLevelDomainName(String url) throws MalformedURLException {
    return getTopLevelDomainName(new URL(url));
  }


  /** Returns whether the given urls have the same domain name.
   * As an example, <br>
   * <code> isSameDomain(new URL("http://lucene.apache.org")
   * , new URL("http://people.apache.org/"))
   * <br> will return true. </code>
   *
   * @return true if the domain names are equal
   */
  public static boolean isSameDomainName(URL url1, URL url2) {
    return getDomainName(url1).equalsIgnoreCase(getDomainName(url2));
  }


  /**Returns whether the given urls have the same domain name.
  * As an example, <br>
  * <code> isSameDomain("http://lucene.apache.org"
  * ,"http://people.apache.org/")
  * <br> will return true. </code>
  * @return true if the domain names are equal
  * @throws MalformedURLException
  */
  public static boolean isSameDomainName(String url1, String url2)
    throws MalformedURLException {
    return isSameDomainName(new URL(url1), new URL(url2));
  }


  /** Returns the {@link DomainSuffix} corresponding to the
   * last public part of the hostname
   */
  public static DomainSuffix getDomainSuffix(URL url) {
    DomainSuffixes tlds = DomainSuffixes.getInstance();
    String host = url.getHost();
    if(IP_PATTERN.matcher(host).matches())
      return null;
    
    int index = 0;
    String candidate = host;
    for(;index >= 0;) {
      index = candidate.indexOf('.');
      String subCandidate = candidate.substring(index+1);
      DomainSuffix d = tlds.get(subCandidate);
      if(d != null) {
        return d; 
      }
      candidate = subCandidate;
    }
    return null;
  }


  /** Returns the {@link DomainSuffix} corresponding to the
   * last public part of the hostname
   */
  public static DomainSuffix getDomainSuffix(String url) throws MalformedURLException {
    return getDomainSuffix(new URL(url));
  }


  /** Partitions of the hostname of the url by "."  */
  public static String[] getHostSegments(URL url) {
    String host = url.getHost();
    //return whole hostname, if it is an ipv4
    //TODO : handle ipv6
    if(IP_PATTERN.matcher(host).matches())
      return new String[] {host};
    return host.split("\\.");
  }


  /** Partitions of the hostname of the url by "."
   * @throws MalformedURLException */
  public static String[] getHostSegments(String url) throws MalformedURLException {
   return getHostSegments(new URL(url));
  }


  /**
   * <p>Given two urls, a src and a destination of a redirect, it returns the 
   * representative url.<p>
   * 
   * <p>This method implements an extended version of the algorithm used by the
   * Yahoo! Slurp crawler described here:<br>
   * <a href=
   * "http://help.yahoo.com/l/nz/yahooxtra/search/webcrawler/slurp-11.html"> How
   * does the Yahoo! webcrawler handle redirects?</a> <br>
   * <br>
   * <ol>
   * <li>Choose target url if either url is malformed.</li>
   * <li>If different domains the keep the destination whether or not the 
   * redirect is temp or perm</li>
   * <ul><li>a.com -> b.com*</li></ul>
   * <li>If the redirect is permanent and the source is root, keep the source.</li>
   * <ul><li>*a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html</li></ul>
   * <li>If the redirect is permanent and the source is not root and the 
   * destination is root, keep the destination</li>
   * <ul><li>a.com/xyz/index.html -> a.com*</li></ul>
   * <li>If the redirect is permanent and neither the source nor the destination
   * is root, then keep the destination</li>
   * <ul><li>a.com/xyz/index.html -> a.com/abc/page.html*</li></ul>
   * <li>If the redirect is temporary and source is root and destination is not
   * root, then keep the source</li>
   * <ul><li>*a.com -> a.com/xyz/index.html</li></ul>
   * <li>If the redirect is temporary and source is not root and destination is
   * root, then keep the destination</li>
   * <ul><li>a.com/xyz/index.html -> a.com*</li></ul>
   * <li>If the redirect is temporary and neither the source or the destination
   * is root, then keep the shortest url.  First check for the shortest host,
   * and if both are equal then check by path.  Path is first by length then by
   * the number of / path separators.</li>
   * <ul>
   * <li>a.com/xyz/index.html -> a.com/abc/page.html*</li>
   * <li>*www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html</li>
   * </ul>
   * <li>If the redirect is temporary and both the source and the destination
   * are root, then keep the shortest sub-domain</li>
   * <ul><li>*www.a.com -> www.news.a.com</li></ul>
   * <br>
   * While not in this logic there is a further piece of representative url 
   * logic that occurs during indexing and after scoring.  During creation of 
   * the basic fields before indexing, if a url has a representative url stored
   * we check both the url and its representative url (which should never be 
   * the same) against their linkrank scores and the highest scoring one is 
   * kept as the url and the lower scoring one is held as the orig url inside 
   * of the index.
   * 
   * @param src The source url.
   * @param dst The destination url.
   * @param temp Is the redirect a temporary redirect.
   * 
   * @return String The representative url.
   */
  public static String chooseRepr(String src, String dst, boolean temp) {


    // validate both are well formed urls
    URL srcUrl;
    URL dstUrl;
    try {
      srcUrl = new URL(src);
      dstUrl = new URL(dst);
    }
    catch (MalformedURLException e) {
      return dst;
    }


    // get the source and destination domain, host, and page
    String srcDomain = URLUtil.getDomainName(srcUrl);
    String dstDomain = URLUtil.getDomainName(dstUrl);
    String srcHost = srcUrl.getHost();
    String dstHost = dstUrl.getHost();
    String srcFile = srcUrl.getFile();
    String dstFile = dstUrl.getFile();


    // are the source and destination the root path url.com/ or url.com
    boolean srcRoot = (srcFile.equals("/") || srcFile.length() == 0);
    boolean destRoot = (dstFile.equals("/") || dstFile.length() == 0);


    // 1) different domain them keep dest, temp or perm
    // a.com -> b.com*
    //    
    // 2) permanent and root, keep src
    // *a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html
    //      
    // 3) permanent and not root and dest root, keep dest
    // a.com/xyz/index.html -> a.com*
    //      
    // 4) permanent and neither root keep dest
    // a.com/xyz/index.html -> a.com/abc/page.html*
    //      
    // 5) temp and root and dest not root keep src
    // *a.com -> a.com/xyz/index.html
    //  
    // 7) temp and not root and dest root keep dest
    // a.com/xyz/index.html -> a.com*
    //  
    // 8) temp and neither root, keep shortest, if hosts equal by path else by
    // hosts. paths are first by length then by number of / separators
    // a.com/xyz/index.html -> a.com/abc/page.html*
    // *www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html
    //  
    // 9) temp and both root keep shortest sub domain
    // *www.a.com -> www.news.a.com


    // if we are dealing with a redirect from one domain to another keep the
    // destination
    if (!srcDomain.equals(dstDomain)) {
      return dst;
    }


    // if it is a permanent redirect
    if (!temp) {
      
      // if source is root return source, otherwise destination
      if (srcRoot) {
        return src;
      }
      else {
        return dst;
      }
    }
    else { // temporary redirect


      // source root and destination not root
      if (srcRoot && !destRoot) {
        return src;
      }
      else if (!srcRoot && destRoot) { // destination root and source not
        return dst;
      }
      else if (!srcRoot && !destRoot && (srcHost.equals(dstHost))) {


        // source and destination hosts are the same, check paths, host length
        int numSrcPaths = srcFile.split("/").length;
        int numDstPaths = dstFile.split("/").length;
        if (numSrcPaths != numDstPaths) {
          return (numDstPaths < numSrcPaths ? dst : src);
        }
        else {
          int srcPathLength = srcFile.length();
          int dstPathLength = dstFile.length();
          return (dstPathLength < srcPathLength ? dst : src);
        }
      }
      else {


        // different host names and both root take the shortest
        int numSrcSubs = srcHost.split("\\.").length;
        int numDstSubs = dstHost.split("\\.").length;
        return (numDstSubs < numSrcSubs ? dst : src);
      }
    }
  }


  /**
   * Returns the lowercased hostname for the url or null if the url is not well
   * formed.
   * 
   * @param url The url to check.
   * @return String The hostname for the url.
   */
  public static String getHost(String url) {
    try {
      return new URL(url).getHost().toLowerCase();
    }
    catch (MalformedURLException e) {
      return null;
    }
  }


  /**
   * Returns the page for the url.  The page consists of the protocol, host,
   * and path, but does not include the query string.  The host is lowercased
   * but the path is not.
   * 
   * @param url The url to check.
   * @return String The page for the url.
   */
  public static String getPage(String url) {
    try {
      // get the full url, and replace the query string with and empty string
      url = url.toLowerCase();
      String queryStr = new URL(url).getQuery();
      return (queryStr != null) ? url.replace("?" + queryStr, "") : url;
    }
    catch (MalformedURLException e) {
      return null;
    }
  }
  
  public static String getProtocol(String url) {
    try {
      return getProtocol(new URL(url));
    } catch (Exception e) {
      return null;
    }
  }
  
  public static String getProtocol(URL url) {
    return url.getProtocol();
  }


  public static String toASCII(String url) {
    try {
      URL u = new URL(url);
      URI p = new URI(u.getProtocol(),
        u.getUserInfo(),
        IDN.toASCII(u.getHost()),
        u.getPort(),
        u.getPath(),
        u.getQuery(),
        u.getRef());


      return p.toString();
    }
    catch (Exception e) {
      return null;
    }
  }


  public static String toUNICODE(String url) {
    try {
      URL u = new URL(url);
      StringBuilder sb = new StringBuilder();
      sb.append(u.getProtocol());
      sb.append("://");
      if (u.getUserInfo() != null) {
        sb.append(u.getUserInfo());
        sb.append('@');
      }
      sb.append(IDN.toUnicode(u.getHost()));
      if (u.getPort() != -1) {
        sb.append(':');
        sb.append(u.getPort());
      }
      sb.append(u.getFile()); // includes query
      if (u.getRef() != null) {
        sb.append('#');
        sb.append(u.getRef());
      }


      return sb.toString();
    }
    catch (Exception e) {
      return null;
    }
  }




  /** For testing */
  public static void main(String[] args){
    
    if(args.length!=1) {
      System.err.println("Usage : URLUtil <url>");
      return ;
    }
    
    String url = args[0];
    try {
      System.out.println(URLUtil.getDomainName(new URL(url)));
    }
    catch (MalformedURLException ex) {
      ex.printStackTrace();
    }
  }
}
Source Code of org.apache.nutch.util.URLUtil

Related Classes of org.apache.nutch.util.URLUtil