// Source Code of net.matuschek.spider.docfilter.LinkLocalizer

package net.matuschek.spider.docfilter;


/************************************************
    Copyright (c) 2001/2002 by Daniel Matuschek
*************************************************/




import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.StringTokenizer;


import net.matuschek.http.HttpDoc;
import net.matuschek.util.NullWriter;


import org.w3c.dom.Node;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.w3c.dom.Document;


import org.w3c.tidy.Tidy;




/**
 * Localizer tries to replace absolute links by relative links
 * and should allow offline browsing. 
 *
 * It uses JTidy to parse the file.
 * 
 * @author Daniel Matuschek 
 * @version $Revision: 1.11 $
 */
public class LinkLocalizer implements DocumentFilter
{
  /** processing enabled ? */
  protected boolean enabled=true;


  /**
   * This method processes the file and will replace 
   * absolute links by relative.
   *
   * @return the old document, if the ContentType is not
   * text/html, a new (localized) document otherwise.
   */
  public HttpDoc process(HttpDoc input) 
    throws FilterException 
  {
    if (input == null) { 
      return null;
    }


    if (! input.isHTML()) {
      return input;
    }


    if (! enabled) {
      return input;
    }


    // okay, parse the HTML code
    ByteArrayInputStream bis = new ByteArrayInputStream(input.getContent());
    Tidy tidy = new Tidy();
    tidy.setUpperCaseTags(false);
    tidy.setUpperCaseAttrs(false);
    tidy.setErrout(new PrintWriter(new NullWriter()));


    Document doc = tidy.parseDOM(bis,null);


    rewriteDOM(doc,input.getURL());


    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    tidy.pprint(doc,bos);


    input.setContent(bos.toByteArray());
    
    return input;
  }




  /**
   * Enable processing, the will parse the document and try to
   * replace absolute by relative links.
   */
  public void enable() {
    this.enabled=true;
  }




  /**
   * Disable processing, the filter will not change the document
   * content.
   */
  public void disable() {
    this.enabled=false;
  }




  /**
   * Is the link processing enabled ?
   *
   * @return true, if the filter processes links, false otherwise
   */
  public boolean isEnabled() {
    return this.enabled;
  }




  /**
   * Rewrite this DOM with relative URLs. Will process the whole DOM
   *
   * @param node root node of the DOM to modify
   * @param url base URL of teh document itself (for relative addressing)
   */
  private void rewriteDOM(Node node, URL url) 
    throws FilterException
  {


    // this should not happen !
    if (node==null) {
      throw new FilterException("Got a null node");
    }


    // ELEMENT ?
    if (node instanceof Element) {
      String name = node.getNodeName();
      if (name.equals("a") 
    || name.equals("area")) {
  localizeAttrib(node,"href",url);


      }  else if (name.equals("img") 
     || name.equals("frame")) {
  localizeAttrib(node,"src",url);


      }
    }


    // recursive travel through all childs
    NodeList childs = node.getChildNodes();


    for (int i=0; i<childs.getLength(); i++) {
      rewriteDOM(childs.item(i),url);
    }
    


  }




  /**
   * Localize a given attribute for a Element. <br />
   * Thanks to Paul Tan for the feedback
   *
   * @param node an element node that should be localized
   * @param attribute name of the attribute that should be localized
   * @param context an URL that is the context for relative 
   * addressing (base address)
   */
  private void localizeAttrib(Node node,
            String attribute,
            URL context) 
  {
    Element el = (Element)node;
    String oldValue = el.getAttribute(attribute);


    // only localize if the attribute exists
    // only localize if the file is in another directory
    if (!oldValue.equals("") && oldValue.indexOf("/")!=-1) {
      String newValue = localizeURL(oldValue,context);
      el.setAttribute(attribute, newValue);
    } // end of if ()
    
  }






  /**
   * Localize a given URL.
   *
   * Thanks to Paul Tan and Laurent Salinas for the feedback.
   *
   * @param urlStr a String containing a URL, can be relative 
   * (e.g. ../index.html) or absolute ("http://myserver/")
   * @param context an URL that a the context URL for relative URLs
   *
   * @return a String containing an URL that will be relative to the given
   * context if both URLs are on the same host, otherwise it will simply
   * return urlStr
   */
  private String localizeURL(String urlStr, URL context) {
    URL url;
    try {
      url = new URL(context, urlStr);
    } catch (MalformedURLException e) {
      return urlStr;
    }


    // only localize "http:" links
    if (! url.getProtocol().equalsIgnoreCase("http")) {
      return urlStr;
    }


    // only localize if new URL is on the same host !
    
    if ((context != null) 
  && (context.getHost().equalsIgnoreCase(url.getHost()))) {
      String ref = url.getRef();
      String path = url.getPath();
      
      // Already relative
      // this should only happen if the context
      // is null
      if (path.startsWith("../")) {
  return urlStr;
      }


      // URL references
      if ((ref != null) && (! ref.equals(""))) {
  path = path+"#"+ref;
      }


      // implied index.html
      if ((path.length()>0) && (path.charAt(path.length()-1)) == '/') {
  path = path+"index.html";
      }
  
      return localizePath(url.getPath(),context.getPath());
    } else {
      return urlStr;
    }
  }




  /** 
   * Localize a given path. Very dumb, but it works ;-)
   *
   * @param path path to localize
   * @param context reference path
   * @return a path that is given relative
   * 
   * Example: <br />
   * path="/images/test.gif" <br />
   * context="/test/index.html"<br />
   * result="../images/test.gif"
   */
  private String localizePath(String path, String context) {
    StringTokenizer st = new StringTokenizer(context,"/");
    int depth = st.countTokens();
    if (! context.endsWith("/")) {
      depth--;
    }      


    StringBuffer sb = new StringBuffer();
    if (depth>0) {
      for (int i=0; i<depth; i++) {
  sb.append("/..");
      }
      sb.deleteCharAt(0);
    } else {
      if (path.startsWith("/")) {
  // delete first character (absolute path);
  path=path.substring(1);
      }
    }
    sb.append(path);


    return sb.toString();
  }


}
// Source Code of net.matuschek.spider.docfilter.LinkLocalizer
//
// Related Classes of net.matuschek.spider.docfilter.LinkLocalizer