Package net.matuschek.spider.docfilter

Source Code of net.matuschek.spider.docfilter.LinkLocalizer

package net.matuschek.spider.docfilter;

/************************************************
    Copyright (c) 2001/2002 by Daniel Matuschek
*************************************************/


import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.StringTokenizer;

import net.matuschek.http.HttpDoc;
import net.matuschek.util.NullWriter;

import org.w3c.dom.Node;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.w3c.dom.Document;

import org.w3c.tidy.Tidy;


/**
* Localizer tries to replace absolute links by relative links
* and should allow offline browsing.
*
* It uses JTidy to parse the file.
*
* @author Daniel Matuschek
* @version $Revision: 1.11 $
*/
public class LinkLocalizer implements DocumentFilter
{
  /** processing enabled ? */
  protected boolean enabled=true;

  /**
   * This method processes the file and will replace
   * absolute links by relative.
   *
   * @return the old document, if the ContentType is not
   * text/html, a new (localized) document otherwise.
   */
  public HttpDoc process(HttpDoc input)
    throws FilterException
  {
    if (input == null) {
      return null;
    }

    if (! input.isHTML()) {
      return input;
    }

    if (! enabled) {
      return input;
    }

    // okay, parse the HTML code
    ByteArrayInputStream bis = new ByteArrayInputStream(input.getContent());
    Tidy tidy = new Tidy();
    tidy.setUpperCaseTags(false);
    tidy.setUpperCaseAttrs(false);
    tidy.setErrout(new PrintWriter(new NullWriter()));

    Document doc = tidy.parseDOM(bis,null);

    rewriteDOM(doc,input.getURL());

    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    tidy.pprint(doc,bos);

    input.setContent(bos.toByteArray());
   
    return input;
  }


  /**
   * Enable processing, the will parse the document and try to
   * replace absolute by relative links.
   */
  public void enable() {
    this.enabled=true;
  }


  /**
   * Disable processing, the filter will not change the document
   * content.
   */
  public void disable() {
    this.enabled=false;
  }


  /**
   * Is the link processing enabled ?
   *
   * @return true, if the filter processes links, false otherwise
   */
  public boolean isEnabled() {
    return this.enabled;
  }


  /**
   * Rewrite this DOM with relative URLs. Will process the whole DOM
   *
   * @param node root node of the DOM to modify
   * @param url base URL of teh document itself (for relative addressing)
   */
  private void rewriteDOM(Node node, URL url)
    throws FilterException
  {

    // this should not happen !
    if (node==null) {
      throw new FilterException("Got a null node");
    }

    // ELEMENT ?
    if (node instanceof Element) {
      String name = node.getNodeName();
      if (name.equals("a")
    || name.equals("area")) {
  localizeAttrib(node,"href",url);

      else if (name.equals("img")
     || name.equals("frame")) {
  localizeAttrib(node,"src",url);

      }
    }

    // recursive travel through all childs
    NodeList childs = node.getChildNodes();

    for (int i=0; i<childs.getLength(); i++) {
      rewriteDOM(childs.item(i),url);
    }
   

  }


  /**
   * Localize a given attribute for a Element. <br />
   * Thanks to Paul Tan for the feedback
   *
   * @param node an element node that should be localized
   * @param attribute name of the attribute that should be localized
   * @param context an URL that is the context for relative
   * addressing (base address)
   */
  private void localizeAttrib(Node node,
            String attribute,
            URL context)
  {
    Element el = (Element)node;
    String oldValue = el.getAttribute(attribute);

    // only localize if the attribute exists
    // only localize if the file is in another directory
    if (!oldValue.equals("") && oldValue.indexOf("/")!=-1) {
      String newValue = localizeURL(oldValue,context);
      el.setAttribute(attribute, newValue);
    } // end of if ()
   
  }



  /**
   * Localize a given URL.
   *
   * Thanks to Paul Tan and Laurent Salinas for the feedback.
   *
   * @param urlStr a String containing a URL, can be relative
   * (e.g. ../index.html) or absolute ("http://myserver/")
   * @param context an URL that a the context URL for relative URLs
   *
   * @return a String containing an URL that will be relative to the given
   * context if both URLs are on the same host, otherwise it will simply
   * return urlStr
   */
  private String localizeURL(String urlStr, URL context) {
    URL url;
    try {
      url = new URL(context, urlStr);
    } catch (MalformedURLException e) {
      return urlStr;
    }

    // only localize "http:" links
    if (! url.getProtocol().equalsIgnoreCase("http")) {
      return urlStr;
    }

    // only localize if new URL is on the same host !
   
    if ((context != null)
  && (context.getHost().equalsIgnoreCase(url.getHost()))) {
      String ref = url.getRef();
      String path = url.getPath();
     
      // Already relative
      // this should only happen if the context
      // is null
      if (path.startsWith("../")) {
  return urlStr;
      }

      // URL references
      if ((ref != null) && (! ref.equals(""))) {
  path = path+"#"+ref;
      }

      // implied index.html
      if ((path.length()>0) && (path.charAt(path.length()-1)) == '/') {
  path = path+"index.html";
      }
 
      return localizePath(url.getPath(),context.getPath());
    } else {
      return urlStr;
    }
  }


  /**
   * Localize a given path. Very dumb, but it works ;-)
   *
   * @param path path to localize
   * @param context reference path
   * @return a path that is given relative
   *
   * Example: <br />
   * path="/images/test.gif" <br />
   * context="/test/index.html"<br />
   * result="../images/test.gif"
   */
  private String localizePath(String path, String context) {
    StringTokenizer st = new StringTokenizer(context,"/");
    int depth = st.countTokens();
    if (! context.endsWith("/")) {
      depth--;
    }     

    StringBuffer sb = new StringBuffer();
    if (depth>0) {
      for (int i=0; i<depth; i++) {
  sb.append("/..");
      }
      sb.deleteCharAt(0);
    } else {
      if (path.startsWith("/")) {
  // delete first character (absolute path);
  path=path.substring(1);
      }
    }
    sb.append(path);

    return sb.toString();
  }

}

TOP

Related Classes of net.matuschek.spider.docfilter.LinkLocalizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.