package net.matuschek.spider.docfilter;
/************************************************
Copyright (c) 2001/2002 by Daniel Matuschek
*************************************************/
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.StringTokenizer;
import net.matuschek.http.HttpDoc;
import net.matuschek.util.NullWriter;
import org.w3c.dom.Node;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.w3c.dom.Document;
import org.w3c.tidy.Tidy;
/**
* Localizer tries to replace absolute links by relative links
* and should allow offline browsing.
*
* It uses JTidy to parse the file.
*
* @author Daniel Matuschek
* @version $Revision: 1.11 $
*/
public class LinkLocalizer implements DocumentFilter
{

  /** processing enabled ? */
  protected boolean enabled=true;


  /**
   * Processes a document and replaces absolute links by relative links.
   *
   * The content is parsed with JTidy, the resulting DOM is rewritten and
   * then pretty-printed back into the given document object, i.e. the
   * input object itself is modified.
   *
   * @param input the document to process, may be null
   *
   * @return null if input is null; otherwise the input object itself:
   * untouched if it is not HTML or if the filter is disabled, with its
   * content replaced by the localized HTML in all other cases
   *
   * @throws FilterException if a null node is found while walking the
   * DOM (this includes the case that the parser delivers no document)
   */
  public HttpDoc process(HttpDoc input)
    throws FilterException
  {
    if (input == null) {
      return null;
    }

    if (! input.isHTML()) {
      return input;
    }

    if (! enabled) {
      return input;
    }

    // okay, parse the HTML code
    ByteArrayInputStream bis = new ByteArrayInputStream(input.getContent());
    Tidy tidy = new Tidy();
    tidy.setUpperCaseTags(false);
    tidy.setUpperCaseAttrs(false);
    // parser diagnostics are not of interest here, discard them
    tidy.setErrout(new PrintWriter(new NullWriter()));
    Document doc = tidy.parseDOM(bis,null);

    rewriteDOM(doc,input.getURL());

    // write the modified DOM back into the document
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    tidy.pprint(doc,bos);
    input.setContent(bos.toByteArray());

    return input;
  }


  /**
   * Enable processing, the filter will parse the document and try to
   * replace absolute by relative links.
   */
  public void enable() {
    this.enabled=true;
  }


  /**
   * Disable processing, the filter will not change the document
   * content.
   */
  public void disable() {
    this.enabled=false;
  }


  /**
   * Is the link processing enabled ?
   *
   * @return true, if the filter processes links, false otherwise
   */
  public boolean isEnabled() {
    return this.enabled;
  }


  /**
   * Rewrite this DOM with relative URLs. Will process the whole DOM
   * recursively: "href" is localized for "a" and "area" elements,
   * "src" for "img" and "frame" elements.
   *
   * @param node root node of the DOM to modify
   * @param url base URL of the document itself (for relative addressing)
   *
   * @throws FilterException if node (or any node below it) is null
   */
  private void rewriteDOM(Node node, URL url)
    throws FilterException
  {
    // this should not happen !
    if (node==null) {
      throw new FilterException("Got a null node");
    }

    // ELEMENT ?
    if (node instanceof Element) {
      String name = node.getNodeName();

      if (name.equals("a")
          || name.equals("area")) {
        localizeAttrib(node,"href",url);
      } else if (name.equals("img")
                 || name.equals("frame")) {
        localizeAttrib(node,"src",url);
      }
    }

    // recursive travel through all children
    NodeList children = node.getChildNodes();
    for (int i=0; i<children.getLength(); i++) {
      rewriteDOM(children.item(i),url);
    }
  }


  /**
   * Localize a given attribute for a Element. <br />
   * Thanks to Paul Tan for the feedback
   *
   * The attribute is only rewritten if it has a non-empty value that
   * contains a "/"; all other values are left as they are.
   *
   * @param node an element node that should be localized,
   * must be an instance of Element
   * @param attribute name of the attribute that should be localized
   * @param context an URL that is the context for relative
   * addressing (base address)
   */
  private void localizeAttrib(Node node,
                              String attribute,
                              URL context)
  {
    Element el = (Element)node;
    String oldValue = el.getAttribute(attribute);

    // only localize if the attribute exists; the null test guards
    // against DOM implementations that return null instead of ""
    if ((oldValue == null) || oldValue.equals("")) {
      return;
    }

    // only localize if the file is in another directory
    if (oldValue.indexOf("/") == -1) {
      return;
    }

    el.setAttribute(attribute, localizeURL(oldValue,context));
  }


  /**
   * Localize a given URL.
   *
   * Thanks to Paul Tan and Laurent Salinas for the feedback.
   *
   * A path that ends with "/" gets an "index.html" appended, a
   * reference ("#...") of the URL is kept in the result.
   * NOTE: only the path and the reference are used, a query part
   * of the URL is not part of the result.
   *
   * @param urlStr a String containing a URL, can be relative
   * (e.g. ../index.html) or absolute ("http://myserver/")
   * @param context an URL that is the context URL for relative URLs
   *
   * @return a String containing an URL that will be relative to the given
   * context if both URLs are on the same host, otherwise it will simply
   * return urlStr (also if urlStr can't be parsed or is not a
   * "http" URL)
   */
  private String localizeURL(String urlStr, URL context) {
    URL url;
    try {
      url = new URL(context, urlStr);
    } catch (MalformedURLException e) {
      // not parseable, leave the link as it is
      return urlStr;
    }

    // only localize "http:" links
    if (! url.getProtocol().equalsIgnoreCase("http")) {
      return urlStr;
    }

    // only localize if new URL is on the same host !
    if ((context == null)
        || (! context.getHost().equalsIgnoreCase(url.getHost()))) {
      return urlStr;
    }

    String path = url.getPath();

    // Already relative
    // this should only happen if the context
    // is null
    if (path.startsWith("../")) {
      return urlStr;
    }

    // implied index.html
    // (has to be tested before the reference is appended, otherwise
    // the path would not end with "/" anymore)
    if (path.endsWith("/")) {
      path = path+"index.html";
    }

    // URL references
    String ref = url.getRef();
    if ((ref != null) && (! ref.equals(""))) {
      path = path+"#"+ref;
    }

    // use the adjusted path, not url.getPath(), otherwise the
    // index.html and the reference would be lost
    return localizePath(path,context.getPath());
  }


  /**
   * Localize a given path. Very dumb, but it works ;-)
   *
   * For every directory level of the context a ".." is put in front
   * of the path. If the context has no directory level, only a leading
   * "/" is removed from the path.
   *
   * @param path path to localize
   * @param context reference path
   * @return a path that is given relative
   *
   * Example: <br />
   * path="/images/test.gif" <br />
   * context="/test/index.html"<br />
   * result="../images/test.gif"
   */
  private String localizePath(String path, String context) {
    // number of directories in the context path; the last token
    // is the file name if the context doesn't end with "/"
    StringTokenizer st = new StringTokenizer(context,"/");
    int depth = st.countTokens();
    if (! context.endsWith("/")) {
      depth--;
    }

    StringBuffer sb = new StringBuffer();

    if (depth>0) {
      // one ".." for every directory level: "../.."
      for (int i=0; i<depth; i++) {
        if (i>0) {
          sb.append('/');
        }
        sb.append("..");
      }
      // a path without leading "/" needs a separator
      if ((path.length()>0) && (! path.startsWith("/"))) {
        sb.append('/');
      }
    } else {
      if (path.startsWith("/")) {
        // delete first character (absolute path);
        path=path.substring(1);
      }
    }

    sb.append(path);

    return sb.toString();
  }

}