// $Header: /home/cvs/jakarta-jmeter/src/protocol/http/org/apache/jmeter/protocol/http/parser/RegexpHTMLParser.java,v 1.17 2004/03/24 03:23:00 sebb Exp $
/*
* Copyright 2003-2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.jmeter.protocol.http.parser;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import org.apache.jorphan.logging.LoggingManager;
import org.apache.log.Logger;
// NOTE: Also looked at using Java 1.4 regexp instead of ORO. The change was
// trivial. Performance did not improve -- at least not significantly.
// Finally decided for ORO following advise from Stefan Bodewig (message
// to jmeter-dev dated 25 Nov 2003 8:52 CET) [Jordi]
import org.apache.oro.text.regex.MatchResult;
import org.apache.oro.text.regex.Pattern;
import org.apache.oro.text.regex.PatternMatcherInput;
import org.apache.oro.text.regex.Perl5Compiler;
import org.apache.oro.text.regex.Perl5Matcher;
import org.apache.oro.text.regex.MalformedPatternException;
/**
* HtmlParser implementation using regular expressions.
* <p>
* This class will find RLs specified in the following ways (where
* <b>url</b> represents the RL being found:
* <ul>
* <li><img src=<b>url</b> ... >
* <li><script src=<b>url</b> ... >
* <li><applet code=<b>url</b> ... >
* <li><input type=image src=<b>url</b> ... >
* <li><body background=<b>url</b> ... >
* <li><table background=<b>url</b> ... >
* <li><td background=<b>url</b> ... >
* <li><tr background=<b>url</b> ... >
* <li><applet ... codebase=<b>url</b> ... >
* <li><embed src=<b>url</b> ... >
* <li><embed codebase=<b>url</b> ... >
* <li><object codebase=<b>url</b> ... >
* </ul>
*
* <p>
* This class will take into account the following construct:
* <ul>
* <li><base href=<b>url</b>>
* </ul>
*
* <p>
* But not the following:
* <ul>
* <li>< ... codebase=<b>url</b> ... >
* </ul>
*
* @author <a href="mailto:jsalvata@apache.org">Jordi Salvat i Alabart</a>
* @version $Revision: 1.17 $ updated on $Date: 2004/03/24 03:23:00 $
*/
class RegexpHTMLParser extends HTMLParser
{
/**
* Regexp fragment matching a tag attribute's value (including
* the equals sign and any spaces before it). Note it matches
* unquoted values, which to my understanding, are not conformant
* to any of the HTML specifications, but are still quite common
* in the web and all browsers seem to understand them.
*/
private static final String VALUE=
"\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\"'\\s>\\\\][^\\s>]*)(?=[\\s>]))";
// Note there's 3 capturing groups per value
/**
* Regexp fragment matching the separation between two tag attributes.
*/
private static final String SEP=
"\\s(?:[^>]*\\s)?";
/**
* Regular expression used against the HTML code to find the URIs of
* images, etc.:
*/
private static final String REGEXP=
"<(?:"
+ "!--.*?-->"
+ "|BASE"+SEP+"HREF"+VALUE
+ "|(?:IMG|SCRIPT|FRAME|IFRAME)"+SEP+"SRC"+VALUE
+ "|APPLET"+SEP+"CODE(?:BASE)?"+VALUE
+ "|(?:EMBED|OBJECT)"+SEP+"(?:SRC|CODEBASE)"+VALUE
+ "|(?:BODY|TABLE|TR|TD)"+SEP+"BACKGROUND"+VALUE
+ "|INPUT(?:"+SEP+"(?:SRC"+VALUE+"|TYPE\\s*=\\s*(?:\"image\"|'image'|image(?=[\\s>])))){2,}"
+ "|LINK(?:"+SEP+"(?:HREF"+VALUE+"|REL\\s*=\\s*(?:\"stylesheet\"|'stylesheet'|stylesheet(?=[\\s>])))){2,}"
+ ")";
// Number of capturing groups possibly containing Base HREFs:
private static final int NUM_BASE_GROUPS= 3;
/**
* Compiled regular expression.
*/
static Pattern pattern;
/**
* Thread-local matcher:
*/
private static ThreadLocal localMatcher= new ThreadLocal()
{
protected Object initialValue()
{
return new Perl5Matcher();
}
};
/**
* Thread-local input:
*/
private static ThreadLocal localInput= new ThreadLocal()
{
protected Object initialValue()
{
return new PatternMatcherInput(new char[0]);
}
};
/** Used to store the Logger (used for debug and error messages). */
transient private static Logger log;
protected boolean isReusable()
{
return true;
}
/**
* Make sure to compile the regular expression upon instantiation:
*/
protected RegexpHTMLParser() {
super();
// Define this here to ensure it's ready to report any trouble
// with the regexp:
log= LoggingManager.getLoggerForClass();
// Compile the regular expression:
try
{
Perl5Compiler c= new Perl5Compiler();
pattern=
c.compile(
REGEXP,
Perl5Compiler.CASE_INSENSITIVE_MASK
| Perl5Compiler.SINGLELINE_MASK
| Perl5Compiler.READ_ONLY_MASK);
}
catch (MalformedPatternException mpe)
{
log.error(
"Internal error compiling regular expression in ParseRegexp.");
log.error("MalformedPatternException - " + mpe);
throw new Error(mpe.toString());//JDK1.4: remove .toString()
}
}
/* (non-Javadoc)
* @see org.apache.jmeter.protocol.http.parser.HtmlParser#getEmbeddedResourceURLs(byte[], java.net.URL)
*/
public Iterator getEmbeddedResourceURLs(byte[] html, URL baseUrl, URLCollection urls)
{
Perl5Matcher matcher= (Perl5Matcher)localMatcher.get();
PatternMatcherInput input= (PatternMatcherInput)localInput.get();
// TODO: find a way to avoid the cost of creating a String here --
// probably a new PatternMatcherInput working on a byte[] would do
// better.
input.setInput(new String(html));
while (matcher.contains(input, pattern))
{
MatchResult match= matcher.getMatch();
String s;
if (log.isDebugEnabled())
log.debug("match groups " + match.groups());
// Check for a BASE HREF:
for (int g=1; g <= NUM_BASE_GROUPS && g <= match.groups(); g++)
{
s= match.group(g);
if (s != null)
{
if (log.isDebugEnabled())
{
log.debug("new baseUrl: " + s + " - " + baseUrl.toString());
}
try
{
baseUrl= new URL(baseUrl, s);
}
catch (MalformedURLException e)
{
// Doesn't even look like a URL?
// Maybe it isn't: Ignore the exception.
if (log.isDebugEnabled())
{
log.debug(
"Can't build base URL from RL "
+ s
+ " in page "
+ baseUrl,
e);
}
}
}
}
for (int g= NUM_BASE_GROUPS+1; g <= match.groups(); g++)
{
s= match.group(g);
if (log.isDebugEnabled())
{
log.debug("group " + g + " - " + match.group(g));
}
if (s != null)
{
urls.addURL(s,baseUrl);
}
}
}
return urls.iterator();
}
}