{
String url;
int bookmark;
NodeList list;
NodeList robots;
MetaTag robot;
String content;
File file;
File dir;
PrintWriter out;
// get the next URL and add it to the done pile
url = (String)mPages.remove (0);
System.out.println ("processing " + url);
mFinished.add (url);
try
{
// remember how many URLs are already queued, so anything discovered
// while parsing this page can be discarded if robots says "nofollow"
bookmark = mPages.size ();
// fetch the page and gather the list of nodes
mParser.setURL (url);
try
{
list = new NodeList ();
for (NodeIterator e = mParser.elements (); e.hasMoreNodes (); )
list.add (e.nextNode ()); // URL conversion occurs in the tags
}
catch (EncodingChangeException ece)
{
// fix bug #998195 SiteCapturer just crashed
// try again with the encoding now set correctly
// hopefully mPages, mImages, mCopied and mFinished won't be corrupted
mParser.reset ();
list = new NodeList ();
for (NodeIterator e = mParser.elements (); e.hasMoreNodes (); )
list.add (e.nextNode ());
}
// handle robots meta tag according to http://www.robotstxt.org/wc/meta-user.html
// <meta name="robots" content="index,follow" />
// <meta name="robots" content="noindex,nofollow" />
robots = list.extractAllNodesThatMatch (
new AndFilter (
new NodeClassFilter (MetaTag.class),
new HasAttributeFilter ("name", "robots")), true);
if (0 != robots.size ())
{
// only the first robots meta tag is honored
robot = (MetaTag)robots.elementAt (0);
// NOTE(review): getAttribute ("content") may return null when the meta tag
// has no content attribute, which would throw NullPointerException here --
// confirm against the parser's contract and guard if needed
content = robot.getAttribute ("content").toLowerCase ();
if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("nofollow")))
// reset mPages
// NOTE(review): BUG -- removing by ascending index while the list shrinks
// skips every other element (after remove (i) the next element shifts into
// slot i, but i is then incremented). To drop everything queued after the
// bookmark, use: while (mPages.size () > bookmark) mPages.remove (bookmark);
for (int i = bookmark; i < mPages.size (); i++)
mPages.remove (i);
if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("noindex")))