/*
* Copyright (C) 2004 Paul Browne, http://www.firstpartners.net,
* built with the help of Fast-Soft (fastsoftdev@yahoo.com)
*
* released under terms of the GPL license
* http://www.opensource.org/licenses/gpl-license.php
*
* This product includes software developed by the
* Apache Software Foundation (http://www.apache.org)."
*
* This product includes software developed by the
* Spring Framework Project (http://www.springframework.org)."
*
*/
package net.fp.rp.search.back.extractor.util;
import net.fp.rp.common.exception.RpException;
import org.apache.log4j.Logger;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.LinkedList;
import java.util.Locale;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
/**
* Class responsible for spidering the uri location and returning the extracted information
*
* @author Firstpartners.net
* @version 1.1
* Copyright @link www.firstpartners.net/red
*/
public class Spider {
    /** Logger for this class and subclasses */
    protected final Logger logger = Logger.getLogger(getClass());

    /** Uri location of the document to spider */
    private final String uri;

    /** Maximum length for the description */
    private final int maxLengthDesc;

    /** Base of the links inside document (null until a BASE tag is parsed) */
    private URL base;

    /** Title of the document */
    private String title;

    /** Author of the document */
    private String author;

    /** Description (Summary) of the document */
    private String description;

    /** Links list (holds the URL objects passed to addLink) */
    private LinkedList links;

    /** Values list (holds the trimmed text fragments passed to addValue) */
    private LinkedList values;

    /**
     * Creates a new Spider object for the specified location and maximum
     * length of the summary required.
     *
     * @param uri Uri location to spider
     * @param lengthSummary Maximum length of the summary required
     */
    public Spider(String uri, int lengthSummary) {
        this.uri = uri;
        this.maxLengthDesc = lengthSummary;
        this.title = "";
        this.author = "";
        this.description = "";
        this.links = new LinkedList();
        this.values = new LinkedList();
    }

    /**
     * Creates a new Spider object for the specified location with a summary
     * length of 0. Because the description is always truncated to the
     * maximum length, a spider created this way reports an empty description.
     *
     * @param uri Uri location to spider
     */
    public Spider(String uri) {
        this(uri, 0);
    }

    /**
     * Starts spidering: opens the stream for the uri, parses it as HTML and
     * fills title, author, description, links and values through the
     * parser callback.
     *
     * An I/O failure while reading or parsing is logged and otherwise
     * ignored (best effort); whatever was extracted up to that point is kept.
     *
     * @throws RpException Exception in parsing the data
     */
    public void start() throws RpException {
        InputStream in = null;
        InputStreamReader reader = null;

        try {
            in = UtilExtract.getStream(uri);

            // NOTE(review): the stream is decoded with the platform default
            // charset; the document's declared charset is ignored
            // (ignoreCharSet == true below).
            reader = new InputStreamReader(in);
            new ParserDelegator().parse(reader, new SpiderParserCallback(), true);
        } catch (IOException e) {
            logger.warn("Unable to spider the location " + uri, e);
        } finally {
            try {
                if (reader != null) {
                    // closing the reader also closes the wrapped stream
                    reader.close();
                } else if (in != null) {
                    in.close();
                }
            } catch (Exception e) {
                // deliberately broad: a cleanup failure must never mask
                // the outcome of the parsing itself
                logger.debug("Unable to close the stream for " + uri, e);
            }
        }
    }

    /**
     * Repairs a sloppy href: flips backward slashes and adds a missing
     * trailing slash when the path looks like a directory (its last slash
     * comes after its last dot).
     *
     * Only the part before the first '?' or '#' is inspected and extended,
     * so a query string or fragment is carried over untouched instead of
     * getting the slash appended behind it.
     *
     * @param href web site reference (must not be null)
     *
     * @return repaired web page reference
     */
    public String fixHref(String href) {
        String flipped = href.replace('\\', '/'); // fix sloppy web references

        // locate where the query string / fragment (if any) begins
        int cut = flipped.length();
        int query = flipped.indexOf('?');
        int fragment = flipped.indexOf('#');

        if (query >= 0) {
            cut = query;
        }

        if ((fragment >= 0) && (fragment < cut)) {
            cut = fragment;
        }

        String path = flipped.substring(0, cut);
        String suffix = flipped.substring(cut);

        int lastdot = path.lastIndexOf('.');
        int lastslash = path.lastIndexOf('/');

        if ((lastslash > lastdot) && !path.endsWith("/")) {
            path = path + "/"; // add on missing /
        }

        return path + suffix;
    }

    /**
     * Add the URL-object to the links list
     *
     * @param u Url object
     */
    public void addLink(URL u) {
        links.add(u);
    }

    /**
     * Add the trimmed value to the values list, unless it carries no visible
     * text. Whitespace and non-breaking spaces (U+00A0) alone do not count
     * as text; null is ignored.
     *
     * @param value text fragment to record
     */
    public void addValue(String value) {
        if (value == null) {
            return;
        }

        // String.trim() does not strip U+00A0, so map it to a plain space
        // before testing whether anything visible is left
        String visible = value.replace('\u00A0', ' ').trim();

        if (visible.length() != 0) {
            values.add(value.trim());
        }
    }

    /**
     * Get the author
     *
     * @return Author of the page
     */
    public String getAuthor() {
        return author;
    }

    /**
     * Set the page author
     *
     * @param author Page author
     */
    public void setAuthor(String author) {
        this.author = author;
    }

    /**
     * Get the document base
     *
     * @return Document base, or null if none was set
     */
    public URL getBase() {
        return base;
    }

    /**
     * Set the document base. A malformed value is logged and ignored,
     * leaving the current base unchanged.
     *
     * @param abase Document base
     */
    public void setBase(String abase) {
        try {
            base = new URL(abase);
        } catch (MalformedURLException e) {
            logger.debug("Ignoring malformed document base: " + abase, e);
        }
    }

    /**
     * Get the page description
     *
     * @return Page description
     */
    public String getDescription() {
        return description;
    }

    /**
     * Set the page description, truncated to the maximum description length
     *
     * @param description Page description
     */
    public void setDescription(String description) {
        this.description = (description.length() > maxLengthDesc)
            ? description.substring(0, maxLengthDesc)
            : description;
    }

    /**
     * Get the list of the links from the document
     *
     * @return List of links from document
     */
    public LinkedList getLinks() {
        return links;
    }

    /**
     * Set the list of links
     *
     * @param links List of the links of the document
     */
    public void setLinks(LinkedList links) {
        this.links = links;
    }

    /**
     * Get the maximum length for the description
     *
     * @return Maximum length for the description
     */
    public int getMaxLengthDesc() {
        return maxLengthDesc;
    }

    /**
     * Get the title of the document
     *
     * @return Title of the document
     */
    public String getTitle() {
        return title;
    }

    /**
     * Set the title of the document
     *
     * @param title Title of the document
     */
    public void setTitle(String title) {
        this.title = title;
    }

    /**
     * Get the uri of the document
     *
     * @return Document location
     */
    public String getUri() {
        return uri;
    }

    /**
     * Get the list of the document values
     *
     * @return List of the values from document
     */
    public LinkedList getValues() {
        return values;
    }

    /**
     * Parser callback that feeds the enclosing Spider: it records the title,
     * the author/description meta data, the document base, every anchor
     * link and every text fragment reported by the HTML parser.
     *
     * @author Firstpartners.net
     * @version 1.1
     */
    public class SpiderParserCallback extends HTMLEditorKit.ParserCallback {
        /** Description meta data */
        public static final String METADATA_DESCRIPTION = "description";

        /** Summary meta data */
        public static final String METADATA_SUMMARY = "summary";

        /** Author meta data */
        public static final String METADATA_AUTHOR = "author";

        /** Webmaster meta data */
        public static final String METADATA_WEBMASTER = "webmaster";

        /** contents of last text element */
        private String lastText = "";

        /** body text accumulated as a fallback summary */
        private StringBuffer summaryText;

        /** flag to mark that a title tag is being processed */
        private boolean isInTitle = false;

        /** flag to mark that the body tag is being processed */
        private boolean isInBody = false;

        /** flag to mark that a script tag is being processed */
        private boolean isInScript = false;

        /** flag to mark that the summary has been found */
        private boolean foundSummary = false;

        /**
         * Creates a new instance of SpiderParserCallback
         */
        public SpiderParserCallback() {
            summaryText = new StringBuffer();
        }

        /**
         * Handle HTML tags that don't have a start and end tag: META tags
         * provide the description/author, the BASE tag provides the
         * document base.
         *
         * @param t HTML tag
         * @param a HTML attributes
         * @param pos Position within file
         */
        public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            if (t.equals(HTML.Tag.META)) {
                applyMetaData(a);

                return;
            }

            if (t.equals(HTML.Tag.BASE)) {
                Object href = a.getAttribute(HTML.Attribute.HREF);

                if (href != null) {
                    setBase(fixHref(href.toString()));
                }
            }
        }

        /**
         * Take care of start tags: tracks title/body/script state and
         * records the target of every anchor.
         *
         * @param t HTML tag
         * @param a HTML attributes
         * @param pos Position within file
         */
        public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            if (t.equals(HTML.Tag.TITLE)) {
                lastText = "";
                isInTitle = true;
            } else if (t.equals(HTML.Tag.BODY)) {
                isInBody = true;
            } else if (t.equals(HTML.Tag.SCRIPT)) {
                isInScript = true;
            } else if (t.equals(HTML.Tag.A)) {
                Object href = a.getAttribute(HTML.Attribute.HREF);

                if (href != null) {
                    recordLink(fixHref(href.toString()));
                }
            }
        }

        /**
         * Take care of end tags: stores the title, leaves script mode and,
         * at the end of the body, falls back to the collected body text
         * when no summary was found so far.
         *
         * @param t HTML tag
         * @param pos Position within file
         */
        public void handleEndTag(HTML.Tag t, int pos) {
            if (t.equals(HTML.Tag.TITLE) && isInTitle) {
                setTitle(lastText.trim());
                isInTitle = false;
            }

            if (t.equals(HTML.Tag.SCRIPT)) {
                isInScript = false;
            }

            if (t.equals(HTML.Tag.BODY)) {
                if (!foundSummary) {
                    // no meta summary and not enough text earlier: use
                    // whatever body text was collected
                    foundSummary = true;
                    setDescription(summaryText.toString());
                }

                isInBody = false;
            }
        }

        /**
         * Take care of text between tags: every fragment is recorded as a
         * value, and body text outside scripts is accumulated as the
         * summary until it exceeds the maximum description length.
         *
         * @param data Text between tags
         * @param pos position of text within web page
         */
        public void handleText(char[] data, int pos) {
            lastText = new String(data);
            addValue(lastText);

            if (foundSummary || !isInBody || isInScript) {
                return;
            }

            summaryText.append(lastText);

            if (summaryText.length() > getMaxLengthDesc()) {
                foundSummary = true;
                setDescription(summaryText.toString());
            }
        }

        /**
         * Applies a META tag: description/summary set the description,
         * author/webmaster set the author. Blank content is ignored.
         *
         * @param a attributes of the META tag
         */
        private void applyMetaData(MutableAttributeSet a) {
            Object name = a.getAttribute(HTML.Attribute.NAME);

            if (name == null) {
                return;
            }

            // meta names are matched regardless of case ("Description",
            // "AUTHOR", ...); a fixed locale keeps the mapping stable
            String metaName = name.toString().toLowerCase(Locale.ENGLISH);

            boolean describes = METADATA_DESCRIPTION.equals(metaName) ||
                METADATA_SUMMARY.equals(metaName);
            boolean credits = METADATA_AUTHOR.equals(metaName) ||
                (metaName.indexOf(METADATA_WEBMASTER) != -1);

            if (!describes && !credits) {
                return;
            }

            Object content = a.getAttribute(HTML.Attribute.CONTENT);

            if ((content == null) || (content.toString().trim().length() == 0)) {
                return;
            }

            if (describes) {
                setDescription(content.toString());
                foundSummary = true;
            } else {
                setAuthor(content.toString());
            }
        }

        /**
         * Resolves an (already repaired) href and adds it to the links.
         * Absolute http/https references are used as they are; anything
         * else is resolved against the document base or, without a base,
         * against the spidered uri. Unresolvable references are logged
         * and skipped.
         *
         * @param href repaired reference taken from an anchor
         */
        private void recordLink(String href) {
            try {
                URL referencedURL;

                if (href.startsWith(UtilExtract.HTTP_BASE) ||
                        href.startsWith(UtilExtract.HTTPS_BASE)) {
                    referencedURL = new URL(href);
                } else if (getBase() != null) {
                    referencedURL = new URL(getBase(), href);
                } else {
                    referencedURL = new URL(new URL(uri), href);
                }

                addLink(referencedURL);
            } catch (MalformedURLException e) {
                logger.debug("Ignoring malformed link: " + href, e);
            }
        }
    }
}