Source Code of net.fp.rp.search.back.extractor.util.Spider$SpiderParserCallback

/*
 * Copyright (C) 2004 Paul Browne, http://www.firstpartners.net,
 * built with the help of Fast-Soft (fastsoftdev@yahoo.com)
 *
 * released under terms of the GPL license
 * http://www.opensource.org/licenses/gpl-license.php
 *
 * This product includes software developed by the
 * Apache Software Foundation (http://www.apache.org)."
 *
 * This product includes software developed by the
 * Spring Framework Project (http://www.springframework.org)."
 *
 */
package net.fp.rp.search.back.extractor.util;


import net.fp.rp.common.exception.RpException;

import org.apache.log4j.Logger;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import java.net.MalformedURLException;
import java.net.URL;

import java.util.LinkedList;
import java.util.Locale;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;




/**
 * Class responsable for spidering the uri location and return the information
 *
 * @author Firstpartners.net
 * @version 1.1
 * Copyright @link www.firstpartners.net/red
 */
public class Spider {
    /** Logger for this class and subclasses */
    protected final Logger logger = Logger.getLogger(getClass());


    /** Uri location of the document to spider */
    private String uri;


    /** Maxim length for the description */
    private int maxLengthDesc;


    /** Base of the links inside document */
    private URL base;


    /** Title of the document */
    private String title;


    /** Author of the document */
    private String author;


    /** Description (Summary) of the document */
    private String description;


    /** Links list */
    private LinkedList links;


    /** Values list */
    private LinkedList values;


    /**
     * Creates a new Spider object for the specified location and maxim length
     * of the summary required
     *
     * @param uri Uri location to  spider
     * @param lengthSummary Maxim length of the summary required
     */
    public Spider(String uri, int lengthSummary) {
        this.uri = uri;
        this.maxLengthDesc = lengthSummary;


        this.title = "";
        this.author = "";
        this.description = "";
        this.links = new LinkedList();
        this.values = new LinkedList();
    }


    /**
     * Creates a new Spider object for the specified location 
     * with no summary length specified
     * of the summary required
     *
     * @param uri Uri location to  spider
     */
    public Spider(String uri) {
        this.uri = uri;
        this.maxLengthDesc = 0;


        this.title = "";
        this.author = "";
        this.description = "";
        this.links = new LinkedList();
        this.values = new LinkedList();
    }




    /**
     * Start to spider the data
     *
     * @throws RpException Exception in parsing the data
     */
    public void start() throws RpException {
        InputStream in = null;


        try {
            in = UtilExtract.getStream(uri);


            InputStreamReader isr = new InputStreamReader(in); // convert the stream to a reader. 
            SpiderParserCallback cb = new SpiderParserCallback(); // create a callback object
            ParserDelegator pd = new ParserDelegator(); // create the delegator
            pd.parse(isr, cb, true); // parse the stream
            isr.close(); // close the stream
        } catch (IOException e) {
        } finally {
            try {
                if (in != null) {
                    in.close();
                }
            } catch (Exception e) {
            }
        }
    }


    /**
     * repairs a sloppy href, flips backwards /, adds missing /
     *
     * @param href web site reference
     *
     * @return repaired web page reference
     */
    public String fixHref(String href) {
        String newhref = href.replace('\\', '/'); // fix sloppy web references
        int lastdot = newhref.lastIndexOf('.');
        int lastslash = newhref.lastIndexOf('/');


        if (lastslash > lastdot) {
            if (newhref.charAt(newhref.length() - 1) != '/') {
                newhref = newhref + "/"; // add on missing /
            }
        }


        return newhref;
    }


    /**
     * Add the URL-object to the links list
     *
     * @param u Url object
     */
    public void addLink(URL u) {
        links.add(u);
    }


    /**
     * Add the value-object to the values list
     *
     * @param value DOCUMENT ME!
     */
    public void addValue(String value) {
        //post processing for the values
        String s = value.replaceAll("\u00A0", "");


        if (s.length() != 0) {
            values.add(value.trim());
        }
    }


    /**
     * Get the author
     *
     * @return Author of the page
     */
    public String getAuthor() {
        return author;
    }


    /**
     * Set the page author
     *
     * @param author Page author
     */
    public void setAuthor(String author) {
        this.author = author;
    }


    /**
     * Get the document base
     *
     * @return Document base
     */
    public URL getBase() {
        return base;
    }


    /**
     * Set the document base
     *
     * @param abase Document base
     */
    public void setBase(String abase) {
        try {
            base = new URL(abase);
        } catch (MalformedURLException e) {
        }
    }


    /**
     * Get the page description
     *
     * @return Page description
     */
    public String getDescription() {
        return description;
    }


    /**
     * Set the page description
     *
     * @param description Page description
     */
    public void setDescription(String description) {
        if (description.length() > maxLengthDesc) {
            this.description = description.substring(0, maxLengthDesc);
        } else {
            this.description = description;
        }
    }


    /**
     * Get the list of the links from the document
     *
     * @return List of links from documen
     */
    public LinkedList getLinks() {
        return links;
    }


    /**
     * Set the list of links
     *
     * @param links List of the links of the document
     */
    public void setLinks(LinkedList links) {
        this.links = links;
    }


    /**
     * Get the maxim length for the description
     *
     * @return Maxim length for the description
     */
    public int getMaxLengthDesc() {
        return maxLengthDesc;
    }


    /**
     * Get the title of the document
     *
     * @return Title of the document
     */
    public String getTitle() {
        return title;
    }


    /**
     * Set the title of the document
     *
     * @param title Title of the document
     */
    public void setTitle(String title) {
        this.title = title;
    }


    /**
     * Get the uri of the document
     *
     * @return Document location
     */
    public String getUri() {
        return uri;
    }


    /**
     * Get the list of the document values
     *
     * @return List of the values from docuemnt
     */
    public LinkedList getValues() {
        return values;
    }


    /**
     * Inner class
     *
     * @author Firstpartners.net
     * @version 1.1
     */
    public class SpiderParserCallback extends HTMLEditorKit.ParserCallback {
        /** Description meta data */
        public static final String METADATA_DESCRIPTION = "description";


        /** Summary meta data */
        public static final String METADATA_SUMMARY = "summary";


        /** Author meta data */
        public static final String METADATA_AUTHOR = "author";


        /** Webmaster meta data */
        public static final String METADATA_WEBMASTER = "webmaster";


        /** contents of last text element */
        private String lastText = "";


        /** summary text */
        private StringBuffer summaryText;


        /** flag to mark the actual process of title tag */
        private boolean isInTitle = false;


        /** flag to mark the actual process of body tag */
        private boolean isInBody = false;


        /** flag to mark the actual process of script tag */
        private boolean isInScript = false;


        /** flag to mark that the summary was found it */
        private boolean foundSummary = false;


        /**
         * Creates a new instance of SpiderParserCallback
         */
        public SpiderParserCallback() {
            summaryText = new StringBuffer();
        }


        /**
         * handle HTML tags that don't have a start and end tag
         *
         * @param t HTML tag
         * @param a HTML attributes
         * @param pos Position within file
         */
        public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            if (t.equals(HTML.Tag.META)) {
                Object value = a.getAttribute(HTML.Attribute.NAME);


                if (value != null) {
                    String tempMetaName = value.toString();


                    if ((METADATA_DESCRIPTION.equals(tempMetaName)) ||
                            (METADATA_SUMMARY.equals(tempMetaName))) {
                        Object content = a.getAttribute(HTML.Attribute.CONTENT);


                        if ((content != null) &&
                                !content.toString().trim().equals("")) {
                            setDescription(content.toString());
                            foundSummary = true;
                        }
                    } else if (METADATA_AUTHOR.equals(tempMetaName) ||
                            (tempMetaName.indexOf(METADATA_WEBMASTER) != -1)) {
                        Object author = a.getAttribute(HTML.Attribute.CONTENT);


                        if ((author != null) &&
                                !author.toString().trim().equals("")) {
                            setAuthor(author.toString());
                        }
                    }
                }


                return;
            }


            if (t.equals(HTML.Tag.BASE)) {
                Object value = a.getAttribute(HTML.Attribute.HREF);


                if (value != null) {
                    setBase(fixHref(value.toString()));
                }
            }
        }


        /**
         * take care of start tags
         *
         * @param t HTML tag
         * @param a HTML attributes
         * @param pos Position within file
         */
        public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            if (t.equals(HTML.Tag.TITLE)) {
                lastText = "";
                isInTitle = true;


                return;
            }


            if (t.equals(HTML.Tag.BODY)) {
                isInBody = true;


                return;
            }


            if (t.equals(HTML.Tag.SCRIPT)) {
                isInScript = true;


                return;
            }


            if (t.equals(HTML.Tag.A)) {
                Object value = a.getAttribute(HTML.Attribute.HREF);


                if (value != null) {
                    //node.addLinks(1);
                    String href = value.toString();
                    href = fixHref(href);


                    try {
                        URL referencedURL;


                        if (href.startsWith(UtilExtract.HTTP_BASE) ||
                                href.startsWith(UtilExtract.HTTPS_BASE)) {
                            referencedURL = new URL(href);
                        } else if (getBase() != null) {
                            referencedURL = new URL(getBase(), href);
                        } else {
                            referencedURL = new URL(new URL(uri), href);
                        }


                        addLink(referencedURL);
                    } catch (MalformedURLException e) {
                    }
                }
            }
        }


        /**
         * take care of start tags
         *
         * @param t HTML tag
         * @param pos Position within file
         */
        public void handleEndTag(HTML.Tag t, int pos) {
            if (t.equals(HTML.Tag.TITLE) && isInTitle) {
                setTitle(lastText.trim());
                isInTitle = false;
            }


            if (t.equals(HTML.Tag.SCRIPT)) {
                isInScript = false;
            }


            if (t.equals(HTML.Tag.BODY)) {


                if (!foundSummary) {
                    //process all the textes into one array -> and splitt this to be stored as summary
                    foundSummary = true;
                    setDescription(summaryText.toString());
                }


                isInBody = false;
            }
        }


        /**
         * take care of text between tags, check against keyword list for
         * matches, if match found, set the node match status to true
         *
         * @param data Text between tags
         * @param pos position of text within web page
         */
        public void handleText(char[] data, int pos) {
            lastText = new String(data);
            addValue(lastText);


            if (!foundSummary) {
                if (isInBody && !isInScript) {
                    summaryText.append(lastText);


                    if (summaryText.length() > getMaxLengthDesc()) {
                        foundSummary = true;
                        setDescription(summaryText.toString());
                    }
                }
            }
        }
    }
}
Source Code of net.fp.rp.search.back.extractor.util.Spider$SpiderParserCallback

Related Classes of net.fp.rp.search.back.extractor.util.Spider$SpiderParserCallback