Source Code of org.apache.roller.weblogger.util.LinkbackExtractor$LinkbackCallback

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
*  contributor license agreements.  The ASF licenses this file to You
* under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.  For additional information regarding
* copyright in this work, please see the NOTICE file in the top level
* directory of this distribution.
*/
package org.apache.roller.weblogger.util;


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;


import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.Parser;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;


import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;


import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.FeedException;
import com.sun.syndication.io.SyndFeedInput;


/**
 * Parses HTML file for referring linkback title and excerpt.
 * 
 * @author David M Johnson
 */
public class LinkbackExtractor
{
    private static Log mLogger        = LogFactory.getFactory().getInstance(
                                              LinkbackExtractor.class);
    private boolean    mFound         = false;
    private String     mTitle         = "";
    private String     mRssLink       = null;
    private String     mExcerpt       = null;
    private String     mPermalink     = null;
    private int        mStart         = 0;
    private int        mEnd           = 0;
    private int        mMaxExcerpt    = 500;                           // characters
    private String     mRequestURL    = null;
    private String     mRequestURLWWW = null;
    private String     mRefererURL;


    //------------------------------------------------------------------------
    /**
     * Extract referring page title, excerpt, and permalink.
     * 
     * @param refererUrl
     * @param requestUrl
     */
    public LinkbackExtractor(String refererURL, String requestURL)
            throws MalformedURLException, IOException
    {
        try
        {
            extractByParsingHtml(refererURL, requestURL);
            if (mRssLink != null)
            {
                extractByParsingRss(mRssLink, requestURL);
            }
        }
        catch (Exception e)
        {
            if (mLogger.isDebugEnabled())
            {
                mLogger.debug("Extracting linkback", e);
            }
        }
    }


    //------------------------------------------------------------------------
    private void extractByParsingHtml(String refererURL, String requestURL)
            throws MalformedURLException, IOException
    {
        URL url = new URL(refererURL);
        InputStream is = url.openStream();


        mRefererURL = refererURL;


        if (requestURL.startsWith("http://www."))
        {
            mRequestURLWWW = requestURL;
            mRequestURL = "http://" + mRequestURLWWW.substring(11);
        }
        else
        {
            mRequestURL = requestURL;
            mRequestURLWWW = "http://www." + mRequestURL.substring(7);
        }


        // Trick gets Swing's HTML parser
        Parser parser = (new HTMLEditorKit() {
            public Parser getParser()
            {
                return super.getParser();
            }
        }).getParser();


        // Read HTML file into string
        StringBuffer sb = new StringBuffer();
        InputStreamReader isr = new InputStreamReader(is);
        BufferedReader br = new BufferedReader(isr);
        try
        {
            String line = null;
            while ((line = br.readLine()) != null)
            {
                sb.append(line);
            }
        }
        finally
        {
            br.close();
        }


        // Parse HTML string to find title and start and end position
        // of the referring excerpt.
        StringReader sr = new StringReader(sb.toString());
        parser.parse(sr, new LinkbackCallback(), true);


        if (mStart != 0 && mEnd != 0 && mEnd > mStart)
        {
            mExcerpt = sb.toString().substring(mStart, mEnd);
            mExcerpt = Utilities.removeHTML(mExcerpt);


            if (mExcerpt.length() > mMaxExcerpt)
            {
                mExcerpt = mExcerpt.substring(0, mMaxExcerpt) + "...";
            }
        }


        if (mTitle.startsWith(">") && mTitle.length() > 1)
        {
            mTitle = mTitle.substring(1);
        }
    }


    //------------------------------------------------------------------------
    private void extractByParsingRss(String rssLink, String requestURL)
            throws IllegalArgumentException, MalformedURLException, FeedException, IOException
    {
        SyndFeedInput feedInput = new SyndFeedInput();       
        SyndFeed feed = feedInput.build(
            new InputStreamReader(new URL(rssLink).openStream()));
        Iterator itemIter = feed.getEntries().iterator();
        String feedTitle = feed.getTitle();


        int count = 0;


        if (mLogger.isDebugEnabled())
        {
            mLogger.debug("Feed parsed, title: " + feedTitle);
        }


        while (itemIter.hasNext())
        {
            count++;
            SyndEntry item = (SyndEntry) itemIter.next();
            if (item.getDescription().getValue().indexOf(requestURL) != -1)
            {
                mFound = true;
                mPermalink = item.getLink().toString();
                if (feedTitle != null && feedTitle.trim().length() > 0)
                {
                    mTitle = feedTitle + ": " + item.getTitle();
                }
                else
                {
                    mTitle = item.getTitle();
                }
                mExcerpt = item.getDescription().getValue();
                mExcerpt = Utilities.removeHTML(mExcerpt);
                if (mExcerpt.length() > mMaxExcerpt)
                {
                    mExcerpt = mExcerpt.substring(0, mMaxExcerpt) + "...";
                }
                break;
            }
        }


        if (mLogger.isDebugEnabled())
        {
            mLogger.debug("Parsed " + count + " articles, found linkback="
                    + mFound);
        }
    }


    //------------------------------------------------------------------------
    /**
     * Returns the excerpt.
     * 
     * @return String
     */
    public String getExcerpt()
    {
        return mExcerpt;
    }


    //------------------------------------------------------------------------
    /**
     * Returns the title.
     * 
     * @return String
     */
    public String getTitle()
    {
        return mTitle;
    }


    //------------------------------------------------------------------------
    /**
     * Returns the permalink.
     * 
     * @return String
     */
    public String getPermalink()
    {
        return mPermalink;
    }


    //------------------------------------------------------------------------
    /**
     * Sets the permalink.
     * 
     * @param permalink
     *            The permalink to set
     */
    public void setPermalink(String permalink)
    {
        mPermalink = permalink;
    }


    /////////////////////////////////////////////////////////////////////////


    /**
     * Parser callback that finds title and excerpt. As we walk through the HTML
     * tags, we keep track of the most recently encountered divider tag in the
     * mStart field. Once we find the referring permalink, we set the mFound
     * flag. After that, we look for the next divider tag and save it's position
     * in the mEnd field.
     */
    private final class LinkbackCallback extends ParserCallback
    {
        // Dividers
        private Tag[] mDivTags    = { Tag.TD, Tag.DIV, Tag.SPAN,
                                          Tag.BLOCKQUOTE, Tag.P, Tag.LI,
                                          Tag.BR, Tag.HR, Tag.PRE, Tag.H1,
                                          Tag.H2, Tag.H3, Tag.H4, Tag.H5,
                                          Tag.H6 };


        private List  mList       = Arrays.asList(mDivTags);


        private Tag   mCurrentTag = null;


        /**
         * Look for divider tags and for the permalink.
         * 
         * @param tag
         *            HTML tag
         * @param atts
         *            Attributes of that tag
         * @param pos
         *            Tag's position in file
         */
        public void handleStartTag(Tag tag, MutableAttributeSet atts, int pos)
        {
            if (mList.contains(tag) && !mFound)
            {
                mStart = pos;
            }
            else if (mList.contains(tag) && mFound && mEnd == 0)
            {
                mEnd = pos;
            }
            else if (tag.equals(Tag.A))
            {
                String href = (String) atts.getAttribute(HTML.Attribute.HREF);
                if (href == null)
                    return;
                int hashPos = href.lastIndexOf('#');
                if (hashPos != -1)
                {
                    href = href.substring(0, hashPos);
                }
                if (href != null
                        && (href.equals(mRequestURL) || href
                                .equals(mRequestURLWWW)))
                {
                    mFound = true;
                }
                else
                {
                    /*
                     * if (mLogger.isDebugEnabled()) { mLogger.debug("No match:
                     * "+href); }
                     */
                }
            }
            mCurrentTag = tag;
        }


        /**
         * Needed to handle SPAN tag.
         */
        public void handleSimpleTag(Tag tag, MutableAttributeSet atts, int pos)
        {
            if (mList.contains(tag) && mFound && mEnd == 0)
            {
                mEnd = pos;
            }
            else if (tag.equals(Tag.LINK))
            {
                // Look out for RSS autodiscovery link
                String title = (String) atts.getAttribute(HTML.Attribute.TITLE);
                String type = (String) atts.getAttribute(HTML.Attribute.TYPE);
                if (title != null && type != null
                        && type.equals("application/rss+xml")
                        && title.equals("RSS"))
                {
                    mRssLink = (String) atts.getAttribute(HTML.Attribute.HREF);


                    if (mLogger.isDebugEnabled())
                    {
                        mLogger.debug("Found RSS link " + mRssLink);
                    }


                    if (mRssLink.startsWith("/") && mRssLink.length() > 1)
                    {
                        try
                        {
                            URL url = new URL(mRefererURL);
                            mRssLink = url.getProtocol() + "://"
                                    + url.getHost() + ":" + url.getPort()
                                    + mRssLink;
                        }
                        catch (MalformedURLException e)
                        {
                            mRssLink = null;
                            if (mLogger.isDebugEnabled())
                            {
                                mLogger.debug("Determining RSS URL", e);
                            }
                        }
                    }
                    else if (!mRssLink.startsWith("http"))
                    {
                        int slash = mRefererURL.lastIndexOf("/");
                        if (slash != -1)
                        {
                            mRssLink = mRefererURL.substring(0, slash) + "/"
                                    + mRssLink;
                        }
                    }
                    if (mLogger.isDebugEnabled())
                    {
                        mLogger.debug("Qualified RSS link is " + mRssLink);
                    }
                }
            }
        }


        /**
         * Stop at the very first divider tag after the permalink.
         * 
         * @param tag
         *            End tag
         * @param pos
         *            Position in HTML file
         */
        public void handleEndTag(Tag tag, int pos)
        {
            if (mList.contains(tag) && mFound && mEnd == 0)
            {
                mEnd = pos;
            }
            else if (mList.contains(tag) && !mFound)
            {
                mStart = pos;
            }
            else
            {
                mCurrentTag = null;
            }
        }


        /**
         * Get the page title
         */
        public void handleText(char[] data, int pos)
        {
            if (mCurrentTag != null && mCurrentTag.equals(Tag.TITLE))
            {
                String newText = new String(data);
                if (mTitle.length() < 50)
                {
                    mTitle += newText;
                }
            }
        }
    }
}
Source Code of org.apache.roller.weblogger.util.LinkbackExtractor$LinkbackCallback

Related Classes of org.apache.roller.weblogger.util.LinkbackExtractor$LinkbackCallback