Source Code of appl.Portal.Utils.LinkSearch.SearchEngineResultSet$SearchEngineResultItem

/*
 *  This software and supporting documentation were developed by
 *
 *    Siemens Corporate Technology
 *    Competence Center Knowledge Management and Business Transformation
 *    D-81730 Munich, Germany
 *
 *    Authors (representing a really great team ;-) )
 *            Stefan B. Augustin, Thorbj�rn Hansen, Manfred Langen
 *
 *  This software is Open Source under GNU General Public License (GPL). 
 *  Read the text of this license in LICENSE.TXT 
 *  or look at www.opensource.org/licenses/
 *
 *  Once more we emphasize, that:
 *  THIS SOFTWARE IS MADE AVAILABLE,  AS IS,  WITHOUT ANY WARRANTY
 *  REGARDING  THE  SOFTWARE,  ITS  PERFORMANCE OR
 *  FITNESS FOR ANY PARTICULAR USE, FREEDOM FROM ANY COMPUTER DISEASES OR
 *  ITS CONFORMITY TO ANY SPECIFICATION. THE ENTIRE RISK AS TO QUALITY AND
 *  PERFORMANCE OF THE SOFTWARE IS WITH THE USER.
 *
 */




// ************ package ****************************************************
package appl.Portal.Utils.LinkSearch;


// ************ imports ******************************************************


import java.io.*;
import java.util.*;
import java.net.*;
import sun.net.www.http.HttpClient;


import appl.Portal.Utils.LinkSearch.ResultItem;
import appl.Portal.Utils.LinkSearch.ResultSet;
//import appl.Portal.Utils.XML.XMLwithDOMBuilder;
//import appl.Portal.Utils.XML.HTMLwithXSLBuilder;
//import appl.Portal.Utils.LinkSearch.SearchEngineResultSet;
import appl.Portal.Utils.LinkSearch.Item;
import appl.Portal.Utils.LinkSearch.HtmlParser;


import KFM.HTML.HtmlLoader2;
import KFM.log.*;


/**
* This class is the superclass of all search-engine-specific subclasses (e. g. AltaVista)
* and delivers a ResultSet.
*
* The abstract method buildSearchUrl() is declared here; it has to be
* implemented by every subclass in its own way.
*
* See the interface "ResultSet.java" for the full documentation of the methods
* defined in the interface implemented by this class.
*/
public abstract class SearchEngineResultSet implements ResultSet
{


    // ************************************************************
    // Constants
    // ************************************************************


    // ************************************************************
    // Variables
    // ************************************************************


    /** The proxy host to be used. Defaults to "proxy" unless another value is
     *  passed to the two-argument constructor.
     *  We need it, as all Internet content can only be accessed via a proxy.
     *  startSearch() writes it to the "proxyHost" system property when it is
     *  called with proxyFlag == true.
     *
     * @see #mProxyPort
     */
    private String mProxyHost = "proxy";


    /** The proxy port to be used. Defaults to "80" unless another value is
     *  passed to the two-argument constructor.
     *  We need it, as all Internet content can only be accessed via a proxy.
     *  startSearch() writes it to the "proxyPort" system property when it is
     *  called with proxyFlag == true.
     *
     * @see #mProxyHost
     */
    private String mProxyPort = "80";


    /** Search engines usually have the ability to split up their results
     *  into ranges (1-10, 11-20, 21-30, ...).
     *
     *  mRangeDelta: how many results are to be retrieved from the search engine
     *  at one time; startSearch() advances the range start by this amount
     *  for every page it requests.
     */
    private int mRangeDelta = 10;


    /** Indicates the maximum number of items which shall be retrieved from a search engine.
    * startSearch() uses it as the upper bound of its paging loop, so with the
    * default of 0 nothing is fetched until setMaxAmount() has been called.
    * Note: Yahoo never delivers more than 200.
    */
    private int mMaxAmount = 0;


    /** Date of this result set; read and written via getDate()/setDate(). Null until set. */
    private Date mDateOfResultSet;


    /** Stores the name of the search engine. */
    protected String mOrigin;


    /** Stores the search term; startSearch() sets it to the first search word. */
    protected String mSearchTerm;


    /** The search Url without the range part; startSearch() appends the result of
     *  buildIndexPart() to it for every page it loads.
     *  NOTE(review): presumably filled by the subclass inside buildSearchUrl() - confirm.
     */
    protected String mSearchUrlString;


    /** Some links found in the searched webpage are relative to the searchUrl.
     *  SearchEngineResultItem.setURL() prepends this base to links starting with "/"
     *  that cannot be parsed as a Url on their own.
     *  The field is static, so a single value is shared by all result sets.
     */
    protected static String mAbsoluteBase;


    /** Some links found in the searched webpage are relative to the searchUrl.
     *  SearchEngineResultItem.setURL() prepends this base to links NOT starting with "/"
     *  that cannot be parsed as a Url on their own.
     *  The field is static, so a single value is shared by all result sets.
     */
    protected static String mRelativeBase;


    /** Stores the content of the result page fetched by startSearch(). */
    private String mContent = null;


    /** Fetches the result-html-page of a searchengine query.
    *
    * For detailed information see the docu at
    * file:///O|/KFM/www-docs/protected/developer/Tutorial/UtilityKlassen.htm#HTML_Loader
    *
    */
    private HtmlLoader2 mHtmlLoader = new HtmlLoader2();


    /** Keeps track of the current index into the items held by mHtmlParser:
    *  nextItem() reads at this index and increments it, previousItem() decrements it.
    */
    private int mCnt = 0;


    /** Stores the regular expression that frames the result list of a page.
    *
    * Input: content of a result-html-page.
    *
    * This string stores a regular expression which matches the dynamic content
    * of a result-html-page of a searchengine query. startSearch() hands it to
    * the HtmlParser via setRegExpFrame().
    * Usage:
    * Normally, a 'RegExpFrame' pattern would look like this:
    *
    *        </font><dl><dt><b>[0-9]+. </b>(.*?)</dl><font face=Arial size=-1>
    *
    * Now, please take a look at an example of a string which shall be matched:
    *
    *        </font><dl><dt><b>1. </b>Here come the resultitems e. g. from 1 to 10</dl><font face=Arial size=-1>
    *
    * The regular expression will match the string.
    *
    * The whole match is always called group(0). group(0) is of type string
    * and contains everything: the matching conditions
    * ("</font><dl><dt><b>1. </b>" and "</dl><font face=Arial size=-1>")
    * and the characters matched by a '(.*?)'.
    *
    * The parenthesized part '(.*?)' delivers a group with a number bigger than 0,
    * also of type string, e. g. group(1); it contains the matched string
    * without(!) the matching conditions.
    *
    * In the example above: "Here come the resultitems e. g. from 1 to 10".
    *
    * group(0) is passed on to RegExpItemSet.
    *
    * For information about OROMatcher and regular expressions see the doc at
    * $/KFM/www-docs/protected/developer/appl/Portal/MetaSearch/SearchEngineWrapper.html
    */
    protected String mRegExpFrame;


    /** Stores the regular expression that cuts a single item out of the result list.
    *
    * Input: group(0) from 'RegExpFrame'
    *
    * This string stores a regular expression which matches one item of the dynamic content
    * of a searchengine query. startSearch() hands it to the HtmlParser via setRegExpItemSet().
    * Usage:
    * Normally, a 'RegExpItemSet' pattern would look like this:
    *
    *     <b>[0-9]+. </b>(.*?)<font color="#808080">(.*?)</font><br>
    *
    * Now, please take a look at an example of a string which shall be matched:
    *
    *    <b>1. </b><a href="http://www.vetmed.uni-muenchen.de/">
    *    <font color="#808080">www.vetmed.uni-muenchen.de/</font><br>
    *
    *
    * The regular expression will match the string.
    *
    * The whole match is always called group(0). group(0) is of type string
    * and contains everything: the matching conditions
    * ("<b>1. </b>" and "<font color="#808080">" and "</font><br>")
    * and the characters matched by a '(.*?)'.
    *
    * The parenthesized part '(.*?)' delivers a group with a number bigger than 0,
    * also of type string, e. g. group(1); it contains the matched string
    * without(!) the matching conditions.
    *
    * In the example above: "<a href="http://www.vetmed.uni-muenchen.de/">" and
    *                           "www.vetmed.uni-muenchen.de/".
    *
    *
    * group(0) is passed on to RegExpItem.
    *
    *
    * For information about OROMatcher and regular expressions see the doc at
    * $/KFM/www-docs/protected/developer/appl/Portal/MetaSearch/SearchEngineWrapper.html
    */
    protected String mRegExpItemSet;


    /** Stores the regular expression that splits one item into its attributes.
    *
    * Input: group(0) from 'RegExpItemSet'.
    *
    * This string stores a regular expression which matches the attributes of one item
    * of the dynamic content of a searchengine query. startSearch() hands it to the
    * HtmlParser via setRegExpItem().
    *
    * Usage:
    * Normally, a 'RegExpItem' pattern would look like this:
    *
    *    <b>[0-9]+\. </b><a href="(.+?)">\s*<b>(.+?)</b></a>\s*</dt>\s*<dd>(.+?)</dd>\s*<br><b>URL:</b>\s*
    *    <font color="#808080">(.+?)</font>\s*<br><font color="#808080">(.+?)</font><br>\s*
    *
    * Now, please take a look at an example of a string which shall be matched:
    *
    *    <b>2. </b><a href="http://www.lodging-germany.com/munchen/hotels.htm">
    *    <b>Hotels Germany. Hôtels Allemagne. München. Hotel Munchen, hôtels Munich.</b></a></dt>
    *    <dd>Munchen, Münich a large selection of hotels in the inner city in all price categories....</dd>
    *    <br><b>URL:</b> <font color="#808080">www.lodging-germany.com/munchen/hotels.htm</font>
    *    <br><font color="#808080">Last modified on: 17-Feb-2000 - 9K bytes - in English</font><br>
    *
    *
    * The regular expression will match the string.
    *
    * The whole match is always called group(0). group(0) is of type string
    * and contains everything: the matching conditions
    * ("<b>2. </b>" and "<font color="#808080">" and "</font><br>" etc.)
    * and the characters matched by the parenthesized parts.
    *
    * Each parenthesized part '(.+?)' delivers a group with a number bigger than 0,
    * also of type string, e. g. group(1); it contains the matched string
    * without(!) the matching conditions.
    *
    * In the example above:
    *
    * group(0) = contains the whole example string
    * group(1) = "Url"              = "http://www.lodging-germany.com/munchen/hotels.htm"
    * group(2) = "Description"      = "Hotels Germany. Hôtels Allemagne. München. Hotel Munchen, hôtels Munich."
    * group(3) = "Summary"          = "Munchen, Münich a large selection of hotels in the inner city in all price categories...."
    * group(4) = "HighlightedUrl"   = "www.lodging-germany.com/munchen/hotels.htm"
    * group(5) = other details      = "Last modified on: 17-Feb-2000 - 9K bytes - in English"
    *
    * For information about OROMatcher and regular expressions see the doc at
    * $/KFM/www-docs/protected/developer/appl/Portal/MetaSearch/SearchEngineWrapper.html
    */
    protected String mRegExpItem;


    /** Names under which the sub-groups matched by mRegExpItem are stored in the
    * Hashtable of an item; the array index of mNames[] selects the group.
    * startSearch() hands the array to the HtmlParser via setNames().
    *
    * For information on groups and Hashtable referencing see the doc at
    * $/KFM/www-docs/protected/developer/appl/Portal/MetaSearch/SearchEngineWrapper.html
    *
    *
    * As "mRegExpItem" describes one item of a resultset and delivers its item attributes
    * as subgroups (e. g. group(1) - group(5), see the example directly above), we could store these
    * attributes in a string array or a Vector.
    * But for better readability of the code, we store the attributes in a
    * Hashtable, referencing the items of the Hashtable with the names given in mNames;
    * example:
    * mNames may contain { "Url", "Description" }
    * we have the following subgroups: group(1) and group(2)
    * So, after instantiating a Hashtable we can
    * reference group(1) with "Url"
    * reference group(2) with "Description"
    *
    * nextItem() and previousItem() look up the keys "Url", "Description", "Title",
    * "Author", "DocLanguage", "DocSize", "ModifiedDate" and "Score".
    */
    protected String[] mNames;


    private static HtmlParser mHtmlParser;


    // ************************************************************
    // Methods
    // ************************************************************


    public SearchEngineResultSet ()
    {
    }


    public SearchEngineResultSet (String aProxyHost, String aProxyPort)
    {
        mProxyHost = aProxyHost;
        mProxyPort = aProxyPort;
    }


    /** Concatenates the different parts of the Url:
     * the searchengine's address + specific parameters (language, etc).
     *
     * startSearch() calls this method first and afterwards reads the Url via
     * getSearchUrlString().
     * NOTE(review): implementations are presumably expected to store the Url
     * with setSearchUrlString() - confirm against the subclasses.
     *
     * @param someSearchWords  The search words that were passed to startSearch().
     * @return true if the Url could be built; on false startSearch() loads nothing
     *         and only logs that the Url could not be built.
     */
    abstract boolean buildSearchUrl(String[] someSearchWords);


    /** Forms the index part of the Url for the given range start.
     *
     * Has to be search engine specific because of the different
     * types of parameters needed here.
     * startSearch() appends the returned string to the search Url; it calls this
     * method with 1 for the first page and increases the value by the range delta
     * for every further page.
     *
     *@param aRangeStart  Start of range.
     *@return The Url fragment that selects the range starting at aRangeStart.
     */
    abstract String buildIndexPart(int aRangeStart);


    public boolean hasMoreItems() { return mCnt < mHtmlParser.getNumberOfItems(); }


    /** Returns previous item.*/
    public ResultItem previousItem ()
    {
        if(mCnt > 0) {
            // get Item at index from HtmlParser
            Hashtable tHash  = mHtmlParser.getItem(mCnt);
            // decrement index
            --mCnt;
            SearchEngineResultItem tItem = new SearchEngineResultItem();
            // convert item stored in a HashTable to a "true" ResultItem


            try {// a little bit strange, but i think it has to be this way
                if((String) tHash.get("Url") != null) {
                   tItem.setURL((String) tHash.get("Url"));
                }
            } catch(MalformedURLException e) {
                KFMSystem.log.error("SearchEngineResultSet:: previousItem: Got a MalformedURLException", e);
            }
            // set description, wich gets "Summary" in the DOM Document
            if((String) tHash.get("Description") != null)
                tItem.setDescription((String) tHash.get("Description"));


            if((String) tHash.get("Title") != null)
                tItem.setTitle((String) tHash.get("Title"));


            if((String) tHash.get("Author") != null)
                tItem.setAuthor((String) tHash.get("Author"));


            if((String) tHash.get("DocLanguage") != null)
                tItem.setLanguage((String) tHash.get("DocLanguage"));


            if((String) tHash.get("DocSize") != null)
                tItem.setSize((Long) tHash.get("DocSize"));


            if((String) tHash.get("ModifiedDate") != null)
                tItem.setLastModified((Date) tHash.get("ModifiedDate"));


            if((String) tHash.get("Score") != null)
                tItem.setHitScore((Double) tHash.get("Score"));


            return tItem;
        } else {
            return null;
        }
    }


    public ResultItem nextItem()
    {
        if(mCnt < mHtmlParser.getNumberOfItems()) {
            // get Item at index from HtmlParser
            Hashtable tHash  = mHtmlParser.getItem(mCnt);
            // increment counter
            ++mCnt;
            SearchEngineResultItem tItem = new SearchEngineResultItem();
            // convert item stored in a HashTable to a "true" ResultItem
            try {// a little bit strange, but i think it has to be this way
                if((String) tHash.get("Url") != null)
                    tItem.setURL((String) tHash.get("Url"));
            } catch(MalformedURLException e) {
                KFMSystem.log.error("SearchEngineResultSet:: nextItem: Got a MalformedURLException", e);
            }


            // set description, wich will be "Summary" in the DOM Document
            if((String) tHash.get("Description") != null)
                tItem.setDescription((String) tHash.get("Description"));


            if((String) tHash.get("Title") != null)
                tItem.setTitle((String) tHash.get("Title"));


            if((String) tHash.get("Author") != null)
                tItem.setAuthor((String) tHash.get("Author"));


            if((String) tHash.get("DocLanguage") != null)
                tItem.setLanguage((String) tHash.get("DocLanguage"));


            if((String) tHash.get("DocSize") != null)
                tItem.setSize((Long) tHash.get("DocSize"));


            if((String) tHash.get("ModifiedDate") != null)
                tItem.setLastModified((Date) tHash.get("ModifiedDate"));


            if((String) tHash.get("Score") != null)
                tItem.setHitScore((Double) tHash.get("Score"));


            return tItem;
        } else {
            return null;
        }
    }


    // store Origin of ResultSet e.g. AltaVista
    public String getOrigin () {
        if(mOrigin != null) { return mOrigin; } else { return null; }
    }


    // stores the amount of items to bde retrieved
    public void setMaxAmount (int aAmount)    { mMaxAmount = aAmount; }


    public void setRangeDelta(int rangeDelta) { mRangeDelta = rangeDelta; }


    // return Origin of ResultSet e.g. AltaVista
    public void setOrigin(String aOrigin)     { mOrigin = aOrigin; }


    // return date of document
    public Date getDate() {
        if(mDateOfResultSet != null) { return mDateOfResultSet; } else { return null; }
    }


    public void setDate(Date aDateOfResultSet)              { mDateOfResultSet = aDateOfResultSet; }
    public String getSearchTerm()                           { return mSearchTerm; }
    public void setSearchTerm(String aSearchTerm)           { mSearchTerm = aSearchTerm; }
    public String getSearchUrlString()                      { return mSearchUrlString; }
    public void setSearchUrlString(String aSearchUrlString) { mSearchUrlString = aSearchUrlString; }


    public void startSearch (String[] someSearchWords, boolean proxyFlag)
    {
        // store searchTerm
        this.setSearchTerm(someSearchWords[0]);


        if(this.buildSearchUrl(someSearchWords)) {
            // create a new instance of the mHtmlParser with
            // the searchenginespecific parameters
            mHtmlParser = new HtmlParser();
            mHtmlParser.setRegExpFrame(mRegExpFrame);
            mHtmlParser.setRegExpItemSet(mRegExpItemSet);
            mHtmlParser.setRegExpItem(mRegExpItem);
            mHtmlParser.setNames(mNames);


            for(int i = 1; i < this.mMaxAmount; i += mRangeDelta) {
                String tSearchUrlString = this.getSearchUrlString() + this.buildIndexPart(i);


                // * Load page (with proxy)


                try {
                    // ** Set proxy
                    if(proxyFlag) {
                        System.getProperties().put("proxyHost", mProxyHost);
                        // proxyport
                        System.getProperties().put("proxyPort", mProxyPort);
                        // Reset the properties otherwise the HTMLLoader won't load any longer.
                        HttpClient.resetProperties();
                    }


                    KFMSystem.log.detail("SearchEngineReusltSet::StartSearch: "
                            + "Loading URL " + tSearchUrlString);
                    boolean tSuccessfulLoaded = mHtmlLoader.get(new URL(tSearchUrlString), 0);


                    // Is page really loaded?
                    if(tSuccessfulLoaded == false) {
                        // @@@
                        KFMSystem.log.info("SearchEngineReusltSet::StartSearch: "
                            + "Load error with searchstring: " + tSearchUrlString + ".");
                    }
                    // Then return content as a string.
                    // You could also take mHtmlLoader.getBody()
                    // which would deliver all between <body> and </body>.
                    mContent = mHtmlLoader.getContent();
                } catch(IOException e) {
                    /// @@@
                    KFMSystem.log.error("SearchEngineResultSet:: startSearch: Got an IOException", e);
                }
                // Parse document.
                mHtmlParser.parse(mContent);
            }
        } else {
            // @@@
            KFMSystem.log.info("Url could not be built at SearchEngineResultSet.buildSearchUrl()");
        }
    }


    /** This inner class stores the attributes ("Url", "Description", "Date") of a ResultItem.
    *
    * I don't want to document it all, because I think you can quite guess what its all about.
    *
    * Just keep in mind that a Description may turn out to be a summary
    * in the XML document etc.
    *
    * See interface "ResultItem.java" for full Documentation
    */
    static public class SearchEngineResultItem extends ResultItemAdapter
    {
        private URL mUrl;
        private String mDescription;
        private String mTitle;
        private String mAuthor;
        private String mLanguage;
        private Long mFileSize;
        private Hashtable mCategories[];
        private Double mHitScore;
        private Date mLastModified;


        public String getURL() { return mUrl.toString(); }


//      public void setURL(URL aURL) { mUrl = aURL; }


        public void setURL(String aURL)
            throws MalformedURLException
        {
            try {
                mUrl = new URL(aURL);
            } catch(MalformedURLException e) {
                // Something went wrong with the Url. Let's try it with adding a base:
                try {
                    if(aURL.startsWith("/")) {
                        aURL = mAbsoluteBase + aURL;
                    } else {
                        aURL = mRelativeBase + aURL;
                    }
                    // KFMSystem.log.info("Changed Url to " + aURL);
                    mUrl = new URL(aURL);
                } catch (MalformedURLException ex) {
                    KFMSystem.log.error("SearchEngineResultSet:: setURL: Got a MalformedURLException", e);
                }
            }


        }


        public String getTitle() { return mTitle; }
        public void setTitle(String aTitle) { mTitle = aTitle; }


        public String getAuthor() { return mAuthor; }
        public void setAuthor(String aAuthor) { mAuthor = aAuthor; }


        public Long getSize() { return mFileSize; }
        public void setSize(Long aFileSize) { mFileSize = aFileSize; }


        public Date getLastModified() { return mLastModified; }
        public void setLastModified(Date aLastModifiedDate) { mLastModified = aLastModifiedDate; }


        public Hashtable[] getCategories() { return mCategories; }
        public void setCategories(Hashtable[] aCategories) { mCategories = aCategories; }


        public void setDescription(String newDescription) { mDescription = newDescription; }
        public String getDescription() { return mDescription; }


        public Double getHitScore() { return mHitScore; }
        public void setHitScore(Double aHitScore) { mHitScore = aHitScore; }


        public String getLanguage() { return mLanguage; }
        public void setLanguage(String aLanguage) { mLanguage = aLanguage; }
    }
}
Source Code of appl.Portal.Utils.LinkSearch.SearchEngineResultSet$SearchEngineResultItem

Related Classes of appl.Portal.Utils.LinkSearch.SearchEngineResultSet$SearchEngineResultItem