Source Code of appl.Portal.Utils.LinkSearch.HtmlParser

/*
 *  This software and supporting documentation were developed by
 *
 *    Siemens Corporate Technology
 *    Competence Center Knowledge Management and Business Transformation
 *    D-81730 Munich, Germany
 *
 *    Authors (representing a really great team ;-) )
 *            Stefan B. Augustin, Thorbjørn Hansen, Manfred Langen
 *
 *  This software is Open Source under GNU General Public License (GPL). 
 *  Read the text of this license in LICENSE.TXT 
 *  or look at www.opensource.org/licenses/
 *
 *  Once more we emphasize, that:
 *  THIS SOFTWARE IS MADE AVAILABLE,  AS IS,  WITHOUT ANY WARRANTY
 *  REGARDING  THE  SOFTWARE,  ITS  PERFORMANCE OR
 *  FITNESS FOR ANY PARTICULAR USE, FREEDOM FROM ANY COMPUTER DISEASES OR
 *  ITS CONFORMITY TO ANY SPECIFICATION. THE ENTIRE RISK AS TO QUALITY AND
 *  PERFORMANCE OF THE SOFTWARE IS WITH THE USER.
 *
 */




// ************ package ****************************************************
package appl.Portal.Utils.LinkSearch;


// ************ imports ******************************************************


import appl.Portal.Utils.LinkSearch.Item;


// KFM
import KFM.Exceptions.ProgrammerException;
import KFM.log.*;


// OROMatcher packages
import com.oroinc.text.regex.*;


// java packages
import java.util.Hashtable;
import java.util.Vector;




public class HtmlParser
{


    /** Vector that stores the ResultSet locally.
    *
    * Remember: The whole ResultSet is never returned by
    * this class but always just ( Hashtable  - wise ) with getItem
    */
    private Vector mResultSet = new Vector();


    /**
    * This variable tells the method match( String, String, boolean, int )
    * from wich offset on to match.
    */
    private int mOffset;


    /** The HTML - document to be parsed is contained in this string.  */
    private String mHTMLFile;


    /**
    * debug flag
    */
    boolean mDebug = false;


    private String[] mNames;


    /**
    *
    */


    private String mRegExpFrame;


    /**
    *
    */
    private String mRegExpItemSet;


    private String mRegExpItem;


    /** Variable that holds the current item ( see class "Item.java" ). */
    private Item mItem;


    /** Constructor. */
    public HtmlParser (){}


    public String getRegExpFrame(){return mRegExpFrame; }
    public void setRegExpFrame( String aRegExpFrame ){mRegExpFrame = aRegExpFrame;}


    public String getRegExpItemSet(){return mRegExpItemSet;}
    public void setRegExpItemSet( String aRegExpItemSet ){mRegExpItemSet =aRegExpItemSet;}


    public String getRegExpItem(){return mRegExpItem;}
    public void setRegExpItem( String aRegExpItem ){mRegExpItem =aRegExpItem;}


    public String[] getNames(){return mNames;}
    public void setNames ( String[] aNames ){mNames = aNames;}


    /** New since 2002-12-12: Access mRegExpFrame.group(1) instead of group(0), which was a mistake. */
    public void parse ( String aHTMLFile )
    {


        mHTMLFile = aHTMLFile;
        MatchResult tMatcher;
        mOffset = 0;
        tMatcher = this.match(mRegExpFrame, mHTMLFile, true, mOffset );
        if( tMatcher != null)
        {
            KFMSystem.log.debug("Matched EXPFRAME");


            // shorter string for performance
            String tCutString = tMatcher.group(1);
            KFMSystem.log.debug("Extracted String is:");
            KFMSystem.log.debug(tCutString);


            // tell match to start matching at index mOffset
            mOffset = 0;


            while (true)
            {
                // tell match to start again matching at index mOffset
                tMatcher = this.match( mRegExpItemSet, tCutString, true, mOffset );


                // note: the tMatcher.endOffset( 0 ) is greater 1 than the offset of the
                // last matched charakter
                // match() found something
                if( tMatcher != null )
                {
                    // after every match the offset is set
                    // to the end of the last match
                    mOffset = tMatcher.endOffset( 0 );
                    //see SearchEngineResultSet.java for explanation of
                    // 'mRegExpItem' and 'mNames'
                    // 'tMatcher.group(0)' contains the whole matched
                    // string


                    KFMSystem.log.debug("Matched ITEMSET");
                    KFMSystem.log.debug("Extracted String is:");
                    KFMSystem.log.debug(tMatcher.group(0));


                    mItem = new Item( mRegExpItem, tMatcher.group(0), mNames );
                    // add the Element to that Vector wich contains all
                    // ResultItems
                    // Note: 'mItem.getItem()' returns a Hashtable!
                    Hashtable tItem = mItem.getItem();
                    if (!tItem.isEmpty()){
                        mResultSet.addElement( tItem);
                    }
                }
                else {
                    KFMSystem.log.debug("NO ITEMSET MATCH");
                    break;
                }
            }
        }


        KFMSystem.log.debug("parsing done");
    }


    public int getNumberOfItems()
    {
        return mResultSet.size();
    }


    /**
    * Returns one resultitem.
    */


    public Hashtable getItem( int aIndex )
    {
        return (Hashtable)mResultSet.elementAt(aIndex);
    }
    /**
    * Method wich matches a patternstring with a contentstring.
    * Here the regular expression metacharacter '.' matches
    * everything, even new lines ('\n'). See code below:
    *
    * For more information about the implementation of regular expression ba OROMatcher see:
    * $/KFM/www-docs/protected/developer/external-docu/OROMatcher-1.0.7/doc/index.html
    *
    * @return MatchResult
    */
    public MatchResult match (
        String patternString,
        String text,
        boolean caseSensitive,
        int Offset)
    {
        int groups;
        PatternMatcher matcher = new Perl5Matcher();
        PatternCompiler compiler = new Perl5Compiler();
        Pattern pattern = null;
        PatternMatcherInput input;
        MatchResult result;


        // Wenn you set the Perl5Compiler.SINGLELINE_MASK option
        // the contentstring is treated singleline, even if there
        // are some '\n' in it.
        try {
            pattern = compiler.compile(patternString,
                Perl5Compiler.SINGLELINE_MASK | Perl5Compiler.CASE_INSENSITIVE_MASK);
        } catch(MalformedPatternException e) {
            System.err.println("LinkSearch.HtmlParser.match: Bad pattern: `" + e.getMessage() + "�.");
            //@@@ System.exit(1);
            //@@@ Make this cleaner some day.
            throw new ProgrammerException("LinkSearch.HtmlParser.match: Bad pattern: `" + e.getMessage() + "�.");
        }


        input = new PatternMatcherInput(text);
        // For debugging purposes.
        //  KFMSystem.log.debug( text);


        // set the current Offset to prevent
        // that the matcher starts again from the beginning of the string
        input.setCurrentOffset( Offset );


        if(matcher.contains(input, pattern)) {
            result = matcher.getMatch();
        } else {
            result = null;
        }
        return result;
    }


}
Source Code of appl.Portal.Utils.LinkSearch.HtmlParser

Related Classes of appl.Portal.Utils.LinkSearch.HtmlParser