Package appl.Portal.Utils.LinkSearch

Source Code of appl.Portal.Utils.LinkSearch.HtmlParser

/*
*  This software and supporting documentation were developed by
*
*    Siemens Corporate Technology
*    Competence Center Knowledge Management and Business Transformation
*    D-81730 Munich, Germany
*
*    Authors (representing a really great team ;-) )
*            Stefan B. Augustin, Thorbjørn Hansen, Manfred Langen
*
*  This software is Open Source under GNU General Public License (GPL).
*  Read the text of this license in LICENSE.TXT
*  or look at www.opensource.org/licenses/
*
*  Once more we emphasize, that:
*  THIS SOFTWARE IS MADE AVAILABLE,  AS IS,  WITHOUT ANY WARRANTY
*  REGARDING  THE  SOFTWARE,  ITS  PERFORMANCE OR
*  FITNESS FOR ANY PARTICULAR USE, FREEDOM FROM ANY COMPUTER DISEASES OR
*  ITS CONFORMITY TO ANY SPECIFICATION. THE ENTIRE RISK AS TO QUALITY AND
*  PERFORMANCE OF THE SOFTWARE IS WITH THE USER.
*
*/


// ************ package ****************************************************
package appl.Portal.Utils.LinkSearch;

// ************ imports ******************************************************

import appl.Portal.Utils.LinkSearch.Item;

// KFM
import KFM.Exceptions.ProgrammerException;
import KFM.log.*;

// OROMatcher packages
import com.oroinc.text.regex.*;

// java packages
import java.util.Hashtable;
import java.util.Vector;


public class HtmlParser
{

    /** Vector that stores the ResultSet locally.
    *
    * Remember: The whole ResultSet is never returned by
    * this class but always just ( Hashtable  - wise ) with getItem
    */
    private Vector mResultSet = new Vector();

    /**
    * This variable tells the method match( String, String, boolean, int )
    * from wich offset on to match.
    */
    private int mOffset;

    /** The HTML - document to be parsed is contained in this string.  */
    private String mHTMLFile;

    /**
    * debug flag
    */
    boolean mDebug = false;

    private String[] mNames;

    /**
    *
    */

    private String mRegExpFrame;

    /**
    *
    */
    private String mRegExpItemSet;

    private String mRegExpItem;

    /** Variable that holds the current item ( see class "Item.java" ). */
    private Item mItem;

    /** Constructor. */
    public HtmlParser (){}

    public String getRegExpFrame(){return mRegExpFrame; }
    public void setRegExpFrame( String aRegExpFrame ){mRegExpFrame = aRegExpFrame;}

    public String getRegExpItemSet(){return mRegExpItemSet;}
    public void setRegExpItemSet( String aRegExpItemSet ){mRegExpItemSet =aRegExpItemSet;}

    public String getRegExpItem(){return mRegExpItem;}
    public void setRegExpItem( String aRegExpItem ){mRegExpItem =aRegExpItem;}

    public String[] getNames(){return mNames;}
    public void setNames ( String[] aNames ){mNames = aNames;}

    /** New since 2002-12-12: Access mRegExpFrame.group(1) instead of group(0), which was a mistake. */
    public void parse ( String aHTMLFile )
    {

        mHTMLFile = aHTMLFile;
        MatchResult tMatcher;
        mOffset = 0;
        tMatcher = this.match(mRegExpFrame, mHTMLFile, true, mOffset );
        if( tMatcher != null)
        {
            KFMSystem.log.debug("Matched EXPFRAME");

            // shorter string for performance
            String tCutString = tMatcher.group(1);
            KFMSystem.log.debug("Extracted String is:");
            KFMSystem.log.debug(tCutString);

            // tell match to start matching at index mOffset
            mOffset = 0;

            while (true)
            {
                // tell match to start again matching at index mOffset
                tMatcher = this.match( mRegExpItemSet, tCutString, true, mOffset );

                // note: the tMatcher.endOffset( 0 ) is greater 1 than the offset of the
                // last matched charakter
                // match() found something
                if( tMatcher != null )
                {
                    // after every match the offset is set
                    // to the end of the last match
                    mOffset = tMatcher.endOffset( 0 );
                    //see SearchEngineResultSet.java for explanation of
                    // 'mRegExpItem' and 'mNames'
                    // 'tMatcher.group(0)' contains the whole matched
                    // string

                    KFMSystem.log.debug("Matched ITEMSET");
                    KFMSystem.log.debug("Extracted String is:");
                    KFMSystem.log.debug(tMatcher.group(0));

                    mItem = new Item( mRegExpItem, tMatcher.group(0), mNames );
                    // add the Element to that Vector wich contains all
                    // ResultItems
                    // Note: 'mItem.getItem()' returns a Hashtable!
                    Hashtable tItem = mItem.getItem();
                    if (!tItem.isEmpty()){
                        mResultSet.addElement( tItem);
                    }
                }
                else {
                    KFMSystem.log.debug("NO ITEMSET MATCH");
                    break;
                }
            }
        }

        KFMSystem.log.debug("parsing done");
    }

    public int getNumberOfItems()
    {
        return mResultSet.size();
    }

    /**
    * Returns one resultitem.
    */

    public Hashtable getItem( int aIndex )
    {
        return (Hashtable)mResultSet.elementAt(aIndex);
    }
    /**
    * Method wich matches a patternstring with a contentstring.
    * Here the regular expression metacharacter '.' matches
    * everything, even new lines ('\n'). See code below:
    *
    * For more information about the implementation of regular expression ba OROMatcher see:
    * $/KFM/www-docs/protected/developer/external-docu/OROMatcher-1.0.7/doc/index.html
    *
    * @return MatchResult
    */
    public MatchResult match (
        String patternString,
        String text,
        boolean caseSensitive,
        int Offset)
    {
        int groups;
        PatternMatcher matcher = new Perl5Matcher();
        PatternCompiler compiler = new Perl5Compiler();
        Pattern pattern = null;
        PatternMatcherInput input;
        MatchResult result;

        // Wenn you set the Perl5Compiler.SINGLELINE_MASK option
        // the contentstring is treated singleline, even if there
        // are some '\n' in it.
        try {
            pattern = compiler.compile(patternString,
                Perl5Compiler.SINGLELINE_MASK | Perl5Compiler.CASE_INSENSITIVE_MASK);
        } catch(MalformedPatternException e) {
            System.err.println("LinkSearch.HtmlParser.match: Bad pattern: `" + e.getMessage() + "�.");
            //@@@ System.exit(1);
            //@@@ Make this cleaner some day.
            throw new ProgrammerException("LinkSearch.HtmlParser.match: Bad pattern: `" + e.getMessage() + "�.");
        }

        input = new PatternMatcherInput(text);
        // For debugging purposes.
        //  KFMSystem.log.debug( text);

        // set the current Offset to prevent
        // that the matcher starts again from the beginning of the string
        input.setCurrentOffset( Offset );

        if(matcher.contains(input, pattern)) {
            result = matcher.getMatch();
        } else {
            result = null;
        }
        return result;
    }

}
TOP

Related Classes of appl.Portal.Utils.LinkSearch.HtmlParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.