Source Code of org.htmlparser.NodeReader

// $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/NodeReader.java,v 1.2 2004/02/10 13:41:10 woolfel Exp $
/*
 * ====================================================================
 * Copyright 2002-2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 */


// The developers of JMeter and Apache are greatful to the developers
// of HTMLParser for giving Apache Software Foundation a non-exclusive
// license. The performance benefits of HTMLParser are clear and the
// users of JMeter will benefit from the hard work the HTMLParser
// team. For detailed information about HTMLParser, the project is
// hosted on sourceforge at http://htmlparser.sourceforge.net/.
//
// HTMLParser was originally created by Somik Raha in 2000. Since then
// a healthy community of users has formed and helped refine the
// design so that it is able to tackle the difficult task of parsing
// dirty HTML. Derrick Oswald is the current lead developer and was kind
// enough to assist JMeter.




package org.htmlparser;


//////////////////
// Java Imports //
//////////////////
import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringWriter;


import org.htmlparser.parserHelper.StringParser;
import org.htmlparser.scanners.TagScanner;
import org.htmlparser.tags.EndTag;
import org.htmlparser.tags.Tag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;


/**
 * NodeReader builds on the BufferedReader, providing methods to read one element
 * at a time
 */
public class NodeReader extends BufferedReader
{
    public static final String DECIPHER_ERROR =
        "NodeReader.readElement() : Error occurred while trying to decipher the tag using scanners";
    protected int posInLine = -1;
    protected String line;
    protected Node node = null;
    protected TagScanner previousOpenScanner = null;
    protected String url;
    private Parser parser;
    private int lineCount;
    private String previousLine;
    private StringParser stringParser = new StringParser();
    private RemarkNodeParser remarkNodeParser = new RemarkNodeParser();
    private NodeList nextParsedNode = new NodeList();
    private boolean dontReadNextLine = false;
    /**
     * The constructor takes in a reader object, it's length and the url to be read.
     */
    public NodeReader(Reader in, int len, String url)
    {
        super(in, len);
        this.url = url;
        this.parser = null;
        this.lineCount = 1;
    }
    /**
     * This constructor basically overrides the existing constructor in the
     * BufferedReader class.
     * The URL defaults to an empty string.
     * @see #NodeReader(Reader,int,String)
     */


    public NodeReader(Reader in, int len)
    {
        this(in, len, "");
    }
    /**
     * The constructor takes in a reader object, and the url to be read.
     * The buffer size defaults to 8192.
     * @see #NodeReader(Reader,int,String)
     */
    public NodeReader(Reader in, String url)
    {
        this(in, 8192, url);
    }


    /**
     * Get the url for this reader.
     * @return The url specified in the constructor;
     */
    public String getURL()
    {
        return (url);
    }


    /**
     * This method is intended to be called only by scanners, when a situation of dirty html has arisen, 
     * and action has been taken to correct the parsed tags. For e.g. if we have html of the form :
     * <pre>
     * <a href="somelink.html"><img src=...><td><tr><a href="someotherlink.html">...</a>
     * </pre>
     * Now to salvage the first link, we'd probably like to insert an end tag somewhere (typically before the
     * second begin link tag). So that the parsing continues uninterrupted, we will need to change the existing
     * line being parsed, to contain the end tag in it. 
     */
    public void changeLine(String line)
    {
        this.line = line;
    }
    public String getCurrentLine()
    {
        return line;
    }
    /**
     * Get the last line number that the reader has read
     * @return int last line number read by the reader
     */
    public int getLastLineNumber()
    {
        return lineCount - 1;
    }


    /**
     * This method is useful when designing your own scanners. You might need to find out what is the location where the
     * reader has stopped last.
     * @return int Last position read by the reader
     */
    public int getLastReadPosition()
    {
        if (node != null)
            return node.elementEnd();
        else
            return 0;
    }


    /*
     * Read the next line
     * @return String containing the line
     */
    public String getNextLine()
    {
        try
        {
            previousLine = line;
            line = readLine();
            if (line != null)
                lineCount++;
            posInLine = 0;
            return line;
        }
        catch (IOException e)
        {
            System.err.println("I/O Exception occurred while reading!");
        }
        return null;
    }
    /**
     * Returns the parser object for which this reader exists
     * @return org.htmlparser.Parser
     */
    public Parser getParser()
    {
        return parser;
    }
    /**
     * Gets the previousOpenScanner.
     * @return Returns a TagScanner
     */
    public TagScanner getPreviousOpenScanner()
    {
        return previousOpenScanner;
    }


    /**
     * Returns true if the text at <code>pos</code> in <code>line</code> should be scanned as a tag.
     * Basically an open angle followed by a known special character or a letter.
     * @param line The current line being parsed.
     * @param pos The position in the line to examine.
     * @return <code>true</code> if we think this is the start of a tag.
     */
    private boolean beginTag(String line, int pos)
    {
        char ch;
        boolean ret;


        ret = false;


        if (pos + 2 <= line.length())
            if ('<' == line.charAt(pos))
            {
                ch = line.charAt(pos + 1);
                // the order of these tests might be optimized for speed
                if ('/' == ch
                    || '%' == ch
                    || Character.isLetter(ch)
                    || '!' == ch)
                    ret = true;
            }


        return (ret);
    }


    /**
     * Read the next element
     * @return Node - The next node
       */
    public Node readElement() throws ParserException
    {
        return (readElement(false));
    }


    /**
     * Read the next element
     * @param balance_quotes If <code>true</code> string nodes are parsed
     * paying attention to single and double quotes, such that tag-like
     * strings are ignored if they are quoted.
     * @return Node - The next node
       */
    public Node readElement(boolean balance_quotes) throws ParserException
    {
        try
        {
            if (nextParsedNode.size() > 0)
            {
                node = nextParsedNode.elementAt(0);
                nextParsedNode.remove(0);
                return node;
            }
            if (readNextLine())
            {
                do
                {
                    line = getNextLine();
                }
                while (line != null && line.length() == 0);


            }
            else if (dontReadNextLine)
            {
                dontReadNextLine = false;
            }
            else
                posInLine = getLastReadPosition() + 1;
            if (line == null)
                return null;


            if (beginTag(line, posInLine))
            {
                node = remarkNodeParser.find(this, line, posInLine);
                if (node != null)
                    return node;
                node = Tag.find(this, line, posInLine);
                if (node != null)
                {
                    Tag tag = (Tag) node;
                    try
                    {
                        node = tag.scan(parser.getScanners(), url, this);
                        return node;
                    }
                    catch (Exception e)
                    {
                        StringBuffer msgBuffer = new StringBuffer();
                        msgBuffer.append(
                            DECIPHER_ERROR
                                + "\n"
                                + "    Tag being processed : "
                                + tag.getTagName()
                                + "\n"
                                + "    Current Tag Line : "
                                + tag.getTagLine());
                        appendLineDetails(msgBuffer);
                        ParserException ex =
                            new ParserException(msgBuffer.toString(), e);


                        parser.getFeedback().error(msgBuffer.toString(), ex);
                        throw ex;
                    }
                }


                node = EndTag.find(line, posInLine);
                if (node != null)
                    return node;
            }
            else
            {
                node = stringParser.find(this, line, posInLine, balance_quotes);
                if (node != null)
                    return node;
            }


            return null;
        }
        catch (ParserException pe)
        {
            throw pe;
        }
        catch (Exception e)
        {
            StringBuffer msgBuffer =
                new StringBuffer("NodeReader.readElement() : Error occurred while trying to read the next element,");
            StringWriter sw = new StringWriter();
            e.printStackTrace(new PrintWriter(sw));
            appendLineDetails(msgBuffer);
            msgBuffer.append("\n Caused by:\n").append(
                sw.getBuffer().toString());
            ParserException ex = new ParserException(msgBuffer.toString(), e);
            parser.getFeedback().error(msgBuffer.toString(), ex);
            throw ex;
        }
    }
    public void appendLineDetails(StringBuffer msgBuffer)
    {
        msgBuffer.append("\nat Line ");
        msgBuffer.append(getLineCount());
        msgBuffer.append(" : ");
        msgBuffer.append(getLine());
        msgBuffer.append("\nPrevious Line ").append(getLineCount() - 1);
        msgBuffer.append(" : ").append(getPreviousLine());
    }
    /**
     * Do we need to read the next line ?
     * @return true - yes/ false - no
     */
    protected boolean readNextLine()
    {
        if (dontReadNextLine)
        {
            return false;
        }
        if (posInLine == -1
            || (line != null && node.elementEnd() + 1 >= line.length()))
            return true;
        else
            return false;
    }
    /**
     * The setParser method is used by the parser to put its own object into the reader. This happens internally,
     * so this method is not generally for use by the developer or the user.
     */
    public void setParser(Parser newParser)
    {
        parser = newParser;
    }
    /**
     * Sets the previousOpenScanner.
     * @param previousOpenScanner The previousOpenScanner to set
     */
    public void setPreviousOpenScanner(TagScanner previousOpenScanner)
    {
        this.previousOpenScanner = previousOpenScanner;
    }


    /**
     * @param lineSeparator New Line separator to be used
     */
    public static void setLineSeparator(String lineSeparator)
    {
        Node.setLineSeparator(lineSeparator);
    }


    /**
     * Gets the line seperator that is being used
     * @return String
     */
    public static String getLineSeparator()
    {
        return (Node.getLineSeparator());
    }
    /**
     * Returns the lineCount.
     * @return int
     */
    public int getLineCount()
    {
        return lineCount;
    }


    /**
     * Returns the previousLine.
     * @return String
     */
    public String getPreviousLine()
    {
        return previousLine;
    }


    /**
     * Returns the line.
     * @return String
     */
    public String getLine()
    {
        return line;
    }


    /**
     * Sets the lineCount.
     * @param lineCount The lineCount to set
     */
    public void setLineCount(int lineCount)
    {
        this.lineCount = lineCount;
    }


    /**
     * Sets the posInLine.
     * @param posInLine The posInLine to set
     */
    public void setPosInLine(int posInLine)
    {
        this.posInLine = posInLine;
    }


    public void reset() throws IOException
    {
        super.reset();
        lineCount = 1;
        posInLine = -1;
    }


    public StringParser getStringParser()
    {
        return stringParser;
    }


    /**
     * Adds the given node on the front of an internal list of pre-parsed nodes.
     * Used in recursive calls where downstream nodes have been recognized in
     * order to parse the current node.
     * @param nextParsedNode The node that will be returned next by the reader.
     */
    public void addNextParsedNode(Node nextParsedNode)
    {
        this.nextParsedNode.prepend(nextParsedNode);
    }


    public boolean isDontReadNextLine()
    {
        return dontReadNextLine;
    }


    public void setDontReadNextLine(boolean dontReadNextLine)
    {
        this.dontReadNextLine = dontReadNextLine;
    }


}
Source Code of org.htmlparser.NodeReader

Related Classes of org.htmlparser.NodeReader