// Package de.susebox.jtopas
//
// Source Code of de.susebox.jtopas.AbstractTokenizer

/*
* AbstractTokenizer.java: base class for Tokenizer implementations.
*
* Copyright (C) 2004 Heiko Blau
*
* This file belongs to the JTopas Library.
* JTopas is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the
* Free Software Foundation; either version 2.1 of the License, or (at your
* option) any later version.
*
* This software is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License along
* with JTopas. If not, write to the
*
*   Free Software Foundation, Inc.
*   59 Temple Place, Suite 330,
*   Boston, MA 02111-1307
*   USA
*
* or check the Internet: http://www.fsf.org
*
* Contact:
*   email: heiko@susebox.de
*/

package de.susebox.jtopas;

//-----------------------------------------------------------------------------
// Imports
//
import java.io.Reader;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.LinkedList;
import java.util.Arrays;

import de.susebox.java.lang.ExtIndexOutOfBoundsException;

import de.susebox.jtopas.spi.WhitespaceHandler;
import de.susebox.jtopas.spi.KeywordHandler;
import de.susebox.jtopas.spi.PatternHandler;
import de.susebox.jtopas.spi.SeparatorHandler;
import de.susebox.jtopas.spi.SequenceHandler;

import de.susebox.jtopas.spi.StandardWhitespaceHandler;
import de.susebox.jtopas.spi.StandardKeywordHandler;
import de.susebox.jtopas.spi.StandardSeparatorHandler;
import de.susebox.jtopas.spi.StandardSequenceHandler;

import de.susebox.jtopas.spi.DataProvider;
import de.susebox.jtopas.spi.DataMapper;


//-----------------------------------------------------------------------------
// Class AbstractTokenizer
//

/**<p>
* Base class for {@link Tokenizer} implementations. <code>AbstractTokenizer</code>
* separates the data analysis from the actual data provision. Although the class
* maintains read and write positions the physical representation of the logical
* character buffer behind these positions concerns only the subclasses.
*</p>
*
* @see Tokenizer
* @see TokenizerProperties
* @author Heiko Blau
*/
public abstract class AbstractTokenizer implements Tokenizer, TokenizerPropertyListener {

  //---------------------------------------------------------------------------
  // Abstract methods
  //

  /**
   * Subclasses have to provide {@link de.susebox.jtopas.spi.DataProvider}
   * instances for various token type handlers. The given start position is the
   * absolute number of characters from the beginning of the data source.
   *
   * @param   startPos    absolute position in the input data where the range begins
   * @param   length      number of characters the provider must cover
   * @return  the <code>DataProvider</code> for the given data range
   */
  protected abstract DataProvider getDataProvider(int startPos, int length);

  /**
   * This method is called when the tokenizer runs out of data. Its main purpose
   * is to call the {@link TokenizerSource#read} method. It is also responsible
   * for honouring the {@link TokenizerProperties#F_KEEP_DATA} flag.
   *
   * @return  number of read bytes or -1 if an end-of-file condition occurred
   * @throws  TokenizerException wrapped exceptions from the {@link TokenizerSource#read}
   *          method
   */
  protected abstract int readMoreData() throws TokenizerException;
 
 
  //---------------------------------------------------------------------------
  // Constructors
  //
 
  /**
   * Default constructor that sets the tokenizer control flags as it would be
   * appropriate for C/C++ and Java. Found token images are copied. No line nor
   * column informations are provided. Nested comments are not allowed.
   *<br>
   * The tokenizer will use the {@link TokenizerProperties#DEFAULT_WHITESPACES}
   * and {@link TokenizerProperties#DEFAULT_SEPARATORS} for whitespace and
   * separator handling.
   */ 
  public AbstractTokenizer() {
    // this instance is its own base tokenizer until embedded via addTokenizer
    _baseTokenizer = this;
    // NOTE(review): lazy initialization of the shared static default properties
    // is not synchronized - two threads constructing tokenizers concurrently
    // could each create a StandardTokenizerProperties instance. Confirm whether
    // single-threaded construction is an assumed precondition.
    if (_defaultProperties == null) {
      _defaultProperties = new StandardTokenizerProperties();
    }
    setTokenizerProperties(_defaultProperties);
  }
 
  /**
   * Constructing an <code>AbstractTokenizer</code> with a backing {@link TokenizerProperties}
   * instance.
   *
   * @param properties  a {@link TokenizerProperties} object containing the
   *                    settings for the tokenizing process
   */
  public AbstractTokenizer(TokenizerProperties properties) {
    // this instance is its own base tokenizer until embedded via addTokenizer
    _baseTokenizer = this;
    setTokenizerProperties(properties);
  }

 
  //---------------------------------------------------------------------------
  // data source
  //
 
  /**
   * Setting the source of data. This method is usually called during setup of
   * the <code>Tokenizer</code> but may also be invoked while the tokenizing
   * is in progress. It will reset the tokenizers input buffer, line and column
   * counters etc.
   *<br>
   * Subclasses should override this method to do their own actions on a data source
   * change. Generally, this base method should be called first in the subclass
   * implementation of <code>setSource</code> (equivalent to super calls in
   * constructors of derived classes).
   *
   * @param source  a {@link TokenizerSource} to read data from
   * @see #getSource
   */
  public void setSource(TokenizerSource source) {
    _source           = source;
    _eofReached       = false;
    _currentReadPos   = 0;
    _currentWritePos  = 0;
    if (isFlagSet(Flags.F_COUNT_LINES)) {
      _lineNumber     = 0;
      _columnNumber   = 0;
    } else {
      _lineNumber     = -1;
      _columnNumber   = -1;
    }
    Arrays.fill(_scannedToken, null);
  }
 
  /**
   * Convenience method to avoid the construction of a {@link TokenizerSource}
   * from the most important data source {@link java.io.Reader}. The reader is
   * wrapped into a {@link ReaderSource} and passed to {@link #setSource(TokenizerSource)}.
   *
   * @param reader  the {@link java.io.Reader} to get data from
   */
  public void setSource(Reader reader) {
    setSource(new ReaderSource(reader));
  }
  
  /**
   * Retrieving the {@link TokenizerSource} of this <code>Tokenizer</code>. The
   * method may return <code>null</code> if there is no <code>TokenizerSource</code>
   * associated with it.
   *
   * @return the {@link TokenizerSource} associated with this <code>Tokenizer</code>
   * @see #setSource
   */
  public TokenizerSource getSource() {
    return _source;
  }
 
 
  //---------------------------------------------------------------------------
  // Methods of the Tokenizer interface
  //
 
  /**
   * Setting the tokenizer characteristics. See the method description in
   * {@link Tokenizer}.
   *
   * @param   props   the {@link TokenizerProperties} for this tokenizer
   * @throws  NullPointerException if the <code>null</code> is passed to the call
   * @see     #getTokenizerProperties
   */
  public void setTokenizerProperties(TokenizerProperties props) throws NullPointerException {
    if (props == null) {
      throw new NullPointerException();
    }

    // set properties
    if (_properties != null) {
      _properties.removeTokenizerPropertyListener(this);
    }
    _properties = props;
    _properties.addTokenizerPropertyListener(this);

    // who is going to handle the various token types ?
    if (_properties instanceof WhitespaceHandler) {
      setWhitespaceHandler((WhitespaceHandler)_properties);
    } else {
      setWhitespaceHandler(new StandardWhitespaceHandler(_properties));
    }
    if (_properties instanceof SeparatorHandler) {
      setSeparatorHandler((SeparatorHandler)_properties);
    } else {
      setSeparatorHandler(new StandardSeparatorHandler(_properties));
    }
    if (_properties instanceof SequenceHandler) {
      setSequenceHandler((SequenceHandler)_properties);
    } else {
      setSequenceHandler(new StandardSequenceHandler(_properties));
    }
    if (props instanceof KeywordHandler) {
      setKeywordHandler((KeywordHandler)props);
    } else {
      setKeywordHandler(new StandardKeywordHandler(_properties));
    }
    if (_properties instanceof PatternHandler) {
      setPatternHandler((PatternHandler)_properties);
    } else {
      setPatternHandler(null);
    }
     
    // flag handling
    int newFlags = _properties.getParseFlags();

    if (newFlags != _flags) {
      propertyChanged(new TokenizerPropertyEvent(
                            TokenizerPropertyEvent.PROPERTY_MODIFIED,
                            new TokenizerProperty(TokenizerProperty.PARSE_FLAG_MASK,
                                                  new String[] { Integer.toBinaryString(newFlags) } ),
                            new TokenizerProperty(TokenizerProperty.PARSE_FLAG_MASK,
                                                  new String[] { Integer.toBinaryString(_flags) } )));
    }
  }

  /**
   * Retrieving the current tokenizer characteristics. See the method description
   * in {@link Tokenizer}.
   *
   * @return  the {@link TokenizerProperties} of this <code>Tokenizer</code>
   * @see     #setTokenizerProperties
   */
  public TokenizerProperties getTokenizerProperties() {
    return _properties;
  }
 
  /**
   * Setting the control flags of the <code>Tokenizer</code>. See the method
   * description in {@link Tokenizer}.
   *
   * @param flags the parser control flags
   * @param mask  the mask for the flags to set or unset
   * @throws TokenizerException if one or more of the flags given cannot be honored
   * @see   #getParseFlags
   */
  public void changeParseFlags(int flags, int mask) throws TokenizerException {
    // test the given flags
    if ((mask | VALID_FLAGS_MASK) != VALID_FLAGS_MASK) {
      throw new TokenizerException(
                  "One or more flags cannot be set separately for a {0}. Violating flags in {1}: {2}.",
                  new Object[] { AbstractTokenizer.class.getName(),
                                 Integer.toHexString(flags),
                                 Integer.toHexString(mask & ~VALID_FLAGS_MASK) } );
    }
   
    // set the new flags for this tokenizer
    _flagMask = mask;
    _flags    = (flags & mask) | (getTokenizerProperties().getParseFlags() & ~mask);

    // when counting lines initialize the current line and column position
    if ( ! isFlagSet(Flags.F_COUNT_LINES)) {
      _lineNumber   = 0;
      _columnNumber = 0;
    }
  }

  /**
   * Retrieving the parser control flags. See the method description in
   * {@link Tokenizer}.
   *<br>
   * Flags covered by the mask set via {@link #changeParseFlags} are taken from
   * this tokenizer, all others from the backing {@link TokenizerProperties}.
   * The two masked parts are bitwise disjoint, so the addition below is
   * equivalent to a bitwise OR.
   *
   * @return the current parser control flags
   * @see #changeParseFlags
   */
  public int getParseFlags() {
    return (getTokenizerProperties().getParseFlags() & ~_flagMask) + (_flags & _flagMask);
  }
 
  /**
   * Setting a new {@link de.susebox.jtopas.spi.KeywordHandler} or removing any
   * previously installed one. See the method description in {@link Tokenizer}.
   *
   * @param handler the (new) {@link KeywordHandler} to use or <code>null</code>
   *                to remove it
   */
  public void setKeywordHandler(de.susebox.jtopas.spi.KeywordHandler handler) {
    synchronized(this) {
      if (handler == _properties) {
        if (_properties != null && _properties.getKeywords().hasNext()) {
          _keywordHandler = handler;
        } else {
          _keywordHandler = null;
        }
        _internalFlags &= ~IFLAG_EXTERNAL_KEYWORD_HANDLER;
      } else {
        _keywordHandler = handler;
        _internalFlags |= IFLAG_EXTERNAL_KEYWORD_HANDLER;
      }
    }
  }
 
  /**
   * Retrieving the current {@link de.susebox.jtopas.spi.KeywordHandler}. See the
   * method description in {@link Tokenizer}.
   *
   * @return the currently active whitespace keyword or <code>null</code>, if
   *         keyword support is switched off
   */
  public de.susebox.jtopas.spi.KeywordHandler getKeywordHandler() {
    synchronized(this) {
      if ((_internalFlags & IFLAG_EXTERNAL_KEYWORD_HANDLER) == 0) {
        return (de.susebox.jtopas.spi.KeywordHandler)getTokenizerProperties();
      } else {
        return _keywordHandler;
      }
    }
  }
 
  /**
   * Setting a new {@link de.susebox.jtopas.spi.WhitespaceHandler} or removing any
   * previously installed one. See the method description in {@link Tokenizer}.
   *
   * @param handler the (new) whitespace handler to use or <code>null</code> to
   *                switch off whitespace handling
   * @see   #getWhitespaceHandler
   */
  public void setWhitespaceHandler(de.susebox.jtopas.spi.WhitespaceHandler handler) {
    _whitespaceHandler = handler;
  }
 
  /**
   * Retrieving the current {@link de.susebox.jtopas.spi.WhitespaceHandler}. See
   * the method description in {@link Tokenizer}.
   *
   * @return  the currently active whitespace handler or <code>null</code>, if
   *          the base implementation is working
   */
  public de.susebox.jtopas.spi.WhitespaceHandler getWhitespaceHandler() {
    return _whitespaceHandler;
  }
 
  /**
   * Setting a new {@link de.susebox.jtopas.spi.SeparatorHandler} or removing any
   * previously installed <code>SeparatorHandler</code>. See the method description
   * in {@link Tokenizer}.
   *
   * @param handler the (new) separator handler to use or <code>null</code> to
   *                remove it
   * @see   #getSeparatorHandler
   */
  public void setSeparatorHandler(de.susebox.jtopas.spi.SeparatorHandler handler) {
    _separatorHandler = handler;
  }
 
  /**
   * Retrieving the current {@link de.susebox.jtopas.spi.SeparatorHandler}. See
   * the method description in {@link Tokenizer}.
   *
   * @return  the currently active {@link SeparatorHandler} or <code>null</code>,
   *          if separators aren't recognized by the tokenizer
   * @see     #setSeparatorHandler
   */
  public de.susebox.jtopas.spi.SeparatorHandler getSeparatorHandler() {
    return _separatorHandler;
  }
 
  /**
   * Setting a new {@link de.susebox.jtopas.spi.SequenceHandler} or removing any
   * previously installed one. See the method description in {@link Tokenizer}.
   *
   * @param handler   the (new) {@link SequenceHandler} to use or null to remove it
   */
  public void setSequenceHandler(de.susebox.jtopas.spi.SequenceHandler handler) {
    synchronized(this) {
      if (handler == _properties) {
        if (_properties != null && (   _properties.getSpecialSequences().hasNext()
                                    || _properties.getStrings().hasNext()
                                    || _properties.getBlockComments().hasNext()
                                    || _properties.getLineComments().hasNext())) {
          _sequenceHandler = handler;
        } else {
          _sequenceHandler = null;
        }
        _internalFlags &= ~IFLAG_EXTERNAL_SEQUENCE_HANDLER;
      } else {
        _sequenceHandler = handler;
        _internalFlags |= IFLAG_EXTERNAL_SEQUENCE_HANDLER;
      }
    }
  }
 
  /**
   * Retrieving the current {@link SequenceHandler}. See the method description
   * in {@link Tokenizer}.
   *
   * @return  the currently active {@link SequenceHandler} or null, if the base
   *          implementation is working
   */
  public de.susebox.jtopas.spi.SequenceHandler getSequenceHandler() {
    synchronized(this) {
      if ((_internalFlags & IFLAG_EXTERNAL_SEQUENCE_HANDLER) == 0) {
        return (de.susebox.jtopas.spi.SequenceHandler)getTokenizerProperties();
      } else {
        return _sequenceHandler;
      }
    }
  }
 
  /**
   * Setting a new {@link de.susebox.jtopas.spi.PatternHandler} or removing any
   * previously installed one. See the method description in {@link Tokenizer}.
   *
   * @param handler the (new) {@link de.susebox.jtopas.spi.PatternHandler} to
   *                use or <code>null</code> to remove it
   * @see #getPatternHandler
   */
  public void setPatternHandler(de.susebox.jtopas.spi.PatternHandler handler) {
    synchronized(this) {
      if (handler == _properties) {
        if (_properties != null && _properties.getPatterns().hasNext()) {
          _patternHandler = handler;
        } else {
          _patternHandler = null;
        }
        _internalFlags &= ~IFLAG_EXTERNAL_PATTERN_HANDLER;
      } else {
        _patternHandler = handler;
        _internalFlags |= IFLAG_EXTERNAL_PATTERN_HANDLER;
      }
    }
  }
 
  /**
   * Retrieving the current {@link de.susebox.jtopas.spi.PatternHandler}. See the
   * method description in {@link Tokenizer}.
   *
   * @return  the currently active {@link de.susebox.jtopas.spi.PatternHandler}
   *          or <code>null</code>, if patterns are not recognized by the tokenizer
   * @see #setPatternHandler
   */
  public de.susebox.jtopas.spi.PatternHandler getPatternHandler() {
    synchronized(this) {
      if ((_internalFlags & IFLAG_EXTERNAL_PATTERN_HANDLER) == 0) {
        return (de.susebox.jtopas.spi.PatternHandler)getTokenizerProperties();
      } else {
        return _patternHandler;
      }
    }
  }
 
  /**
   * Query the current row. The method can only be used if the flag {@link TokenizerProperties#F_COUNT_LINES}
   * has been set. Without this flag being set, the return value is undefined.
   *<br>
   * Note that row counting starts with 0, while editors often use 1 for the first
   * row.
   *
   * @return current row (starting with 0)
   *         or -1 if the flag {@link TokenizerProperties#F_COUNT_LINES} is not set
   */
  public int getCurrentLine() {
    return _lineNumber;
  }
 
  /**
   * Retrieve the current column. The method can only be used if the flag <code>F_COUNT_LINES</code>
   * has been set.
   * Without this flag being set, the return value is undefined.
   * Note that column counting starts with 0, while editors often use 1 for the first
   * column in one row.
   *
   * @return current column number (starting with 0)
   *         or -1 if the flag <code>F_COUNT_LINES</code> is not set
   */
  public int getCurrentColumn() {
    return _columnNumber;
  }
 
  /**
   * Checking if there are more tokens available. See the method description in
   * {@link Tokenizer}.
   *
   * @return  <code>true</code> if a call to {@link #nextToken} or {@link #nextImage}
   *          will succeed, <code>false</code> otherwise
   */
  public boolean hasMoreToken() {
    // more tokens exist until the current token (slot 0) is the EOF token
    return _scannedToken[0] == null || _scannedToken[0].getType() != Token.EOF;
  }
 
  /**
   * Retrieving the next {@link Token}. See the method description in
   * {@link Tokenizer}.
   *<br>
   * The method operates on the look-ahead buffer <code>_scannedToken</code>:
   * slot 0 holds the current token, slots 1 and 2 hold tokens that were already
   * recognized while analyzing previous input. Tokens that are not to be
   * returned to the caller (e.g. whitespaces or comments, depending on the
   * parse flags) are consumed inside the loop until a returnable token is found.
   *
   * @return found {@link Token} including the EOF token
   * @throws TokenizerException generic exception (list) for all problems that may occur while parsing
   *         (IOExceptions for instance)
   */
  public Token nextToken() throws TokenizerException {
    boolean returnIt = false;
   
    // Get the next token
    // NOTE(review): this label is not referenced by any break/continue in the
    // visible method body
__MAIN_LOOP__:
    do {
      // analyze look-ahead token: the is*-methods fill _scannedToken[1] when
      // they recognize their token type; otherwise a NORMAL token is assumed
      if (_scannedToken[1] == null) {
        if ( ! isEOF(0)) {
          if ( ! isWhitespace(0)) {
            if ( ! isPattern(0, false)) {
              if ( ! isSpecialSequence(0)) {
                if ( ! isSeparator(0)) {
                  _scannedToken[1] = new Token(Token.NORMAL);
                }
              }
            }
          }
        }
      }
      // shift the look-ahead buffer: slot 1 becomes the current token
      _scannedToken[0] = _scannedToken[1];
      _scannedToken[1] = _scannedToken[2];
      _scannedToken[2] = null;
     
      // get new token or complete the previously found look-ahead token
      Token             token = _scannedToken[0];
      TokenizerProperty prop  = (TokenizerProperty)token.getCompanion();
     
      token.setCompanion((prop != null) ? prop.getCompanion() : null);
      token.setStartPosition(getReadPosition());
      token.setStartLine(_lineNumber);
      token.setStartColumn(_columnNumber);

      returnIt = true;
     
      // determine the token length and whether it is returned to the caller
      switch (token.getType()) {
      case Token.EOF:
        token.setLength(0);
        break;
      case Token.WHITESPACE:
        token.setLength(completeWhitespace());
        returnIt = isFlagSet(Flags.F_RETURN_SIMPLE_WHITESPACES);
        break;
      case Token.SEPARATOR:     // Separators are always single characters.
        token.setLength(1);
        break;
      case Token.STRING:
        token.setLength(completeString(prop));
        break;
      case Token.LINE_COMMENT:
        token.setLength(completeLineComment(prop));
        returnIt = isFlagSet(prop, Flags.F_RETURN_LINE_COMMENTS);
        break;
      case Token.BLOCK_COMMENT:
        token.setLength(completeBlockComment(prop));
        returnIt = isFlagSet(prop, Flags.F_RETURN_BLOCK_COMMENTS);
        break;
      case Token.SPECIAL_SEQUENCE:
        token.setLength(prop.getImages()[0].length());
        break;
      case Token.PATTERN:
        // already contained in the first look-ahead token, see token shifting
        break;
      default:
        prop = completeBoundedToken(token);
      }
     
      // compute new line and column positions (if flag is set) and complete
      // the token
      adjustLineAndColumn(token.getType(), token.getLength());
      token.setEndLine(_lineNumber);
      token.setEndColumn(_columnNumber);

      // need to extract the image ?
      if (returnIt) {
        // token-level flags (from prop) take precedence over tokenizer flags
        boolean tokenPosOnly     = (prop != null) ? isFlagSet(prop, Flags.F_TOKEN_POS_ONLY) :
                                                    isFlagSet(Flags.F_TOKEN_POS_ONLY);
        boolean returnImageParts = (prop != null) ? isFlagSet(prop, Flags.F_RETURN_IMAGE_PARTS) :
                                                    isFlagSet(Flags.F_RETURN_IMAGE_PARTS);
        if ( ! tokenPosOnly || returnImageParts) {   
          token.setImage(getText(_currentReadPos, token.getLength()));
        }
        if (returnImageParts) {
          // split the image into its logical parts, depending on the token type
          switch (token.getType()) {
          case Token.WHITESPACE:
            token.setImageParts(splitIntoLines(token.getImage()));
            break;
          case Token.STRING:
            token.setImageParts(splitString(prop, token.getImage()));
            break;
          case Token.LINE_COMMENT:
            token.setImageParts(splitIntoLines(token.getImage().substring(prop.getImages()[0].length())));
            break;
          case Token.BLOCK_COMMENT:
            token.setImageParts(splitBlockComment(prop, token.getImage()));
            break;
          case Token.PATTERN:
            break;
          case Token.EOF:
            token.setImageParts(new String[] {} );
            break;
          default:
            token.setImageParts(new String[] { token.getImage() } );
          }
        }
      }

      // this is the one and only point where the current read position is
      // adjusted (except for the data shifting in readMoreData).
      _currentReadPos += token.getLength();
   
    } while ( ! returnIt);

    // the current token is the first in the list
    return _scannedToken[0];
  }
 
  /**
   * This method is a convenience method. It returns only the next token image
   * without any informations about its type or associated information. See the
   * method description in {@link Tokenizer}.
   *
   * @return the token image of the next token
   * @throws TokenizerException generic exception (list) for all problems that may occur while parsing
   *         (IOExceptions for instance)
   * @see #currentImage
   */
  public String nextImage() throws TokenizerException {
    // advance to the next token, then delegate the image extraction
    nextToken();
    return currentImage();
  }
 
  /**
   * Retrieve the {@link Token} that was found by the last call to {@link #nextToken}.
   * See the method description in {@link Tokenizer}.
   *
   * @return  the {@link Token} retrieved by the last call to {@link #nextToken}.
   * @throws  TokenizerException if the tokenizer has no current token
   */
  public Token currentToken() throws TokenizerException {
    if (_scannedToken[0] == null) {
      throw new TokenizerException("No current token available (nextToken was not called / read position changed)");
    }
    return _scannedToken[0];
  }
  /**
   * Convenience method to retrieve only the token image of the {@link Token} that
   * would be returned by {@link #currentToken}. See the method description in
   * {@link Tokenizer}.
   *
   * @return the token image of the current token
   * @see #currentToken
   */
  public String currentImage() throws TokenizerException {
    Token token = currentToken();
   
    if (token.getType() == Token.EOF) {
      return null;
    } else if ( ! isFlagSet(Flags.F_TOKEN_POS_ONLY) || token.getImage() != null) {
      return token.getImage();
    } else {
      return getText(token.getStartPosition(), token.getLength());
    }
  }
 
  /**
   * If the flag {@link TokenizerProperties#F_COUNT_LINES} is set, this method will
   * return the line number starting with 0 in the input stream. See the method
   * description in {@link Tokenizer}.
   *
   * @return the current line number starting with 0 or -1 if no line numbers are supplied.
   * @see #getColumnNumber
   */ 
  public int getLineNumber() {
    return _lineNumber;
  }
 
  /**
   * If the flag {@link TokenizerProperties#F_COUNT_LINES} is set, this method will
   * return the current column position starting with 0 in the input stream. See
   * the method description in {@link Tokenizer}.
   *
   * @return the current column position or -1 if no column numbers are supplied
   * @see #getLineNumber
   */ 
  public int getColumnNumber() {
    return _columnNumber;
  }
 
  /**
   * Getting the current read offset. See the method description in
   * {@link Tokenizer}.
   *
   * @return the absolute offset in characters from the start of the data source
   *         of the Tokenizer where reading will be continued
   * @see #setReadPositionAbsolute
   * @see #setReadPositionRelative
   */
  public int getReadPosition() {
    return _currentReadPos;
  }
 
  /**
   * Retrieving the number of the currently available characters. See the method
   * description in {@link Tokenizer}.
   *
   * @return number of currently available characters (size of the text window)
   */
  public int currentlyAvailable() {
    return _currentWritePos - getRangeStart();
  }
 
  /**
   * Try to read more data into the text buffer of the tokenizer. See the method
   * description in {@link Tokenizer}.
   *
   * @return  the number of characters now available
   * @throws  TokenizerException generic exception (list) for all problems that
   *          may occur while reading (IOExceptions for instance)
   */
  public int readMore() throws TokenizerException {
    // delegate the actual reading to the base tokenizer that owns the buffer
    readMoreDataFromBase();
    return currentlyAvailable();
  }

  /**
   * Returns the character at the given position. The method does not attempt to
   * read more data.
   *
   * @param   pos   get character on this position in the data stream
   * @return  the character at the given position
   * @throws  IndexOutOfBoundsException if the parameter <code>pos</code> is not
   *          in the available text range (text window)
   */
  public char getChar(int pos) throws IndexOutOfBoundsException {
    return getBaseDataProvider(pos, 1).getCharAt(0);
  }
 
  /**
   * Retrieve text from the currently available range. See the method description
   * in {@link Tokenizer}.
   *
   * @param   start   position where the text begins
   * @param   len     length of the text
   * @return  the text beginning at the given position with the given length
   * @throws  IndexOutOfBoundsException if the starting position or the length
   *          is out of the current text window
   */
  public String getText(int start, int len) throws IndexOutOfBoundsException {
    return getBaseDataProvider(start, len).toString();
  }
 
  /**
   * This method sets the tokenizers current read position to the given absolute
   * read position. See the method description in {@link Tokenizer}.
   *<br>
   * When using this method with embedded tokenizers, the user is responsible to
   * set the read position in the currently used tokenizer. It will be propagated
   * by the next call to {@link #switchTo}. Until that point, a call to this
   * method has no effect on the other tokenizers sharing the same data source.
   *
   * @param   position  absolute position for the next parse operation
   * @throws  IndexOutOfBoundsException if the parameter <code>position</code> is
   *          not in the available text range (text window)
   * @see     #setReadPositionRelative
   */
  public void setReadPositionAbsolute(int position) throws IndexOutOfBoundsException {
    if (position < getRangeStart()) {
      throw new ExtIndexOutOfBoundsException(
                  "Invalid read position {0} below the current text window start {1}.",
                  new Object[] { new Integer(position), new Integer(getRangeStart()) }
                );
    } else if (position > _currentWritePos) {
      throw new ExtIndexOutOfBoundsException(
                  "Invalid read position {0} at or above the current text window end {1}.",
                  new Object[] { new Integer(position), new Integer(currentlyAvailable() + getRangeStart()) }
                );
    }
    _currentReadPos = position;
    Arrays.fill(_scannedToken, null);
   
    // adjust line and column counting
    if (isFlagSet(Flags.F_COUNT_LINES)) {
      SortedMap map         = _position2LineMap.headMap(new Integer(position + 1));
     
      if (map != null &&  ! map.isEmpty()) {
        Integer lastLineStart = (Integer)map.lastKey();
       
        _lineNumber   = ((Integer)map.get(lastLineStart)).intValue();
        _columnNumber = position - lastLineStart.intValue();
      } else {
        _lineNumber   = 0;
        _columnNumber = position;
      }
    }
  } 

  /**
   * This method sets the tokenizers new read position the given number of characters
   * forward (positive value) or backward (negative value) starting from the current
   * read position. See the method description in {@link Tokenizer}.
   *<br>
   * When using this method with embedded tokenizers, the user is responsible to
   * set the read position in the currently used tokenizer. It will be propagated
   * by the next call to {@link #switchTo}. Until that point, a call to this
   * method has no effect on the other tokenizers sharing the same data source.
   *
   * @param   offset  number of characters to move forward (positive offset) or
   *                 backward (negative offset)
   * @throws  IndexOutOfBoundsException if the parameter <code>offset</code> would
   *          move the read position out of the available text range (text window)
   * @see     #setReadPositionAbsolute
   */
  public void setReadPositionRelative(int offset) throws IndexOutOfBoundsException {
    // delegate validation and state adjustment to the absolute variant
    setReadPositionAbsolute(getReadPosition() + offset);
  }

  /**
   * Closing this tokenizer frees resources and deregisters from the
   * associated {@link TokenizerProperties} object. After this call the
   * tokenizer is no longer usable; all handlers, the data source and the
   * embedded-tokenizer chain links are released.
   */
  public void close() {
    // deregister from the properties
    if (_properties != null) {
      _properties.removeTokenizerPropertyListener(this);
      _properties = null;
    }

    // freeing memory
    if (_position2LineMap != null) {
      _position2LineMap.clear();
      _position2LineMap = null;
    }
   
    // adjust members: reset counters, drop handler and chain references
    _eofReached         = true;
    _flags              = 0;
    _flagMask           = 0;
    _internalFlags      = 0;
    _currentReadPos     = 0;
    _currentWritePos    = 0;
    _lineNumber         = -1;
    _columnNumber       = -1;
    _nextTokenizer      = null;
    _prevTokenizer      = null;
    _whitespaceHandler  = null;
    _separatorHandler   = null;
    _keywordHandler     = null;
    _sequenceHandler    = null;
    _patternHandler     = null;
    _source             = null;
    Arrays.fill(_scannedToken, null);
  }

 
  //---------------------------------------------------------------------------
  // embedded tokenizer support
  //
 
  /**
   * Adding an embedded tokenizer. Embedded tokenizer work on the same input
   * buffer as their base tokenizer. A situation where embedded tokenizer could
   * be applied, is a HTML stream with cascading style sheet (CSS) and JavaScript
   * parts.
   *<br>
   * There are no internal means of switching from one tokenizer to another.
   * This should be done by the caller using the method {@link #switchTo}.
   *<br>
   * The {@link TokenizerProperties#F_KEEP_DATA} and {@link TokenizerProperties#F_COUNT_LINES}
   * flags of the base tokenizer take effect also in the embedded tokenizers.
   *<br>
   * Since it might be possible that the given <code>tokenizer</code> is a
   * derivation of the <code>AbstractTokenizer</code> class, this method is
   * synchronized on <code>tokenizer</code>.
   *
   * @param  tokenizer   an embedded tokenizer
   * @throws TokenizerException if something goes wrong (not likely :-)
   */
  public void addTokenizer(AbstractTokenizer tokenizer) throws TokenizerException {
    AbstractTokenizer curr = this;
   
    // walk to the end of the doubly-linked tokenizer chain
    while (curr._nextTokenizer != null) {
      curr = curr._nextTokenizer;
    }
   
    if (tokenizer != null) {
      synchronized(tokenizer) {
        // append the new tokenizer to the chain
        curr._nextTokenizer      = tokenizer;
        tokenizer._prevTokenizer = curr;

        // share the input buffer of the base tokenizer
        AbstractTokenizer baseTokenizer = getBaseTokenizer();
       
        tokenizer._baseTokenizer = baseTokenizer;
       
        // inherited flags: only F_COUNT_LINES is propagated here
        tokenizer.changeParseFlags(baseTokenizer.getParseFlags(), Flags.F_COUNT_LINES);
      }
    }
  }

  /**
   * Changing fron one tokenizer to another. If the given tokenizer has not been
   * added with {@link #addTokenizer}, an exception is thrown.<br>
   * The <code>switchTo</code> method does the nessecary synchronisation between
   * <code>this</code> and the given tokenizer. The user is therefore responsible
   * to use <code>switchTo</code> whenever a tokenizer change is nessecary. It
   * must be done this way:
   *<blockquote><pre>
   *   Tokenizer base     = new MyTokenizer(...)
   *   Tokenizer embedded = new MyTokenizer(...)
   *
   *   // setting properties (comments, keywords etc.)
   *   ...
   *
   *   // embedding a tokenizer
   *   base.addTokenizer(embedded);
   *  
   *   // tokenizing with base
   *   ...
   *   if (<i>switch_condition</i>) {
   *     base.switchTo(embedded);
   *   }
   *
   *   // tokenizing with embedded
   *   ...
   *   if (<i>switch_condition</i>) {
   *     embedded.switchTo(base);
   *   }
   *</pre></blockquote>
   * That way we avoid a more complex synchronisation between tokenizers whenever
   * one of them parses the next data in the input stream. However, the danger
   * of not synchronized tokenizers remains, so take care.
   *<br>
   * Since is might be possible that the given <code>tokenizer</code> is a
   * derivation of the <code>AbstractTokenizer</code> class, this method is
   * synchronized on <code>tokenizer</code>.
   *
   * @param tokenizer   the tokenizer that should be used from now on
   */
  public void switchTo(AbstractTokenizer tokenizer) throws TokenizerException {
    if (tokenizer != null) {
      synchronized(tokenizer) {
        if (tokenizer._baseTokenizer != _baseTokenizer) {
          throw new TokenizerException("Trying to switch to an alien tokenizer (not added with addTokenizer).", null);
        }
        tokenizer._eofReached       = this._eofReached;
        tokenizer._currentReadPos   = this._currentReadPos;
        tokenizer._currentWritePos  = this._currentWritePos;
        tokenizer._columnNumber     = this._columnNumber;
        tokenizer._lineNumber       = this._lineNumber;
        tokenizer._position2LineMap = this._position2LineMap;
      }
    } else {
      throw new TokenizerException(new NullPointerException());
    }
  }


  //---------------------------------------------------------------------------
  // Methods that may be overwritten in derived classes
  //
 
  /**
   * This method checks if the character is a whitespace. Implement Your own
   * code for situations where this default implementation is not fast enough
   * or otherwise not really good.
   *
   * @param testChar  check this character
   * @return <code>true</code> if the given character is a whitespace,
   *         <code>false</code> otherwise
   */
  protected boolean isWhitespace(char testChar) {
    if (_whitespaceHandler != null) {
      return _whitespaceHandler.isWhitespace(testChar);
    } else {
      return false;
    }
  }
     
  /**
   * This method detects the number of whitespace characters starting at the given
   * position. It should return the number of characters identified as whitespaces
   * starting from and including the given start position.
   *<br>
   * Then overriding this method, use {@link #getBaseDataProvider} to access characters.
   *<br>
   * Do not attempt to actually read more data or do anything that leads to the
   * change of the data source or to tokenizer switching. This is done by the
   * tokenizer framework.
   *
   * @param   startingAtPos  start checking for whitespace from this position
   * @param   maxChars  if there is no non-whitespace character, read up to this number of characters
   * @return  number of whitespace characters starting from the given offset
   * @throws  TokenizerException failure while reading data from the input stream
   */
  protected int readWhitespaces(int startingAtPos, int maxChars) throws TokenizerException {
    if (_whitespaceHandler != null) {
      DataProvider dataProvider = getBaseDataProvider(startingAtPos, maxChars);
      return _whitespaceHandler.countLeadingWhitespaces(dataProvider);
    } else {
      return 0;
    }
  }
 
  /**
   * This method checks if the character sequence starting at a given position
   * with a given lenghth is a keyword. If so, it returns the keyword description
   * as {@link TokenizerProperty} object.
   *
   * @param   startingAtPos   check at this position
   * @param   length          the candidate has this number of characters
   * @throws  TokenizerException routed exception from the active {@link de.susebox.jtopas.spi.KeywordHandler}
   * @return  {@link TokenizerProperty} describing the keyword or <code>null</code>
   */
  protected TokenizerProperty isKeyword(int startingAtPos, int length) throws TokenizerException {
    if (_keywordHandler != null) {
      DataProvider dataProvider = getBaseDataProvider(startingAtPos, length);
      return _keywordHandler.isKeyword(dataProvider);
    } else {
      return null;
    }
  }
 
 
  //---------------------------------------------------------------------------
  // TokenizerPropertyListener methods
  //
 
  /**
   * Splits a given String into lines. The method ist used to retrieve the
   * image parts of several token types.
   *
   * @param   image   split this string into lines
   * @return  an array containing the lines of the image without line separator
   *          characters
   */
  protected String[] splitIntoLines(String image) {
    LinkedList  lines = new LinkedList();
    int         index = 0;
    int         start = 0;
   
    while (index < image.length()) {
      switch (image.charAt(index)) {
      case '\r':
        lines.add(image.substring(start, index));
        if (index + 1 < image.length() && image.charAt(index + 1) == '\n') {
          index += 2;
        } else {
          index++;
        }
        start = index;
        break;
      case '\n':
        lines.add(image.substring(start, index));
        start = ++index;
        break;
      default:
        index++;
      }
    }
   
    if (start < index || start > 0) {
      lines.add(image.substring(start, index));
    }
 
    return (String[])lines.toArray(new String[lines.size()]);
  }
 
  /**
   * Splits a given string into lines and removing string escapes. The method is
   * used to retrieve the image parts for string token types.
   *
   * @param   prop    the {@link TokenizerProperty} describing a string
   * @param   image   split this string into lines
   * @return  an array containing the lines of the image without line separator
   *          characters
   */
  protected String[] splitString(TokenizerProperty prop, String image) {
    // complete string
    String[]      images        = prop.getImages();
    String        begin         = images[0];
    String        end           = images[1];
    String        esc           = images[2];
    boolean       noCase        = isFlagSet(prop, Flags.F_NO_CASE);
    boolean       escEqualsEnd  =    ( ! noCase && esc.compareTo(end)           == 0)
                                  || (   noCase && esc.compareToIgnoreCase(end) == 0);

    StringBuffer  buffer        = null;
    int           index         = begin.length();
    int           start         = index;
    int           endIndex;
   
    if (   image.length() - start >= end.length()
        && (   ( ! noCase && end.equals(image.substring(image.length() - end.length())))
            || (   noCase && end.equalsIgnoreCase(image.substring(image.length() - end.length()))))) {
      endIndex = image.length() - end.length();   
    } else {
      endIndex = image.length();
    }
   
    while (index < endIndex) {
      if (   ( ! noCase && image.startsWith(esc, index))
          || (   noCase && image.substring(index, index + esc.length()).equalsIgnoreCase(esc))) {
        if (buffer == null) {
          buffer = new StringBuffer(image.length());
        }
        buffer.append(image.substring(start, index));
        index += esc.length();
        if (index < image.length()) {
          if (   ( ! noCase && image.startsWith(esc, index))
              || (   noCase && image.substring(index, index + esc.length()).equalsIgnoreCase(esc))) {
            buffer.append(esc);
            index += esc.length();
          } else if (   ( ! noCase && image.startsWith(begin, index))
                     || (   noCase && image.substring(index, index + begin.length()).equalsIgnoreCase(begin))) {
            buffer.append(begin);
            index += begin.length();
          } else if (   ( ! noCase && image.startsWith(end, index))
                     || (   noCase && image.substring(index, index + end.length()).equalsIgnoreCase(end))) {
            buffer.append(end);
            index += end.length();
          }
        }
        start = index;
      }
      index++;
    }
   
    if (buffer != null && start < index) {
      buffer.append(image.substring(start, endIndex));
    }
 
    return splitIntoLines((buffer != null) ? buffer.toString() : image.substring(start, endIndex));
  }
 
  /**
   * Splits a given block comment into lines. The method is used to retrieve the
   * image parts for block comment token types.
   *
   * @param   prop    the {@link TokenizerProperty} describing a block comment
   * @param   image   split this string into lines
   * @return  an array containing the lines of the image without line separator
   *          characters
   */
  protected String[] splitBlockComment(TokenizerProperty prop, String image) {
    // complete string
    String[]  images  = prop.getImages();
    String    start   = images[0];
    String    end     = images[1];
    boolean   noCase  = isFlagSet(prop, Flags.F_NO_CASE);

    if (   image.length() - start.length() >= end.length()
        && (   ( ! noCase && end.equals(image.substring(image.length() - end.length())))
            || (   noCase && end.equalsIgnoreCase(image.substring(image.length() - end.length()))))) {
      return splitIntoLines(image.substring(start.length(), image.length() - end.length()));
    } else {
      return splitIntoLines(image.substring(start.length()));
    }
  }
 
  /**
   * Event handler method. The given {@link TokenizerPropertyEvent} parameter
   * contains the nessecary information about the property change. We choose
   * one single method in favour of various more specialized methods since the
   * reactions on adding, removing and modifying tokenizer properties are often
   * the same (flushing cash, rereading information etc.) are probably not very
   * different.
   *<br>
   * Note that a modification of the parse flags in the backing {@link TokenizerProperties}
   * object removes all flags previously modified through {@link #changeParseFlags}.
   *
   * @param event the {@link TokenizerPropertyEvent} that describes the change
   */
  public void propertyChanged(TokenizerPropertyEvent event) {
    TokenizerProperty prop   = event.getProperty();
    String[]          images = prop.getImages();
   
    synchronized(this) {
      switch (event.getType()) {
      case TokenizerPropertyEvent.PROPERTY_ADDED:
      case TokenizerPropertyEvent.PROPERTY_REMOVED:
        // a handler is (re-)installed from _properties only when the user has
        // not set an external handler (see the IFLAG_EXTERNAL_* flags)
        switch (prop.getType()) {
        case Token.LINE_COMMENT:
        case Token.BLOCK_COMMENT:
        case Token.STRING:
        case Token.SPECIAL_SEQUENCE:
          if (    (_internalFlags & IFLAG_EXTERNAL_SEQUENCE_HANDLER) == 0
              &&  _properties instanceof de.susebox.jtopas.spi.SequenceHandler) {
            setSequenceHandler((de.susebox.jtopas.spi.SequenceHandler)_properties);
          }
          break;
        case Token.KEYWORD:
          if (    (_internalFlags & IFLAG_EXTERNAL_KEYWORD_HANDLER) == 0
              &&  _properties instanceof de.susebox.jtopas.spi.KeywordHandler) {
            setKeywordHandler((de.susebox.jtopas.spi.KeywordHandler)_properties);
          }
          break;
        case Token.PATTERN:
          if (    (_internalFlags & IFLAG_EXTERNAL_PATTERN_HANDLER) == 0
              &&  _properties instanceof de.susebox.jtopas.spi.PatternHandler) {
            setPatternHandler((de.susebox.jtopas.spi.PatternHandler)_properties);
          }
          break;
        }
        break;
       
      case TokenizerPropertyEvent.PROPERTY_MODIFIED:
        switch (prop.getType()) {
        case TokenizerProperty.PARSE_FLAG_MASK:
          // global flag change: discard local overrides (_flagMask = 0) and
          // (re-)initialize or disable line/column counting accordingly
          _flags    = getTokenizerProperties().getParseFlags();
          _flagMask = 0;
          if (isFlagSet(Flags.F_COUNT_LINES)) {
            if (_lineNumber < 0) {
              if (_position2LineMap != null) {
                _position2LineMap.clear();
              }
              _lineNumber = 0;
              putPosition(_currentReadPos, _lineNumber);
            }
            if (_columnNumber < 0) {
              _columnNumber = 0;
            }
          } else {
            // -1 marks "counting not active"
            _lineNumber   = -1;
            _columnNumber = -1;
          }
          break;
        }
        break;
      }
    }
  }
 

  //---------------------------------------------------------------------------
  // Implementation
  //
 
  /**
   * Embedded tokenizers have their base tokenizer they share the input stream
   * with. For a non-embedded tokenizer this returns the tokenizer itself
   * (the value of the <code>_baseTokenizer</code> member).
   *
   * @return the base tokenizer (the one owning the input stream and text buffer)
   */
  protected AbstractTokenizer getBaseTokenizer() {
    return _baseTokenizer;
  }
 
  /**
   * Returns the {@link de.susebox.jtopas.spi.DataProvider} of the base tokenizer.
   * This is this tokenizer if it is not an embedded one.
   *
   * @param   startPos    position in the input data
   * @param   length      number of characters
   * @return  the <code>DataProvider</code> for the given data range
   */
  protected DataProvider getBaseDataProvider(int startPos, int length) {
    return getBaseTokenizer().getDataProvider(startPos, length);
  }

  /**
   * This method organizes the input buffer. It moves the current text window if
   * nessecary or allocates more space, if data should be kept completely (see the
   * {@link TokenizerProperties#F_KEEP_DATA} flag).
   * Its main purpose is to call the {@link TokenizerSource#read} method.
   * Reading is always delegated to the base tokenizer, which owns the buffer;
   * afterwards all embedded tokenizers are re-synchronized.
   *
   * @return  number of read bytes or -1 if an end-of-file condition occured
   * @throws  TokenizerException wrapped exceptions from the {@link TokenizerSource#read}
   *          method
   */
  protected int readMoreDataFromBase() throws TokenizerException  {
    // its always the base tokenizer doing the reading
    int readChars = -1;
   
    if ( ! _eofReached) {
      AbstractTokenizer baseTokenizer = getBaseTokenizer();

      if (baseTokenizer != this) {
        readChars = baseTokenizer.readMoreData();
      } else {
        readChars = readMoreData();
      }
      // advance the write position only for real data; map EOF (< 0) to -1
      // and remember it so no further read attempts are made
      if (readChars > 0) {
        _currentWritePos += readChars;
      } else if (readChars < 0) {
        readChars   = -1;
        _eofReached = true;
      }

      // Inform all embedded tokenizers about input buffer changes
      synchronizeAll();
    }
    return readChars;
  }
 
  /**
   * When the method {@link #readMoreData} changes the contents of the input buffer
   * or the input buffer itself, all embedded tokenizers must be synchronized.
   * That means their member variables are adjusted to the base tokenizer.
   *
   * @throws TokenizerException if something goes wrong
   */
  protected void synchronizeAll() throws TokenizerException {
    AbstractTokenizer embedded = getBaseTokenizer();

    while ((embedded = embedded._nextTokenizer) != null) {
      switchTo(embedded);   // adjust the member variables
    }
  }

  /**
   * Checks the EOF condition at the given offset.
   *
   * @param offset  check at this position relative to the current read position
   * @return <code>true</code> if EOF has been reached, <code>false</code> otherwise
   * @throws TokenizerException failure while reading data from the input stream
   */
  protected boolean isEOF(int offset) throws TokenizerException {
    if (_currentReadPos + offset < _currentWritePos || readMoreDataFromBase() > 0) {
      return false;
    } else {
      _scannedToken[1] = new Token(Token.EOF);
      return true;
    }
  }
   
  /**
   * The number of characters until the next comment, whitespace, string, special
   * sequence or separator are determined. The character sequnce is then checked
   * for keyword or pattern matching.
   *
   * @param token buffer to receive information about the keyword or normal token
   * @return <code>null</code> or a {@link TokenizerProperty} if a keyword or pattern is found
   * @throws TokenizerException failure while reading data from the input stream
   */
  protected TokenizerProperty completeBoundedToken(Token token) throws TokenizerException {
    // find out the return value (length of normal token)
    int len = 1// the first character is a normal one, see call of this method
   
    while ( ! (   isEOF(len)
               || isWhitespace(len)
               || isPattern(len, true)
               || isSpecialSequence(len)
               || isSeparator(len))) {
      len++;
    }
    token.setLength(len);
   
    // test on keyword or non-free pattern
    TokenizerProperty     prop = null;
    PatternHandler.Result result;
   
    if ((prop = isKeyword(_currentReadPos, len)) != null) {
      token.setType(Token.KEYWORD);
      token.setCompanion(prop.getCompanion());
    } else {
      token.setType(Token.NORMAL);
    }
    return prop;
  }
 
  /**
   * After having identified a whitespace, this method continues to read data
   * until it detects a non-whitespace.
   *
   * @return number of consecutive whitespaces
   * @throws TokenizerException failure while reading data from the input stream
   */
  protected int completeWhitespace() throws TokenizerException {
    int start     = _currentReadPos + 1// the first whitespace we have already
    int available = _currentWritePos - start;
    int len       = readWhitespaces(start, available);
   
    while (len == available) {
      if (readMoreDataFromBase() <= 0) {
        break;
      }
      start    += len;
      available = _currentWritePos - start;
      len      += readWhitespaces(start, available);
    }
    return len + 1;   // the first whitespace we had already
  }
 
  /**
   * This method checks at the given offset if it is a whitespace. As a side
   * effect, a {@link Token#WHITESPACE} token is stored on success.
   *
   * @param offset  check at this position relative to the current read position
   * @throws TokenizerException failure while reading data from the input stream
   * @return <code>true</code> if a whitespace sequence was found at the given offset,
   *         <code>false</code> otherwise
   */
  protected boolean isWhitespace(int offset) throws TokenizerException {
    if (_whitespaceHandler != null) {
      // NOTE(review): this uses readMoreDataFromBase() < 0 while isSeparator
      // and isEOF use > 0; if 0 is ever returned the subsequent getChar may
      // address a position beyond the write position — confirm whether a
      // zero-length read can actually occur
      if (_currentReadPos + offset >= _currentWritePos && readMoreDataFromBase() < 0) {
        return false;
      }
   
      if (isWhitespace(getChar(_currentReadPos + offset))) {
        _scannedToken[1] = new Token(Token.WHITESPACE);
        return true;
      }
    }
    return false;
  }
     
  /**
   * This method checks at the given offset if it contains a separator.
   *
   * @param offset  check at this position relative to the current read position
   * @throws TokenizerException failure while reading data from the input stream
   * @return <code>true</code> if a separator was found atthe given offset,
   *         <code>false</code> otherwise
   */
  protected boolean isSeparator(int offset) throws TokenizerException {
    if (    _separatorHandler != null
        &&  (_currentReadPos + offset < _currentWritePos ||  readMoreDataFromBase() > 0)
        &&  _separatorHandler.isSeparator(getChar(_currentReadPos + offset))) {
      _scannedToken[1] = new Token(Token.SEPARATOR);
      return true;
    } else {
      return false;
    }
  }
 
  /**
   * Testing for pattern matching. As a side effect, a {@link Token#PATTERN}
   * token is stored on success; for a non-free pattern the token terminating
   * it is preserved in <code>_scannedToken[2]</code>.
   *
   * @param   offset            check at this position relative to the current read position
   * @param   freePatternOnly   if <code>true</code> consider only pattern that can occur anywhere in the data
   * @throws  TokenizerException failure while reading data from the input stream
   * @return  <code>true</code> if a pattern match was found at the given offset,
   *          <code>false</code> otherwise
   */
  protected boolean isPattern(int offset, boolean freePatternOnly) throws TokenizerException {
    if (_patternHandler != null) {
      // for pattern, we might need a lot of data
      int startingAtPos = _currentReadPos + offset;
     
      // fill the buffer until PATTERN_MAX_SIZE characters are available or EOF
      while (_currentWritePos - startingAtPos < PATTERN_MAX_SIZE) {
        if (readMoreDataFromBase() <= 0) {
          break;
        }
      }

      // try pattern matching
      DataProvider          dataProvider  = getBaseDataProvider(startingAtPos, _currentWritePos - startingAtPos);
      PatternHandler.Result result        = _patternHandler.matches(dataProvider);
      boolean               isFree        = (result != null) ? isFlagSet(result.getProperty(), Flags.F_FREE_PATTERN) : false;
     
      if (result != null && (isFree || ! freePatternOnly)) {
        if ( ! isFree) {
          // a non-free pattern only counts if it is terminated by EOF,
          // whitespace, another pattern, a special sequence or a separator;
          // these checks recurse and overwrite _scannedToken[1], so the
          // terminating token is saved in _scannedToken[2]
          int nextOffset = offset + result.getLengthOfMatch();
         
          if (   isEOF(nextOffset)
              || isWhitespace(nextOffset)
              || isPattern(nextOffset, true)
              || isSpecialSequence(nextOffset)
              || isSeparator(nextOffset)) {
            _scannedToken[2] = _scannedToken[1];
          } else {
            return false;
          }
        }
        _scannedToken[1] = new Token(Token.PATTERN, null, result.getProperty());
        _scannedToken[1].setLength(result.getLengthOfMatch());
        // capture group images only when the user asked for image parts
        if (isFlagSet(result.getProperty(), Flags.F_RETURN_IMAGE_PARTS)) {
          _scannedToken[1].setImageParts(result.getGroups());
        }
        return true;
      }
    }
   
    // no pattern matching available or no match found
    return false;
  }
 
  /**
   * This method checks at the given offset if it contains a a special sequence.
   * Unlike the method {@link #test4SpecialSequence} it does nothing more.
   *
   * @param offset  check at this position relative to the current read position
   * @throws TokenizerException failure while reading data from the input stream
   * @return <code>true</code> if a special sequence was found at the given offset,
   *         <code>false</code> otherwise
   */
  protected boolean isSpecialSequence(int offset) throws TokenizerException {
    if (_sequenceHandler != null) {
      // do we need more data to ensure enough characters for even the longest
      // possible sequence match
      int startingAtPos = _currentReadPos + offset;
   
      while (_sequenceHandler.getSequenceMaxLength() > _currentWritePos - startingAtPos) {
        if (readMoreDataFromBase() <= 0) {
          break;
        }
      }
     
      // invoke the sequence handler
      DataProvider      dataProvider = getBaseDataProvider(startingAtPos, _currentWritePos - startingAtPos);
      TokenizerProperty prop         = _sequenceHandler.startsWithSequenceCommentOrString(dataProvider);
   
      if (prop != null) {
        _scannedToken[1] = new Token(prop.getType(), null, prop);
        return true;
      }
    }
   
    // no sequence handler given or no special sequence at given offset
    return false;
  }
 
  /**
   * Completing a line comment. After a line comment sequence has been found, all
   * characters up to and including the end-of-line combination belong to the
   * line comment. Note that on reaching end-of-file a line comment does not
   * nessecarily ends with an end-of-line sequence (linefeed for example).
   *
   * @param   prop    the property describing the line comment to complete
   * @return  length of the line comment
   * @throws  TokenizerException failure while reading data from the input stream
   */
  protected int completeLineComment(TokenizerProperty prop) throws TokenizerException {
    String[]  images = prop.getImages();
    int       len    = images[0].length();

    while (_currentReadPos + len < _currentWritePos || readMoreDataFromBase() > 0) {
      switch (getChar(_currentReadPos + len)) {
      case '\r':
        len++;
        if (_currentReadPos + len < _currentWritePos || readMoreDataFromBase() > 0) {
          if (getChar(_currentReadPos + len) == '\n') {
            len++;
          }
        }
        return len;
      case '\n':
        len++;     
        return len;
      default:
        len++;
      }
    }
    return len;
  }
 
  /**
   * Completing a block comment. After a block comment sequence has been found, all
   * characters up to and including the end sequence of the block comment belong
   * to the block comment. Note that on reaching end-of-file a block comment does
   * not nessecarily ends with an end-of-block-comment sequence.
   *
   * @param   prop    the property describing the block comment to complete
   * @return  length of the block comment
   * @throws  TokenizerException failure while reading data from the input stream
   */
  protected int completeBlockComment(TokenizerProperty prop) throws TokenizerException {
    String[]  images = prop.getImages();
    String    start  = images[0];
    String    end    = images[1];
    boolean   noCase = isFlagSet(prop, Flags.F_NO_CASE);
    boolean   nested = isFlagSet(prop, Flags.F_ALLOW_NESTED_COMMENTS);
    int       len    = start.length();   // the opening sequence is already consumed
    int       level  = 0;                // nesting depth; goes below 0 on the final end sequence

    __LOOP__:
    do {
      // test on nested comments: we take only care for nesting the same
      // block comment
      if (nested) {
        switch (comparePrefix(len, start, noCase)) {
        case 0:     // comment start identified
          level++;
          len += start.length();
          continue __LOOP__;
        case -1:    // EOF reached
          return _currentWritePos - _currentReadPos;
        }
      }
     
      // is it the end ?
      switch (comparePrefix(len, end, noCase)) {
      case 0:       // comment end identified
        level--;
        len += end.length();
        break;
      case -1:      // EOF reached
        return _currentWritePos - _currentReadPos;
      default:
        len++;      // ordinary comment character
      }
    } while (level >= 0);

    // block comment regularly terminated
    return len;
  }
 
  /**
   * Completing a string. After a string start sequence has been found, all
   * characters up to and including the end-of-string sequence belong to the
   * string. Note that on reaching end-of-file a string does not nessecarily ends
   * with an end-of-string sequence.
   *
   * @param   prop    the property describing the string to complete
   * @return  length of the string
   * @throws  TokenizerException failure while reading data from the input stream
   */
  protected int completeString(TokenizerProperty prop) throws TokenizerException {
    // complete string
    String[]  images        = prop.getImages();
    String    start         = images[0];
    String    end           = images[1];
    String    esc           = images[2];
    int       len           = start.length();   // the opening sequence is already consumed
    // NOTE(review): esc.compareTo would throw a NullPointerException here if
    // images[2] is null, although the loop below guards with esc != null —
    // confirm whether a null escape can actually reach this method
    boolean   noCase        = isFlagSet(prop, Flags.F_NO_CASE);
    boolean   escEqualsEnd  =    ( ! noCase && esc.compareTo(end)           == 0)
                              || (   noCase && esc.compareToIgnoreCase(end) == 0);

    while (true) {
      // test on escape
      if (esc != null) {
        switch (comparePrefix(len, esc, noCase)) {
        case 0:       // escape found
          len += esc.length();
          if (escEqualsEnd) {
            // escape and end are the same sequence: a doubled sequence is an
            // escaped end, a single one terminates the string
            switch (comparePrefix(len, end, noCase)) {
            case 0:
              len += end.length();
              break;
            case -1:      // EOF reached
              return _currentWritePos - _currentReadPos;
            default:      // this is the regular return point if the esc is the string end
              return len;  
            }
          } else {
            len++;        // esc != string end: skip the next character
          }
          continue;
        case -1:          // EOF reached
          return _currentWritePos - _currentReadPos;
        }
      }

      // test on end sequence
      switch (comparePrefix(len, end, noCase)) {
      case 0:             // this is the regular return point if esc != string end
        len += end.length();   
        return len;  
      case -1:            // EOF reached   
        return _currentWritePos - _currentReadPos;
      default:
        len++;            // ordinary string character
      }
    }
  }

  /**
   * This method compares the characters at the given offset (from the current
   * read position) with the given prefix. More data is read on demand; the
   * comparison is aborted with -1 when EOF prevents reading enough characters.
   *
   * @param   offset  start comparing at this offset from the current read position
   * @param   prefix  compare read data with this prefix
   * @param   noCase  case- or not case-sensitive comparison
   * @throws  TokenizerException failure while reading data from the input stream
   * @return  0 if the the given prefix matches the input stream, -1 on EOF and
   *          1 if not matching
   */
  protected int comparePrefix(int offset, String prefix, boolean noCase)
    throws TokenizerException
  {
    // compare
    int len = prefix.length();
   
    for (int pos = offset; pos < offset + len; ++pos) {
      // do we have enough data
      if (_currentReadPos + pos >= _currentWritePos && readMoreDataFromBase() < 0) {
        return -1;
      }
     
      // compare single character
      char c1 = prefix.charAt(pos - offset);
      char c2 = getChar(_currentReadPos + pos);
     
      // case-insensitive comparison maps both characters to upper case
      if (   c1 != c2
          && (! noCase || Character.toUpperCase(c1) != Character.toUpperCase(c2))) {
        return 1;
      }
    }
   
    // found
    return 0;
  }
 
  /**
   * The method recomputes the line and column position of the tokenizer, if the
   * flag {@link TokenizerProperties#F_COUNT_LINES} is set. It gets the token type of the
   * {@link Token} that has been retrieved by the calling {@link #nextToken}.
   * Using the tokenizer control flags and certain other information it tries to
   * to find end-of-line sequences as fast as possible. For example, a line
   * comment should always contain a end-of-line sequence, so we can simply
   * increase the line count and set the column count to 0.
   *
   * @param type    the type of the current token
   * @param length  the length of the current token
   */
  protected void adjustLineAndColumn(int type, int length) {
    // line and column counting not required
    if ( ! isFlagSet(Flags.F_COUNT_LINES)) {
      return;
    }
   
    // there might be a simple way to determine the current line and column position
    switch (type) {
    case Token.EOF:
      return;
       
    case Token.LINE_COMMENT:        // a line comment always ends with a newline
      _lineNumber++;
      _columnNumber = 0;
      putPosition(_currentReadPos + length, _lineNumber);
      return;
     
    case Token.SPECIAL_SEQUENCE:
    case Token.SEPARATOR:
    case Token.NORMAL:
    case Token.KEYWORD:
      if (_whitespaceHandler != null && _whitespaceHandler.newlineIsWhitespace()) { // newline is a whitespace character
        _columnNumber += length;                      // it should therefore not occure in other
        return;                                       // tokens
      }
      break;
       
    case Token.WHITESPACE:
      // NOTE(review): assumes a WHITESPACE token implies a non-null
      // _whitespaceHandler — confirm; other cases guard against null
      if ( ! (_whitespaceHandler.isWhitespace('\n') || _whitespaceHandler.isWhitespace('\r'))) { 
        _columnNumber += length;                      // newline is not a whitespace; we do not have
        return;                                       // to test for it in the current token
      }
      break;
    }
   
    // count it: scan the token image for CR, LF and CRLF line ends
    int newLineNumber = _lineNumber;
   
    for (int pos = _currentReadPos; pos < _currentReadPos + length; ++pos) {
      switch (getChar(pos)) {
      case '\r':
        // lone CR counts as a line end; a CRLF pair is handled by falling
        // through to the '\n' case after skipping the CR
        if (pos + 1 >= _currentReadPos + length || getChar(pos + 1) != '\n') {
          _lineNumber++;
          _columnNumber = 0;
          putPosition(pos + 1, _lineNumber);
          break;
        }
        pos++;
        /* no break; */
      case '\n':
        _lineNumber++;
        _columnNumber = 0;
        putPosition(pos + 1, _lineNumber);
        break;
       
      default:
        _columnNumber++;
      }
    }
  }
 
  /**
   * Putting a new position into the position-to-line-number map.
   *
   * @param position  the position to map to the current line number
   */
  private void putPosition(int position, int lineNumber) {
    if (_position2LineMap == null) {
      _position2LineMap = new TreeMap();
    }
    _position2LineMap.put(new Integer(position), new Integer(lineNumber));
  }
 
  /**
   * Checking a given flag. The method considers both the globally set flags
   * in the associated {@link TokenizerProperties} instance and the locally set
   * by {@link #changeParseFlags}.
   *
   * @param flag one of the <code>F_...</code> flags defined in {@link TokenizerProperties}
   */
  protected boolean isFlagSet(int flag) {
    return (getParseFlags() & flag) != 0;
  }
 
  /**
   * Checking if a given flag is set for the given {@link TokenizerProperty}, for
   * this <code>Tokenizer</code> or for the used {@link TokenizerProperties}. The method considers both the globally set flags
   * in the associated {@link TokenizerProperties} instance and the locally set
   * by {@link #changeParseFlags}.
   *
   * @param prop  check the flag for this property
   * @param flag  one of the {@link Flags} constants
   */
  protected boolean isFlagSet(TokenizerProperty prop, int flag) {
    return prop.isFlagSet(flag, (getTokenizerProperties().getParseFlags() & flag) != 0 || isFlagSet(flag));
  }
 
 
  //---------------------------------------------------------------------------
  // Class members
  //
 
  /**
   * Mask of flags that can be set separately for an <code>AbstractTokenizer</code>.
   * All other flag bits are always taken from the {@link TokenizerProperties}.
   */
  protected static final int VALID_FLAGS_MASK =
      Flags.F_RETURN_WHITESPACES
    | Flags.F_TOKEN_POS_ONLY
    | Flags.F_KEEP_DATA
    | Flags.F_COUNT_LINES;
 
  /**
   * {@link TokenizerProperties} that are used if no others have been
   * specified by calling {@link #setTokenizerProperties}.
   */
  protected StandardTokenizerProperties _defaultProperties = null;
 
  /**
   * Buffer sizes
   */
  private static final int PATTERN_MAX_SIZE = 0x40000;   // 256K
 
  /**
   * Bits for the internal flag bitmask (see {@link #_internalFlags}); they
   * record whether the respective handler was supplied externally.
   */
  private static final byte IFLAG_EXTERNAL_PATTERN_HANDLER  = 0x01;
  private static final byte IFLAG_EXTERNAL_KEYWORD_HANDLER  = 0x02;
  private static final byte IFLAG_EXTERNAL_SEQUENCE_HANDLER = 0x04;

 
  //---------------------------------------------------------------------------
  // Members
  //
 
  /**
   * overall tokenizer flags.
   */
  protected int _flags = 0;
 
  /**
   * a combination of <code>F_...</code> constants defined in {@link TokenizerProperties}
   * indicating which bits in the {@link #_flags} member are valid. All other
   * flags are taken from the associated {@link TokenizerProperties} object.
   *
   * @see #changeParseFlags
   */
  private int _flagMask = 0;
 
  /**
   * Flag if EOF has been reached. The flag should speed up calls to {@link #readMoreDataFromBase}
   */
  private boolean _eofReached = true;
 
  /**
   * Data index where {@link #nextToken} will start parsing.
   */
  protected int _currentReadPos = 0;

  /**
   * Data index where {@link #readMoreDataFromBase} will fill in new data.
   */
  protected int _currentWritePos = 0;
 
  /**
   * if line counting is enabled, this contains the current line number starting
   * with 0.
   */
  protected int _lineNumber = -1;

  /**
   * if line counting is enabled, this contains the current column number starting
   * with 0.
   */
  protected int _columnNumber = -1;
 
  /**
   * List of currently known token. The first element is the current token returned
   * by the last call to {@link #nextToken}. The following elements are look-ahead
   * token that have already been identified when extracting the current token.
   */
  protected Token[] _scannedToken = new Token[] { null, null, null };
 
  /**
   * For embedded tokenizers: this is the list of the succeeding tokenizers
   */
  protected AbstractTokenizer _nextTokenizer = null;

  /**
   * For embedded tokenizers: this is the base tokenizer that reads the data
   */
  protected AbstractTokenizer _baseTokenizer = null;

  /**
   * For embedded tokenizers: this is the list of the previous tokenizers
   */
  protected AbstractTokenizer _prevTokenizer = null;
 
  /**
   * Whitespace handler
   */
  private de.susebox.jtopas.spi.WhitespaceHandler _whitespaceHandler = null;

  /**
   * Separator handler
   */
  private de.susebox.jtopas.spi.SeparatorHandler _separatorHandler = null;

  /**
   * Keyword handler
   */
  private de.susebox.jtopas.spi.KeywordHandler _keywordHandler = null;

  /**
   * Sequence handler
   */
  private de.susebox.jtopas.spi.SequenceHandler _sequenceHandler = null;
 
  /**
   * Pattern handler
   */
  private de.susebox.jtopas.spi.PatternHandler _patternHandler = null;
 
  /**
   * The source of input data
   */
  private TokenizerSource _source = null;
 
  /**
   * The characteristics of this tokenizer.
   */
  private TokenizerProperties _properties = null;
 
  /**
   * Position-to-line-number mapping, lazily created by {@link #putPosition};
   * null until line counting first records a newline.
   */
  private TreeMap _position2LineMap = null;
 
  /**
   * Control flags for the internal work (IFLAG_... bits above)
   */
  private long _internalFlags = 0;
}
TOP

Related Classes of de.susebox.jtopas.AbstractTokenizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.