Source Code of org.eclipse.wst.xml.core.internal.parser.XMLSourceParser

/*******************************************************************************
 * Copyright (c) 2001, 2010 IBM Corporation and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     IBM Corporation - initial API and implementation
 *     Jens Lukowski/Innoopract - initial renaming/restructuring
 *     
 *******************************************************************************/
package org.eclipse.wst.xml.core.internal.parser;






import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;


import org.eclipse.jface.text.BadLocationException;
import org.eclipse.jface.text.IDocument;
import org.eclipse.wst.sse.core.internal.document.DocumentReader;
import org.eclipse.wst.sse.core.internal.ltk.parser.BlockMarker;
import org.eclipse.wst.sse.core.internal.ltk.parser.BlockTagParser;
import org.eclipse.wst.sse.core.internal.ltk.parser.BlockTokenizer;
import org.eclipse.wst.sse.core.internal.ltk.parser.RegionParser;
import org.eclipse.wst.sse.core.internal.ltk.parser.StructuredDocumentRegionHandler;
import org.eclipse.wst.sse.core.internal.ltk.parser.StructuredDocumentRegionParser;
import org.eclipse.wst.sse.core.internal.ltk.parser.StructuredDocumentRegionParserExtension;
import org.eclipse.wst.sse.core.internal.provisional.text.IStructuredDocumentRegion;
import org.eclipse.wst.sse.core.internal.provisional.text.ITextRegion;
import org.eclipse.wst.sse.core.internal.provisional.text.ITextRegionContainer;
import org.eclipse.wst.sse.core.internal.provisional.text.ITextRegionList;
import org.eclipse.wst.sse.core.internal.text.CharSequenceReader;
import org.eclipse.wst.sse.core.internal.text.IRegionComparible;
import org.eclipse.wst.sse.core.internal.util.Debug;
import org.eclipse.wst.xml.core.internal.Logger;
import org.eclipse.wst.xml.core.internal.regions.DOMRegionContext;




/**
 * Takes input from a BlockTokenizer (an XMLTokenizer by default) and creates
 * a linked list of IStructuredDocumentRegions from the regions it reports.
 */


public class XMLSourceParser implements RegionParser, BlockTagParser, StructuredDocumentRegionParser, IRegionComparible, StructuredDocumentRegionParserExtension {
  // made public to aid access from inner classes in hierarchy.
  // TODO: in future, figure out how to solve without exposing data.
  // Assigned by reset(Reader, int) when the reader is backed by a
  // CharSequence; cleared again by primReset().
  public CharSequence fCharSequenceSource = null;
  // Assigned by reset(Reader, int) when a DocumentReader's document is not
  // a CharSequence; cleared by primReset().
  private IDocument fDocumentInput;
  // Position passed to reset(Reader, int). It is added to caller-supplied
  // offsets in regionMatches*/getText before indexing the source.
  protected int fOffset = 0;
  // DMW: 2/12/03. Removed some state data, since not really needed,
  // and since it added a lot to overhead (since so many regions are
  // created.
  // protected IStructuredDocumentRegion fCurrentNode = null;
  // protected IStructuredDocumentRegion fNodes = null;
  // protected List fRegions = null;
  // protected Object fInput = null;
  // Raw source text; assigned by the reset(String...) overloads and
  // cleared by primReset().
  protected String fStringInput = null;
  // Handlers notified by fireNodeParsed() and resetHandlers(). May be
  // replaced wholesale (including with null) through
  // setStructuredDocumentRegionHandlers().
  protected List fStructuredDocumentRegionHandlers;


  // Created lazily by getTokenizer() when not supplied via setTokenizer().
  protected BlockTokenizer fTokenizer = null;
  // Timestamps (milliseconds) recorded by getDocumentRegions() only when
  // Debug.perfTest is enabled.
  protected long startTime;
  protected long stopTime;


  /**
   * HTMLSourceParser constructor comment.
   */
  public XMLSourceParser() {
    super();
    fStructuredDocumentRegionHandlers = new ArrayList();
  }


  /**
   * This is a simple utility to count nodes. Used only for debug
   * statements.
   */
  protected int _countNodes(IStructuredDocumentRegion nodes) {
    int result = 0;
    IStructuredDocumentRegion countNode = nodes;
    while (countNode != null) {
      result++;
      countNode = countNode.getNext();
    }
    return result;
  }


  public void addBlockMarker(BlockMarker marker) {
    getTokenizer().addBlockMarker(marker);
  }


  public synchronized void addStructuredDocumentRegionHandler(StructuredDocumentRegionHandler handler) {
    if (fStructuredDocumentRegionHandlers == null)
      fStructuredDocumentRegionHandlers = new ArrayList();
    synchronized (fStructuredDocumentRegionHandlers) {
      fStructuredDocumentRegionHandlers.add(handler);
    }
  }


  public void beginBlockScan(String newTagName) {
    getTokenizer().beginBlockTagScan(newTagName);
  }


  /**
   * @return IStructuredDocumentRegion
   */
  protected IStructuredDocumentRegion createStructuredDocumentRegion(String type) {
    IStructuredDocumentRegion newNode = null;
    if (type == DOMRegionContext.BLOCK_TEXT)
      newNode = XMLStructuredRegionFactory.createRegion(XMLStructuredRegionFactory.XML_BLOCK);
    else
      newNode = XMLStructuredRegionFactory.createRegion(XMLStructuredRegionFactory.XML);
    return newNode;
  }


  protected void fireNodeParsed(IStructuredDocumentRegion fCurrentNode) {
    /*
     * Never let an Exceptions from foreign code interfere with completion
     * of parsing. To get an exception here is definitely a program error
     * somewhere, but we can't afford to interrupt the flow of control. or
     * backwards typing can result!
     * 
     * Protect the user's data above everything.
     */
    Object[] handlers = null;
    synchronized (fStructuredDocumentRegionHandlers) {
      if (fStructuredDocumentRegionHandlers == null)
        return;


      handlers = fStructuredDocumentRegionHandlers.toArray();
    }
    if (fCurrentNode != null && handlers != null) {
      for (int i = 0; i < handlers.length; i++) {
        try {
          ((StructuredDocumentRegionHandler) handlers[i]).nodeParsed(fCurrentNode);
        }
        catch (Exception e) {
          Logger.log(Logger.ERROR, "Error occurred while firing Node Parsed event", e); //$NON-NLS-1$
        }
      }
    }
  }


  public BlockMarker getBlockMarker(String tagName) {
    List markers = getTokenizer().getBlockMarkers();
    for (int i = 0; i < markers.size(); i++) {
      BlockMarker marker = (BlockMarker) markers.get(i);
      if (marker.isCaseSensitive()) {
        if (marker.getTagName().equals(tagName))
          return marker;
      }
      else {
        if (marker.getTagName().equalsIgnoreCase(tagName))
          return marker;
      }
    }
    return null;
  }


  public List getBlockMarkers() {
    return getTokenizer().getBlockMarkers();
  }


  /**
   * @return IStructuredDocumentRegion
   */
  public IStructuredDocumentRegion getDocumentRegions() {
    IStructuredDocumentRegion headnode = null;
    if (headnode == null) {
      if (Debug.perfTest) {
        startTime = System.currentTimeMillis();
      }
      headnode = parseNodes();
      if (Debug.perfTest) {
        stopTime = System.currentTimeMillis();
        System.out.println(" -- creating nodes of IStructuredDocument -- "); //$NON-NLS-1$
        System.out.println(" Time parse and init all regions: " + (stopTime - startTime) + " (msecs)"); //$NON-NLS-2$//$NON-NLS-1$
        // System.out.println(" for " + fRegions.size() + "
        // Regions");//$NON-NLS-2$//$NON-NLS-1$
        System.out.println("      and " + _countNodes(headnode) + " Nodes"); //$NON-NLS-2$//$NON-NLS-1$
      }
    }
    return headnode;
  }


  protected ITextRegion getNextRegion() {
    ITextRegion region = null;
    try {
      region = getTokenizer().getNextToken();
      // DMW: 2/12/03 Removed state
      // if (region != null) {
      // fRegions.add(region);
      // }
      return region;
    }
    catch (StackOverflowError e) {
      Logger.logException(getClass().getName() + ": input could not be parsed correctly at position " + getTokenizer().getOffset(), e); //$NON-NLS-1$
      throw e;
    }
    catch (Exception e) {
      Logger.logException(getClass().getName() + ": input could not be parsed correctly at position " + getTokenizer().getOffset() + " (" + e.getLocalizedMessage() + ")", e); //$NON-NLS-3$//$NON-NLS-2$//$NON-NLS-1$
    }
    return null;
  }


  /**
   * Return the full list of known regions. Typically getNodes should be
   * used instead of this method.
   */
  public List getRegions() {
    IStructuredDocumentRegion headNode = null;
    if (!getTokenizer().isEOF()) {
      headNode = getDocumentRegions();
      // throw new IllegalStateException("parsing has not finished");
    }
    // for memory recovery, we assume if someone
    // requests all regions, we can reset our big
    // memory consuming objects
    // but the new "getRegions" method is then more expensive.
    // I don't think its used much, though.
    List localRegionsList = getRegions(headNode);
    primReset();
    return localRegionsList;
  }


  /**
   * Method getRegions.
   * 
   * @param headNode
   * @return List
   */
  protected List getRegions(IStructuredDocumentRegion headNode) {
    List allRegions = new ArrayList();
    IStructuredDocumentRegion currentNode = headNode;
    while (currentNode != null) {
      ITextRegionList nodeRegions = currentNode.getRegions();
      for (int i = 0; i < nodeRegions.size(); i++) {
        allRegions.add(nodeRegions.get(i));
      }
      currentNode = currentNode.getNext();
    }
    return allRegions;
  }


  /**
   * @deprecated - use the add/remove methods instead
   * @return java.util.List
   */
  public List getStructuredDocumentRegionHandlers() {
    if (fStructuredDocumentRegionHandlers == null) {
      fStructuredDocumentRegionHandlers = new ArrayList(0);
    }
    return fStructuredDocumentRegionHandlers;
  }


  /**
   * Returns text from the current input. Text is only valid before
   * getNodes() has been called and only when a raw String or DocumentReader
   * is given as the input.
   */
  public String getText(int offset, int length) {
    String text = null;
    if (fCharSequenceSource != null) {
      int start = fOffset + offset;
      int end = start + length;
      text = fCharSequenceSource.subSequence(start, end).toString();
    }
    else if (fDocumentInput != null) {
      try {
        text = fDocumentInput.get(offset, length);
      }
      catch (BadLocationException e) {
        text = ""; //$NON-NLS-1$
      }
    }
    else {
      if (fStringInput == null || fStringInput.length() == 0 || offset + length > fStringInput.length() || offset < 0) {
        text = ""; //$NON-NLS-1$
      }
      else {
        // offset is entirely valid during parsing as the parse
        // numbers haven't been adjusted.
        text = fStringInput.substring(offset, offset + length);
      }
    }
    return text;
  }


  protected BlockTokenizer getTokenizer() {
    if (fTokenizer == null) {
      fTokenizer = new XMLTokenizer();
    }
    return fTokenizer;
  }




  public RegionParser newInstance() {
    XMLSourceParser newInstance = new XMLSourceParser();
    newInstance.setTokenizer(getTokenizer().newInstance());
    return newInstance;
  }


  /**
   * Drives the tokenizer to completion and groups the regions it reports
   * into a chain of IStructuredDocumentRegions.
   * 
   * Regions are pulled from getNextRegion() until it returns null (which it
   * also does after logging an Exception from the tokenizer). Each region
   * either opens a new node, is appended to the current node, or closes the
   * current node, depending on its type. Nodes are linked with
   * setNext()/setPrevious(); fireNodeParsed() is called for a node when its
   * successor is opened, and for the final node after the loop. primReset()
   * is called before returning, so the input is no longer available
   * afterwards.
   * 
   * Region types are compared with == rather than equals(); presumably this
   * relies on regions carrying the shared DOMRegionContext constants - TODO
   * confirm against the tokenizer.
   * 
   * NOTE(review): the "just add", "close" and WHITE_SPACE branches
   * dereference currentNode without a null check, so such a region arriving
   * before any node has been opened would cause a NullPointerException.
   * 
   * @return the first node created, or null if the tokenizer produced no
   *         regions
   */
  protected IStructuredDocumentRegion parseNodes() {
    // regions are initially reported as complete offsets within the
    // scanned input
    // they are adjusted here to be indexes from the currentNode's start
    // offset
    IStructuredDocumentRegion headNode = null;
    IStructuredDocumentRegion lastNode = null;
    ITextRegion region = null;
    IStructuredDocumentRegion currentNode = null;
    String type = null;


    while ((region = getNextRegion()) != null) {
      type = region.getType();
      // these types (might) demand a IStructuredDocumentRegion for each
      // of them
      if (type == DOMRegionContext.BLOCK_TEXT) {
        if (currentNode != null && currentNode.getLastRegion().getType() == DOMRegionContext.BLOCK_TEXT) {
          // multiple block texts indicated embedded containers; no
          // new IStructuredDocumentRegion
          currentNode.addRegion(region);
          currentNode.setLength(region.getStart() + region.getLength() - currentNode.getStart());
          // make the region's start relative to its node
          region.adjustStart(-currentNode.getStart());
          // DW 4/16/2003 regions no longer have parents
          // region.setParent(currentNode);
        }
        else {
          // not continuing a IStructuredDocumentRegion
          if (currentNode != null) {
            // ensure that any existing node is at least
            // terminated
            if (!currentNode.isEnded()) {
              currentNode.setLength(region.getStart() - currentNode.getStart());
              // fCurrentNode.setTextLength(region.getStart() -
              // fCurrentNode.getStart());
            }
            lastNode = currentNode;
          }
          // currentNode may still be null here (first region);
          // fireNodeParsed does not notify handlers for a null node
          fireNodeParsed(currentNode);
          currentNode = createStructuredDocumentRegion(type);
          if (lastNode != null) {
            lastNode.setNext(currentNode);
          }
          currentNode.setPrevious(lastNode);
          currentNode.setStart(region.getStart());
          currentNode.setLength(region.getStart() + region.getLength() - currentNode.getStart());
          // a block text node is complete as soon as it is created
          currentNode.setEnded(true);
          region.adjustStart(-currentNode.getStart());
          currentNode.addRegion(region);
          // DW 4/16/2003 regions no longer have parents
          // region.setParent(currentNode);
        }
      }
      // the following contexts OPEN new StructuredDocumentRegions
      // (as does any region that follows an already ended node)
      else if ((currentNode != null && currentNode.isEnded()) || (type == DOMRegionContext.XML_CONTENT) || (type == DOMRegionContext.XML_CHAR_REFERENCE) || (type == DOMRegionContext.XML_ENTITY_REFERENCE) || (type == DOMRegionContext.XML_PI_OPEN) || (type == DOMRegionContext.XML_TAG_OPEN) || (type == DOMRegionContext.XML_END_TAG_OPEN) || (type == DOMRegionContext.XML_COMMENT_OPEN) || (type == DOMRegionContext.XML_CDATA_OPEN) || (type == DOMRegionContext.XML_DECLARATION_OPEN)) {
        if (currentNode != null) {
          // ensure that any existing node is at least terminated
          if (!currentNode.isEnded()) {
            currentNode.setLength(region.getStart() - currentNode.getStart());
            // fCurrentNode.setTextLength(region.getStart() -
            // fCurrentNode.getStart());
          }
          lastNode = currentNode;
        }
        fireNodeParsed(currentNode);
        currentNode = createStructuredDocumentRegion(type);
        if (lastNode != null) {
          lastNode.setNext(currentNode);
        }
        currentNode.setPrevious(lastNode);
        currentNode.setStart(region.getStart());
        currentNode.addRegion(region);
        currentNode.setLength(region.getStart() + region.getLength() - currentNode.getStart());
        region.adjustStart(-currentNode.getStart());
        // DW 4/16/2003 regions no longer have parents
        // region.setParent(currentNode);
      }
      // the following contexts neither open nor close
      // StructuredDocumentRegions; just add to them
      else if ((type == DOMRegionContext.XML_TAG_NAME) || (type == DOMRegionContext.XML_TAG_ATTRIBUTE_NAME) || (type == DOMRegionContext.XML_TAG_ATTRIBUTE_EQUALS) || (type == DOMRegionContext.XML_TAG_ATTRIBUTE_VALUE) || (type == DOMRegionContext.XML_COMMENT_TEXT) || (type == DOMRegionContext.XML_PI_CONTENT) || (type == DOMRegionContext.XML_DOCTYPE_INTERNAL_SUBSET)) {
        currentNode.addRegion(region);
        currentNode.setLength(region.getStart() + region.getLength() - currentNode.getStart());
        region.adjustStart(-currentNode.getStart());
        // DW 4/16/2003 regions no longer have parents
        // region.setParent(currentNode);
      }
      // the following contexts close off StructuredDocumentRegions
      // cleanly
      else if ((type == DOMRegionContext.XML_PI_CLOSE) || (type == DOMRegionContext.XML_TAG_CLOSE) || (type == DOMRegionContext.XML_EMPTY_TAG_CLOSE) || (type == DOMRegionContext.XML_COMMENT_CLOSE) || (type == DOMRegionContext.XML_DECLARATION_CLOSE) || (type == DOMRegionContext.XML_CDATA_CLOSE)) {
        currentNode.setEnded(true);
        currentNode.setLength(region.getStart() + region.getLength() - currentNode.getStart());
        currentNode.addRegion(region);
        region.adjustStart(-currentNode.getStart());
        // DW 4/16/2003 regions no longer have parents
        // region.setParent(currentNode);
      }
      // this is extremely rare, but valid
      else if (type == DOMRegionContext.WHITE_SPACE) {
        ITextRegion lastRegion = currentNode.getLastRegion();
        // pack the embedded container with this region
        if (lastRegion instanceof ITextRegionContainer) {
          ITextRegionContainer container = (ITextRegionContainer) lastRegion;
          container.getRegions().add(region);
          // containers must have parent set ...
          // setting for EACH subregion is redundant, but not sure
          // where else to do, so will do here for now.
          container.setParent(currentNode);
          // DW 4/16/2003 regions no longer have parents
          // region.setParent(container);
          region.adjustStart(container.getLength() - region.getStart());
        }
        // the white space is absorbed into the length of the last
        // region and of the node rather than added as a region of
        // the node itself
        currentNode.getLastRegion().adjustLength(region.getLength());
        currentNode.adjustLength(region.getLength());
      }
      else if (type == DOMRegionContext.UNDEFINED && currentNode != null) {
        // skip on a very-first region situation as the default
        // behavior is good enough
        // combine with previous if also undefined
        if (currentNode.getLastRegion() != null && currentNode.getLastRegion().getType() == DOMRegionContext.UNDEFINED) {
          currentNode.getLastRegion().adjustLength(region.getLength());
          currentNode.adjustLength(region.getLength());
        }
        // previous wasn't undefined
        else {
          currentNode.addRegion(region);
          currentNode.setLength(region.getStart() + region.getLength() - currentNode.getStart());
          region.adjustStart(-currentNode.getStart());
        }
      }
      else {
        // if an unknown type is the first region in the document,
        // ensure that a node exists
        if (currentNode == null) {
          currentNode = createStructuredDocumentRegion(type);
          currentNode.setStart(region.getStart());
        }
        currentNode.addRegion(region);
        currentNode.setLength(region.getStart() + region.getLength() - currentNode.getStart());
        region.adjustStart(-currentNode.getStart());
        // DW 4/16/2003 regions no longer have parents
        // region.setParent(currentNode);
        if (Debug.debugTokenizer)
          System.out.println(getClass().getName() + " found region of not specifically handled type " + region.getType() + " @ " + region.getStart() + "[" + region.getLength() + "]"); //$NON-NLS-4$//$NON-NLS-3$//$NON-NLS-2$//$NON-NLS-1$
        //$NON-NLS-3$//$NON-NLS-2$//$NON-NLS-1$
      }


      // these regions also get their own node, so close them cleanly
      // NOTE: these regions have new StructuredDocumentRegions created
      // for them above; it may
      // be more readable if that is handled here as well, but the
      // current layout
      // ensures that they open StructuredDocumentRegions the same way
      if ((type == DOMRegionContext.XML_CONTENT) || (type == DOMRegionContext.XML_CHAR_REFERENCE) || (type == DOMRegionContext.XML_ENTITY_REFERENCE)) {
        currentNode.setEnded(true);
      }
      // remember the first node ever created; it is the return value
      if (headNode == null && currentNode != null) {
        headNode = currentNode;
      }
    }
    // the last node never has a successor to trigger its notification,
    // so fire it here
    if (currentNode != null) {
      fireNodeParsed(currentNode);
      currentNode.setPrevious(lastNode);
    }
    // fStringInput = null;
    // release the input references and the tokenizer's buffers
    primReset();
    return headNode;
  }


  protected void primReset() {
    // fNodes = null;
    // fRegions = null;
    // fInput = null;
    fStringInput = null;
    fCharSequenceSource = null;
    fDocumentInput = null;
    fOffset = 0;
    // fCurrentNode = null;
    // DMW: also reset tokenizer so it doesn't hold on
    // to large arrays
    getTokenizer().reset(new char[0]);
  }


  /*
   * (non-Javadoc)
   * 
   * @see org.eclipse.wst.sse.core.internal.text.IRegionComparible#regionMatches(int,
   *      int, java.lang.String)
   */
  public boolean regionMatches(int offset, int length, String stringToCompare) {
    // by definition
    if (stringToCompare == null)
      return false;


    int ajustedOffset = fOffset + offset;
    boolean result = false;
    if (fCharSequenceSource != null && fCharSequenceSource instanceof IRegionComparible) {
      result = ((IRegionComparible) fCharSequenceSource).regionMatches(ajustedOffset, length, stringToCompare);
    }
    else {
      // old fashioned ways
      String test = null;
      if (fCharSequenceSource != null) {
        test = fCharSequenceSource.subSequence(ajustedOffset, ajustedOffset + length).toString();
      }
      else if (fStringInput != null) {
        test = fStringInput.substring(ajustedOffset, ajustedOffset + length);
      }
      result = stringToCompare.equals(test);
    }
    return result;
  }


  public boolean regionMatchesIgnoreCase(int offset, int length, String stringToCompare) {
    // by definition
    if (stringToCompare == null)
      return false;


    int ajustedOffset = fOffset + offset;
    boolean result = false;
    if (fCharSequenceSource != null && fCharSequenceSource instanceof IRegionComparible) {
      result = ((IRegionComparible) fCharSequenceSource).regionMatchesIgnoreCase(ajustedOffset, length, stringToCompare);
    }
    else {
      // old fashioned ways
      String test = null;
      if (fCharSequenceSource != null) {
        test = fCharSequenceSource.subSequence(ajustedOffset, ajustedOffset + length).toString();
      }
      else if (fStringInput != null) {
        test = fStringInput.substring(ajustedOffset, ajustedOffset + length);
      }
      result = stringToCompare.equalsIgnoreCase(test);
    }
    return result;
  }


  public void removeBlockMarker(BlockMarker marker) {
    getTokenizer().removeBlockMarker(marker);
  }


  public void removeBlockMarker(String tagName) {
    getTokenizer().removeBlockMarker(tagName);
  }


  public void removeStructuredDocumentRegionHandler(StructuredDocumentRegionHandler handler) {
    if (fStructuredDocumentRegionHandlers == null)
      return;
    synchronized (fStructuredDocumentRegionHandlers) {
      fStructuredDocumentRegionHandlers.remove(handler);
    }
  }


  /**
   * Resets the input.
   */
  public void reset(java.io.FileInputStream instream) {
    primReset();
    // fInput = instream;
    getTokenizer().reset(instream);
  }


  /**
   * Resets the input.
   */
  public void reset(java.io.Reader reader) {
    reset(reader, 0);
  }


  /**
   * Resets the input.
   */
  public void reset(java.io.Reader reader, int position) {
    primReset();
    fOffset = position;
    getTokenizer().reset(reader, position);
    if (reader instanceof DocumentReader) {
      IDocument doc = ((DocumentReader) reader).getDocument();
      if (doc instanceof CharSequence) {
        fCharSequenceSource = (CharSequence) doc;
      }
      else {
        // old fashioned IDocument
        fDocumentInput = ((DocumentReader) reader).getDocument();
      }


    }
    else if (reader instanceof CharSequenceReader) {
      fCharSequenceSource = ((CharSequenceReader) reader).getOriginalSource();
    }
  }


  /**
   * Resets the input. Use this version to allow text to be retrieved
   * <em>during</em> parsing, such as by the
   * StructuredDocumentRegionHandler.
   */
  public void reset(String sourceString) {
    reset(new StringReader(sourceString));
    fStringInput = sourceString;
  }


  /**
   * Resets the input. Use this version to allow text to be retrieved
   * <em>during</em> parsing, such as by the
   * StructuredDocumentRegionHandler.
   */
  public void reset(String sourceString, int position) {
    StringReader reader = new StringReader(sourceString);
    reset(reader, position);
    fStringInput = sourceString;
  }


  public void resetHandlers() {
    Object[] handlers = null;
    synchronized (fStructuredDocumentRegionHandlers) {
      if (fStructuredDocumentRegionHandlers == null)
        return;


      handlers = fStructuredDocumentRegionHandlers.toArray();
    }
    for (int i = 0; i < handlers.length; i++) {
      try {
        ((StructuredDocumentRegionHandler) handlers[i]).resetNodes();
      }
      catch (Exception e) {
        Logger.log(Logger.ERROR, "Error occurred while resetting handlers", e); //$NON-NLS-1$
      }
    }
  }


  /**
   * 
   * @param List
   */
  public void setStructuredDocumentRegionHandlers(List newStructuredDocumentRegionHandlers) {
    fStructuredDocumentRegionHandlers = newStructuredDocumentRegionHandlers;
  }


  protected void setTokenizer(BlockTokenizer newTokenizer) {
    // DMW: changed from private to protected, so subclass could use in
    // creation of 'newInstance'.
    fTokenizer = newTokenizer;
  }
}
Source Code of org.eclipse.wst.xml.core.internal.parser.XMLSourceParser

Related Classes of org.eclipse.wst.xml.core.internal.parser.XMLSourceParser