Package org.apache.uima.cas_data.impl

Source Code of org.apache.uima.cas_data.impl.XCasToCasDataSaxHandler

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.uima.cas_data.impl;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.impl.XCASParsingException;
import org.apache.uima.cas.impl.XCASSerializer;
import org.apache.uima.cas_data.CasData;
import org.apache.uima.cas_data.PrimitiveArrayFS;
import org.apache.uima.cas_data.ReferenceArrayFS;
import org.apache.uima.internal.util.StringUtils;
import org.xml.sax.Attributes;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.DefaultHandler;

/**
* A SAX ContentHandler that reads XCAS and creates a CasData.
*
*
*/
public class XCasToCasDataSaxHandler extends DefaultHandler {
  // ///////////////////////////////////////////////////////////////////////
  // Internal states for the parser.

  // Expect the start of the XML document.
  private static final int DOC_STATE = 0;

  // At the top level. Expect a FS.
  private static final int FS_STATE = 1;

  // Inside a FS. Expect features, or the end of the FS.
  private static final int FEAT_STATE = 2;

  // Inside FS. We have seen a _content attribute, and expect text.
  private static final int CONTENT_STATE = 3;

  // Inside a feature element. We expect the feature value.
  private static final int FEAT_CONTENT_STATE = 4;

  // Inside an array element. Expect array element value.
  private static final int ARRAY_ELE_CONTENT_STATE = 5;

  // Inside an array FS. Expect an array element, or the end of the FS.
  private static final int ARRAY_ELE_STATE = 6;

  // End parser states.
  // ///////////////////////////////////////////////////////////////////////

  private static final String reservedAttrPrefix = "_";

  // For error message printing, if the Locator object can't provide source
  // of XML input.
  private static final String unknownXMLSource = "<unknown>";

  private static final String DEFAULT_CONTENT_FEATURE = "value";

  // private long time;

  // SAX locator. Used for error message generation.
  private Locator locator;

  // The CasData we're filling.
  private CasData cas;

  // What we expect next.
  private int state;

  // StringBuffer to accumulate text.
  private StringBuffer buffer;

  // Most recently created FS. Needed for array elements
  // and embedded feature values.
  private FeatureStructureImpl currentFS;

  // The name of the content feature, if we've seen one.
  private String currentContentFeat;

  // The current position when parsing array elements.
  private int arrayPos;

  // The type of the array we're currently reading. Needed for proper
  // treatment of array element values.
  private int arrayType;

  /**
   * Create new XCasToCasDataSaxHandler.
   *
   * @param aCasData
   *          the CasData to which FeatureStructures parsed from XCAS will be appended
   */
  public XCasToCasDataSaxHandler(CasData aCasData) {
    super();
    this.buffer = new StringBuffer();
    this.cas = aCasData;
  }

  private final void resetBuffer() {
    // this.buffer.delete(0, this.buffer.length());
    this.buffer = new StringBuffer();
  }

  /*
   * (non-Javadoc)
   *
   * @see org.xml.sax.ContentHandler#startDocument()
   */
  public void startDocument() throws SAXException {
    // Do setup work in the constructor.
    this.state = DOC_STATE;
    // System.out.println("Starting to read document.");
    // time = System.currentTimeMillis();
  }

  /*
   * (non-Javadoc)
   *
   * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String,
   *      java.lang.String, org.xml.sax.Attributes)
   */
  public void startElement(String nameSpaceURI, String localName, String qualifiedName,
          Attributes attrs) throws SAXException {
    resetBuffer();
    switch (state) {
      case DOC_STATE: {
        if (!qualifiedName.equals(XCASSerializer.casTagName)) {
          throw createException(XCASParsingException.WRONG_ROOT_TAG, qualifiedName);
        }
        this.state = FS_STATE;
        break;
      }
      case FS_STATE: {
        this.currentContentFeat = DEFAULT_CONTENT_FEATURE;
        readFS(qualifiedName, attrs);
        break;
      }
      case ARRAY_ELE_STATE: {
        readArrayElement(qualifiedName, attrs);
        break;
      }
      default: {
        // If we're not in an element expecting state, raise an error.
        throw createException(XCASParsingException.TEXT_EXPECTED, qualifiedName);
      }
    }
  }

  // Get ready to read array element.
  private void readArrayElement(String eleName, Attributes attrs) throws SAXParseException {
    if (!eleName.equals(XCASSerializer.ARRAY_ELEMENT_TAG)) {
      throw createException(XCASParsingException.ARRAY_ELE_EXPECTED, eleName);
    }
    if (attrs.getLength() > 0) {
      throw createException(XCASParsingException.ARRAY_ELE_ATTRS);
    }
    this.state = ARRAY_ELE_CONTENT_STATE;
    resetBuffer();
  }

  // Create a new FS.
  private void readFS(String qualifiedName, Attributes attrs) throws SAXParseException {
    if (isArrayType(qualifiedName)) {
      readArray(qualifiedName, attrs);
    } else {
      this.currentFS = new FeatureStructureImpl();
      this.currentFS.setType(getCasTypeName(qualifiedName));
      readFS(this.currentFS, attrs);
    }
    this.cas.addFeatureStructure(this.currentFS);
  }

  /**
   * Gets the CAS type name corresponding to an XCAS tag name. The type name is usually equal to the
   * tag name, but the characters : and - are translated into the sequences _colon_ and _dash_,
   * respectively.
   *
   * @param aTagName
   *          XCAS tag name
   * @return CAS type name corresponding to this tag
   */
  private String getCasTypeName(String aTagName) {
    return StringUtils.replaceAll(StringUtils.replaceAll(aTagName, ":", "_colon_"), "-", "_dash_");
  }

  /**
   *
   * @param addr
   * @param attrs
   * @throws SAXParseException
   */
  private void readFS(FeatureStructureImpl fsImpl, Attributes attrs) throws SAXParseException {
    String attrName, attrValue;
    for (int i = 0; i < attrs.getLength(); i++) {
      attrName = attrs.getQName(i);
      attrValue = attrs.getValue(i);
      if (attrName.startsWith(reservedAttrPrefix)) {
        if (attrName.equals(XCASSerializer.ID_ATTR_NAME)) {
          fsImpl.setId(attrValue);
        } else if (attrName.equals(XCASSerializer.CONTENT_ATTR_NAME)) {
          this.currentContentFeat = attrValue;
        } else if (attrName.equals(XCASSerializer.INDEXED_ATTR_NAME)) {
          if (attrValue.equals(XCASSerializer.TRUE_VALUE)) {
            fsImpl.setIndexed(new int[] { 1 }); // Backwards compatible CAS, has one default text
            // Sofa
          } else if (!attrValue.equals("false")) {
            fsImpl.setIndexed(parseIntArray(attrValue));
          }
        } else {
          handleFeature(fsImpl, attrName, attrValue);
        }
      } else {
        handleFeature(fsImpl, attrName, attrValue);
      }
    }

    // Set the state; we're either expecting features, or _content.
    // APL - 6/28/04 - even if _content attr is not specified, we can still have content, which
    // would
    // be assigned to the "value" feature, as per XCAS spec. FEAT_STATE did not really seem to be
    // working, anyway.
    this.state = CONTENT_STATE;
    // if (this.state != CONTENT_STATE)
    // {
    // this.state = FEAT_STATE;
    // }
  }

  /**
   * Parse a space-separated string into an integer array.
   */
  private int[] parseIntArray(String val) {
    String[] strVals;
    val = val.trim();
    if ("".equals(val)) {
      strVals = new String[0];
    } else {
      strVals = val.split("\\s+");
    }
    int[] intVals = new int[strVals.length];
    for (int i = 0; i < strVals.length; i++) {
      intVals[i] = Integer.parseInt(strVals[i]);
    }
    return intVals;
  }

  // Create a new array FS.
  private void readArray(String type, Attributes attrs) throws SAXParseException {
    String attrName, attrVal;
    int[] indexed = new int[0];
    int size = 0;
    String id = null;
    for (int i = 0; i < attrs.getLength(); i++) {
      attrName = attrs.getQName(i);
      attrVal = attrs.getValue(i);
      if (attrName.equals(XCASSerializer.ID_ATTR_NAME)) {
        id = attrVal;
      } else if (attrName.equals(XCASSerializer.ARRAY_SIZE_ATTR)) {
        try {
          size = Integer.parseInt(attrVal);
          if (size < 0) {
            throw createException(XCASParsingException.ILLEGAL_ARRAY_SIZE, attrVal);
          }
        } catch (NumberFormatException e) {
          throw createException(XCASParsingException.INTEGER_EXPECTED, attrVal);
        }
      } else if (attrName.equals(XCASSerializer.INDEXED_ATTR_NAME)) {
        if (attrVal.equals(XCASSerializer.TRUE_VALUE)) {
          indexed = new int[] { 1 }; // Backwards compatible CAS, has one default text Sofa
        } else if (!attrVal.equals("false")) {
          indexed = parseIntArray(attrVal);
        }
      } else {
        throw createException(XCASParsingException.ILLEGAL_ARRAY_ATTR, attrName);
      }
    }
    // Hang on to those for setting array values.
    this.arrayPos = 0;
    if (CAS.TYPE_NAME_INTEGER_ARRAY.equals(type)) {
      this.currentFS = new PrimitiveArrayFSImpl(new int[size]);
      this.arrayType = INT_TYPE;
    } else if (CAS.TYPE_NAME_FLOAT_ARRAY.equals(type)) {
      this.currentFS = new PrimitiveArrayFSImpl(new float[size]);
      this.arrayType = FLOAT_TYPE;
    } else if (CAS.TYPE_NAME_STRING_ARRAY.equals(type)) {
      this.currentFS = new PrimitiveArrayFSImpl(new String[size]);
      this.arrayType = STRING_TYPE;
    } else {
      this.currentFS = new ReferenceArrayFSImpl(new String[size]);
      this.arrayType = FS_TYPE;
    }
    this.currentFS.setId(id);
    this.currentFS.setType(type);
    this.currentFS.setIndexed(indexed);
    this.currentFS.setFeatureValue(XCASSerializer.ARRAY_SIZE_ATTR, new PrimitiveValueImpl(size));
    this.state = ARRAY_ELE_STATE;
  }

  // The definition of a null value. Any other value must be in the expected
  // format.
  private final boolean emptyVal(String val) {
    return ((val == null) || (val.length() == 0));
  }

  // Create a feature value from a string representation.
  private void handleFeature(FeatureStructureImpl fsImpl, String featName, String featVal)
          throws SAXParseException {
    if (featName.startsWith(XCASSerializer.REF_PREFIX)) {
      String realFeatName = featName.substring(XCASSerializer.REF_PREFIX.length());
      fsImpl.setFeatureValue(realFeatName, new ReferenceValueImpl(featVal));
    } else {
      fsImpl.setFeatureValue(featName, new PrimitiveValueImpl(featVal));
    }
  }

  /*
   * (non-Javadoc)
   *
   * @see org.xml.sax.ContentHandler#characters(char[], int, int)
   */
  public void characters(char[] chars, int start, int length) throws SAXException {
    if ((this.state == CONTENT_STATE)
            || (this.state == ARRAY_ELE_CONTENT_STATE) || (this.state == FEAT_CONTENT_STATE)) {
      // When we're in a text expecting state, add the characters to the
      // text buffer. Else, do nothing.
      buffer.append(chars, start, length);
    }
  }

  /*
   * (non-Javadoc)
   *
   * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String,
   *      java.lang.String)
   */
  public void endElement(String nsURI, String localName, String qualifiedName) throws SAXException {
    switch (this.state) {
      case DOC_STATE: {
        // Do nothing.
        break;
      }
      case FS_STATE: {
        this.state = DOC_STATE;
        break;
      }
      case FEAT_STATE: {
        this.state = FS_STATE;
        break;
      }
      case CONTENT_STATE: {
        if (!isAllWhitespace(buffer)) {
          // Set the value of the content feature.
          handleFeature(this.currentFS, currentContentFeat, buffer.toString());
        }
        this.state = FS_STATE;
        break;
      }
      case FEAT_CONTENT_STATE: {
        // Create a feature value from an element.
        handleFeature(this.currentFS, qualifiedName, buffer.toString());
        this.state = FEAT_STATE;
        break;
      }
      case ARRAY_ELE_CONTENT_STATE: {
        // Create an array value.
        addArrayElement(buffer.toString());
        this.state = ARRAY_ELE_STATE;
        break;
      }
      case ARRAY_ELE_STATE: {
        this.state = FS_STATE;
        break;
      }
    }
  }

  boolean isAllWhitespace(StringBuffer b) {
    final int len = b.length();
    for (int i = 0; i < len; i++) {
      if (!Character.isWhitespace(b.charAt(i))) {
        return false;
      }
    }
    return true;
  }

  private void addArrayElement(String content) throws SAXParseException {
    switch (arrayType) {
      case INT_TYPE: {
        if (!emptyVal(content)) {
          try {
            ((PrimitiveArrayFS) this.currentFS).toIntArray()[arrayPos] = Integer.parseInt(content);
          } catch (NumberFormatException e) {
            throw createException(XCASParsingException.INTEGER_EXPECTED, content);
          }
        }
        break;
      }
      case FLOAT_TYPE: {
        if (!emptyVal(content)) {
          try {
            ((PrimitiveArrayFS) this.currentFS).toFloatArray()[arrayPos] = Float
                    .parseFloat(content);
          } catch (NumberFormatException e) {
            throw createException(XCASParsingException.FLOAT_EXPECTED, content);
          }
        }
        break;
      }
      case STRING_TYPE: {
        ((PrimitiveArrayFS) this.currentFS).toStringArray()[arrayPos] = content;
        break;
      }
      case FS_TYPE: {
        if (!emptyVal(content)) {
          ((ReferenceArrayFS) this.currentFS).getIdRefArray()[arrayPos] = content;
        }
        break;
      }
    }
    ++arrayPos;
  }

  /*
   * (non-Javadoc)
   *
   * @see org.xml.sax.ContentHandler#endDocument()
   */
  public void endDocument() throws SAXException {
    // nothing to do
  }

  private XCASParsingException createException(int code) {
    XCASParsingException e = new XCASParsingException(code);
    String source = unknownXMLSource;
    String line = unknownXMLSource;
    String col = unknownXMLSource;
    if (locator != null) {
      source = locator.getSystemId();
      if (source == null) {
        source = locator.getPublicId();
      }
      if (source == null) {
        source = unknownXMLSource;
      }
      line = Integer.toString(locator.getLineNumber());
      col = Integer.toString(locator.getColumnNumber());
    }
    e.addArgument(source);
    e.addArgument(line);
    e.addArgument(col);
    return e;
  }

  private XCASParsingException createException(int code, String arg) {
    XCASParsingException e = createException(code);
    e.addArgument(arg);
    return e;
  }

  /*
   * (non-Javadoc)
   *
   * @see org.xml.sax.ErrorHandler#error(org.xml.sax.SAXParseException)
   */
  public void error(SAXParseException e) throws SAXException {
    throw e;
  }

  /*
   * (non-Javadoc)
   *
   * @see org.xml.sax.ErrorHandler#fatalError(org.xml.sax.SAXParseException)
   */
  public void fatalError(SAXParseException e) throws SAXException {
    throw e;
  }

  /*
   * (non-Javadoc)
   *
   * @see org.xml.sax.ContentHandler#ignorableWhitespace(char[], int, int)
   */
  public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException {
    // Since we're not validating, we don't need to do anything; this won't
    // be called.
  }

  /*
   * (non-Javadoc)
   *
   * @see org.xml.sax.ContentHandler#setDocumentLocator(org.xml.sax.Locator)
   */
  public void setDocumentLocator(Locator loc) {
    // System.out.println("Setting document locator.");
    this.locator = loc;
  }

  /*
   * (non-Javadoc)
   *
   * @see org.xml.sax.ErrorHandler#warning(org.xml.sax.SAXParseException)
   */
  public void warning(SAXParseException e) throws SAXException {
    throw e;
  }

  private static final int INT_TYPE = 0;

  private static final int FLOAT_TYPE = 1;

  private static final int STRING_TYPE = 2;

  private static final int FS_TYPE = 3;

  private boolean isArrayType(String typeName) {
    return CAS.TYPE_NAME_INTEGER_ARRAY.equals(typeName)
            || CAS.TYPE_NAME_FLOAT_ARRAY.equals(typeName)
            || CAS.TYPE_NAME_STRING_ARRAY.equals(typeName)
            || CAS.TYPE_NAME_FS_ARRAY.equals(typeName);
  }

}
TOP

Related Classes of org.apache.uima.cas_data.impl.XCasToCasDataSaxHandler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.