// Source Code of org.apache.flink.core.io.StringRecord

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */




/**
 * This file is based on source code from the Hadoop Project (http://hadoop.apache.org/), licensed by the Apache
 * Software Foundation (ASF) under the Apache License, Version 2.0. See the NOTICE file distributed with this work for
 * additional information regarding copyright ownership. 
 */


package org.apache.flink.core.io;


import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.MalformedInputException;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.Arrays;


import org.apache.flink.core.memory.DataInputView;
import org.apache.flink.core.memory.DataOutputView;
import org.apache.flink.types.Value;


/**
 * This class stores text using standard UTF8 encoding. It provides methods to
 * serialize, deserialize, and compare texts at byte level. The type of length
 * is integer and is serialized using zero-compressed format.
 * <p>
 * In addition, it provides methods for string traversal without converting the byte array to a string.
 * <p>
 * Also includes utilities for serializing/deserialing a string, coding/decoding a string, checking if a byte array
 * contains valid UTF8 code, calculating the length of an encoded string.
 */
public class StringRecord implements Value {
  
  private static final long serialVersionUID = 1L;


  private static final ThreadLocal<CharsetEncoder> ENCODER_FACTORY = new ThreadLocal<CharsetEncoder>() {
    protected CharsetEncoder initialValue() {
      return Charset.forName("UTF-8").newEncoder().onMalformedInput(CodingErrorAction.REPORT)
        .onUnmappableCharacter(CodingErrorAction.REPORT);
    }
  };


  private static final ThreadLocal<CharsetDecoder> DECODER_FACTORY = new ThreadLocal<CharsetDecoder>() {
    protected CharsetDecoder initialValue() {
      return Charset.forName("UTF-8").newDecoder().onMalformedInput(CodingErrorAction.REPORT)
        .onUnmappableCharacter(CodingErrorAction.REPORT);
    }
  };


  /**
   * Cache the hash code for the encapsulated string.
   **/
  private int hash = 0;


  private static final byte[] EMPTY_BYTES = new byte[0];


  private byte[] bytes;


  private int length;


  public StringRecord() {
    this.bytes = EMPTY_BYTES;
    this.hash = 0;
  }


  /**
   * Construct from a string.
   */
  public StringRecord(final String string) {
    set(string);
  }


  /** Construct from another text. */
  public StringRecord(final StringRecord utf8) {
    set(utf8);
  }


  /**
   * Construct from a byte array.
   */
  public StringRecord(final byte[] utf8) {
    set(utf8);
  }


  /**
   * Returns the raw bytes; however, only data up to {@link #getLength()} is
   * valid.
   */
  public byte[] getBytes() {
    return bytes;
  }


  /**
   * Returns the Unicode Scalar Value (32-bit integer value) for the character
   * at <code>position</code>. Note that this method avoids using the
   * converter or doing String instantiation
   * 
   * @return the Unicode scalar value at position or -1 if the position is
   *         invalid or points to a trailing byte
   */
  public int charAt(final int position) {
    if (position > this.length) {
      return -1; // too long
    }
    if (position < 0) {
      return -1;
    }


    final ByteBuffer bb = (ByteBuffer) ByteBuffer.wrap(this.bytes).position(position);
    return bytesToCodePoint(bb.slice());
  }


  public int find(final String what) {
    return find(what, 0);
  }


  /**
   * Finds any occurrence of <code>what</code> in the backing buffer, starting
   * as position <code>start</code>. The starting position is measured in
   * bytes and the return value is in terms of byte position in the buffer.
   * The backing buffer is not converted to a string for this operation.
   * 
   * @return byte position of the first occurence of the search string in the
   *         UTF-8 buffer or -1 if not found
   */
  public int find(final String what, final int start) {
    try {
      final ByteBuffer src = ByteBuffer.wrap(this.bytes, 0, this.length);
      final ByteBuffer tgt = encode(what);
      final byte b = tgt.get();
      src.position(start);


      while (src.hasRemaining()) {
        if (b == src.get()) { // matching first byte
          src.mark(); // save position in loop
          tgt.mark(); // save position in target
          boolean found = true;
          final int pos = src.position() - 1;
          while (tgt.hasRemaining()) {
            if (!src.hasRemaining()) { // src expired first
              tgt.reset();
              src.reset();
              found = false;
              break;
            }
            if (!(tgt.get() == src.get())) {
              tgt.reset();
              src.reset();
              found = false;
              break; // no match
            }
          }
          if (found) {
            return pos;
          }
        }
      }
      return -1; // not found
    } catch (CharacterCodingException e) {
      // can't get here
      e.printStackTrace();
      return -1;
    }
  }


  /**
   * Set to contain the contents of a string.
   */
  public void set(final String string) {
    try {
      final ByteBuffer bb = encode(string, true);
      this.bytes = bb.array();
      this.length = bb.limit();
      this.hash = 0;
    } catch (CharacterCodingException e) {
      throw new RuntimeException("Should not have happened " + e.toString());
    }
  }


  /**
   * Set to a utf8 byte array
   */
  public void set(final byte[] utf8) {
    set(utf8, 0, utf8.length);
  }


  /** copy a text. */
  public void set(final StringRecord other) {
    set(other.getBytes(), 0, other.getLength());
  }


  /** Returns the number of bytes in the byte array */
  public int getLength() {
    return length;
  }


  /**
   * Set the Text to range of bytes
   * 
   * @param utf8
   *        the data to copy from
   * @param start
   *        the first position of the new string
   * @param len
   *        the number of bytes of the new string
   */
  public void set(final byte[] utf8, final int start, final int len) {
    setCapacity(len, false);
    System.arraycopy(utf8, start, bytes, 0, len);
    this.length = len;
    this.hash = 0;
  }


  /**
   * Append a range of bytes to the end of the given text
   * 
   * @param utf8
   *        the data to copy from
   * @param start
   *        the first position to append from utf8
   * @param len
   *        the number of bytes to append
   */
  public void append(final byte[] utf8, final int start, final int len) {
    setCapacity(length + len, true);
    System.arraycopy(utf8, start, bytes, length, len);
    this.length += len;
    this.hash = 0;
  }


  /**
   * Clear the string to empty.
   */
  public void clear() {
    this.length = 0;
    this.hash = 0;
  }


  /*
   * Sets the capacity of this Text object to <em>at least</em>
   * <code>len</code> bytes. If the current buffer is longer, then the
   * capacity and existing content of the buffer are unchanged. If
   * <code>len</code> is larger than the current capacity, the Text object's
   * capacity is increased to match.
   * @param len the number of bytes we need
   * @param keepData should the old data be kept
   */
  private void setCapacity(final int len, final boolean keepData) {
    if (this.bytes == null || this.bytes.length < len) {
      final byte[] newBytes = new byte[len];
      if (this.bytes != null && keepData) {
        System.arraycopy(this.bytes, 0, newBytes, 0, this.length);
      }
      this.bytes = newBytes;
    }
  }


  /**
   * Convert text back to string
   * 
   * @see java.lang.Object#toString()
   */
  public String toString() {
    try {
      return decode(bytes, 0, length);
    } catch (CharacterCodingException e) {
      throw new RuntimeException("Should not have happened " + e.toString());
    }
  }


  /**
   * deserialize
   * @param in
   */
  public void read(final DataInputView in) throws IOException {
    final int newLength = in.readInt();
    setCapacity(newLength, false);
    in.readFully(this.bytes, 0, newLength);
    this.length = newLength;
    this.hash = 0;
  }


  /** Skips over one Text in the input. */
  public static void skip(final DataInput in) throws IOException {
    final int length = in.readInt();
    skipFully(in, length);
  }


  public static void skipFully(final DataInput in, final int len) throws IOException {
    int total = 0;
    int cur = 0;


    while ((total < len) && ((cur = in.skipBytes(len - total)) > 0)) {
      total += cur;
    }


    if (total < len) {
      throw new IOException("Not able to skip " + len + " bytes, possibly " + "due to end of input.");
    }
  }


  @Override
  public void write(final DataOutputView out) throws IOException {
    out.writeInt(this.length);
    out.write(this.bytes, 0, this.length);
  }




  @Override
  public boolean equals(final Object obj) {


    if (!(obj instanceof StringRecord)) {
      return false;
    }


    final StringRecord sr = (StringRecord) obj;
    if (this.length != sr.getLength()) {
      return false;
    }


    if (this.bytes.length == this.length && sr.bytes.length == sr.length) {


      return Arrays.equals(this.bytes, sr.bytes);


    } else {


      for (int i = 0; i < this.length; ++i) {
        if (this.bytes[i] != sr.bytes[i]) {
          return false;
        }
      }
    }


    return true;
  }




  @Override
  public int hashCode() {


    int h = this.hash;
    if (h == 0 && this.length > 0) {
      int off = 0;
      byte[] val = this.bytes;
      int len = this.length;


      for (int i = 0; i < len; i++) {
        h = 31 * h + val[off++];
      }
      this.hash = h;
    }
    return h;


  }


  // / STATIC UTILITIES FROM HERE DOWN
  /**
   * Converts the provided byte array to a String using the UTF-8 encoding. If
   * the input is malformed, replace by a default value.
   */
  public static String decode(final byte[] utf8) throws CharacterCodingException {
    return decode(ByteBuffer.wrap(utf8), true);
  }


  public static String decode(final byte[] utf8, final int start, final int length) throws CharacterCodingException {
    return decode(ByteBuffer.wrap(utf8, start, length), true);
  }


  /**
   * Converts the provided byte array to a String using the UTF-8 encoding. If <code>replace</code> is true, then
   * malformed input is replaced with the
   * substitution character, which is U+FFFD. Otherwise the method throws a
   * MalformedInputException.
   */
  public static String decode(final byte[] utf8, final int start, final int length, final boolean replace)
      throws CharacterCodingException {
    return decode(ByteBuffer.wrap(utf8, start, length), replace);
  }


  private static String decode(final ByteBuffer utf8, final boolean replace) throws CharacterCodingException {
    final CharsetDecoder decoder = DECODER_FACTORY.get();
    if (replace) {
      decoder.onMalformedInput(java.nio.charset.CodingErrorAction.REPLACE);
      decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
    }
    final String str = decoder.decode(utf8).toString();
    // set decoder back to its default value: REPORT
    if (replace) {
      decoder.onMalformedInput(CodingErrorAction.REPORT);
      decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
    }
    return str;
  }


  /**
   * Converts the provided String to bytes using the UTF-8 encoding. If the
   * input is malformed, invalid chars are replaced by a default value.
   * 
   * @return ByteBuffer: bytes stores at ByteBuffer.array() and length is
   *         ByteBuffer.limit()
   */


  public static ByteBuffer encode(final String string) throws CharacterCodingException {
    return encode(string, true);
  }


  /**
   * Converts the provided String to bytes using the UTF-8 encoding. If <code>replace</code> is true, then malformed
   * input is replaced with the
   * substitution character, which is U+FFFD. Otherwise the method throws a
   * MalformedInputException.
   * 
   * @return ByteBuffer: bytes stores at ByteBuffer.array() and length is
   *         ByteBuffer.limit()
   */
  public static ByteBuffer encode(final String string, final boolean replace) throws CharacterCodingException {


    final CharsetEncoder encoder = ENCODER_FACTORY.get();
    if (replace) {
      encoder.onMalformedInput(CodingErrorAction.REPLACE);
      encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
    }
    ByteBuffer bytes = encoder.encode(CharBuffer.wrap(string.toCharArray()));
    if (replace) {
      encoder.onMalformedInput(CodingErrorAction.REPORT);
      encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
    }
    return bytes;
  }


  /**
   * Read a UTF8 encoded string from in
   */
  public static String readString(final DataInput in) throws IOException {


    if (in.readBoolean()) {
      final int length = in.readInt();
      if (length < 0) {
        throw new IOException("length of StringRecord is " + length);
      }


      final byte[] bytes = new byte[length];
      in.readFully(bytes, 0, length);
      return decode(bytes);
    }


    return null;
  }


  /**
   * Write a UTF8 encoded string to out
   */
  public static int writeString(final DataOutput out, final String s) throws IOException {


    int length = 0;


    if (s == null) {
      out.writeBoolean(false);
    } else {
      out.writeBoolean(true);
      final ByteBuffer bytes = encode(s);
      length = bytes.limit();
      out.writeInt(length);
      out.write(bytes.array(), 0, length);
    }
    return length;
  }


  // //// states for validateUTF8


  private static final int LEAD_BYTE = 0;


  private static final int TRAIL_BYTE_1 = 1;


  private static final int TRAIL_BYTE = 2;


  /**
   * Check if a byte array contains valid utf-8
   * 
   * @param utf8
   *        byte array
   * @throws MalformedInputException
   *         if the byte array contains invalid utf-8
   */
  public static void validateUTF8(final byte[] utf8) throws MalformedInputException {
    validateUTF8(utf8, 0, utf8.length);
  }


  /**
   * Check to see if a byte array is valid utf-8
   * 
   * @param utf8
   *        the array of bytes
   * @param start
   *        the offset of the first byte in the array
   * @param len
   *        the length of the byte sequence
   * @throws MalformedInputException
   *         if the byte array contains invalid bytes
   */
  public static void validateUTF8(final byte[] utf8, final int start, final int len) throws MalformedInputException {
    int count = start;
    int leadByte = 0;
    int length = 0;
    int state = LEAD_BYTE;
    while (count < start + len) {
      final int aByte = ((int) utf8[count] & 0xFF);


      switch (state) {
      case LEAD_BYTE:
        leadByte = aByte;
        length = bytesFromUTF8[aByte];


        switch (length) {
        case 0: // check for ASCII
          if (leadByte > 0x7F) {
            throw new MalformedInputException(count);
          }
          break;
        case 1:
          if (leadByte < 0xC2 || leadByte > 0xDF) {
            throw new MalformedInputException(count);
          }
          state = TRAIL_BYTE_1;
          break;
        case 2:
          if (leadByte < 0xE0 || leadByte > 0xEF) {
            throw new MalformedInputException(count);
          }
          state = TRAIL_BYTE_1;
          break;
        case 3:
          if (leadByte < 0xF0 || leadByte > 0xF4) {
            throw new MalformedInputException(count);
          }
          state = TRAIL_BYTE_1;
          break;
        default:
          // too long! Longest valid UTF-8 is 4 bytes (lead + three)
          // or if < 0 we got a trail byte in the lead byte position
          throw new MalformedInputException(count);
        } // switch (length)
        break;


      case TRAIL_BYTE_1:
        if (leadByte == 0xF0 && aByte < 0x90) {
          throw new MalformedInputException(count);
        }
        if (leadByte == 0xF4 && aByte > 0x8F) {
          throw new MalformedInputException(count);
        }
        if (leadByte == 0xE0 && aByte < 0xA0) {
          throw new MalformedInputException(count);
        }
        if (leadByte == 0xED && aByte > 0x9F) {
          throw new MalformedInputException(count);
        }
        // falls through to regular trail-byte test!!
      case TRAIL_BYTE:
        if (aByte < 0x80 || aByte > 0xBF) {
          throw new MalformedInputException(count);
        }
        if (--length == 0) {
          state = LEAD_BYTE;
        } else {
          state = TRAIL_BYTE;
        }
        break;
      default:
        break;
      } // switch (state)
      count++;
    }
  }


  /**
   * Magic numbers for UTF-8. These are the number of bytes that <em>follow</em> a given lead byte. Trailing bytes
   * have the value -1. The
   * values 4 and 5 are presented in this table, even though valid UTF-8
   * cannot include the five and six byte sequences.
   */
  static final int[] bytesFromUTF8 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,
    0,
    0,
    // trail bytes
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5,
    5 };


  /**
   * Returns the next code point at the current position in the buffer. The
   * buffer's position will be incremented. Any mark set on this buffer will
   * be changed by this method!
   */
  public static int bytesToCodePoint(final ByteBuffer bytes) {
    bytes.mark();
    final byte b = bytes.get();
    bytes.reset();
    final int extraBytesToRead = bytesFromUTF8[(b & 0xFF)];
    if (extraBytesToRead < 0) {
      return -1; // trailing byte!
    }
    int ch = 0;


    switch (extraBytesToRead) {
    case 5:
      ch += (bytes.get() & 0xFF);
      ch <<= 6; /* remember, illegal UTF-8 */
    case 4:
      ch += (bytes.get() & 0xFF);
      ch <<= 6; /* remember, illegal UTF-8 */
    case 3:
      ch += (bytes.get() & 0xFF);
      ch <<= 6;
    case 2:
      ch += (bytes.get() & 0xFF);
      ch <<= 6;
    case 1:
      ch += (bytes.get() & 0xFF);
      ch <<= 6;
    case 0:
      ch += (bytes.get() & 0xFF);
    default:
      break;


    }
    ch -= offsetsFromUTF8[extraBytesToRead];


    return ch;
  }


  static final int[] offsetsFromUTF8 = { 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };


  /**
   * For the given string, returns the number of UTF-8 bytes required to
   * encode the string.
   * 
   * @param string
   *        text to encode
   * @return number of UTF-8 bytes required to encode
   */
  public static int utf8Length(final String string) {
    final CharacterIterator iter = new StringCharacterIterator(string);
    char ch = iter.first();
    int size = 0;
    while (ch != CharacterIterator.DONE) {
      if ((ch >= 0xD800) && (ch < 0xDC00)) {
        // surrogate pair?
        char trail = iter.next();
        if ((trail > 0xDBFF) && (trail < 0xE000)) {
          // valid pair
          size += 4;
        } else {
          // invalid pair
          size += 3;
          iter.previous(); // rewind one
        }
      } else if (ch < 0x80) {
        size++;
      } else if (ch < 0x800) {
        size += 2;
      } else {
        // ch < 0x10000, that is, the largest char value
        size += 3;
      }
      ch = iter.next();
    }
    return size;
  }
}
// Source Code of org.apache.flink.core.io.StringRecord
//
// Related Classes of org.apache.flink.core.io.StringRecord