/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jackrabbit.core.persistence.util;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.math.BigDecimal;
import java.util.Calendar;
import java.util.Collection;
import java.util.GregorianCalendar;
import javax.jcr.PropertyType;
import javax.jcr.RepositoryException;
import org.apache.commons.io.IOUtils;
import org.apache.jackrabbit.core.id.NodeId;
import org.apache.jackrabbit.core.value.InternalValue;
import org.apache.jackrabbit.core.persistence.util.NodePropBundle.ChildNodeEntry;
import org.apache.jackrabbit.core.persistence.util.NodePropBundle.PropertyEntry;
import org.apache.jackrabbit.spi.Name;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Bundle serializer.
*
* @see BundleReader
*/
class BundleWriter {
/** Logger instance */
private static final Logger log = LoggerFactory.getLogger(BundleWriter.class);

/**
 * Bundle binding; consulted for the data store, the blob store and the
 * minimum blob size when binary values are serialized.
 */
private final BundleBinding binding;

/** Data stream that receives the serialized bundle. */
private final DataOutputStream out;

/**
 * The default namespace and the first six other namespaces used in this
 * bundle. Used by the {@link #writeName(Name)} method to keep track of
 * already seen namespaces.
 */
private final String[] namespaces =
        // NOTE: The length of this array must be seven
        { Name.NS_DEFAULT_URI, null, null, null, null, null, null };
/**
* Creates a new bundle serializer.
*
* @param binding bundle binding
* @param stream stream to which the bundle will be written
* @throws IOException if an I/O error occurs.
*/
public BundleWriter(BundleBinding binding, OutputStream stream)
throws IOException {
assert namespaces.length == 7;
this.binding = binding;
this.out = new DataOutputStream(stream);
this.out.writeByte(BundleBinding.VERSION_CURRENT);
}
/**
 * Serializes a <code>NodePropBundle</code> to the underlying data stream
 * and records the number of bytes written as the size of the bundle.
 * <p>
 * The layout is: primary type name, parent id, modification count, a
 * header byte, and then the mixin names, the property entries, the child
 * node entries and the shared set. The header byte packs the (capped)
 * element counts and the referenceable flag:
 * <pre>
 * bit 7     mixin count, capped at 1
 * bits 4-6  property count, capped at 7
 * bits 2-3  child node count, capped at 3
 * bit 1     shared set size, capped at 1
 * bit 0     referenceable flag
 * </pre>
 * A count that reaches its cap is completed by a variable-length integer
 * holding the remainder, written just before the respective elements.
 *
 * @param bundle the bundle to serialize
 * @throws IOException if an I/O error occurs.
 */
public void writeBundle(NodePropBundle bundle)
        throws IOException {
    final long start = out.size();

    // primary type
    writeName(bundle.getNodeTypeName());

    // parent id; a missing parent is stored as the NULL_PARENT_ID marker
    NodeId parent = bundle.getParentId();
    writeNodeId(parent != null ? parent : BundleBinding.NULL_PARENT_ID);

    // modification count
    writeVarInt(bundle.getModCount());

    Collection<Name> mixins = bundle.getMixinTypeNames();
    Collection<PropertyEntry> properties = bundle.getPropertyEntries();
    Collection<ChildNodeEntry> children = bundle.getChildNodeEntries();
    Collection<NodeId> sharedSet = bundle.getSharedSet();

    int mixinCount = mixins.size();
    int propertyCount = properties.size();
    int childCount = children.size();
    int sharedCount = sharedSet.size();

    // header byte with the capped counts and the referenceable flag
    int header = (Math.min(mixinCount, 1) << 7)
            | (Math.min(propertyCount, 7) << 4)
            | (Math.min(childCount, 3) << 2)
            | (Math.min(sharedCount, 1) << 1)
            | (bundle.isReferenceable() ? 1 : 0);
    out.writeByte(header);

    // mixin types
    writeVarInt(mixinCount, 1);
    for (Name mixin : mixins) {
        writeName(mixin);
    }

    // properties
    writeVarInt(propertyCount, 7);
    for (PropertyEntry entry : properties) {
        writeState(entry);
    }

    // child nodes (list of name/id pairs)
    writeVarInt(childCount, 3);
    for (ChildNodeEntry entry : children) {
        writeName(entry.getName());
        writeNodeId(entry.getId());
    }

    // shared set
    writeVarInt(sharedCount, 1);
    for (NodeId sharedId : sharedSet) {
        writeNodeId(sharedId);
    }

    // the size of the bundle is the number of bytes written by this call
    bundle.setSize(out.size() - start);
}
/**
 * Serializes a property entry. The serialization begins with the
 * property name followed by a single byte that encodes the type and
 * multi-valuedness of the property:
 * <pre>
 * +-------------------------------+
 * |   mv count    |     type      |
 * +-------------------------------+
 * </pre>
 * <p>
 * The lower four bits encode the property type (0-12 in JCR 2.0) and
 * higher bits indicate whether this is a multi-valued property and how
 * many property values there are. A value of 0 is reserved for
 * single-valued properties (that are guaranteed to always have just a
 * single value), and all non-zero values indicate a multi-valued property.
 * <p>
 * In multi-valued properties the exact value of the "mv count" field is
 * the number of property values plus one and truncated at 15 (the highest
 * four-bit value). If there are 14 or more (14 + 1 == 15) property values,
 * then the number of additional values is serialized as a variable-length
 * integer (see {@link #writeVarInt(int)}) right after this byte.
 * <p>
 * The modification count of the property state is written next as a
 * variable-length integer, followed by the serializations of all the
 * values of this property.
 * <p>
 * Note that serializing a binary value may replace the corresponding
 * element of the value array of the entry (see the binary handling below).
 *
 * @param state the property entry to store
 * @throws IOException if an I/O error occurs, if the property type does
 *         not fit in four bits or is not supported, or if a single-valued
 *         property does not have exactly one value. Failures reported by
 *         the value itself are attached as the cause.
 */
private void writeState(NodePropBundle.PropertyEntry state)
        throws IOException {
    writeName(state.getName());

    InternalValue[] values = state.getValues();
    int type = state.getType();
    if (type < 0 || type > 0xf) {
        throw new IOException("Illegal property type " + type);
    }

    if (state.isMultiValued()) {
        int len = values.length + 1;
        if (len < 0x0f) {
            out.writeByte(len << 4 | type);
        } else {
            // count does not fit in four bits, the remainder follows
            out.writeByte(0xf0 | type);
            writeVarInt(len - 0x0f);
        }
    } else {
        if (values.length != 1) {
            throw new IOException(
                    "Single values property with " + values.length + " values: " +
                    state.getName());
        }
        out.writeByte(type);
    }

    writeVarInt(state.getModCount());

    // values
    for (int i = 0; i < values.length; i++) {
        InternalValue val = values[i];
        switch (type) {
            case PropertyType.BINARY:
                writeBinary(state, values, i);
                break;
            case PropertyType.DOUBLE:
                try {
                    out.writeDouble(val.getDouble());
                } catch (RepositoryException e) {
                    // should never occur
                    throw new IOException("Unexpected error while writing DOUBLE value.", e);
                }
                break;
            case PropertyType.DECIMAL:
                try {
                    writeDecimal(val.getDecimal());
                } catch (RepositoryException e) {
                    // should never occur
                    throw new IOException("Unexpected error while writing DECIMAL value.", e);
                }
                break;
            case PropertyType.LONG:
                try {
                    writeVarLong(val.getLong());
                } catch (RepositoryException e) {
                    // should never occur
                    throw new IOException("Unexpected error while writing LONG value.", e);
                }
                break;
            case PropertyType.BOOLEAN:
                try {
                    out.writeBoolean(val.getBoolean());
                } catch (RepositoryException e) {
                    // should never occur
                    throw new IOException("Unexpected error while writing BOOLEAN value.", e);
                }
                break;
            case PropertyType.NAME:
                try {
                    writeName(val.getName());
                } catch (RepositoryException e) {
                    // should never occur
                    throw new IOException("Unexpected error while writing NAME value.", e);
                }
                break;
            case PropertyType.WEAKREFERENCE:
            case PropertyType.REFERENCE:
                writeNodeId(val.getNodeId());
                break;
            case PropertyType.DATE:
                try {
                    writeDate(val.getCalendar());
                } catch (RepositoryException e) {
                    // should never occur
                    throw new IOException("Unexpected error while writing DATE value.", e);
                }
                break;
            case PropertyType.STRING:
            case PropertyType.PATH:
            case PropertyType.URI:
                writeString(val.toString());
                break;
            default:
                throw new IOException("Unknown property type: " + type);
        }
    }
}

/**
 * Serializes the binary value at the given index of a property entry.
 * Depending on the value and the binding, one of the following is written:
 * <ul>
 * <li>value already in the data store: the
 *     {@link BundleBinding#BINARY_IN_DATA_STORE} marker and the string
 *     form of the value</li>
 * <li>binding has a data store: the value inline, see
 *     {@link #writeSmallBinary(InternalValue, NodePropBundle.PropertyEntry, int)}</li>
 * <li>negative length: a zero length; the value is replaced with an
 *     empty binary and a warning is logged</li>
 * <li>length above the minimum blob size of the binding: the
 *     {@link BundleBinding#BINARY_IN_BLOB_STORE} marker and the blob id;
 *     the value is spooled to the blob store first if the entry has no
 *     blob id for this index yet</li>
 * <li>otherwise: the value inline</li>
 * </ul>
 * In the last three cases, <code>values[i]</code> is replaced with a new
 * value instance and the previous instance is discarded. In the blob
 * store case this only happens when the value is newly spooled.
 *
 * @param state the property entry that owns the value
 * @param values the value array of the entry; element <code>i</code>
 *               may be replaced
 * @param i index of the value to serialize
 * @throws IOException if the value can not be read, stored or written;
 *         the underlying failure is attached as the cause
 */
private void writeBinary(
        NodePropBundle.PropertyEntry state, InternalValue[] values, int i)
        throws IOException {
    InternalValue val = values[i];
    try {
        long size = val.getLength();
        if (val.isInDataStore()) {
            out.writeInt(BundleBinding.BINARY_IN_DATA_STORE);
            writeString(val.toString());
        } else if (binding.dataStore != null) {
            writeSmallBinary(val, state, i);
        } else if (size < 0) {
            log.warn("Blob has negative size. Potential loss of data. "
                    + "id={} idx={}", state.getId(), String.valueOf(i));
            out.writeInt(0);
            values[i] = InternalValue.create(new byte[0]);
            val.discard();
        } else if (size > binding.getMinBlobSize()) {
            // special handling required for binary value:
            // spool binary value to file in blob store
            out.writeInt(BundleBinding.BINARY_IN_BLOB_STORE);
            String blobId = state.getBlobId(i);
            if (blobId == null) {
                BLOBStore blobStore = binding.getBlobStore();
                try {
                    InputStream in = val.getStream();
                    try {
                        blobId = blobStore.createId(state.getId(), i);
                        blobStore.put(blobId, in, size);
                        state.setBlobId(blobId, i);
                    } finally {
                        IOUtils.closeQuietly(in);
                    }
                } catch (Exception e) {
                    String msg = "Error while storing blob. id="
                            + state.getId() + " idx=" + i + " size=" + size;
                    log.error(msg, e);
                    throw new IOException(msg, e);
                }
                try {
                    // replace value instance with value
                    // backed by resource in blob store and delete temp file
                    if (blobStore instanceof ResourceBasedBLOBStore) {
                        values[i] = InternalValue.create(((ResourceBasedBLOBStore) blobStore).getResource(blobId));
                    } else {
                        values[i] = InternalValue.create(blobStore.get(blobId));
                    }
                } catch (Exception e) {
                    // best effort: the blob itself is stored, only the
                    // in-memory value could not be reloaded
                    log.error("Error while reloading blob. truncating. id="
                            + state.getId() + " idx=" + i + " size=" + size, e);
                    values[i] = InternalValue.create(new byte[0]);
                }
                val.discard();
            }
            // store id of blob as property value
            writeString(blobId); // value
        } else {
            // small enough to be stored inline in the bundle
            byte[] data = writeSmallBinary(val, state, i);
            // replace value instance with an in-memory value
            // and discard the previous instance
            values[i] = InternalValue.create(data);
            val.discard();
        }
    } catch (RepositoryException e) {
        String msg = "Error while storing blob. id="
                + state.getId() + " idx=" + i + " value=" + val;
        log.error(msg, e);
        throw new IOException(msg, e);
    }
}
/**
 * Writes a small binary value inline and returns its data. The length of
 * the value is written as a four-byte integer, followed by the bytes of
 * the value. The stream of the value is always closed.
 *
 * @param value the binary value
 * @param state the property state (for error messages)
 * @param i the index (for error messages)
 * @return the data
 * @throws IOException if the data could not be read or written; the
 *         underlying failure is attached as the cause
 */
private byte[] writeSmallBinary(
        InternalValue value, NodePropBundle.PropertyEntry state, int i)
        throws IOException {
    try {
        int size = (int) value.getLength();
        out.writeInt(size);
        byte[] data = new byte[size];
        DataInputStream in =
                new DataInputStream(value.getStream());
        try {
            in.readFully(data);
        } finally {
            IOUtils.closeQuietly(in);
        }
        out.write(data, 0, data.length);
        return data;
    } catch (Exception e) {
        String msg = "Error while storing blob. id="
                + state.getId() + " idx=" + i + " value=" + value;
        log.error(msg, e);
        // keep the original failure reachable for callers
        throw new IOException(msg, e);
    }
}
/**
 * Serializes a node identifier as two 64-bit values: the most significant
 * bits first, followed by the least significant bits.
 *
 * @param id the node id
 * @throws IOException if an I/O error occurs.
 */
private void writeNodeId(NodeId id) throws IOException {
    long high = id.getMostSignificantBits();
    long low = id.getLeastSignificantBits();
    out.writeLong(high);
    out.writeLong(low);
}
/**
 * Serializes a BigDecimal. A boolean presence flag is written first;
 * for a non-null decimal it is followed by the string form of the number.
 *
 * @param decimal the decimal number, possibly <code>null</code>
 * @throws IOException if an I/O error occurs.
 */
private void writeDecimal(BigDecimal decimal) throws IOException {
    boolean present = decimal != null;
    out.writeBoolean(present);
    if (present) {
        // TODO more efficient serialization format
        writeString(decimal.toString());
    }
}
/**
 * Serializes a name. The name encoding works as follows:
 * <p>
 * First; if the name is known by the {@link BundleNames} class (i.e.
 * {@link BundleNames#nameToIndex(Name)} returns an index for it), then
 * the name is serialized as a single byte using the following format.
 * <pre>
 * +-------------------------------+
 * | 0 |    common name index      |
 * +-------------------------------+
 * </pre>
 * <p>
 * Second; if the name is not known, it gets serialized as a
 * variable-length field whose first byte looks like this:
 * <pre>
 * +-------------------------------+
 * | 1 | ns index  |  name length  |
 * +-------------------------------+
 * </pre>
 * <p>
 * The three-bit namespace index identifies the namespace of the name.
 * The serializer keeps track of the default namespace (value 0) and at
 * most six other namespaces (values 1-6), in the order they appear
 * in the bundle. When one of these six custom namespaces first appears
 * in the bundle, then the namespace URI is written using
 * {@link #writeString(String)} right after this byte.
 * Later uses of such a namespace simply refer back to the already
 * written namespace URI string. Any other namespaces are identified with
 * value 7 and always written to the bundle after this byte.
 * <p>
 * The four-bit name length field indicates the length (in UTF-8 bytes)
 * of the local part of the name. Since zero-length local names are not
 * allowed, the length is first decremented by one before storing in this
 * field. The UTF-8 byte sequence is written out after this byte and the
 * possible namespace URI string. If the length of the local name is
 * larger than 15 (i.e. would be stored as 0x0f or more), then the value
 * 0x0f is stored as the name length and the name string is written as
 * UTF-8 using {@link #writeBytes(byte[], int)} with a base length of
 * 0x10 (0x0f + 1).
 *
 * @param name the name
 * @throws IOException if an I/O error occurs or the local name is empty.
 */
private void writeName(Name name) throws IOException {
    int index = BundleNames.nameToIndex(name);
    if (index != -1) {
        // common name: a single byte with the high bit cleared
        assert 0 <= index && index < 0x80;
        out.writeByte(index);
        return;
    }

    // Stop at the slot that holds this namespace, at the first free
    // slot, or past the end of the table if neither exists
    String uri = name.getNamespaceURI();
    int ns;
    for (ns = 0; ns < namespaces.length; ns++) {
        String known = namespaces[ns];
        if (known == null || known.equals(uri)) {
            break;
        }
    }

    String local = name.getLocalName();
    if (local.length() == 0) {
        throw new IOException("Attempt to write an empty local name: " + name);
    }

    byte[] utf8 = local.getBytes("UTF-8");
    int len = Math.min(utf8.length - 1, 0x0f);
    out.writeByte(0x80 | (ns << 4) | len);

    // The namespace URI follows unless it has already been written
    boolean unseen = ns == namespaces.length || namespaces[ns] == null;
    if (unseen) {
        writeString(uri);
        if (ns < namespaces.length) {
            // remember the namespace for later names in this bundle
            namespaces[ns] = uri;
        }
    }

    if (len == 0x0f) {
        // long name: explicit length relative to the base of 0x10
        writeBytes(utf8, 0x0f + 1);
    } else {
        // short name: the length is fully encoded in the first byte
        out.write(utf8);
    }
}
/**
 * Serializes an integer using a variable-length encoding that favors
 * small positive numbers. The serialization consists of one to five
 * bytes of the following format:
 * <pre>
 * +-------------------------------+
 * | c | 7 least significant bits  |
 * +-------------------------------+
 * </pre>
 * <p>
 * If the given integer fits in seven bits (i.e. the value is between
 * 0 and 127, inclusive), then it is written as-is in a single byte.
 * Otherwise the continuation flag <code>c</code> is set and the least
 * significant seven bits are written together with the flag as a single
 * byte. The integer is then shifted right (unsigned) by seven bits and
 * the process continues from the beginning.
 * <p>
 * This format uses a single byte for values 0-127, two bytes for
 * 128-16383, three for 16384-2097151, four for 2097152-268435455
 * and five bytes for all other 32-bit numbers (including negative ones).
 *
 * @param value integer value
 * @throws IOException if an I/O error occurs
 */
private void writeVarInt(int value) throws IOException {
    // emit continuation bytes while bits remain above the low seven
    while ((value & ~0x7f) != 0) {
        out.writeByte((value & 0x7f) | 0x80);
        value >>>= 7; // unsigned shift
    }
    // final byte, continuation flag cleared
    out.writeByte(value);
}
/**
 * Serializes the part of a count that exceeds the given base. Nothing is
 * written when the value is smaller than the base; otherwise the
 * difference <code>value - base</code> is written using
 * {@link #writeVarInt(int)}.
 * <p>
 * This is used by {@link #writeBundle(NodePropBundle)}, where the header
 * byte already stores each count capped at its base, so that only counts
 * that reach the cap need the extra variable-length remainder.
 *
 * @param value the full count
 * @param base the largest count representable without the remainder
 * @throws IOException if an I/O error occurs
 */
private void writeVarInt(int value, int base) throws IOException {
    if (value < base) {
        return;
    }
    writeVarInt(value - base);
}
/**
 * Serializes a long value using a variable-length encoding like the
 * one used by {@link #writeVarInt(int)} for integer values. Before
 * writing out, the value is first normalized to an unsigned value
 * by moving the sign to the lowest bit and inverting the other bits of
 * a negative value. This normalization step maximizes the number of
 * zero high order bits for typical small values (positive or negative),
 * and thus keeps the serialization short.
 *
 * @param value long value
 * @throws IOException if an I/O error occurs
 */
private void writeVarLong(long value) throws IOException {
    // Normalize to an unsigned value with the sign as the lowest bit
    long bits = value < 0 ? ((~value << 1) | 1) : (value << 1);

    // emit continuation bytes while bits remain above the low seven
    while ((bits & ~0x7fL) != 0) {
        out.writeByte((int) (bits & 0x7f) | 0x80);
        bits >>>= 7; // unsigned shift
    }
    // final byte, continuation flag cleared
    out.writeByte((int) bits);
}
/**
 * Serializes a JCR date value using the {@link #writeVarLong(long)}
 * serialization on a special 64-bit date encoding. This encoding maps
 * the <code>sYYYY-MM-DDThh:mm:ss.sssTZD</code> date format used by
 * JCR to an as small 64 bit integer (positive or negative) as possible,
 * while preserving full accuracy (including time zone offsets) and
 * favouring common levels of accuracy (per minute, hour and day) over
 * full millisecond level detail.
 * <p>
 * Each date value is mapped to separate timestamp and timezone fields,
 * both of whose lengths are variable:
 * <pre>
 * +----- ... ------- ... --+
 * |  timestamp  | timezone |
 * +----- ... ------- ... --+
 * </pre>
 * <p>
 * The type and length of the timezone field can be determined by looking
 * at the two least significant bits of the value:
 * <dl>
 *   <dt><code>?0</code></dt>
 *   <dd>
 *     UTC time. The length of the timezone field is just one bit,
 *     i.e. the second bit is already a part of the timestamp field.
 *   </dd>
 *   <dt><code>01</code></dt>
 *   <dd>
 *     The offset is counted as hours from UTC, and stored as the number
 *     of hours (positive or negative) in the next 5 bits (range from
 *     -16 to +15 hours), making the timezone field 7 bits long in total.
 *   </dd>
 *   <dt><code>11</code></dt>
 *   <dd>
 *     The offset is counted as hours and minutes from UTC, and stored
 *     as the total minute offset (positive or negative) in the next
 *     11 bits (range from -17 to +17 hours), making the timezone field
 *     13 bits long in total.
 *   </dd>
 * </dl>
 * <p>
 * The remaining 51-63 bits of the encoded value make up the timestamp
 * field that also uses the two least significant bits to indicate the
 * type and length of the field:
 * <dl>
 *   <dt><code>00</code></dt>
 *   <dd>
 *     <code>sYYYY-MM-DDT00:00:00.000</code>, i.e. midnight of the
 *     specified date. The next 9 bits encode the day within the year
 *     (starting from 1, maximum value 366) and the remaining bits are
 *     used for the year, stored as an offset from year 2010.
 *   </dd>
 *   <dt><code>01</code></dt>
 *   <dd>
 *     <code>sYYYY-MM-DDThh:00:00.000</code>, i.e. at the hour. The
 *     next 5 bits encode the hour within the day (starting from 0,
 *     maximum value 23) and the remaining bits are used as described
 *     above for the date.
 *   </dd>
 *   <dt><code>10</code></dt>
 *   <dd>
 *     <code>sYYYY-MM-DDThh:mm:00.000</code>, i.e. at the minute. The
 *     next 11 bits encode the minute within the day (starting from 0,
 *     maximum value 1439) and the remaining bits are used as described
 *     above for the date.
 *   </dd>
 *   <dt><code>11</code></dt>
 *   <dd>
 *     <code>sYYYY-MM-DDThh:mm:ss.sss</code>, i.e. full millisecond
 *     accuracy. The next 30 bits encode the millisecond within the
 *     day (starting from 0, maximum value 86399999) and the remaining
 *     bits are used as described above for the date.
 *   </dd>
 * </dl>
 * <p>
 * With full timezone and millisecond accuracies, this encoding leaves
 * 10 bits (64 - 9 - 30 - 2 - 11 - 2) for the year offset, which allows
 * for representation of all timestamps between years 1498 and 2521.
 * Timestamps outside this range and with a minute-level timezone offset
 * are automatically truncated to minute-level accuracy to support the
 * full range of years -9999 to 9999 specified in JCR.
 * <p>
 * Note that the year, day of year, and time of day values are stored
 * as separate bit sequences to avoid problems with changing leap second
 * or leap year definitions. Bit fields are used for better encoding and
 * decoding performance than what would be possible with the slightly more
 * space efficient mechanism of using multiplication and modulo divisions
 * to separate the different timestamp fields.
 *
 * @param value date value
 * @throws IOException if an I/O error occurs
 */
private void writeDate(Calendar value) throws IOException {
    int year = value.get(Calendar.YEAR);
    if (value.isSet(Calendar.ERA)
            && value.get(Calendar.ERA) == GregorianCalendar.BC) {
        year = 1 - year; // convert to an astronomical year
    }
    year -= 2010; // use a recent offset NOTE: do not change this!

    int dayOfYear = value.get(Calendar.DAY_OF_YEAR);
    int hour = value.get(Calendar.HOUR_OF_DAY);
    int minute = value.get(Calendar.MINUTE);
    int second = value.get(Calendar.SECOND);
    int millis = value.get(Calendar.MILLISECOND);

    // timezone offset in minutes, split into whole hours and the rest
    int offset = value.getTimeZone().getOffset(value.getTimeInMillis()) / (60 * 1000);
    int offsetHours = offset / 60;
    int offsetMinutes = offset - offsetHours * 60;

    // year offset and day of year, shared by all timestamp types
    long date = (year << 9) | (dayOfYear & 0x01ff);

    // millisecond accuracy is only kept when the year offset fits in
    // ten bits or the timezone does not need the minute-level field
    boolean subMinute = second != 0 || millis != 0;
    boolean shortYear = -512 <= year && year < 512;

    long timestamp;
    if (subMinute && (shortYear || offsetMinutes == 0)) {
        int millisOfDay = ((hour * 60 + minute) * 60 + second) * 1000 + millis;
        timestamp = (((date << 30) | (millisOfDay & 0x3fffffff)) << 2) | 3; // 30 bits
    } else if (minute != 0) {
        int minuteOfDay = hour * 60 + minute;
        timestamp = (((date << 11) | (minuteOfDay & 0x07ff)) << 2) | 2; // 11 bits
    } else if (hour != 0) {
        timestamp = (((date << 5) | (hour & 0x1f)) << 2) | 1; // 5 bits
    } else {
        timestamp = date << 2;
    }

    long encoded;
    if (offsetMinutes != 0) {
        // minute-level timezone offset
        encoded = (((timestamp << 11) | (offset & 0x07ff)) << 2) | 3; // 11 bits
    } else if (offsetHours != 0) {
        // hour-level timezone offset
        encoded = (((timestamp << 5) | (offsetHours & 0x1f)) << 2) | 1; // 5 bits
    } else {
        // UTC
        encoded = timestamp << 1;
    }
    writeVarLong(encoded);
}
/**
 * Serializes a string in UTF-8. The length of the UTF-8 byte sequence
 * is first written as a variable-length integer (see
 * {@link #writeVarInt(int)}), and then the sequence itself is written.
 *
 * @param value string value
 * @throws IOException if an I/O error occurs
 */
private void writeString(String value) throws IOException {
    byte[] utf8 = value.getBytes("UTF-8");
    writeBytes(utf8, 0);
}
/**
 * Serializes the given array of bytes. The length of the byte array,
 * reduced by the given base length, is first written as a
 * {@link #writeVarInt(int) variable length integer}, followed by all
 * the given bytes.
 *
 * @param bytes the bytes to be serialized
 * @param base base length that is subtracted from the written length;
 *             must not exceed the length of the array
 * @throws IOException if an I/O error occurs
 */
private void writeBytes(byte[] bytes, int base) throws IOException {
    int length = bytes.length;
    assert length >= base;
    writeVarInt(length - base);
    out.write(bytes, 0, length);
}
}