/**
* Copyright 2012 Akiban Technologies, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.persistit;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicLong;
import com.persistit.exception.DuplicateKeyException;
import com.persistit.exception.PersistitException;
import com.persistit.util.Util;
/**
* <p>
* A mechanism for optimizing the process of loading large sets of records with
* non-sequential keys. This class speeds up the process of inserting records
* into a set of Persistit <code>Tree</code>s by sorting them before inserting
* them. The sort process uses multiple "sort trees" in multiple temporary
* <code>Volume</code>s to hold copies of the data. These are then merged into
* the final "destination trees." Each sort tree is constrained to be small
* enough to fit in the {@link BufferPool}.
* </p>
* <h3>Background</h3>
* <p>
* In general, Persistit can store records very quickly, even when the keys of
* those records arrive in random order, as long as all the pages of the
* destination tree or trees are resident in the buffer pool. However, the
 * situation changes dramatically as soon as the destination tree or trees
* exceed the size of the buffer pool. Once that happens, insert performance
* degrades because the ratio of records inserted per disk I/O operation
* performed decreases. In a worst-case scenario, inserting each new key may
* require two or more disk I/O operations. These may occur because Persistit
* performs the following steps:
* <ul>
* <li>Look up the key requires reading the page containing that key from disk
* into the BufferPool.</li>
* <li>Reading the page requires a Buffer containing some other page to be
* evicted.</li>
* <li>The page being evicted is likely to be dirty and therefore Persistit must
* write its contents to disk before reusing the Buffer.</li>
* </ul>
 * Further, these disk I/O operations are usually at unrelated file
* positions and therefore may each require random seeks. As a result, inserting
* one key can take orders of magnitude longer once the tree no longer fits in
* the buffer pool.
* </p>
* <p>
* <code>TreeBuilder</code> mitigates that degradation by sorting the keys
* before inserting them into their final destination trees. To do so it builds
* a collection of bounded-size sort trees in temporary volumes. Then it
* performs a merge sort from those trees into the final destination tree or
* trees. This mechanism eliminates the problem that every key insertion
* requires two (or more) random disk I/O operations. However, it is still the
* case that every sort tree page must be written and read once, and every
* destination tree page must be written at least once. Therefore the I/O
* associated with TreeBuilder is reduced but not eliminated.
* </p>
* <p>
* TreeBuilder is effective if and only if (a) the keys arrive in random order,
* and (b) the data is significantly larger than available memory in the buffer
* pool. In general it is faster to insert the keys directly into the
* destination trees unless both of these conditions are true.
* </p>
* <h3>Using TreeBuilder</h3>
* <p>
* The following example demonstrates the fundamental operation of
* <code>TreeBuilder</code>: <code><pre>
* Exchange exchange = db.getExchange("myVolume", "myTree", true);
* TreeBuilder tb = new TreeBuilder(db);
* //
* // Insert the data into sort trees
* //
* while (<i>source has more data</i>) {
* exchange.to(<i>next key</i>).getValue().put(<i>next value</i>);
* tb.store(exchange);
* }
* //
* // Merge the data into myTree
* //
* tb.merge();
* </pre></code> Note that a TreeBuilder can pre-sort data for multiple
* destination trees. For example, it is possible to load and merge records for
* a table and its corresponding indexes in one pass using TreeBuilder. During
 * the merge operation the final destination <code>Tree</code>s are built in
 * sequence. By default that sequence is the order in which trees were first
 * stored into the TreeBuilder, but
* it is possible to customize TreeBuilder to change that order.
* </p>
* <p>
* Loading a large data set may take a long time under the best of
* circumstances. Therefore this class is designed to be extended by
* applications to support progress reporting, to control disk space allocation,
* to handle attempts to insert conflicting records with duplicate keys, etc.
* See the following methods which may be overridden to provide custom behavior:
* <ul>
* <li>{@link #reportSorted(long)} - report completion of N records inserts into
* sort trees</li>
* <li>{@link #reportMerged(long)} - report completion of N records merged</li>
* <li>{@link #duplicateKeyDetected(Tree, Key, Value, Value)} - handle detection
* of records inserted with duplicate keys</li>
* <li>{@link #beforeMergeKey(Exchange)} - allowing filtering or custom handling
* per record while merging</li>
* <li>{@link #afterMergeKey(Exchange)} - customizable behavior after merging
* one record</li>
* <li>{@link #beforeSortVolumeClosed(Volume, File)} - customizable behavior
* before closing a sort volume when full</li>
* <li>{@link #afterSortVolumeClose(Volume, File)} - customizable behavior after
* closing a sort volume when full</li>
* <li>{@link #getTreeComparator()} - return a custom Comparator to determine
* sequence in which trees are populated within the {@link #merge()} method
* </ul>
* </p>
*
* @author peter
*
*/
public class TreeBuilder {
// Fraction of the buffer pool's buffer count a sort volume may occupy before being spilled.
private final static float DEFAULT_BUFFER_POOL_FRACTION = 0.5f;
// Default number of keys between progress-report callbacks.
private final static long REPORT_REPORT_MULTIPLE = 1000000;
// SimpleDateFormat pattern used to build the default TreeBuilder name.
private final static String SDF = "yyyyMMddHHmm";
// Buffer size in bytes for the streams that spill and reload sort files.
private final static int STREAM_SIZE = 1024 * 1024;
// Name used to label sort files; defaults to a construction timestamp (see one-arg constructor).
private final String _name;
// Unique id distinguishing this builder's sort files from those of other builders.
private final long _uniqueId;
private final Persistit _persistit;
// Directories that receive sort files, used round-robin; see setSortTreeDirectories().
private final List<File> _directories = new ArrayList<File>();
// Page size of the temporary sort volumes.
private final int _pageSize;
// Maximum page count a sort volume may reach before being spilled to disk.
private final int _pageLimit;
// Count of keys stored into sort trees.
private final AtomicLong _sortedKeyCount = new AtomicLong();
// Count of keys merged into destination trees.
private final AtomicLong _mergedKeyCount = new AtomicLong();
// Progress-report interval; see setReportKeyCountMultiple().
private volatile long _reportKeyCountMultiple = REPORT_REPORT_MULTIPLE;
// Current temporary sort volume and the file it will be spilled to.
private Volume _sortVolume;
private File _sortFile;
// Destination trees in the order they were first stored.
private final List<Tree> _allTrees = new ArrayList<Tree>();
// Maps a sort-tree name ("_" + tree handle) back to its destination Tree.
private final Map<String, Tree> _sortTreeMap = new HashMap<String, Tree>();
// Number of sort files allocated so far; also the round-robin directory index.
private int _sortFileIndex;
// One merge Node per sort file; consumed by merge().
private final List<Node> _sortNodes = new ArrayList<Node>();
// Per-thread cache of one Exchange per destination tree, used during the store phase.
private final ThreadLocal<Map<Tree, Exchange>> _sortExchangeMapThreadLocal = new ThreadLocal<Map<Tree, Exchange>>() {
@Override
public Map<Tree, Exchange> initialValue() {
return new HashMap<Tree, Exchange>();
}
};
/**
 * Default tree ordering used when sorting _allTrees: trees compare by
 * their position in the _allTrees list, i.e. the order in which they were
 * first stored into this TreeBuilder, so sorting leaves the list
 * unchanged.
 */
private final Comparator<Tree> _defaultTreeComparator = new Comparator<Tree>() {
/**
 * Default implementation returns trees sorted in the order they were
 * added to the _allTrees list - in other words, sorting should leave
 * the list unchanged.
 *
 * @param a the first Tree
 * @param b the second Tree
 * @return negative, zero or positive per the Comparator contract
 */
@Override
public int compare(final Tree a, final Tree b) {
if (a == b) {
return 0;
}
return _allTrees.indexOf(a) - _allTrees.indexOf(b);
}
// Identity equality only: distinct comparator instances are never equal.
@Override
public boolean equals(final Object obj) {
return this == obj;
}
};
/**
 * One input of the final merge: wraps a single sort file and iterates its
 * records through a {@link StreamLoader}. The current record's destination
 * tree, key and value are cached in the Node. A Node whose _tree is null
 * has been exhausted and sorts after all live Nodes.
 */
private class Node implements Comparable<Node> {
// Destination tree of the current record; null once the stream is exhausted.
private Tree _tree;
// Key of the most recently loaded record.
private Key _key;
// Value of the most recently loaded record.
private Value _value;
// Next Node in a chain holding records with the same (tree, key); see insertNode().
private Node _duplicate;
// Index of this Node's sort file; lower index means its records were inserted earlier.
private final int _precedence;
private final File _file;
private StreamLoader _loader;
private Handler _handler;
// Set by Handler when the loader has delivered a data record.
private boolean _next;
/**
 * StreamLoader callback that captures each data record into the
 * enclosing Node and flags its arrival via _next.
 */
private class Handler extends StreamLoader.ImportHandler {
private Handler(final Persistit persistit) {
super(persistit);
}
@Override
protected void handleDataRecord(final Key key, final Value value) throws PersistitException {
// super._tree is the Tree most recently announced to the import handler.
Node.this._tree = super._tree;
_key = key;
_value = value;
_next = true;
}
}
private File getFile() {
return _file;
}
private Node(final File file, final int index) {
_file = file;
_precedence = index;
}
/**
 * Order Nodes by destination-tree sequence (position in _allTrees),
 * then by key. Exhausted Nodes (_tree == null) sort last.
 */
@Override
public int compareTo(final Node node) {
if (_tree == null) {
return node._tree == null ? 0 : 1;
}
if (node._tree == null) {
return -1;
}
if (_tree != node._tree) {
return _allTrees.indexOf(_tree) - _allTrees.indexOf(node._tree);
} else
return _key.compareTo(node._key);
}
// Renders this Node followed by its chain of same-key duplicates.
@Override
public String toString() {
Node n = this;
final StringBuilder sb = new StringBuilder();
while (n != null) {
if (sb.length() > 0) {
sb.append(",");
}
if (n._tree == null) {
sb.append("<end>");
} else {
sb.append("<" + (n._tree.getName() + n._key + "=" + n._value) + ">");
}
n = n._duplicate;
}
return sb.toString();
}
// Open this Node's sort file and prepare to stream records from it.
private void createStreamLoader() throws Exception {
_loader = new StreamLoader(_persistit, new DataInputStream(new BufferedInputStream(new FileInputStream(
_file), STREAM_SIZE)));
_handler = new Handler(_persistit);
}
/**
 * Advance to the next data record. Records that do not invoke
 * handleDataRecord (i.e. ones that leave _next false) are skipped; the
 * loader is closed when the stream is exhausted.
 *
 * @return true if a data record was loaded, false at end of stream
 */
private boolean next() throws Exception {
_next = false;
while (_loader.next(_handler) && !_next)
;
if (!_next) {
_loader.close();
}
return _next;
}
}
/**
 * StreamSaver that, while spilling a sort volume to a file, rewrites the
 * temporary sort tree's identity back to the destination Tree. Sort trees
 * are named "_" + treeHandle (see store()); _sortTreeMap translates that
 * name to the destination Tree whose volume/tree info records are written
 * ahead of the data.
 */
private class SortStreamSaver extends StreamSaver {
// NOTE(review): _sortTree is never assigned, so the remapping branch below
// executes for every record; confirm whether it was intended as a cache of
// the last-seen sort tree.
Tree _sortTree = null;
SortStreamSaver(final Persistit persistit, final DataOutputStream stream) {
super(persistit, stream);
}
@Override
protected void writeData(final Exchange exchange) throws IOException {
if (exchange.getTree() != _sortTree) {
// Map the sort tree's name back to its destination Tree.
final Tree source = _sortTreeMap.get(exchange.getTree().getName());
if (_lastVolume != source.getVolume()) {
writeVolumeInfo(source.getVolume());
_lastVolume = source.getVolume();
}
if (_lastTree != source) {
writeTreeInfo(source);
_lastTree = source;
}
}
writeData(exchange.getKey(), exchange.getValue());
_recordCount++;
}
}
/**
 * Construct a TreeBuilder with a default name (the construction timestamp
 * formatted as "yyyyMMddHHmm"), a computed page size and the default
 * buffer-pool fraction of 0.5.
 *
 * @param persistit the owning Persistit instance
 */
public TreeBuilder(final Persistit persistit) {
this(persistit, new SimpleDateFormat(SDF).format(new Date()), -1, DEFAULT_BUFFER_POOL_FRACTION);
}
/**
 * Construct a TreeBuilder.
 *
 * @param persistit the owning Persistit instance
 * @param name name used to label sort files
 * @param pageSize page size for temporary sort volumes; -1 selects a
 *            default via computePageSize
 * @param bufferPoolFraction fraction of the buffer pool's buffer count a
 *            sort volume may occupy before being spilled to disk
 */
public TreeBuilder(final Persistit persistit, final String name, final int pageSize, final float bufferPoolFraction) {
_name = name;
_uniqueId = persistit.unique();
_persistit = persistit;
_pageSize = pageSize == -1 ? computePageSize(persistit) : pageSize;
final int bufferCount = _persistit.getBufferPool(_pageSize).getBufferCount();
_pageLimit = (int) (bufferCount * bufferPoolFraction);
}
/**
 * Determine the page size for temporary sort volumes: the configured
 * temporary-volume page size when one is set, otherwise the largest page
 * size for which a buffer pool exists.
 *
 * @param persistit the owning Persistit instance
 * @return the page size to use for temporary sort volumes
 */
private int computePageSize(final Persistit persistit) {
    final int configured = persistit.getConfiguration().getTmpVolPageSize();
    if (configured != 0) {
        return configured;
    }
    // No explicit configuration: fall back to the largest buffer pool page size.
    int largest = 0;
    for (final int size : persistit.getBufferPoolHashMap().keySet()) {
        largest = Math.max(largest, size);
    }
    return largest;
}
/**
 * @return Name provided when this TreeBuilder was constructed. When the
 *         single-argument constructor was used, the name is the
 *         construction timestamp formatted as "yyyyMMddHHmm".
 */
public final String getName() {
return _name;
}
/**
 * Set how many keys must be inserted or merged between successive calls to
 * {@link #reportSorted(long)} or {@link #reportMerged(long)}.
 *
 * @param multiple progress-report interval; must be at least 1
 */
public final void setReportKeyCountMultiple(final long multiple) {
    final long checked = Util.rangeCheck(multiple, 1, Long.MAX_VALUE);
    _reportKeyCountMultiple = checked;
}

/**
 * @return Count of keys inserted or merged per call to
 *         {@link #reportSorted(long)} or {@link #reportMerged(long)}
 */
public final long getReportKeyCountMultiple() {
    return _reportKeyCountMultiple;
}
/**
 * @return Count of sort files/volumes that have been allocated while
 *         sorting keys
 */
public final synchronized int getSortFileCount() {
    return _sortFileIndex;
}

/**
 * @return Number of keys stored into sort trees so far
 */
public long getSortedKeyCount() {
    return _sortedKeyCount.get();
}

/**
 * @return Number of keys merged into destination trees so far
 */
public long getMergedKeyCount() {
    return _mergedKeyCount.get();
}

/**
 * @return Snapshot copy of the destination <code>Tree</code> instances;
 *         the underlying list grows as keys for new trees are stored.
 */
public synchronized final List<Tree> getTrees() {
    return new ArrayList<Tree>(_allTrees);
}
/**
 * <p>
 * Define a list of directories in which sort volumes will be created. This
 * method can be used to override the default value provided by
 * {@link Configuration#getTmpVolDir()} to control more closely where sort
 * trees will be stored. If the list is empty then the directory defined by
 * the <code>Configuration</code> will be used. If multiple directories are
 * declared then volumes will be allocated to them in round-robin fashion.
 * This technique can distribute large load sets over multiple volumes and
 * can allow for interleaved disk reads during the merge process.
 * </p>
 * <p>
 * If a <code>File</code> supplied to this method does not exist, an attempt
 * is made to create it as a directory. This method also attempts to create
 * and delete a file in each supplied directory to ensure that if there is a
 * file permission or other problem, it is detected immediately, rather than
 * much later during the sort process.
 * </p>
 *
 * @param directories
 *            List of <code>File</code> instances, each of which must be a
 *            directory
 * @throws IllegalArgumentException
 *             if a supplied file exists and is not a directory or cannot be
 *             created as a new directory
 * @throws IOException
 *             if an attempt to create a file in one of the supplied
 *             directories fails
 */
public final void setSortTreeDirectories(final List<File> directories) throws IOException {
if (directories == null || directories.isEmpty()) {
synchronized (this) {
_directories.clear();
}
} else {
/*
 * Make sure all supplied items are directories
 */
for (final File file : directories) {
if (file.exists() && !file.isDirectory()) {
throw new IllegalArgumentException(file + " is not a directory");
}
}
/*
 * Make sure all directories exist
 */
for (final File file : directories) {
if (!file.exists() && !file.mkdirs()) {
throw new IllegalArgumentException(file + " could not be created as a new directory");
}
}
/*
 * Make sure all directories permit creation of a new file
 */
for (final File file : directories) {
final File temp = File.createTempFile(VolumeStorageT2.TEMP_FILE_PREFIX, null, file);
// The probe file only verifies write permission; its deletion is best-effort.
temp.delete();
}
synchronized (this) {
_directories.clear();
_directories.addAll(directories);
// Restart round-robin assignment over the new directory list.
_sortFileIndex = 0;
}
}
}
/**
 * @return Read-only view of the directories set via the
 *         {@link #setSortTreeDirectories(List)} method.
 */
public final List<File> getSortFileDirectories() {
    final List<File> view = Collections.unmodifiableList(_directories);
    return view;
}
/**
 * Store a key-value pair into a sort tree. The {@link Tree}, {@link Key}
 * and {@link Value} are all taken from the supplied {@link Exchange}.
 *
 * @param exchange
 *            The Exchange supplying the tree, key and value
 * @throws Exception propagated from the three-argument store method
 */
public final void store(final Exchange exchange) throws Exception {
    final Tree destination = exchange.getTree();
    store(destination, exchange.getKey(), exchange.getValue());
}
/**
 * Store a key-value pair for a specified <code>Tree</code> into a sort
 * tree. Each thread caches one Exchange per destination tree; when the
 * current sort volume has grown beyond the page limit, a new sort tree in
 * a fresh sort volume is started for that destination tree.
 *
 * @param tree
 *            the destination Tree
 * @param key
 *            the Key
 * @param value
 *            the Value
 * @throws Exception
 *             propagated from volume allocation, the store itself, or the
 *             duplicate-key policy
 */
public final void store(final Tree tree, final Key key, final Value value) throws Exception {
final Map<Tree, Exchange> map = _sortExchangeMapThreadLocal.get();
Exchange ex = map.get(tree);
if (ex == null || ex.getTree().getVolume().getNextAvailablePage() > _pageLimit) {
// Need a new sort tree: name it after the destination tree's journal handle.
final Volume newSortVolume = getSortVolume();
final String tempTreeName = "_" + _persistit.getJournalManager().handleForTree(tree);
ex = _persistit.getExchange(newSortVolume, tempTreeName, true);
map.put(tree, ex);
synchronized (this) {
if (!_allTrees.contains(tree)) {
_allTrees.add(tree);
_sortTreeMap.put(tempTreeName, tree);
}
}
}
key.copyTo(ex.getKey());
value.copyTo(ex.getValue());
// fetchAndStore writes the new value; any previous value ends up in ex.getValue().
ex.fetchAndStore();
boolean stored = true;
if (ex.getValue().isDefined()) {
// Duplicate within this sort tree: the policy decides which value wins.
if (!duplicateKeyDetected(ex.getTree(), ex.getKey(), ex.getValue(), value)) {
// Keep the first value: write the previous value back and don't count this key.
stored = false;
ex.store();
}
}
if (stored) {
final long count = _sortedKeyCount.incrementAndGet();
if ((count % _reportKeyCountMultiple) == 0) {
reportSorted(count);
}
}
}
/**
 * Insert a Node into the merge map. If another Node already holds a record
 * with the same (tree, key), the duplicate-key policy decides which record
 * remains in the map; the loser is linked onto the winner's _duplicate
 * chain so that its stream is still advanced later by merge().
 *
 * @param sorted the merge map, ordered by (tree, key)
 * @param node the Node whose current record is to be inserted
 * @throws Exception propagated from duplicateKeyDetected
 */
private void insertNode(final Map<Node, Node> sorted, final Node node) throws Exception {
final Node other = sorted.put(node, node);
if (other != null) {
// Lower precedence = earlier sort file = value inserted first.
// duplicateKeyDetected(tree, key, v1, v2) takes values in insertion order
// and returns true when v2 should replace v1. "reverse" means the
// previously-present Node (other) should win.
final boolean reverse;
if (node._precedence < other._precedence) {
reverse = duplicateKeyDetected(node._tree, node._key, node._value, other._value);
} else {
reverse = !duplicateKeyDetected(node._tree, node._key, other._value, node._value);
}
if (reverse) {
// Reinstate the displaced Node as the mapped value and chain the loser
// at the head of its duplicate list.
sorted.put(node, other);
final Node p = other._duplicate;
other._duplicate = node;
node._duplicate = p;
} else {
// The new Node wins; chain the displaced Node behind it.
node._duplicate = other;
}
}
}
/**
 * Merge the records previously stored in sort volumes into their
 * destination <code>Tree</code>s. Performs an N-way merge across all sort
 * files: the Node holding the smallest (tree, key) is repeatedly removed
 * from a TreeMap, its record is stored into the destination tree, and the
 * Node (plus any same-key duplicates chained to it) is advanced and
 * reinserted. When the map drains, {@link #reset()} releases all temporary
 * resources.
 *
 * @throws Exception
 *             if a sort file cannot be read, a store fails, or the
 *             duplicate-key policy throws
 */
public synchronized void merge() throws Exception {
    finishSortVolume();
    /*
     * Emit a final store-phase progress report for any residue that did not
     * land exactly on the reporting multiple.
     *
     * Bug fix: this previously tested and reported _mergedKeyCount, which
     * is always zero here, so the final reportSorted() call never fired.
     */
    if ((_sortedKeyCount.get() % _reportKeyCountMultiple) != 0) {
        reportSorted(_sortedKeyCount.get());
    }
    Tree currentTree = null;
    Exchange ex = null;
    final SortedMap<Node, Node> sorted = new TreeMap<Node, Node>();
    // Prime the merge map with the first record from each sort file.
    for (final Node node : _sortNodes) {
        node.createStreamLoader();
        if (node.next()) {
            insertNode(sorted, node);
        }
    }
    for (;;) {
        if (sorted.isEmpty()) {
            break;
        }
        // remove() returns the mapped Node, which may differ from the key
        // Node when insertNode() reinstated a duplicate winner.
        Node node = sorted.firstKey();
        node = sorted.remove(node);
        if (node._tree != currentTree) {
            // Crossed into the next destination tree: switch Exchanges.
            ex = new Exchange(node._tree);
            currentTree = node._tree;
        }
        node._key.copyTo(ex.getKey());
        node._value.copyTo(ex.getValue());
        if (beforeMergeKey(ex)) {
            // fetchAndStore leaves any pre-existing value in ex.getValue().
            ex.fetchAndStore();
            boolean stored = true;
            if (ex.getValue().isDefined()) {
                // Key already present in the destination tree: apply the
                // duplicate policy; restore the prior value if it wins.
                if (!duplicateKeyDetected(ex.getTree(), ex.getKey(), ex.getValue(), node._value)) {
                    ex.store();
                    stored = false;
                }
            }
            if (stored) {
                afterMergeKey(ex);
                if ((_mergedKeyCount.incrementAndGet() % _reportKeyCountMultiple) == 0) {
                    reportMerged(_mergedKeyCount.get());
                }
            }
        }
        // Advance this Node and every duplicate chained behind it,
        // reinserting each one that still has records.
        while (node != null) {
            final Node next = node._duplicate;
            node._duplicate = null;
            if (node.next()) {
                insertNode(sorted, node);
            }
            node = next;
        }
    }
    // Final merge-phase report for any residue below the multiple.
    if ((_mergedKeyCount.get() % _reportKeyCountMultiple) != 0) {
        reportMerged(_mergedKeyCount.get());
    }
    reset();
}
/**
 * Release all temporary resources: close the current sort volume, delete
 * all sort files, and clear the per-run state. The first exception
 * encountered is remembered and rethrown only after cleanup completes, so
 * one failure does not prevent the remaining files from being deleted.
 * The sorted/merged key counters are NOT reset here; see {@link #clear()}.
 *
 * @throws Exception the first exception encountered during cleanup
 */
private synchronized void reset() throws Exception {
Exception exception = null;
try {
if (_sortVolume != null) {
_sortVolume.close();
}
} catch (final PersistitException e) {
if (exception == null) {
exception = e;
}
}
for (final Node node : _sortNodes) {
try {
if (node.getFile() != null) {
// File.delete() returns false rather than throwing on failure;
// failures here are silently ignored.
node.getFile().delete();
}
} catch (final Exception e) {
if (exception == null) {
exception = e;
}
}
}
_allTrees.clear();
_sortNodes.clear();
_sortVolume = null;
_sortFileIndex = 0;
// NOTE(review): this clears only the calling thread's Exchange cache;
// Exchanges cached by other threads are left behind — confirm intended.
_sortExchangeMapThreadLocal.get().clear();
if (exception != null) {
throw exception;
}
}
/**
 * Discard all sorted data and reset the sorted/merged key counters, making
 * this TreeBuilder ready for reuse.
 *
 * @throws Exception propagated from {@link #reset()} cleanup
 */
public void clear() throws Exception {
_sortedKeyCount.set(0);
_mergedKeyCount.set(0);
reset();
}
/**
 * Return the current sort volume, first spilling and replacing it if it
 * has grown beyond the page limit. When a new volume is created this also
 * allocates the File that will eventually receive the volume's records and
 * registers a merge Node for that file.
 *
 * @return the Volume into which sort trees should currently be written
 * @throws Exception propagated from spilling or volume creation
 */
private synchronized Volume getSortVolume() throws Exception {
if (_sortVolume != null && _sortVolume.getNextAvailablePage() > _pageLimit) {
finishSortVolume();
}
if (_sortVolume == null) {
final File directory;
if (_directories.isEmpty()) {
// No directories configured: fall back to the configured temporary
// volume directory, or the JVM's java.io.tmpdir.
String directoryName = _persistit.getConfiguration().getTmpVolDir();
if (directoryName == null) {
directoryName = System.getProperty("java.io.tmpdir");
}
directory = new File(directoryName);
if (!directory.exists()) {
directory.mkdirs();
}
_directories.add(directory);
} else {
// Round-robin across the configured directories.
directory = _directories.get(_sortFileIndex % _directories.size());
}
_sortVolume = Volume.createTemporaryVolume(_persistit, _pageSize, directory);
// Sort file name: <name>_<uniqueId>.<six-digit file index>
_sortFile = new File(directory, String.format("%s_%d.%06d", _name, _uniqueId, _sortFileIndex));
final Node node = new Node(_sortFile, _sortFileIndex);
_sortNodes.add(node);
_sortFileIndex++;
}
return _sortVolume;
}
/**
 * Spill the current sort volume (if any) to its sort file and close it,
 * invoking the application hooks before and after the save.
 *
 * @throws Exception propagated from the hooks, the save, or the close
 */
private void finishSortVolume() throws Exception {
if (_sortVolume != null) {
beforeSortVolumeClosed(_sortVolume, _sortFile);
saveSortVolume(_sortVolume, _sortFile);
afterSortVolumeClose(_sortVolume, _sortFile);
_sortVolume.close();
_sortVolume = null;
}
}
/**
 * Stream the contents of a full sort volume to the supplied file. Sort
 * trees are written in the order determined by {@link #getTreeComparator()}
 * so that each sort file is already in final merge order per tree.
 *
 * @param volume the temporary sort volume being spilled
 * @param file the sort file to write
 * @throws Exception propagated from stream or tree operations
 */
private void saveSortVolume(final Volume volume, final File file) throws Exception {
    final DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(file),
            STREAM_SIZE));
    // Bug fix: the stream previously leaked if save() threw; close it in
    // a finally block.
    try {
        final List<Tree> sorted = new ArrayList<Tree>(_allTrees);
        Collections.sort(sorted, getTreeComparator());
        final StreamSaver saver = new SortStreamSaver(_persistit, dos);
        for (final Tree tree : sorted) {
            // Sort-tree names are "_" + handle. NOTE(review): store() derives the
            // handle via JournalManager.handleForTree(tree) while this uses
            // tree.getHandle() — presumably the same value; confirm.
            final String sortTreeName = "_" + tree.getHandle();
            final Tree sortTree = volume.getTree(sortTreeName, false);
            if (sortTree != null) {
                final Exchange exchange = new Exchange(sortTree);
                saver.save(exchange, null);
            }
        }
        // Best-effort cleanup if the JVM exits before merge()/reset() deletes the file.
        file.deleteOnExit();
    } finally {
        dos.close();
    }
}
/**
 * This method may be extended to provide application-specific behavior when
 * a sort volume has been filled to capacity. Subsequent to this call, the
 * sort volume is streamed to a sort file and then its pages in the
 * <code>BufferPool</code> are invalidated to allow their immediate reuse.
 * The default implementation does nothing.
 *
 * @param volume
 *            The temporary <code>Volume</code> that has been filled
 * @param file
 *            the file to which the sorted key-value pairs will be written
 * @throws Exception
 */
protected void beforeSortVolumeClosed(final Volume volume, final File file) throws Exception {
}
/**
 * This method may be extended to provide application-specific reporting
 * functionality after a sort volume has been filled to capacity and has
 * been evicted. An application may also modify the temporary directory set
 * via {@link #setSortTreeDirectories(List)} within this method if necessary
 * to adjust disk space utilization, for example. The default behavior of
 * this method is to do nothing.
 *
 * @param volume
 *            The temporary <code>Volume</code> that has been filled
 * @param file
 *            the file to which the sorted key-value pairs have been written
 * @throws Exception
 */
protected void afterSortVolumeClose(final Volume volume, final File file) throws Exception {
}
/**
 * <p>
 * This method may be extended to provide application-specific behavior when
 * an attempt is made to store or merge records with duplicate keys. The two
 * <code>Value</code>s v1 and v2 are provided in the order they were
 * inserted into the <code>TreeBuilder</code>. The default implementation
 * throws a {@link DuplicateKeyException}.
 * </p>
 *
 * @param tree
 *            the <code>Tree</code> to which a key is being merged
 * @param key
 *            the <code>Key</code>
 * @param v1
 *            the <code>Value</code> previously inserted
 * @param v2
 *            the conflicting <code>Value</code>
 * @return <code>true</code> to replace the value previously stored,
 *         <code>false</code> to leave the value first inserted and ignore
 *         the new value.
 * @throws DuplicateKeyException
 *             if a key being inserted or merged matches a key that already
 *             exists (the default behavior)
 * @throws Exception
 */
protected boolean duplicateKeyDetected(final Tree tree, final Key key, final Value v1, final Value v2)
throws Exception {
throw new DuplicateKeyException(String.format("Tree=%s Key=%s", tree, key));
}
/**
 * This method may be extended to provide alternative functionality. The
 * default implementation merely returns <code>true</code>, which signifies
 * that the key-value pair represented in the <code>Exchange</code> should
 * be merged into the destination <code>Tree</code>. A custom implementation
 * could be used to filter out unwanted records or to emit records to a
 * different destination.
 *
 * @param exchange
 *            represents the key-value pair proposed for merging
 * @return <code>true</code> to allow the record to be merged
 * @throws Exception
 */
protected boolean beforeMergeKey(final Exchange exchange) throws Exception {
return true;
}
/**
 * This method may be extended to provide custom behavior after merging one
 * record. The default implementation does nothing. This method is called
 * only if the corresponding call to {@link #beforeMergeKey(Exchange)}
 * returned <code>true</code> and the record was actually stored.
 *
 * @param exchange
 *            represents the key-value pair that was merged.
 * @throws Exception
 */
protected void afterMergeKey(final Exchange exchange) throws Exception {
}
/**
 * This method may be extended to provide application-specific progress
 * reports. By default it does nothing. This method is called after
 * inserting a number of records into sort trees. The method
 * {@link #setReportKeyCountMultiple(long)} determines the frequency at
 * which this method is called.
 *
 * @param count
 *            The total number of records stored into sort trees so far.
 */
protected void reportSorted(final long count) {
}
/**
 * This method may be extended to provide application-specific progress
 * reports. By default it does nothing. This method is called after merging
 * a number of records into destination trees. The method
 * {@link #setReportKeyCountMultiple(long)} determines the frequency at
 * which this method is called.
 *
 * @param count
 *            The total number of records that have been merged so far.
 */
protected void reportMerged(final long count) {
}
/**
 * This method may be extended to provide an application-specific ordering
 * on <code>Tree</code>s. This ordering determines the sequence in which
 * destination trees are built from the sort data. By default, trees are
 * built in the order in which they were first stored into this
 * TreeBuilder (the default comparator preserves the _allTrees list
 * order). An application may choose a different order to ensure invariants
 * for concurrent use.
 *
 * @return a <code>java.util.Comparator</code> on <code>Tree</code>
 */
protected Comparator<Tree> getTreeComparator() {
return _defaultTreeComparator;
}
/**
 * Test hook: spill and close the current sort volume so that the next
 * store() begins a new sort file. Package-private; for unit tests only.
 *
 * @throws Exception propagated from finishSortVolume()
 */
void unitTestNextSortFile() throws Exception {
finishSortVolume();
_sortExchangeMapThreadLocal.get().clear();
_sortVolume = null;
}
}