/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.db;
import java.io.File;
import java.io.IOError;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.config.ConfigurationException;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.config.KSMetaData;
import org.apache.cassandra.db.commitlog.CommitLog;
import org.apache.cassandra.db.filter.QueryFilter;
import org.apache.cassandra.db.filter.QueryPath;
import org.apache.cassandra.dht.LocalToken;
import org.apache.cassandra.io.CompactionInfo;
import org.apache.cassandra.io.CompactionType;
import org.apache.cassandra.io.sstable.ReducingKeyIterator;
import org.apache.cassandra.io.sstable.SSTableDeletingReference;
import org.apache.cassandra.io.sstable.SSTableReader;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.locator.AbstractReplicationStrategy;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.NodeId;
import org.cliffc.high_scale_lib.NonBlockingHashMap;
public class Table
{
/** Name of the system table; forceCleanup refuses to operate on it. */
public static final String SYSTEM_TABLE = "system";
private static final Logger logger = LoggerFactory.getLogger(Table.class);
/** Name of the per-table subdirectory under which snapshot directories are looked up. */
private static final String SNAPSHOT_SUBDIR_NAME = "snapshots";
/**
* Accesses to CFS.memtable should acquire this for thread safety.
* maybeSwitchMemtable should acquire the writeLock; see that method for the full explanation.
* Within this class, apply() and IndexBuilder.build() hold the readLock while they update memtables.
*
* (Enabling fairness in the RRWL is observed to decrease throughput, so we leave it off.)
*/
static final ReentrantReadWriteLock switchLock = new ReentrantReadWriteLock();
// It is possible to call Table.open without a running daemon, so it makes sense to ensure
// proper directories here as well as in CassandraDaemon.
// Directory creation is skipped in client mode. An IOException raised while creating the
// directories is rethrown as an IOError, since a static initializer cannot throw checked exceptions.
static
{
if (!StorageService.instance.isClientMode())
{
try
{
DatabaseDescriptor.createAllDirectories();
}
catch (IOException ex)
{
throw new IOError(ex);
}
}
}
/** Table objects, one per keyspace. Only one instance should ever exist for any given keyspace. */
private static final Map<String, Table> instances = new NonBlockingHashMap<String, Table>();
/* Table name. */
public final String name;
/* ColumnFamilyStore per column family, keyed by column family id */
private final Map<Integer, ColumnFamilyStore> columnFamilyStores = new ConcurrentHashMap<Integer, ColumnFamilyStore>();
/* Fixed pool of monitor objects; indexLockFor() picks one from a row key's hash code */
private final Object[] indexLocks;
/* Handle of the periodic forceFlushIfExpired task scheduled by the constructor; cancelled in clear() */
private ScheduledFuture<?> flushTask;
/* Replication strategy of this table; (re)assigned by createReplicationStrategy() */
private volatile AbstractReplicationStrategy replicationStrategy;
/**
 * Returns the Table instance for the given keyspace, constructing and caching it on first use.
 *
 * @param table name of the keyspace to open
 * @return the cached instance, or a freshly built one whose column family caches have been initialized
 */
public static Table open(String table)
{
    Table cached = instances.get(table);
    if (cached != null)
        return cached;

    // Construction must happen only once per keyspace. putIfAbsent could not guarantee that,
    // so take the class lock and look again before building anything.
    synchronized (Table.class)
    {
        Table opened = instances.get(table);
        if (opened == null)
        {
            opened = new Table(table);
            instances.put(table, opened);
            // the table has to be constructed and in the cache before cacheRow can be called
            for (ColumnFamilyStore store : opened.getColumnFamilyStores())
                store.initCaches();
        }
        return opened;
    }
}
/**
 * Removes the cached Table for the given keyspace, cancels its periodic flush task and
 * unloads each of its column family stores.
 *
 * @param table name of the keyspace to evict
 * @return the evicted instance, or null if none was cached
 * @throws IOException if unloading a column family store fails
 */
public static Table clear(String table) throws IOException
{
    synchronized (Table.class)
    {
        Table evicted = instances.remove(table);
        if (evicted == null)
            return null;

        evicted.flushTask.cancel(false);
        for (ColumnFamilyStore store : evicted.getColumnFamilyStores())
            evicted.unloadCf(store);
        return evicted;
    }
}
/**
 * @return a read-only view of the column family stores currently registered with this table
 */
public Collection<ColumnFamilyStore> getColumnFamilyStores()
{
    Collection<ColumnFamilyStore> stores = columnFamilyStores.values();
    return Collections.unmodifiableCollection(stores);
}
/**
 * Looks up a column family store of this table by column family name.
 *
 * @param cfName column family name
 * @return the matching store
 * @throws IllegalArgumentException if no id is known for the name, or no store is registered for the id
 */
public ColumnFamilyStore getColumnFamilyStore(String cfName)
{
    Integer id = CFMetaData.getId(name, cfName);
    if (id != null)
        return getColumnFamilyStore(id);
    throw new IllegalArgumentException(String.format("Unknown table/cf pair (%s.%s)", name, cfName));
}
/**
 * Looks up a column family store of this table by column family id.
 *
 * @param id column family id
 * @return the matching store
 * @throws IllegalArgumentException if no store is registered under the id
 */
public ColumnFamilyStore getColumnFamilyStore(Integer id)
{
    ColumnFamilyStore store = columnFamilyStores.get(id);
    if (store != null)
        return store;
    throw new IllegalArgumentException("Unknown CF " + id);
}
/**
 * Runs a cleanup of keys that do not belong locally on every column family of this table.
 * Column families are processed from smallest to largest on-disk size, so that space freed
 * by the small ones is available while the larger ones are cleaned.
 *
 * @param renewer handed unchanged to each ColumnFamilyStore.forceCleanup call
 * @throws UnsupportedOperationException if this is the system table
 */
public void forceCleanup(NodeId.OneShotRenewer renewer) throws IOException, ExecutionException, InterruptedException
{
    if (name.equals(SYSTEM_TABLE))
        throw new UnsupportedOperationException("Cleanup of the system table is neither necessary nor wise");

    // Order by disk usage; ties are broken by name (arbitrary but deterministic).
    Comparator<ColumnFamilyStore> bySizeThenName = new Comparator<ColumnFamilyStore>()
    {
        public int compare(ColumnFamilyStore cf1, ColumnFamilyStore cf2)
        {
            long delta = cf1.getTotalDiskSpaceUsed() - cf2.getTotalDiskSpaceUsed();
            if (delta != 0)
                return delta > 0 ? 1 : -1;
            return cf1.columnFamily.compareTo(cf2.columnFamily);
        }
    };

    List<ColumnFamilyStore> ordered = new ArrayList<ColumnFamilyStore>(columnFamilyStores.values());
    Collections.sort(ordered, bySizeThenName);

    for (ColumnFamilyStore store : ordered)
        store.forceCleanup(renewer);
}
/**
 * Takes a snapshot of every column family of this table under the given snapshot name.
 *
 * @param snapshotName name handed to each column family store's snapshot call
 */
public void snapshot(String snapshotName)
{
    for (ColumnFamilyStore store : columnFamilyStores.values())
        store.snapshot(snapshotName);
}
/**
 * Builds a snapshot name from the current time in milliseconds, optionally suffixed with a client tag.
 *
 * @param clientSuppliedName optional tag; may be null or empty, in which case no suffix is added
 * @return the current time in milliseconds, followed by "-" and the tag when one was supplied
 */
public static String getTimestampedSnapshotName(String clientSuppliedName)
{
    String timestamp = Long.toString(System.currentTimeMillis());
    boolean hasTag = clientSuppliedName != null && !clientSuppliedName.equals("");
    return hasTag ? timestamp + "-" + clientSuppliedName : timestamp;
}
/**
 * Checks whether a snapshot directory with the given name exists for this table
 * in any of the configured data file locations.
 *
 * @param snapshotName the user supplied snapshot name
 * @return true if the snapshot exists in at least one data directory
 */
public boolean snapshotExists(String snapshotName)
{
    for (String dataDirPath : DatabaseDescriptor.getAllDataFileLocations())
    {
        // build the path with the shared helper so every snapshot method agrees on the layout
        File snapshot = new File(getSnapshotPath(dataDirPath, name, snapshotName));
        if (snapshot.exists())
            return true;
    }
    return false;
}
/**
 * Deletes snapshot data of this table from every configured data file location.
 *
 * @param tag name of the snapshot to remove; if it is empty, the path resolves to the
 *            snapshots directory itself, so all snapshots of this table are removed
 * @throws IOException if a snapshot directory cannot be deleted
 */
public void clearSnapshot(String tag) throws IOException
{
    for (String dataDirPath : DatabaseDescriptor.getAllDataFileLocations())
    {
        // build the path with the shared helper so every snapshot method agrees on the layout
        String snapshotPath = getSnapshotPath(dataDirPath, name, tag);
        File snapshotDir = new File(snapshotPath);
        if (!snapshotDir.exists())
            continue;

        logger.debug("Removing snapshot directory {}", snapshotPath);
        FileUtils.deleteRecursive(snapshotDir);
    }
}
/**
 * Collects the SSTables of every column family store of this table into a new list.
 *
 * @return A list of open SSTableReaders (TODO: ensure that the caller doesn't modify these).
 */
public List<SSTableReader> getAllSSTables()
{
    List<SSTableReader> readers = new ArrayList<SSTableReader>();
    for (ColumnFamilyStore store : columnFamilyStores.values())
    {
        readers.addAll(store.getSSTables());
    }
    return readers;
}
/**
* Builds the Table for the given keyspace: creates its replication strategy, allocates the index
* locks, prepares the keyspace directory in each data file location, initializes a
* ColumnFamilyStore per column family definition and schedules the periodic forceFlushIfExpired check.
* Private; within this file instances are created only by Table.open.
*
* @param table keyspace name; DatabaseDescriptor must know its metadata (checked by assertion only)
*/
private Table(String table)
{
name = table;
KSMetaData ksm = DatabaseDescriptor.getKSMetaData(table);
assert ksm != null : "Unknown keyspace " + table;
try
{
createReplicationStrategy(ksm);
}
catch (ConfigurationException e)
{
// a constructor reached through Table.open cannot surface the checked exception; wrap it
throw new RuntimeException(e);
}
// lock pool size scales with the configured number of concurrent writers
indexLocks = new Object[DatabaseDescriptor.getConcurrentWriters() * 128];
for (int i = 0; i < indexLocks.length; i++)
indexLocks[i] = new Object();
// create data directories.
for (String dataDir : DatabaseDescriptor.getAllDataFileLocations())
{
try
{
String keyspaceDir = dataDir + File.separator + table;
// the keyspace directory is only created when not in client mode ...
if (!StorageService.instance.isClientMode())
FileUtils.createDirectory(keyspaceDir);
// ... but the deprecated streaming directory is removed in either mode, if present.
// remove the deprecated streaming directory.
File streamingDir = new File(keyspaceDir, "stream");
if (streamingDir.exists())
FileUtils.deleteRecursive(streamingDir);
}
catch (IOException ex)
{
throw new IOError(ex);
}
}
// iterate over a copy of the column family definitions rather than the live values() view
for (CFMetaData cfm : new ArrayList<CFMetaData>(DatabaseDescriptor.getTableDefinition(table).cfMetaData().values()))
{
logger.debug("Initializing {}.{}", name, cfm.cfName);
initCf(cfm.cfId, cfm.cfName);
}
// periodic task: ask every column family store to flush if its memtable has expired
Runnable runnable = new Runnable()
{
public void run()
{
for (ColumnFamilyStore cfs : columnFamilyStores.values())
{
cfs.forceFlushIfExpired();
}
}
};
// first run after 10 seconds, then 10 seconds after each run completes
flushTask = StorageService.scheduledTasks.scheduleWithFixedDelay(runnable, 10, 10, TimeUnit.SECONDS);
}
/**
 * Creates the replication strategy described by the given keyspace metadata and installs it
 * on this table. Any strategy that was installed before is unregistered from the token metadata first.
 *
 * @param ksm keyspace metadata supplying the name, strategy class and strategy options
 * @throws ConfigurationException if the strategy cannot be created
 */
public void createReplicationStrategy(KSMetaData ksm) throws ConfigurationException
{
    if (replicationStrategy != null)
    {
        StorageService.instance.getTokenMetadata().unregister(replicationStrategy);
    }

    AbstractReplicationStrategy created =
        AbstractReplicationStrategy.createReplicationStrategy(ksm.name,
                                                              ksm.strategyClass,
                                                              StorageService.instance.getTokenMetadata(),
                                                              DatabaseDescriptor.getEndpointSnitch(),
                                                              ksm.strategyOptions);
    replicationStrategy = created;
}
/**
 * Removes a column family from this table: the store is unregistered, unloaded
 * (flushed, MBean unregistered) and all of its SSTables are removed.
 * Best invoked on the compaction manager.
 *
 * @param cfId id of the column family to drop
 * @throws IOException if unloading the store fails
 */
public void dropCf(Integer cfId) throws IOException
{
    assert columnFamilyStores.containsKey(cfId);
    ColumnFamilyStore dropped = columnFamilyStores.remove(cfId);
    if (dropped != null)
    {
        unloadCf(dropped);
        dropped.removeAllSSTables();
    }
}
/**
 * Disassociates a column family store from this table instance: performs a blocking
 * flush and then unregisters the store's MBean.
 *
 * @param cfs the store to unload
 * @throws IOException wrapping any failure or interruption of the blocking flush
 */
private void unloadCf(ColumnFamilyStore cfs) throws IOException
{
    try
    {
        cfs.forceBlockingFlush();
    }
    catch (InterruptedException interrupted)
    {
        throw new IOException(interrupted);
    }
    catch (ExecutionException failed)
    {
        throw new IOException(failed);
    }

    cfs.unregisterMBean();
}
/**
 * Adds a column family to the internal structures (ends up creating disk files).
 *
 * @param cfId   id under which the new store is registered; must not be in use (checked by assertion)
 * @param cfName name of the column family the store is created for
 */
public void initCf(Integer cfId, String cfName)
{
    assert !columnFamilyStores.containsKey(cfId)
        : String.format("tried to init %s as %s, but already used by %s", cfName, cfId, columnFamilyStores.get(cfId));

    ColumnFamilyStore created = ColumnFamilyStore.createColumnFamilyStore(this, cfName);
    columnFamilyStores.put(cfId, created);
}
/**
 * Renames a column family; basically a combined drop and add. The existing store is
 * unregistered and unloaded, its SSTables are renamed, and a new store is initialized
 * under the same id with the new name.
 *
 * @param cfId    id of the column family to rename; must be registered (checked by assertion)
 * @param newName the new column family name
 * @throws IOException if unloading the old store or renaming its SSTables fails
 */
public void renameCf(Integer cfId, String newName) throws IOException
{
    assert columnFamilyStores.containsKey(cfId);

    ColumnFamilyStore previous = columnFamilyStores.remove(cfId);
    unloadCf(previous);
    previous.renameSSTables(newName);

    initCf(cfId, newName);
}
/**
 * Reads the column family data selected by the filter and wraps it in a Row.
 *
 * @param filter identifies the row key, the column family and the columns to read
 * @return a Row pairing the filter's key with whatever the column family store returned
 * @throws IllegalArgumentException if the filter names a column family unknown to this table
 */
public Row getRow(QueryFilter filter) throws IOException
{
    String cfName = filter.getColumnFamilyName();
    ColumnFamily data = getColumnFamilyStore(cfName).getColumnFamily(filter);
    return new Row(filter.key, data);
}
/**
* This method adds the row to the Commit Log associated with this table.
* Once this happens the data associated with the individual column families
* is also written to the column family store's memtable.
*
* For column families that have indexed columns touched by the mutation, the current values of
* those columns are read first, obsolete parts of the mutation are dropped, and the index
* column families are updated, all while holding the index lock selected by the row key.
* The commit log and memtable writes happen under switchLock's read lock; memtables reported
* full are switched only after that lock has been released.
*
* @param mutation the row mutation to apply
* @param writeCommitLog if true the mutation is added to the commit log first; the flag is also
* passed to maybeSwitchMemtable for every memtable that filled up
* @throws IOException declared for the calls made here; nothing in this method throws it directly
*/
public void apply(RowMutation mutation, boolean writeCommitLog) throws IOException
{
// shared immutable empty list until a memtable actually fills up (see addFullMemtable)
List<Memtable> memtablesToFlush = Collections.emptyList();
if (logger.isDebugEnabled())
logger.debug("applying mutation of row {}", ByteBufferUtil.bytesToHex(mutation.key()));
// write the mutation to the commitlog and memtables
switchLock.readLock().lock();
try
{
if (writeCommitLog)
CommitLog.instance.add(mutation);
DecoratedKey<?> key = StorageService.getPartitioner().decorateKey(mutation.key());
for (ColumnFamily cf : mutation.getColumnFamilies())
{
ColumnFamilyStore cfs = columnFamilyStores.get(cf.id());
if (cfs == null)
{
// unknown column family id: log it and skip this column family, the rest of the mutation still applies
logger.error("Attempting to mutate non-existant column family " + cf.id());
continue;
}
// collect the indexed columns affected by this mutation; when the column family itself is
// marked for delete, every indexed column counts as affected. Stays null if none are.
SortedSet<ByteBuffer> mutatedIndexedColumns = null;
for (ByteBuffer column : cfs.getIndexedColumns())
{
if (cf.getColumnNames().contains(column) || cf.isMarkedForDelete())
{
if (mutatedIndexedColumns == null)
mutatedIndexedColumns = new TreeSet<ByteBuffer>();
mutatedIndexedColumns.add(column);
if (logger.isDebugEnabled())
{
// can't actually use validator to print value here, because we overload value
// for deletion timestamp as well (which may not be a well-formed value for the column type)
ByteBuffer value = cf.getColumn(column) == null ? null : cf.getColumn(column).value(); // may be null on row-level deletion
logger.debug(String.format("mutating indexed column %s value %s",
cf.getComparator().getString(column),
value == null ? "null" : ByteBufferUtil.bytesToHex(value)));
}
}
}
// the lock is taken for every column family, indexed or not; it makes the
// read-current-values / write-data / write-index sequence below atomic with respect to
// other threads whose key maps to the same lock object
synchronized (indexLockFor(mutation.key()))
{
ColumnFamily oldIndexedColumns = null;
if (mutatedIndexedColumns != null)
{
// with the raw data CF, we can just apply every update in any order and let
// read-time resolution throw out obsolete versions, thus avoiding read-before-write.
// but for indexed data we need to make sure that we're not creating index entries
// for obsolete writes.
oldIndexedColumns = readCurrentIndexedColumns(key, cfs, mutatedIndexedColumns);
logger.debug("Pre-mutation index row is {}", oldIndexedColumns);
// may remove columns from cf, mutatedIndexedColumns and oldIndexedColumns in place
ignoreObsoleteMutations(cf, mutatedIndexedColumns, oldIndexedColumns);
}
// a non-null result is a memtable the store considers full; remember it for later
Memtable fullMemtable = cfs.apply(key, cf);
if (fullMemtable != null)
memtablesToFlush = addFullMemtable(memtablesToFlush, fullMemtable);
if (mutatedIndexedColumns != null)
{
// ignore full index memtables -- we flush those when the "master" one is full
applyIndexUpdates(mutation.key(), cf, cfs, mutatedIndexedColumns, oldIndexedColumns);
}
}
}
}
finally
{
switchLock.readLock().unlock();
}
// flush memtables that got filled up outside the readlock (maybeSwitchMemtable acquires writeLock).
// usually mTF will be empty and this will be a no-op.
for (Memtable memtable : memtablesToFlush)
memtable.cfs.maybeSwitchMemtable(memtable, writeCommitLog);
}
/**
 * Appends a full memtable to the list of memtables awaiting a flush. An empty incoming list
 * (typically the shared immutable Collections.emptyList()) is replaced by a fresh mutable list.
 *
 * @param memtablesToFlush the list accumulated so far
 * @param fullMemtable     the memtable to add
 * @return the list that now contains fullMemtable; callers must use this return value
 */
private static List<Memtable> addFullMemtable(List<Memtable> memtablesToFlush, Memtable fullMemtable)
{
    List<Memtable> target = memtablesToFlush.isEmpty() ? new ArrayList<Memtable>(2) : memtablesToFlush;
    target.add(fullMemtable);
    return target;
}
/**
* Drops from a mutation those indexed columns whose currently stored value would still win once
* the mutation is merged with it. Such columns are removed from all three arguments in place, so
* that neither the data write nor the index update touches them.
*
* @param cf the column family data of the mutation; obsolete columns are removed from it
* @param mutatedIndexedColumns names of the indexed columns the mutation touches; obsolete names are removed
* @param oldIndexedColumns the currently stored values of those columns, or null if none were read;
* entries found to be still current are removed
*/
private static void ignoreObsoleteMutations(ColumnFamily cf, SortedSet<ByteBuffer> mutatedIndexedColumns, ColumnFamily oldIndexedColumns)
{
// nothing stored yet: every mutated column is news
if (oldIndexedColumns == null)
return;
// merge the stored columns into a clone of the mutation (presumably a copy, so cf itself is
// untouched here -- verify against ColumnFamily.cloneMe) and resolve the result
ColumnFamily cf2 = cf.cloneMe();
for (IColumn oldColumn : oldIndexedColumns)
{
cf2.addColumn(oldColumn);
}
ColumnFamily resolved = ColumnFamilyStore.removeDeleted(cf2, Integer.MAX_VALUE);
// NOTE(review): oldIndexedColumns is modified while being iterated below; this relies on the
// ColumnFamily iterator tolerating concurrent removal -- confirm
for (IColumn oldColumn : oldIndexedColumns)
{
IColumn resolvedColumn = resolved == null ? null : resolved.getColumn(oldColumn.name());
// if the merged result still equals the stored column, the mutation lost for this column
if (resolvedColumn != null && resolvedColumn.equals(oldColumn))
{
if (logger.isDebugEnabled())
logger.debug("ignoring obsolete mutation of " + cf.getComparator().getString(oldColumn.name()));
cf.remove(oldColumn.name());
mutatedIndexedColumns.remove(oldColumn.name());
oldIndexedColumns.remove(oldColumn.name());
}
}
}
/**
 * Reads the currently stored values of the given columns for one row.
 *
 * @param key                   the row to read
 * @param cfs                   the store to read from
 * @param mutatedIndexedColumns names of the columns to fetch
 * @return whatever the store returns for a names query over those columns
 */
private static ColumnFamily readCurrentIndexedColumns(DecoratedKey<?> key, ColumnFamilyStore cfs, SortedSet<ByteBuffer> mutatedIndexedColumns)
{
    QueryPath path = new QueryPath(cfs.getColumnFamilyName());
    return cfs.getColumnFamily(QueryFilter.getNamesFilter(key, path, mutatedIndexedColumns));
}
/**
* removes obsolete index entries and creates new ones for the given row key and mutated columns.
*
* @param key raw row key; used as the column name of every index entry and index tombstone written here
* @param cf column family data supplying the new values of the mutated columns
* @param cfs store that owns the indexed columns; supplies the index key, the index row and the index store per column
* @param mutatedIndexedColumns names of the indexed columns to create index entries for
* @param oldIndexedColumns previously stored columns whose index entries are to be tombstoned; may be null
* @return list of full (index CF) memtables
*/
private static List<Memtable> applyIndexUpdates(ByteBuffer key,
ColumnFamily cf,
ColumnFamilyStore cfs,
SortedSet<ByteBuffer> mutatedIndexedColumns,
ColumnFamily oldIndexedColumns)
{
// shared immutable empty list until an index memtable actually fills up (see addFullMemtable)
List<Memtable> fullMemtables = Collections.emptyList();
// add new index entries
for (ByteBuffer columnName : mutatedIndexedColumns)
{
IColumn column = cf.getColumn(columnName);
if (column == null || column.isMarkedForDelete())
continue; // null column == row deletion
// the index row is keyed by the column's value; the indexed row's key becomes a column name in it
DecoratedKey<LocalToken> valueKey = cfs.getIndexKeyFor(columnName, column.value());
ColumnFamily cfi = cfs.newIndexedColumnFamily(columnName);
if (column instanceof ExpiringColumn)
{
// carry the TTL and local deletion time of the data column over to its index entry
ExpiringColumn ec = (ExpiringColumn)column;
cfi.addColumn(new ExpiringColumn(key, ByteBufferUtil.EMPTY_BYTE_BUFFER, ec.timestamp, ec.getTimeToLive(), ec.getLocalDeletionTime()));
}
else
{
cfi.addColumn(new Column(key, ByteBufferUtil.EMPTY_BYTE_BUFFER, column.timestamp()));
}
if (logger.isDebugEnabled())
logger.debug("applying index row {}:{}", valueKey, cfi);
Memtable fullMemtable = cfs.getIndexedColumnFamilyStore(columnName).apply(valueKey, cfi);
if (fullMemtable != null)
fullMemtables = addFullMemtable(fullMemtables, fullMemtable);
}
// remove the old index entries
if (oldIndexedColumns != null)
{
// one deletion time (seconds since the epoch) shared by all tombstones of this call
int localDeletionTime = (int) (System.currentTimeMillis() / 1000);
for (Map.Entry<ByteBuffer, IColumn> entry : oldIndexedColumns.getColumnsMap().entrySet())
{
ByteBuffer columnName = entry.getKey();
IColumn column = entry.getValue();
// an already deleted old column is skipped; no index tombstone is written for it
if (column.isMarkedForDelete())
continue;
DecoratedKey<LocalToken> valueKey = cfs.getIndexKeyFor(columnName, column.value());
ColumnFamily cfi = cfs.newIndexedColumnFamily(columnName);
// the tombstone carries the old column's timestamp
cfi.addTombstone(key, localDeletionTime, column.timestamp());
Memtable fullMemtable = cfs.getIndexedColumnFamilyStore(columnName).apply(valueKey, cfi);
if (logger.isDebugEnabled())
logger.debug("applying index tombstones {}:{}", valueKey, cfi);
if (fullMemtable != null)
fullMemtables = addFullMemtable(fullMemtables, fullMemtable);
}
}
return fullMemtables;
}
/**
 * Writes an index tombstone for the given column value of the given row, unless the column
 * is already marked for delete. If applying the tombstone fills the index memtable, that
 * memtable is switched immediately.
 *
 * @param cfs    store owning the indexed column
 * @param key    raw row key; used as the name of the tombstone in the index row
 * @param column the column whose index entry is to be removed
 */
public static void cleanupIndexEntry(ColumnFamilyStore cfs, ByteBuffer key, IColumn column)
{
    if (!column.isMarkedForDelete())
    {
        int localDeletionTime = (int) (System.currentTimeMillis() / 1000);
        DecoratedKey<LocalToken> indexKey = cfs.getIndexKeyFor(column.name(), column.value());

        ColumnFamily tombstoneRow = cfs.newIndexedColumnFamily(column.name());
        tombstoneRow.addTombstone(key, localDeletionTime, column.timestamp());

        ColumnFamilyStore indexStore = cfs.getIndexedColumnFamilyStore(column.name());
        Memtable full = indexStore.apply(indexKey, tombstoneRow);
        if (logger.isDebugEnabled())
            logger.debug("removed index entry for cleaned-up value {}:{}", indexKey, tombstoneRow);

        if (full != null)
            full.cfs.maybeSwitchMemtable(full, false);
    }
}
/**
 * Creates an IndexBuilder bound to this table.
 *
 * @param cfs     store whose rows are to be indexed
 * @param columns names of the columns to index
 * @param iter    iterator over the row keys to process
 * @return a new builder; nothing is indexed until its build() method is called
 */
public IndexBuilder createIndexBuilder(ColumnFamilyStore cfs, SortedSet<ByteBuffer> columns, ReducingKeyIterator iter)
{
    IndexBuilder builder = new IndexBuilder(cfs, columns, iter);
    return builder;
}
/**
 * @return the replication strategy most recently installed by createReplicationStrategy
 */
public AbstractReplicationStrategy getReplicationStrategy()
{
    return this.replicationStrategy;
}
/**
 * Builds index entries for a set of columns by walking an iterator of row keys, reading the
 * current values of those columns for each row and applying the matching index updates.
 * Progress is exposed through CompactionInfo.Holder.
 */
public class IndexBuilder implements CompactionInfo.Holder
{
    private final ColumnFamilyStore cfs;
    private final SortedSet<ByteBuffer> columns;
    private final ReducingKeyIterator iter;

    /**
     * @param cfs     store whose rows are to be indexed
     * @param columns names of the columns to index
     * @param iter    iterator over the row keys to process; closed by build()
     */
    public IndexBuilder(ColumnFamilyStore cfs, SortedSet<ByteBuffer> columns, ReducingKeyIterator iter)
    {
        this.cfs = cfs;
        this.columns = columns;
        this.iter = iter;
    }

    /**
     * @return an INDEX_BUILD progress report based on the iterator's total and read byte counts
     */
    public CompactionInfo getCompactionInfo()
    {
        return new CompactionInfo(cfs.table.name,
                                  cfs.columnFamily,
                                  CompactionType.INDEX_BUILD,
                                  iter.getTotalBytes(),
                                  iter.getBytesRead());
    }

    /**
     * Indexes every row produced by the iterator, then closes the iterator. The iterator is
     * closed even when indexing fails part-way, so its resources are not leaked.
     *
     * @throws RuntimeException wrapping the IOException if closing the iterator fails after a
     *         successful build
     */
    public void build()
    {
        boolean completed = false;
        try
        {
            while (iter.hasNext())
            {
                DecoratedKey<?> key = iter.next();
                logger.debug("Indexing row {} ", key);
                List<Memtable> memtablesToFlush = Collections.emptyList();
                switchLock.readLock().lock();
                try
                {
                    // same per-key lock as Table.apply, so a concurrent write to this row
                    // cannot interleave with the read-then-index sequence
                    synchronized (indexLockFor(key.key))
                    {
                        ColumnFamily cf = readCurrentIndexedColumns(key, cfs, columns);
                        if (cf != null)
                            memtablesToFlush = applyIndexUpdates(key.key, cf, cfs, cf.getColumnNames(), null);
                    }
                }
                finally
                {
                    switchLock.readLock().unlock();
                }
                // during index build, we do flush index memtables separately from master; otherwise we could OOM
                for (Memtable memtable : memtablesToFlush)
                    memtable.cfs.maybeSwitchMemtable(memtable, false);
            }
            completed = true;
        }
        finally
        {
            try
            {
                iter.close();
            }
            catch (IOException e)
            {
                // Surface the close failure only when it cannot hide the exception that
                // aborted the build; otherwise just record it.
                if (completed)
                    throw new RuntimeException(e);
                logger.warn("Failed to close key iterator after aborted index build", e);
            }
        }
    }
}
/**
 * Picks the lock object guarding index maintenance for a row key. Different keys may share a lock.
 *
 * @param key raw row key
 * @return the element of indexLocks selected by the key's hash code
 */
private Object indexLockFor(ByteBuffer key)
{
    // the remainder lies strictly between -length and +length, so abs() cannot overflow
    int slot = Math.abs(key.hashCode() % indexLocks.length);
    return indexLocks[slot];
}
/**
 * Requests a flush of every column family store of this table.
 *
 * @return the futures of the flushes that were started; stores whose forceFlush returned
 *         null contribute nothing
 */
public List<Future<?>> flush() throws IOException
{
    List<Future<?>> futures = new ArrayList<Future<?>>();
    // Iterate the values directly: looking each id up again after iterating keySet() could
    // return null if the column family is dropped concurrently.
    for (ColumnFamilyStore cfs : columnFamilyStores.values())
    {
        Future<?> future = cfs.forceFlush();
        if (future != null)
            futures.add(future);
    }
    return futures;
}
/**
 * Binary load path; skips the commitlog. For every column of the mutation, the column name
 * is decoded as an int column family id and the column value is handed to that store's applyBinary.
 *
 * @param rowMutation mutation whose column names encode column family ids
 */
void load(RowMutation rowMutation) throws IOException
{
    DecoratedKey<?> key = StorageService.getPartitioner().decorateKey(rowMutation.key());
    for (ColumnFamily family : rowMutation.getColumnFamilies())
    {
        for (IColumn column : family.getSortedColumns())
        {
            ColumnFamilyStore target = columnFamilyStores.get(ByteBufferUtil.toInt(column.name()));
            target.applyBinary(key, column.value());
        }
    }
}
/**
 * Asks DatabaseDescriptor for a data file location for this table that can hold a file of
 * the expected size. If none is available, a GC is requested, the thread sleeps for twice
 * SSTableDeletingReference.RETRY_DELAY, and the lookup is tried once more.
 *
 * @param expectedCompactedFileSize size the location must accommodate
 * @return the location, or null if the second attempt also finds none
 */
public String getDataFileLocation(long expectedCompactedFileSize)
{
    String location = DatabaseDescriptor.getDataFileLocationForTable(name, expectedCompactedFileSize);
    if (location != null)
        return location;

    // retry after GCing to force unmap of compacted SSTables so they can be deleted
    StorageService.instance.requestGC();
    try
    {
        Thread.sleep(SSTableDeletingReference.RETRY_DELAY * 2);
    }
    catch (InterruptedException interrupted)
    {
        throw new AssertionError(interrupted);
    }
    return DatabaseDescriptor.getDataFileLocationForTable(name, expectedCompactedFileSize);
}
/**
 * Builds the path of a snapshot directory: data directory, table name, the snapshots
 * subdirectory and the snapshot name, joined with File.separator.
 *
 * @param dataDirPath  data file location
 * @param tableName    name of the table
 * @param snapshotName name of the snapshot
 * @return the joined path
 */
public static String getSnapshotPath(String dataDirPath, String tableName, String snapshotName)
{
    StringBuilder path = new StringBuilder();
    path.append(dataDirPath).append(File.separator);
    path.append(tableName).append(File.separator);
    path.append(SNAPSHOT_SUBDIR_NAME).append(File.separator);
    path.append(snapshotName);
    return path.toString();
}
/**
 * @return a lazy view over the tables named by DatabaseDescriptor.getTables(); each element
 *         is produced by calling Table.open when the view is iterated
 */
public static Iterable<Table> all()
{
    Function<String, Table> opener = new Function<String, Table>()
    {
        public Table apply(String tableName)
        {
            return Table.open(tableName);
        }
    };
    Iterable<Table> tables = Iterables.transform(DatabaseDescriptor.getTables(), opener);
    return tables;
}
/**
 * Performs a synchronous truncate of one column family: starts the truncation on its store
 * and blocks until the returned future completes.
 *
 * @param cfname name of the column family to truncate
 * @throws IllegalArgumentException if the column family is unknown to this table
 * @throws IOException
 * @throws ExecutionException if the truncation fails
 * @throws InterruptedException if the wait is interrupted
 */
public void truncate(String cfname) throws InterruptedException, ExecutionException, IOException
{
    logger.debug("Truncating...");
    ColumnFamilyStore target = getColumnFamilyStore(cfname);
    Future<?> pending = target.truncate();
    // block until the truncation has finished
    pending.get();
    logger.debug("Truncation done.");
}
/**
 * @return the simple class name followed by the table name, e.g. {@code Table(name='system')}
 */
@Override
public String toString()
{
    return String.format("%s(name='%s')", getClass().getSimpleName(), name);
}
}