/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.db.compaction;
import java.io.File;
import java.io.IOError;
import java.io.IOException;
import java.lang.management.ManagementFactory;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import javax.management.MBeanServer;
import javax.management.ObjectName;
import org.apache.cassandra.cache.AutoSavingCache;
import org.apache.cassandra.concurrent.DebuggableThreadPoolExecutor;
import org.apache.cassandra.concurrent.NamedThreadFactory;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.config.Schema;
import org.apache.cassandra.db.*;
import org.apache.cassandra.db.index.SecondaryIndexBuilder;
import org.apache.cassandra.dht.Range;
import org.apache.cassandra.io.sstable.*;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.io.util.RandomAccessReader;
import org.apache.cassandra.service.AntiEntropyService;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.utils.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Predicates;
import com.google.common.collect.Iterators;
/**
* A singleton which manages a private executor of ongoing compactions. A read/write lock
* controls whether compactions can proceed: an external consumer can completely stop
* compactions by acquiring the write half of the lock via getCompactionLock().
*
* Scheduling for compaction is accomplished by swapping sstables to be compacted into
* a set via DataTracker. New scheduling attempts will ignore currently compacting
* sstables.
*/
public class CompactionManager implements CompactionManagerMBean
{
public static final String MBEAN_OBJECT_NAME = "org.apache.cassandra.db:type=CompactionManager";
private static final Logger logger = LoggerFactory.getLogger(CompactionManager.class);
public static final CompactionManager instance;
/**
* compactionLock has two purposes:
* - Compaction acquires its readLock so that multiple compactions can happen simultaneously,
* but the KS/CF migrations acquire its writeLock, so they can be sure no new SSTables will
* be created for a dropped CF posthumously. (Thus, compaction checks CFS.isValid while the
* lock is acquired.)
* - "Special" compactions will acquire writelock instead of readlock to make sure that all
* other compaction activity is quiesced and they can grab ALL the sstables to do something.
* TODO this is too big a hammer -- we should only care about quiescing all for the given CFS.
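*
* Several of the methods below schedule sstables under the write lock, then downgrade to
* the read lock for the long-running work. A sketch of the standard
* ReentrantReadWriteLock downgrade idiom they use (see performCleanup, performScrub,
* submitMaximal):
* <pre>{@code
* compactionLock.writeLock().lock();
* try
* {
*     // ... mark the sstables to operate on ...
*     compactionLock.readLock().lock(); // downgrade: acquire read before releasing write
*     compactionLock.writeLock().unlock();
*     try
*     {
*         // ... the long-running work ...
*     }
*     finally
*     {
*         compactionLock.readLock().unlock();
*     }
* }
* finally
* {
*     // we probably already downgraded
*     if (compactionLock.writeLock().isHeldByCurrentThread())
*         compactionLock.writeLock().unlock();
* }
* }</pre>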
*/
private final ReentrantReadWriteLock compactionLock = new ReentrantReadWriteLock();
static
{
instance = new CompactionManager();
MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
try
{
mbs.registerMBean(instance, new ObjectName(MBEAN_OBJECT_NAME));
}
catch (Exception e)
{
throw new RuntimeException(e);
}
}
private CompactionExecutor executor = new CompactionExecutor();
private CompactionExecutor validationExecutor = new ValidationExecutor();
/**
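* Holding the returned write lock quiesces all compaction activity. For example:
* <pre>{@code
* Lock lock = CompactionManager.instance.getCompactionLock();
* lock.lock();
* try
* {
*     // no compactions can start (or be running) while the lock is held
* }
* finally
* {
*     lock.unlock();
* }
* }</pre>
*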
* @return a lock whose acquisition means no compactions can run.
*/
public Lock getCompactionLock()
{
return compactionLock.writeLock();
}
/**
* Call this whenever a compaction might be needed on the given columnfamily.
* It's okay to over-call (within reason) since the compactions are single-threaded,
* and if a call is unnecessary, it will just be no-oped in the bucketing phase.
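*
* Typical usage is a one-liner after an event that adds sstables (e.g. a completed
* flush or compaction): {@code CompactionManager.instance.submitBackground(cfs);}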
*/
public Future<Integer> submitBackground(final ColumnFamilyStore cfs)
{
Callable<Integer> callable = new Callable<Integer>()
{
public Integer call() throws IOException
{
compactionLock.readLock().lock();
try
{
if (!cfs.isValid())
return 0;
boolean taskExecuted = false;
AbstractCompactionStrategy strategy = cfs.getCompactionStrategy();
List<AbstractCompactionTask> tasks = strategy.getBackgroundTasks(getDefaultGcBefore(cfs));
for (AbstractCompactionTask task : tasks)
{
if (!task.markSSTablesForCompaction())
continue;
taskExecuted = true;
try
{
task.execute(executor);
}
finally
{
task.unmarkSSTables();
}
}
// newly created sstables might have made other compactions eligible
if (taskExecuted)
submitBackground(cfs);
}
finally
{
compactionLock.readLock().unlock();
}
return 0;
}
};
return executor.submit(callable);
}
public void performCleanup(final ColumnFamilyStore cfStore, final NodeId.OneShotRenewer renewer) throws InterruptedException, ExecutionException
{
Callable<Object> runnable = new Callable<Object>()
{
public Object call() throws IOException
{
compactionLock.writeLock().lock();
try
{
if (!cfStore.isValid())
return this;
Collection<SSTableReader> tocleanup = cfStore.getDataTracker().markCompacting(cfStore.getSSTables(), 1, Integer.MAX_VALUE);
if (tocleanup == null || tocleanup.isEmpty())
return this;
try
{
// downgrade the lock acquisition
compactionLock.readLock().lock();
compactionLock.writeLock().unlock();
try
{
doCleanupCompaction(cfStore, tocleanup, renewer);
}
finally
{
compactionLock.readLock().unlock();
}
}
finally
{
cfStore.getDataTracker().unmarkCompacting(tocleanup);
}
return this;
}
finally
{
// we probably already downgraded
if (compactionLock.writeLock().isHeldByCurrentThread())
compactionLock.writeLock().unlock();
}
}
};
executor.submit(runnable).get();
}
public void performScrub(final ColumnFamilyStore cfStore) throws InterruptedException, ExecutionException
{
Callable<Object> runnable = new Callable<Object>()
{
public Object call() throws IOException
{
// acquire the write lock to schedule all sstables
compactionLock.writeLock().lock();
try
{
if (!cfStore.isValid())
return this;
Collection<SSTableReader> toscrub = cfStore.getDataTracker().markCompacting(cfStore.getSSTables(), 1, Integer.MAX_VALUE);
if (toscrub == null || toscrub.isEmpty())
return this;
try
{
// downgrade the lock acquisition
compactionLock.readLock().lock();
compactionLock.writeLock().unlock();
try
{
doScrub(cfStore, toscrub);
}
finally
{
compactionLock.readLock().unlock();
}
}
finally
{
cfStore.getDataTracker().unmarkCompacting(toscrub);
}
return this;
}
finally
{
// we probably already downgraded
if (compactionLock.writeLock().isHeldByCurrentThread())
compactionLock.writeLock().unlock();
}
}
};
executor.submit(runnable).get();
}
public void performMaximal(final ColumnFamilyStore cfStore) throws InterruptedException, ExecutionException
{
submitMaximal(cfStore, getDefaultGcBefore(cfStore)).get();
}
public Future<Object> submitMaximal(final ColumnFamilyStore cfStore, final int gcBefore)
{
Callable<Object> callable = new Callable<Object>()
{
public Object call() throws IOException
{
// acquire the write lock long enough to schedule all sstables
compactionLock.writeLock().lock();
try
{
if (!cfStore.isValid())
return this;
AbstractCompactionStrategy strategy = cfStore.getCompactionStrategy();
for (AbstractCompactionTask task : strategy.getMaximalTasks(gcBefore))
{
if (!task.markSSTablesForCompaction(0, Integer.MAX_VALUE))
return this;
try
{
// downgrade the lock acquisition
compactionLock.readLock().lock();
compactionLock.writeLock().unlock();
try
{
return task.execute(executor);
}
finally
{
compactionLock.readLock().unlock();
}
}
finally
{
task.unmarkSSTables();
}
}
}
finally
{
// we probably already downgraded
if (compactionLock.writeLock().isHeldByCurrentThread())
compactionLock.writeLock().unlock();
}
return this;
}
};
return executor.submit(callable);
}
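/**
* Schedules a user-defined compaction of specific data files belonging to one column
* family of the given keyspace; {@code dataFiles} is a comma-separated list of
* -Data.db file names. A sketch with hypothetical file names:
* <pre>{@code
* CompactionManager.instance.forceUserDefinedCompaction(
*     "Keyspace1", "Standard1-g-10-Data.db, Standard1-g-11-Data.db");
* }</pre>
*/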
public void forceUserDefinedCompaction(String ksname, String dataFiles)
{
if (!Schema.instance.getTables().contains(ksname))
throw new IllegalArgumentException("Unknown keyspace " + ksname);
File directory = new File(ksname);
String[] filenames = dataFiles.split(",");
Collection<Descriptor> descriptors = new ArrayList<Descriptor>(filenames.length);
String cfname = null;
for (String filename : filenames)
{
Pair<Descriptor, String> p = Descriptor.fromFilename(directory, filename.trim());
if (!p.right.equals(Component.DATA.name()))
{
throw new IllegalArgumentException(filename + " does not appear to be a data file");
}
if (cfname == null)
{
cfname = p.left.cfname;
}
else if (!cfname.equals(p.left.cfname))
{
throw new IllegalArgumentException("All provided sstables should be for the same column family");
}
descriptors.add(p.left);
}
ColumnFamilyStore cfs = Table.open(ksname).getColumnFamilyStore(cfname);
submitUserDefined(cfs, descriptors, getDefaultGcBefore(cfs));
}
public Future<Object> submitUserDefined(final ColumnFamilyStore cfs, final Collection<Descriptor> dataFiles, final int gcBefore)
{
Callable<Object> callable = new Callable<Object>()
{
public Object call() throws IOException
{
compactionLock.readLock().lock();
try
{
if (!cfs.isValid())
return this;
// look up the sstables now that we're on the compaction executor, so we don't try to re-compact
// something that was already being compacted earlier.
Collection<SSTableReader> sstables = new ArrayList<SSTableReader>();
for (Descriptor desc : dataFiles)
{
// inefficient but not in a performance sensitive path
SSTableReader sstable = lookupSSTable(cfs, desc);
if (sstable == null)
{
logger.info("Will not compact {}: it is not an active sstable", desc);
}
else
{
sstables.add(sstable);
}
}
Collection<SSTableReader> toCompact;
try
{
if (sstables.isEmpty())
{
logger.error("No file to compact for user defined compaction");
}
// attempt to schedule the set
else if ((toCompact = cfs.getDataTracker().markCompacting(sstables, 1, Integer.MAX_VALUE)) != null)
{
// success: perform the compaction
try
{
AbstractCompactionStrategy strategy = cfs.getCompactionStrategy();
AbstractCompactionTask task = strategy.getUserDefinedTask(toCompact, gcBefore);
task.execute(executor);
}
finally
{
cfs.getDataTracker().unmarkCompacting(toCompact);
}
}
else
{
logger.error("SSTables for user defined compaction are already being compacted.");
}
}
finally
{
SSTableReader.releaseReferences(sstables);
}
return this;
}
finally
{
compactionLock.readLock().unlock();
}
}
};
return executor.submit(callable);
}
// This acquires a reference on the sstable.
// This is not efficient; do not use it in any critical path.
private SSTableReader lookupSSTable(final ColumnFamilyStore cfs, Descriptor descriptor)
{
SSTableReader found = null;
for (SSTableReader sstable : cfs.markCurrentSSTablesReferenced())
{
// .equals() alone won't work because the directory in sstable.descriptor is an absolute path.
// We could construct our descriptor with an absolute path too, but there is no satisfying way
// to do that (DatabaseDescriptor.getDataFileLocationForTable() may not return the right path
// when multiple data volumes are configured). Hence the endsWith check.
if (sstable.descriptor.toString().endsWith(descriptor.toString()))
found = sstable;
else
sstable.releaseReference();
}
return found;
}
/**
* Does not mutate data, so is not scheduled.
*/
public Future<Object> submitValidation(final ColumnFamilyStore cfStore, final AntiEntropyService.Validator validator)
{
Callable<Object> callable = new Callable<Object>()
{
public Object call() throws IOException
{
compactionLock.readLock().lock();
try
{
if (cfStore.isValid())
doValidationCompaction(cfStore, validator);
return this;
}
finally
{
compactionLock.readLock().unlock();
}
}
};
return validationExecutor.submit(callable);
}
/* Used in tests. */
public void disableAutoCompaction()
{
for (String ksname : Schema.instance.getNonSystemTables())
{
for (ColumnFamilyStore cfs : Table.open(ksname).getColumnFamilyStores())
cfs.disableAutoCompaction();
}
}
/**
* Deserialize everything in the CFS and re-serialize w/ the newest version. Also attempts to recover
* from bogus row keys / sizes using data from the index, and skips rows with garbage columns that resulted
* from early ByteBuffer bugs.
*
* @throws IOException
*/
private void doScrub(ColumnFamilyStore cfs, Collection<SSTableReader> sstables) throws IOException
{
assert !cfs.isIndex();
for (final SSTableReader sstable : sstables)
scrubOne(cfs, sstable);
}
private void scrubOne(ColumnFamilyStore cfs, SSTableReader sstable) throws IOException
{
logger.info("Scrubbing " + sstable);
CompactionController controller = new CompactionController(cfs, Collections.singletonList(sstable), getDefaultGcBefore(cfs), true);
boolean isCommutative = cfs.metadata.getDefaultValidator().isCommutative();
// Calculate the expected compacted filesize
String compactionFileLocation = cfs.table.getDataFileLocation(sstable.onDiskLength());
if (compactionFileLocation == null)
throw new IOException("disk full");
int expectedBloomFilterSize = Math.max(DatabaseDescriptor.getIndexInterval(),
(int)(SSTableReader.getApproximateKeyCount(Arrays.asList(sstable))));
// loop through each row, deserializing to check for damage.
// we'll also loop through the index at the same time, using the position from the index to recover if the
// row header (key or data size) is corrupt. (This means our position in the index file will be one row
// "ahead" of the data file.)
final RandomAccessReader dataFile = sstable.openDataReader(true);
RandomAccessReader indexFile = RandomAccessReader.open(new File(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX)), true);
ScrubInfo scrubInfo = new ScrubInfo(dataFile, sstable);
executor.beginCompaction(scrubInfo);
SSTableWriter writer = null;
SSTableReader newSstable = null;
int goodRows = 0, badRows = 0, emptyRows = 0;
try
{
ByteBuffer nextIndexKey = ByteBufferUtil.readWithShortLength(indexFile);
{
// throw-away variable so we don't have a side effect in the assert
long firstRowPositionFromIndex = indexFile.readLong();
assert firstRowPositionFromIndex == 0 : firstRowPositionFromIndex;
}
// TODO errors when creating the writer may leave empty temp files.
writer = maybeCreateWriter(cfs, compactionFileLocation, expectedBloomFilterSize, null, Collections.singletonList(sstable));
while (!dataFile.isEOF())
{
long rowStart = dataFile.getFilePointer();
if (logger.isDebugEnabled())
logger.debug("Reading row at " + rowStart);
DecoratedKey key = null;
long dataSize = -1;
try
{
key = SSTableReader.decodeKey(sstable.partitioner, sstable.descriptor, ByteBufferUtil.readWithShortLength(dataFile));
dataSize = sstable.descriptor.hasIntRowSize ? dataFile.readInt() : dataFile.readLong();
if (logger.isDebugEnabled())
logger.debug(String.format("row %s is %s bytes", ByteBufferUtil.bytesToHex(key.key), dataSize));
}
catch (Throwable th)
{
throwIfFatal(th);
// check for null key below
}
ByteBuffer currentIndexKey = nextIndexKey;
long nextRowPositionFromIndex;
try
{
nextIndexKey = indexFile.isEOF() ? null : ByteBufferUtil.readWithShortLength(indexFile);
nextRowPositionFromIndex = indexFile.isEOF() ? dataFile.length() : indexFile.readLong();
}
catch (Throwable th)
{
logger.warn("Error reading index file", th);
nextIndexKey = null;
nextRowPositionFromIndex = dataFile.length();
}
long dataStart = dataFile.getFilePointer();
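// The expected data position per the index: the row header on disk is a 2-byte key
// length, the key bytes, then the row size (4 bytes for descriptors with int row
// sizes, 8 bytes for long).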
long dataStartFromIndex = currentIndexKey == null
? -1
: rowStart + 2 + currentIndexKey.remaining() + (sstable.descriptor.hasIntRowSize ? 4 : 8);
long dataSizeFromIndex = nextRowPositionFromIndex - dataStartFromIndex;
assert currentIndexKey != null || indexFile.isEOF();
if (logger.isDebugEnabled() && currentIndexKey != null)
logger.debug(String.format("Index doublecheck: row %s is %s bytes", ByteBufferUtil.bytesToHex(currentIndexKey), dataSizeFromIndex));
writer.mark();
try
{
if (key == null)
throw new IOError(new IOException("Unable to read row key from data file"));
if (dataSize > dataFile.length())
throw new IOError(new IOException("Impossible row size " + dataSize));
SSTableIdentityIterator row = new SSTableIdentityIterator(sstable, dataFile, key, dataStart, dataSize, true);
AbstractCompactedRow compactedRow = controller.getCompactedRow(row);
if (compactedRow.isEmpty())
{
emptyRows++;
}
else
{
writer.append(compactedRow);
goodRows++;
}
if (!key.key.equals(currentIndexKey) || dataStart != dataStartFromIndex)
logger.warn("Index file contained a different key or row size; using key from data file");
}
catch (Throwable th)
{
throwIfFatal(th);
logger.warn("Non-fatal error reading row (stacktrace follows)", th);
writer.resetAndTruncate();
if (currentIndexKey != null
&& (key == null || !key.key.equals(currentIndexKey) || dataStart != dataStartFromIndex || dataSize != dataSizeFromIndex))
{
logger.info(String.format("Retrying from row index; data is %s bytes starting at %s",
dataSizeFromIndex, dataStartFromIndex));
key = SSTableReader.decodeKey(sstable.partitioner, sstable.descriptor, currentIndexKey);
try
{
SSTableIdentityIterator row = new SSTableIdentityIterator(sstable, dataFile, key, dataStartFromIndex, dataSizeFromIndex, true);
AbstractCompactedRow compactedRow = controller.getCompactedRow(row);
if (compactedRow.isEmpty())
{
emptyRows++;
}
else
{
writer.append(compactedRow);
goodRows++;
}
}
catch (Throwable th2)
{
throwIfFatal(th2);
// Skipping rows is dangerous for counters (see CASSANDRA-2759)
if (isCommutative)
throw new IOError(th2);
logger.warn("Retry failed too. Skipping to next row (retry's stacktrace follows)", th2);
writer.resetAndTruncate();
dataFile.seek(nextRowPositionFromIndex);
badRows++;
}
}
else
{
// Skipping rows is dangerous for counters (see CASSANDRA-2759)
if (isCommutative)
throw new IOError(th);
logger.warn("Row at " + dataStart + " is unreadable; skipping to next");
if (currentIndexKey != null)
dataFile.seek(nextRowPositionFromIndex);
badRows++;
}
}
}
if (writer.getFilePointer() > 0)
newSstable = writer.closeAndOpenReader(sstable.maxDataAge);
}
finally
{
if (writer != null)
writer.cleanupIfNecessary();
FileUtils.closeQuietly(dataFile);
FileUtils.closeQuietly(indexFile);
executor.finishCompaction(scrubInfo);
}
if (newSstable == null)
{
cfs.markCompacted(Arrays.asList(sstable));
if (badRows > 0)
logger.warn("No valid rows found while scrubbing " + sstable + "; it is marked for deletion now. If you want to attempt manual recovery, you can find a copy in the pre-scrub snapshot");
else
logger.info("Scrub of " + sstable + " complete; looks like all " + emptyRows + " rows were tombstoned");
}
else
{
cfs.replaceCompactedSSTables(Arrays.asList(sstable), Arrays.asList(newSstable));
logger.info("Scrub of " + sstable + " complete: " + goodRows + " rows in new sstable and " + emptyRows + " empty (tombstoned) rows dropped");
if (badRows > 0)
logger.warn("Unable to recover " + badRows + " rows that were skipped. You can attempt manual recovery from the pre-scrub snapshot. You can also run nodetool repair to transfer the data from a healthy replica, if any");
}
}
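/**
* Rethrows truly fatal JVM errors. AssertionError and IOError are excluded on purpose:
* scrub treats them as per-row corruption that its retry/skip logic can recover from.
*/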
private void throwIfFatal(Throwable th)
{
if (th instanceof Error && !(th instanceof AssertionError || th instanceof IOError))
throw (Error) th;
}
/**
* This function goes over each file and removes the keys that the node is not responsible for
* and only keeps keys that this node is responsible for.
*
* @throws IOException
*/
private void doCleanupCompaction(ColumnFamilyStore cfs, Collection<SSTableReader> sstables, NodeId.OneShotRenewer renewer) throws IOException
{
assert !cfs.isIndex();
Table table = cfs.table;
Collection<Range> ranges = StorageService.instance.getLocalRanges(table.name);
boolean isCommutative = cfs.metadata.getDefaultValidator().isCommutative();
if (ranges.isEmpty())
{
logger.info("Cleanup cannot run before a node has joined the ring");
return;
}
for (SSTableReader sstable : sstables)
{
CompactionController controller = new CompactionController(cfs, Collections.singletonList(sstable), getDefaultGcBefore(cfs), false);
long startTime = System.currentTimeMillis();
long totalkeysWritten = 0;
int expectedBloomFilterSize = Math.max(DatabaseDescriptor.getIndexInterval(),
(int)(SSTableReader.getApproximateKeyCount(Arrays.asList(sstable))));
if (logger.isDebugEnabled())
logger.debug("Expected bloom filter size : " + expectedBloomFilterSize);
SSTableWriter writer = null;
SSTableReader newSstable = null;
logger.info("Cleaning up " + sstable);
// Calculate the expected compacted filesize
long expectedRangeFileSize = cfs.getExpectedCompactedFileSize(Arrays.asList(sstable)) / 2;
String compactionFileLocation = table.getDataFileLocation(expectedRangeFileSize);
if (compactionFileLocation == null)
throw new IOException("disk full");
SSTableScanner scanner = sstable.getDirectScanner();
Collection<ByteBuffer> indexedColumns = cfs.indexManager.getIndexedColumns();
List<IColumn> indexedColumnsInRow = null;
CleanupInfo ci = new CleanupInfo(sstable, scanner);
executor.beginCompaction(ci);
try
{
while (scanner.hasNext())
{
SSTableIdentityIterator row = (SSTableIdentityIterator) scanner.next();
if (Range.isTokenInRanges(row.getKey().token, ranges))
{
AbstractCompactedRow compactedRow = controller.getCompactedRow(row);
if (compactedRow.isEmpty())
continue;
writer = maybeCreateWriter(cfs, compactionFileLocation, expectedBloomFilterSize, writer, Collections.singletonList(sstable));
writer.append(compactedRow);
totalkeysWritten++;
}
else
{
cfs.invalidateCachedRow(row.getKey());
if (!indexedColumns.isEmpty() || isCommutative)
{
if (indexedColumnsInRow != null)
indexedColumnsInRow.clear();
while (row.hasNext())
{
IColumn column = row.next();
if (column instanceof CounterColumn)
renewer.maybeRenew((CounterColumn) column);
if (indexedColumns.contains(column.name()))
{
if (indexedColumnsInRow == null)
indexedColumnsInRow = new ArrayList<IColumn>();
indexedColumnsInRow.add(column);
}
}
if (indexedColumnsInRow != null && !indexedColumnsInRow.isEmpty())
cfs.indexManager.deleteFromIndexes(row.getKey(), indexedColumnsInRow);
}
}
}
if (writer != null)
newSstable = writer.closeAndOpenReader(sstable.maxDataAge);
}
finally
{
scanner.close();
if (writer != null)
writer.cleanupIfNecessary();
executor.finishCompaction(ci);
}
List<SSTableReader> results = new ArrayList<SSTableReader>();
if (newSstable != null)
{
results.add(newSstable);
String format = "Cleaned up to %s. %,d to %,d (~%d%% of original) bytes for %,d keys. Time: %,dms.";
long dTime = System.currentTimeMillis() - startTime;
long startsize = sstable.onDiskLength();
long endsize = newSstable.onDiskLength();
double ratio = (double)endsize / (double)startsize;
logger.info(String.format(format, writer.getFilename(), startsize, endsize, (int)(ratio*100), totalkeysWritten, dTime));
}
// flush to ensure we don't lose the tombstones on a restart, since they are not commitlog'd
cfs.indexManager.flushIndexesBlocking();
cfs.replaceCompactedSSTables(Arrays.asList(sstable), results);
}
}
private SSTableWriter maybeCreateWriter(ColumnFamilyStore cfs, String compactionFileLocation, int expectedBloomFilterSize, SSTableWriter writer, Collection<SSTableReader> sstables)
throws IOException
{
if (writer == null)
{
FileUtils.createDirectory(compactionFileLocation);
writer = cfs.createCompactionWriter(expectedBloomFilterSize, compactionFileLocation, sstables);
}
return writer;
}
/**
* Performs a readonly "compaction" of all sstables in order to validate complete rows,
* but without writing the merge result
*/
private void doValidationCompaction(ColumnFamilyStore cfs, AntiEntropyService.Validator validator) throws IOException
{
// flush first so everyone is validating data that is as similar as possible
try
{
StorageService.instance.forceTableFlush(cfs.table.name, cfs.getColumnFamilyName());
}
catch (ExecutionException e)
{
throw new IOException(e);
}
catch (InterruptedException e)
{
throw new AssertionError(e);
}
// we don't mark validating sstables as compacting in DataTracker, so we have to mark them referenced
// instead so they won't be cleaned up if they do get compacted during the validation
Collection<SSTableReader> sstables = cfs.markCurrentSSTablesReferenced();
CompactionIterable ci = new ValidationCompactionIterable(cfs, sstables, validator.request.range);
CloseableIterator<AbstractCompactedRow> iter = ci.iterator();
validationExecutor.beginCompaction(ci);
try
{
Iterator<AbstractCompactedRow> nni = Iterators.filter(iter, Predicates.notNull());
// validate the CF as we iterate over it
validator.prepare(cfs);
while (nni.hasNext())
{
AbstractCompactedRow row = nni.next();
validator.add(row);
}
validator.complete();
}
finally
{
SSTableReader.releaseReferences(sstables);
iter.close();
validationExecutor.finishCompaction(ci);
}
}
/**
* Is not scheduled, because it performs work disjoint from sstable compaction.
*/
public Future<?> submitIndexBuild(final SecondaryIndexBuilder builder)
{
Runnable runnable = new Runnable()
{
public void run()
{
compactionLock.readLock().lock();
try
{
executor.beginCompaction(builder);
try
{
builder.build();
}
finally
{
executor.finishCompaction(builder);
}
}
finally
{
compactionLock.readLock().unlock();
}
}
};
// don't submit to the executor if the compaction lock is held by the current thread. Instead return a simple
// future that will be immediately get()ed and executed. This happens during a migration, which holds
// the compaction write lock and then reinitializes a ColumnFamilyStore. Under normal circumstances, CFS submits
// index jobs to the compaction manager (this) and blocks on them.
if (compactionLock.isWriteLockedByCurrentThread())
return new SimpleFuture(runnable);
else
return executor.submit(runnable);
}
public Future<?> submitCacheWrite(final AutoSavingCache.Writer writer)
{
Runnable runnable = new WrappedRunnable()
{
public void runMayThrow() throws IOException
{
if (!AutoSavingCache.flushInProgress.compareAndSet(false, true))
{
logger.debug("Cache flushing was already in progress: skipping {}", writer.getCompactionInfo());
return;
}
try
{
executor.beginCompaction(writer);
try
{
writer.saveCache();
}
finally
{
executor.finishCompaction(writer);
}
}
finally
{
AutoSavingCache.flushInProgress.set(false);
}
}
};
return executor.submit(runnable);
}
public Future<?> submitTruncate(final ColumnFamilyStore main, final long truncatedAt)
{
Runnable runnable = new WrappedRunnable()
{
public void runMayThrow() throws InterruptedException, IOException
{
compactionLock.writeLock().lock();
try
{
for (ColumnFamilyStore cfs : main.concatWithIndexes())
{
List<SSTableReader> truncatedSSTables = new ArrayList<SSTableReader>();
for (SSTableReader sstable : cfs.getSSTables())
{
if (!sstable.newSince(truncatedAt))
truncatedSSTables.add(sstable);
}
cfs.markCompacted(truncatedSSTables);
}
}
finally
{
compactionLock.writeLock().unlock();
}
main.invalidateRowCache();
}
};
return executor.submit(runnable);
}
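// Index CFs get gcBefore = MAX_VALUE, making every tombstone purgeable immediately;
// for everything else, only tombstones older than gc_grace_seconds may be purged.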
static int getDefaultGcBefore(ColumnFamilyStore cfs)
{
return cfs.isIndex()
? Integer.MAX_VALUE
: (int) (System.currentTimeMillis() / 1000) - cfs.metadata.getGcGraceSeconds();
}
private static class ValidationCompactionIterable extends CompactionIterable
{
public ValidationCompactionIterable(ColumnFamilyStore cfs, Collection<SSTableReader> sstables, Range range) throws IOException
{
super(OperationType.VALIDATION,
getScanners(sstables, range),
new CompactionController(cfs, sstables, getDefaultGcBefore(cfs), true));
}
protected static List<SSTableScanner> getScanners(Iterable<SSTableReader> sstables, Range range) throws IOException
{
ArrayList<SSTableScanner> scanners = new ArrayList<SSTableScanner>();
for (SSTableReader sstable : sstables)
scanners.add(sstable.getDirectScanner(range));
return scanners;
}
}
public int getActiveCompactions()
{
return CompactionExecutor.compactions.size();
}
private static class CompactionExecutor extends DebuggableThreadPoolExecutor implements CompactionExecutorStatsCollector
{
// a synchronized identity set of running tasks to their compaction info
private static final Set<CompactionInfo.Holder> compactions = Collections.synchronizedSet(Collections.newSetFromMap(new IdentityHashMap<CompactionInfo.Holder, Boolean>()));
protected CompactionExecutor(int minThreads, int maxThreads, String name, BlockingQueue<Runnable> queue)
{
super(minThreads, maxThreads, 60, TimeUnit.SECONDS, queue, new NamedThreadFactory(name, Thread.MIN_PRIORITY));
}
private CompactionExecutor(int threadCount, String name)
{
this(threadCount, threadCount, name, new LinkedBlockingQueue<Runnable>());
}
public CompactionExecutor()
{
this(Math.max(1, DatabaseDescriptor.getConcurrentCompactors()), "CompactionExecutor");
}
public void beginCompaction(CompactionInfo.Holder ci)
{
compactions.add(ci);
}
public void finishCompaction(CompactionInfo.Holder ci)
{
compactions.remove(ci);
}
public static List<CompactionInfo.Holder> getCompactions()
{
return new ArrayList<CompactionInfo.Holder>(compactions);
}
}
private static class ValidationExecutor extends CompactionExecutor
{
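// A SynchronousQueue does direct handoff, and the effectively unbounded maximum pool
// size means each validation runs on its own thread rather than queueing behind others.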
public ValidationExecutor()
{
super(1, Integer.MAX_VALUE, "ValidationExecutor", new SynchronousQueue<Runnable>());
}
}
public interface CompactionExecutorStatsCollector
{
void beginCompaction(CompactionInfo.Holder ci);
void finishCompaction(CompactionInfo.Holder ci);
}
public List<CompactionInfo> getCompactions()
{
List<CompactionInfo> out = new ArrayList<CompactionInfo>();
for (CompactionInfo.Holder ci : CompactionExecutor.getCompactions())
out.add(ci.getCompactionInfo());
return out;
}
public List<String> getCompactionSummary()
{
List<String> out = new ArrayList<String>();
for (CompactionInfo.Holder ci : CompactionExecutor.getCompactions())
out.add(ci.getCompactionInfo().toString());
return out;
}
public int getPendingTasks()
{
int n = 0;
for (String tableName : Schema.instance.getTables())
{
for (ColumnFamilyStore cfs : Table.open(tableName).getColumnFamilyStores())
{
n += cfs.getCompactionStrategy().getEstimatedRemainingTasks();
}
}
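// (submitted - completed) across both executors counts queued plus in-flight tasks;
// the per-CFS strategy estimates above cover work that has not been submitted yet.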
return (int) (executor.getTaskCount() + validationExecutor.getTaskCount() - executor.getCompletedTaskCount() - validationExecutor.getCompletedTaskCount()) + n;
}
public long getCompletedTasks()
{
return executor.getCompletedTaskCount() + validationExecutor.getCompletedTaskCount();
}
private static class SimpleFuture implements Future
{
private Runnable runnable;
private SimpleFuture(Runnable r)
{
runnable = r;
}
public boolean cancel(boolean mayInterruptIfRunning)
{
throw new IllegalStateException("May not call SimpleFuture.cancel()");
}
public boolean isCancelled()
{
return false;
}
public boolean isDone()
{
return runnable == null;
}
public Object get() throws InterruptedException, ExecutionException
{
runnable.run();
runnable = null;
return null;
}
public Object get(long timeout, TimeUnit unit) throws InterruptedException, ExecutionException, TimeoutException
{
throw new IllegalStateException("May not call SimpleFuture.get(long, TimeUnit)");
}
}
private static class CleanupInfo implements CompactionInfo.Holder
{
private final SSTableReader sstable;
private final SSTableScanner scanner;
public CleanupInfo(SSTableReader sstable, SSTableScanner scanner)
{
this.sstable = sstable;
this.scanner = scanner;
}
public CompactionInfo getCompactionInfo()
{
try
{
return new CompactionInfo(this.hashCode(),
sstable.descriptor.ksname,
sstable.descriptor.cfname,
OperationType.CLEANUP,
scanner.getFilePointer(),
scanner.getFileLength());
}
catch (Exception e)
{
throw new RuntimeException(e);
}
}
}
private static class ScrubInfo implements CompactionInfo.Holder
{
private final RandomAccessReader dataFile;
private final SSTableReader sstable;
public ScrubInfo(RandomAccessReader dataFile, SSTableReader sstable)
{
this.dataFile = dataFile;
this.sstable = sstable;
}
public CompactionInfo getCompactionInfo()
{
try
{
return new CompactionInfo(this.hashCode(),
sstable.descriptor.ksname,
sstable.descriptor.cfname,
OperationType.SCRUB,
dataFile.getFilePointer(),
dataFile.length());
}
catch (Exception e)
{
throw new RuntimeException(e);
}
}
}
}