Source Code of org.apache.hadoop.hbase.regionserver.HStore

/**
* Copyright 2007 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.regionserver;

import java.io.EOFException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HStoreKey;
import org.apache.hadoop.hbase.RemoteExceptionHandler;
import org.apache.hadoop.hbase.filter.RowFilterInterface;
import org.apache.hadoop.hbase.io.Cell;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.StringUtils;

/**
* HStore maintains a bunch of data files.  It is responsible for maintaining
* the memory/file hierarchy and for periodic flushes to disk and compacting
* edits to the file.
*
* Locking and transactions are handled at a higher level.  This API should not
* be called directly by any writer, but rather by an HRegion manager.
*/
public class HStore implements HConstants {
  static final Log LOG = LogFactory.getLog(HStore.class);

  /*
   * Regex that will work for straight filenames and for reference names.
   * If a reference, then the regex has more than just one group.  Group 1 is
   * this file's id.  Group 2 is the referenced region name, etc.
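   * For example, "1316178070" parses as a plain file name (group 1 only),
   * while "1316178070.encodedRegionName" parses as a reference, with group 2
   * being the referenced region's name (values here are illustrative only).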
   */
  private static final Pattern REF_NAME_PARSER =
    Pattern.compile("^(\\d+)(?:\\.(.+))?$");
 
  protected final Memcache memcache;
  private final Path basedir;
  private final HRegionInfo info;
  private final HColumnDescriptor family;
  private final SequenceFile.CompressionType compression;
  final FileSystem fs;
  private final HBaseConfiguration conf;
  protected long ttl;

  private final long desiredMaxFileSize;
  private volatile long storeSize;

  private final Integer flushLock = new Integer(0);

  final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();

  final byte [] storeName;
  private final String storeNameStr;

  /*
   * Sorted Map of store files keyed by sequence id (most recent is last in
   * the map).
   */
  private final SortedMap<Long, HStoreFile> storefiles =
    Collections.synchronizedSortedMap(new TreeMap<Long, HStoreFile>());
 
  /*
   * Sorted Map of readers keyed by sequence id (Most recent is last in list).
   */
  private final SortedMap<Long, MapFile.Reader> readers =
    new TreeMap<Long, MapFile.Reader>();

  // The most-recent log-seq-ID that's present.  The most-recent such ID means
  // we can ignore all log messages up to and including that ID (because they're
  // already reflected in the TreeMaps).
  private volatile long maxSeqId;
 
  private final Path compactionDir;
  private final Integer compactLock = new Integer(0);
  private final int compactionThreshold;
  private final Set<ChangedReadersObserver> changedReaderObservers =
    Collections.synchronizedSet(new HashSet<ChangedReadersObserver>());

  /**
   * An HStore is a set of zero or more MapFiles, which stretch backwards over
   * time.  A given HStore is responsible for a certain set of columns for a
   * row in the HRegion.
   *
   * <p>The HRegion starts writing to its set of HStores when the HRegion's
   * memcache is flushed.  This results in a round of new MapFiles, one for
   * each HStore.
   *
   * <p>There's no reason to consider append-logging at this level; all logging
   * and locking is handled at the HRegion level.  HStore just provides
   * services to manage sets of MapFiles.  One of the most important of those
   * services is MapFile-compaction services.
   *
   * <p>The only thing having to do with logs that HStore needs to deal with is
   * the reconstructionLog.  This is a segment of an HRegion's log that might
   * NOT be present upon startup.  If the param is NULL, there's nothing to do.
   * If the param is non-NULL, we need to process the log to reconstruct
   * a TreeMap that might not have been written to disk before the process
   * died.
   *
   * <p>It's assumed that after this constructor returns, the reconstructionLog
   * file will be deleted (by whoever has instantiated the HStore).
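   *
   * <p>For illustration only, a hosting region might construct a store
   * roughly like this (a sketch; the real caller is HRegion, which supplies
   * actual paths, any reconstruction log, and a progress reporter):
   * <pre>
   * HStore store = new HStore(regionBasedir, regionInfo, columnDescriptor,
   *   fs, null, conf, null);
   * </pre>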
   *
   * @param basedir qualified path under which the region directory lives
   * @param info HRegionInfo for this region
   * @param family HColumnDescriptor for this column
   * @param fs file system object
   * @param reconstructionLog existing log file to apply if any
   * @param conf configuration object
   * @param reporter Call on a period so hosting server can report we're
   * making progress to master -- otherwise master might think region deploy
   * failed.  Can be null.
   * @throws IOException
   */
  protected HStore(Path basedir, HRegionInfo info, HColumnDescriptor family,
      FileSystem fs, Path reconstructionLog, HBaseConfiguration conf,
      final Progressable reporter)
  throws IOException {
    this.basedir = basedir;
    this.info = info;
    this.family = family;
    this.fs = fs;
    this.conf = conf;
    this.ttl = family.getTimeToLive();
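    // The descriptor's time-to-live is expressed in seconds; convert it to
    // milliseconds so it can be compared against the
    // System.currentTimeMillis()-based timestamps used in isExpired().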
    if (ttl != HConstants.FOREVER)
      this.ttl *= 1000;
    this.memcache = new Memcache(this.ttl);
    this.compactionDir = HRegion.getCompactionDir(basedir);
    this.storeName = Bytes.toBytes(this.info.getEncodedName() + "/" +
      Bytes.toString(this.family.getName()));
    this.storeNameStr = Bytes.toString(this.storeName);

    // By default, we compact if an HStore has more than
    // MIN_COMMITS_FOR_COMPACTION map files
    this.compactionThreshold =
      conf.getInt("hbase.hstore.compactionThreshold", 3);
   
    // By default we split region if a file > DEFAULT_MAX_FILE_SIZE.
    long maxFileSize = info.getTableDesc().getMaxFileSize();
    if (maxFileSize == HConstants.DEFAULT_MAX_FILE_SIZE) {
      maxFileSize = conf.getLong("hbase.hregion.max.filesize",
        HConstants.DEFAULT_MAX_FILE_SIZE);
    }
    this.desiredMaxFileSize = maxFileSize;

    this.storeSize = 0L;

    if (family.getCompression() == HColumnDescriptor.CompressionType.BLOCK) {
      this.compression = SequenceFile.CompressionType.BLOCK;
    } else if (family.getCompression() ==
      HColumnDescriptor.CompressionType.RECORD) {
      this.compression = SequenceFile.CompressionType.RECORD;
    } else {
      this.compression = SequenceFile.CompressionType.NONE;
    }
   
    Path mapdir = HStoreFile.getMapDir(basedir, info.getEncodedName(),
        family.getName());
    if (!fs.exists(mapdir)) {
      fs.mkdirs(mapdir);
    }
    Path infodir = HStoreFile.getInfoDir(basedir, info.getEncodedName(),
        family.getName());
    if (!fs.exists(infodir)) {
      fs.mkdirs(infodir);
    }
   
    // Go through the 'mapdir' and 'infodir' together, make sure that all
    // MapFiles are in a reliable state.  Every entry in 'mapdir' must have a
    // corresponding one in 'infodir'.  Without a corresponding info file,
    // the entry in 'mapdir' must be deleted.
    // loadHStoreFiles also computes the max sequence id internally.
    this.maxSeqId = -1L;
    this.storefiles.putAll(loadHStoreFiles(infodir, mapdir));
    if (LOG.isDebugEnabled() && this.storefiles.size() > 0) {
      LOG.debug("Loaded " + this.storefiles.size() + " file(s) in hstore " +
        Bytes.toString(this.storeName) + ", max sequence id " + this.maxSeqId);
    }
   
    try {
      doReconstructionLog(reconstructionLog, maxSeqId, reporter);
    } catch (EOFException e) {
      // Presume we got here because of lack of HADOOP-1700; for now keep going
      // but this is probably not what we want long term.  If we got here there
      // has been data-loss
      LOG.warn("Exception processing reconstruction log " + reconstructionLog +
        " opening " + this.storeName +
        " -- continuing.  Probably lack-of-HADOOP-1700 causing DATA LOSS!", e);
    } catch (IOException e) {
      // Presume we got here because of some HDFS issue. Don't just keep going.
      // Fail to open the HStore.  Probably means we'll fail over and over
      // again until human intervention but alternative has us skipping logs
      // and losing edits: HBASE-642.
      LOG.warn("Exception processing reconstruction log " + reconstructionLog +
        " opening " + this.storeName, e);
      throw e;
    }

    // Finally, start up all the map readers! (There could be more than one
    // since we haven't compacted yet.)
    boolean first = true;
    for(Map.Entry<Long, HStoreFile> e: this.storefiles.entrySet()) {
      MapFile.Reader r = null;
      if (first) {
        // Use a block cache (if configured) for the first reader only
        // so as to control memory usage.
        r = e.getValue().getReader(this.fs, this.family.isBloomfilter(),
          family.isBlockCacheEnabled());
        first = false;
      } else {
        r = e.getValue().getReader(this.fs, this.family.isBloomfilter(),
            false);
      }
      this.readers.put(e.getKey(), r);
    }
  }

  HColumnDescriptor getFamily() {
    return this.family;
  }
 
  long getMaxSequenceId() {
    return this.maxSeqId;
  }
 
  /*
   * Read the reconstructionLog to see whether we need to build a brand-new
   * MapFile out of non-flushed log entries. 
   *
   * We can ignore any log message that has a sequence ID that's equal to or
   * lower than maxSeqID.  (Because we know such log messages are already
   * reflected in the MapFiles.)
   */
  private void doReconstructionLog(final Path reconstructionLog,
    final long maxSeqID, final Progressable reporter)
  throws UnsupportedEncodingException, IOException {
    if (reconstructionLog == null || !fs.exists(reconstructionLog)) {
      // Nothing to do.
      return;
    }
    // Check it's not empty.
    FileStatus[] stats = fs.listStatus(reconstructionLog);
    if (stats == null || stats.length == 0) {
      LOG.warn("Passed reconstruction log " + reconstructionLog + " is zero-length");
      return;
    }
    long maxSeqIdInLog = -1;
    TreeMap<HStoreKey, byte []> reconstructedCache =
      new TreeMap<HStoreKey, byte []>();
     
    SequenceFile.Reader logReader = new SequenceFile.Reader(this.fs,
        reconstructionLog, this.conf);
   
    try {
      HLogKey key = new HLogKey();
      HLogEdit val = new HLogEdit();
      long skippedEdits = 0;
      long editsCount = 0;
      // How many edits to apply before we send a progress report.
      int reportInterval = this.conf.getInt("hbase.hstore.report.interval.edits", 2000);
      while (logReader.next(key, val)) {
        maxSeqIdInLog = Math.max(maxSeqIdInLog, key.getLogSeqNum());
        if (key.getLogSeqNum() <= maxSeqID) {
          skippedEdits++;
          continue;
        }
        // Check this edit is for me. Also, guard against writing
        // METACOLUMN info such as HBASE::CACHEFLUSH entries
        byte [] column = val.getColumn();
        if (Bytes.equals(column, HLog.METACOLUMN)
            || !Bytes.equals(key.getRegionName(), info.getRegionName())
            || !HStoreKey.matchingFamily(family.getName(), column)) {
          continue;
        }
        HStoreKey k = new HStoreKey(key.getRow(), column, val.getTimestamp());
        reconstructedCache.put(k, val.getVal());
        editsCount++;
        // Every 2k edits, tell the reporter we're making progress.
        // Have seen 60k edits taking 3minutes to complete.
        if (reporter != null && (editsCount % reportInterval) == 0) {
          reporter.progress();
        }
      }
      if (LOG.isDebugEnabled()) {
        LOG.debug("Applied " + editsCount + ", skipped " + skippedEdits +
          " because sequence id <= " + maxSeqID);
      }
    } finally {
      logReader.close();
    }
   
    if (reconstructedCache.size() > 0) {
      // We create a "virtual flush" at maxSeqIdInLog+1.
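      // (Presumably the +1 ensures the flushed file's recorded sequence id is
      // not lower than that of any edit just replayed from the log.)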
      if (LOG.isDebugEnabled()) {
        LOG.debug("flushing reconstructionCache");
      }
      internalFlushCache(reconstructedCache, maxSeqIdInLog + 1);
    }
  }
 
  /*
   * Creates a series of HStoreFiles loaded from the given directory.
   * There must be a matching 'mapdir' and 'loginfo' pair of files.
   * If only one exists, we'll delete it.  Also performs other consistency
   * checks, e.g. that files are not zero-length.
   *
   * @param infodir qualified path for info file directory
   * @param mapdir qualified path for map file directory
   * @throws IOException
   */
  private SortedMap<Long, HStoreFile> loadHStoreFiles(Path infodir, Path mapdir)
  throws IOException {
    // Look first at info files.  If a reference, these contain info we need
    // to create the HStoreFile.
    FileStatus infofiles[] = fs.listStatus(infodir);
    SortedMap<Long, HStoreFile> results = new TreeMap<Long, HStoreFile>();
    ArrayList<Path> mapfiles = new ArrayList<Path>(infofiles.length);
    for (int i = 0; i < infofiles.length; i++) {
      Path p = infofiles[i].getPath();
      // Check for empty info file.  Should never be the case but can happen
      // after data loss in hdfs for whatever reason (upgrade, etc.): HBASE-646
      if (this.fs.getFileStatus(p).getLen() <= 0) {
        LOG.warn("Skipping " + p + " because its empty.  DATA LOSS?  Can " +
          "this scenario be repaired?  HBASE-646");
        continue;
      }

      Matcher m = REF_NAME_PARSER.matcher(p.getName());
      /*
       *  *  *  *  *  N O T E  *  *  *  *  *
       * 
       *  We call isReference(Path, Matcher) here because it calls
       *  Matcher.matches() which must be called before Matcher.group(int)
       *  and we don't want to call Matcher.matches() twice.
       * 
       *  *  *  *  *  N O T E  *  *  *  *  *
       */
      boolean isReference = isReference(p, m);
      long fid = Long.parseLong(m.group(1));
     
      HStoreFile curfile = null;
      HStoreFile.Reference reference = null;
      if (isReference) {
        reference = HStoreFile.readSplitInfo(p, fs);
      }
      curfile = new HStoreFile(conf, fs, basedir, info.getEncodedName(),
        family.getName(), fid, reference);
      long storeSeqId = -1;
      try {
        storeSeqId = curfile.loadInfo(fs);
        if (storeSeqId > this.maxSeqId) {
          this.maxSeqId = storeSeqId;
        }
      } catch (IOException e) {
        // If the HSTORE_LOGINFOFILE doesn't contain a number, just ignore it.
        // That means it was built prior to the previous run of HStore, and so
        // it cannot contain any updates also contained in the log.
        LOG.info("HSTORE_LOGINFOFILE " + curfile +
          " does not contain a sequence number - ignoring");
      }
      Path mapfile = curfile.getMapFilePath();
      if (!fs.exists(mapfile)) {
        fs.delete(curfile.getInfoFilePath(), false);
        LOG.warn("Mapfile " + mapfile.toString() + " does not exist. " +
          "Cleaned up info file.  Continuing...Probable DATA LOSS!!!");
        continue;
      }
      if (isEmptyDataFile(mapfile)) {
        curfile.delete();
        // We can have empty data file if data loss in hdfs.
        LOG.warn("Mapfile " + mapfile.toString() + " has empty data. " +
          "Deleting.  Continuing...Probable DATA LOSS!!!  See HBASE-646.");
        continue;
      }
      if (isEmptyIndexFile(mapfile)) {
        try {
          // Try fixing this file.. if we can.  Use the hbase version of fix.
          // Need to remove the old index file first else fix won't go ahead.
          this.fs.delete(new Path(mapfile, MapFile.INDEX_FILE_NAME), false);
          long count = MapFile.fix(this.fs, mapfile, HStoreFile.HbaseMapFile.KEY_CLASS,
            HStoreFile.HbaseMapFile.VALUE_CLASS, false, this.conf);
          if (LOG.isDebugEnabled()) {
            LOG.debug("Fixed index on " + mapfile.toString() + "; had " +
              count + " entries");
          }
        } catch (Exception e) {
          LOG.warn("Failed fix of " + mapfile.toString() +
            "...continuing; Probable DATA LOSS!!!", e);
          continue;
        }
      }
      storeSize += curfile.length();
     
      // TODO: Confirm referent exists.
     
      // Found map and sympathetic info file.  Add this hstorefile to result.
      if (LOG.isDebugEnabled()) {
        LOG.debug("loaded " + FSUtils.getPath(p) + ", isReference=" +
          isReference + ", sequence id=" + storeSeqId);
      }
      results.put(Long.valueOf(storeSeqId), curfile);
      // Keep list of sympathetic data mapfiles for cleaning info dir in next
      // section.  Make sure path is fully qualified for compare.
      mapfiles.add(mapfile);
    }
   
    // In our experience, listing paths returns fully qualified names -- at
    // least when running on a mini hdfs cluster.
    FileStatus datfiles[] = fs.listStatus(mapdir);
    for (int i = 0; i < datfiles.length; i++) {
      Path p = datfiles[i].getPath();
      // If does not have sympathetic info file, delete.
      if (!mapfiles.contains(fs.makeQualified(p))) {
        fs.delete(p, true);
      }
    }
    return results;
  }

  /*
   * @param mapfile
   * @return True if the passed mapfile has a zero-length data component (it's
   * broken).
   * @throws IOException
   */
  private boolean isEmptyDataFile(final Path mapfile)
  throws IOException {
    // Mapfiles are made of 'data' and 'index' files.  Confirm the 'data' file
    // is non-empty if it exists (it may not have been written to yet).
    return isEmptyFile(new Path(mapfile, MapFile.DATA_FILE_NAME));
  }

  /*
   * @param mapfile
   * @return True if the passed mapfile has a zero-length index component (it's
   * broken).
   * @throws IOException
   */
  private boolean isEmptyIndexFile(final Path mapfile)
  throws IOException {
    // Mapfiles are made of 'data' and 'index' files.  Confirm the 'index' file
    // is non-empty if it exists (it may not have been written to yet).
    return isEmptyFile(new Path(mapfile, MapFile.INDEX_FILE_NAME));
  }

  /*
   * @param f
   * @return True if the passed file exists and is zero-length.
   * @throws IOException
   */
  private boolean isEmptyFile(final Path f)
  throws IOException {
    return this.fs.exists(f) &&
      this.fs.getFileStatus(f).getLen() == 0;
  }

  /**
   * Adds a value to the memcache
   *
   * @param key
   * @param value
   * @return memcache size delta
   */
  protected long add(HStoreKey key, byte[] value) {
    lock.readLock().lock();
    try {
      return this.memcache.add(key, value);
    } finally {
      lock.readLock().unlock();
    }
  }
 
  /**
   * Close all the MapFile readers
   *
   * We don't need to worry about subsequent requests because the HRegion holds
   * a write lock that will prevent any more reads or writes.
   *
   * @throws IOException
   */
  List<HStoreFile> close() throws IOException {
    ArrayList<HStoreFile> result = null;
    this.lock.writeLock().lock();
    try {
      for (MapFile.Reader reader: this.readers.values()) {
        reader.close();
      }
      synchronized (this.storefiles) {
        result = new ArrayList<HStoreFile>(storefiles.values());
      }
      LOG.debug("closed " + this.storeNameStr);
      return result;
    } finally {
      this.lock.writeLock().unlock();
    }
  }
 
  //////////////////////////////////////////////////////////////////////////////
  // Flush changes to disk
  //////////////////////////////////////////////////////////////////////////////

  /**
   * Snapshot this store's memcache.  Call before running
   * {@link #flushCache(long)} so it has some work to do.
   */
  void snapshot() {
    this.memcache.snapshot();
  }
 
  /**
   * Write out current snapshot.  Presumes {@link #snapshot()} has been called
   * previously.
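   * <p>A minimal sketch of the two-step flush protocol from the caller's
   * side (the hosting region drives this in practice):
   * <pre>
   * store.snapshot();
   * boolean compactionRequested = store.flushCache(sequenceId);
   * </pre>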
   * @param logCacheFlushId flush sequence number
   * @return true if a compaction is needed
   * @throws IOException
   */
  boolean flushCache(final long logCacheFlushId) throws IOException {
    // Get the snapshot to flush.  Presumes that a call to
    // this.memcache.snapshot() has happened earlier up in the chain.
    SortedMap<HStoreKey, byte []> cache = this.memcache.getSnapshot();
    boolean compactionNeeded = internalFlushCache(cache, logCacheFlushId);
    // If an exception happens flushing, we let it out without clearing
    // the memcache snapshot.  The old snapshot will be returned when we say
    // 'snapshot', the next time flush comes around.
    this.memcache.clearSnapshot(cache);
    return compactionNeeded;
  }
 
  private boolean internalFlushCache(SortedMap<HStoreKey, byte []> cache,
      long logCacheFlushId) throws IOException {
    long flushed = 0;
    // Don't flush if there are no entries.
    if (cache.size() == 0) {
      return false;
    }
   
    // TODO:  We can fail in the below block before we complete adding this
    // flush to list of store files.  Add cleanup of anything put on filesystem
    // if we fail.
    synchronized(flushLock) {
      long now = System.currentTimeMillis();
      // A. Write the Maps out to the disk
      HStoreFile flushedFile = new HStoreFile(conf, fs, basedir,
        info.getEncodedName(),  family.getName(), -1L, null);
      MapFile.Writer out = flushedFile.getWriter(this.fs, this.compression,
        this.family.isBloomfilter(), cache.size());
      out.setIndexInterval(family.getMapFileIndexInterval());
     
      // Here we tried picking up an existing HStoreFile from disk and
      // interlacing the memcache flush compacting as we go.  The notion was
      // that interlacing would take as long as a pure flush with the added
      // benefit of having one less file in the store.  Experiments showed that
      // it takes two to three times the amount of time flushing -- more column
      // families makes it so the two timings come closer together -- but it
      // also complicates the flush. The code was removed.  Needed work picking
      // which file to interlace (favor references first, etc.)
      //
      // Related, looks like 'merging compactions' in BigTable paper interlaces
      // a memcache flush.  We don't.
      int entries = 0;
      try {
        for (Map.Entry<HStoreKey, byte []> es: cache.entrySet()) {
          HStoreKey curkey = es.getKey();
          byte[] bytes = es.getValue();
          if (HStoreKey.matchingFamily(this.family.getName(), curkey.getColumn())) {
            if (!isExpired(curkey, ttl, now)) {
              entries++;
              out.append(curkey, new ImmutableBytesWritable(bytes));
              flushed += curkey.getSize() + (bytes == null ? 0 : bytes.length);
            }
          }
        }
      } finally {
        out.close();
      }
      long newStoreSize = flushedFile.length();
      storeSize += newStoreSize;

      // B. Write out the log sequence number that corresponds to this output
      // MapFile.  The MapFile is current up to and including the log seq num.
      flushedFile.writeInfo(fs, logCacheFlushId);
     
      // C. Finally, make the new MapFile available.
      updateReaders(logCacheFlushId, flushedFile);
      if(LOG.isDebugEnabled()) {
        LOG.debug("Added " + FSUtils.getPath(flushedFile.getMapFilePath()) +
          " with " + entries +
          " entries, sequence id " + logCacheFlushId + ", data size " +
          StringUtils.humanReadableInt(flushed) + ", file size " +
          StringUtils.humanReadableInt(newStoreSize));
      }
    }
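    // Signal the caller to request a compaction if this flush has brought us
    // to the configured store file threshold.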
    return storefiles.size() >= compactionThreshold;
  }
 
  /*
   * Update the set of readers, adding the Reader produced by this new flush.
   * @param logCacheFlushId
   * @param flushedFile
   * @throws IOException
   */
  private void updateReaders(final long logCacheFlushId,
      final HStoreFile flushedFile)
  throws IOException {
    this.lock.writeLock().lock();
    try {
      Long flushid = Long.valueOf(logCacheFlushId);
      // Open the map file reader.
      this.readers.put(flushid,
        flushedFile.getReader(this.fs, this.family.isBloomfilter(),
        this.family.isBlockCacheEnabled()));
      this.storefiles.put(flushid, flushedFile);
      // Tell listeners of the change in readers.
      notifyChangedReadersObservers();
    } finally {
      this.lock.writeLock().unlock();
    }
  }
 
  /*
   * Notify all observers that set of Readers has changed.
   * @throws IOException
   */
  private void notifyChangedReadersObservers() throws IOException {
    synchronized (this.changedReaderObservers) {
      for (ChangedReadersObserver o: this.changedReaderObservers) {
        o.updateReaders();
      }
    }
  }
 
  /*
   * @param o Observer who wants to know about changes in set of Readers
   */
  void addChangedReaderObserver(ChangedReadersObserver o) {
    this.changedReaderObservers.add(o);
  }
 
  /*
   * @param o Observer no longer interested in changes in set of Readers.
   */
  void deleteChangedReaderObserver(ChangedReadersObserver o) {
    if (!this.changedReaderObservers.remove(o)) {
      LOG.warn("Not in set" + o);
    }
  }

  //////////////////////////////////////////////////////////////////////////////
  // Compaction
  //////////////////////////////////////////////////////////////////////////////

  /*
   * @param files
   * @return True if any of the files in <code>files</code> are References.
   */
  private boolean hasReferences(Collection<HStoreFile> files) {
    if (files != null && files.size() > 0) {
      for (HStoreFile hsf: files) {
        if (hsf.isReference()) {
          return true;
        }
      }
    }
    return false;
  }
 
  /**
   * Compact the backing store files.  This method may take some time, so the
   * calling thread must be able to block for long periods.
   *
   * <p>During this time, the HStore can work as usual, getting values from
   * MapFiles and writing new MapFiles from the Memcache.
   *
   * Existing MapFiles are not destroyed until the new compacted TreeMap is
   * completely written-out to disk.
   *
   * The compactLock prevents multiple simultaneous compactions.
   * The structureLock prevents us from interfering with other write operations.
   *
   * We don't want to hold the structureLock for the whole time, as a compact()
   * can be lengthy and we want to allow cache-flushes during this period.
   *
   * @param force True to force a compaction regardless of thresholds (Needed
   * by merge).
   * @return mid key if a split is needed, null otherwise
   * @throws IOException
   */
  StoreSize compact(final boolean force) throws IOException {
    synchronized (compactLock) {
      long maxId = -1;
      int nrows = -1;
      List<HStoreFile> filesToCompact = null;
      synchronized (storefiles) {
        if (this.storefiles.size() <= 0) {
          return null;
        }
        filesToCompact = new ArrayList<HStoreFile>(this.storefiles.values());

        // The max-sequenceID in any of the to-be-compacted TreeMaps is the
        // last key of storefiles.
        maxId = this.storefiles.lastKey().longValue();
      }
      if (!force && !hasReferences(filesToCompact) &&
          filesToCompact.size() < compactionThreshold) {
        return checkSplit();
      }
      if (!fs.exists(compactionDir) && !fs.mkdirs(compactionDir)) {
        LOG.warn("Mkdir on " + compactionDir.toString() + " failed");
        return checkSplit();
      }

      // HBASE-745, preparing all store file size for incremental compacting
      // selection.
      int countOfFiles = filesToCompact.size();
      long totalSize = 0;
      long[] fileSizes = new long[countOfFiles];
      long skipped = 0;
      int point = 0;
      for (int i = 0; i < countOfFiles; i++) {
        HStoreFile file = filesToCompact.get(i);
        Path path = file.getMapFilePath();
        int len = 0;
        for (FileStatus fstatus:fs.listStatus(path)) {
          len += fstatus.getLen();
        }
        fileSizes[i] = len;
        totalSize += len;
      }
      if (!force && !hasReferences(filesToCompact)) {
        // Here we select files for incremental compaction.
        // The rule is: if the largest (oldest) file is more than twice the
        // size of the next one, skip it and continue to the next,
        // until we meet the compactionThreshold limit.
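        // For example (sizes illustrative only): with file sizes [20, 5, 4, 3]
        // and compactionThreshold 3, 20 >= 2 * 5 so the 20 is skipped; then
        // 5 < 2 * 4 so selection stops and [5, 4, 3] get compacted.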
        for (point = 0; point < compactionThreshold - 1; point++) {
          if (fileSizes[point] < fileSizes[point + 1] * 2) {
            break;
          }
          skipped += fileSizes[point];
        }
        filesToCompact = new ArrayList<HStoreFile>(filesToCompact.subList(point,
          countOfFiles));
        if (LOG.isDebugEnabled()) {
          LOG.debug("Compaction size of " + this.storeNameStr + ": " +
            StringUtils.humanReadableInt(totalSize) + ", skipped " + point +
            ", " + skipped);
        }
      }

      /*
       * We create a new list of MapFile.Reader objects so we don't screw up
       * the caching associated with the currently-loaded ones. Our iteration-
       * based access pattern is practically designed to ruin the cache.
       */
      List<MapFile.Reader> readers = new ArrayList<MapFile.Reader>();
      for (HStoreFile file: filesToCompact) {
        try {
          HStoreFile.BloomFilterMapFile.Reader reader =
            file.getReader(fs, false, false);
          readers.add(reader);
         
          // Compute the size of the new bloomfilter if needed
          if (this.family.isBloomfilter()) {
            nrows += reader.getBloomFilterSize();
          }
        } catch (IOException e) {
          // Add info about which file threw exception. It may not be in the
          // exception message so output a message here where we know the
          // culprit.
          LOG.warn("Failed with " + e.toString() + ": " + file.toString());
          closeCompactionReaders(readers);
          throw e;
        }
      }
     
      // Storefiles are keyed by sequence id, so the oldest file comes first.
      // For compaction we want the newest file first in the list.
      Collections.reverse(readers);

      // Step through them, writing to the brand-new MapFile
      HStoreFile compactedOutputFile = new HStoreFile(conf, fs,
          this.compactionDir, info.getEncodedName(), family.getName(),
          -1L, null);
      if (LOG.isDebugEnabled()) {
        LOG.debug("started compaction of " + readers.size() + " files into " +
          FSUtils.getPath(compactedOutputFile.getMapFilePath()));
      }
      MapFile.Writer writer = compactedOutputFile.getWriter(this.fs,
        this.compression, this.family.isBloomfilter(), nrows);
      writer.setIndexInterval(family.getMapFileIndexInterval());
      try {
        compactHStoreFiles(writer, readers);
      } finally {
        writer.close();
      }

      // Now, write out an HSTORE_LOGINFOFILE for the brand-new TreeMap.
      compactedOutputFile.writeInfo(fs, maxId);

      // Move the compaction into place.
      completeCompaction(filesToCompact, compactedOutputFile);
      if (LOG.isDebugEnabled()) {
        LOG.debug("Completed compaction of " + this.storeNameStr +
          " store size is " + StringUtils.humanReadableInt(storeSize));
      }
    }
    return checkSplit();
  }

  /*
   * Compact a list of MapFile.Readers into a single MapFile.Writer.
   *
   * We work by iterating through the readers in parallel. We always increment
   * the lowest-ranked one.
   * Updates to a single row/column will appear ranked by timestamp. This allows
   * us to throw out deleted values or obsolete versions.
   */
  private void compactHStoreFiles(final MapFile.Writer compactedOut,
      final List<MapFile.Reader> readers)
  throws IOException {
    MapFile.Reader[] rdrs = readers.toArray(new MapFile.Reader[readers.size()]);
    try {
      HStoreKey[] keys = new HStoreKey[rdrs.length];
      ImmutableBytesWritable[] vals = new ImmutableBytesWritable[rdrs.length];
      boolean[] done = new boolean[rdrs.length];
      for(int i = 0; i < rdrs.length; i++) {
        keys[i] = new HStoreKey();
        vals[i] = new ImmutableBytesWritable();
        done[i] = false;
      }

      // Now, advance through the readers in order.  This will have the
      // effect of a run-time sort of the entire dataset.
      int numDone = 0;
      for(int i = 0; i < rdrs.length; i++) {
        rdrs[i].reset();
        done[i] = ! rdrs[i].next(keys[i], vals[i]);
        if(done[i]) {
          numDone++;
        }
      }

      long now = System.currentTimeMillis();
      int timesSeen = 0;
      byte [] lastRow = null;
      byte [] lastColumn = null;

      while (numDone < done.length) {
        // Find the reader with the smallest key.  If two files have same key
        // but different values -- i.e. one is delete and other is non-delete
        // value -- we will find the first, the one that was written later and
        // therefore the one whose value should make it out to the compacted
        // store file.
        int smallestKey = -1;
        for (int i = 0; i < rdrs.length; i++) {
          if(done[i]) {
            continue;
          }
          if(smallestKey < 0) {
            smallestKey = i;
          } else {
            if(keys[i].compareTo(keys[smallestKey]) < 0) {
              smallestKey = i;
            }
          }
        }

        // Reflect the current key/val in the output
        HStoreKey sk = keys[smallestKey];
        if (Bytes.equals(lastRow, sk.getRow())
            && Bytes.equals(lastColumn, sk.getColumn())) {
          timesSeen++;
        } else {
          timesSeen = 0;
        }

        if (timesSeen <= family.getMaxVersions()) {
          // Keep old versions until we have maxVersions worth.
          // Then just skip them.
          if (sk.getRow().length != 0 && sk.getColumn().length != 0) {
            // Only write out objects which have a non-zero length key and
            // value
            if (!isExpired(sk, ttl, now)) {
              compactedOut.append(sk, vals[smallestKey]);
            }
          }
        }

        // Update last-seen items
        lastRow = sk.getRow();
        lastColumn = sk.getColumn();

        // Advance the smallest key.  If that reader's all finished, then
        // mark it as done.
        if (!rdrs[smallestKey].next(keys[smallestKey], vals[smallestKey])) {
          done[smallestKey] = true;
          rdrs[smallestKey].close();
          rdrs[smallestKey] = null;
          numDone++;
        }
      }
    } finally {
      closeCompactionReaders(Arrays.asList(rdrs));
    }
  }
 
  private void closeCompactionReaders(final List<MapFile.Reader> rdrs) {
    for (MapFile.Reader r: rdrs) {
      try {
        if (r != null) {
          r.close();
        }
      } catch (IOException e) {
        LOG.warn("Exception closing reader for " + this.storeNameStr, e);
      }
    }
  }

  /*
   * Check if this cell is deleted.
   * If asked to check the memcache, first ask it whether the key is deleted.
   * Next, check whether the passed deletes map already records a delete for
   * this column and timestamp.  Otherwise, check whether the value is the
   * delete marker (see {@link HLogEdit#isDeleted(byte[])}).  If the passed
   * value IS the delete marker, it is added to the passed deletes map.
   * @param hsk
   * @param value
   * @param checkMemcache true if the memcache should be consulted
   * @param deletes Map keyed by column with a list of delete timestamps as
   * value. Can be null.  If non-null and the passed value is the delete
   * marker, then we add to this map.
   * @return True if this is a deleted cell.  Adds to the passed deletes map
   * if the passed value is the delete marker.
   */
  private boolean isDeleted(final HStoreKey hsk, final byte [] value,
      final boolean checkMemcache, final Map<byte [], List<Long>> deletes) {
    if (checkMemcache && memcache.isDeleted(hsk)) {
      return true;
    }
    List<Long> timestamps =
      (deletes == null) ? null: deletes.get(hsk.getColumn());
    if (timestamps != null &&
        timestamps.contains(Long.valueOf(hsk.getTimestamp()))) {
      return true;
    }
    if (value == null) {
      // If a null value, shouldn't be in here.  Mark it as deleted cell.
      return true;
    }
    if (!HLogEdit.isDeleted(value)) {
      return false;
    }
    // Cell has delete value.  Save it into deletes.
    if (deletes != null) {
      if (timestamps == null) {
        timestamps = new ArrayList<Long>();
        deletes.put(hsk.getColumn(), timestamps);
      }
      // We know its not already in the deletes array else we'd have returned
      // earlier so no need to test if timestamps already has this value.
      timestamps.add(Long.valueOf(hsk.getTimestamp()));
    }
    return true;
  }
 
  /*
   * It's assumed that the compactLock  will be acquired prior to calling this
   * method!  Otherwise, it is not thread-safe!
   *
   * It works by processing a compaction that's been written to disk.
   *
   * <p>It is usually invoked at the end of a compaction, but might also be
   * invoked at HStore startup, if the prior execution died midway through.
   *
   * <p>Moving the compacted TreeMap into place means:
   * <pre>
   * 1) Moving the new compacted MapFile into place
   * 2) Unload all replaced MapFiles, close and collect list to delete.
   * 3) Loading the new TreeMap.
   * 4) Compute new store size
   * </pre>
   *
   * @param compactedFiles list of files that were compacted
   * @param compactedFile HStoreFile that is the result of the compaction
   * @throws IOException
   */
  private void completeCompaction(final List<HStoreFile> compactedFiles,
    final HStoreFile compactedFile)
  throws IOException {
    this.lock.writeLock().lock();
    try {
      // 1. Moving the new MapFile into place.
      HStoreFile finalCompactedFile = new HStoreFile(conf, fs, basedir,
        info.getEncodedName(), family.getName(), -1, null);
      if (LOG.isDebugEnabled()) {
        LOG.debug("moving " + FSUtils.getPath(compactedFile.getMapFilePath()) +
          " to " + FSUtils.getPath(finalCompactedFile.getMapFilePath()));
      }
      if (!compactedFile.rename(this.fs, finalCompactedFile)) {
        LOG.error("Failed move of compacted file " +
          finalCompactedFile.getMapFilePath().toString());
        return;
      }

      // 2. Unload all replaced MapFiles, close and collect list to delete.
      synchronized (storefiles) {
        Map<Long, HStoreFile> toDelete = new HashMap<Long, HStoreFile>();
        for (Map.Entry<Long, HStoreFile> e : this.storefiles.entrySet()) {
          if (!compactedFiles.contains(e.getValue())) {
            continue;
          }
          Long key = e.getKey();
          MapFile.Reader reader = this.readers.remove(key);
          if (reader != null) {
            reader.close();
          }
          toDelete.put(key, e.getValue());
        }

        try {
          // 3. Loading the new TreeMap.
          // Change this.storefiles so it reflects new state but do not
          // delete old store files until we have sent out notification of
          // change in case old files are still being accessed by outstanding
          // scanners.
          for (Long key : toDelete.keySet()) {
            this.storefiles.remove(key);
          }
          // Add new compacted Reader and store file.
          Long orderVal = Long.valueOf(finalCompactedFile.loadInfo(fs));
          this.readers.put(orderVal,
              // Use a block cache (if configured) for this reader since
              // it is the only one.
              finalCompactedFile.getReader(this.fs,
                  this.family.isBloomfilter(),
                  this.family.isBlockCacheEnabled()));
          this.storefiles.put(orderVal, finalCompactedFile);
          // Tell observers that list of Readers has changed.
          notifyChangedReadersObservers();
          // Finally, delete old store files.
          for (HStoreFile hsf : toDelete.values()) {
            hsf.delete();
          }
        } catch (IOException e) {
          e = RemoteExceptionHandler.checkIOException(e);
          LOG.error("Failed replacing compacted files for " +
            this.storeNameStr +
            ". Compacted file is " + finalCompactedFile.toString() +
            ".  Files replaced are " + compactedFiles.toString() +
            " some of which may have been already removed", e);
        }
        // 4. Compute new store size
        storeSize = 0L;
        for (HStoreFile hsf : storefiles.values()) {
          storeSize += hsf.length();
        }
      }
    } finally {
      this.lock.writeLock().unlock();
    }
  }

  //////////////////////////////////////////////////////////////////////////////
  // Accessors.
  // (This is the only section that is directly useful!)
  //////////////////////////////////////////////////////////////////////////////
 
  /**
   * Return all the available columns for the given key.  The key indicates a
   * row and timestamp, but not a column name.
   *
   * The returned object should map column names to Cells.
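   *
   * <p>A minimal caller-side sketch (variable names are illustrative;
   * <code>columns</code> is the set of wanted column names):
   * <pre>
   * Map&lt;byte [], Cell&gt; results =
   *   new TreeMap&lt;byte [], Cell&gt;(Bytes.BYTES_COMPARATOR);
   * store.getFull(new HStoreKey(row), columns, results);
   * </pre>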
   */
  void getFull(HStoreKey key, final Set<byte []> columns,
      Map<byte [], Cell> results)
  throws IOException {
    Map<byte [], Long> deletes =
      new TreeMap<byte [], Long>(Bytes.BYTES_COMPARATOR);

    // if the key is null, we're not even looking for anything. return.
    if (key == null) {
      return;
    }
   
    this.lock.readLock().lock();
   
    // get from the memcache first.
    memcache.getFull(key, columns, deletes, results);
   
    try {
      MapFile.Reader[] maparray = getReaders();
     
      // examine each mapfile
      for (int i = maparray.length - 1; i >= 0; i--) {
        MapFile.Reader map = maparray[i];
       
        // synchronize on the map so that no one else iterates it at the same
        // time
        getFullFromMapFile(map, key, columns, deletes, results);
      }
     
    } finally {
      this.lock.readLock().unlock();
    }
  }
 
  private void getFullFromMapFile(MapFile.Reader map, HStoreKey key,
    Set<byte []> columns, Map<byte [], Long> deletes, Map<byte [], Cell> results)
  throws IOException {
    synchronized(map) {
      long now = System.currentTimeMillis();

      // seek back to the beginning
      map.reset();
     
      // seek to the closest key that should match the row we're looking for
      ImmutableBytesWritable readval = new ImmutableBytesWritable();
      HStoreKey readkey = (HStoreKey)map.getClosest(key, readval);
      if (readkey == null) {
        return;
      }
      do {
        byte [] readcol = readkey.getColumn();
       
        // if we're looking for this column (or all of them), and there isn't
        // already a value for this column in the results map, and the key we
        // just read matches, then we'll consider it
        if ((columns == null || columns.contains(readcol))
          && !results.containsKey(readcol)
          && key.matchesWithoutColumn(readkey)) {
          // if the value of the cell we're looking at right now is a delete,
          // we need to treat it differently
          if(HLogEdit.isDeleted(readval.get())) {
            // if it's not already recorded as a delete or recorded with a more
            // recent delete timestamp, record it for later
            if (!deletes.containsKey(readcol)
              || deletes.get(readcol).longValue() < readkey.getTimestamp()) {
              deletes.put(readcol, readkey.getTimestamp());             
            }
          } else if (!(deletes.containsKey(readcol)
            && deletes.get(readcol).longValue() >= readkey.getTimestamp()) ) {
            // So the cell itself isn't a delete, but there may be a delete
            // pending from earlier in our search. Only record this result if
            // there aren't any pending deletes.
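            // (Note: this inner test repeats the enclosing else-if condition,
            // so it is always true at this point.)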
            if (!(deletes.containsKey(readcol) &&
                deletes.get(readcol).longValue() >= readkey.getTimestamp())) {
              if (!isExpired(readkey, ttl, now)) {
                results.put(readcol,
                  new Cell(readval.get(), readkey.getTimestamp()));
                // need to reinstantiate the readval so we can reuse it,
                // otherwise next iteration will destroy our result
                readval = new ImmutableBytesWritable();
              }
            }
          }
        } else if (Bytes.compareTo(key.getRow(), readkey.getRow()) < 0) {
          // if we've crossed into the next row, then we can just stop
          // iterating
          break;
        }
       
      } while(map.next(readkey, readval));
    }
  }

  /**
   * @return Array of readers ordered oldest to newest.
   */
  MapFile.Reader [] getReaders() {
    return this.readers.values().
      toArray(new MapFile.Reader[this.readers.size()]);
  }

  /**
   * Get the value for the indicated HStoreKey.  Grab the target value and the
   * previous <code>numVersions - 1</code> values, as well.
   *
   * Use {@link HConstants#ALL_VERSIONS} to retrieve all versions.
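   *
   * <p>A minimal caller-side sketch (row, column and timestamp values are
   * illustrative):
   * <pre>
   * Cell [] versions = store.get(new HStoreKey(row, column, timestamp), 3);
   * </pre>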
   * @param key
   * @param numVersions Number of versions to fetch.  Must be > 0.
   * @return values for the specified versions
   * @throws IOException
   */
  Cell[] get(HStoreKey key, int numVersions) throws IOException {
    if (numVersions <= 0) {
      throw new IllegalArgumentException("Number of versions must be > 0");
    }
   
    this.lock.readLock().lock();
    long now = System.currentTimeMillis();
    try {
      // Check the memcache
      List<Cell> results = this.memcache.get(key, numVersions);
      // If we got sufficient versions from memcache, return.
      if (results.size() == numVersions) {
        return results.toArray(new Cell[results.size()]);
      }

      // Keep a list of deleted cell keys.  We need this because as we go through
      // the store files, the cell with the delete marker may be in one file and
      // the old non-delete cell value in a later store file. If we don't keep
      // around the fact that the cell was deleted in a newer record, we end up
      // returning the old value if user is asking for more than one version.
      // This List of deletes should not be large since we are only keeping rows
      // and columns that match those set on the scanner and which have delete
      // values.  If memory usage becomes an issue, could redo as bloom filter.
      Map<byte [], List<Long>> deletes =
        new TreeMap<byte [], List<Long>>(Bytes.BYTES_COMPARATOR);
      // This code below is very close to the body of the getKeys method.
      MapFile.Reader[] maparray = getReaders();
      for(int i = maparray.length - 1; i >= 0; i--) {
        MapFile.Reader map = maparray[i];
        synchronized(map) {
          map.reset();
          ImmutableBytesWritable readval = new ImmutableBytesWritable();
          HStoreKey readkey = (HStoreKey)map.getClosest(key, readval);
          if (readkey == null) {
            // map.getClosest returns null if the passed key is > than the
            // last key in the map file.  getClosest is a bit of a misnomer
            // since it returns exact match or the next closest key AFTER not
            // BEFORE.
            continue;
          }
          if (!readkey.matchesRowCol(key)) {
            continue;
          }
          if (!isDeleted(readkey, readval.get(), true, deletes)) {
            if (!isExpired(readkey, ttl, now)) {
              results.add(new Cell(readval.get(), readkey.getTimestamp()));
            }
            // Perhaps only one version is wanted.  I could let this
            // test happen later in the for loop test but it would cost
            // the allocation of an ImmutableBytesWritable.
            if (hasEnoughVersions(numVersions, results)) {
              break;
            }
          }
          for (readval = new ImmutableBytesWritable();
              map.next(readkey, readval) &&
              readkey.matchesRowCol(key) &&
              !hasEnoughVersions(numVersions, results);
              readval = new ImmutableBytesWritable()) {
            if (!isDeleted(readkey, readval.get(), true, deletes)) {
              if (!isExpired(readkey, ttl, now)) {
                results.add(new Cell(readval.get(), readkey.getTimestamp()));
              }
            }
          }
        }
        if (hasEnoughVersions(numVersions, results)) {
          break;
        }
      }
      return results.size() == 0 ?
        null : results.toArray(new Cell[results.size()]);
    } finally {
      this.lock.readLock().unlock();
    }
  }
 
  /**
   * Small method to check if we have collected the requested number of
   * versions or have reached this family's maximum number of versions.
   * The latter happens in the situation described in HBASE-621.
   * @param numVersions
   * @param results
   * @return True if enough versions have been collected.
   */
  private boolean hasEnoughVersions(final int numVersions,
      final List<Cell> results) {
    return (results.size() >= numVersions || results.size() >= family
            .getMaxVersions());
  }

  /**
   * Get up to <code>versions</code> keys matching the origin key's
   * row/column/timestamp and those of an older vintage.
   * Default access so can be accessed out of {@link HRegionServer}.
   * @param origin Where to start searching.
   * @param versions How many versions to return. Pass
   * {@link HConstants#ALL_VERSIONS} to retrieve all. Keys found in the
   * memcache count toward this limit.
   * @return List of matching keys: those found in the memcache first, with
   * matching keys found in the store files appended.
   * @throws IOException
   */
  List<HStoreKey> getKeys(final HStoreKey origin, final int versions)
  throws IOException {
     
    List<HStoreKey> keys = this.memcache.getKeys(origin, versions);
    if (keys.size() >= versions) {
      return keys;
    }
   
    // This code below is very close to the body of the get method.
    this.lock.readLock().lock();
    long now = System.currentTimeMillis();
    try {
      MapFile.Reader[] maparray = getReaders();
      for(int i = maparray.length - 1; i >= 0; i--) {
        MapFile.Reader map = maparray[i];
        synchronized(map) {
          map.reset();
         
          // do the priming read
          ImmutableBytesWritable readval = new ImmutableBytesWritable();
          HStoreKey readkey = (HStoreKey)map.getClosest(origin, readval);
          if (readkey == null) {
            // map.getClosest returns null if the passed key is > than the
            // last key in the map file.  getClosest is a bit of a misnomer
            // since it returns exact match or the next closest key AFTER not
            // BEFORE.
            continue;
          }
         
          do{
            // if the row matches, we might want this one.
            if (rowMatches(origin, readkey)) {
              // if the cell matches, then we definitely want this key.
              if (cellMatches(origin, readkey)) {
                // store the key if it isn't deleted or superseded by what's
                // in the memcache
                if (!isDeleted(readkey, readval.get(), false, null) &&
                    !keys.contains(readkey)) {
                  if (!isExpired(readkey, ttl, now)) {
                    keys.add(new HStoreKey(readkey));
                  }

                  // if we've collected enough versions, then exit the loop.
                  if (keys.size() >= versions) {
                    break;
                  }
                }
              } else {
                // the cell doesn't match, but there might be more with different
                // timestamps, so move to the next key
                continue;
              }
            } else {
              // the row doesn't match, so we've gone too far.
              break;
            }
          } while (map.next(readkey, readval)); // advance to the next key
        }
      }
     
      return keys;
    } finally {
      this.lock.readLock().unlock();
    }
  }
 
  /**
   * Find the key that matches <i>row</i> exactly, or the one that immediately
   * precedes it. WARNING: Only use this method on a table where writes occur
   * with strictly increasing timestamps. This method assumes this pattern of
   * writes in order to make it reasonably performant.
   * @param row
   * @return Found row
   * @throws IOException
   */
  byte [] getRowKeyAtOrBefore(final byte [] row)
  throws IOException{
    // Map of HStoreKeys that are candidates for holding the row key that
    // most closely matches what we're looking for. We'll have to update it as
    // deletes are found all over the place as we go along before finally
    // reading the best key out of it at the end.
    SortedMap<HStoreKey, Long> candidateKeys = new TreeMap<HStoreKey, Long>();
   
    // Keep a list of deleted cell keys.  We need this because as we go through
    // the store files, the cell with the delete marker may be in one file and
    // the old non-delete cell value in a later store file. If we don't keep
    // around the fact that the cell was deleted in a newer record, we end up
    // returning the old value if user is asking for more than one version.
    // This List of deletes should not be large since we are only keeping rows
    // and columns that match those set on the scanner and which have delete
    // values.  If memory usage becomes an issue, could redo as bloom filter.
    Set<HStoreKey> deletes = new HashSet<HStoreKey>();
   
   
    this.lock.readLock().lock();
    try {
      // First go to the memcache.  Pick up deletes and candidates.
      this.memcache.getRowKeyAtOrBefore(row, candidateKeys, deletes);
     
      // Process each store file.  Run through from newest to oldest.
      // This code below is very close to the body of the getKeys method.
      MapFile.Reader[] maparray = getReaders();
      for(int i = maparray.length - 1; i >= 0; i--) {
        // Update the candidate keys from the current map file
        rowAtOrBeforeFromMapFile(maparray[i], row, candidateKeys, deletes);
      }
      // Return the best key from candidateKeys
      return candidateKeys.isEmpty()? null: candidateKeys.lastKey().getRow();
    } finally {
      this.lock.readLock().unlock();
    }
  }
 
  /*
   * Check an individual MapFile for the row at or before a given key
   * and timestamp
   * @param map
   * @param row
   * @param candidateKeys
   * @throws IOException
   */
  private void rowAtOrBeforeFromMapFile(final MapFile.Reader map,
    final byte [] row, final SortedMap<HStoreKey, Long> candidateKeys,
    final Set<HStoreKey> deletes)
  throws IOException {
    HStoreKey startKey = new HStoreKey();
    ImmutableBytesWritable startValue = new ImmutableBytesWritable();
    synchronized(map) {
      // Don't bother with the rest of this if the file is empty
      map.reset();
      if (!map.next(startKey, startValue)) {
        return;
      }
      // If start row for this file is beyond passed in row, return; nothing
      // in here is of use to us.
      if (Bytes.compareTo(startKey.getRow(), row) > 0) {
        return;
      }
      long now = System.currentTimeMillis();
      // if there aren't any candidate keys yet, we'll do some things differently
      if (candidateKeys.isEmpty()) {
        rowAtOrBeforeCandidate(startKey, map, row, candidateKeys, deletes, now);
      } else {
        rowAtOrBeforeWithCandidates(startKey, map, row, candidateKeys, deletes,
          now);
      }
    }
  }
 
  /* Find a candidate for a row that is at or before the passed row in the
   * passed mapfile.
   * @param startKey First key in the mapfile.
   * @param map
   * @param row
   * @param candidateKeys
   * @param deletes
   * @param now
   * @throws IOException
   */
  private void rowAtOrBeforeCandidate(final HStoreKey startKey,
    final MapFile.Reader map, final byte[] row,
    final SortedMap<HStoreKey, Long> candidateKeys,
    final Set<HStoreKey> deletes, final long now)
  throws IOException {
    // if the row we're looking for is past the end of this mapfile, set the
    // search key to be the last key.  If it's a deleted key, then we'll back
    // up to the row before and return that.
    HStoreKey finalKey = getFinalKey(map);
    HStoreKey searchKey = null;
    if (Bytes.compareTo(finalKey.getRow(), row) < 0) {
      searchKey = finalKey;
    } else {
      searchKey = new HStoreKey(row);
      if (searchKey.compareTo(startKey) < 0) {
        searchKey = startKey;
      }
    }
    rowAtOrBeforeCandidate(map, searchKey, candidateKeys, deletes, now);
  }

  /*
    * @param ttl
   * @param hsk
   * @param now
   * @param deletes
   * @return True if key has not expired and is not in passed set of deletes.
   */
  static boolean notExpiredAndNotInDeletes(final long ttl,
      final HStoreKey hsk, final long now, final Set<HStoreKey> deletes) {
    return !isExpired(hsk, ttl, now) && !deletes.contains(hsk);
  }
 
  private static boolean isExpired(final HStoreKey hsk, final long ttl,
      final long now) {
    boolean result = ttl != HConstants.FOREVER && now > hsk.getTimestamp() + ttl;
    if (result && LOG.isDebugEnabled()) {
      LOG.debug("rowAtOrBeforeCandidate 1:" + hsk +
        ": expired, skipped");
    }
    return result;
  }
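  /*
   * Worked example (illustrative only, assuming ttl is in milliseconds and
   * HConstants.FOREVER is the "no expiry" sentinel): with ttl = 60000 and
   * now = 1000000, a key stamped 930000 is expired because
   * 1000000 > 930000 + 60000, while a key stamped 970000 is kept because
   * 1000000 <= 970000 + 60000.
   */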

  /* Find a candidate for row that is at or before passed key, sk, in mapfile.
   * @param map
   * @param sk Key to go search the mapfile with.
    * @param candidateKeys
    * @param deletes
    * @param now
    * @throws IOException
    * @see #rowAtOrBeforeCandidate(HStoreKey, org.apache.hadoop.io.MapFile.Reader, byte[], SortedMap, Set, long)
   */
  private void rowAtOrBeforeCandidate(final MapFile.Reader map,
    final HStoreKey sk, final SortedMap<HStoreKey, Long> candidateKeys,
    final Set<HStoreKey> deletes, final long now)
  throws IOException {
    HStoreKey searchKey = sk;
    HStoreKey readkey = new HStoreKey();
    ImmutableBytesWritable readval = new ImmutableBytesWritable();
    HStoreKey knownNoGoodKey = null;
    for (boolean foundCandidate = false; !foundCandidate;) {
      // Seek to the exact row, or the one that would be immediately before it
      readkey = (HStoreKey)map.getClosest(searchKey, readval, true);
      if (readkey == null) {
        // If null, we are at the start or end of the file.
        break;
      }
      HStoreKey deletedOrExpiredRow = null;
      do {
        // If we have an exact match on row, and it's not a delete, save this
        // as a candidate key
        if (Bytes.equals(readkey.getRow(), searchKey.getRow())) {
          if (!HLogEdit.isDeleted(readval.get())) {
            if (notExpiredAndNotInDeletes(this.ttl, readkey, now, deletes)) {
              candidateKeys.put(stripTimestamp(readkey),
                  new Long(readkey.getTimestamp()));
              foundCandidate = true;
              // NOTE! Continue.
              continue;
            }
          }
          // Deleted value.
          deletes.add(readkey);
          if (deletedOrExpiredRow == null) {
            deletedOrExpiredRow = new HStoreKey(readkey);
          }
        } else if (Bytes.compareTo(readkey.getRow(), searchKey.getRow()) > 0) {
          // if the row key we just read is beyond the key we're searching for,
          // then we're done.
          break;
        } else {
          // So, the row key doesn't match, but we haven't gone past the row
          // we're seeking yet, so this row is a candidate for closest
          // (assuming that it isn't a delete).
          if (!HLogEdit.isDeleted(readval.get())) {
            if (notExpiredAndNotInDeletes(this.ttl, readkey, now, deletes)) {
              candidateKeys.put(stripTimestamp(readkey),
                  new Long(readkey.getTimestamp()));
              foundCandidate = true;
              continue;
            }
          }
          deletes.add(readkey);
          if (deletedOrExpiredRow == null) {
            deletedOrExpiredRow = new HStoreKey(readkey);
          }
        }       
      } while(map.next(readkey, readval) && (knownNoGoodKey == null ||
          readkey.compareTo(knownNoGoodKey) < 0));

      // If we get here and have no candidates but we did find a deleted or
      // expired candidate, we need to look at the key before that
      if (!foundCandidate && deletedOrExpiredRow != null) {
        knownNoGoodKey = deletedOrExpiredRow;
        searchKey = new BeforeThisStoreKey(deletedOrExpiredRow);
      } else {
        // No candidates and no deleted or expired candidates. Give up.
        break;
      }
    }
   
    // Arriving here just means that we consumed the whole rest of the map
    // without going "past" the key we're searching for. We can just fall
    // through here.
  }
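  /*
   * Illustrative trace of the backtracking above (not part of HStore): suppose
   * the mapfile holds rows r1, r2, r3 and we search at or before r3, but every
   * cell for r3 and r2 carries a delete marker. The first pass seeds
   * deletedOrExpiredRow with the first r3 key it reads, so the next iteration
   * searches with new BeforeThisStoreKey(r3Key), lands in r2, finds it deleted
   * as well, and backs up once more until the live r1 cell becomes the
   * candidate. knownNoGoodKey keeps the inner scan from re-reading keys
   * already ruled out. Row names here are hypothetical.
   */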
 
  private void rowAtOrBeforeWithCandidates(final HStoreKey startKey,
    final MapFile.Reader map, final byte[] row,
    final SortedMap<HStoreKey, Long> candidateKeys,
    final Set<HStoreKey> deletes, final long now)
  throws IOException {
    HStoreKey readkey = new HStoreKey();
    ImmutableBytesWritable readval = new ImmutableBytesWritable();

    // if there are already candidate keys, we need to start our search
    // at the earliest possible key so that we can discover any possible
    // deletes for keys between the start and the search key.  Back up to the
    // start of the row in case there are deletes for this candidate in this
    // mapfile, BUT do not back up before the first key in the mapfile, else
    // getClosest will return null.
    HStoreKey searchKey = new HStoreKey(candidateKeys.firstKey().getRow());
    if (searchKey.compareTo(startKey) < 0) {
      searchKey = startKey;
    }

    // Seek to the exact row, or the one that would be immediately before it
    readkey = (HStoreKey)map.getClosest(searchKey, readval, true);
    if (readkey == null) {
      // If null, we are at the start or end of the file.
      // Didn't find anything that would match, so return
      return;
    }

    do {
      HStoreKey strippedKey = null;
      // if we have an exact match on row, and it's not a delete, save this
      // as a candidate key
      if (Bytes.equals(readkey.getRow(), row)) {
        strippedKey = stripTimestamp(readkey);
        if (!HLogEdit.isDeleted(readval.get())) {
          if (notExpiredAndNotInDeletes(this.ttl, readkey, now, deletes)) {
            candidateKeys.put(strippedKey,
                new Long(readkey.getTimestamp()));
          }
        } else {
          // If the candidate keys contain any that might match by timestamp,
          // then check for a match and remove it if it's too young to
          // survive the delete
          if (candidateKeys.containsKey(strippedKey)) {
            long bestCandidateTs =
              candidateKeys.get(strippedKey).longValue();
            if (bestCandidateTs <= readkey.getTimestamp()) {
              candidateKeys.remove(strippedKey);
            }
          }
        }
      } else if (Bytes.compareTo(readkey.getRow(), row) > 0 ) {
        // if the row key we just read is beyond the key we're searching for,
        // then we're done.
        break;
      } else {
        strippedKey = stripTimestamp(readkey);
        // So, the row key doesn't match, but we haven't gone past the row
        // we're seeking yet, so this row is a candidate for closest
        // (assuming that it isn't a delete).
        if (!HLogEdit.isDeleted(readval.get())) {
          if (notExpiredAndNotInDeletes(this.ttl, readkey, now, deletes)) {
            candidateKeys.put(strippedKey, Long.valueOf(readkey.getTimestamp()));
          }
        } else {
          // If the candidate keys contain any that might match by timestamp,
          // then check for a match and remove it if it's too young to
          // survive the delete
          if (candidateKeys.containsKey(strippedKey)) {
            long bestCandidateTs =
              candidateKeys.get(strippedKey).longValue();
            if (bestCandidateTs <= readkey.getTimestamp()) {
              candidateKeys.remove(strippedKey);
            }
          }
        }     
      }
    } while(map.next(readkey, readval));   
  }
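  /*
   * Worked example of the delete check above (illustrative only): if the best
   * candidate for a row/column was written at timestamp 100 and this mapfile
   * holds a delete marker for the same row/column at timestamp 150, then
   * 100 <= 150 and the candidate is removed. A delete stamped 90 would leave
   * the newer candidate in place.
   */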
 
  /*
   * @param mf MapFile to dig in.
   * @return Final key from passed <code>mf</code>
   * @throws IOException
   */
  private HStoreKey getFinalKey(final MapFile.Reader mf) throws IOException {
    HStoreKey finalKey = new HStoreKey();
    mf.finalKey(finalKey);
    return finalKey;
  }
 
  static HStoreKey stripTimestamp(HStoreKey key) {
    return new HStoreKey(key.getRow(), key.getColumn());
  }
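  /*
   * Illustrative note (hypothetical values): stripping the timestamp makes
   * two versions of the same cell collapse onto one entry in candidateKeys,
   * e.g.
   *
   *   HStoreKey a = new HStoreKey(row, column, 100L);
   *   HStoreKey b = new HStoreKey(row, column, 200L);
   *   // stripTimestamp(a) and stripTimestamp(b) compare equal in the TreeMap,
   *   // since both carry the same row, column and default timestamp.
   *
   * Only the timestamp stored as the map value then distinguishes versions.
   */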
   
  /**
   * Test that the <i>target</i> matches the <i>origin</i>. If the
    * <i>origin</i> has an empty column, then it's assumed to mean any column
    * matches and we only match on row and timestamp. Otherwise, it compares
    * the keys with HStoreKey.matchesRowCol().
    * @param origin The key we're testing against
    * @param target The key we're testing
    * @return True if <i>target</i> matches <i>origin</i>
    */
  private boolean cellMatches(HStoreKey origin, HStoreKey target){
    // if the origin's column is empty, then we're matching any column
    if (Bytes.equals(origin.getColumn(), HConstants.EMPTY_BYTE_ARRAY)){
      // if the row matches, then...
      if (Bytes.equals(target.getRow(), origin.getRow())) {
        // check the timestamp
        return target.getTimestamp() <= origin.getTimestamp();
      }
      return false;
    }
    // otherwise, we want to match on row and column
    return target.matchesRowCol(origin);
  }
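  /*
   * Illustrative example of the matching rules above (hypothetical values):
   * with an origin that has an empty column and timestamp 200, a target on
   * the same row stamped 150 matches (150 <= 200) whatever its column, while
   * a target stamped 250 does not. If the origin names a column, matching
   * falls through to HStoreKey.matchesRowCol() instead.
   */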
   
  /**
   * Test that the <i>target</i> matches the <i>origin</i>. If the <i>origin</i>
   * has an empty column, then it just tests row equivalence. Otherwise, it uses
   * HStoreKey.matchesRowCol().
   * @param origin Key we're testing against
    * @param target Key we're testing
    * @return True if <i>target</i> matches <i>origin</i>
    */
  private boolean rowMatches(HStoreKey origin, HStoreKey target){
    // if the origin's column is empty, then we're matching any column
    if (Bytes.equals(origin.getColumn(), HConstants.EMPTY_BYTE_ARRAY)) {
      // if the row matches, then...
      return Bytes.equals(target.getRow(), origin.getRow());
    }
    // otherwise, we want to match on row and column
    return target.matchesRowCol(origin);
  }
 
  /**
   * Determines if HStore can be split
   *
   * @return a StoreSize if store can be split, null otherwise
   */
  StoreSize checkSplit() {
    if (this.storefiles.size() <= 0) {
      return null;
    }
    if (storeSize < this.desiredMaxFileSize) {
      return null;
    }
    this.lock.readLock().lock();
    try {
      // Not splitable if we find a reference store file present in the store.
      boolean splitable = true;
      long maxSize = 0L;
      Long mapIndex = Long.valueOf(0L);
      // Iterate through all the MapFiles
      synchronized (storefiles) {
        for (Map.Entry<Long, HStoreFile> e: storefiles.entrySet()) {
          HStoreFile curHSF = e.getValue();
          long size = curHSF.length();
          if (size > maxSize) {
            // This is the largest one so far
            maxSize = size;
            mapIndex = e.getKey();
          }
          if (splitable) {
            splitable = !curHSF.isReference();
          }
        }
      }
      if (!splitable) {
        return null;
      }
      MapFile.Reader r = this.readers.get(mapIndex);

      // seek back to the beginning of mapfile
      r.reset();

      // get the first and last keys
      HStoreKey firstKey = new HStoreKey();
      HStoreKey lastKey = new HStoreKey();
      Writable value = new ImmutableBytesWritable();
      r.next(firstKey, value);
      r.finalKey(lastKey);

      // get the midkey
      HStoreKey mk = (HStoreKey)r.midKey();
      if (mk != null) {
        // if the midkey is the same as the first and last keys, then we cannot
        // (ever) split this region.
        if (Bytes.equals(mk.getRow(), firstKey.getRow()) &&
            Bytes.equals(mk.getRow(), lastKey.getRow())) {
          return null;
        }
        return new StoreSize(maxSize, mk.getRow());
      }
    } catch(IOException e) {
      LOG.warn("Failed getting store size for " + this.storeNameStr, e);
    } finally {
      this.lock.readLock().unlock();
    }
    return null;
  }
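  /*
   * Illustrative sketch only (not part of HStore): a hypothetical region-level
   * caller might probe each store and split at the midkey of the largest one:
   *
   *   StoreSize ss = store.checkSplit();
   *   if (ss != null) {
   *     byte [] splitPoint = ss.getKey();   // midkey row of the biggest mapfile
   *     long biggest = ss.getSize();        // size of that mapfile in bytes
   *     // ... hand splitPoint to whatever performs the region split
   *   }
   *
   * Only checkSplit(), getKey() and getSize() are taken from this class;
   * everything else is assumption.
   */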
 
  /** @return aggregate size of HStore */
  public long getSize() {
    return storeSize;
  }
 
  //////////////////////////////////////////////////////////////////////////////
  // File administration
  //////////////////////////////////////////////////////////////////////////////

  /**
   * Return a scanner for both the memcache and the HStore files
   */
  protected InternalScanner getScanner(long timestamp, byte [][] targetCols,
      byte [] firstRow, RowFilterInterface filter)
  throws IOException {
    lock.readLock().lock();
    try {
      return new HStoreScanner(this, targetCols, firstRow, timestamp, filter);
    } finally {
      lock.readLock().unlock();
    }
  }
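  /*
   * Illustrative sketch only: a hypothetical caller could open a combined
   * memcache/store-file scanner like this and then drive it through the
   * InternalScanner contract:
   *
   *   byte [][] cols = new byte [][] {Bytes.toBytes("info:")};
   *   InternalScanner s = store.getScanner(HConstants.LATEST_TIMESTAMP,
   *     cols, HConstants.EMPTY_START_ROW, null);
   *   try {
   *     // call s.next(...) until it reports no more rows
   *   } finally {
   *     s.close();
   *   }
   *
   * The column name and variable names are assumptions; only getScanner's
   * signature comes from this class.
   */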

  /** {@inheritDoc} */
  @Override
  public String toString() {
    return this.storeNameStr;
  }

  /**
   * @param p Path to check.
   * @return True if the path has format of a HStoreFile reference.
   */
  public static boolean isReference(final Path p) {
    return isReference(p, REF_NAME_PARSER.matcher(p.getName()));
  }
  private static boolean isReference(final Path p, final Matcher m) {
    if (m == null || !m.matches()) {
      LOG.warn("Failed match of store file name " + p.toString());
      throw new RuntimeException("Failed match of store file name " +
          p.toString());
    }
    return m.groupCount() > 1 && m.group(2) != null;
  }
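  /*
   * Illustrative examples of the name format accepted above (values are
   * hypothetical): a plain store file is named by its numeric id alone, e.g.
   * "7214912331202072621", and isReference(...) returns false for it, while a
   * reference file carries the referenced region name after a dot, e.g.
   * "7214912331202072621.1028785192", for which isReference(...) returns true.
   * A name matching neither form triggers the RuntimeException thrown above.
   */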

  /**
   * @return Current list of store files.
   */
  SortedMap<Long, HStoreFile> getStorefiles() {
    synchronized (this.storefiles) {
      SortedMap<Long, HStoreFile> copy =
        new TreeMap<Long, HStoreFile>(this.storefiles);
      return copy;
    }
  }
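  /*
   * Design note (editorial, hypothetical caller shown): the copy above lets
   * callers iterate the store file map without holding the storefiles
   * monitor, at the cost of an O(n) snapshot per call, e.g.
   *
   *   for (HStoreFile hsf : store.getStorefiles().values()) {
   *     // safe to take time here; only the brief copy holds the monitor
   *   }
   */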
 
  class StoreSize {
    private final long size;
    private final byte[] key;
    StoreSize(long size, byte[] key) {
      this.size = size;
      this.key = new byte[key.length];
      System.arraycopy(key, 0, this.key, 0, key.length);
    }
    /* @return the size */
    long getSize() {
      return size;
    }
    /* @return the key */
    byte[] getKey() {
      return key;
    }
  }
}