Package water.fvec

Source Code of water.fvec.RebalanceDataSet$RebalanceTask

package water.fvec;

import java.util.Arrays;
import java.util.Iterator;
import jsr166y.CountedCompleter;
import water.Futures;
import water.H2O;
import water.Key;
import water.MRTask;

/**
* Created by tomasnykodym on 3/28/14.
*
* Utility to rebalance dataset so that it has requested number of chunks and each chunk has the same number of rows +/-1.
*
* It *does not* guarantee even chunk-node placement.
* (This can not currently be done in H2O, since the placement of chunks is governed only by key-hash /vector group/ for Vecs)
*/
public class RebalanceDataSet extends H2O.H2OCountedCompleter {
  final Frame _in;
  final int _nchunks;
  Key _okey;
  Frame _out;
  final Key _jobKey;

  public RebalanceDataSet(Frame srcFrame, Key dstKey, int nchunks) { this(srcFrame, dstKey,nchunks,null,null);}
  public RebalanceDataSet(Frame srcFrame, Key dstKey, int nchunks, H2O.H2OCountedCompleter cmp, Key jobKey) {
    super(cmp);
    _in = srcFrame;
    _nchunks = nchunks;
    _jobKey = jobKey;
    _okey = dstKey;
  }

  public Frame getResult(){join(); return _out;}

  @Override public void compute2() {
    _in.read_lock(_jobKey);
    // Simply create a bogus new vector (don't even put it into KV) with
    // appropriate number of lines per chunk and then use it as a source to do
    // multiple makeZero calls to create empty vecs and than call RebalanceTask
    // on each one of them.  RebalanceTask will fetch the appropriate src
    // chunks and fetch the data from them.
    int rpc = (int)(_in.numRows() / _nchunks);
    int rem = (int)(_in.numRows() % _nchunks);
    long[] espc = new long[_nchunks+1];
    Arrays.fill(espc,rpc);
    for( int i = 0; i < rem; ++i ) ++espc[i];
    long sum = 0;
    for( int i = 0; i < espc.length; ++i ) {
      long s = espc[i];
      espc[i] = sum;
      sum += s;
    }
    assert espc[espc.length-1] == _in.numRows():"unexpected number of rows, expected " + _in.numRows() + ", got " + espc[espc.length-1];
    final Vec[] srcVecs = _in.vecs();
    _out = new Frame(_okey,_in.names(), new Vec(Vec.newKey(),espc).makeZeros(srcVecs.length,_in.domains(),_in.uuids(), _in.strings(), _in.times()));
    _out.delete_and_lock(_jobKey);
    new RebalanceTask(this,srcVecs).asyncExec(_out);
  }

  @Override public void onCompletion(CountedCompleter caller) {
    assert _out.numRows() == _in.numRows();
    Vec vec = _out.anyVec();
    assert vec.nChunks() == _nchunks;
    _in.unlock(_jobKey);
    _out.update(_jobKey);
    _out.unlock(_jobKey);
  }
  @Override public boolean onExceptionalCompletion(Throwable t, CountedCompleter caller){
    _in.unlock(_jobKey);
    if(_out != null)_out.delete(_jobKey,new Futures()).blockForPending();
    return true;
  }

  public static class RebalanceTask extends MRTask<RebalanceTask> {
    final Vec [] _srcVecs;
    public RebalanceTask(H2O.H2OCountedCompleter cmp, Vec... srcVecs){super(cmp);_srcVecs = srcVecs;}

    @Override public boolean logVerbose() { return false; }

    private void rebalanceChunk(Vec srcVec, Chunk chk){
      NewChunk dst = new NewChunk(chk);
      dst.set_len(0);
      dst.set_sparseLen(dst.len());
      int rem = chk.len();
      while(rem > 0 && dst.len() < chk.len()){
        Chunk srcRaw = srcVec.chunkForRow(chk._start+ dst.len());
        NewChunk src = new NewChunk((srcRaw));
        src = srcRaw.inflate_impl(src);
        assert src.len() == srcRaw.len();
        int srcFrom = (int)(chk._start+ dst.len() - src._start);
        // check if the result is sparse (not exact since we only take subset of src in general)
        if((src.sparse() && dst.sparse()) || (src.sparseLen() + dst.sparseLen() < NewChunk.MIN_SPARSE_RATIO*(src.len() + dst.len()))){
          src.set_sparse(src.sparseLen());
          dst.set_sparse(dst.sparseLen());
        }
        final int srcTo = srcFrom + rem;
        int off = srcFrom-1;
        Iterator<NewChunk.Value> it = src.values(Math.max(0,srcFrom),srcTo);
        while(it.hasNext()){
          NewChunk.Value v = it.next();
          final int rid = v.rowId0();
          assert  rid < srcTo;
          int add = rid - off;
          off = rid;
          dst.addZeros(add-1);
          v.add2Chunk(dst);
          rem -= add;
          assert rem >= 0;
        }
        int trailingZeros = Math.min(rem, src.len() - off -1);
        dst.addZeros(trailingZeros);
        rem -= trailingZeros;
      }
      assert rem == 0:"rem = " + rem;
      assert dst.len() == chk.len() :"len = " + dst.len() + ", _len = " + chk.len();
      dst.close(dst.cidx(),_fs);
    }
    @Override public void map(Chunk [] chks){
      for(int i = 0; i < chks.length; ++i)
        rebalanceChunk(_srcVecs[i],chks[i]);
    }
  }
}
TOP

Related Classes of water.fvec.RebalanceDataSet$RebalanceTask

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.