Package com.google.uzaygezen.core.hbase

Source Code of com.google.uzaygezen.core.hbase.HBaseQueryTest

/*
* Copyright (C) 2012 Daniel Aioanei.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.google.uzaygezen.core.hbase;

import java.io.IOException;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.junit.Assert;
import org.junit.Test;

import com.google.common.base.Charsets;
import com.google.common.base.Functions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterators;
import com.google.common.primitives.Ints;
import com.google.uzaygezen.core.BacktrackingQueryBuilder;
import com.google.uzaygezen.core.BigIntegerContent;
import com.google.uzaygezen.core.BitVector;
import com.google.uzaygezen.core.BitVectorFactories;
import com.google.uzaygezen.core.BitVectorMath;
import com.google.uzaygezen.core.BoundedRollup;
import com.google.uzaygezen.core.CompactHilbertCurve;
import com.google.uzaygezen.core.FilteredIndexRange;
import com.google.uzaygezen.core.HilbertIndexMasks;
import com.google.uzaygezen.core.MapNode;
import com.google.uzaygezen.core.MapRegionInspector;
import com.google.uzaygezen.core.MultiDimensionalSpec;
import com.google.uzaygezen.core.NodeValue;
import com.google.uzaygezen.core.PlainFilterCombiner;
import com.google.uzaygezen.core.Pow2LengthBitSetRange;
import com.google.uzaygezen.core.Pow2LengthBitSetRangeFactory;
import com.google.uzaygezen.core.Query;
import com.google.uzaygezen.core.QueryBuilder;
import com.google.uzaygezen.core.RegionInspector;
import com.google.uzaygezen.core.SimpleRegionInspector;
import com.google.uzaygezen.core.SpaceFillingCurve;
import com.google.uzaygezen.core.StreamingRollup;
import com.google.uzaygezen.core.TestUtils;
import com.google.uzaygezen.core.ZoomingSpaceVisitorAdapter;
import com.google.uzaygezen.core.ranges.BigIntegerRange;
import com.google.uzaygezen.core.ranges.BigIntegerRangeHome;
import com.google.uzaygezen.core.ranges.RangeUtil;

/**
* Test case that also serves as an example of how to use the query
* functionality. While this class relies on BigInteger, BigIntegerContent,
* BigIntegerRange and BigIntegerRangeHome, it is recommended to use the
* parallel Long, LongContent, LongRange and LongRangeHome classes when the total
* precision of the Hilbert space is less than 63 bits.
*
* @author Daniel Aioanei
*/
public class HBaseQueryTest {

  private static final Logger logger = Logger.getLogger(HBaseQueryTest.class.getSimpleName());

  /**
   * With more than 62 bits (using {@link BigInteger} rather than plain
   * {@link Long}) and without any caching rollup version of the data
   * {@link BoundedRollup}, this way of building the queries is likely to be
   * quite slow, but it shows off the capability of perform queries of
   * non-cached arbitrary-precision data.
   */
  @Test
  public void queryHBase() throws IOException, InterruptedException {
    MockHTable table = MockHTable.create();
    final byte[] family = "FAMILY".getBytes(Charsets.ISO_8859_1);
    /*
     * We choose not to store the coordinates themselves, since storing the
     * Hilbert index is sufficient to recover the coordinate values. So let's
     * use a dummy column.
     */
    final byte[][] qualifiers = {"NICE".getBytes(Charsets.ISO_8859_1),};
    MultiDimensionalSpec spec = new MultiDimensionalSpec(Ints.asList(30, 10, 25));
    // Add some data.
    Random rnd = new Random(TestUtils.SEED);
    int[][] data = generateData(spec, 1 << 16, rnd);
    SpaceFillingCurve sfc = new CompactHilbertCurve(spec);
    logger.log(Level.INFO, "Populating table with up to {0} rows.", data.length);
    populateTable(family, qualifiers, spec, data, sfc, table);
    int cacheSize = 1 << 8;
    logger.log(Level.INFO, "Building cache of size {0}.", cacheSize);
    // The cache is optional.
    Map<Pow2LengthBitSetRange, NodeValue<BigIntegerContent>> rolledupMap = createRolledupCache(
      table, spec, sfc, cacheSize);
    logger.log(Level.INFO, "Constructed cache of actual size {0}.", rolledupMap.size());
    for (int trial = 0; trial < 1; ++trial) {
      logger.log(Level.INFO, "trial={0}", trial);
      int[] maxLengthPerDimension = new int[spec.getBitsPerDimension().size()];
      for (boolean useCache : new boolean[] {false, true}) {
        int m = useCache ? 256 : 32;
        /*
         * For testing purposes limit the range size to m values for each
         * dimension to speed up query computation. In practice, query volume
         * should be enforced to be small, and when a certain query volume is
         * exceeded, a full table scan will probably be faster anyway.
         */
        Arrays.fill(maxLengthPerDimension, m);
        int[][] ranges = generateRanges(spec, maxLengthPerDimension, rnd);
        logger.log(Level.INFO, "ranges={0}", Arrays.deepToString(ranges));
        // Limit the maximum number of ranges.
        int maxRanges = 1 + rnd.nextInt(32);
        List<int[]> actual = queryAndFilter(
          table, spec, sfc, ranges, maxRanges, useCache ? rolledupMap : null);
        List<int[]> expected = uniq(fullScanQuery(data, sfc, ranges));
        logger.log(Level.INFO, "expected.size()={0}", expected.size());
        Assert.assertEquals(expected.size(), actual.size());
        for (int i = 0; i < expected.size(); ++i) {
          Assert.assertArrayEquals(expected.get(i), actual.get(i));
        }
      }
    }
  }

  public Map<Pow2LengthBitSetRange, NodeValue<BigIntegerContent>> createRolledupCache(
    MockHTable table, MultiDimensionalSpec spec, SpaceFillingCurve sfc, int cacheSize)
    throws IOException {
    int[] elementLengths = Ints.toArray(new HilbertIndexMasks(sfc.getSpec()).cardinalities());
    BitVector[] path = new BitVector[elementLengths.length];
    for (int i = 0; i < path.length; ++i) {
      path[i] = BitVectorFactories.OPTIMAL.apply(elementLengths[path.length - i - 1]);
    }
    StreamingRollup<BitVector, BigIntegerContent> rollup = BoundedRollup.create(
      new BigIntegerContent(BigInteger.ZERO), cacheSize);
    Scan fullScan = new Scan();
    ResultScanner scanner = table.getScanner(fullScan);
    BitVector hilbertIndex = BitVectorFactories.OPTIMAL.apply(spec.sumBitsPerDimension());
    for (Result row : scanner) {
      hilbertIndex.copyFromBigEndian(row.getRow());
      for (int i = 0; i < path.length; ++i) {
        path[i] = path[i].clone();
      }
      BitVectorMath.split(hilbertIndex, path);
      // We should say the exact number of times. Saying one is correct, but
      // suboptimal.
      BigIntegerContent v = new BigIntegerContent(BigInteger.ONE);
      rollup.feedRow(Iterators.<BitVector>forArray(path), v);
    }
    MapNode<BitVector, BigIntegerContent> rolledupTree = rollup.finish();
    Pow2LengthBitSetRangeFactory<BigIntegerContent> factory = Pow2LengthBitSetRangeFactory.create(Ints.asList(elementLengths));
    Map<Pow2LengthBitSetRange, NodeValue<BigIntegerContent>> rolledupMap = factory.apply(rolledupTree);
    return rolledupMap;
  }

  public List<int[]> fullScanQuery(int[][] data, SpaceFillingCurve sfc, int[][] ranges) {
    MultiDimensionalSpec spec = sfc.getSpec();
    List<Integer> filtered = filter(data, ranges);
    List<Pair<BitVector, Integer>> pairs = new ArrayList<>(filtered.size());
    BitVector[] point = new BitVector[spec.getBitsPerDimension().size()];
    for (int j = 0; j < spec.getBitsPerDimension().size(); ++j) {
      point[j] = BitVectorFactories.OPTIMAL.apply(spec.getBitsPerDimension().get(j));
    }
    for (int i : filtered) {
      BitVector index = BitVectorFactories.OPTIMAL.apply(spec.sumBitsPerDimension());
      // int has 32 bits, which fits in each dimensions.
      for (int j = 0; j < spec.getBitsPerDimension().size(); ++j) {
        point[j].copyFrom(data[i][j]);
      }
      sfc.index(point, 0, index);
      pairs.add(Pair.of(index.clone(), i));
    }
    // Sort by Hilbert index.
    Collections.sort(pairs);
    List<int[]> expected = new ArrayList<>(pairs.size());
    for (Pair<BitVector, Integer> pair : pairs) {
      expected.add(data[pair.getRight()]);
    }
    return expected;
  }

  private static List<Integer> filter(int[][] data, int[][] ranges) {
    List<Integer> result = new ArrayList<>();
    for (int i = 0; i < data.length; ++i) {
      if (RangeUtil.contains(ranges, data[i])) {
        result.add(i);
      }
    }
    return result;
  }

  public List<int[]> queryAndFilter(
    MockHTable table, MultiDimensionalSpec spec, SpaceFillingCurve sfc, int[][] ranges,
    int maxRanges, Map<Pow2LengthBitSetRange, NodeValue<BigIntegerContent>> rolledupMap)
    throws IOException {
    List<BigIntegerRange> region = rangesToQueryRegion(ranges);
    List<FilteredIndexRange<Object, BigIntegerRange>> indexRanges = query(
      table, region, sfc, maxRanges, rolledupMap);
    Assert.assertTrue(indexRanges.size() <= maxRanges);
    logger.log(Level.INFO, "indexRanges={0}", indexRanges);
    // The ranges are in strictly increasing hilbert index order.
    for (int i = 0; i < indexRanges.size() - 1; ++i) {
      FilteredIndexRange<Object, BigIntegerRange> a = indexRanges.get(i);
      FilteredIndexRange<Object, BigIntegerRange> b = indexRanges.get(i + 1);
      Assert.assertTrue(a.getIndexRange().getEnd().compareTo(b.getIndexRange().getStart()) < 0);
    }
    BitVector start = BitVectorFactories.OPTIMAL.apply(spec.sumBitsPerDimension());
    BitVector end = BitVectorFactories.OPTIMAL.apply(spec.sumBitsPerDimension());
    Scan[] scans = new Scan[indexRanges.size()];
    for (int i = 0; i < indexRanges.size(); ++i) {
      FilteredIndexRange<Object, BigIntegerRange> indexRange = indexRanges.get(i);
      BigInteger startBigInteger = indexRange.getIndexRange().getStart();
      start.copyFrom(startBigInteger);
      BigInteger endBigInteger = indexRange.getIndexRange().getEnd();
      final Scan scan;
      if (endBigInteger.testBit(spec.sumBitsPerDimension())) {
        scan = new Scan(start.toBigEndianByteArray());
      } else {
        end.copyFrom(endBigInteger);
        scan = new Scan(start.toBigEndianByteArray(), end.toBigEndianByteArray());
      }
      scans[i] = scan;
    }
    BitVector[] point = new BitVector[spec.getBitsPerDimension().size()];
    BitVector index = BitVectorFactories.OPTIMAL.apply(spec.sumBitsPerDimension());
    for (int j = 0; j < spec.getBitsPerDimension().size(); ++j) {
      point[j] = BitVectorFactories.OPTIMAL.apply(spec.getBitsPerDimension().get(j));
    }
    List<int[]> actual = new ArrayList<>();
    for (int i = 0; i < indexRanges.size(); ++i) {
      ResultScanner scanner = table.getScanner(scans[i]);
      FilteredIndexRange<Object, BigIntegerRange> indexRange = indexRanges.get(i);
      logger.log(Level.FINE, "indexRange={0}", indexRange);
      for (Result result : scanner) {
        byte[] row = result.getRow();
        index.copyFromBigEndian(row);
        sfc.indexInverse(index, point);
        boolean isContained = RangeUtil.containsBigInteger(
          region, Arrays.asList(bitVectorPointToBigIntegerPoint(point)));
        if (!indexRange.isPotentialOverSelectivity()) {
          Assert.assertTrue(isContained);
        }
        if (isContained) {
          int[] e = new int[point.length];
          for (int j = 0; j < e.length; ++j) {
            e[j] = (int) point[j].toExactLong();
          }
          actual.add(e);
        }
      }
    }
    return actual;
  }

  private BigInteger[] bitVectorPointToBigIntegerPoint(BitVector[] point) {
    BigInteger[] a = new BigInteger[point.length];
    for (int i = 0; i < a.length; ++i) {
      a[i] = point[i].toBigInteger();
    }
    return a;
  }

  private List<FilteredIndexRange<Object, BigIntegerRange>> query(
    MockHTable table, List<BigIntegerRange> region, SpaceFillingCurve sfc, int maxRanges,
    Map<Pow2LengthBitSetRange, NodeValue<BigIntegerContent>> rolledupMap) {
    List<? extends List<BigIntegerRange>> x = ImmutableList.of(region);
    BigIntegerContent zero = new BigIntegerContent(BigInteger.ZERO);
    Object filter = "";
    BigIntegerContent one = new BigIntegerContent(BigInteger.ONE);
    RegionInspector<Object, BigIntegerContent> simpleRegionInspector = SimpleRegionInspector.create(
      x, one, Functions.constant(filter), BigIntegerRangeHome.INSTANCE, zero);
    final RegionInspector<Object, BigIntegerContent> regionInspector;
    if (rolledupMap == null) {
      regionInspector = simpleRegionInspector;
    } else {
      regionInspector = MapRegionInspector.create(
        rolledupMap, simpleRegionInspector, false, zero, one);
    }
    // Not using using sub-ranges here.
    PlainFilterCombiner<Object, BigInteger, BigIntegerContent, BigIntegerRange> combiner = new PlainFilterCombiner<>(
      filter);
    QueryBuilder<Object, BigIntegerRange> queryBuilder = BacktrackingQueryBuilder.create(
      regionInspector, combiner, maxRanges, true, BigIntegerRangeHome.INSTANCE, zero);
    sfc.accept(new ZoomingSpaceVisitorAdapter(sfc, queryBuilder));
    Query<Object, BigIntegerRange> query = queryBuilder.get();
    return query.getFilteredIndexRanges();
  }

  private static List<BigIntegerRange> rangesToQueryRegion(int[][] ranges) {
    List<BigIntegerRange> region = new ArrayList<>();
    for (int j = 0; j < ranges.length; ++j) {
      region.add(BigIntegerRange.of(ranges[j][0], ranges[j][1]));
    }
    return region;
  }

  private static int[][] generateRanges(
    MultiDimensionalSpec spec, int[] maxLengthPerDimension, Random rnd) {
    int[][] ranges = new int[spec.getBitsPerDimension().size()][2];
    for (int j = 0; j < spec.getBitsPerDimension().size(); ++j) {
      int bound = 1 << spec.getBitsPerDimension().get(j);
      int start = bound / 2 - rnd.nextInt(Math.min(bound, maxLengthPerDimension[j])) / 2;
      assert start >= 0;
      int end = (bound + 1) / 2 + rnd.nextInt(Math.min(bound, maxLengthPerDimension[j])) / 2;
      assert end <= bound;
      ranges[j][0] = start;
      ranges[j][1] = end;
    }
    return ranges;
  }

  private static void populateTable(
    final byte[] family, final byte[][] qualifiers, MultiDimensionalSpec spec, int[][] data,
    SpaceFillingCurve sfc, MockHTable table) throws IOException, InterruptedException {
    BitVector[] point = new BitVector[spec.getBitsPerDimension().size()];
    BitVector index = BitVectorFactories.OPTIMAL.apply(spec.sumBitsPerDimension());
    for (int j = 0; j < spec.getBitsPerDimension().size(); ++j) {
      point[j] = BitVectorFactories.OPTIMAL.apply(spec.getBitsPerDimension().get(j));
    }
    Put[] puts = new Put[data.length];
    for (int i = 0; i < data.length; ++i) {
      // int has 32 bits, which fits in each dimensions.
      for (int j = 0; j < spec.getBitsPerDimension().size(); ++j) {
        point[j].copyFrom(data[i][j]);
      }
      sfc.index(point, 0, index);
      byte[] row = index.toBigEndianByteArray();
      Put put = new Put(row);
      KeyValue[] keyValues = new KeyValue[qualifiers.length];
      for (int k = 0; k < qualifiers.length; ++k) {
        // Put a nice string representation of the data point in the dummy
        // column.
        keyValues[k] = new KeyValue(row, family, qualifiers[k], Arrays.toString(data[i]).getBytes(
          Charsets.ISO_8859_1));
      }
      put.setFamilyMap(ImmutableMap.of(family, Arrays.asList(keyValues)));
      puts[i] = put;
    }
    table.batch(Arrays.asList(puts));
  }

  /**
   * It may generate duplicates.
   */
  private static int[][] generateData(MultiDimensionalSpec spec, int n, Random rnd) {
    int[][] data = new int[n][spec.getBitsPerDimension().size()];
    for (int i = 0; i < n; ++i) {
      // int has 32 bits, which fits in each dimensions.
      for (int j = 0; j < spec.getBitsPerDimension().size(); ++j) {
        int bound = 1 << spec.getBitsPerDimension().get(j);
        double gauss = rnd.nextGaussian();
        // Std of 1024.
        int d = bound / 2 + (int) (gauss * (1 << (spec.getBitsPerDimension().get(j) / 2)) / 1024);
        if (d < 0) {
          d = 0;
        }
        if (d >= bound) {
          d = bound - 1;
        }
        data[i][j] = d;
      }
    }
    return data;
  }

  public static List<int[]> uniq(List<int[]> data) {
    List<int[]> u = new ArrayList<>();
    for (int i = 0; i < data.size(); ++i) {
      if (i == data.size() - 1 || !Arrays.equals(data.get(i), data.get(i + 1))) {
        u.add(data.get(i));
      }
    }
    return u;
  }
}
TOP

Related Classes of com.google.uzaygezen.core.hbase.HBaseQueryTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.