Package eu.stratosphere.pact.runtime.hash

Source Code of eu.stratosphere.pact.runtime.hash.ReOpenableHashTableITCase

/***********************************************************************************************************************
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
**********************************************************************************************************************/

package eu.stratosphere.pact.runtime.hash;

import static org.junit.Assert.fail;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

import eu.stratosphere.api.common.typeutils.TypeComparator;
import eu.stratosphere.api.common.typeutils.TypePairComparator;
import eu.stratosphere.api.common.typeutils.TypeSerializer;
import eu.stratosphere.api.java.record.functions.JoinFunction;
import eu.stratosphere.core.memory.MemorySegment;
import eu.stratosphere.nephele.services.iomanager.IOManager;
import eu.stratosphere.nephele.services.memorymanager.MemoryAllocationException;
import eu.stratosphere.nephele.services.memorymanager.MemoryManager;
import eu.stratosphere.nephele.services.memorymanager.spi.DefaultMemoryManager;
import eu.stratosphere.nephele.template.AbstractInvokable;
import eu.stratosphere.nephele.template.AbstractTask;
import eu.stratosphere.pact.runtime.hash.HashMatchIteratorITCase.RecordMatch;
import eu.stratosphere.pact.runtime.hash.HashMatchIteratorITCase.RecordMatchRemovingJoin;
import eu.stratosphere.pact.runtime.hash.HashTableITCase.ConstantsKeyValuePairsIterator;
import eu.stratosphere.pact.runtime.hash.MutableHashTable.HashBucketIterator;
import eu.stratosphere.api.java.typeutils.runtime.record.RecordComparator;
import eu.stratosphere.api.java.typeutils.runtime.record.RecordPairComparator;
import eu.stratosphere.api.java.typeutils.runtime.record.RecordSerializer;
import eu.stratosphere.pact.runtime.test.util.DiscardingOutputCollector;
import eu.stratosphere.pact.runtime.test.util.DummyInvokable;
import eu.stratosphere.pact.runtime.test.util.TestData;
import eu.stratosphere.pact.runtime.test.util.TestData.Generator;
import eu.stratosphere.pact.runtime.test.util.TestData.Generator.KeyMode;
import eu.stratosphere.pact.runtime.test.util.TestData.Generator.ValueMode;
import eu.stratosphere.pact.runtime.test.util.TestData.Key;
import eu.stratosphere.pact.runtime.test.util.UniformRecordGenerator;
import eu.stratosphere.pact.runtime.test.util.UnionIterator;
import eu.stratosphere.types.IntValue;
import eu.stratosphere.types.Record;
import eu.stratosphere.util.Collector;
import eu.stratosphere.util.MutableObjectIterator;

/**
* Tests the specialized hash join that keeps the build side data (in memory and on hard disk),
* so that the probe side can be reopened and joined against it again.
* This is used for iterative tasks.
*/

public class ReOpenableHashTableITCase {
 
  // Page size (in bytes) handed to the memory manager in beforeTest().
  private static final int PAGE_SIZE = 8 * 1024;
  // Total memory handed to the memory manager in beforeTest() and to the match iterator in doTest().
  private static final long MEMORY_SIZE = PAGE_SIZE * 1000; // 1000 pages.

  // Seeds for the build-side (SEED1) and probe-side (SEED2) generators of the doTest()-based tests.
  private static final long SEED1 = 561349061987311L;
  private static final long SEED2 = 231434613412342L;
 
  // Number of probe passes: doTest() reopens the probe side this many times after the initial join,
  // the spilling tests run this many probe passes in total (one open plus the reopenings).
  private static final int NUM_PROBES = 3; // number of reopenings of hash join
 
  // Task object passed to the BuildFirstReOpenableHashMatchIterator in doTest().
  private final AbstractTask parentTask = new DummyInvokable();

  // Created in beforeTest(); shut down and checked in afterTest().
  private IOManager ioManager;
  private MemoryManager memoryManager;
 
  // Serializer and comparators (key field 0, key class TestData.Key) used by doTest().
  private TypeSerializer<Record> recordSerializer;
  private TypeComparator<Record> record1Comparator;
  private TypeComparator<Record> record2Comparator;
  private TypePairComparator<Record, Record> recordPairComparator;
 
 
  // Owner under which the spilling tests allocate the pages for the hash table.
  private static final AbstractInvokable MEM_OWNER = new DummyInvokable();
  // Serializers and comparators (key field 0, key class IntValue) used by the spilling tests.
  private TypeSerializer<Record> recordBuildSideAccesssor;
  private TypeSerializer<Record> recordProbeSideAccesssor;
  private TypeComparator<Record> recordBuildSideComparator;
  private TypeComparator<Record> recordProbeSideComparator;
  private TypePairComparator<Record, Record> pactRecordComparator;
 

  @SuppressWarnings("unchecked")
  @Before
  public void beforeTest()
  {
    this.recordSerializer = RecordSerializer.get();
   
    this.record1Comparator = new RecordComparator(new int[] {0}, new Class[] {TestData.Key.class});
    this.record2Comparator = new RecordComparator(new int[] {0}, new Class[] {TestData.Key.class});
    this.recordPairComparator = new RecordPairComparator(new int[] {0}, new int[] {0}, new Class[] {TestData.Key.class});
   
   
    final int[] keyPos = new int[] {0};
    final Class<? extends Key>[] keyType = (Class<? extends Key>[]) new Class[] { IntValue.class };
   
    this.recordBuildSideAccesssor = RecordSerializer.get();
    this.recordProbeSideAccesssor = RecordSerializer.get();
    this.recordBuildSideComparator = new RecordComparator(keyPos, keyType);
    this.recordProbeSideComparator = new RecordComparator(keyPos, keyType);
    this.pactRecordComparator = new HashTableITCase.RecordPairComparatorFirstInt();
   
    this.memoryManager = new DefaultMemoryManager(MEMORY_SIZE, PAGE_SIZE);
    this.ioManager = new IOManager();
  }

  @After
  public void afterTest()
  {
    if (this.ioManager != null) {
      this.ioManager.shutdown();
      if (!this.ioManager.isProperlyShutDown()) {
        Assert.fail("I/O manager failed to properly shut down.");
      }
      this.ioManager = null;
    }
   
    if (this.memoryManager != null) {
      Assert.assertTrue("Memory Leak: Not all memory has been returned to the memory manager.",
        this.memoryManager.verifyEmpty());
      this.memoryManager.shutdown();
      this.memoryManager = null;
    }
  }
 
 
  /**
   * Test behavior with overflow buckets (Overflow buckets must be initialized correctly
   * if the input is reopened again)
   */
  @Test
  public void testOverflow() {
   
    int buildSize = 1000;
    int probeSize = 1000;
    try {
      Generator bgen = new Generator(SEED1, 200, 1024, KeyMode.RANDOM, ValueMode.FIX_LENGTH);
      Generator pgen = new Generator(SEED2, 0, 1024, KeyMode.SORTED, ValueMode.FIX_LENGTH);
     
      final TestData.GeneratorIterator buildInput = new TestData.GeneratorIterator(bgen, buildSize);
      final TestData.GeneratorIterator probeInput = new TestData.GeneratorIterator(pgen, probeSize);
      doTest(buildInput,probeInput, bgen, pgen);
    }
    catch (Exception e) {
      e.printStackTrace();
      Assert.fail("An exception occurred during the test: " + e.getMessage());
    }
  }
 
  /**
   * Verify proper operation if the build side is spilled to disk.
   */
  @Test
  public void testDoubleProbeSpilling() {
   
    int buildSize = 1000;
    int probeSize = 1000;
    try {
      Generator bgen = new Generator(SEED1, 0, 1024, KeyMode.SORTED, ValueMode.FIX_LENGTH);
      Generator pgen = new Generator(SEED2, 0, 1024, KeyMode.SORTED, ValueMode.FIX_LENGTH);
     
      final TestData.GeneratorIterator buildInput = new TestData.GeneratorIterator(bgen, buildSize);
      final TestData.GeneratorIterator probeInput = new TestData.GeneratorIterator(pgen, probeSize);
      doTest(buildInput,probeInput, bgen, pgen);
    }
    catch (Exception e) {
      e.printStackTrace();
      Assert.fail("An exception occurred during the test: " + e.getMessage());
    }
  }
 
  /**
   * This test case verifies that hybrid hash join is able to handle multiple probe phases
   * when the build side fits completely into memory.
   */
  @Test
  public void testDoubleProbeInMemory() {
   
    int buildSize = 1000;
    int probeSize = 1000;
    try {
      Generator bgen = new Generator(SEED1, 0, 28, KeyMode.SORTED, ValueMode.FIX_LENGTH);
      Generator pgen = new Generator(SEED2, 0, 28, KeyMode.SORTED, ValueMode.FIX_LENGTH);
     
      final TestData.GeneratorIterator buildInput = new TestData.GeneratorIterator(bgen, buildSize);
      final TestData.GeneratorIterator probeInput = new TestData.GeneratorIterator(pgen, probeSize);
     
      doTest(buildInput,probeInput, bgen, pgen);
    }
    catch (Exception e) {
      e.printStackTrace();
      Assert.fail("An exception occurred during the test: " + e.getMessage());
    }
  }
 
  private void doTest(TestData.GeneratorIterator buildInput, TestData.GeneratorIterator probeInput, Generator bgen, Generator pgen) throws Exception {
    // collect expected data
    final Map<TestData.Key, Collection<RecordMatch>> expectedFirstMatchesMap = HashMatchIteratorITCase.matchRecordValues(
      HashMatchIteratorITCase.collectRecordData(buildInput),
      HashMatchIteratorITCase.collectRecordData(probeInput));
   
    final List<Map<TestData.Key, Collection<RecordMatch>>> expectedNMatchesMapList = new ArrayList<Map<Key,Collection<RecordMatch>>>(NUM_PROBES);
    final JoinFunction[] nMatcher = new RecordMatchRemovingJoin[NUM_PROBES];
    for(int i = 0; i < NUM_PROBES; i++) {
      Map<TestData.Key, Collection<RecordMatch>> tmp;
      expectedNMatchesMapList.add(tmp = deepCopy(expectedFirstMatchesMap));
      nMatcher[i] = new RecordMatchRemovingJoin(tmp);
    }
   
    final JoinFunction firstMatcher = new RecordMatchRemovingJoin(expectedFirstMatchesMap);
   
    final Collector<Record> collector = new DiscardingOutputCollector<Record>();

    // reset the generators
    bgen.reset();
    pgen.reset();
    buildInput.reset();
    probeInput.reset();

    // compare with iterator values
    BuildFirstReOpenableHashMatchIterator<Record, Record, Record> iterator =
        new BuildFirstReOpenableHashMatchIterator<Record, Record, Record>(
            buildInput, probeInput, this.recordSerializer, this.record1Comparator,
          this.recordSerializer, this.record2Comparator, this.recordPairComparator,
          this.memoryManager, ioManager, this.parentTask, MEMORY_SIZE);
   
    iterator.open();
    // do first join with both inputs
    while (iterator.callWithNextKey(firstMatcher, collector));

    // assert that each expected match was seen for the first input
    for (Entry<TestData.Key, Collection<RecordMatch>> entry : expectedFirstMatchesMap.entrySet()) {
      if (!entry.getValue().isEmpty()) {
        Assert.fail("Collection for key " + entry.getKey() + " is not empty");
      }
    }
   
    for(int i = 0; i < NUM_PROBES; i++) {
      pgen.reset();
      probeInput.reset();
      // prepare ..
      iterator.reopenProbe(probeInput);
      // .. and do second join
      while (iterator.callWithNextKey(nMatcher[i], collector));
     
      // assert that each expected match was seen for the second input
      for (Entry<TestData.Key, Collection<RecordMatch>> entry : expectedNMatchesMapList.get(i).entrySet()) {
        if (!entry.getValue().isEmpty()) {
          Assert.fail("Collection for key " + entry.getKey() + " is not empty");
        }
      }
    }
   
    iterator.close();
  }
 
  //
  //
  //  Tests taken from HashTableITCase!
  //
  //
 
  private final MutableObjectIterator<Record> getProbeInput(final int numKeys,
      final int probeValsPerKey, final int repeatedValue1, final int repeatedValue2) {
    MutableObjectIterator<Record> probe1 = new UniformRecordGenerator(numKeys, probeValsPerKey, true);
    MutableObjectIterator<Record> probe2 = new ConstantsKeyValuePairsIterator(repeatedValue1, 17, 5);
    MutableObjectIterator<Record> probe3 = new ConstantsKeyValuePairsIterator(repeatedValue2, 23, 5);
    List<MutableObjectIterator<Record>> probes = new ArrayList<MutableObjectIterator<Record>>();
    probes.add(probe1);
    probes.add(probe2);
    probes.add(probe3);
    return new UnionIterator<Record>(probes);
  }
 
  @Test
  public void testSpillingHashJoinWithMassiveCollisions() throws IOException
  {
    // the following two values are known to have a hash-code collision on the initial level.
    // we use them to make sure one partition grows over-proportionally large
    final int REPEATED_VALUE_1 = 40559;
    final int REPEATED_VALUE_2 = 92882;
    final int REPEATED_VALUE_COUNT_BUILD = 200000;
    final int REPEATED_VALUE_COUNT_PROBE = 5;
   
    final int NUM_KEYS = 1000000;
    final int BUILD_VALS_PER_KEY = 3;
    final int PROBE_VALS_PER_KEY = 10;
   
    // create a build input that gives 3 million pairs with 3 values sharing the same key, plus 400k pairs with two colliding keys
    MutableObjectIterator<Record> build1 = new UniformRecordGenerator(NUM_KEYS, BUILD_VALS_PER_KEY, false);
    MutableObjectIterator<Record> build2 = new ConstantsKeyValuePairsIterator(REPEATED_VALUE_1, 17, REPEATED_VALUE_COUNT_BUILD);
    MutableObjectIterator<Record> build3 = new ConstantsKeyValuePairsIterator(REPEATED_VALUE_2, 23, REPEATED_VALUE_COUNT_BUILD);
    List<MutableObjectIterator<Record>> builds = new ArrayList<MutableObjectIterator<Record>>();
    builds.add(build1);
    builds.add(build2);
    builds.add(build3);
    MutableObjectIterator<Record> buildInput = new UnionIterator<Record>(builds);
 
   
   

    // allocate the memory for the HashTable
    List<MemorySegment> memSegments;
    try {
      memSegments = this.memoryManager.allocatePages(MEM_OWNER, 896);
    }
    catch (MemoryAllocationException maex) {
      fail("Memory for the Join could not be provided.");
      return;
    }
   
    // create the map for validating the results
    HashMap<Integer, Long> map = new HashMap<Integer, Long>(NUM_KEYS);
   
    // ----------------------------------------------------------------------------------------
   
    final ReOpenableMutableHashTable<Record, Record> join = new ReOpenableMutableHashTable<Record, Record>(
        this.recordBuildSideAccesssor, this.recordProbeSideAccesssor,
        this.recordBuildSideComparator, this.recordProbeSideComparator, this.pactRecordComparator,
        memSegments, ioManager);
   
    for(int probe = 0; probe < NUM_PROBES; probe++) {
      // create a probe input that gives 10 million pairs with 10 values sharing a key
      MutableObjectIterator<Record> probeInput = getProbeInput(NUM_KEYS, PROBE_VALS_PER_KEY, REPEATED_VALUE_1, REPEATED_VALUE_2);
      if(probe == 0) {
        join.open(buildInput, probeInput);
      } else {
        join.reopenProbe(probeInput);
      }
   
      Record record;
      final Record recordReuse = new Record();

      while (join.nextRecord())
      {
        int numBuildValues = 0;
   
        final Record probeRec = join.getCurrentProbeRecord();
        int key = probeRec.getField(0, IntValue.class).getValue();
       
        HashBucketIterator<Record, Record> buildSide = join.getBuildSideIterator();
        if ((record = buildSide.next(recordReuse)) != null) {
          numBuildValues = 1;
          Assert.assertEquals("Probe-side key was different than build-side key.", key, record.getField(0, IntValue.class).getValue());
        }
        else {
          fail("No build side values found for a probe key.");
        }
        while ((record = buildSide.next(record)) != null) {
          numBuildValues++;
          Assert.assertEquals("Probe-side key was different than build-side key.", key, record.getField(0, IntValue.class).getValue());
        }
       
        Long contained = map.get(key);
        if (contained == null) {
          contained = new Long(numBuildValues);
        }
        else {
          contained = new Long(contained.longValue() + numBuildValues);
        }
       
        map.put(key, contained);
      }
    }
   
    join.close();
   
    Assert.assertEquals("Wrong number of keys", NUM_KEYS, map.size());
    for (Map.Entry<Integer, Long> entry : map.entrySet()) {
      long val = entry.getValue();
      int key = entry.getKey();
 
      if( key == REPEATED_VALUE_1 || key == REPEATED_VALUE_2) {
        Assert.assertEquals("Wrong number of values in per-key cross product for key " + key,
              (PROBE_VALS_PER_KEY + REPEATED_VALUE_COUNT_PROBE) * (BUILD_VALS_PER_KEY + REPEATED_VALUE_COUNT_BUILD) * NUM_PROBES, val);
      } else {
        Assert.assertEquals("Wrong number of values in per-key cross product for key " + key,
              PROBE_VALS_PER_KEY * BUILD_VALS_PER_KEY * NUM_PROBES, val);
      }
    }
   
   
    // ----------------------------------------------------------------------------------------
   
    this.memoryManager.release(join.getFreedMemory());
  }
 
  /*
   * This test is basically identical to the "testSpillingHashJoinWithMassiveCollisions" test, only that the number
   * of repeated values (causing bucket collisions) are large enough to make sure that their target partition no longer
   * fits into memory by itself and needs to be repartitioned in the recursion again.
   */
  @Test
  public void testSpillingHashJoinWithTwoRecursions() throws IOException
  {
    // the following two values are known to have a hash-code collision on the first recursion level.
    // we use them to make sure one partition grows over-proportionally large
    final int REPEATED_VALUE_1 = 40559;
    final int REPEATED_VALUE_2 = 92882;
    final int REPEATED_VALUE_COUNT_BUILD = 200000;
    final int REPEATED_VALUE_COUNT_PROBE = 5;
   
    final int NUM_KEYS = 1000000;
    final int BUILD_VALS_PER_KEY = 3;
    final int PROBE_VALS_PER_KEY = 10;
   
    // create a build input that gives 3 million pairs with 3 values sharing the same key, plus 400k pairs with two colliding keys
    MutableObjectIterator<Record> build1 = new UniformRecordGenerator(NUM_KEYS, BUILD_VALS_PER_KEY, false);
    MutableObjectIterator<Record> build2 = new ConstantsKeyValuePairsIterator(REPEATED_VALUE_1, 17, REPEATED_VALUE_COUNT_BUILD);
    MutableObjectIterator<Record> build3 = new ConstantsKeyValuePairsIterator(REPEATED_VALUE_2, 23, REPEATED_VALUE_COUNT_BUILD);
    List<MutableObjectIterator<Record>> builds = new ArrayList<MutableObjectIterator<Record>>();
    builds.add(build1);
    builds.add(build2);
    builds.add(build3);
    MutableObjectIterator<Record> buildInput = new UnionIterator<Record>(builds);
 

    // allocate the memory for the HashTable
    List<MemorySegment> memSegments;
    try {
      memSegments = this.memoryManager.allocatePages(MEM_OWNER, 896);
    }
    catch (MemoryAllocationException maex) {
      fail("Memory for the Join could not be provided.");
      return;
    }
   
    // create the map for validating the results
    HashMap<Integer, Long> map = new HashMap<Integer, Long>(NUM_KEYS);
   
    // ----------------------------------------------------------------------------------------
   
    final ReOpenableMutableHashTable<Record, Record> join = new ReOpenableMutableHashTable<Record, Record>(
        this.recordBuildSideAccesssor, this.recordProbeSideAccesssor,
        this.recordBuildSideComparator, this.recordProbeSideComparator, this.pactRecordComparator,
        memSegments, ioManager);
    for(int probe = 0; probe < NUM_PROBES; probe++) {
      // create a probe input that gives 10 million pairs with 10 values sharing a key
      MutableObjectIterator<Record> probeInput = getProbeInput(NUM_KEYS, PROBE_VALS_PER_KEY, REPEATED_VALUE_1, REPEATED_VALUE_2);
      if(probe == 0) {
        join.open(buildInput, probeInput);
      } else {
        join.reopenProbe(probeInput);
      }
      Record record;
      final Record recordReuse = new Record();

      while (join.nextRecord())
      { 
        int numBuildValues = 0;
       
        final Record probeRec = join.getCurrentProbeRecord();
        int key = probeRec.getField(0, IntValue.class).getValue();
       
        HashBucketIterator<Record, Record> buildSide = join.getBuildSideIterator();
        if ((record = buildSide.next(recordReuse)) != null) {
          numBuildValues = 1;
          Assert.assertEquals("Probe-side key was different than build-side key.", key, record.getField(0, IntValue.class).getValue());
        }
        else {
          fail("No build side values found for a probe key.");
        }
        while ((record = buildSide.next(recordReuse)) != null) {
          numBuildValues++;
          Assert.assertEquals("Probe-side key was different than build-side key.", key, record.getField(0, IntValue.class).getValue());
        }
       
        Long contained = map.get(key);
        if (contained == null) {
          contained = new Long(numBuildValues);
        }
        else {
          contained = new Long(contained.longValue() + numBuildValues);
        }
       
        map.put(key, contained);
      }
    }
   
    join.close();
    Assert.assertEquals("Wrong number of keys", NUM_KEYS, map.size());
    for (Map.Entry<Integer, Long> entry : map.entrySet()) {
      long val = entry.getValue();
      int key = entry.getKey();
 
      if( key == REPEATED_VALUE_1 || key == REPEATED_VALUE_2) {
        Assert.assertEquals("Wrong number of values in per-key cross product for key " + key,
              (PROBE_VALS_PER_KEY + REPEATED_VALUE_COUNT_PROBE) * (BUILD_VALS_PER_KEY + REPEATED_VALUE_COUNT_BUILD) * NUM_PROBES, val);
      } else {
        Assert.assertEquals("Wrong number of values in per-key cross product for key " + key,
              PROBE_VALS_PER_KEY * BUILD_VALS_PER_KEY * NUM_PROBES, val);
      }
    }
   
   
    // ----------------------------------------------------------------------------------------
   
    this.memoryManager.release(join.getFreedMemory());
  }
 
 
  static Map<Key, Collection<RecordMatch>> deepCopy(Map<Key, Collection<RecordMatch>> expectedSecondMatchesMap) {
    Map<Key, Collection<RecordMatch>> copy = new HashMap<Key, Collection<RecordMatch>>(expectedSecondMatchesMap.size());
    for(Map.Entry<Key, Collection<RecordMatch>> entry : expectedSecondMatchesMap.entrySet()) {
      List<RecordMatch> matches = new ArrayList<RecordMatch>(entry.getValue().size());
      for(RecordMatch m : entry.getValue()) {
        matches.add(m);
      }
      copy.put(entry.getKey(), matches);
    }
    return copy;
  }
 
}
TOP

Related Classes of eu.stratosphere.pact.runtime.hash.ReOpenableHashTableITCase

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.