Package org.apache.tez.mapreduce.examples.terasort

Source Code of org.apache.tez.mapreduce.examples.terasort.GenSort

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tez.mapreduce.examples.terasort;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.math.BigInteger;
import java.util.zip.Checksum;

import org.apache.hadoop.util.PureJavaCrc32;

/**
* A single process data generator for the terasort data. Based on gensort.c
* version 1.1 (3 Mar 2009) from Chris Nyberg <chris.nyberg@ordinal.com>.
*/
public class GenSort {

  /**
   * Generate a "binary" record suitable for all sort benchmarks *except*
   * PennySort.
   */
  static void generateRecord(byte[] recBuf, Unsigned16 rand,
                                     Unsigned16 recordNumber) {
    /* generate the 10-byte key using the high 10 bytes of the 128-bit
     * random number
     */
    for(int i=0; i < 10; ++i) {
      recBuf[i] = rand.getByte(i);
    }

    /* add 2 bytes of "break" */
    recBuf[10] = 0x00;
    recBuf[11] = 0x11;

    /* convert the 128-bit record number to 32 bits of ascii hexadecimal
     * as the next 32 bytes of the record.
     */
    for (int i = 0; i < 32; i++) {
      recBuf[12 + i] = (byte) recordNumber.getHexDigit(i);
    }

    /* add 4 bytes of "break" data */
    recBuf[44] = (byte) 0x88;
    recBuf[45] = (byte) 0x99;
    recBuf[46] = (byte) 0xAA;
    recBuf[47] = (byte) 0xBB;

    /* add 48 bytes of filler based on low 48 bits of random number */
    for(int i=0; i < 12; ++i) {
      recBuf[48+i*4] = recBuf[49+i*4] = recBuf[50+i*4] = recBuf[51+i*4] =
        (byte) rand.getHexDigit(20 + i);
    }

    /* add 4 bytes of "break" data */
    recBuf[96] = (byte) 0xCC;
    recBuf[97] = (byte) 0xDD;
    recBuf[98] = (byte) 0xEE;
    recBuf[99] = (byte) 0xFF;
  }


  private static BigInteger makeBigInteger(long x) {
    byte[] data = new byte[8];
    for(int i=0; i < 8; ++i) {
      data[i] = (byte) (x >>> (56 - 8*i));
    }
    return new BigInteger(1, data);
  }

  private static final BigInteger NINETY_FIVE = new BigInteger("95");

  /**
   * Generate an ascii record suitable for all sort benchmarks including
   * PennySort.
   */
  static void generateAsciiRecord(byte[] recBuf, Unsigned16 rand,
                                  Unsigned16 recordNumber) {

    /* generate the 10-byte ascii key using mostly the high 64 bits.
     */
    long temp = rand.getHigh8();
    if (temp < 0) {
      // use biginteger to avoid the negative sign problem
      BigInteger bigTemp = makeBigInteger(temp);
      recBuf[0] = (byte) (' ' + (bigTemp.mod(NINETY_FIVE).longValue()));
      temp = bigTemp.divide(NINETY_FIVE).longValue();
    } else {
      recBuf[0] = (byte) (' ' + (temp % 95));
      temp /= 95;     
    }
    for(int i=1; i < 8; ++i) {
      recBuf[i] = (byte) (' ' + (temp % 95));
      temp /= 95;     
    }
    temp = rand.getLow8();
    if (temp < 0) {
      BigInteger bigTemp = makeBigInteger(temp);
      recBuf[8] = (byte) (' ' + (bigTemp.mod(NINETY_FIVE).longValue()));
      temp = bigTemp.divide(NINETY_FIVE).longValue();     
    } else {
      recBuf[8] = (byte) (' ' + (temp % 95));
      temp /= 95;
    }
    recBuf[9] = (byte)(' ' + (temp % 95));

    /* add 2 bytes of "break" */
    recBuf[10] = ' ';
    recBuf[11] = ' ';

    /* convert the 128-bit record number to 32 bits of ascii hexadecimal
     * as the next 32 bytes of the record.
     */
    for (int i = 0; i < 32; i++) {
      recBuf[12 + i] = (byte) recordNumber.getHexDigit(i);
    }

    /* add 2 bytes of "break" data */
    recBuf[44] = ' ';
    recBuf[45] = ' ';

    /* add 52 bytes of filler based on low 48 bits of random number */
    for(int i=0; i < 13; ++i) {
      recBuf[46+i*4] = recBuf[47+i*4] = recBuf[48+i*4] = recBuf[49+i*4] =
        (byte) rand.getHexDigit(19 + i);
    }

    /* add 2 bytes of "break" data */
    recBuf[98] = '\r'/* nice for Windows */
    recBuf[99] = '\n';
}


  private static void usage() {
    PrintStream out = System.out;
    out.println("usage: gensort [-a] [-c] [-bSTARTING_REC_NUM] NUM_RECS FILE_NAME");
    out.println("-a        Generate ascii records required for PennySort or JouleSort.");
    out.println("          These records are also an alternative input for the other");
    out.println("          sort benchmarks.  Without this flag, binary records will be");
    out.println("          generated that contain the highest density of randomness in");
    out.println("          the 10-byte key.");
    out.println( "-c        Calculate the sum of the crc32 checksums of each of the");
    out.println("          generated records and send it to standard error.");
    out.println("-bN       Set the beginning record generated to N. By default the");
    out.println("          first record generated is record 0.");
    out.println("NUM_RECS  The number of sequential records to generate.");
    out.println("FILE_NAME The name of the file to write the records to.\n");
    out.println("Example 1 - to generate 1000000 ascii records starting at record 0 to");
    out.println("the file named \"pennyinput\":");
    out.println("    gensort -a 1000000 pennyinput\n");
    out.println("Example 2 - to generate 1000 binary records beginning with record 2000");
    out.println("to the file named \"partition2\":");
    out.println("    gensort -b2000 1000 partition2");
    System.exit(1);
  }


  public static void outputRecords(OutputStream out,
                                   boolean useAscii,
                                   Unsigned16 firstRecordNumber,
                                   Unsigned16 recordsToGenerate,
                                   Unsigned16 checksum
                                   ) throws IOException {
    byte[] row = new byte[100];
    Unsigned16 recordNumber = new Unsigned16(firstRecordNumber);
    Unsigned16 lastRecordNumber = new Unsigned16(firstRecordNumber);
    Checksum crc = new PureJavaCrc32();
    Unsigned16 tmp = new Unsigned16();
    lastRecordNumber.add(recordsToGenerate);
    Unsigned16 ONE = new Unsigned16(1);
    Unsigned16 rand = Random16.skipAhead(firstRecordNumber);
    while (!recordNumber.equals(lastRecordNumber)) {
      Random16.nextRand(rand);
      if (useAscii) {
        generateAsciiRecord(row, rand, recordNumber);
      } else {
        generateRecord(row, rand, recordNumber);
      }
      if (checksum != null) {
        crc.reset();
        crc.update(row, 0, row.length);
        tmp.set(crc.getValue());
        checksum.add(tmp);
      }
      recordNumber.add(ONE);
      out.write(row);
    }
  }
                                  
  public static void main(String[] args) throws Exception {
    Unsigned16 startingRecord = new Unsigned16();
    Unsigned16 numberOfRecords;
    OutputStream out;
    boolean useAscii = false;
    Unsigned16 checksum = null;

    int i;
    for(i=0; i < args.length; ++i) {
      String arg = args[i];
      int argLength = arg.length();
      if (argLength >= 1 && arg.charAt(0) == '-') {
        if (argLength < 2) {
          usage();
        }
        switch (arg.charAt(1)) {
        case 'a':
          useAscii = true;
          break;
        case 'b':
          startingRecord = Unsigned16.fromDecimal(arg.substring(2));
          break;
        case 'c':
          checksum = new Unsigned16();
          break;
        default:
          usage();
        }
      } else {
        break;
      }
    }
    if (args.length - i != 2) {
      usage();
    }
    numberOfRecords = Unsigned16.fromDecimal(args[i]);
    out = new FileOutputStream(args[i+1]);

    outputRecords(out, useAscii, startingRecord, numberOfRecords, checksum);
    out.close();
    if (checksum != null) {
      System.out.println(checksum);
    }
  }

}
TOP

Related Classes of org.apache.tez.mapreduce.examples.terasort.GenSort

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.