Package com.cloudera.recordbreaker.learnstructure.test

Source Code of com.cloudera.recordbreaker.learnstructure.test.GenerateTestAvro$Purchase

/*
* Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
*
* Cloudera, Inc. licenses this file to you under the Apache License,
* Version 2.0 (the "License"). You may not use this file except in
* compliance with the License. You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied. See the License for
* the specific language governing permissions and limitations under the
* License.
*/
package com.cloudera.recordbreaker.learnstructure.test;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.OutputStreamWriter;
import java.io.FileOutputStream;

import java.util.Random;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.io.Encoder;
import org.apache.avro.io.JsonEncoder;
import org.apache.avro.reflect.ReflectDatumWriter;
import org.apache.avro.reflect.ReflectData;
import org.codehaus.jackson.JsonGenerator;

import com.cloudera.recordbreaker.schemadict.TestRecord;
import com.cloudera.recordbreaker.schemadict.SchemaSuggest;

/*********************************************************************
* This class generates a number of test data files for the schema inference program.
* It makes data for five different genres:
* 1) A Web crawl
* 2) An access log
* 3) A file listing
* 4) Sensor data
* 5) Purchase transactions
*
* We also attempt to generate statistically-plausible data for each genre.
*
* @author mjc
***********************************************************************/
public class GenerateTestAvro {
  static long DAY_IN_MILLIS = 1000 * 60 * 60 * 24;
  static long WEEK_IN_MILLIS = 7 * DAY_IN_MILLIS;
  static int NUM_ALPHA = 32;
  static int CAPITAL_A = 65;
  static int LOWER_A = 97;
  static int RESULT_CODES[] = {200, 401, 501, 301, 403};
  static double RESULT_CODE_DIST[] = {.8, .1, .02, .02, .06};

  static Random r = new Random(3333);
  public GenerateTestAvro() {
  }

 
  /**
   * Main method for building all the test data files.
   */
  public void generateData(File outDir, int numRecords) throws IOException, InstantiationException {
    // Create the target dir
    outDir = outDir.getCanonicalFile();
    if (outDir.exists()) {
      throw new IOException("Directory already exists: " + outDir);
    }
    outDir.mkdirs();

    //
    // Emit WebPage data.  Note the weird "Instantiator" business that appears as if it could be done
    // via Class.newInstance().  We can't do that here because newInstance() is incompatible with inner
    // classes.
    //
    Schema webCrawlSchema = ReflectData.get().getSchema(WebPage.class);
    emitSchema(new File(outDir, "webcrawl.schema"), webCrawlSchema);
    emitData(new File(outDir, "webcrawl.dat"), webCrawlSchema, numRecords, new Instantiator<WebPage>() {
        public WebPage create() {
          return new WebPage();
        }
      });

    //
    // Access log
    //
    Schema accessLogSchema = ReflectData.get().getSchema(AccessLog.class);
    emitSchema(new File(outDir, "accesslog.schema"), accessLogSchema);
    emitData(new File(outDir, "accesslog.dat"), accessLogSchema, numRecords, new Instantiator<AccessLog>() {
        public AccessLog create() {
          return new AccessLog();
        }
      });

    //
    // File listing
    //
    Schema fileListingSchema = ReflectData.get().getSchema(FileListing.class);
    emitSchema(new File(outDir, "filelisting.schema"), fileListingSchema);
    emitData(new File(outDir, "filelisting.dat"), fileListingSchema, numRecords, new Instantiator<FileListing>() {
        public FileListing create() {
          return new FileListing();
        }
      });

    //
    // Sensor data
    //
    Schema sensorDataSchema = ReflectData.get().getSchema(SensorData.class);
    emitSchema(new File(outDir, "sensordata.schema"), sensorDataSchema);
    emitData(new File(outDir, "sensordata.dat"), sensorDataSchema, numRecords, new Instantiator<SensorData>() {
        public SensorData create() {
          return new SensorData();
        }
      });

    //
    // Purchases
    //
    Schema purchaseSchema = ReflectData.get().getSchema(Purchase.class);
    emitSchema(new File(outDir, "purchase.schema"), purchaseSchema);
    emitData(new File(outDir, "purchase.dat"), purchaseSchema, numRecords, new Instantiator<Purchase>() {
        public Purchase create() {
          return new Purchase();
        }
      });
  }

  ///////////////////////////
  // The sample classes
  ///////////////////////////
  /**
   * Data type #1: WebPage
   */
  public class WebPage  {
    String url;
    long dateCrawled;
    double rank;
    int lastResultCode;
    int failedAttempts;
    long nextCrawl;
    String content;
    public WebPage() {
      this.url = "http://" + generateRandomString(10, 100);
      this.dateCrawled = System.currentTimeMillis() - (Math.abs(r.nextLong()) % WEEK_IN_MILLIS);
      this.rank = r.nextDouble();
      this.lastResultCode = RESULT_CODES[chooseIndex(RESULT_CODE_DIST)];
      this.failedAttempts = r.nextInt(3);
      this.nextCrawl = dateCrawled + WEEK_IN_MILLIS;
      this.content = generateRandomString(1024, 10 * 1024);
    }
  }

  /**
   * Data type #2: access log.  (Taken from Pavlo, et al, SIGMOD 2009)
   */
  public class AccessLog  {
    String srcIP;
    String destURL;
    long visitDate;
    float adRevenue;
    String userAgent;
    String countryCode;
    String languageCode;
    String searchWord;
    int duration;
    public AccessLog() {
      this.srcIP = generateRandomString(12, 12);
      this.destURL = "http://" + generateRandomString(10, 100);
      this.visitDate = System.currentTimeMillis() - (Math.abs(r.nextLong()) % DAY_IN_MILLIS);
      this.adRevenue = Math.abs(r.nextFloat()) * 100;
      this.userAgent = generateRandomString(4, 10);
      this.countryCode = generateRandomString(2, 2);
      this.languageCode = generateRandomString(4, 4);
      this.searchWord = generateRandomString(4, 20);
      this.duration = r.nextInt(10000);
    }
  }

  /**
   * Data type #3: file listing
   */
  public class FileListing  {
    String permissions;
    String user;
    String group;
    int size;
    String month;
    int day;
    String time;
    String filename;

    public FileListing() {
      this.permissions = generateRandomString(10, 10);
      this.user = generateRandomString(2, 8);
      this.group = generateRandomString(5, 8);     
      this.size = r.nextInt(9086);
      this.month = generateRandomString(3, 3);
      this.day = r.nextInt(31);
      this.time = generateRandomString(5, 5);
      this.filename = generateRandomString(3, 20);
    }
  }

  /**
   * Data type #4: sensor data
   */
  public class SensorData  {
    double temp;
    double lumens;
    double pressure;
    long timestamp;
    int xpos;
    int ypos;
    int zpos;

    public SensorData() {
      this.temp = r.nextDouble() * 120;
      this.lumens = r.nextDouble() * 15000;
      this.pressure = r.nextDouble();
      this.timestamp = System.currentTimeMillis() + (Math.abs(r.nextLong()) % DAY_IN_MILLIS);
      this.xpos = r.nextInt(1000);
      this.ypos = r.nextInt(1000);
      this.zpos = r.nextInt(1000);
    }
  }

  /**
   * Data type #5: purchases
   */
  public class Purchase  {
    long productCode;
    String productDesc;
    double price;
    long timestamp;
    int quantity;

    public Purchase() {
      this.productCode = Math.abs(r.nextLong());
      this.productDesc = generateRandomString(15, 25);
      this.price = r.nextDouble() * 10000;
      this.timestamp = System.currentTimeMillis() + (Math.abs(r.nextLong()) % DAY_IN_MILLIS);
      this.quantity = r.nextInt(10);
    }
  }


  /////////////////////////////
  // Utility file-handling
  //////////////////////////////
  /**
   */
  void emitSchema(File outSchema, Schema schema) throws IOException {
    OutputStreamWriter out = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(outSchema)));
    try {
      out.write(schema.toString(true));
    } finally {
      out.close();
    }        
  }

  /**
   */
  void emitData(File outData, Schema schema, int numRecords, Instantiator inster) throws IOException, InstantiationException {
    DatumWriter dout = new ReflectDatumWriter(schema);

    DataFileWriter out = new DataFileWriter(dout);
    out = out.create(schema, outData);
    try {
      for (int i = 0; i < numRecords; i++) {
        out.append(inster.create());
      }
      //encoder.flush();
    } finally {
      out.close();
    }
  }

  /////////////////////////////
  // Utility class-handling
  //////////////////////////////
  interface Instantiator<T> {
    public T create();
  }

  /////////////////////////////
  // Utility random-gen
  //////////////////////////////
  String generateRandomString(int minLen, int maxLen) {
    int target = minLen;
    if (maxLen - minLen > 0) {
      target += r.nextInt(maxLen-minLen);
    }

    StringBuffer buf = new StringBuffer();
    for (int i = 0; i < target; i++) {
      int rval = r.nextInt(NUM_ALPHA * 2);
      if (rval < NUM_ALPHA) {
        buf.append((char) (CAPITAL_A + rval));
      } else {
        buf.append((char) (LOWER_A + rval));
      }
    }
    return buf.toString();
  }

  /**
   * We require, but do not test, that the contents of distribution sums to 1.0
   */
  int chooseIndex(double[] distribution) {
    double target = r.nextDouble();
    for (int i = 0; i < distribution.length; i++) {
      target -= distribution[i];
      if (target <= 0) {
        return i;
      }
    }
    return distribution.length-1;
  }

  /**
   */
  public static void main(String argv[]) throws IOException, InstantiationException {
    CommandLine cmd = null;
    Options options = new Options();
    options.addOption("?", false, "Help for command-line");
    options.addOption("n", true, "# tuples to emit per file");

    try {
      CommandLineParser parser = new PosixParser();
      cmd = parser.parse(options, argv);
    } catch (ParseException pe) {
      HelpFormatter fmt = new HelpFormatter();
      fmt.printHelp("GenerateTestAvro", options, true);
      System.exit(-1);
    }

    if (cmd.hasOption("?")) {
      HelpFormatter fmt = new HelpFormatter();
      fmt.printHelp("GenerateTestAvro", options, true);
      System.exit(0);
    }

    int numToEmit = 100;
    if (cmd.hasOption("n")) {
      try {
        numToEmit = Integer.parseInt(cmd.getOptionValue("n"));
      } catch (NumberFormatException nfe) {
        nfe.printStackTrace();
      }
    }

    String[] argArray = cmd.getArgs();
    if (argArray.length == 0) {
      HelpFormatter fmt = new HelpFormatter();
      fmt.printHelp("GenerateTestAvro", options, true);
      System.exit(0);
    }
    File outputDir = new File(argArray[0]).getCanonicalFile();

    GenerateTestAvro gta = new GenerateTestAvro();
    gta.generateData(outputDir, numToEmit);
  }
}
TOP

Related Classes of com.cloudera.recordbreaker.learnstructure.test.GenerateTestAvro$Purchase

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.