/*
* Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
*
* Cloudera, Inc. licenses this file to you under the Apache License,
* Version 2.0 (the "License"). You may not use this file except in
* compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied. See the License for
* the specific language governing permissions and limitations under the
* License.
*/
package com.cloudera.recordbreaker.learnstructure.test;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.nio.ByteBuffer;
import java.util.Map;
import java.util.List;
import java.util.HashMap;
import java.util.Random;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Type;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.io.Encoder;
import org.apache.avro.io.JsonEncoder;
import org.apache.avro.io.EncoderFactory;
import org.apache.avro.reflect.ReflectDatumWriter;
import org.apache.avro.reflect.ReflectData;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.util.Utf8;
import org.codehaus.jackson.JsonGenerator;
import com.cloudera.recordbreaker.schemadict.TestRecord;
import com.cloudera.recordbreaker.schemadict.SchemaSuggest;
/**
 * Generates Avro data for testing.
 *
 * <p>{@link #generateData(Schema)} builds a randomly populated generic datum for
 * an arbitrary Avro schema, {@link #generateData(boolean, File, int)} writes
 * default-constructed {@link TestRecord} instances to a file, and
 * {@link #main(String[])} is a command-line driver that reads a schema file and
 * writes random data conforming to it into an Avro data file.
 *
 * @author mjc
 */
public class GenerateRandomData {
  /** Number of records emitted by {@link #main(String[])} when {@code -n} is absent or unparsable. */
  private static final int DEFAULT_NUM_RECORDS = 100;

  /** Exclusive upper bound on the random size offset; arrays and maps get 1..100 entries. */
  private static final int MAX_COLLECTION_SIZE = 100;

  /** Length of the payload generated for {@code BYTES} values. */
  private static final int BYTES_LENGTH = 16;

  /** Never assigned by this class; retained for source compatibility with same-package code. */
  Schema schema;

  /** Source of all randomness used by {@link #generateData(Schema)}. */
  Random r = new Random();

  /**
   * Creates a generator backed by an unseeded {@link Random}.
   */
  public GenerateRandomData() {
  }

  /**
   * Recursively builds a random datum that conforms to the given schema.
   *
   * @param s the schema describing the value to generate
   * @return a random value for {@code s}; {@code null} for the {@code NULL}
   *         type or for a schema type this method does not recognize
   */
  Object generateData(Schema s) {
    Schema.Type stype = s.getType();
    if (stype == Schema.Type.ARRAY) {
      return generateArray(s);
    }
    if (stype == Schema.Type.BOOLEAN) {
      return Boolean.valueOf(r.nextBoolean());
    }
    if (stype == Schema.Type.BYTES) {
      return ByteBuffer.wrap(randomBytes(BYTES_LENGTH));
    }
    if (stype == Schema.Type.DOUBLE) {
      return Double.valueOf(r.nextDouble());
    }
    if (stype == Schema.Type.ENUM) {
      List<String> symbols = s.getEnumSymbols();
      return symbols.get(r.nextInt(symbols.size()));
    }
    if (stype == Schema.Type.FIXED) {
      // The payload length must match the size declared by the fixed schema.
      return new GenericData.Fixed(s, randomBytes(s.getFixedSize()));
    }
    if (stype == Schema.Type.FLOAT) {
      return Float.valueOf(r.nextFloat());
    }
    if (stype == Schema.Type.INT) {
      return Integer.valueOf(r.nextInt());
    }
    if (stype == Schema.Type.LONG) {
      return Long.valueOf(r.nextLong());
    }
    if (stype == Schema.Type.MAP) {
      return generateMap(s);
    }
    if (stype == Schema.Type.NULL) {
      return null;
    }
    if (stype == Schema.Type.RECORD) {
      return generateRecord(s);
    }
    if (stype == Schema.Type.STRING) {
      return new Utf8("Rand-" + r.nextInt());
    }
    if (stype == Schema.Type.UNION) {
      // Pick one branch of the union at random and generate a value for it.
      List<Schema> types = s.getTypes();
      return generateData(types.get(r.nextInt(types.size())));
    }
    return null;
  }

  /** Builds an array of 1..100 random elements for the given ARRAY schema. */
  private GenericData.Array<Object> generateArray(Schema arraySchema) {
    Schema elementSchema = arraySchema.getElementType();
    int numElts = 1 + r.nextInt(MAX_COLLECTION_SIZE);
    // GenericData.Array is constructed with the ARRAY schema itself, not the element schema.
    GenericData.Array<Object> result = new GenericData.Array<Object>(numElts, arraySchema);
    for (int i = 0; i < numElts; i++) {
      result.add(generateData(elementSchema));
    }
    return result;
  }

  /** Builds a map of 1..100 entries keyed "label-0", "label-1", ... for the given MAP schema. */
  private Map<Utf8, Object> generateMap(Schema mapSchema) {
    Schema valType = mapSchema.getValueType();
    int numElts = 1 + r.nextInt(MAX_COLLECTION_SIZE);
    Map<Utf8, Object> result = new HashMap<Utf8, Object>();
    for (int i = 0; i < numElts; i++) {
      result.put(new Utf8("label-" + i), generateData(valType));
    }
    return result;
  }

  /** Builds a generic record with a random value for every field of the given RECORD schema. */
  private GenericData.Record generateRecord(Schema recordSchema) {
    GenericData.Record result = new GenericData.Record(recordSchema);
    for (Schema.Field f : recordSchema.getFields()) {
      result.put(f.name(), generateData(f.schema()));
    }
    return result;
  }

  /** Returns a new array of the given length filled with random bytes. */
  private byte[] randomBytes(int length) {
    byte[] bytes = new byte[length];
    r.nextBytes(bytes);
    return bytes;
  }

  /**
   * Writes {@code numRecords} default-constructed {@link TestRecord} instances
   * to {@code outfile}, using the schema obtained by reflection on
   * {@code TestRecord}.
   *
   * @param encodeJson if {@code true}, records are written through Avro's JSON
   *                   encoder; otherwise they are written with a
   *                   {@link DataFileWriter}
   * @param outfile    the file to write to
   * @param numRecords the number of records to write
   * @throws IOException if the file cannot be created or written
   */
  public void generateData(boolean encodeJson, File outfile, int numRecords) throws IOException {
    Schema schema = ReflectData.get().getSchema(TestRecord.class);
    DatumWriter<TestRecord> dout = new ReflectDatumWriter<TestRecord>(schema);
    if (encodeJson) {
      BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(outfile));
      try {
        Encoder encoder = EncoderFactory.get().jsonEncoder(schema, (OutputStream) out);
        for (int i = 0; i < numRecords; i++) {
          TestRecord tr = new TestRecord();
          dout.write(tr, encoder);
        }
        // The encoder buffers internally; flush before the stream is closed.
        encoder.flush();
      } finally {
        out.close();
      }
    } else {
      DataFileWriter<TestRecord> out = new DataFileWriter<TestRecord>(dout);
      try {
        out.create(schema, outfile);
        for (int i = 0; i < numRecords; i++) {
          TestRecord tr = new TestRecord();
          out.append(tr);
        }
      } finally {
        out.close();
      }
    }
  }

  /** Prints the command-line help for this tool. */
  private static void printUsage(Options options) {
    HelpFormatter fmt = new HelpFormatter();
    fmt.printHelp("GenerateRandomData", options, true);
  }

  /**
   * Command-line entry point.
   *
   * <p>Expects two positional arguments: the path of an Avro schema file and
   * the path of the output data file, which must not already exist. The
   * {@code -n} option sets the number of data items to emit (default 100) and
   * {@code -?} prints help. Usage errors and a pre-existing output file end the
   * process with a non-zero exit status.
   *
   * @param argv the command-line arguments
   * @throws IOException if the schema cannot be read or the output cannot be written
   */
  public static void main(String argv[]) throws IOException {
    Options options = new Options();
    options.addOption("?", false, "Help for command-line");
    options.addOption("n", true, "Number elts to emit");

    CommandLine cmd = null;
    try {
      CommandLineParser parser = new PosixParser();
      cmd = parser.parse(options, argv);
    } catch (ParseException pe) {
      printUsage(options);
      System.exit(-1);
    }

    if (cmd.hasOption("?")) {
      printUsage(options);
      System.exit(0);
    }

    int numToEmit = DEFAULT_NUM_RECORDS;
    if (cmd.hasOption("n")) {
      String rawCount = cmd.getOptionValue("n");
      try {
        numToEmit = Integer.parseInt(rawCount);
      } catch (NumberFormatException nfe) {
        // Best effort: keep going with the default rather than aborting.
        System.err.println("Invalid value for -n: '" + rawCount + "'; using default of "
            + DEFAULT_NUM_RECORDS);
      }
    }

    // Both positional arguments are required; checking for fewer than two
    // prevents an ArrayIndexOutOfBoundsException when only one is supplied.
    String[] argArray = cmd.getArgs();
    if (argArray.length < 2) {
      System.err.println("Expected arguments: <inputSchemaFile> <outputDataFile>");
      printUsage(options);
      System.exit(-1);
    }

    File inputSchemaFile = new File(argArray[0]).getCanonicalFile();
    File outputDataFile = new File(argArray[1]).getCanonicalFile();
    if (outputDataFile.exists()) {
      System.err.println("Output file already exists: " + outputDataFile.getCanonicalPath());
      System.exit(-1);
    }

    GenerateRandomData grd = new GenerateRandomData();
    Schema schema = Schema.parse(inputSchemaFile);
    GenericDatumWriter<Object> datum = new GenericDatumWriter<Object>(schema);
    DataFileWriter<Object> out = new DataFileWriter<Object>(datum);
    try {
      // create() is inside the try so the writer is closed even if it fails.
      out.create(schema, outputDataFile);
      for (int i = 0; i < numToEmit; i++) {
        out.append(grd.generateData(schema));
      }
    } finally {
      out.close();
    }
  }
}