/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.howl.rcfile;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import junit.framework.Assert;
import junit.framework.TestCase;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.ql.io.RCFileOutputFormat;
import org.apache.hadoop.hive.serde.Constants;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.howl.common.HowlException;
import org.apache.howl.common.HowlUtil;
import org.apache.howl.data.DefaultHowlRecord;
import org.apache.howl.data.HowlRecord;
import org.apache.howl.data.schema.HowlSchema;
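
/**
 * Tests for {@link RCFileInputDriver}: writes RCFile rows and verifies they are
 * read back and converted to {@link HowlRecord}s correctly for an identity
 * schema, a pruned (projected) schema, and a reordered schema that adds a
 * partition column and a column missing from the data.
 */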
public class TestRCFileInputStorageDriver extends TestCase {

private static Configuration conf = new Configuration();
private static Path file;
private static FileSystem fs;
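
  // One-time setup: use the local filesystem and a scratch RCFile under test.data.dir.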
  static {
    try {
      fs = FileSystem.getLocal(conf);
      Path dir = new Path(System.getProperty("test.data.dir", ".") + "/mapred");
      file = new Path(dir, "test_rcfile");
      fs.delete(dir, true);
    } catch (Exception e) {
      // Do not swallow setup failures silently; surface them when the class loads.
      throw new RuntimeException("Failed to set up local filesystem for tests", e);
    }
  }
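
  /**
   * Writes two rows to an RCFile and checks that the raw bytes read back match
   * what was written, and that convertToHowlRecord() produces the expected typed
   * HowlRecords when the output schema equals the table schema.
   */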
  public void testConvertValueToTuple() throws IOException, InterruptedException {
fs.delete(file, true);
byte[][] record_1 = {"123".getBytes("UTF-8"), "456".getBytes("UTF-8"),
"789".getBytes("UTF-8"), "1000".getBytes("UTF-8"),
"5.3".getBytes("UTF-8"), "howl and hadoop".getBytes("UTF-8"),
new byte[0], "\\N".getBytes("UTF-8")};
byte[][] record_2 = {"100".getBytes("UTF-8"), "200".getBytes("UTF-8"),
"123".getBytes("UTF-8"), "1000".getBytes("UTF-8"),
"5.3".getBytes("UTF-8"), "howl and hadoop".getBytes("UTF-8"),
new byte[0], "\\N".getBytes("UTF-8")};
RCFileOutputFormat.setColumnNumber(conf, 8);
RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null,
new DefaultCodec());
BytesRefArrayWritable bytes = new BytesRefArrayWritable(record_1.length);
for (int i = 0; i < record_1.length; i++) {
BytesRefWritable cu = new BytesRefWritable(record_1[i], 0,
record_1[i].length);
bytes.set(i, cu);
}
writer.append(bytes);
BytesRefArrayWritable bytes2 = new BytesRefArrayWritable(record_2.length);
for (int i = 0; i < record_2.length; i++) {
BytesRefWritable cu = new BytesRefWritable(record_2[i], 0,
record_2[i].length);
bytes2.set(i, cu);
}
writer.append(bytes2);
writer.close();
BytesRefArrayWritable[] bytesArr = new BytesRefArrayWritable[]{bytes,bytes2};
HowlSchema schema = buildHiveSchema();
RCFileInputDriver sd = new RCFileInputDriver();
JobContext jc = new JobContext(conf, new JobID());
sd.setInputPath(jc, file.toString());
InputFormat<?,?> iF = sd.getInputFormat(null);
InputSplit split = iF.getSplits(jc).get(0);
sd.setOriginalSchema(jc, schema);
sd.setOutputSchema(jc, schema);
sd.initialize(jc, getProps());
TaskAttemptContext tac = new TaskAttemptContext(conf, new TaskAttemptID());
RecordReader<?,?> rr = iF.createRecordReader(split,tac);
rr.initialize(split, tac);
HowlRecord[] tuples = getExpectedRecords();
    for (int j = 0; j < 2; j++) {
      Assert.assertTrue(rr.nextKeyValue());
      BytesRefArrayWritable w = (BytesRefArrayWritable) rr.getCurrentValue();
      // The raw bytes read back should match exactly what was written.
      Assert.assertEquals(bytesArr[j], w);
      HowlRecord t = sd.convertToHowlRecord(null, w);
      Assert.assertEquals(8, t.size());
      Assert.assertEquals(tuples[j], t);
    }
    // Only two rows were written.
    assertFalse(rr.nextKeyValue());
  }
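
  /**
   * Same data as above, but the output schema projects only five of the eight
   * columns; verifies that the pruned columns are dropped from the converted
   * HowlRecord while the underlying RCFile row still carries eight fields.
   */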
  public void testPruning() throws IOException, InterruptedException {
fs.delete(file, true);
byte[][] record_1 = {"123".getBytes("UTF-8"), "456".getBytes("UTF-8"),
"789".getBytes("UTF-8"), "1000".getBytes("UTF-8"),
"5.3".getBytes("UTF-8"), "howl and hadoop".getBytes("UTF-8"),
new byte[0], "\\N".getBytes("UTF-8")};
byte[][] record_2 = {"100".getBytes("UTF-8"), "200".getBytes("UTF-8"),
"123".getBytes("UTF-8"), "1000".getBytes("UTF-8"),
"5.3".getBytes("UTF-8"), "howl and hadoop".getBytes("UTF-8"),
new byte[0], "\\N".getBytes("UTF-8")};
RCFileOutputFormat.setColumnNumber(conf, 8);
RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null,
new DefaultCodec());
BytesRefArrayWritable bytes = new BytesRefArrayWritable(record_1.length);
for (int i = 0; i < record_1.length; i++) {
BytesRefWritable cu = new BytesRefWritable(record_1[i], 0,
record_1[i].length);
bytes.set(i, cu);
}
writer.append(bytes);
BytesRefArrayWritable bytes2 = new BytesRefArrayWritable(record_2.length);
for (int i = 0; i < record_2.length; i++) {
BytesRefWritable cu = new BytesRefWritable(record_2[i], 0,
record_2[i].length);
bytes2.set(i, cu);
}
writer.append(bytes2);
writer.close();
BytesRefArrayWritable[] bytesArr = new BytesRefArrayWritable[]{bytes,bytes2};
RCFileInputDriver sd = new RCFileInputDriver();
JobContext jc = new JobContext(conf, new JobID());
sd.setInputPath(jc, file.toString());
InputFormat<?,?> iF = sd.getInputFormat(null);
InputSplit split = iF.getSplits(jc).get(0);
sd.setOriginalSchema(jc, buildHiveSchema());
sd.setOutputSchema(jc, buildPrunedSchema());
sd.initialize(jc, getProps());
    // Copy the projected column ids chosen by the output schema into the conf used
    // by the record reader, so it need not materialize the pruned columns.
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR,
        jc.getConfiguration().get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
TaskAttemptContext tac = new TaskAttemptContext(conf, new TaskAttemptID());
RecordReader<?,?> rr = iF.createRecordReader(split,tac);
rr.initialize(split, tac);
HowlRecord[] tuples = getPrunedRecords();
    for (int j = 0; j < 2; j++) {
      Assert.assertTrue(rr.nextKeyValue());
      BytesRefArrayWritable w = (BytesRefArrayWritable) rr.getCurrentValue();
      // The pruned read does not return the same bytes as the full row,
      // although the row still carries all eight fields.
      Assert.assertFalse(bytesArr[j].equals(w));
      Assert.assertEquals(8, w.size());
      HowlRecord t = sd.convertToHowlRecord(null, w);
      Assert.assertEquals(5, t.size());
      Assert.assertEquals(tuples[j], t);
    }
assertFalse(rr.nextKeyValue());
}
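
  /**
   * Output schema reorders the columns, adds a partition column ("part1") and a
   * new column ("newCol") that is absent from the data; verifies that partition
   * values are injected and the missing column comes back as null.
   */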
  public void testReorderedCols() throws IOException, InterruptedException {
fs.delete(file, true);
byte[][] record_1 = {"123".getBytes("UTF-8"), "456".getBytes("UTF-8"),
"789".getBytes("UTF-8"), "1000".getBytes("UTF-8"),
"5.3".getBytes("UTF-8"), "howl and hadoop".getBytes("UTF-8"),
new byte[0], "\\N".getBytes("UTF-8")};
byte[][] record_2 = {"100".getBytes("UTF-8"), "200".getBytes("UTF-8"),
"123".getBytes("UTF-8"), "1000".getBytes("UTF-8"),
"5.3".getBytes("UTF-8"), "howl and hadoop".getBytes("UTF-8"),
new byte[0], "\\N".getBytes("UTF-8")};
RCFileOutputFormat.setColumnNumber(conf, 8);
RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null,
new DefaultCodec());
BytesRefArrayWritable bytes = new BytesRefArrayWritable(record_1.length);
for (int i = 0; i < record_1.length; i++) {
BytesRefWritable cu = new BytesRefWritable(record_1[i], 0,
record_1[i].length);
bytes.set(i, cu);
}
writer.append(bytes);
BytesRefArrayWritable bytes2 = new BytesRefArrayWritable(record_2.length);
for (int i = 0; i < record_2.length; i++) {
BytesRefWritable cu = new BytesRefWritable(record_2[i], 0,
record_2[i].length);
bytes2.set(i, cu);
}
writer.append(bytes2);
writer.close();
BytesRefArrayWritable[] bytesArr = new BytesRefArrayWritable[]{bytes,bytes2};
RCFileInputDriver sd = new RCFileInputDriver();
JobContext jc = new JobContext(conf, new JobID());
sd.setInputPath(jc, file.toString());
InputFormat<?,?> iF = sd.getInputFormat(null);
InputSplit split = iF.getSplits(jc).get(0);
sd.setOriginalSchema(jc, buildHiveSchema());
sd.setOutputSchema(jc, buildReorderedSchema());
sd.initialize(jc, getProps());
    // Supply the value for the partition column referenced by the reordered schema.
    Map<String, String> map = new HashMap<String, String>(1);
    map.put("part1", "first-part");
    sd.setPartitionValues(jc, map);
    // As in testPruning, propagate the projected column ids to the reader's conf.
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR,
        jc.getConfiguration().get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
TaskAttemptContext tac = new TaskAttemptContext(conf, new TaskAttemptID());
RecordReader<?,?> rr = iF.createRecordReader(split,tac);
rr.initialize(split, tac);
HowlRecord[] tuples = getReorderedCols();
    for (int j = 0; j < 2; j++) {
      Assert.assertTrue(rr.nextKeyValue());
      BytesRefArrayWritable w = (BytesRefArrayWritable) rr.getCurrentValue();
      Assert.assertFalse(bytesArr[j].equals(w));
      Assert.assertEquals(8, w.size());
      HowlRecord t = sd.convertToHowlRecord(null, w);
      Assert.assertEquals(7, t.size());
      Assert.assertEquals(tuples[j], t);
    }
assertFalse(rr.nextKeyValue());
}
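
  // Expected typed records for the full eight-column schema; the empty byte array
  // and the "\N" marker both map to null.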
  private HowlRecord[] getExpectedRecords() {
    List<Object> rec_1 = new ArrayList<Object>(8);
    rec_1.add(Byte.valueOf("123"));
    rec_1.add(Short.valueOf("456"));
    rec_1.add(Integer.valueOf(789));
    rec_1.add(Long.valueOf(1000L));
    rec_1.add(Double.valueOf(5.3D));
    rec_1.add("howl and hadoop");
    rec_1.add(null);
    rec_1.add(null);
    HowlRecord tup_1 = new DefaultHowlRecord(rec_1);

    List<Object> rec_2 = new ArrayList<Object>(8);
    rec_2.add(Byte.valueOf("100"));
    rec_2.add(Short.valueOf("200"));
    rec_2.add(Integer.valueOf(123));
    rec_2.add(Long.valueOf(1000L));
    rec_2.add(Double.valueOf(5.3D));
    rec_2.add("howl and hadoop");
    rec_2.add(null);
    rec_2.add(null);
    HowlRecord tup_2 = new DefaultHowlRecord(rec_2);
    return new HowlRecord[]{tup_1, tup_2};
  }
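
  // Expected records when only {atinyint, aint, adouble, astring, anullint} are projected.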
  private HowlRecord[] getPrunedRecords() {
    List<Object> rec_1 = new ArrayList<Object>(5);
    rec_1.add(Byte.valueOf("123"));
    rec_1.add(Integer.valueOf(789));
    rec_1.add(Double.valueOf(5.3D));
    rec_1.add("howl and hadoop");
    rec_1.add(null);
    HowlRecord tup_1 = new DefaultHowlRecord(rec_1);

    List<Object> rec_2 = new ArrayList<Object>(5);
    rec_2.add(Byte.valueOf("100"));
    rec_2.add(Integer.valueOf(123));
    rec_2.add(Double.valueOf(5.3D));
    rec_2.add("howl and hadoop");
    rec_2.add(null);
    HowlRecord tup_2 = new DefaultHowlRecord(rec_2);
    return new HowlRecord[]{tup_1, tup_2};
  }
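
  // Full table schema: eight columns matching the byte arrays written in the tests.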
private HowlSchema buildHiveSchema() throws HowlException{
List<FieldSchema> fields = new ArrayList<FieldSchema>(8);
fields.add(new FieldSchema("atinyint", "tinyint", ""));
fields.add(new FieldSchema("asmallint", "smallint", ""));
fields.add(new FieldSchema("aint", "int", ""));
fields.add(new FieldSchema("along", "bigint", ""));
fields.add(new FieldSchema("adouble", "double", ""));
fields.add(new FieldSchema("astring", "string", ""));
fields.add(new FieldSchema("anullint", "int", ""));
fields.add(new FieldSchema("anullstring", "string", ""));
return new HowlSchema(HowlUtil.getHowlFieldSchemaList(fields));
}
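
  // Pruned schema: a five-column projection of the full schema.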
private HowlSchema buildPrunedSchema() throws HowlException{
List<FieldSchema> fields = new ArrayList<FieldSchema>(5);
fields.add(new FieldSchema("atinyint", "tinyint", ""));
fields.add(new FieldSchema("aint", "int", ""));
fields.add(new FieldSchema("adouble", "double", ""));
fields.add(new FieldSchema("astring", "string", ""));
fields.add(new FieldSchema("anullint", "int", ""));
return new HowlSchema(HowlUtil.getHowlFieldSchemaList(fields));
}
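
  // Reordered schema: shuffles the column order and adds a partition column
  // ("part1") plus a column ("newCol") that does not exist in the data.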
private HowlSchema buildReorderedSchema() throws HowlException{
List<FieldSchema> fields = new ArrayList<FieldSchema>(7);
fields.add(new FieldSchema("aint", "int", ""));
fields.add(new FieldSchema("part1", "string", ""));
fields.add(new FieldSchema("adouble", "double", ""));
fields.add(new FieldSchema("newCol", "tinyint", ""));
fields.add(new FieldSchema("astring", "string", ""));
fields.add(new FieldSchema("atinyint", "tinyint", ""));
fields.add(new FieldSchema("anullint", "int", ""));
return new HowlSchema(HowlUtil.getHowlFieldSchemaList(fields));
}
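
  // Expected records for the reordered schema: the partition value is injected
  // for "part1" and the non-existent "newCol" comes back as null.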
  private HowlRecord[] getReorderedCols() {
    List<Object> rec_1 = new ArrayList<Object>(7);
    rec_1.add(Integer.valueOf(789));
    rec_1.add("first-part");
    rec_1.add(Double.valueOf(5.3D));
    rec_1.add(null); // newCol is not present in the data
    rec_1.add("howl and hadoop");
    rec_1.add(Byte.valueOf("123"));
    rec_1.add(null);
    HowlRecord tup_1 = new DefaultHowlRecord(rec_1);

    List<Object> rec_2 = new ArrayList<Object>(7);
    rec_2.add(Integer.valueOf(123));
    rec_2.add("first-part");
    rec_2.add(Double.valueOf(5.3D));
    rec_2.add(null); // newCol is not present in the data
    rec_2.add("howl and hadoop");
    rec_2.add(Byte.valueOf("100"));
    rec_2.add(null);
    HowlRecord tup_2 = new DefaultHowlRecord(rec_2);
    return new HowlRecord[]{tup_1, tup_2};
  }
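
  // SerDe properties used to initialize the driver: "\N" is the null marker and
  // serialization.format "9" selects the tab (ASCII 9) field delimiter.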
private Properties getProps(){
Properties props = new Properties();
props.setProperty(Constants.SERIALIZATION_NULL_FORMAT, "\\N");
props.setProperty(Constants.SERIALIZATION_FORMAT, "9");
return props;
}
}