/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.apache.avro.hadoop.file;
import static org.junit.Assert.*;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.avro.AvroRuntimeException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.reflect.ReflectData;
import org.apache.avro.specific.SpecificData;
import org.apache.avro.hadoop.io.AvroKeyValue;
import org.apache.avro.mapred.FsInput;
import org.apache.avro.io.DatumReader;
import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.FileReader;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class TestSortedKeyValueFile {
private static final Logger LOG = LoggerFactory.getLogger(TestSortedKeyValueFile.class);
@Rule
public TemporaryFolder mTempDir = new TemporaryFolder();
/**
 * Appends "banana" followed by "apple" to a string-keyed SortedKeyValueFile and
 * expects an {@link IllegalArgumentException} to escape the test.
 *
 * A single {@link Utf8} instance is reused for both keys; per the original author's
 * note this is meant to check that the writer copies the key it was given.
 *
 * @throws IOException if the writer cannot be created or closed.
 */
@Test(expected=IllegalArgumentException.class)
public void testWriteOutOfSortedOrder() throws IOException {
  LOG.debug("Writing some records to a SortedKeyValueFile...");

  final Path target = new Path(mTempDir.getRoot().getPath(), "myfile");
  final Schema keySchema = Schema.create(Schema.Type.STRING);
  final Schema valueSchema = Schema.create(Schema.Type.STRING);

  SortedKeyValueFile.Writer.Options writerOptions = new SortedKeyValueFile.Writer.Options()
      .withKeySchema(keySchema)
      .withValueSchema(valueSchema)
      .withConfiguration(new Configuration())
      .withPath(target)
      .withIndexInterval(2);  // Index every other record.

  SortedKeyValueFile.Writer<CharSequence, CharSequence> sortedWriter =
      new SortedKeyValueFile.Writer<CharSequence, CharSequence>(writerOptions);

  // The same mutable key object is handed to the writer twice.
  Utf8 reusedKey = new Utf8();
  try {
    sortedWriter.append(reusedKey.set("banana"), "Banana");
    // "apple" sorts before "banana": this is the out-of-order append.
    sortedWriter.append(reusedKey.set("apple"), "Apple");
  } finally {
    // Always release the writer, even though the append above is expected to throw.
    sortedWriter.close();
  }
}
/**
 * For each codec name in {"null", "deflate", "snappy", "bzip2"}, writes an empty
 * SortedKeyValueFile with that codec selected by name and checks that the data
 * file's "avro.codec" metadata entry equals the requested name.
 *
 * The same output path and the same Options instance are reused for every codec;
 * only the codec setting is changed between iterations.
 *
 * @throws IOException if writing or reading any of the files fails.
 */
@Test
public void testNamedCodecs() throws IOException {
  Configuration conf = new Configuration();
  Path myfile = new Path(mTempDir.getRoot().getPath(), "myfile");
  Schema key = Schema.create(Schema.Type.STRING);
  Schema value = Schema.create(Schema.Type.STRING);
  Schema recordSchema = AvroKeyValue.getSchema(key, value);
  DatumReader<GenericRecord> datumReader = SpecificData.get().createDatumReader(recordSchema);

  SortedKeyValueFile.Writer.Options options = new SortedKeyValueFile.Writer.Options()
      .withKeySchema(key)
      .withValueSchema(value)
      .withConfiguration(conf)
      .withPath(myfile);

  for (String codec : new String[]{"null", "deflate", "snappy", "bzip2"}) {
    LOG.debug("Using " + codec + " codec for a SortedKeyValueFile...");
    options.withCodec(codec);

    // No records are appended: only the file header metadata is inspected below.
    SortedKeyValueFile.Writer<CharSequence, CharSequence> writer =
        new SortedKeyValueFile.Writer<CharSequence, CharSequence>(options);
    writer.close();

    DataFileReader<GenericRecord> reader = new DataFileReader<GenericRecord>(
        new FsInput(new Path(myfile, SortedKeyValueFile.DATA_FILENAME), conf),
        datumReader);
    try {
      assertEquals(codec, reader.getMetaString("avro.codec"));
    } finally {
      // Close even when the assertion fails so the file handle is not leaked.
      reader.close();
    }
  }
}
/**
 * Writes an empty SortedKeyValueFile whose codec is supplied as a
 * {@link CodecFactory} instance ({@code CodecFactory.deflateCodec(9)}) rather than
 * by name, and checks that the data file's "avro.codec" metadata entry is "deflate".
 *
 * @throws IOException if writing or reading the file fails.
 */
@Test
public void testDeflateClassCodec() throws IOException {
  Configuration conf = new Configuration();
  Path myfile = new Path(mTempDir.getRoot().getPath(), "myfile");
  Schema key = Schema.create(Schema.Type.STRING);
  Schema value = Schema.create(Schema.Type.STRING);
  Schema recordSchema = AvroKeyValue.getSchema(key, value);
  DatumReader<GenericRecord> datumReader = SpecificData.get().createDatumReader(recordSchema);

  LOG.debug("Using CodecFactory.deflateCodec() for a SortedKeyValueFile...");
  SortedKeyValueFile.Writer.Options options = new SortedKeyValueFile.Writer.Options()
      .withKeySchema(key)
      .withValueSchema(value)
      .withConfiguration(conf)
      .withPath(myfile)
      .withCodec(CodecFactory.deflateCodec(9));

  // No records are appended: only the file header metadata is inspected below.
  SortedKeyValueFile.Writer<CharSequence, CharSequence> writer =
      new SortedKeyValueFile.Writer<CharSequence, CharSequence>(options);
  writer.close();

  DataFileReader<GenericRecord> reader = new DataFileReader<GenericRecord>(
      new FsInput(new Path(myfile, SortedKeyValueFile.DATA_FILENAME), conf),
      datumReader);
  try {
    assertEquals("deflate", reader.getMetaString("avro.codec"));
  } finally {
    // Close even when the assertion fails so the file handle is not leaked.
    reader.close();
  }
}
/**
 * Selecting a codec by an unknown name ("foobar") must raise an
 * {@link AvroRuntimeException} whose message is "Unrecognized codec: foobar".
 *
 * The test fails explicitly when no exception is thrown; without that, a
 * regression that silently accepted the bad name would go unnoticed.
 *
 * @throws IOException declared for signature compatibility with the other tests.
 */
@Test
public void testBadCodec() throws IOException {
  LOG.debug("Using a bad codec for a SortedKeyValueFile...");
  try {
    new SortedKeyValueFile.Writer.Options().withCodec("foobar");
    // fail() throws AssertionError, which the catch clause below does not intercept.
    fail("Expected an AvroRuntimeException for the unrecognized codec name \"foobar\"");
  } catch (AvroRuntimeException e) {
    assertEquals("Unrecognized codec: foobar", e.getMessage());
  }
}
/**
 * Writes four sorted string records with an index interval of 2, then inspects the
 * files the writer produced:
 * the output directory must exist, the index file must hold exactly two entries
 * (keys "apple" and "carrot"), and seeking the data file to each indexed position
 * must yield the matching record ("durian" must directly follow "carrot").
 *
 * @throws IOException if writing or reading any of the files fails.
 */
@Test
public void testWriter() throws IOException {
  LOG.debug("Writing some records to a SortedKeyValueFile...");

  final String rootPath = mTempDir.getRoot().getPath();
  SortedKeyValueFile.Writer.Options writerOptions = new SortedKeyValueFile.Writer.Options()
      .withKeySchema(Schema.create(Schema.Type.STRING))
      .withValueSchema(Schema.create(Schema.Type.STRING))
      .withConfiguration(new Configuration())
      .withPath(new Path(rootPath, "myfile"))
      .withIndexInterval(2);  // Index every other record.

  SortedKeyValueFile.Writer<CharSequence, CharSequence> sortedWriter =
      new SortedKeyValueFile.Writer<CharSequence, CharSequence>(writerOptions);
  try {
    sortedWriter.append("apple", "Apple");    // Will be indexed.
    sortedWriter.append("banana", "Banana");
    sortedWriter.append("carrot", "Carrot");  // Will be indexed.
    sortedWriter.append("durian", "Durian");
  } finally {
    sortedWriter.close();
  }

  LOG.debug("Checking the generated directory...");
  final File outputDir = new File(rootPath, "myfile");
  assertTrue(outputDir.exists());

  LOG.debug("Checking the generated index file...");
  // Index entries pair a key with a long value (used below as a seek position).
  final Schema indexSchema =
      AvroKeyValue.getSchema(writerOptions.getKeySchema(), Schema.create(Schema.Type.LONG));
  FileReader<GenericRecord> indexIn = DataFileReader.openReader(
      new File(outputDir, SortedKeyValueFile.INDEX_FILENAME),
      new GenericDatumReader<GenericRecord>(indexSchema));

  List<AvroKeyValue<CharSequence, Long>> indexEntries =
      new ArrayList<AvroKeyValue<CharSequence, Long>>();
  try {
    for (GenericRecord rawEntry : indexIn) {
      indexEntries.add(new AvroKeyValue<CharSequence, Long>(rawEntry));
    }
  } finally {
    indexIn.close();
  }

  // Check the size first so the element lookups below cannot go out of bounds.
  assertEquals(2, indexEntries.size());
  final AvroKeyValue<CharSequence, Long> appleEntry = indexEntries.get(0);
  final AvroKeyValue<CharSequence, Long> carrotEntry = indexEntries.get(1);
  assertEquals("apple", appleEntry.getKey().toString());
  LOG.debug("apple's position in the file: " + appleEntry.getValue());
  assertEquals("carrot", carrotEntry.getKey().toString());
  LOG.debug("carrot's position in the file: " + carrotEntry.getValue());

  LOG.debug("Checking the generated data file...");
  final Schema dataSchema =
      AvroKeyValue.getSchema(writerOptions.getKeySchema(), writerOptions.getValueSchema());
  DataFileReader<GenericRecord> dataIn = new DataFileReader<GenericRecord>(
      new File(outputDir, SortedKeyValueFile.DATA_FILENAME),
      new GenericDatumReader<GenericRecord>(dataSchema));
  try {
    dataIn.seek(appleEntry.getValue());
    assertNextKeyValue(dataIn, "apple", "Apple");

    dataIn.seek(carrotEntry.getValue());
    assertNextKeyValue(dataIn, "carrot", "Carrot");

    // No seek here: "durian" is read from wherever the previous read left off.
    assertNextKeyValue(dataIn, "durian", "Durian");
  } finally {
    dataIn.close();
  }
}

/**
 * Asserts that the reader has another record and that its key and value, rendered
 * with {@code toString()}, equal the expected strings.
 */
private static void assertNextKeyValue(DataFileReader<GenericRecord> in,
    String expectedKey, String expectedValue) {
  assertTrue(in.hasNext());
  AvroKeyValue<CharSequence, CharSequence> entry =
      new AvroKeyValue<CharSequence, CharSequence>(in.next());
  assertEquals(expectedKey, entry.getKey().toString());
  assertEquals(expectedValue, entry.getValue().toString());
}
/**
 * Writes four sorted string records, then opens the same path with
 * {@code SortedKeyValueFile.Reader} and checks point lookups: two present keys
 * return their values and three absent keys return null.
 *
 * @throws IOException if writing or reading the file fails.
 */
@Test
public void testReader() throws IOException {
  final Configuration conf = new Configuration();
  final Path location = new Path(mTempDir.getRoot().getPath(), "myfile");

  SortedKeyValueFile.Writer.Options writerOptions = new SortedKeyValueFile.Writer.Options()
      .withKeySchema(Schema.create(Schema.Type.STRING))
      .withValueSchema(Schema.create(Schema.Type.STRING))
      .withConfiguration(conf)
      .withPath(location)
      .withIndexInterval(2);  // Index every other record.

  // Key/value pairs in ascending key order; with interval 2 the 1st and 3rd are indexed.
  final String[][] produce = {
      {"apple", "Apple"},
      {"banana", "Banana"},
      {"carrot", "Carrot"},
      {"durian", "Durian"},
  };

  SortedKeyValueFile.Writer<CharSequence, CharSequence> sortedWriter =
      new SortedKeyValueFile.Writer<CharSequence, CharSequence>(writerOptions);
  try {
    for (String[] pair : produce) {
      sortedWriter.append(pair[0], pair[1]);
    }
  } finally {
    sortedWriter.close();
  }

  LOG.debug("Reading the file back using a reader...");
  SortedKeyValueFile.Reader.Options readerOptions = new SortedKeyValueFile.Reader.Options()
      .withKeySchema(Schema.create(Schema.Type.STRING))
      .withValueSchema(Schema.create(Schema.Type.STRING))
      .withConfiguration(conf)
      .withPath(location);

  SortedKeyValueFile.Reader<CharSequence, CharSequence> sortedReader =
      new SortedKeyValueFile.Reader<CharSequence, CharSequence>(readerOptions);
  try {
    assertEquals("Carrot", sortedReader.get("carrot").toString());
    assertEquals("Banana", sortedReader.get("banana").toString());

    // Keys that were never written: lookups must come back null.
    for (String absentKey : new String[]{"a-vegetable", "beet", "zzz"}) {
      assertNull(sortedReader.get(absentKey));
    }
  } finally {
    sortedReader.close();
  }
}
/**
 * Small mutable wrapper around a single String, used as both key and value type
 * in the reflect-model test of this class.
 *
 * Equality, hashing and ordering are all defined by the wrapped string. A null
 * wrapped string (the state left by the no-argument constructor) is supported:
 * two such instances are equal, hash to 0, and order before any non-null value.
 */
public static class Stringy implements Comparable<Stringy> {
  private String s;

  /** Creates an instance whose wrapped string is null. */
  public Stringy() {}

  /** Creates an instance wrapping the given string. */
  public Stringy(String s) { this.s = s; }

  /** Returns the wrapped string itself (null when none was set). */
  @Override public String toString() { return s; }

  @Override public int hashCode() { return s == null ? 0 : s.hashCode(); }

  /**
   * Two instances are equal when their wrapped strings are equal. Null and
   * objects of any other type are never equal to a Stringy, which keeps the
   * relation symmetric.
   */
  @Override public boolean equals(Object that) {
    if (this == that) {
      return true;
    }
    if (!(that instanceof Stringy)) {
      return false;
    }
    String other = ((Stringy) that).s;
    return s == null ? other == null : s.equals(other);
  }

  /**
   * Orders by the wrapped string, with a null wrapped string sorting first.
   *
   * @throws NullPointerException if {@code that} is null.
   */
  @Override public int compareTo(Stringy that) {
    if (s == null) {
      return that.s == null ? 0 : -1;
    }
    if (that.s == null) {
      return 1;
    }
    return s.compareTo(that.s);
  }
}
/**
 * Round-trips {@link Stringy} keys and values through a SortedKeyValueFile using
 * {@link ReflectData} as the data model for both the writer and the reader:
 * two present keys must return values equal to the ones written, and three
 * absent keys must return null.
 *
 * @throws Exception if writing or reading the file fails.
 */
@Test public void testAlternateModel() throws Exception {
  LOG.debug("Writing some reflect records...");

  final ReflectData model = ReflectData.get();
  final Schema stringySchema = model.getSchema(Stringy.class);
  final Configuration conf = new Configuration();
  final Path location = new Path(mTempDir.getRoot().getPath(), "reflect");

  SortedKeyValueFile.Writer.Options writerOptions = new SortedKeyValueFile.Writer.Options()
      .withKeySchema(stringySchema)
      .withValueSchema(stringySchema)
      .withConfiguration(conf)
      .withPath(location)
      .withDataModel(model)
      .withIndexInterval(2);

  // Key/value pairs in ascending key order.
  final String[][] produce = {
      {"apple", "Apple"},
      {"banana", "Banana"},
      {"carrot", "Carrot"},
      {"durian", "Durian"},
  };

  SortedKeyValueFile.Writer<Stringy,Stringy> sortedWriter =
      new SortedKeyValueFile.Writer<Stringy,Stringy>(writerOptions);
  try {
    for (String[] pair : produce) {
      sortedWriter.append(new Stringy(pair[0]), new Stringy(pair[1]));
    }
  } finally {
    sortedWriter.close();
  }

  LOG.debug("Reading the file back using a reader...");
  SortedKeyValueFile.Reader.Options readerOptions = new SortedKeyValueFile.Reader.Options()
      .withKeySchema(stringySchema)
      .withValueSchema(stringySchema)
      .withConfiguration(conf)
      .withPath(location)
      .withDataModel(model);

  SortedKeyValueFile.Reader<Stringy,Stringy> sortedReader =
      new SortedKeyValueFile.Reader<Stringy,Stringy>(readerOptions);
  try {
    assertEquals(new Stringy("Carrot"), sortedReader.get(new Stringy("carrot")));
    assertEquals(new Stringy("Banana"), sortedReader.get(new Stringy("banana")));

    // Keys that were never written: lookups must come back null.
    for (String absentKey : new String[]{"a-vegetable", "beet", "zzz"}) {
      assertNull(sortedReader.get(new Stringy(absentKey)));
    }
  } finally {
    sortedReader.close();
  }
}
}