Package com.asakusafw.directio.hive.parquet

Source Code of com.asakusafw.directio.hive.parquet.ParquetFileFormatTest

/**
* Copyright 2011-2014 Asakusa Framework Team.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.asakusafw.directio.hive.parquet;

import static org.hamcrest.Matchers.*;
import static org.junit.Assert.*;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.junit.Assume;
import org.junit.Ignore;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

import parquet.column.ParquetProperties.WriterVersion;

import com.asakusafw.directio.hive.parquet.mock.MockSimpleWithLong;
import com.asakusafw.directio.hive.parquet.mock.WithDateTime;
import com.asakusafw.directio.hive.parquet.mock.WithFour;
import com.asakusafw.directio.hive.parquet.mock.WithStringSupports;
import com.asakusafw.directio.hive.serde.DataModelDescriptorEditor;
import com.asakusafw.directio.hive.serde.DataModelMapping.ExceptionHandlingStrategy;
import com.asakusafw.directio.hive.serde.DataModelMapping.FieldMappingStrategy;
import com.asakusafw.directio.hive.serde.FieldPropertyDescriptor;
import com.asakusafw.directio.hive.serde.StringValueSerdeFactory;
import com.asakusafw.directio.hive.serde.ValueSerde;
import com.asakusafw.directio.hive.serde.mock.MockSimple;
import com.asakusafw.directio.hive.serde.mock.MockTypes;
import com.asakusafw.runtime.directio.Counter;
import com.asakusafw.runtime.directio.DirectInputFragment;
import com.asakusafw.runtime.directio.hadoop.StripedDataFormat;
import com.asakusafw.runtime.io.ModelInput;
import com.asakusafw.runtime.io.ModelOutput;
import com.asakusafw.runtime.value.Date;
import com.asakusafw.runtime.value.DateTime;
import com.asakusafw.runtime.value.IntOption;
import com.asakusafw.runtime.value.LongOption;
import com.asakusafw.runtime.value.StringOption;

/**
* Test for {@link ParquetFileFormat}.
*/
public class ParquetFileFormatTest {

    /**
     * A temporary folder for testing.
     */
    @Rule
    public final TemporaryFolder folder = new TemporaryFolder();

    private <T> ParquetFileFormat<T> format(Class<T> type, String... removes) {
        return format(type, Collections.<String, ValueSerde>emptyMap(), removes);
    }

    private <T> ParquetFileFormat<T> format(
            Class<T> type,
            Map<String, ? extends ValueSerde> edits,
            String... removes) {
        ParquetFileFormat<T> format = new ParquetFileFormat<T>(
                "testing",
                new ParquetFormatConfiguration(),
                new DataModelDescriptorEditor(FieldPropertyDescriptor.extract(type))
                    .editAll(edits)
                    .removeAll(Arrays.asList(removes))
                    .build());
        format.setConf(new org.apache.hadoop.conf.Configuration());
        return format;
    }

    /**
     * Test method for {@link AbstractParquetFileFormat#getFormatName()}.
     */
    @Test
    public void format_name() {
        assertThat(format(MockSimple.class).getFormatName(), equalTo("PARQUET"));
    }

    /**
     * Test method for {@link AbstractParquetFileFormat#getSupportedType()}.
     */
    @Test
    public void supported_type() {
        assertThat(format(MockSimple.class).getSupportedType(), equalTo((Object) MockSimple.class));
    }

    /**
     * {@code tblproperties} for default settings.
     */
    @Test
    public void table_properties_default() {
        Map<String, String> props = format(MockSimple.class).getTableProperties();
        assertThat(props.size(), is(0));
    }

    /**
     * simple I/O.
     * @throws Exception if failed
     */
    @Test
    public void io_simple() throws Exception {
        ParquetFileFormat<MockSimple> format = format(MockSimple.class);
        MockSimple in = new MockSimple(100, "Hello, world!");
        MockSimple out = restore(format, in);
        assertThat(out.number, is(new IntOption(100)));
        assertThat(out.string, is(new StringOption("Hello, world!")));
    }

    /**
     * I/O with all supported types.
     * @throws Exception if failed
     */
    @SuppressWarnings("deprecation")
    @Test
    public void io_types() throws Exception {
        Map<String, ValueSerde> edits = new HashMap<String, ValueSerde>();
        edits.put("decimalOption", StringValueSerdeFactory.DECIMAL);
        edits.put("dateOption", StringValueSerdeFactory.DATE);
        edits.put("dateTimeOption", StringValueSerdeFactory.DATETIME);

        ParquetFileFormat<MockTypes> format = format(MockTypes.class, edits);
        MockTypes in = new MockTypes();
        in.booleanOption.modify(true);
        in.byteOption.modify((byte) 1);
        in.shortOption.modify((short) 2);
        in.intOption.modify(3);
        in.longOption.modify(4L);
        in.floatOption.modify(5f);
        in.doubleOption.modify(6d);
        in.dateOption.modify(new Date(2014, 6, 1));
        in.dateTimeOption.modify(new DateTime(2014, 6, 1, 2, 3, 4));
        in.stringOption.modify("Hello, world!");
        in.decimalOption.modify(new BigDecimal("7.8"));

        MockTypes out = restore(format, in);
        assertThat(out.booleanOption, equalTo(in.booleanOption));
        assertThat(out.byteOption, equalTo(in.byteOption));
        assertThat(out.shortOption, equalTo(in.shortOption));
        assertThat(out.intOption, equalTo(in.intOption));
        assertThat(out.longOption, equalTo(in.longOption));
        assertThat(out.floatOption, equalTo(in.floatOption));
        assertThat(out.doubleOption, equalTo(in.doubleOption));
        assertThat(out.dateOption, equalTo(in.dateOption));
        assertThat(out.dateTimeOption, equalTo(in.dateTimeOption));
        assertThat(out.stringOption, equalTo(in.stringOption));
        assertThat(out.decimalOption, equalTo(in.decimalOption));
    }

    /**
     * I/O with all supported types.
     * @throws Exception if failed
     */
    @SuppressWarnings("deprecation")
    @Test
    public void io_types_large() throws Exception {
        Map<String, ValueSerde> edits = new HashMap<String, ValueSerde>();
        edits.put("decimalOption", StringValueSerdeFactory.DECIMAL);
        edits.put("dateOption", StringValueSerdeFactory.DATE);
        edits.put("dateTimeOption", StringValueSerdeFactory.DATETIME);

        int count = 1000;
        ParquetFileFormat<MockTypes> format = format(MockTypes.class, edits);
        List<MockTypes> inputs = new ArrayList<MockTypes>();
        for (int i = 0; i < count; i++) {
            MockTypes in = new MockTypes();
            in.booleanOption.modify(true);
            in.byteOption.modify((byte) 1);
            in.shortOption.modify((short) 2);
            in.intOption.modify(3);
            in.longOption.modify(4L);
            in.floatOption.modify(5f);
            in.doubleOption.modify(6d);
            in.dateOption.modify(new Date(2014, 6, 1));
            in.dateTimeOption.modify(new DateTime(2014, 6, 1, 2, 3, 4));
            in.stringOption.modify("Hello, world!");
            in.decimalOption.modify(new BigDecimal("7.8"));
            inputs.add(in);
        }

        List<MockTypes> outputs = restore(format, inputs);
        MockTypes sample = inputs.get(0);
        for (MockTypes out : outputs) {
            assertThat(out.booleanOption, equalTo(sample.booleanOption));
            assertThat(out.byteOption, equalTo(sample.byteOption));
            assertThat(out.shortOption, equalTo(sample.shortOption));
            assertThat(out.intOption, equalTo(sample.intOption));
            assertThat(out.longOption, equalTo(sample.longOption));
            assertThat(out.floatOption, equalTo(sample.floatOption));
            assertThat(out.doubleOption, equalTo(sample.doubleOption));
            assertThat(out.dateOption, equalTo(sample.dateOption));
            assertThat(out.dateTimeOption, equalTo(sample.dateTimeOption));
            assertThat(out.stringOption, equalTo(sample.stringOption));
            assertThat(out.decimalOption, equalTo(sample.decimalOption));
        }
    }

    /**
     * I/O with all supported types with {@code null}s.
     * @throws Exception if failed
     */
    @Test
    public void io_nulls() throws Exception {
        Map<String, ValueSerde> edits = new HashMap<String, ValueSerde>();
        edits.put("decimalOption", StringValueSerdeFactory.DECIMAL);
        edits.put("dateOption", StringValueSerdeFactory.DATE);
        edits.put("dateTimeOption", StringValueSerdeFactory.DATETIME);

        ParquetFileFormat<MockTypes> format = format(MockTypes.class, edits);
        MockTypes in = new MockTypes();
        MockTypes out = restore(format, in);
        assertThat(out.booleanOption, equalTo(in.booleanOption));
        assertThat(out.byteOption, equalTo(in.byteOption));
        assertThat(out.shortOption, equalTo(in.shortOption));
        assertThat(out.intOption, equalTo(in.intOption));
        assertThat(out.longOption, equalTo(in.longOption));
        assertThat(out.floatOption, equalTo(in.floatOption));
        assertThat(out.doubleOption, equalTo(in.doubleOption));
        assertThat(out.dateOption, equalTo(in.dateOption));
        assertThat(out.dateTimeOption, equalTo(in.dateTimeOption));
        assertThat(out.stringOption, equalTo(in.stringOption));
        assertThat(out.decimalOption, equalTo(in.decimalOption));
    }

    /**
     * I/O with fragment.
     * @throws Exception if failed
     */
    @Test
    public void io_fragment() throws Exception {
        File file = folder.newFile();
        Assume.assumeThat(file.delete() || file.exists() == false, is(true));

        ParquetFileFormat<MockSimple> format = format(MockSimple.class);
        LocalFileSystem fs = FileSystem.getLocal(format.getConf());
        ModelOutput<MockSimple> output = format.createOutput(
                MockSimple.class,
                fs, new Path(file.toURI()),
                new Counter());
        try {
            output.write(new MockSimple(100, "Hello, world!"));
        } finally {
            output.close();
        }
        assertThat(file.exists(), is(true));

        FileStatus stat = fs.getFileStatus(new Path(file.toURI()));
        List<DirectInputFragment> fragments = format.computeInputFragments(new StripedDataFormat.InputContext(
                MockSimple.class,
                Arrays.asList(stat), fs,
                -1L, -1L,
                false, false));

        assertThat(fragments, hasSize(1));
        DirectInputFragment first = fragments.get(0);

        ModelInput<MockSimple> input = format.createInput(
                MockSimple.class,
                fs, new Path(first.getPath()),
                first.getOffset(), first.getSize(),
                new Counter());
        try {
            MockSimple buf = new MockSimple();
            assertThat(input.readTo(buf), is(true));
            assertThat(buf.number, is(new IntOption(100)));
            assertThat(buf.string, is(new StringOption("Hello, world!")));

            assertThat(input.readTo(buf), is(false));
        } finally {
            input.close();
        }
    }

    /**
     * I/O with {@code v2}.
     * @throws Exception if failed
     */
    @Test
    public void io_v_2() throws Exception {
        ParquetFileFormat<MockSimple> format = format(MockSimple.class);
        format.getFormatConfiguration().withWriterVersion(WriterVersion.PARQUET_2_0);
        MockSimple in = new MockSimple(100, "Hello, world!");
        MockSimple out = restore(format, in);
        assertThat(out.number, is(new IntOption(100)));
        assertThat(out.string, is(new StringOption("Hello, world!")));
    }

    /**
     * Field mapping by its name.
     * @throws Exception if failed
     */
    @SuppressWarnings("deprecation")
    @Test
    public void mapping_by_position() throws Exception {
        ParquetFileFormat<WithFour> f1 = format(WithFour.class, "col1", "col3");
        ParquetFileFormat<WithFour> f2 = format(WithFour.class, "col2", "col3");
        f2.getFormatConfiguration().withFieldMappingStrategy(FieldMappingStrategy.POSITION);

        WithFour in = new WithFour();
        in.col0.modify(0);
        in.col1.modify(1);
        in.col2.modify(2);
        in.col3.modify(3);

        File file = save(f1, Arrays.asList(in));
        List<WithFour> results = load(f2, file);
        assertThat(results, hasSize(1));

        WithFour out = results.get(0);
        assertThat(out.col0, is(new IntOption(0)));
        assertThat(out.col1, is(new IntOption(2)));
        assertThat(out.col2, is(new IntOption()));
        assertThat(out.col3, is(new IntOption()));
    }

    /**
     * Field mapping by its name.
     * @throws Exception if failed
     */
    @SuppressWarnings("deprecation")
    @Test
    public void mapping_by_name() throws Exception {
        ParquetFileFormat<WithFour> f1 = format(WithFour.class, "col1", "col3");
        ParquetFileFormat<WithFour> f2 = format(WithFour.class, "col2", "col3");
        f2.getFormatConfiguration().withFieldMappingStrategy(FieldMappingStrategy.NAME);

        WithFour in = new WithFour();
        in.col0.modify(0);
        in.col1.modify(1);
        in.col2.modify(2);
        in.col3.modify(3);

        File file = save(f1, Arrays.asList(in));
        List<WithFour> results = load(f2, file);
        assertThat(results, hasSize(1));

        WithFour out = results.get(0);
        assertThat(out.col0, is(new IntOption(0)));
        assertThat(out.col1, is(new IntOption()));
        assertThat(out.col2, is(new IntOption()));
        assertThat(out.col3, is(new IntOption()));
    }

    /**
     * fail on missing source.
     * @throws Exception if failed
     */
    @Test
    public void fail_on_missing_source() throws Exception {
        ParquetFileFormat<WithFour> f1 = format(WithFour.class, "col3");
        ParquetFileFormat<WithFour> f2 = format(WithFour.class);
        f2.getFormatConfiguration()
            .withFieldMappingStrategy(FieldMappingStrategy.NAME)
            .withOnMissingSource(ExceptionHandlingStrategy.FAIL);

        WithFour in = new WithFour();
        File file = save(f1, Arrays.asList(in));
        try {
            load(f2, file);
            fail();
        } catch (IllegalArgumentException e) {
            // ok.
        }
    }

    /**
     * fail on missing target.
     * @throws Exception if failed
     */
    @Test
    public void fail_on_missing_target() throws Exception {
        ParquetFileFormat<WithFour> f1 = format(WithFour.class);
        ParquetFileFormat<WithFour> f2 = format(WithFour.class, "col3");
        f2.getFormatConfiguration()
            .withFieldMappingStrategy(FieldMappingStrategy.NAME)
            .withOnMissingTarget(ExceptionHandlingStrategy.FAIL);

        WithFour in = new WithFour();
        File file = save(f1, Arrays.asList(in));
        try {
            load(f2, file);
            fail();
        } catch (IllegalArgumentException e) {
            // ok.
        }
    }

    /**
     * ignore on incompatible type.
     * @throws Exception if failed
     */
    @Test
    public void ignore_on_incompatible_type() throws Exception {
        ParquetFileFormat<MockSimple> f1 = format(MockSimple.class);
        ParquetFileFormat<MockSimpleWithLong> f2 = format(MockSimpleWithLong.class);
        f2.getFormatConfiguration()
            .withFieldMappingStrategy(FieldMappingStrategy.NAME)
            .withOnIncompatibleType(ExceptionHandlingStrategy.IGNORE);

        MockSimple in = new MockSimple(100, "Hello, world!");
        File file = save(f1, Arrays.asList(in));
        List<MockSimpleWithLong> results = load(f2, file);
        assertThat(results, hasSize(1));
        MockSimpleWithLong out = results.get(0);
        assertThat(out.number, is(new LongOption()));
        assertThat(out.string, is(in.string));
    }

    /**
     * fail on incompatible type.
     * @throws Exception if failed
     */
    @Test
    public void fail_on_incompatible_type() throws Exception {
        ParquetFileFormat<MockSimple> f1 = format(MockSimple.class);
        ParquetFileFormat<MockSimpleWithLong> f2 = format(MockSimpleWithLong.class);
        f2.getFormatConfiguration()
            .withFieldMappingStrategy(FieldMappingStrategy.NAME)
            .withOnIncompatibleType(ExceptionHandlingStrategy.FAIL);

        MockSimple in = new MockSimple(100, "Hello, world!");
        File file = save(f1, Arrays.asList(in));
        try {
            load(f2, file);
            fail();
        } catch (IllegalArgumentException e) {
            // ok.
        }
    }

    /**
     * loading timestamp type which generated by impala.
     * @throws Exception if failed
     */
    @Ignore
    @Test
    public void format_timestamp() throws Exception {
        ModelInput<WithDateTime> input = load(WithDateTime.class, "impala-timestamp.bin");
        try {
            WithDateTime buf = new WithDateTime();
            assertThat(input.readTo(buf), is(true));
            // TODO check

            assertThat(input.readTo(buf), is(false));
        } finally {
            input.close();
        }
    }

    /**
     * using strings.
     * @throws Exception if failed
     */
    @SuppressWarnings("deprecation")
    @Test
    public void io_string() throws Exception {
        Map<String, ValueSerde> edits = new HashMap<String, ValueSerde>();
        edits.put("decimal", StringValueSerdeFactory.DECIMAL);
        edits.put("date", StringValueSerdeFactory.DATE);
        edits.put("datetime", StringValueSerdeFactory.DATETIME);
        ParquetFileFormat<WithStringSupports> format = format(WithStringSupports.class, edits);

        WithStringSupports in = new WithStringSupports();
        in.decimal.modify(new BigDecimal("123.45"));
        in.date.modify(new Date(2014, 7, 1));
        in.datetime.modify(new DateTime(2014, 7, 1, 12, 34, 56));

        WithStringSupports out = restore(format, in);
        assertThat(out.decimal, is(in.decimal));
        assertThat(out.date, is(in.date));
        assertThat(out.datetime, is(in.datetime));
    }

    /**
     * using strings with dictionary.
     * @throws Exception if failed
     */
    @SuppressWarnings("deprecation")
    @Test
    public void io_string_dict() throws Exception {
        Map<String, ValueSerde> edits = new HashMap<String, ValueSerde>();
        edits.put("decimal", StringValueSerdeFactory.DECIMAL);
        edits.put("date", StringValueSerdeFactory.DATE);
        edits.put("datetime", StringValueSerdeFactory.DATETIME);
        ParquetFileFormat<WithStringSupports> format = format(WithStringSupports.class, edits);

        int count = 1000;
        List<WithStringSupports> inputs = new ArrayList<WithStringSupports>();
        for (int i = 0; i < count; i++) {
            WithStringSupports object = new WithStringSupports();
            object.decimal.modify(new BigDecimal("123.45"));
            object.date.modify(new Date(2014, 7, 1));
            object.datetime.modify(new DateTime(2014, 7, 1, 12, 34, 56));
            inputs.add(object);
        }
        WithStringSupports sample = inputs.get(0);
        List<WithStringSupports> outputs = restore(format, inputs);
        for (WithStringSupports out : outputs) {
            assertThat(out.decimal, is(sample.decimal));
            assertThat(out.date, is(sample.date));
            assertThat(out.datetime, is(sample.datetime));
        }
    }

    private <T> ModelInput<T> load(Class<T> modelType, String name) throws IOException, InterruptedException {
        File target = folder.newFile();
        InputStream in = getClass().getResourceAsStream(name);
        assertThat(in, is(notNullValue()));
        try {
            IOUtils.copyBytes(in, new FileOutputStream(target), 1024, true);
        } finally {
            in.close();
        }
        ParquetFileFormat<T> format = format(modelType);
        FileSystem fs = FileSystem.getLocal(format.getConf());
        return format.createInput(
                modelType,
                fs, new Path(target.toURI()),
                0, -1,
                new Counter());
    }

    private <T> T restore(ParquetFileFormat<T> format, T value) throws IOException, InterruptedException {
        List<T> in = new ArrayList<T>();
        in.add(value);
        return restore(format, in).get(0);
    }

    private <T> List<T> restore(ParquetFileFormat<T> format, List<T> values) throws IOException, InterruptedException {
        File file = save(format, values);
        List<T> results = load(format, file);
        assertThat(values, hasSize(results.size()));
        return results;
    }

    private <T> File save(ParquetFileFormat<T> format, List<T> values) throws IOException, InterruptedException {
        File file = folder.newFile();
        Assume.assumeThat(file.delete() || file.exists() == false, is(true));
        LocalFileSystem fs = FileSystem.getLocal(format.getConf());
        ModelOutput<T> output = format.createOutput(
                format.getSupportedType(),
                fs, new Path(file.toURI()),
                new Counter());
        try {
            for (T value : values) {
                output.write(value);
            }
        } finally {
            output.close();
        }
        assertThat(file.exists(), is(true));
        return file;
    }

    private <T> List<T> load(ParquetFileFormat<T> format, File file) throws IOException, InterruptedException {
        LocalFileSystem fs = FileSystem.getLocal(format.getConf());
        ModelInput<T> input = format.createInput(
                format.getSupportedType(),
                fs, new Path(file.toURI()),
                0, file.length(),
                new Counter());
        try {
            List<T> results = new ArrayList<T>();
            while (true) {
                @SuppressWarnings("unchecked")
                T value = (T) format.getDataModelDescriptor().createDataModelObject();
                if (input.readTo(value) == false) {
                    break;
                }
                results.add(value);
            }
            return results;
        } finally {
            input.close();
        }
    }
}
TOP

Related Classes of com.asakusafw.directio.hive.parquet.ParquetFileFormatTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.