/**
* Copyright [2012] [Datasalt Systems S.L.]
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.datasalt.pangool.tuplemr.mapred.lib.output;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import org.junit.Assert;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.junit.Test;
import com.datasalt.pangool.BaseTest;
import com.datasalt.pangool.io.Fields;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.io.Schema.Field;
import com.datasalt.pangool.io.Schema.Field.Type;
import com.datasalt.pangool.tuplemr.IdentityTupleMapper;
import com.datasalt.pangool.tuplemr.IdentityTupleReducer;
import com.datasalt.pangool.tuplemr.MapOnlyJobBuilder;
import com.datasalt.pangool.tuplemr.MultipleOutputsCollector;
import com.datasalt.pangool.tuplemr.TupleMRBuilder;
import com.datasalt.pangool.tuplemr.TupleMRException;
import com.datasalt.pangool.tuplemr.mapred.MapOnlyMapper;
import com.datasalt.pangool.tuplemr.mapred.lib.input.TupleTextInputFormat;
import com.datasalt.pangool.tuplemr.mapred.lib.input.TupleTextInputFormat.FieldSelector;
import com.datasalt.pangool.utils.CommonUtils;
import com.datasalt.pangool.utils.HadoopUtils;
import com.google.common.io.Files;
@SuppressWarnings({ "rawtypes", "serial" })
public class TestTupleTextInputOutputFormat extends BaseTest implements Serializable {
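// Working input/output paths, named after the test class to avoid collisions with other tests.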
public static final String OUT = TestTupleTextInputOutputFormat.class.getName() + "-out";
public static final String IN = TestTupleTextInputOutputFormat.class.getName() + "-in";
public enum TestEnum {
MICKEY, MOUSE, MINIE;
}
/*
* A test for finding race conditions in initializing InputSplits
*/
@Test
public void testSplits() throws Exception {
BufferedWriter writer = new BufferedWriter(new FileWriter(IN));
for(int i = 0; i < 10000; i++) {
writer.write("str1 str2 30 4000\n");
}
writer.close();
Schema schema = new Schema("schema", Fields.parse("a:string, b:string, c:int, d:long"));
InputFormat inputFormat = new TupleTextInputFormat(schema, false, false, ' ',
TupleTextInputFormat.NO_QUOTE_CHARACTER, TupleTextInputFormat.NO_ESCAPE_CHARACTER,
FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING);
Configuration conf = getConf();
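// Force ~10 KB splits so the 10,000-line input is processed as many InputSplits.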
conf.setLong("mapred.min.split.size", 10 * 1024);
conf.setLong("dfs.block.size", 10 * 1024);
conf.setLong("mapred.max.split.size", 10 * 1024);
FileSystem fS = FileSystem.get(conf);
Path outPath = new Path(OUT);
MapOnlyJobBuilder mapOnly = new MapOnlyJobBuilder(conf);
mapOnly.addInput(new Path(IN), inputFormat,
new MapOnlyMapper<ITuple, NullWritable, NullWritable, NullWritable>() {
protected void map(ITuple key, NullWritable value, Context context) throws IOException,
InterruptedException {
Assert.assertEquals("str1", key.get("a").toString());
Assert.assertEquals("str2", key.get("b").toString());
Assert.assertEquals((Integer) 30, (Integer) key.get("c"));
Assert.assertEquals((Long) 4000L, (Long) key.get("d"));
context.getCounter("stats", "nlines").increment(1);
}
});
HadoopUtils.deleteIfExists(fS, outPath);
mapOnly.setOutput(outPath, new HadoopOutputFormat(NullOutputFormat.class), NullWritable.class,
NullWritable.class);
Job job = mapOnly.createJob();
try {
assertTrue(job.waitForCompletion(true));
} finally {
mapOnly.cleanUpInstanceFiles();
}
HadoopUtils.deleteIfExists(fS, new Path(IN));
assertEquals(10000, job.getCounters().getGroup("stats").findCounter("nlines").getValue());
}
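/*
 * Reads the gzipped files under src/test/resources to check that compressed
 * text input is decompressed transparently (gzip is not splittable, so each
 * .gz file becomes a single split). The counter expects 100 records in total.
 */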
@Test
public void testInputCompression() throws Exception {
Schema schema = new Schema("schema", Fields.parse("a:string, b:string, c:int, d:long"));
InputFormat inputFormat = new TupleTextInputFormat(schema, false, false, ' ',
TupleTextInputFormat.NO_QUOTE_CHARACTER, TupleTextInputFormat.NO_ESCAPE_CHARACTER,
FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING);
Configuration conf = getConf();
FileSystem fS = FileSystem.get(conf);
Path outPath = new Path(OUT);
MapOnlyJobBuilder mapOnly = new MapOnlyJobBuilder(conf);
mapOnly.addInput(new Path("src/test/resources/*.gz"), inputFormat,
new MapOnlyMapper<ITuple, NullWritable, NullWritable, NullWritable>() {
protected void map(ITuple key, NullWritable value, Context context) throws IOException,
InterruptedException {
Assert.assertNotNull(key.get("a").toString());
Assert.assertNotNull(key.get("b").toString());
Assert.assertTrue((Integer) key.get("c") > 0);
Assert.assertTrue((Long) key.get("d") > 0);
context.getCounter("stats", "nlines").increment(1);
}
});
HadoopUtils.deleteIfExists(fS, outPath);
mapOnly.setOutput(outPath, new HadoopOutputFormat(NullOutputFormat.class), NullWritable.class,
NullWritable.class);
Job job = mapOnly.createJob();
try {
assertTrue(job.waitForCompletion(true));
} finally {
mapOnly.cleanUpInstanceFiles();
}
assertEquals(100, job.getCounters().getGroup("stats").findCounter("nlines").getValue());
}
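/*
 * Round-trip of a tab-separated file without quoting: the output must equal
 * the input line for line, including the padded " ar " value, which checks
 * that field whitespace survives when the separator is a tab.
 */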
@Test
public void test() throws TupleMRException, IOException, InterruptedException, ClassNotFoundException {
String line1 = "foo1\t10.0\t ar \t1.0\t100\t1000000\ttrue\tMICKEY";
String line2 = "foo2\t20.0\tbar2\t2.0\t200\t2000000\tfalse\tMOUSE";
String line3 = "foo3\t30.0\tbar3\t3.0\t300\t3000000\ttrue\tMINIE";
// The input is a simple tab-separated file with no quotes
CommonUtils.writeTXT(line1 + "\n" + line2 + "\n" + line3, new File(IN));
Configuration conf = getConf();
FileSystem fS = FileSystem.get(conf);
Path outPath = new Path(OUT);
Path inPath = new Path(IN);
HadoopUtils.deleteIfExists(fS, outPath);
// Define the Schema according to the text file
List<Field> fields = new ArrayList<Field>();
fields.add(Field.create("strField1", Type.STRING));
fields.add(Field.create("floatField", Type.FLOAT));
fields.add(Field.create("strField2", Type.STRING));
fields.add(Field.create("doubleField", Type.DOUBLE));
fields.add(Field.create("intField", Type.INT));
fields.add(Field.create("longField", Type.LONG));
fields.add(Field.create("booleanField", Type.BOOLEAN));
fields.add(Field.createEnum("enumField", TestEnum.class));
Schema schema = new Schema("schema", fields);
TupleMRBuilder builder = new TupleMRBuilder(conf);
builder.addIntermediateSchema(schema);
builder.setGroupByFields("strField1"); // but we don't care, really
/*
* Define the Input Format and the Output Format!
*/
InputFormat inputFormat = new TupleTextInputFormat(schema, false, false, '\t',
TupleTextInputFormat.NO_QUOTE_CHARACTER, TupleTextInputFormat.NO_ESCAPE_CHARACTER,
FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING);
OutputFormat outputFormat = new TupleTextOutputFormat(schema, false, '\t',
TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER);
builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
builder.setTupleReducer(new IdentityTupleReducer());
builder.setOutput(outPath, outputFormat, ITuple.class, NullWritable.class);
Job job = builder.createJob();
try {
assertRun(job);
} finally {
builder.cleanUpInstanceFiles();
}
Assert.assertEquals(line1 + "\n" + line2 + "\n" + line3,
Files.toString(new File(OUT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());
HadoopUtils.deleteIfExists(fS, inPath);
HadoopUtils.deleteIfExists(fS, outPath);
}
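/*
 * CSV round-trip with '"' as quote character and '\' as escape character. The
 * output format quotes every field, so the unquoted numeric input values come
 * back quoted.
 */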
@Test
public void test2() throws TupleMRException, IOException, InterruptedException, ClassNotFoundException {
String line1 = "1,\"Kabul\",\"AFG\",\"Kabol\",1780000";
String line2 = "2,\"Qandahar\",\"AFG\",\"Qandahar\",237500";
String line1out = "\"1\",\"Kabul\",\"AFG\",\"Kabol\",\"1780000\"";
String line2out = "\"2\",\"Qandahar\",\"AFG\",\"Qandahar\",\"237500\"";
// The input is a comma-separated file with quoted string fields
CommonUtils.writeTXT(line1 + "\n" + line2, new File(IN));
Configuration conf = getConf();
FileSystem fS = FileSystem.get(conf);
Path outPath = new Path(OUT);
Path inPath = new Path(IN);
HadoopUtils.deleteIfExists(fS, outPath);
// Define the Schema according to the text file
Schema schema = new Schema("schema",
Fields.parse("id:int,name:string,country_code:string,district:string,population:int"));
TupleMRBuilder builder = new TupleMRBuilder(conf);
builder.addIntermediateSchema(schema);
builder.setGroupByFields("id"); // but we don't care, really
/*
* Define the Input Format and the Output Format!
*/
InputFormat inputFormat = new TupleTextInputFormat(schema, false, false, ',', '"', '\\',
FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING);
OutputFormat outputFormat = new TupleTextOutputFormat(schema, false, ',', '"', '\\');
builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
builder.setTupleReducer(new IdentityTupleReducer());
builder.setOutput(outPath, outputFormat, ITuple.class, NullWritable.class);
try {
Job job = builder.createJob();
assertRun(job);
} finally {
builder.cleanUpInstanceFiles();
}
Assert.assertEquals(line1out + "\n" + line2out,
Files.toString(new File(OUT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());
HadoopUtils.deleteIfExists(fS, inPath);
HadoopUtils.deleteIfExists(fS, outPath);
}
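/*
 * Space-separated round-trip with hasHeader = true on both formats: the first
 * input line is skipped as a header, and the output format writes the
 * schema's field names back as a header line.
 */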
@Test
public void testHeader() throws TupleMRException, IOException, InterruptedException,
ClassNotFoundException {
String line1 = "foo1 10.0 bar1 1.0 100 1000000 true MICKEY";
String line2 = "foo2 20.0 bar2 2.0 200 2000000 false MOUSE";
String line3 = "foo3 30.0 bar3 3.0 300 3000000 true MINIE";
String outHeader = "strField1 floatField strField2 doubleField intField longField booleanField enumField";
// The input is a simple space-separated file with no quotes and a header line
CommonUtils.writeTXT(outHeader + "\n" + line1 + "\n" + line2 + "\n" + line3, new File(IN));
Configuration conf = getConf();
FileSystem fS = FileSystem.get(conf);
Path outPath = new Path(OUT);
Path inPath = new Path(IN);
HadoopUtils.deleteIfExists(fS, outPath);
// Define the Schema according to the text file
List<Field> fields = new ArrayList<Field>();
fields.add(Field.create("strField1", Type.STRING));
fields.add(Field.create("floatField", Type.FLOAT));
fields.add(Field.create("strField2", Type.STRING));
fields.add(Field.create("doubleField", Type.DOUBLE));
fields.add(Field.create("intField", Type.INT));
fields.add(Field.create("longField", Type.LONG));
fields.add(Field.create("booleanField", Type.BOOLEAN));
fields.add(Field.createEnum("enumField", TestEnum.class));
Schema schema = new Schema("schema", fields);
TupleMRBuilder builder = new TupleMRBuilder(conf);
builder.addIntermediateSchema(schema);
builder.setGroupByFields("strField1"); // but we don't care, really
/*
* Define the Input Format and the Output Format!
*/
InputFormat inputFormat = new TupleTextInputFormat(schema, true, false, ' ',
TupleTextInputFormat.NO_QUOTE_CHARACTER, TupleTextInputFormat.NO_ESCAPE_CHARACTER,
FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING);
OutputFormat outputFormat = new TupleTextOutputFormat(schema, true, ' ',
TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER);
builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
builder.setTupleReducer(new IdentityTupleReducer());
builder.setOutput(outPath, outputFormat, ITuple.class, NullWritable.class);
Job job = builder.createJob();
try {
assertRun(job);
} finally {
builder.cleanUpInstanceFiles();
}
Assert.assertEquals(outHeader + "\n" + line1 + "\n" + line2 + "\n" + line3,
Files.toString(new File(OUT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());
HadoopUtils.deleteIfExists(fS, inPath);
HadoopUtils.deleteIfExists(fS, outPath);
}
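/*
 * Null handling for string fields. With the second boolean flag enabled
 * (strict quoting, assuming Pangool's constructor signature), the unquoted
 * \N token and a missing value are read as null, while a quoted empty string
 * stays "". The output format is configured with "\N" as its null string, so
 * both nulls are written back as \N.
 */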
@Test
public void testNulls() throws IOException, InterruptedException, ClassNotFoundException,
TupleMRException, URISyntaxException {
String line1 = "\"Joe\",\\N,,\"\\\"Joan\\\"\",\"\"";
CommonUtils.writeTXT(line1, new File(IN));
Configuration conf = getConf();
FileSystem fS = FileSystem.get(conf);
Path outPath = new Path(OUT);
Path inPath = new Path(IN);
HadoopUtils.deleteIfExists(fS, outPath);
Schema schema = new Schema("schema",
Fields.parse("name:string,name2:string,age:int,name3:string,emptystring:string"));
MapOnlyJobBuilder mO = new MapOnlyJobBuilder(conf);
mO.addInput(inPath, new TupleTextInputFormat(schema, false, true, ',', '"', '\\',
FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING),
new MapOnlyMapper<ITuple, NullWritable, ITuple, NullWritable>() {
protected void map(ITuple key, NullWritable value, Context context,
MultipleOutputsCollector collector) throws IOException, InterruptedException {
try {
Assert.assertNull(key.get("name2"));
Assert.assertNull(key.get("age"));
Assert.assertEquals("Joe", key.get("name"));
Assert.assertEquals("\"Joan\"", key.get("name3"));
Assert.assertEquals("", key.get("emptystring"));
context.write(key, value);
} catch(Throwable t) {
t.printStackTrace();
throw new RuntimeException(t);
}
}
});
mO.setOutput(outPath, new TupleTextOutputFormat(schema, false, ',', '"', '\\', "\\N"), ITuple.class,
NullWritable.class);
Job job = mO.createJob();
try {
assertTrue(job.waitForCompletion(true));
String str = Files.toString(new File(outPath.toString() + "/part-m-00000"), Charset.forName("UTF-8"));
assertEquals("\"Joe\",\\N,\\N,\"\\\"Joan\\\"\",\"\"", str.trim());
} finally {
mO.cleanUpInstanceFiles();
}
HadoopUtils.deleteIfExists(fS, inPath);
HadoopUtils.deleteIfExists(fS, outPath);
}
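/*
 * Null handling for numeric fields: "", "-", " " and "." cannot be parsed as
 * numbers and should each come back as null instead of raising an error.
 */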
@Test
public void testNumberNulls() throws IOException, InterruptedException, ClassNotFoundException,
TupleMRException, URISyntaxException {
String line1 = ",-, ,.";
CommonUtils.writeTXT(line1, new File(IN));
Configuration conf = getConf();
FileSystem fS = FileSystem.get(conf);
Path outPath = new Path(OUT);
Path inPath = new Path(IN);
HadoopUtils.deleteIfExists(fS, outPath);
Schema schema = new Schema("schema", Fields.parse("n1:int,n2:long,n3:float,n4:double"));
MapOnlyJobBuilder mO = new MapOnlyJobBuilder(conf);
mO.addInput(inPath, new TupleTextInputFormat(schema, false, true, ',', '"', '\\',
FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING),
new MapOnlyMapper<ITuple, NullWritable, NullWritable, NullWritable>() {
protected void map(ITuple key, NullWritable value, Context context,
MultipleOutputsCollector collector) throws IOException, InterruptedException {
try {
Assert.assertNull(key.get("n1"));
Assert.assertNull(key.get("n2"));
Assert.assertNull(key.get("n3"));
Assert.assertNull(key.get("n4"));
} catch(Throwable t) {
t.printStackTrace();
throw new RuntimeException(t);
}
}
});
mO.setOutput(outPath, new HadoopOutputFormat(NullOutputFormat.class), NullWritable.class,
NullWritable.class);
Job job = mO.createJob();
try {
assertTrue(job.waitForCompletion(true));
} finally {
mO.cleanUpInstanceFiles();
}
HadoopUtils.deleteIfExists(fS, inPath);
HadoopUtils.deleteIfExists(fS, outPath);
}
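/*
 * Quoted fields may contain the separator: "Constitutional Monarchy,
 * Federation" must be parsed as one field rather than split at the comma.
 */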
@Test
public void testQuotes() throws IOException, InterruptedException, ClassNotFoundException,
TupleMRException, URISyntaxException {
String line1 = "\"MYS\",\"Malaysia\",\"Asia\",\"Southeast Asia\",329758.00,1957,22244000,70.8,69213.00,97884.00,\"Malaysia\",\"Constitutional Monarchy, Federation\",\"Salahuddin Abdul Aziz Shah Alhaj\",2464,\"MY\"";
CommonUtils.writeTXT(line1, new File(IN));
Configuration conf = getConf();
FileSystem fS = FileSystem.get(conf);
Path outPath = new Path(OUT);
Path inPath = new Path(IN);
HadoopUtils.deleteIfExists(fS, outPath);
Schema schema = new Schema("schema", Fields.parse("code:string," + "name:string,"
+ "continent:string," + "region:string," + "surface_area:double," + "indep_year:int,"
+ "population:int," + "life_expectancy:double," + "gnp:double," + "gnp_old:double,"
+ "local_name:string," + "government_form:string," + "head_of_state:string," + "capital:int,"
+ "code2:string"));
MapOnlyJobBuilder mO = new MapOnlyJobBuilder(conf);
mO.addInput(inPath, new TupleTextInputFormat(schema, false, false, ',', '"', '\\',
FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING),
new MapOnlyMapper<ITuple, NullWritable, NullWritable, NullWritable>() {
protected void map(ITuple key, NullWritable value, Context context,
MultipleOutputsCollector collector) throws IOException, InterruptedException {
try {
Assert.assertEquals("Constitutional Monarchy, Federation", key.get("government_form")
.toString());
Assert.assertEquals("Salahuddin Abdul Aziz Shah Alhaj", key.get("head_of_state")
.toString());
Assert.assertEquals(2464, key.get("capital"));
} catch(Throwable t) {
t.printStackTrace();
throw new RuntimeException(t);
}
}
});
mO.setOutput(outPath, new HadoopOutputFormat(NullOutputFormat.class), NullWritable.class,
NullWritable.class);
Job job = mO.createJob();
try {
assertTrue(job.waitForCompletion(true));
} finally {
mO.cleanUpInstanceFiles();
}
HadoopUtils.deleteIfExists(fS, inPath);
HadoopUtils.deleteIfExists(fS, outPath);
}
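/*
 * Uses a FieldSelector to keep only columns 1, 4 and 6 of the 8-column input
 * (0-based), mapping them in order onto the 3-field schema.
 */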
@Test
public void testFieldSelection() throws IOException, TupleMRException, InterruptedException,
ClassNotFoundException {
String line1 = "foo1 10.0 bar1 1.0 100 1000000 true MICKEY";
String line2 = "foo2 20.0 bar2 2.0 200 2000000 false MOUSE";
String line3 = "foo3 30.0 bar3 3.0 300 3000000 true MINIE";
// The input is a simple space-separated file with no quotes
CommonUtils.writeTXT(line1 + "\n" + line2 + "\n" + line3, new File(IN));
Configuration conf = getConf();
FileSystem fS = FileSystem.get(conf);
Path outPath = new Path(OUT);
Path inPath = new Path(IN);
HadoopUtils.deleteIfExists(fS, outPath);
// Define the Schema according to the text file
// We will only select a subset of the file columns
List<Field> fields = new ArrayList<Field>();
fields.add(Field.create("floatField", Type.FLOAT));
fields.add(Field.create("intField", Type.INT));
fields.add(Field.create("booleanField", Type.BOOLEAN));
Schema schema = new Schema("schema", fields);
// Define a FieldSelector to select only columns 1, 4, 6
// 0 is the first column
FieldSelector selector = new FieldSelector(1, 4, 6);
TupleMRBuilder builder = new TupleMRBuilder(conf);
builder.addIntermediateSchema(schema);
builder.setGroupByFields("floatField"); // but we don't care, really
// Define the Input Format and the Output Format!
// Add the selector to the input format
InputFormat inputFormat = new TupleTextInputFormat(schema, false, false, ' ',
TupleTextInputFormat.NO_QUOTE_CHARACTER, TupleTextInputFormat.NO_ESCAPE_CHARACTER, selector,
TupleTextInputFormat.NO_NULL_STRING);
OutputFormat outputFormat = new TupleTextOutputFormat(schema, false, ' ',
TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER);
builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
builder.setTupleReducer(new IdentityTupleReducer());
builder.setOutput(outPath, outputFormat, ITuple.class, NullWritable.class);
Job job = builder.createJob();
try {
assertRun(job);
} finally {
builder.cleanUpInstanceFiles();
}
// This is what we expect as output after field selection
line1 = "10.0 100 true";
line2 = "20.0 200 false";
line3 = "30.0 300 true";
Assert.assertEquals(line1 + "\n" + line2 + "\n" + line3,
Files.toString(new File(OUT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());
HadoopUtils.deleteIfExists(fS, inPath);
HadoopUtils.deleteIfExists(fS, outPath);
}
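/*
 * The input file contains bytes that are not valid UTF-8. Both schema fields
 * are declared nullable ("?"); the test only requires the job to complete
 * without failing on the malformed characters.
 */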
@Test
public void testBadEncoding() throws TupleMRException, IOException, InterruptedException,
ClassNotFoundException {
Configuration conf = getConf();
FileSystem fS = FileSystem.get(conf);
Path outPath = new Path(OUT);
Path inPath = new Path("src/test/resources/broken-encoding.txt");
HadoopUtils.deleteIfExists(fS, outPath);
Schema schema = new Schema("schema", Fields.parse("plugin:string?, count:int?"));
TupleMRBuilder builder = new TupleMRBuilder(conf);
builder.addIntermediateSchema(schema);
builder.setGroupByFields("plugin"); // but we don't care, really
/*
* Define the Input Format and the Output Format!
*/
InputFormat inputFormat = new TupleTextInputFormat(schema, false, false, ',', '"', '\\', null, null);
builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
builder.setTupleReducer(new IdentityTupleReducer());
builder.setTupleOutput(outPath, schema);
Job job = builder.createJob();
try {
assertRun(job);
} finally {
builder.cleanUpInstanceFiles();
}
HadoopUtils.deleteIfExists(fS, outPath);
}
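/*
 * Fixed-width input: fieldsPos holds one (start, end) pair of inclusive
 * character offsets per field; see the column ruler below. The expected
 * output shows that field padding is trimmed and that an explicit '+' sign
 * on numeric fields is accepted but not written back.
 */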
@Test
public void testInputFixedWidth() throws TupleMRException, IOException, InterruptedException,
ClassNotFoundException {
String line1 = "foo1 +10.0 ar 1.0 +10 +10000 true MICKEY";
String line2 = "foo2 20.0 bar2 2.0 -20 +20000 false MOUSE ";
String line3 = "foo3 30.0 bar3 3.0 30 3000000 true MINIE";
// "01234567890123456789012345678901234567890123"
int fieldsPos[] = new int[] { 0, 3, 5, 9, 11, 14, 16, 18, 20, 22, 24, 30, 32, 36, 38, 43 };
String line1out = "foo1 10.0 ar 1.0 10 10000 true MICKEY";
String line2out = "foo2 20.0 bar2 2.0 -20 20000 false MOUSE";
String line3out = "foo3 30.0 bar3 3.0 30 3000000 true MINIE";
// The input is a fixed-width file; fields are located by character position, not by a separator
CommonUtils.writeTXT(line1 + "\n" + line2 + "\n" + line3, new File(IN));
Configuration conf = getConf();
FileSystem fS = FileSystem.get(conf);
Path outPath = new Path(OUT);
Path inPath = new Path(IN);
HadoopUtils.deleteIfExists(fS, outPath);
// Define the Schema according to the text file
List<Field> fields = new ArrayList<Field>();
fields.add(Field.create("strField1", Type.STRING));
fields.add(Field.create("floatField", Type.FLOAT));
fields.add(Field.create("strField2", Type.STRING));
fields.add(Field.create("doubleField", Type.DOUBLE));
fields.add(Field.create("intField", Type.INT));
fields.add(Field.create("longField", Type.LONG));
fields.add(Field.create("booleanField", Type.BOOLEAN));
fields.add(Field.createEnum("enumField", TestEnum.class));
Schema schema = new Schema("schema", fields);
TupleMRBuilder builder = new TupleMRBuilder(conf);
builder.addIntermediateSchema(schema);
builder.setGroupByFields("strField1"); // but we don't care, really
/*
* Define the Input Format and the Output Format!
*/
InputFormat inputFormat = new TupleTextInputFormat(schema, fieldsPos, false, null);
OutputFormat outputFormat = new TupleTextOutputFormat(schema, false, ' ',
TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER);
builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
builder.setTupleReducer(new IdentityTupleReducer());
builder.setOutput(outPath, outputFormat, ITuple.class, NullWritable.class);
Job job = builder.createJob();
try {
assertRun(job);
} finally {
builder.cleanUpInstanceFiles();
}
Assert.assertEquals(line1out + "\n" + line2out + "\n" + line3out,
Files.toString(new File(OUT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());
HadoopUtils.deleteIfExists(fS, inPath);
HadoopUtils.deleteIfExists(fS, outPath);
}
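/*
 * Same fixed-width layout as the previous test, with fully populated columns.
 * The null-string argument is left as null here, so no token is read as null.
 */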
@Test
public void testInputFixedWidthNull() throws TupleMRException, IOException, InterruptedException,
ClassNotFoundException {
String line1 = "foo1 +10.0 bar1 1.0 100 1000000 true MICKEY";
String line2 = "foo2 20.0 bar2 2.0 200 2000000 false MOUSE ";
String line3 = "foo3 30.0 bar3 3.0 300 3000000 true MINIE";
// "01234567890123456789012345678901234567890123"
int fieldsPos[] = new int[] { 0, 3, 5, 9, 11, 14, 16, 18, 20, 22, 24, 30, 32, 36, 38, 43 };
String line1out = "foo1 10.0 bar1 1.0 100 1000000 true MICKEY";
String line2out = "foo2 20.0 bar2 2.0 200 2000000 false MOUSE";
String line3out = "foo3 30.0 bar3 3.0 300 3000000 true MINIE";
// The input is a fixed-width file; fields are located by character position, not by a separator
CommonUtils.writeTXT(line1 + "\n" + line2 + "\n" + line3, new File(IN));
Configuration conf = getConf();
FileSystem fS = FileSystem.get(conf);
Path outPath = new Path(OUT);
Path inPath = new Path(IN);
HadoopUtils.deleteIfExists(fS, outPath);
// Define the Schema according to the text file
List<Field> fields = new ArrayList<Field>();
fields.add(Field.create("strField1", Type.STRING));
fields.add(Field.create("floatField", Type.FLOAT));
fields.add(Field.create("strField2", Type.STRING));
fields.add(Field.create("doubleField", Type.DOUBLE));
fields.add(Field.create("intField", Type.INT));
fields.add(Field.create("longField", Type.LONG));
fields.add(Field.create("booleanField", Type.BOOLEAN));
fields.add(Field.createEnum("enumField", TestEnum.class));
Schema schema = new Schema("schema", fields);
TupleMRBuilder builder = new TupleMRBuilder(conf);
builder.addIntermediateSchema(schema);
builder.setGroupByFields("strField1"); // but we don't care, really
/*
* Define the Input Format and the Output Format!
*/
InputFormat inputFormat = new TupleTextInputFormat(schema, fieldsPos, false, null);
OutputFormat outputFormat = new TupleTextOutputFormat(schema, false, ' ',
TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER);
builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
builder.setTupleReducer(new IdentityTupleReducer());
builder.setOutput(outPath, outputFormat, ITuple.class, NullWritable.class);
Job job = builder.createJob();
try {
assertRun(job);
} finally {
builder.cleanUpInstanceFiles();
}
Assert.assertEquals(line1out + "\n" + line2out + "\n" + line3out,
Files.toString(new File(OUT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());
HadoopUtils.deleteIfExists(fS, inPath);
HadoopUtils.deleteIfExists(fS, outPath);
}
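/*
 * Fixed-width parsing with "-" configured as the null string: a field whose
 * trimmed content equals "-" is read back as null.
 */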
@Test
public void testFixedWidthNulls() throws IOException, InterruptedException, ClassNotFoundException,
TupleMRException, URISyntaxException {
String line1 = "1000 - ";
int fieldsPos[] = new int[] { 0, 3, 5, 7 };
CommonUtils.writeTXT(line1, new File(IN));
Configuration conf = getConf();
FileSystem fS = FileSystem.get(conf);
Path outPath = new Path(OUT);
Path inPath = new Path(IN);
HadoopUtils.deleteIfExists(fS, outPath);
Schema schema = new Schema("schema", Fields.parse("name:string,name2:string"));
MapOnlyJobBuilder mO = new MapOnlyJobBuilder(conf);
mO.addInput(inPath, new TupleTextInputFormat(schema, fieldsPos, false, "-"),
new MapOnlyMapper<ITuple, NullWritable, NullWritable, NullWritable>() {
protected void map(ITuple key, NullWritable value, Context context,
MultipleOutputsCollector collector) throws IOException, InterruptedException {
try {
Assert.assertNull(key.get("name2"));
Assert.assertEquals("1000", key.get("name"));
} catch(Throwable t) {
t.printStackTrace();
throw new RuntimeException(t);
}
}
});
mO.setOutput(outPath, new HadoopOutputFormat(NullOutputFormat.class), NullWritable.class,
NullWritable.class);
Job job = mO.createJob();
try {
assertTrue(job.waitForCompletion(true));
} finally {
mO.cleanUpInstanceFiles();
}
HadoopUtils.deleteIfExists(fS, inPath);
HadoopUtils.deleteIfExists(fS, outPath);
}
}