/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.pig.piggybank.storage;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.log4j.Logger;

import org.apache.pig.LoadPushDown;
import org.apache.pig.PigException;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.StoreFuncInterface;

import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigTextInputFormat;

import org.apache.pig.builtin.PigStorage;
import org.apache.pig.bzip2r.Bzip2TextInputFormat;

import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.pig.impl.util.StorageUtil;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.Utils;

import org.apache.pig.parser.ParserException;

/**
* CSV loading and storing with support for multi-line fields,
* and escaping of delimiters and double quotes within fields;
* uses CSV conventions of Excel 2007.
*
* Arguments allow for control over:
*
* Which field delimiter to use (default = ',')
* Whether line breaks are allowed inside of fields (YES_MULTILINE = yes, NO_MULTILINE = no, default = no)
* How line breaks are to be written when storing (UNIX = LF, WINDOWS = CRLF, NOCHANGE = leave line breaks unchanged, default = NOCHANGE)
* What to do with header rows (first line of each file):
*     On load: READ_INPUT_HEADER = read header rows, SKIP_INPUT_HEADER = do not read header rows, default = read header rows
*     On store: WRITE_OUTPUT_HEADER = write a header row, SKIP_OUTPUT_HEADER = do not write a header row, default = do not write a header row
*
* Usage:
*
* STORE x INTO '<destFileName>'
*         USING org.apache.pig.piggybank.storage.CSVExcelStorage(
*              [DELIMITER[,
*                  {YES_MULTILINE | NO_MULTILINE}[,
*                      {UNIX | WINDOWS | NOCHANGE}[,
 *                          {READ_INPUT_HEADER | SKIP_INPUT_HEADER | WRITE_OUTPUT_HEADER | SKIP_OUTPUT_HEADER}]]]]
*         );
*
* Linebreak settings are only used during store; during load, no conversion is performed.
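 *
 * For example (illustrative file names and options, not taken from the test suite):
 *
 * x = LOAD 'data.csv' USING org.apache.pig.piggybank.storage.CSVExcelStorage(
 *         ',', 'YES_MULTILINE', 'NOCHANGE', 'SKIP_INPUT_HEADER');
 * STORE x INTO 'out.csv' USING org.apache.pig.piggybank.storage.CSVExcelStorage(
 *         ',', 'YES_MULTILINE', 'WINDOWS', 'WRITE_OUTPUT_HEADER');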
*
* WARNING: A danger with enabling multiline fields during load is that unbalanced
*          double quotes will cause slurping up of input until a balancing double
*          quote is found, or until something breaks. If you are not expecting
*          newlines within fields it is therefore more robust to use NO_MULTILINE,
*          which is the default for that reason.
*
 * This is Andreas Paepcke's <paepcke@cs.stanford.edu> CSVExcelStorage with a few modifications.
*/

public class CSVExcelStorage extends PigStorage implements StoreFuncInterface, LoadPushDown {

    public enum Linebreaks { UNIX, WINDOWS, NOCHANGE }
    public enum Multiline { YES, NO }
    public enum Headers { DEFAULT, READ_INPUT_HEADER, SKIP_INPUT_HEADER, WRITE_OUTPUT_HEADER, SKIP_OUTPUT_HEADER }

    protected final static byte LINEFEED = '\n';
    protected final static byte DOUBLE_QUOTE = '"';
    protected final static byte RECORD_DEL = LINEFEED;

    private static final String FIELD_DELIMITER_DEFAULT_STR = ",";
    private static final String MULTILINE_DEFAULT_STR = "NO_MULTILINE";
    private static final String EOL_DEFAULT_STR = "NOCHANGE";
    private static final String HEADER_DEFAULT_STR = "DEFAULT";
   
    long end = Long.MAX_VALUE;

    private byte fieldDelimiter = ',';
    private Multiline multilineTreatment = Multiline.NO;
    private Linebreaks eolTreatment = Linebreaks.NOCHANGE;
    private Headers headerTreatment = Headers.DEFAULT;

    private ArrayList<Object> mProtoTuple = null;
    private TupleFactory mTupleFactory = TupleFactory.getInstance();
    private String udfContextSignature;
    private String loadLocation;
    private boolean[] mRequiredColumns = null;
    private boolean mRequiredColumnsInitialized = false;
   
    final Logger logger = Logger.getLogger(getClass().getName());

    @SuppressWarnings("rawtypes")
    protected RecordReader in = null;   

    // For replacing LF with CRLF (Unix --> Windows end-of-line convention):
    Pattern loneLFDetectorPattern = Pattern.compile("([^\r])\n", Pattern.DOTALL | Pattern.MULTILINE);
    Matcher loneLFDetector = loneLFDetectorPattern.matcher("");

    // For removing CR (Windows --> Unix):
    Pattern CRLFDetectorPattern = Pattern.compile("\r\n", Pattern.DOTALL | Pattern.MULTILINE);
    Matcher CRLFDetector = CRLFDetectorPattern.matcher("");
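
    // Example (illustrative): converting to WINDOWS turns "a\nb" into "a\r\nb"
    // (an existing "\r\n" is left alone); converting to UNIX turns "a\r\nb"
    // into "a\nb".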


    // Pig Storage with COMMA as delimiter:
    TupleFactory tupleMaker = TupleFactory.getInstance();
    private boolean getNextInQuotedField;
    private int getNextFieldID;
    private boolean nextTupleSkipChar;

    // For handling headers
    private boolean loadingFirstRecord = true;
    private boolean storingFirstRecord = true;
    private String header = null;
    private int splitIndex;

    private static final String SCHEMA_SIGNATURE = "pig.csvexcelstorage.schema";
    protected ResourceSchema schema = null;

    /*-----------------------------------------------------
    | Constructors
    -----------------------------------------------------*/

    public CSVExcelStorage() {
        super(FIELD_DELIMITER_DEFAULT_STR);
        initializeInstance(FIELD_DELIMITER_DEFAULT_STR, MULTILINE_DEFAULT_STR, EOL_DEFAULT_STR, HEADER_DEFAULT_STR);
    }
   
    public CSVExcelStorage(String delimiter) {
        super(delimiter);
        initializeInstance(delimiter, MULTILINE_DEFAULT_STR, EOL_DEFAULT_STR, HEADER_DEFAULT_STR);
    }

    public CSVExcelStorage(String delimiter, String multilineTreatmentStr) {
        super(delimiter);
        initializeInstance(delimiter, multilineTreatmentStr, EOL_DEFAULT_STR, HEADER_DEFAULT_STR);
    }

    public CSVExcelStorage(String delimiter, String multilineTreatmentStr, String eolTreatmentStr) {
        super(delimiter);
        initializeInstance(delimiter, multilineTreatmentStr, eolTreatmentStr, HEADER_DEFAULT_STR);
    }

    public CSVExcelStorage(String delimiter, String multilineTreatmentStr, String eolTreatmentStr, String headerTreatmentStr) {
        super(delimiter);
        initializeInstance(delimiter, multilineTreatmentStr, eolTreatmentStr, headerTreatmentStr);
    }
   
    private void initializeInstance(String delimiter, String multilineTreatmentStr, String eolTreatmentStr, String headerTreatmentStr) {
        fieldDelimiter = StorageUtil.parseFieldDel(delimiter);

        multilineTreatment = canonicalizeMultilineTreatmentRequest(multilineTreatmentStr);
        eolTreatment = canonicalizeEOLTreatmentRequest(eolTreatmentStr);
        headerTreatment = canonicalizeHeaderTreatmentRequest(headerTreatmentStr);
    }
   
    private Multiline canonicalizeMultilineTreatmentRequest(String multilineTreatmentStr) {
        if (multilineTreatmentStr.equalsIgnoreCase("YES_MULTILINE"))
            return Multiline.YES;
        else if (multilineTreatmentStr.equalsIgnoreCase("NO_MULTILINE"))
            return Multiline.NO;

        throw new IllegalArgumentException(
                "Unrecognized multiline treatment argument " + multilineTreatmentStr + ". " +
                "Should be either 'YES_MULTILINE' or 'NO_MULTILINE'");
    }
   
    private Linebreaks canonicalizeEOLTreatmentRequest(String eolTreatmentStr) {
        if (eolTreatmentStr.equalsIgnoreCase("UNIX"))
            return Linebreaks.UNIX;
        else if (eolTreatmentStr.equalsIgnoreCase("WINDOWS"))
            return Linebreaks.WINDOWS;
        else if (eolTreatmentStr.equalsIgnoreCase("NOCHANGE"))
            return Linebreaks.NOCHANGE;

        throw new IllegalArgumentException(
                "Unrecognized end-of-line treatment argument " + eolTreatmentStr + ". " +
                "Should be one of 'UNIX', 'WINDOWS', or 'NOCHANGE'");
    }

    private Headers canonicalizeHeaderTreatmentRequest(String headerTreatmentStr) {
        if (headerTreatmentStr.equalsIgnoreCase("DEFAULT"))
            return Headers.DEFAULT;
        else if (headerTreatmentStr.equalsIgnoreCase("READ_INPUT_HEADER"))
            return Headers.READ_INPUT_HEADER;
        else if (headerTreatmentStr.equalsIgnoreCase("SKIP_INPUT_HEADER"))
            return Headers.SKIP_INPUT_HEADER;
        else if (headerTreatmentStr.equalsIgnoreCase("WRITE_OUTPUT_HEADER"))
            return Headers.WRITE_OUTPUT_HEADER;
        else if (headerTreatmentStr.equalsIgnoreCase("SKIP_OUTPUT_HEADER"))
            return Headers.SKIP_OUTPUT_HEADER;

        throw new IllegalArgumentException(
            "Unrecognized header treatment argument " + headerTreatmentStr + ". " +
            "Should be one of 'READ_INPUT_HEADER', 'SKIP_INPUT_HEADER', 'WRITE_OUTPUT_HEADER', 'SKIP_OUTPUT_HEADER'");
    }
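
    // Example (illustrative): new CSVExcelStorage("\\t", "NO_MULTILINE", "UNIX",
    // "SKIP_INPUT_HEADER") reads tab-delimited input and drops each file's
    // header row; the 'UNIX' end-of-line setting only takes effect when storing.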
   
    // ---------------------------------------- STORAGE -----------------------------
   
    public void checkSchema(ResourceSchema s) throws IOException {
        // We don't actually validate the schema here; we just serialize it
        // into the UDFContext so the backend can use it when writing.
       
        UDFContext udfc = UDFContext.getUDFContext();
        Properties p =
            udfc.getUDFProperties(this.getClass(), new String[]{ udfContextSignature });
        p.setProperty(SCHEMA_SIGNATURE, s.toString());
    }

    public void prepareToWrite(RecordWriter writer) {
        // Get the schema string from the UDFContext object.
        UDFContext udfc = UDFContext.getUDFContext();
        Properties p =
            udfc.getUDFProperties(this.getClass(), new String[]{ udfContextSignature });

        String strSchema = p.getProperty(SCHEMA_SIGNATURE);
        if (strSchema != null) {
            // Parse the schema from the string stored in the properties object.
            try {
                schema = new ResourceSchema(Utils.getSchemaFromString(strSchema));
            } catch (ParserException pex) {
                logger.warn("Could not parse schema for storing.");
            }
        }

        if (headerTreatment == Headers.DEFAULT) {
            headerTreatment = Headers.SKIP_OUTPUT_HEADER;
        }

        // PigStorage's prepareToWrite()
        super.prepareToWrite(writer);
    }

    /* (non-Javadoc)
     * @see org.apache.pig.builtin.PigStorage#putNext(org.apache.pig.data.Tuple)
     *
     * Given a tuple that corresponds to one record, write
     * it out as CSV, converting among Unix/Windows line
     * breaks as requested in the instantiation. Also take
     * care of escaping field delimiters, double quotes,
     * and linebreaks embedded within fields.
     *
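     * For example (illustrative), with the default comma delimiter the
     * field value
     *   He said "hi", bye
     * is written out as
     *   "He said ""hi"", bye"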
     */
    @Override
    public void putNext(Tuple tupleToWrite) throws IOException {
        // If WRITE_OUTPUT_HEADER, store a header record with the names of each field
        if (storingFirstRecord && headerTreatment == Headers.WRITE_OUTPUT_HEADER && schema != null) {
            ArrayList<Object> headerProtoTuple = new ArrayList<Object>();
            ResourceFieldSchema[] fields = schema.getFields();
            for (ResourceFieldSchema field : fields) {
                headerProtoTuple.add(field.getName());
            }
            super.putNext(tupleMaker.newTuple(headerProtoTuple));
        }
        storingFirstRecord = false;

        ArrayList<Object> mProtoTuple = new ArrayList<Object>();
        int embeddedNewlineIndex = -1;
        String fieldStr = null;
        // For good debug messages:
        int fieldCounter = -1;
       
        // Do the escaping:
        for (Object field : tupleToWrite.getAll()) {
            fieldCounter++;

            // Substitute a null value with an empty string. See PIG-2470.
            if (field == null) {
                mProtoTuple.add("");
                continue;
            }
           
            fieldStr = field.toString();
           
            // Embedded double quotes are replaced by two double quotes:
            fieldStr = fieldStr.replaceAll("[\"]", "\"\"");
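            // e.g. the field value  ab"cd  becomes  ab""cd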
           
            // If any field delimiters are in the field, or if we did replace
            // any double quotes with a pair of double quotes above,
            // or if the string includes a newline character (LF:\n:0x0A)
            // and we are to allow newlines in fields,
            // then the entire field must be enclosed in double quotes:
            embeddedNewlineIndex =  fieldStr.indexOf(LINEFEED);
           
            if ((fieldStr.indexOf(fieldDelimiter) != -1) ||
                (fieldStr.indexOf(DOUBLE_QUOTE) != -1) ||
                (multilineTreatment == Multiline.YES) && (embeddedNewlineIndex != -1))  {
                fieldStr = "\"" + fieldStr + "\"";
            }
           
            // If requested: replace any lone linefeed (^J) with a CR-LF
            // pair (^M^J). This is needed for Excel to recognize a
            // field-internal new line:

            if ((eolTreatment != Linebreaks.NOCHANGE) && (embeddedNewlineIndex != -1)) {
                if (eolTreatment == Linebreaks.WINDOWS) {
                    loneLFDetector.reset(fieldStr);
                    loneLFDetector.matches();
                    fieldStr = loneLFDetector.replaceAll("$1\r\n");
                } else if (eolTreatment == Linebreaks.UNIX) {
                    CRLFDetector.reset(fieldStr);
                    fieldStr = CRLFDetector.replaceAll("\n");
                }
            }

            mProtoTuple.add(fieldStr);         
        }
        // If Windows line breaks are requested, append
        // a carriage return (0x0D, a.k.a. ^M) to the last field
        // so that the row termination will end up being
        // \r\n once the superclass' putNext() method
        // is done below. We modify the stored last element rather
        // than fieldStr, since fieldStr is stale when the last
        // field was null:

        if ((eolTreatment == Linebreaks.WINDOWS) && !mProtoTuple.isEmpty()) {
            int lastIndex = mProtoTuple.size() - 1;
            mProtoTuple.set(lastIndex, mProtoTuple.get(lastIndex) + "\r");
        }

        Tuple resTuple = tupleMaker.newTuple(mProtoTuple);
        super.putNext(resTuple);
    }

    // ---------------------------------------- LOADING  ----------------------------- 

    /* (non-Javadoc)
     * @see org.apache.pig.builtin.PigStorage#getNext()
     */
    @Override
    public Tuple getNext() throws IOException {
        // If SKIP_INPUT_HEADER and this is the first input split, skip header record
        // We store its value as a string though, so we can compare
        // further records to it. If they are the same (this would
        // happen if multiple small files each with a header were combined
        // into one split), we know to skip the duplicate header record as well.
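        // Example (illustrative): if several small files, each beginning with
        // the line "id,name", are combined into one split, that line is
        // remembered here and its later duplicates are recognized and
        // dropped below.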
        if (loadingFirstRecord && headerTreatment == Headers.SKIP_INPUT_HEADER &&
                (splitIndex == 0 || splitIndex == -1)) {
            try {
                if (!in.nextKeyValue())
                    return null;
                header = ((Text) in.getCurrentValue()).toString();
            } catch (InterruptedException e) {
                int errCode = 6018;
                String errMsg = "Error while reading input";
                throw new ExecException(errMsg, errCode,
                        PigException.REMOTE_ENVIRONMENT, e);
            }
        }
        loadingFirstRecord = false;

        mProtoTuple = new ArrayList<Object>();

        getNextInQuotedField = false;
        boolean evenQuotesSeen = true;
        boolean sawEmbeddedRecordDelimiter = false;
        byte[] buf = null;
       
        if (!mRequiredColumnsInitialized) {
            if (udfContextSignature != null) {
                Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
                mRequiredColumns = (boolean[]) ObjectSerializer.deserialize(p.getProperty(udfContextSignature));
            }
            mRequiredColumnsInitialized = true;
        }
        // Note: we cannot factor out the check for nextKeyValue() being null,
        // because that call overwrites buf with the new line, which is
        // bad if we have a field with a newline.

        try {
            int recordLen = 0;
            getNextFieldID = 0;
           
            while (sawEmbeddedRecordDelimiter || getNextFieldID == 0) {
                Text value = null;
                if (sawEmbeddedRecordDelimiter) {
                   
                    // Deal with pulling more records from the input, because
                    // a double quoted embedded newline was encountered in a field.
                    // Save the length of the record so far, plus one byte for the
                    // record delimiter (usually newline) that's embedded in the field
                    // we were working on before falling into this branch:
                    int prevLineLen = recordLen + 1;
                   
                    // Save previous line (the one with the field that has the newline) in a new array.
                    // The last byte will be random; we'll fill in the embedded
                    // record delimiter (usually newline) below:
                    byte[] prevLineSaved = Arrays.copyOf(buf, prevLineLen);
                    prevLineSaved[prevLineLen - 1] = RECORD_DEL;
                   
                    // Read the continuation of the record, unless EOF:
                    if (!in.nextKeyValue()) {
                        return null;
                    }                                                                                          
                    value = (Text) in.getCurrentValue();
                    recordLen = value.getLength();
                    // Grab the continuation's bytes:
                    buf = value.getBytes();
                   
                    // Combine the previous line and the continuation into a new array.
                    // The following copyOf() does half the job: it allocates all the
                    // space, and also copies the previous line into that space:
                    byte[] prevLineAndContinuation = Arrays.copyOf(prevLineSaved, prevLineLen + recordLen);
                   
                    // Now append the continuation. Parms: fromBuf, fromStartPos, toBuf, toStartPos, lengthToCopy:
                    System.arraycopy(buf, 0, prevLineAndContinuation, prevLineLen, recordLen);
                   
                    // We'll work with the combination now:
                    buf = prevLineAndContinuation;
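
                    // Example (illustrative): if the previous line was
                    //   a,"b
                    // and the continuation line is
                    //   c",d
                    // then buf now holds  a,"b\nc",d  and is re-parsed
                    // from the start below.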
                   
                    // Do the whole record over from the start:
                    mProtoTuple.clear();
                    getNextInQuotedField = false;
                    evenQuotesSeen = true;
                    getNextFieldID = 0;
                    recordLen = prevLineAndContinuation.length;
                   
                } else {
                    // Previous record finished cleanly: start with the next record,
                    // unless EOF:
                    if (!in.nextKeyValue()) {
                        return null;
                    }                                                                                          
                    value = (Text) in.getCurrentValue();

                    // if the line is a duplicate header and 'SKIP_INPUT_HEADER' is set, ignore it
                    // (this might happen if multiple files each with a header are combined into a single split)
                    if (headerTreatment == Headers.SKIP_INPUT_HEADER && value.toString().equals(header)) {
                        if (!in.nextKeyValue())
                            return null;
                        value = (Text) in.getCurrentValue();
                    }

                    buf = value.getBytes();
                    getNextFieldID = 0;
                    recordLen = value.getLength();
                }
               
                nextTupleSkipChar = false;

                ByteBuffer fieldBuffer = ByteBuffer.allocate(recordLen);

                sawEmbeddedRecordDelimiter = processOneInRecord(evenQuotesSeen,
                        buf, recordLen, fieldBuffer);

                // The last field is never delimited by a FIELD_DEL, but by
                // the end of the record. So we need to add that last field.
                // The '!sawEmbeddedRecordDelimiter' check handles the case of
                // embedded newlines: we are in the middle of a field, not at
                // the end of the record:
                if (!sawEmbeddedRecordDelimiter)
                    readField(fieldBuffer, getNextFieldID++);
            } // end while

        } catch (InterruptedException e) {
            int errCode = 6018;
            String errMsg = "Error while reading input";
            throw new ExecException(errMsg, errCode,
                    PigException.REMOTE_ENVIRONMENT, e);
        }

        Tuple t =  mTupleFactory.newTupleNoCopy(mProtoTuple);
        return t;
    }

    /*
     * Service method for getNext().
     * Looks at char after char in the input record,
     * that was previously pulled in by getNext(),
     * and fills the fieldBuffer with those chars.
     * <p>
     * If multilineTreatment is Multiline.YES, then
     * the return value indicates whether an embedded
     * newline was found in a field, and that newline
     * was in a field that opened with a double quote
     * that was not closed before the end of the
     * record was reached. If multilineTreatment
     * is Multiline.NO, then the return value is always false.
     * <p>
     * A return value of true will cause the calling method
     * to continue pulling records from the input stream,
     * until a closing quote is found.
     * <p>
     * Note that the recordReader that delivers records
     * to our getNext() method above considers record
     * boundaries to be newlines. We therefore never see an actual
     * newline character embedded in a field. We just
     * run out of record. For Multiline.NO we just take
     * such an end of record at face value; the final
     * resulting tuple will contain information only up
     * to the first newline that was found.
     * <p>
     * For Multiline.YES, when we run out of record
     * in an open double quote, our return of true from
     * this method will cause the caller getNext() to
     * do its additional readings of records from the
     * stream, until the closing double quote is found.
     *  <p>
     * 
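     * For example (illustrative), with the default delimiter the record
     *   1,"two, three","say ""hi"""
     * is parsed into three fields: 1 | two, three | say "hi"
     *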
     * @param evenQuotesSeen whether an even number of double quotes has been
     *        seen so far in the current field
     * @param buf the raw bytes of the current record
     * @param recordLen the number of valid bytes in buf
     * @param fieldBuffer scratch buffer that accumulates the current field
     * @return true if the record ended inside an open quoted field
     *         (only possible when multilineTreatment is Multiline.YES)
     */
    private boolean processOneInRecord(boolean evenQuotesSeen,
                                       byte[] buf, int recordLen,
                                       ByteBuffer fieldBuffer) {
        for (int i = 0; i < recordLen; i++) {
            if (nextTupleSkipChar) {
                nextTupleSkipChar = false;
                continue;
            }
            byte b = buf[i];
            if (getNextInQuotedField) {
                if (b == DOUBLE_QUOTE) {
                    // Does a double quote immediately follow?
                    if ((i < recordLen-1) && (buf[i+1] == DOUBLE_QUOTE)) {
                        fieldBuffer.put(b);
                        nextTupleSkipChar = true;
                        continue;
                    }
                    evenQuotesSeen = !evenQuotesSeen;

                    // If the quote is ending the last field in a record,
                    // set the getNextInQuotedField flag to false,
                    // so the return statement conditional (see below)
                    // is false, indicating that we're ready for the next record
                    if (!evenQuotesSeen && i == recordLen - 1) {
                        getNextInQuotedField = false;
                    }

                    if (evenQuotesSeen) {
                        fieldBuffer.put(DOUBLE_QUOTE);
                    }
                } else if (!evenQuotesSeen && (b == fieldDelimiter || b == RECORD_DEL)) {
                    getNextInQuotedField = false;
                    readField(fieldBuffer, getNextFieldID++);
                } else {
                    fieldBuffer.put(b);
                }
            } else if (b == DOUBLE_QUOTE) {
                // Does a double quote immediately follow?                 
                if ((i < recordLen-1) && (buf[i+1] == DOUBLE_QUOTE)) {
                    fieldBuffer.put(b);
                    nextTupleSkipChar = true;
                    continue;
                }
                // If we are at the start of a field,
                // that entire field is quoted:
                getNextInQuotedField = true;
                evenQuotesSeen = true;
            } else if (b == fieldDelimiter) {
                readField(fieldBuffer, getNextFieldID++); // end of the field
            } else {
                evenQuotesSeen = true;
                fieldBuffer.put(b);
            }
        } // end for
        return getNextInQuotedField && (multilineTreatment == Multiline.YES);
    }

    private void readField(ByteBuffer buf, int fieldID) {
        if (mRequiredColumns==null || (mRequiredColumns.length>fieldID && mRequiredColumns[fieldID])) {
            byte[] bytes = new byte[buf.position()];
            buf.rewind();
            buf.get(bytes, 0, bytes.length);
            mProtoTuple.add(new DataByteArray(bytes));
        }
        buf.clear();
    }

    @Override
    public void setLocation(String location, Job job) throws IOException {
        loadLocation = location;
        FileInputFormat.setInputPaths(job, location);       
    }

    @SuppressWarnings("rawtypes")
    @Override
    public InputFormat getInputFormat() {
        if(loadLocation.endsWith(".bz2") || loadLocation.endsWith(".bz")) {
            return new Bzip2TextInputFormat();
        } else {
            return new PigTextInputFormat();
        }
    }

    @Override
    public void prepareToRead(@SuppressWarnings("rawtypes") RecordReader reader, PigSplit split) {
        in = reader;
        splitIndex = split.getSplitIndex();
       
        if (headerTreatment == Headers.DEFAULT) {
            headerTreatment = Headers.READ_INPUT_HEADER;
        }
    }

    @Override
    public RequiredFieldResponse pushProjection(RequiredFieldList requiredFieldList) throws FrontendException {
        if (requiredFieldList == null)
            return null;
        if (requiredFieldList.getFields() != null)
        {
            int lastColumn = -1;
            for (RequiredField rf: requiredFieldList.getFields())
            {
                if (rf.getIndex()>lastColumn)
                {
                    lastColumn = rf.getIndex();
                }
            }
            mRequiredColumns = new boolean[lastColumn+1];
            for (RequiredField rf: requiredFieldList.getFields())
            {
                if (rf.getIndex()!=-1)
                    mRequiredColumns[rf.getIndex()] = true;
            }
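
            // Example (illustrative): required field indexes {0, 2} yield
            // mRequiredColumns = [true, false, true]; readField() then skips
            // field 1 while parsing each record.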
            Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
            try {
                p.setProperty(udfContextSignature, ObjectSerializer.serialize(mRequiredColumns));
            } catch (Exception e) {
                throw new RuntimeException("Cannot serialize mRequiredColumns", e);
            }
        }
        return new RequiredFieldResponse(true);
    }

    @Override
    public void setUDFContextSignature(String signature) {
        this.udfContextSignature = signature;
    }

    @Override
    public List<OperatorSet> getFeatures() {
        return Arrays.asList(LoadPushDown.OperatorSet.PROJECTION);
    }
}