Package com.etsy.cascading.tap.local

Source Code of com.etsy.cascading.tap.local.LocalTap$LocalScheme

package com.etsy.cascading.tap.local;

import java.io.IOException;
import java.util.Map;
import java.util.Properties;
import java.util.logging.Logger;

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;

import cascading.flow.FlowProcess;
import cascading.flow.hadoop.HadoopFlowProcess;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.scheme.Scheme;
import cascading.scheme.SinkCall;
import cascading.scheme.SourceCall;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.hadoop.Lfs;
import cascading.tuple.Fields;
import cascading.tuple.TupleEntryCollector;
import cascading.tuple.TupleEntryIterator;
import cascading.util.Util;

/**
* A shunt around the restrictions on taps that can run under the
* LocalFlowProcess. This class delegates to Lfs for all work, yet can run in
* Cascading local mode.
*
* Due to what appears to be a bug in LocalFlowStep#initTaps, schemes are not
* allowed to side-effect the properties passed to sourceConfInit and
* sinkConfInit. This appears to be a non-issue because local schemes never set
* any properties. overwriteProperties can likely be removed, but is kept in
* case custom local schemes eventually set properties (in a future release of
* Cascading where that's possible).
*/
@SuppressWarnings({ "rawtypes", "unchecked" })
public class LocalTap<SourceCtx, SinkCtx> extends Tap<Properties, RecordReader, OutputCollector> {
    private static final long serialVersionUID = 3480480638297770870L;

    private static Logger LOG = Logger.getLogger(LocalTap.class.getName());

    private String path;
    private JobConf defaults;
    private Lfs lfs;

    public LocalTap(String path, Scheme<JobConf, RecordReader, OutputCollector, SourceCtx, SinkCtx> scheme,
            SinkMode sinkMode) {
        super(new LocalScheme<SourceCtx, SinkCtx>(scheme), sinkMode);
        setup(path, scheme);
    }

    public LocalTap(String path, Scheme<JobConf, RecordReader, OutputCollector, SourceCtx, SinkCtx> scheme) {
        super(new LocalScheme<SourceCtx, SinkCtx>(scheme));
        setup(path, scheme);
    }

    private void setup(String path, Scheme<JobConf, RecordReader, OutputCollector, SourceCtx, SinkCtx> scheme) {
        this.path = path;

        /*
         * LocalTap requires your system Hadoop configuration for defaults to
         * supply the wrapped Lfs. Make sure you have your serializations and
         * serialization tokens defined there.
         */
        defaults = new JobConf();

        // HACK: c.t.h.TextLine checks this property for .zip files; the check
        // assumes the list is non-empty, which we mock up, here
        defaults.set("mapred.input.dir", path);

        // HACK: Parquet uses this property to generate unique file names
        defaults.set("mapred.task.partition", "0");

        // HACK: disable Parquet counters
        defaults.set("parquet.benchmark.bytes.read", "false");
        defaults.set("parquet.benchmark.bytes.total", "false");
        defaults.set("parquet.benchmark.time.read", "false");

        ((LocalScheme<SourceCtx, SinkCtx>) this.getScheme()).setDefaults(defaults);

        lfs = new Lfs(scheme, path);
        ((LocalScheme<SourceCtx, SinkCtx>) this.getScheme()).setLfs(lfs);
    }

    @Override
    public String getIdentifier() {
        return path;
    }

    @Override
    public TupleEntryIterator openForRead(FlowProcess<Properties> flowProcess, RecordReader input) throws IOException {
        JobConf jobConf = mergeDefaults("LocalTap#openForRead", flowProcess.getConfigCopy(), defaults);
        return lfs.openForRead(new HadoopFlowProcess(jobConf));
    }

    @Override
    public TupleEntryCollector openForWrite(FlowProcess<Properties> flowProcess, OutputCollector output)
            throws IOException {
        JobConf jobConf = mergeDefaults("LocalTap#openForWrite", flowProcess.getConfigCopy(), defaults);
        return lfs.openForWrite(new HadoopFlowProcess(jobConf));
    }

    @Override
    public boolean createResource(Properties conf) throws IOException {
        return lfs.createResource(mergeDefaults("LocalTap#createResource", conf, defaults));
    }

    @Override
    public boolean deleteResource(Properties conf) throws IOException {
        return lfs.deleteResource(mergeDefaults("LocalTap#deleteResource", conf, defaults));
    }

    @Override
    public boolean resourceExists(Properties conf) throws IOException {
        return lfs.resourceExists(mergeDefaults("LocalTap#resourceExists", conf, defaults));
    }

    @Override
    public long getModifiedTime(Properties conf) throws IOException {
        return lfs.getModifiedTime(mergeDefaults("LocalTap#getModifiedTime", conf, defaults));
    }

    private static JobConf mergeDefaults(String methodName, Properties properties, JobConf defaults) {
        LOG.fine(methodName + " is merging defaults with: " + properties);
        return HadoopUtil.createJobConf(properties, defaults);
    }

    private static Properties overwriteProperties(Properties properties, JobConf jobConf) {
        for (Map.Entry<String, String> entry : jobConf) {
            properties.setProperty(entry.getKey(), entry.getValue());
        }
        return properties;
    }

    /**
     * Scheme used only for fields and configuration.
     */
    private static class LocalScheme<SourceContext, SinkContext> extends
            Scheme<Properties, RecordReader, OutputCollector, SourceContext, SinkContext> {
        private static final long serialVersionUID = 5710119342340369543L;

        private Scheme<JobConf, RecordReader, OutputCollector, SourceContext, SinkContext> scheme;
        private JobConf defaults;
        private Lfs lfs;

        public LocalScheme(Scheme<JobConf, RecordReader, OutputCollector, SourceContext, SinkContext> scheme) {
            super(scheme.getSourceFields(), scheme.getSinkFields());
            this.scheme = scheme;
        }

        private void setDefaults(JobConf defaults) {
            this.defaults = defaults;
        }

        private void setLfs(Lfs lfs) {
            this.lfs = lfs;
        }

        @Override
        public Fields retrieveSourceFields(FlowProcess<Properties> flowProcess,
                Tap tap) {
            return scheme.retrieveSourceFields(new HadoopFlowProcess(defaults), lfs);
        }

        @Override
        public void presentSourceFields(FlowProcess<Properties> flowProcess,
                Tap tap, Fields fields) {
            scheme.presentSourceFields(new HadoopFlowProcess(defaults), lfs, fields);
        }

        @Override
        public void sourceConfInit(FlowProcess<Properties> flowProcess,
                Tap<Properties, RecordReader, OutputCollector> tap, Properties conf) {
            JobConf jobConf = mergeDefaults("LocalScheme#sourceConfInit", conf, defaults);
            scheme.sourceConfInit(new HadoopFlowProcess(jobConf), lfs, jobConf);
            overwriteProperties(conf, jobConf);
        }

        @Override
        public Fields retrieveSinkFields(FlowProcess<Properties> flowProcess,
                Tap tap) {
            return scheme.retrieveSinkFields(new HadoopFlowProcess(defaults), lfs);
        }

        @Override
        public void presentSinkFields(FlowProcess<Properties> flowProcess,
                Tap tap, Fields fields) {
            scheme.presentSinkFields(new HadoopFlowProcess(defaults), lfs, fields);
        }
           
        @Override
        public void sinkConfInit(FlowProcess<Properties> flowProcess,
                Tap<Properties, RecordReader, OutputCollector> tap, Properties conf) {
            JobConf jobConf = mergeDefaults("LocalScheme#sinkConfInit", conf, defaults);
            scheme.sinkConfInit(new HadoopFlowProcess(jobConf), lfs, jobConf);
            overwriteProperties(conf, jobConf);
        }

        @Override
        public boolean source(FlowProcess<Properties> flowProcess, SourceCall<SourceContext, RecordReader> sourceCall)
                throws IOException {
            throw new RuntimeException("LocalTap#source is never called");
        }

        @Override
        public void sink(FlowProcess<Properties> flowProcess, SinkCall<SinkContext, OutputCollector> sinkCall)
                throws IOException {
            throw new RuntimeException("LocalTap#sink is never called");
        }
    }

    @Override
    public boolean equals(Object object) {
        if (this == object)
            return true;
        if (!(object instanceof LocalTap))
            return false;
        if (!super.equals(object))
            return false;

        LocalTap localTap = (LocalTap) object;

        if (path != null ? !path.equals(localTap.path) : localTap.path != null)
            return false;

        return true;
    }

    @Override
    public int hashCode() {
        int result = super.hashCode();
        result = 31 * result + (path != null ? path.hashCode() : 0);
        return result;
    }

    /** @see Object#toString() */
    @Override
    public String toString() {
        if (path != null)
            return getClass().getSimpleName() + "[\"" + getScheme() + "\"]" + "[\"" + Util.sanitizeUrl(path) + "\"]";
        else
            return getClass().getSimpleName() + "[\"" + getScheme() + "\"]" + "[not initialized]";
    }
}
TOP

Related Classes of com.etsy.cascading.tap.local.LocalTap$LocalScheme

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.