Package com.asakusafw.runtime.directio.hadoop

Source Code of com.asakusafw.runtime.directio.hadoop.HadoopDataSourceCore

/**
* Copyright 2011-2014 Asakusa Framework Team.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.asakusafw.runtime.directio.hadoop;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.asakusafw.runtime.compatibility.FileSystemCompatibility;
import com.asakusafw.runtime.directio.BinaryStreamFormat;
import com.asakusafw.runtime.directio.Counter;
import com.asakusafw.runtime.directio.DataDefinition;
import com.asakusafw.runtime.directio.DataFormat;
import com.asakusafw.runtime.directio.DirectDataSource;
import com.asakusafw.runtime.directio.DirectInputFragment;
import com.asakusafw.runtime.directio.FilePattern;
import com.asakusafw.runtime.directio.FragmentableDataFormat;
import com.asakusafw.runtime.directio.OutputAttemptContext;
import com.asakusafw.runtime.directio.OutputTransactionContext;
import com.asakusafw.runtime.directio.ResourceInfo;
import com.asakusafw.runtime.directio.ResourcePattern;
import com.asakusafw.runtime.io.ModelInput;
import com.asakusafw.runtime.io.ModelOutput;

/**
* An implementation of {@link DirectDataSource} using {@link FileSystem}.
* @since 0.2.5
* @version 0.7.0
*/
public class HadoopDataSourceCore implements DirectDataSource {

    static final Log LOG = LogFactory.getLog(HadoopDataSourceCore.class);

    private static final String ATTEMPT_AREA = "attempts";

    private static final String STAGING_AREA = "staging";

    private final HadoopDataSourceProfile profile;

    /**
     * Creates a new instance.
     * @param profile profile of target data source
     * @throws IllegalArgumentException if some parameters were {@code null}
     */
    public HadoopDataSourceCore(HadoopDataSourceProfile profile) {
        if (profile == null) {
            throw new IllegalArgumentException("profile must not be null"); //$NON-NLS-1$
        }
        this.profile = profile;
    }

    @Override
    public <T> List<DirectInputFragment> findInputFragments(
            DataDefinition<T> definition,
            String basePath,
            ResourcePattern resourcePattern) throws IOException, InterruptedException {
        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format(
                    "Start finding input (id={0}, path={1}, resourcePattern={2})",
                    profile.getId(),
                    basePath,
                    resourcePattern));
        }
        FilePattern pattern = validate(resourcePattern);
        HadoopDataSourceProfile p = profile;
        FileSystem fs = p.getFileSystem();
        Path root = p.getFileSystemPath();
        Path base = append(root, basePath);
        Path temporary = p.getTemporaryFileSystemPath();
        List<FileStatus> stats = HadoopDataSourceUtil.search(fs, base, pattern);
        stats = filesOnly(stats, temporary);

        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format(
                    "Process finding input (id={0}, path={1}, resource={2}, files={3})",
                    profile.getId(),
                    basePath,
                    resourcePattern,
                    stats.size()));
        }
        if (LOG.isTraceEnabled()) {
            for (FileStatus stat : stats) {
                LOG.trace(MessageFormat.format(
                        "Input found (path={0}, length={1})",
                        stat.getPath(),
                        stat.getLen()));
            }
        }
        DataFormat<T> format = definition.getDataFormat();
        Class<? extends T> dataType = definition.getDataClass();
        List<DirectInputFragment> results;
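        // The fragmentation strategy depends on the data format:
        // - StripedDataFormat: the format itself computes fragments from the file statistics
        // - FragmentableDataFormat: fragments are derived from file blocks, using the profile's
        //   minimum/preferred fragment sizes and split/combine block settings
        // - otherwise: a default FragmentComputer is used (no format-specific fragmentation hints)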
        if (format instanceof StripedDataFormat<?>) {
            StripedDataFormat.InputContext context = new StripedDataFormat.InputContext(
                    dataType,
                    stats, fs,
                    p.getMinimumFragmentSize(), p.getPreferredFragmentSize(),
                    p.isSplitBlocks(), p.isCombineBlocks());
            StripedDataFormat<T> sformat = (StripedDataFormat<T>) format;
            results = sformat.computeInputFragments(context);
        } else if (format instanceof FragmentableDataFormat<?>) {
            FragmentableDataFormat<T> sformat = (FragmentableDataFormat<T>) format;
            FragmentComputer optimizer = new FragmentComputer(
                    p.getMinimumFragmentSize(sformat), p.getPreferredFragmentSize(sformat),
                    p.isCombineBlocks(), p.isSplitBlocks());
            results = computeInputFragments(optimizer, stats);
        } else {
            FragmentComputer optimizer = new FragmentComputer();
            results = computeInputFragments(optimizer, stats);
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format(
                    "Finish finding input (id={0}, path={1}, resource={2}, fragments={3})",
                    profile.getId(),
                    basePath,
                    resourcePattern,
                    results.size()));
        }
        return results;
    }

    private boolean isIn(FileStatus stat, Path temporary) {
        assert stat != null;
        assert temporary != null;
        Path path = stat.getPath();
        if (path.equals(temporary) || HadoopDataSourceUtil.contains(temporary, path)) {
            return true;
        }
        return false;
    }

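    // Keeps only regular files, excluding anything in the temporary (working) area.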
    private List<FileStatus> filesOnly(List<FileStatus> stats, Path temporary) {
        List<FileStatus> results = new ArrayList<FileStatus>();
        for (FileStatus stat : stats) {
            if (FileSystemCompatibility.isDirectory(stat) == false && isIn(stat, temporary) == false) {
                results.add(stat);
            }
        }
        return results;
    }

    private List<DirectInputFragment> computeInputFragments(
            FragmentComputer fragmentComputer,
            List<FileStatus> stats) throws IOException {
        List<DirectInputFragment> results = new ArrayList<DirectInputFragment>();
        for (FileStatus stat : stats) {
            String path = stat.getPath().toString();
            long fileSize = stat.getLen();
            List<BlockInfo> blocks = BlockMap.computeBlocks(profile.getFileSystem(), stat);
            if (LOG.isTraceEnabled()) {
                for (BlockInfo block : blocks) {
                    LOG.trace(MessageFormat.format(
                            "Original BlockInfo (path={0}, start={1}, end={2}, hosts={3})",
                            path,
                            block.getStart(),
                            block.getEnd(),
                            block.getHosts()));
                }
            }
            List<DirectInputFragment> fragments = fragmentComputer.computeFragments(path, fileSize, blocks);
            if (LOG.isTraceEnabled()) {
                for (DirectInputFragment fragment : fragments) {
                    LOG.trace(MessageFormat.format(
                            "Fragment found (path={0}, offset={1}, size={2}, owners={3})",
                            fragment.getPath(),
                            fragment.getOffset(),
                            fragment.getSize(),
                            fragment.getOwnerNodeNames()));
                }
            }
            results.addAll(fragments);
        }
        return results;
    }

    @Override
    public <T> ModelInput<T> openInput(
            DataDefinition<T> definition,
            DirectInputFragment fragment,
            Counter counter) throws IOException, InterruptedException {
        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format(
                    "Start opening input (id={0}, path={1}, offset={2}, size={3})",
                    profile.getId(),
                    fragment.getPath(),
                    fragment.getOffset(),
                    fragment.getSize()));
        }
        DataFormat<T> format = definition.getDataFormat();
        Class<? extends T> dataType = definition.getDataClass();
        HadoopFileFormat<T> fileFormat = convertFormat(format);
        ModelInput<T> input = fileFormat.createInput(
                dataType,
                profile.getFileSystem(),
                new Path(fragment.getPath()),
                fragment.getOffset(),
                fragment.getSize(),
                counter);
        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format(
                    "Finish opening input (id={0}, path={1}, offset={2}, size={3})",
                    profile.getId(),
                    fragment.getPath(),
                    fragment.getOffset(),
                    fragment.getSize()));
        }
        return input;
    }

    @Override
    public <T> ModelOutput<T> openOutput(
            OutputAttemptContext context,
            DataDefinition<T> definition,
            String basePath,
            String resourcePath,
            Counter counter) throws IOException, InterruptedException {
        FileSystem fs;
        Path attempt;
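        // Attempt output goes to the local temporary directory when streaming output is disabled
        // and a local temporary directory is configured; otherwise it goes to the target file system.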
        if (isLocalAttemptOutput()) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(MessageFormat.format(
                        "Start opening output (id={0}, path={1}, resource={2}, streaming={3})",
                        profile.getId(),
                        basePath,
                        resourcePath,
                        true));
            }
            fs = profile.getLocalFileSystem();
            attempt = getLocalAttemptOutput(context);
        } else {
            if (LOG.isDebugEnabled()) {
                LOG.debug(MessageFormat.format(
                        "Start opening output (id={0}, path={1}, resource={2}, streaming={3})",
                        profile.getId(),
                        basePath,
                        resourcePath,
                        false));
            }
            fs = profile.getFileSystem();
            attempt = getAttemptOutput(context);
        }
        DataFormat<T> format = definition.getDataFormat();
        Class<? extends T> dataType = definition.getDataClass();
        Path file = append(append(attempt, basePath), resourcePath);
        HadoopFileFormat<T> fileFormat = convertFormat(format);
        ModelOutput<T> output = fileFormat.createOutput(dataType, fs, file, counter);
        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format(
                    "Finish opening output (id={0}, path={1}, resource={2}, file={3})",
                    profile.getId(),
                    basePath,
                    resourcePath,
                    file));
        }
        return output;
    }

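    /**
     * Returns {@code true} if attempt outputs should be staged on the local file system
     * (that is, streaming output is disabled and a local temporary directory is configured).
     */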
    boolean isLocalAttemptOutput() {
        return profile.isOutputStreaming() == false
                && HadoopDataSourceUtil.isLocalAttemptOutputDefined(profile.getLocalFileSystem());
    }

    private FilePattern validate(ResourcePattern pattern) throws IOException {
        assert pattern != null;
        if ((pattern instanceof FilePattern) == false) {
            throw new IOException(MessageFormat.format(
                    "{2} must be a subtype of {1} (path={0})",
                    profile.getContextPath(),
                    FilePattern.class.getName(),
                    pattern.getClass().getName()));
        }
        return (FilePattern) pattern;
    }

    private <T> HadoopFileFormat<T> convertFormat(DataFormat<T> format) throws IOException {
        assert format != null;
        if (format instanceof HadoopFileFormat<?>) {
            return (HadoopFileFormat<T>) format;
        } else {
            return new HadoopFileFormatAdapter<T>(validateStream(format), profile.getFileSystem().getConf());
        }
    }

    private <T> BinaryStreamFormat<T> validateStream(DataFormat<T> format) throws IOException {
        assert format != null;
        if ((format instanceof BinaryStreamFormat<?>) == false) {
            throw new IOException(MessageFormat.format(
                    "{2} must be a subtype of {1} (path={0})",
                    profile.getContextPath(),
                    BinaryStreamFormat.class.getName(),
                    format.getClass().getName()));
        }
        return (BinaryStreamFormat<T>) format;
    }

    @Override
    public List<ResourceInfo> list(
            String basePath,
            ResourcePattern resourcePattern,
            Counter counter) throws IOException, InterruptedException {
        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format(
                    "Start listing files (id={0}, path={1}, resource={2})",
                    profile.getId(),
                    basePath,
                    resourcePattern));
        }
        FilePattern pattern = validate(resourcePattern);
        HadoopDataSourceProfile p = profile;
        FileSystem fs = p.getFileSystem();
        Path root = p.getFileSystemPath();
        Path base = append(root, basePath);
        Path temporary = p.getTemporaryFileSystemPath();
        List<FileStatus> stats = HadoopDataSourceUtil.search(fs, base, pattern);
        stats = normalize(stats, root, temporary);

        List<ResourceInfo> results = new ArrayList<ResourceInfo>();
        for (FileStatus stat : stats) {
            counter.add(1);
            ResourceInfo resource = new ResourceInfo(
                    profile.getId(),
                    stat.getPath().toString(),
                    FileSystemCompatibility.isDirectory(stat));
            results.add(resource);
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format(
                    "Finish listing files (id={0}, path={1}, resource={2}, count={3})",
                    profile.getId(),
                    basePath,
                    resourcePattern,
                    results.size()));
        }
        return results;
    }

    @Override
    public boolean delete(
            String basePath,
            ResourcePattern resourcePattern,
            boolean recursive,
            Counter counter) throws IOException, InterruptedException {
        assert basePath.startsWith("/") == false;
        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format(
                    "Start deleting files (id={0}, path={1}, resource={2}, recursive={3})",
                    profile.getId(),
                    basePath,
                    resourcePattern,
                    recursive));
        }
        FilePattern pattern = validate(resourcePattern);
        HadoopDataSourceProfile p = profile;
        FileSystem fs = p.getFileSystem();
        Path root = p.getFileSystemPath();
        Path base = append(root, basePath);
        List<FileStatus> stats = HadoopDataSourceUtil.search(fs, base, pattern);
        Path temporary = p.getTemporaryFileSystemPath();
        stats = normalize(stats, root, temporary);
        if (recursive) {
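            // Drop entries that are already covered by another matched entry,
            // since recursive deletion of the parent will remove them anyway.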
            stats = HadoopDataSourceUtil.onlyMinimalCovered(stats);
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format(
                    "Process deleting files (id={0}, path={1}, resource={2}, files={3})",
                    profile.getId(),
                    basePath,
                    resourcePattern,
                    stats.size()));
        }
        boolean succeed = true;
        for (FileStatus stat : stats) {
            if (LOG.isTraceEnabled()) {
                LOG.trace(MessageFormat.format(
                        "Deleting file (id={0}, path={1}, recursive={2})",
                        profile.getId(),
                        stat.getPath(),
                        recursive));
            }
            if (recursive == false && FileSystemCompatibility.isDirectory(stat)) {
                LOG.info(MessageFormat.format(
                        "Skip deleting directory (id={0}, path={1})",
                        profile.getId(),
                        stat.getPath()));
            } else {
                counter.add(1);
                succeed &= fs.delete(stat.getPath(), recursive);
            }
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format(
                    "Finish deleting files (id={0}, path={1}, resource={2}, files={3})",
                    profile.getId(),
                    basePath,
                    resourcePattern,
                    stats.size()));
        }
        return succeed;
    }

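    // Excludes the data source root itself and anything in the temporary (working) area.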
    private List<FileStatus> normalize(List<FileStatus> stats, Path root, Path temporary) {
        assert stats != null;
        assert root != null;
        assert temporary != null;
        List<FileStatus> results = new ArrayList<FileStatus>();
        for (FileStatus stat : stats) {
            if (root.equals(stat.getPath()) == false && isIn(stat, temporary) == false) {
                results.add(stat);
            }
        }
        return results;
    }

    private Path append(Path parent, String child) {
        assert parent != null;
        assert child != null;
        return child.isEmpty() ? parent : new Path(parent, child);
    }

    @Override
    public void setupAttemptOutput(OutputAttemptContext context) throws IOException, InterruptedException {
        if (profile.isOutputStreaming() == false && isLocalAttemptOutput() == false) {
            LOG.warn(MessageFormat.format(
                    "Streaming output is disabled but the local temporary directory ({1}) is not defined (id={0})",
                    profile.getId(),
                    HadoopDataSourceUtil.KEY_LOCAL_TEMPDIR));
        }
        if (isLocalAttemptOutput()) {
            FileSystem fs = profile.getLocalFileSystem();
            Path attempt = getLocalAttemptOutput(context);
            if (LOG.isDebugEnabled()) {
                LOG.debug(MessageFormat.format(
                        "Create local attempt area (id={0}, path={1})",
                        profile.getId(),
                        attempt));
            }
            fs.mkdirs(attempt);
        } else {
            FileSystem fs = profile.getFileSystem();
            Path attempt = getAttemptOutput(context);
            if (LOG.isDebugEnabled()) {
                LOG.debug(MessageFormat.format(
                        "Create attempt area (id={0}, path={1})",
                        profile.getId(),
                        attempt));
            }
            fs.mkdirs(attempt);
        }
    }

    @Override
    public void commitAttemptOutput(OutputAttemptContext context) throws IOException, InterruptedException {
        Path target;
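        // With staging enabled, attempt outputs are first promoted into the per-transaction
        // staging area; otherwise they are moved directly onto the data source file system path.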
        if (profile.isOutputStaging()) {
            target = getStagingOutput(context.getTransactionContext());
        } else {
            target = profile.getFileSystemPath();
        }
        if (isLocalAttemptOutput()) {
            Path attempt = getLocalAttemptOutput(context);
            if (LOG.isDebugEnabled()) {
                LOG.debug(MessageFormat.format(
                        "Commit local attempt area (id={0}, path={1}, staging={2})",
                        profile.getId(),
                        attempt,
                        profile.isOutputStaging()));
            }
            HadoopDataSourceUtil.moveFromLocal(
                    context.getCounter(), profile.getLocalFileSystem(), profile.getFileSystem(), attempt, target);
        } else {
            Path attempt = getAttemptOutput(context);
            if (LOG.isDebugEnabled()) {
                LOG.debug(MessageFormat.format(
                        "Commit attempt area (id={0}, path={1}, staging={2})",
                        profile.getId(),
                        attempt,
                        profile.isOutputStaging()));
            }
            HadoopDataSourceUtil.move(context.getCounter(), profile.getFileSystem(), attempt, target);
        }
    }

    @Override
    public void cleanupAttemptOutput(OutputAttemptContext context) throws IOException, InterruptedException {
        if (isLocalAttemptOutput()) {
            Path attempt = getLocalAttemptOutput(context);
            if (LOG.isDebugEnabled()) {
                LOG.debug(MessageFormat.format(
                        "Delete local attempt area (id={0}, path={1})",
                        profile.getId(),
                        attempt));
            }
            FileSystem fs = profile.getLocalFileSystem();
            fs.delete(attempt, true);
        } else {
            Path attempt = getAttemptOutput(context);
            if (LOG.isDebugEnabled()) {
                LOG.debug(MessageFormat.format(
                        "Delete attempt area (id={0}, path={1})",
                        profile.getId(),
                        attempt));
            }
            FileSystem fs = profile.getFileSystem();
            fs.delete(attempt, true);
        }
    }

    @Override
    public void setupTransactionOutput(OutputTransactionContext context) throws IOException, InterruptedException {
        if (profile.isOutputStaging()) {
            FileSystem fs = profile.getFileSystem();
            Path staging = getStagingOutput(context);
            if (LOG.isDebugEnabled()) {
                LOG.debug(MessageFormat.format(
                        "Create staging area (id={0}, path={1})",
                        profile.getId(),
                        staging));
            }
            fs.mkdirs(staging);
        }
    }

    @Override
    public void commitTransactionOutput(OutputTransactionContext context) throws IOException, InterruptedException {
        if (profile.isOutputStaging()) {
            FileSystem fs = profile.getFileSystem();
            Path staging = getStagingOutput(context);
            Path target = profile.getFileSystemPath();
            if (LOG.isDebugEnabled()) {
                LOG.debug(MessageFormat.format(
                        "Commit staging area (id={0}, path={1})",
                        profile.getId(),
                        staging));
            }
            HadoopDataSourceUtil.move(context.getCounter(), fs, staging, target);
        }
    }

    @Override
    public void cleanupTransactionOutput(OutputTransactionContext context) throws IOException, InterruptedException {
        FileSystem fs = profile.getFileSystem();
        Path path = getTemporaryOutput(context);
        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format(
                    "Delete temporary area (id={0}, path={1})",
                    profile.getId(),
                    path));
        }
        try {
            if (fs.delete(path, true) == false) {
                LOG.warn(MessageFormat.format(
                        "Failed to delete temporary area (id={0}, path={0})",
                        profile.getId(),
                        path));
            }
        } catch (FileNotFoundException e) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(MessageFormat.format(
                        "Temporary area is not found (may be not used): {0}",
                        path));
            }
        }
    }

    private Path getTemporaryOutput(OutputTransactionContext context) {
        assert context != null;
        Path tempRoot = profile.getTemporaryFileSystemPath();
        String suffix = String.format("%s-%s",
                context.getTransactionId(),
                context.getOutputId());
        return append(tempRoot, suffix);
    }

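    /**
     * Returns the staging area for the target transaction:
     * {@code <temporary-root>/<transaction-id>-<output-id>/staging}.
     */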
    Path getStagingOutput(OutputTransactionContext context) {
        assert context != null;
        Path tempPath = getTemporaryOutput(context);
        String suffix = STAGING_AREA;
        return append(tempPath, suffix);
    }

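    /**
     * Returns the attempt area for the target attempt:
     * {@code <temporary-root>/<transaction-id>-<output-id>/attempts/<attempt-id>}.
     */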
    Path getAttemptOutput(OutputAttemptContext context) {
        assert context != null;
        Path tempPath = getTemporaryOutput(context.getTransactionContext());
        String suffix = String.format("%s/%s",
                ATTEMPT_AREA,
                context.getAttemptId());
        return append(tempPath, suffix);
    }

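    /**
     * Returns the local attempt area for the target attempt:
     * {@code <local-temporary>/<transaction-id>-<attempt-id>-<output-id>}.
     * @throws IOException if failed to compute the local attempt area
     */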
    Path getLocalAttemptOutput(OutputAttemptContext context) throws IOException {
        assert context != null;
        Path tempPath = HadoopDataSourceUtil.getLocalTemporaryDirectory(profile.getLocalFileSystem());
        String suffix = String.format("%s-%s-%s",
                context.getTransactionId(),
                context.getAttemptId(),
                context.getOutputId());
        return append(tempPath, suffix);
    }
}
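
A minimal sketch of how the output life-cycle methods of this class fit together. It assumes the caller already holds a configured HadoopDataSourceProfile, a DataDefinition, and the transaction/attempt contexts (these are normally supplied by the Direct I/O runtime rather than created by hand), reuses the imports from the listing above, and uses hypothetical base/resource paths; it is not part of the original source.

// Sketch only: the helper name, the "example/base" base path, and the "part-0000.bin"
// resource path are hypothetical; the contexts and the counter come from the runtime.
static <T> void writeAllRecords(
        HadoopDataSourceProfile profile,
        OutputTransactionContext txContext,
        OutputAttemptContext attemptContext,
        DataDefinition<T> definition,
        Iterable<T> records,
        Counter counter) throws IOException, InterruptedException {
    HadoopDataSourceCore core = new HadoopDataSourceCore(profile);
    core.setupTransactionOutput(txContext);   // creates the staging area when staging is enabled
    core.setupAttemptOutput(attemptContext);  // creates the (local or remote) attempt area
    ModelOutput<T> output = core.openOutput(
            attemptContext, definition, "example/base", "part-0000.bin", counter);
    try {
        for (T record : records) {
            output.write(record);
        }
    } finally {
        output.close();
    }
    core.commitAttemptOutput(attemptContext);  // promotes the attempt output (to staging or the final area)
    core.commitTransactionOutput(txContext);   // moves staged files onto the data source path
    core.cleanupTransactionOutput(txContext);  // removes the per-transaction temporary area
}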