// Package org.apache.flink.runtime.fs.s3
// Source code of org.apache.flink.runtime.fs.s3.S3FileSystem

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


package org.apache.flink.runtime.fs.s3;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.flink.configuration.GlobalConfiguration;
import org.apache.flink.core.fs.BlockLocation;
import org.apache.flink.core.fs.FSDataInputStream;
import org.apache.flink.core.fs.FSDataOutputStream;
import org.apache.flink.core.fs.FileStatus;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;
import org.apache.flink.util.StringUtils;

import com.amazonaws.AmazonClientException;
import com.amazonaws.AmazonServiceException;
import com.amazonaws.auth.AWSCredentials;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.Bucket;
import com.amazonaws.services.s3.model.ObjectListing;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.Owner;
import com.amazonaws.services.s3.model.S3ObjectSummary;

/**
* This class provides a {@link FileSystem} implementation which is backed by Amazon's Simple Storage Service (S3). The
* implementation uses the REST API of Amazon S3 to facilitate the communication and read/write the data.
*
*/
public final class S3FileSystem extends FileSystem {

  /**
   * The logging object used for debugging.
   */
  private static final Logger LOG = LoggerFactory.getLogger(S3FileSystem.class);

  /**
   * The configuration key to access the S3 host.
   */
  public static final String S3_HOST_KEY = "fs.s3.host";

  /**
   * The configuration key to access the S3 port.
   */
  public static final String S3_PORT_KEY = "fs.s3.port";

  /**
   * The configuration key to access the S3 Reduced Redundancy Storage setting.
   */
  public static final String S3_RRS_KEY = "fs.s3.rrs";

  /**
   * The configuration key to access the S3 access key.
   */
  public static final String S3_ACCESS_KEY_KEY = "fs.s3.accessKey";

  /**
   * The configuration key to access the S3 secret key.
   */
  public static final String S3_SECRET_KEY_KEY = "fs.s3.secretKey";

  /**
   * The default host to connect to.
   */
  private static final String DEFAULT_S3_HOST = "s3.amazonaws.com";

  /**
   * The default setting whether to use S3 Reduced Redundancy Storage.
   */
  private static final boolean DEFAULT_S3_RRS = true;

  /**
   * The default port to connect to.
   */
  private static final int DEFAULT_S3_PORT = 80;

  /**
   * The prefix of the HTTP protocol.
   */
  private static final String HTTP_PREFIX = "http";

  /**
   * The error code for "resource not found" according to the HTTP protocol.
   */
  private static final int HTTP_RESOURCE_NOT_FOUND_CODE = 404;

  /**
   * The character which S3 uses internally to indicate an object represents a directory.
   */
  private static final char S3_DIRECTORY_SEPARATOR = '/';

  /**
   * The scheme which is used by this file system.
   */
  public static final String S3_SCHEME = "s3";

  /**
   * The character set with which the URL is expected to be encoded.
   */
  private static final String URL_ENCODE_CHARACTER = "UTF-8";

  /**
   * The host to address the REST requests to.
   */
  private String host = null;

  // The port to address the REST requests to, -1 until initialize() has run.
  private int port = -1;

  // The base URI of this file system (scheme s3), set during initialization.
  private URI s3Uri = null;

  // The AWS SDK client used for all S3 operations, created during initialization.
  private AmazonS3Client s3Client = null;

  // Maps Flink paths to S3 (bucket, object) pairs relative to the discovered base path.
  private S3DirectoryStructure directoryStructure = null;

  // Whether newly written objects use S3 Reduced Redundancy Storage.
  private final boolean useRRS;

  /**
   * Constructs a new S3 file system binding. Whether Reduced Redundancy Storage (RRS) is used for
   * newly written objects is read from the global configuration under {@link #S3_RRS_KEY}.
   */
  public S3FileSystem() {

    this.useRRS = GlobalConfiguration.getBoolean(S3_RRS_KEY, DEFAULT_S3_RRS);
    LOG.info("Creating new S3 file system binding with Reduced Redundancy Storage "
      + (this.useRRS ? "enabled" : "disabled"));
  }


  /**
   * {@inheritDoc}
   * <p>
   * S3 has no notion of a working directory, so the base URI of this file system is returned
   * instead.
   */
  @Override
  public Path getWorkingDirectory() {

    return new Path(this.s3Uri);
  }


  /**
   * Returns the base URI of this file system, as determined during {@link #initialize(URI)}.
   */
  @Override
  public URI getUri() {

    return this.s3Uri;
  }


  /**
   * Initializes the connection to Amazon S3. Host and port are taken from the authority part of
   * the given URI, the AWS credentials from its (URL-decoded) user-info part in the form
   * {@code accessKey:secretKey}; each piece that is missing from the URI falls back to the global
   * configuration.
   *
   * @param name
   *        the URI this file system is initialized with
   * @throws IOException
   *         if no access key or secret key can be determined, or no connection to S3 can be
   *         established
   */
  @Override
  public void initialize(URI name) throws IOException {

    this.host = name.getHost();
    if (this.host == null) {
      LOG.debug("Provided URI does not provide a host to connect to, using configuration...");
      this.host = GlobalConfiguration.getString(S3_HOST_KEY, DEFAULT_S3_HOST);
    }

    this.port = name.getPort();
    if (this.port == -1) {
      LOG.debug("Provided URI does not provide a port to connect to, using configuration...");
      this.port = GlobalConfiguration.getInteger(S3_PORT_KEY, DEFAULT_S3_PORT);
    }

    final String userInfo = name.getUserInfo();

    String awsAccessKey = null;
    String awsSecretKey = null;

    if (userInfo != null) {

      // User info is expected in the form "accessKey:secretKey", both URL-encoded
      final String[] splits = userInfo.split(":");
      if (splits.length > 1) {
        awsAccessKey = URLDecoder.decode(splits[0], URL_ENCODE_CHARACTER);
        awsSecretKey = URLDecoder.decode(splits[1], URL_ENCODE_CHARACTER);
      }
    }

    if (awsAccessKey == null) {
      LOG.debug("Provided URI does not provide an access key to Amazon S3, using configuration...");
      awsAccessKey = GlobalConfiguration.getString(S3_ACCESS_KEY_KEY, null);
      if (awsAccessKey == null) {
        throw new IOException("Cannot determine access key to Amazon S3");
      }
    }

    if (awsSecretKey == null) {
      LOG.debug("Provided URI does not provide a secret key to Amazon S3, using configuration...");
      awsSecretKey = GlobalConfiguration.getString(S3_SECRET_KEY_KEY, null);
      if (awsSecretKey == null) {
        throw new IOException("Cannot determine secret key to Amazon S3");
      }
    }

    final AWSCredentials credentials = new BasicAWSCredentials(awsAccessKey, awsSecretKey);
    this.s3Client = new AmazonS3Client(credentials);

    initializeDirectoryStructure(name);
  }

  /**
   * Determines the S3 endpoint and the common base path of this file system by probing. Starting
   * with the full path of the given URI, the path is truncated one component at a time until an
   * endpoint URL built from host, port and the remaining path accepts a connection (verified by
   * fetching the S3 account owner). The surviving base path also seeds the
   * {@link S3DirectoryStructure}.
   *
   * @param name
   *        the URI this file system was initialized with
   * @throws IOException
   *         if the endpoint URL is malformed or no truncation of the path yields a working
   *         connection to Amazon S3
   */
  private void initializeDirectoryStructure(final URI name) throws IOException {

    String basePath = name.getPath();
    while (true) {

      try {
        final String endpoint = new URL(HTTP_PREFIX, this.host, this.port, basePath).toString();
        if (LOG.isDebugEnabled()) {
          LOG.debug("Trying S3 endpoint " + endpoint);
        }

        this.s3Client.setEndpoint(endpoint);
        // getS3AccountOwner() doubles as a connectivity/credentials check for this endpoint
        final Owner owner = this.s3Client.getS3AccountOwner();
        LOG.info("Successfully established connection to Amazon S3 using the endpoint " + endpoint);
        LOG.info("Amazon S3 user is " + owner.getDisplayName());

        break;
      } catch (MalformedURLException e) {
        // NOTE(review): wrapping only the stringified exception drops the original cause chain;
        // consider new IOException(e) instead — confirm callers do not parse the message.
        throw new IOException(StringUtils.stringifyException(e));
      } catch (AmazonClientException e) {

        // Truncate path
        if (basePath.isEmpty()) {
          throw new IOException("Cannot establish connection to Amazon S3: "
            + StringUtils.stringifyException(e));
        } else {
          final int pos = basePath.lastIndexOf(Path.SEPARATOR);
          if (pos < 0) {
            basePath = "";
          } else {
            basePath = basePath.substring(0, pos);
          }
        }
      }
    }

    // Set the S3 URI
    try {
      this.s3Uri = new URI(S3_SCHEME, (String) null, this.host, this.port, basePath, null, null);
    } catch (URISyntaxException e) {
      throw new IOException(StringUtils.stringifyException(e));
    }

    // Finally, create directory structure object
    this.directoryStructure = new S3DirectoryStructure(basePath);
  }


  /**
   * Returns the status of the file or directory addressed by the given path. The S3 root maps to
   * a synthetic directory, a bucket without an object is looked up in the account's bucket list,
   * and everything else is resolved through the object's metadata.
   *
   * @param f
   *        the path to look up
   * @return the file status; directories report length 0
   * @throws FileNotFoundException
   *         if neither a bucket nor an object exists for the given path
   * @throws IOException
   *         if the communication with S3 fails
   */
  @Override
  public FileStatus getFileStatus(final Path f) throws IOException {

    final S3BucketObjectPair bop = this.directoryStructure.toBucketObjectPair(f);

    // This is the S3:/// base directory
    if (!bop.hasBucket() && !bop.hasObject()) {
      return new S3FileStatus(f, 0L, true, 0L, 0L);
    }

    try {
      if (bop.hasBucket() && !bop.hasObject()) {

        final List<Bucket> buckets = this.s3Client.listBuckets();
        final Iterator<Bucket> it = buckets.iterator();

        // Iterate through the list of buckets to find the creation date
        while (it.hasNext()) {

          final Bucket bucket = it.next();
          if (bop.getBucket().equals(bucket.getName())) {

            final long creationDate = dateToLong(bucket.getCreationDate());
            // S3 does not track access times, so this implementation always sets it to 0
            return new S3FileStatus(f, 0L, true, creationDate, 0L);
          }
        }

        throw new FileNotFoundException("Cannot find " + f.toUri());
      }

      try {
        final ObjectMetadata om = this.s3Client.getObjectMetadata(bop.getBucket(), bop.getObject());
        final long modificationDate = dateToLong(om.getLastModified());
        // S3 does not track access times, so this implementation always sets it to 0
        if (objectRepresentsDirectory(bop.getObject(), om.getContentLength())) {
          return new S3FileStatus(f, 0L, true, modificationDate, 0L);
        } else {
          return new S3FileStatus(f, om.getContentLength(), false, modificationDate, 0L);
        }

      } catch (AmazonServiceException e) {
        // Map HTTP 404 to the FileNotFoundException callers expect; rethrow everything else
        if (e.getStatusCode() == HTTP_RESOURCE_NOT_FOUND_CODE) {
          throw new FileNotFoundException("Cannot find " + f.toUri());
        } else {
          throw e;
        }
      }
    } catch (AmazonClientException e) {
      throw new IOException(StringUtils.stringifyException(e));
    }
  }

  /**
   * Converts the given date into its millisecond timestamp, mapping <code>null</code> to 0.
   *
   * @param date
   *        the date to convert, possibly <code>null</code>
   * @return the milliseconds since the epoch, or 0 if the date is <code>null</code>
   */
  private static long dateToLong(final Date date) {

    if (date == null) {
      return 0L;
    }

    return date.getTime();
  }


  /**
   * {@inheritDoc}
   * <p>
   * S3 does not expose physical block locations, so the entire file is reported as a single block
   * hosted at the configured S3 host.
   * NOTE(review): returning <code>null</code> when the requested range exceeds the file length
   * follows the Hadoop-style convention — confirm against the Flink {@link FileSystem} contract.
   */
  @Override
  public BlockLocation[] getFileBlockLocations(final FileStatus file, final long start, final long len)
      throws IOException {

    if ((start + len) > file.getLen()) {
      return null;
    }

    final S3BlockLocation bl = new S3BlockLocation(this.host, file.getLen());

    return new BlockLocation[] { bl };
  }


  /**
   * Opens the object addressed by the given path for reading. The buffer size hint is ignored.
   */
  @Override
  public FSDataInputStream open(final Path f, final int bufferSize) throws IOException {

    return open(f); // Ignore bufferSize
  }


  /**
   * Opens the object addressed by the given path for reading.
   *
   * @param f
   *        the path of the object to open
   * @return a stream reading the object's content
   * @throws FileNotFoundException
   *         if no object exists for the given path
   * @throws IOException
   *         if the path refers to a directory or does not resolve to a bucket/object pair
   */
  @Override
  public FSDataInputStream open(final Path f) throws IOException {

    final FileStatus fileStatus = getFileStatus(f); // Will throw FileNotFoundException if f does not exist

    // Make sure f is not a directory
    if (fileStatus.isDir()) {
      throw new IOException("Cannot open " + f.toUri() + " because it is a directory");
    }

    final S3BucketObjectPair bop = this.directoryStructure.toBucketObjectPair(f);
    if (!bop.hasBucket() || !bop.hasObject()) {
      throw new IOException(f.toUri() + " cannot be opened");
    }

    return new S3DataInputStream(this.s3Client, bop.getBucket(), bop.getObject());
  }


  /**
   * Lists the direct children of the given path. For the S3 root this is the account's buckets;
   * for a bucket or a directory object it is the bucket's (filtered) content; for a regular
   * object it is the object's own status.
   *
   * @param f
   *        the path to list
   * @return the statuses of the direct children
   * @throws FileNotFoundException
   *         if the addressed bucket does not exist
   * @throws IOException
   *         if the communication with S3 fails
   */
  @Override
  public FileStatus[] listStatus(final Path f) throws IOException {

    final S3BucketObjectPair bop = this.directoryStructure.toBucketObjectPair(f);

    try {

      if (!bop.hasBucket()) {

        // Root directory: every bucket of the account becomes a child directory
        final List<Bucket> list = this.s3Client.listBuckets();
        final S3FileStatus[] array = new S3FileStatus[list.size()];
        final Iterator<Bucket> it = list.iterator();
        int i = 0;
        while (it.hasNext()) {
          final Bucket bucket = it.next();
          final long creationDate = dateToLong(bucket.getCreationDate());
          // S3 does not track access times, so this implementation always sets it to 0
          final S3FileStatus status = new S3FileStatus(extendPath(f, bucket.getName()
            + S3_DIRECTORY_SEPARATOR), 0, true, creationDate, 0L);
          array[i++] = status;
        }

        return array;
      }

      if (bop.hasBucket() && !bop.hasObject()) {

        // Check if the bucket really exists
        if (!this.s3Client.doesBucketExist(bop.getBucket())) {
          throw new FileNotFoundException("Cannot find " + f.toUri());
        }

        return listBucketContent(f, bop);

      } else {

        final ObjectMetadata omd = this.s3Client.getObjectMetadata(bop.getBucket(), bop.getObject());
        if (objectRepresentsDirectory(bop.getObject(), omd.getContentLength())) {

          return listBucketContent(f, bop);

        } else {
          // A regular object: its own status is the whole listing
          final S3FileStatus fileStatus = new S3FileStatus(f, omd.getContentLength(), false,
            dateToLong(omd.getLastModified()), 0L);

          return new FileStatus[] { fileStatus };
        }

      }

    } catch (AmazonClientException e) {
      throw new IOException(StringUtils.stringifyException(e));
    }
  }

  /**
   * Lists the direct children of the given bucket, or of the given directory object inside the
   * bucket. The listing is paged through {@link AmazonS3Client#listNextBatchOfObjects}, and each
   * object summary is filtered by its nesting depth so that only immediate children (not deeper
   * descendants) are returned.
   *
   * @param f
   *        the listed path, used as parent when building the child paths
   * @param bop
   *        the bucket/object pair the path resolves to
   * @return the statuses of the direct children
   * @throws IOException
   *         if a child path cannot be constructed
   */
  private S3FileStatus[] listBucketContent(final Path f, final S3BucketObjectPair bop) throws IOException {

    ObjectListing listing = null;
    final List<S3FileStatus> resultList = new ArrayList<S3FileStatus>();

    // Direct children of a directory object are one level deeper than the object itself
    final int depth = (bop.hasObject() ? getDepth(bop.getObject()) + 1 : 0);

    while (true) {

      if (listing == null) {
        if (bop.hasObject()) {
          listing = this.s3Client.listObjects(bop.getBucket(), bop.getObject());
        } else {
          listing = this.s3Client.listObjects(bop.getBucket());
        }
      } else {
        // Fetch the next page of a truncated listing
        listing = this.s3Client.listNextBatchOfObjects(listing);
      }

      final List<S3ObjectSummary> list = listing.getObjectSummaries();
      final Iterator<S3ObjectSummary> it = list.iterator();
      while (it.hasNext()) {

        final S3ObjectSummary os = it.next();
        String key = os.getKey();

        final int childDepth = getDepth(os.getKey());

        // Skip everything that is not a direct child
        if (childDepth != depth) {
          continue;
        }

        // Remove the prefix
        if (bop.hasObject()) {
          if (key.startsWith(bop.getObject())) {
            key = key.substring(bop.getObject().length());
          }

          // This has been the prefix itself
          if (key.isEmpty()) {
            continue;
          }
        }

        final long modificationDate = dateToLong(os.getLastModified());

        S3FileStatus fileStatus;
        if (objectRepresentsDirectory(os)) {
          fileStatus = new S3FileStatus(extendPath(f, key), 0, true, modificationDate, 0L);
        } else {
          fileStatus = new S3FileStatus(extendPath(f, key), os.getSize(), false, modificationDate, 0L);
        }

        resultList.add(fileStatus);
      }

      if (!listing.isTruncated()) {
        break;
      }
    }

    return resultList.toArray(new S3FileStatus[0]);

  }

  /**
   * Computes the nesting depth of an S3 object key, i.e. the number of directory separators it
   * contains. A single trailing separator (as used by directory placeholder objects) does not
   * count as an additional level.
   *
   * @param key
   *        the object key to examine
   * @return the nesting depth of the key
   */
  private static int getDepth(final String key) {

    int depth = 0;
    int nextStartPos = 0;

    final int length = key.length();

    while (nextStartPos < length) {

      final int sepPos = key.indexOf(S3_DIRECTORY_SEPARATOR, nextStartPos);
      if (sepPos < 0) {
        break;
      } else {
        ++depth;
        nextStartPos = sepPos + 1;
      }
    }

    // A trailing separator marks a directory object, not an extra level
    if (length > 0) {
      if (key.charAt(length - 1) == S3_DIRECTORY_SEPARATOR) {
        --depth;
      }
    }

    return depth;
  }


  /**
   * Deletes the file or directory addressed by the given path. Directories are emptied child by
   * child (only if {@code recursive} is set), then the bucket or directory object itself is
   * removed; the synthetic root directory is never deleted.
   *
   * @param f
   *        the path to delete
   * @param recursive
   *        whether non-empty directories may be deleted recursively
   * @return <code>true</code> on successful deletion; for the root directory, whether any child
   *         was deleted
   * @throws FileNotFoundException
   *         if the path does not exist
   * @throws IOException
   *         if a non-empty directory is deleted non-recursively or S3 communication fails
   */
  @Override
  public boolean delete(Path f, boolean recursive) throws IOException {

    try {
      final FileStatus fileStatus = getFileStatus(f); // Will throw a FileNotFoundException if f is invalid
      final S3BucketObjectPair bop = this.directoryStructure.toBucketObjectPair(f);

      if (fileStatus.isDir()) {

        boolean retVal = false;
        final FileStatus[] dirContent = listStatus(f);
        if (dirContent.length > 0) {
          // Directory is not empty
          if (!recursive) {
            throw new IOException("Found non-empty directory " + f
              + " while performing non-recursive delete");
          }

          for (final FileStatus entry : dirContent) {

            if (delete(entry.getPath(), true)) {
              retVal = true;
            }
          }
        }

        // Now the directory is empty

        if (!bop.hasBucket()) {
          // This is the root directory, do not delete this
          return retVal;
        }

        if (!bop.hasObject()) {
          // This is a real bucket
          this.s3Client.deleteBucket(bop.getBucket());
        } else {
          // This directory is actually represented by an object in S3
          this.s3Client.deleteObject(bop.getBucket(), bop.getObject());
        }
      } else {
        // This is a file
        this.s3Client.deleteObject(bop.getBucket(), bop.getObject());
      }
    } catch (AmazonClientException e) {
      throw new IOException(StringUtils.stringifyException(e));
    }

    return true;
  }


  /**
   * Creates the directory addressed by the given path, including all missing parents. The bucket
   * is created if necessary, and the directory object plus each missing parent is materialized
   * bottom-up as an empty placeholder object whose key ends with the directory separator.
   *
   * @param f
   *        the directory path to create
   * @return <code>true</code> if a new bucket was created, <code>false</code> otherwise
   *         (note: creating only directory objects does not set the return value)
   * @throws IOException
   *         if the communication with S3 fails
   */
  @Override
  public boolean mkdirs(final Path f) throws IOException {

    final S3BucketObjectPair bop = this.directoryStructure.toBucketObjectPair(f);
    if (!bop.hasBucket() && !bop.hasObject()) {
      // Ignore this call
      return false;
    }

    boolean retCode = false;

    try {

      // Make sure the bucket exists
      if (bop.hasBucket()) {
        if (this.s3Client.doesBucketExist(bop.getBucket())) {
          // Bucket already exists, nothing to do
        } else {
          this.s3Client.createBucket(bop.getBucket());
          retCode = true;
        }
      }

      if (bop.hasObject()) {

        // Make sure object name ends with a directory separator character
        String object = bop.getObject();
        if (!object.isEmpty()) {
          if (object.charAt(object.length() - 1) != S3_DIRECTORY_SEPARATOR) {
            object = object.concat(Character.toString(S3_DIRECTORY_SEPARATOR));
          }
        }

        // Walk upwards: create a placeholder for each missing level until an existing
        // ancestor (or the bucket root) is reached
        while (true) {

          try {
            this.s3Client.getObjectMetadata(bop.getBucket(), object);
          } catch (AmazonServiceException e) {
            if (e.getStatusCode() == HTTP_RESOURCE_NOT_FOUND_CODE) {
              createEmptyObject(bop.getBucket(), object);

              if (object.length() > 1) {
                // Truncate to the parent directory key (keeping its trailing separator)
                final int nextPos = object.lastIndexOf(S3_DIRECTORY_SEPARATOR, object.length() - 2);
                if (nextPos >= 0) {
                  object = object.substring(0, nextPos + 1);
                  continue;
                }
              }

            } else {
              // Rethrow the exception
              throw e;
            }
          }

          // Object already exists, exit
          break;
        }
      }
    } catch (AmazonClientException e) {
      throw new IOException(StringUtils.stringifyException(e));
    }

    return retCode;
  }

  /**
   * Creates a zero-length object with the given name, used as a directory placeholder in S3.
   *
   * @param bucketName
   *        the bucket to create the object in
   * @param objectName
   *        the key of the placeholder object (expected to end with the directory separator)
   */
  private void createEmptyObject(final String bucketName, final String objectName) {

    // An InputStream that is immediately at EOF yields a zero-length object
    final InputStream im = new InputStream() {

      @Override
      public int read() throws IOException {

        return -1;
      }
    };

    final ObjectMetadata om = new ObjectMetadata();
    om.setContentLength(0L);

    this.s3Client.putObject(bucketName, objectName, im, om);
  }


  /**
   * Creates a new object at the given path and opens it for writing. Replication and block size
   * are ignored since they have no S3 equivalent; the buffer size must be at least the S3
   * multipart-upload minimum.
   *
   * @param f
   *        the path of the object to create
   * @param overwrite
   *        whether an existing object at this path may be replaced
   * @param bufferSize
   *        the upload buffer size, at least {@link S3DataOutputStream#MINIMUM_MULTIPART_SIZE}
   * @param replication
   *        ignored
   * @param blockSize
   *        ignored
   * @return a stream writing to the new object
   * @throws IOException
   *         if the object exists and {@code overwrite} is <code>false</code>, the path does not
   *         resolve to a bucket/object pair, or the buffer is too small
   */
  @Override
  public FSDataOutputStream create(final Path f, final boolean overwrite, final int bufferSize,
      final short replication, final long blockSize)
      throws IOException {

    if (!overwrite && exists(f)) {
      throw new IOException(f.toUri() + " already exists");
    }

    final S3BucketObjectPair bop = this.directoryStructure.toBucketObjectPair(f);
    if (!bop.hasBucket() || !bop.hasObject()) {
      throw new IOException(f.toUri() + " is not a valid path to create a new file");
    }

    if (bufferSize < S3DataOutputStream.MINIMUM_MULTIPART_SIZE) {
      throw new IOException("Provided buffer must be at least " + S3DataOutputStream.MINIMUM_MULTIPART_SIZE
        + " bytes");
    }

    final byte[] buf = new byte[bufferSize]; // TODO: Use memory manager to allocate larger pages

    return new S3DataOutputStream(this.s3Client, bop.getBucket(), bop.getObject(), buf, this.useRRS);
  }


  /**
   * Creates a new object at the given path using the minimum multipart buffer size.
   */
  @Override
  public FSDataOutputStream create(final Path f, final boolean overwrite) throws IOException {

    return create(f, overwrite, S3DataOutputStream.MINIMUM_MULTIPART_SIZE, (short) 1, 1024L);
  }

  /**
   * Checks whether the given object summary describes a directory placeholder object.
   */
  private boolean objectRepresentsDirectory(final S3ObjectSummary os) {

    return objectRepresentsDirectory(os.getKey(), os.getSize());
  }

  /**
   * Checks whether an object with the given key and size represents a directory: the key must end
   * with the directory separator and the object must be empty.
   *
   * @param name
   *        the object key
   * @param size
   *        the object's content length in bytes
   * @return <code>true</code> if the object represents a directory, <code>false</code> otherwise
   */
  private boolean objectRepresentsDirectory(final String name, final long size) {

    if (name.isEmpty()) {
      return false;
    }

    if (name.charAt(name.length() - 1) == S3_DIRECTORY_SEPARATOR && size == 0L) {
      return true;
    }

    return false;
  }

  /**
   * Appends the given extension to the parent path, inserting or collapsing path separators so
   * that exactly one separator ends up between the two parts. The parent's scheme, authority,
   * query and fragment are preserved.
   *
   * @param parent
   *        the path to extend
   * @param extension
   *        the path component(s) to append; if empty, the parent is returned unchanged
   * @return the extended path
   * @throws IOException
   *         if the extended path is not a valid URI
   */
  static Path extendPath(final Path parent, final String extension) throws IOException {

    final URI parentUri = parent.toUri();

    if (extension.isEmpty()) {
      return parent;
    }

    final String path = parentUri.getPath();
    String extendedPath;
    if (path.isEmpty()) {
      if (extension.charAt(0) == Path.SEPARATOR_CHAR) {
        extendedPath = extension;
      } else {
        extendedPath = Path.SEPARATOR + extension;
      }
    } else {
      if (path.charAt(path.length() - 1) == Path.SEPARATOR_CHAR) {
        if (extension.charAt(0) == Path.SEPARATOR_CHAR) {
          // Both sides carry a separator: drop the duplicate
          if (extension.length() > 1) {
            extendedPath = path + extension.substring(1);
          } else {
            extendedPath = path;
          }
        } else {
          extendedPath = path + extension;
        }
      } else {
        if (extension.charAt(0) == Path.SEPARATOR_CHAR) {
          extendedPath = path + extension;
        } else {
          // Neither side carries a separator: insert one
          extendedPath = path + Path.SEPARATOR + extension;
        }
      }
    }

    try {
      final URI extendedUri = new URI(parentUri.getScheme(),
        ((parentUri.getAuthority() != null) ? parentUri.getAuthority() : ""), extendedPath,
        parentUri.getQuery(), parentUri.getFragment());
      return new Path(extendedUri);
    } catch (URISyntaxException e) {
      throw new IOException(StringUtils.stringifyException(e));
    }
  }


  /**
   * Renaming is not supported by this S3 file system implementation.
   *
   * @throws UnsupportedOperationException
   *         always
   */
  @Override
  public boolean rename(final Path src, final Path dst) throws IOException {

    throw new UnsupportedOperationException("This method is not yet implemented");
  }

  /**
   * Reports this file system as distributed, since the data resides in a remote service.
   */
  @Override
  public boolean isDistributedFS() {
    return true;
  }
}
// Related classes of org.apache.flink.runtime.fs.s3.S3FileSystem
// (The remainder of the original listing was website boilerplate — navigation links and a
// www.massapi.com copyright footer — and is not part of this source file.)