Source Code of com.cloudera.cdk.data.filesystem.FileSystemMetadataProvider

/**
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.cdk.data.filesystem;

import com.cloudera.cdk.data.Dataset;
import com.cloudera.cdk.data.DatasetDescriptor;
import com.cloudera.cdk.data.DatasetExistsException;
import com.cloudera.cdk.data.MetadataProvider;
import com.cloudera.cdk.data.MetadataProviderException;
import com.cloudera.cdk.data.impl.Accessor;
import com.cloudera.cdk.data.spi.AbstractMetadataProvider;
import com.google.common.base.Charsets;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.base.Supplier;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.io.Closeables;
import java.io.FileNotFoundException;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;

/**
* <p>
* A {@link MetadataProvider} that stores dataset metadata in a Hadoop
* {@link FileSystem}.
* </p>
 * <p>
 * When configured with a root directory, this implementation stores the
 * information from a {@link com.cloudera.cdk.data.DatasetDescriptor} on the provided
 * {@link FileSystem}. The metadata is written to a {@code .metadata} directory
 * under a directory named after the dataset. For example, if the dataset name
 * is {@code logs}, the directory {@code rootDirectory/logs/.metadata/} will be
 * created, if it doesn't exist, and the descriptor will be stored as Java
 * properties in {@code descriptor.properties} alongside the Avro schema in
 * {@code schema.avsc}.
 * </p>
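 * <p>
 * A minimal usage sketch (the configuration, root directory, schema URI, and
 * dataset name below are hypothetical):
 * </p>
 * <pre>{@code
 * Configuration conf = new Configuration();
 * FileSystemMetadataProvider provider = new FileSystemMetadataProvider.Builder()
 *     .configuration(conf)
 *     .rootDirectory(new Path("/data"))
 *     .build();
 *
 * // schemaUri(...) reads the schema at the given location and may throw IOException
 * DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
 *     .schemaUri(URI.create("hdfs://namenode/schemas/event.avsc"))
 *     .build();
 *
 * provider.create("logs", descriptor);   // writes /data/logs/.metadata/
 * DatasetDescriptor loaded = provider.load("logs");
 * }</pre>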
*/
public class FileSystemMetadataProvider extends AbstractMetadataProvider {

  private static final Logger logger = LoggerFactory
    .getLogger(FileSystemMetadataProvider.class);

  private static final String METADATA_DIRECTORY = ".metadata";
  private static final String SCHEMA_FILE_NAME = "schema.avsc";
  private static final String DESCRIPTOR_FILE_NAME = "descriptor.properties";
  private static final String PARTITION_EXPRESSION_FIELD_NAME = "partitionExpression";
  private static final String VERSION_FIELD_NAME = "version";
  private static final String METADATA_VERSION = "1";
  private static final String FORMAT_FIELD_NAME = "format";
  private static final String LOCATION_FIELD_NAME = "location";

  private static final Set<String> RESERVED_PROPERTIES = Sets.newHashSet(
      PARTITION_EXPRESSION_FIELD_NAME, VERSION_FIELD_NAME, FORMAT_FIELD_NAME,
      LOCATION_FIELD_NAME);

  private final Configuration conf;
  private final Path rootDirectory;

  // cache the rootDirectory's FileSystem to avoid multiple lookups
  private transient final FileSystem rootFileSystem;

  public FileSystemMetadataProvider(Configuration conf, Path rootDirectory) {
    Preconditions.checkArgument(conf != null, "Configuration cannot be null");
    Preconditions.checkArgument(rootDirectory != null, "Root cannot be null");

    this.conf = conf;
    try {
      this.rootFileSystem = rootDirectory.getFileSystem(conf);
      this.rootDirectory = rootFileSystem.makeQualified(rootDirectory);
    } catch (IOException ex) {
      throw new MetadataProviderException(
          "Cannot get FileSystem for root path", ex);
    }
  }

  @Override
  public DatasetDescriptor load(String name) {
    Preconditions.checkArgument(name != null, "Name cannot be null");

    logger.debug("Loading dataset metadata name:{}", name);

    final Path metadataPath = pathForMetadata(name);
    checkExists(rootFileSystem, metadataPath);

    InputStream inputStream = null;
    Properties properties = new Properties();
    DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder();
    Path descriptorPath = new Path(metadataPath, DESCRIPTOR_FILE_NAME);

    boolean threw = true;
    try {
      inputStream = rootFileSystem.open(descriptorPath);
      properties.load(inputStream);
      threw = false;
    } catch (IOException e) {
      throw new MetadataProviderException(
          "Unable to load descriptor file:" + descriptorPath + " for dataset:" + name, e);
    } finally {
      try {
        Closeables.close(inputStream, threw);
      } catch (IOException e) {
        throw new MetadataProviderException(e);
      }
    }

    if (properties.containsKey(FORMAT_FIELD_NAME)) {
      builder.format(Accessor.getDefault().newFormat(
          properties.getProperty(FORMAT_FIELD_NAME)));
    }
    if (properties.containsKey(PARTITION_EXPRESSION_FIELD_NAME)) {
      builder.partitionStrategy(Accessor.getDefault().fromExpression(properties
          .getProperty(PARTITION_EXPRESSION_FIELD_NAME)));
    }
    Path schemaPath = new Path(metadataPath, SCHEMA_FILE_NAME);
    try {
      builder.schemaUri(rootFileSystem.makeQualified(schemaPath).toUri());
    } catch (IOException e) {
      throw new MetadataProviderException(
        "Unable to load schema file:" + schemaPath + " for dataset:" + name, e);
    }

    final Path location;
    if (properties.containsKey(LOCATION_FIELD_NAME)) {
      // the location should always be written by this library and validated
      // when the descriptor is first created.
      location = new Path(properties.getProperty(LOCATION_FIELD_NAME));
    } else {
      // backwards-compatibility: older versions didn't write this property
      location = pathForDataset(name);
    }
    builder.location(location);

    // custom properties
    for (String property : properties.stringPropertyNames()) {
      if (!RESERVED_PROPERTIES.contains(property)) {
        builder.property(property, properties.getProperty(property));
      }
    }

    return builder.build();
  }

  @Override
  public DatasetDescriptor create(String name, DatasetDescriptor descriptor) {
    Preconditions.checkArgument(name != null, "Name cannot be null");
    Preconditions.checkArgument(descriptor != null,
        "Descriptor cannot be null");

    logger.debug("Saving dataset metadata name:{} descriptor:{}", name,
        descriptor);

    final Path dataLocation;

    // If the descriptor has a location, use it.
    if (descriptor.getLocation() != null) {
      dataLocation = new Path(descriptor.getLocation());
    } else {
      dataLocation = pathForDataset(name);
    }

    final Path metadataLocation = pathForMetadata(name);

    // get a DatasetDescriptor with the location set
    DatasetDescriptor newDescriptor = new DatasetDescriptor.Builder(descriptor)
        .location(dataLocation)
        .build();

    try {
      if (rootFileSystem.exists(metadataLocation)) {
        throw new DatasetExistsException(
            "Descriptor directory:" + metadataLocation + " already exists");
      }
      // create the directory so that update can do the rest of the work
      rootFileSystem.mkdirs(metadataLocation);
    } catch (IOException e) {
      throw new MetadataProviderException(
          "Unable to create metadata directory:" + metadataLocation +
          " for dataset:" + name, e);
    }

    writeDescriptor(rootFileSystem, metadataLocation, name, newDescriptor);

    return newDescriptor;
  }

  @Override
  public DatasetDescriptor update(String name, DatasetDescriptor descriptor) {
    Preconditions.checkArgument(name != null, "Name cannot be null");
    Preconditions.checkArgument(descriptor != null,
        "Descriptor cannot be null");

    logger.debug("Saving dataset metadata name:{} descriptor:{}", name,
      descriptor);

    writeDescriptor(
        rootFileSystem, pathForMetadata(name), name, descriptor);

    return descriptor;
  }

  @Override
  public boolean delete(String name) {
    Preconditions.checkArgument(name != null, "Name cannot be null");

    logger.debug("Deleting dataset metadata name:{}", name);

    final Path metadataDirectory = pathForMetadata(name);

    try {
      if (rootFileSystem.exists(metadataDirectory)) {
        if (rootFileSystem.delete(metadataDirectory, true)) {
          return true;
        } else {
          throw new IOException("Failed to delete metadata directory:"
            + metadataDirectory);
        }
      } else {
        return false;
      }
    } catch (IOException e) {
      throw new MetadataProviderException(
          "Unable to find or delete metadata directory:" + metadataDirectory +
          " for dataset:" + name, e);
    }
  }

  @Override
  public boolean exists(String name) {
    Preconditions.checkArgument(name != null, "Name cannot be null");

    final Path potentialPath = pathForMetadata(name);
    try {
      return rootFileSystem.exists(potentialPath);
    } catch (IOException ex) {
      throw new MetadataProviderException(
          "Could not check metadata path:" + potentialPath, ex);
    }
  }

  @Override
  public List<String> list() {
    List<String> datasets = Lists.newArrayList();
    try {
      FileStatus[] entries = rootFileSystem.listStatus(rootDirectory,
          PathFilters.notHidden());
      for (FileStatus entry : entries) {
        // assume that any non-hidden directory under the root that contains a
        // .metadata subdirectory is a dataset
        if (entry.isDirectory() &&
            rootFileSystem.exists(new Path(entry.getPath(), METADATA_DIRECTORY))) {
          // may want to add a check: !RESERVED_NAMES.contains(name)
          datasets.add(entry.getPath().getName());
        }
      }
    } catch (FileNotFoundException ex) {
      // the repo hasn't created any files yet
      return datasets;
    } catch (IOException ex) {
      throw new MetadataProviderException("Could not list data sets", ex);
    }
    return datasets;
  }

  /**
   * Returns the root directory where metadata is stored.
   *
   * @return a Path where {@link DatasetDescriptor}s are stored
   *
   * @since 0.8.0
   */
  Path getRootDirectory() {
    return rootDirectory;
  }

  /**
   * Returns the file system where metadata is stored.
   *
   * @return a FileSystem
   *
   * @since 0.8.0
   */
  FileSystem getFileSystem() {
    return rootFileSystem;
  }

  @Override
  public String toString() {
    return Objects.toStringHelper(this)
        .add("rootDirectory", rootDirectory)
        .add("conf", conf).toString();
  }

  private Path pathForDataset(String name) {
    Preconditions.checkState(rootDirectory != null,
      "Dataset repository root directory can not be null");

    return rootFileSystem.makeQualified(pathForDataset(rootDirectory, name));
  }

  private Path pathForMetadata(String name) {
    Preconditions.checkState(rootDirectory != null,
      "Dataset repository root directory can not be null");

    return pathForMetadata(rootDirectory, name);
  }

  /**
   * Writes the contents of a {@code Descriptor} to files.
   *
   * @param fs                The {@link FileSystem} where data will be stored
   * @param metadataLocation  The directory {@link Path} where metadata files
   *                          will be located
   * @param name              The {@link Dataset} name
   * @param descriptor        The {@code Descriptor} contents to write
   *
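   * <p>
   * On success, {@code metadataLocation} contains two files: {@code schema.avsc},
   * holding the Avro schema as JSON, and {@code descriptor.properties}, holding
   * the {@code version} and {@code format} entries, the {@code location} and
   * {@code partitionExpression} entries when set, and any custom properties.
   * </p>
   *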
   * @throws MetadataProviderException  If the {@code metadataLocation} does not
   *                                    exist or if an {@code IOException} is
   *                                    thrown while writing.
   */
  private static void writeDescriptor(
      FileSystem fs, Path metadataLocation, String name,
      DatasetDescriptor descriptor) {

    checkExists(fs, metadataLocation);

    FSDataOutputStream outputStream = null;
    final Path schemaPath = new Path(metadataLocation, SCHEMA_FILE_NAME);
    boolean threw = true;
    try {
      outputStream = fs.create(schemaPath, true /* overwrite */ );
      outputStream.write(descriptor.getSchema().toString(true)
          .getBytes(Charsets.UTF_8));
      outputStream.flush();
      threw = false;
    } catch (IOException e) {
      throw new MetadataProviderException(
          "Unable to save schema file:" + schemaPath + " for dataset:" + name, e);
    } finally {
      try {
        Closeables.close(outputStream, threw);
      } catch (IOException e) {
        throw new MetadataProviderException(e);
      }
    }

    Properties properties = new Properties();
    properties.setProperty(VERSION_FIELD_NAME, METADATA_VERSION);
    properties.setProperty(FORMAT_FIELD_NAME, descriptor.getFormat().getName());

    final URI dataLocation = descriptor.getLocation();
    if (dataLocation != null) {
      properties.setProperty(LOCATION_FIELD_NAME, dataLocation.toString());
    }

    if (descriptor.isPartitioned()) {
      properties.setProperty(PARTITION_EXPRESSION_FIELD_NAME,
          Accessor.getDefault().toExpression(descriptor.getPartitionStrategy()));
    }

    // copy custom properties into the properties file
    for (String property : descriptor.listProperties()) {
      // no need to check the reserved list, those are not set on descriptors
      properties.setProperty(property, descriptor.getProperty(property));
    }

    final Path descriptorPath = new Path(metadataLocation, DESCRIPTOR_FILE_NAME);
    threw = true;
    try {
      outputStream = fs.create(descriptorPath, true /* overwrite */ );
      properties.store(outputStream, "Dataset descriptor for " + name);
      outputStream.flush();
      threw = false;
    } catch (IOException e) {
      throw new MetadataProviderException(
          "Unable to save descriptor file:" + descriptorPath + " for dataset:" + name, e);
    } finally {
      try {
        Closeables.close(outputStream, threw);
      } catch (IOException e) {
        throw new MetadataProviderException(e);
      }
    }
  }

  /**
   * Returns the correct metadata path for the given dataset.
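   * For example, with root {@code /data} and name {@code "logs"}, the result
   * is {@code /data/logs/.metadata}.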
   * @param root A Path
   * @param name A String dataset name
   * @return the metadata Path
   */
  private static Path pathForMetadata(Path root, String name) {
    return new Path(pathForDataset(root, name), METADATA_DIRECTORY);
  }

  /**
   * Returns the correct dataset path for the given name and root directory.
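   * Dots in the name map to path separators, so {@code pathForDataset(root, "a.b")}
   * resolves to {@code root/a/b}.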
   *
   * @param root A Path
   * @param name A String dataset name
   * @return the correct dataset Path
   */
  private static Path pathForDataset(Path root, String name) {
    Preconditions.checkArgument(name != null, "Dataset name cannot be null");

    // convert '.' to the path separator so that dotted (namespaced) dataset
    // names map to nested directories under the root
    return new Path(root, name.replace('.', Path.SEPARATOR_CHAR));
  }

  /**
   * Precondition-style static validation that a dataset exists
   *
   * @param fs        A FileSystem where the metadata should be stored
   * @param location  The Path where the metadata should be stored
   * @throws com.cloudera.cdk.data.NoSuchDatasetException if the descriptor location is missing
   * @throws MetadataProviderException  if any IOException is thrown
   */
  @SuppressWarnings("deprecation")
  private static void checkExists(FileSystem fs, Path location) {
    try {
      if (!fs.exists(location)) {
        throw new com.cloudera.cdk.data.NoSuchDatasetException("Descriptor location is missing: " +
            location);
      }
    } catch (IOException ex) {
      throw new MetadataProviderException("Cannot access descriptor location", ex);
    }
  }

  /**
   * A fluent builder to aid in the construction of {@link FileSystemMetadataProvider}
   * instances.
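   * <p>
   * For example (the root directory below is hypothetical):
   * </p>
   * <pre>{@code
   * FileSystemMetadataProvider provider = new FileSystemMetadataProvider.Builder()
   *     .configuration(new Configuration())
   *     .rootDirectory(new Path("/data"))
   *     .build();
   * }</pre>
   *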
   * @since 0.8.0
   */
  public static class Builder implements Supplier<FileSystemMetadataProvider> {

    private Path rootDirectory;
    private Configuration configuration;

    /**
     * The root directory for metadata files.
     *
     * @param path a Path to a FileSystem location
     * @return this Builder for method chaining.
     */
    public Builder rootDirectory(Path path) {
      this.rootDirectory = path;
      return this;
    }

    /**
     * The {@link Configuration} used to find the {@link FileSystem}.
     */
    public Builder configuration(Configuration configuration) {
      this.configuration = configuration;
      return this;
    }

    /**
     * @deprecated will be removed in 0.11.0
     */
    @Override
    @Deprecated
    public FileSystemMetadataProvider get() {
      return build();
    }

    /**
     * Build an instance of the configured {@link FileSystemMetadataProvider}.
     *
     * @since 0.9.0
     */
    @SuppressWarnings("deprecation")
    public FileSystemMetadataProvider build() {
      return new FileSystemMetadataProvider(configuration, rootDirectory);
    }
  }

}