/**
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.cdk.data.filesystem;
import com.cloudera.cdk.data.Dataset;
import com.cloudera.cdk.data.DatasetDescriptor;
import com.cloudera.cdk.data.DatasetExistsException;
import com.cloudera.cdk.data.MetadataProvider;
import com.cloudera.cdk.data.MetadataProviderException;
import com.cloudera.cdk.data.impl.Accessor;
import com.cloudera.cdk.data.spi.AbstractMetadataProvider;
import com.google.common.base.Charsets;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.base.Supplier;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.io.Closeables;
import java.io.FileNotFoundException;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
/**
* <p>
* A {@link MetadataProvider} that stores dataset metadata in a Hadoop
* {@link FileSystem}.
* </p>
* <p>
* When configured with a root directory, this implementation serializes the
* information within a {@link com.cloudera.cdk.data.DatasetDescriptor} on the provided
* {@link FileSystem}. The descriptor is serialized as an Avro object and stored
* in a directory named after the dataset name. For example, if the dataset name
* is {@code logs}, the directory {@code rootDirectory/logs/} will be created,
* if it doesn't exist, and the serialized descriptor will be stored in the file
* {@code descriptor.avro}.
* </p>
*/
public class FileSystemMetadataProvider extends AbstractMetadataProvider {
private static final Logger logger = LoggerFactory
.getLogger(FileSystemMetadataProvider.class);
private static final String METADATA_DIRECTORY = ".metadata";
private static final String SCHEMA_FILE_NAME = "schema.avsc";
private static final String DESCRIPTOR_FILE_NAME = "descriptor.properties";
private static final String PARTITION_EXPRESSION_FIELD_NAME = "partitionExpression";
private static final String VERSION_FIELD_NAME = "version";
private static final String METADATA_VERSION = "1";
private static final String FORMAT_FIELD_NAME = "format";
private static final String LOCATION_FIELD_NAME = "location";
private static final Set<String> RESERVED_PROPERTIES = Sets.newHashSet(
PARTITION_EXPRESSION_FIELD_NAME, VERSION_FIELD_NAME, FORMAT_FIELD_NAME,
LOCATION_FIELD_NAME);
private final Configuration conf;
private final Path rootDirectory;
// cache the rootDirectory's FileSystem to avoid multiple lookups
private transient final FileSystem rootFileSystem;
public FileSystemMetadataProvider(Configuration conf, Path rootDirectory) {
Preconditions.checkArgument(conf != null, "Configuration cannot be null");
Preconditions.checkArgument(rootDirectory != null, "Root cannot be null");
this.conf = conf;
try {
this.rootFileSystem = rootDirectory.getFileSystem(conf);
this.rootDirectory = rootFileSystem.makeQualified(rootDirectory);
} catch (IOException ex) {
throw new MetadataProviderException(
"Cannot get FileSystem for root path", ex);
}
}
@Override
public DatasetDescriptor load(String name) {
Preconditions.checkArgument(name != null, "Name cannot be null");
logger.debug("Loading dataset metadata name:{}", name);
final Path metadataPath = pathForMetadata(name);
checkExists(rootFileSystem, metadataPath);
InputStream inputStream = null;
Properties properties = new Properties();
DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder();
Path descriptorPath = new Path(metadataPath, DESCRIPTOR_FILE_NAME);
boolean threw = true;
try {
inputStream = rootFileSystem.open(descriptorPath);
properties.load(inputStream);
threw = false;
} catch (IOException e) {
throw new MetadataProviderException(
"Unable to load descriptor file:" + descriptorPath + " for dataset:" + name, e);
} finally {
try {
Closeables.close(inputStream, threw);
} catch (IOException e) {
throw new MetadataProviderException(e);
}
}
if (properties.containsKey(FORMAT_FIELD_NAME)) {
builder.format(Accessor.getDefault().newFormat(
properties.getProperty(FORMAT_FIELD_NAME)));
}
if (properties.containsKey(PARTITION_EXPRESSION_FIELD_NAME)) {
builder.partitionStrategy(Accessor.getDefault().fromExpression(properties
.getProperty(PARTITION_EXPRESSION_FIELD_NAME)));
}
Path schemaPath = new Path(metadataPath, SCHEMA_FILE_NAME);
try {
builder.schemaUri(rootFileSystem.makeQualified(schemaPath).toUri());
} catch (IOException e) {
throw new MetadataProviderException(
"Unable to load schema file:" + schemaPath + " for dataset:" + name, e);
}
final Path location;
if (properties.containsKey(LOCATION_FIELD_NAME)) {
// the location should always be written by this library and validated
// when the descriptor is first created.
location = new Path(properties.getProperty(LOCATION_FIELD_NAME));
} else {
// backwards-compatibility: older versions didn't write this property
location = pathForDataset(name);
}
builder.location(location);
// custom properties
for (String property : properties.stringPropertyNames()) {
if (!RESERVED_PROPERTIES.contains(property)) {
builder.property(property, properties.getProperty(property));
}
}
return builder.build();
}
@Override
public DatasetDescriptor create(String name, DatasetDescriptor descriptor) {
Preconditions.checkArgument(name != null, "Name cannot be null");
Preconditions.checkArgument(descriptor != null,
"Descriptor cannot be null");
logger.debug("Saving dataset metadata name:{} descriptor:{}", name,
descriptor);
final Path dataLocation;
// If the descriptor has a location, use it.
if (descriptor.getLocation() != null) {
dataLocation = new Path(descriptor.getLocation());
} else {
dataLocation = pathForDataset(name);
}
final Path metadataLocation = pathForMetadata(name);
// get a DatasetDescriptor with the location set
DatasetDescriptor newDescriptor = new DatasetDescriptor.Builder(descriptor)
.location(dataLocation)
.build();
try {
if (rootFileSystem.exists(metadataLocation)) {
throw new DatasetExistsException(
"Descriptor directory:" + metadataLocation + " already exists");
}
// create the directory so that update can do the rest of the work
rootFileSystem.mkdirs(metadataLocation);
} catch (IOException e) {
throw new MetadataProviderException(
"Unable to create metadata directory:" + metadataLocation +
" for dataset:" + name, e);
}
writeDescriptor(rootFileSystem, metadataLocation, name, newDescriptor);
return newDescriptor;
}
@Override
public DatasetDescriptor update(String name, DatasetDescriptor descriptor) {
Preconditions.checkArgument(name != null, "Name cannot be null");
Preconditions.checkArgument(descriptor != null,
"Descriptor cannot be null");
logger.debug("Saving dataset metadata name:{} descriptor:{}", name,
descriptor);
writeDescriptor(
rootFileSystem, pathForMetadata(name), name, descriptor);
return descriptor;
}
@Override
public boolean delete(String name) {
Preconditions.checkArgument(name != null, "Name cannot be null");
logger.debug("Deleting dataset metadata name:{}", name);
final Path metadataDirectory = pathForMetadata(name);
try {
if (rootFileSystem.exists(metadataDirectory)) {
if (rootFileSystem.delete(metadataDirectory, true)) {
return true;
} else {
throw new IOException("Failed to delete metadata directory:"
+ metadataDirectory);
}
} else {
return false;
}
} catch (IOException e) {
throw new MetadataProviderException(
"Unable to find or delete metadata directory:" + metadataDirectory +
" for dataset:" + name, e);
}
}
@Override
public boolean exists(String name) {
Preconditions.checkArgument(name != null, "Name cannot be null");
final Path potentialPath = pathForMetadata(name);
try {
return rootFileSystem.exists(potentialPath);
} catch (IOException ex) {
throw new MetadataProviderException(
"Could not check metadata path:" + potentialPath, ex);
}
}
@Override
public List<String> list() {
List<String> datasets = Lists.newArrayList();
try {
FileStatus[] entries = rootFileSystem.listStatus(rootDirectory,
PathFilters.notHidden());
for (FileStatus entry : entries) {
// assumes that all unhidden directories under the root are data sets
if (entry.isDirectory() &&
rootFileSystem.exists(new Path(entry.getPath(), ".metadata"))) {
// may want to add a check: !RESERVED_NAMES.contains(name)
datasets.add(entry.getPath().getName());
} else {
continue;
}
}
} catch (FileNotFoundException ex) {
// the repo hasn't created any files yet
return datasets;
} catch (IOException ex) {
throw new MetadataProviderException("Could not list data sets", ex);
}
return datasets;
}
/**
* Returns the root directory where metadata is stored.
*
* @return a Path where {@link DatasetDescriptor}s are stored
*
* @since 0.8.0
*/
Path getRootDirectory() {
return rootDirectory;
}
/**
* Returns the file system where metadata is stored.
*
* @return a FileSystem
*
* @since 0.8.0
*/
FileSystem getFileSytem() {
return rootFileSystem;
}
@Override
public String toString() {
return Objects.toStringHelper(this)
.add("rootDirectory", rootDirectory)
.add("conf", conf).toString();
}
private Path pathForDataset(String name) {
Preconditions.checkState(rootDirectory != null,
"Dataset repository root directory can not be null");
return rootFileSystem.makeQualified(pathForDataset(rootDirectory, name));
}
private Path pathForMetadata(String name) {
Preconditions.checkState(rootDirectory != null,
"Dataset repository root directory can not be null");
return pathForMetadata(rootDirectory, name);
}
/**
* Writes the contents of a {@code Descriptor} to files.
*
* @param fs The {@link FileSystem} where data will be stored
* @param metadataLocation The directory {@link Path} where metadata files
* will be located
* @param name The {@link Dataset} name
* @param descriptor The {@code Descriptor} contents to write
*
* @throws MetadataProviderException If the {@code metadataLocation} does not
* exist or if any IOExceptions need to be
* propagated.
*/
private static void writeDescriptor(
FileSystem fs, Path metadataLocation, String name,
DatasetDescriptor descriptor) {
checkExists(fs, metadataLocation);
FSDataOutputStream outputStream = null;
final Path schemaPath = new Path(metadataLocation, SCHEMA_FILE_NAME);
boolean threw = true;
try {
outputStream = fs.create(schemaPath, true /* overwrite */ );
outputStream.write(descriptor.getSchema().toString(true)
.getBytes(Charsets.UTF_8));
outputStream.flush();
threw = false;
} catch (IOException e) {
throw new MetadataProviderException(
"Unable to save schema file:" + schemaPath + " for dataset:" + name, e);
} finally {
try {
Closeables.close(outputStream, threw);
} catch (IOException e) {
throw new MetadataProviderException(e);
}
}
Properties properties = new Properties();
properties.setProperty(VERSION_FIELD_NAME, METADATA_VERSION);
properties.setProperty(FORMAT_FIELD_NAME, descriptor.getFormat().getName());
final URI dataLocation = descriptor.getLocation();
if (dataLocation != null) {
properties.setProperty(LOCATION_FIELD_NAME, dataLocation.toString());
}
if (descriptor.isPartitioned()) {
properties.setProperty(PARTITION_EXPRESSION_FIELD_NAME,
Accessor.getDefault().toExpression(descriptor.getPartitionStrategy()));
}
// copy custom properties to the table
for (String property : descriptor.listProperties()) {
// no need to check the reserved list, those are not set on descriptors
properties.setProperty(property, descriptor.getProperty(property));
}
final Path descriptorPath = new Path(metadataLocation, DESCRIPTOR_FILE_NAME);
threw = true;
try {
outputStream = fs.create(descriptorPath, true /* overwrite */ );
properties.store(outputStream, "Dataset descriptor for " + name);
outputStream.flush();
threw = false;
} catch (IOException e) {
throw new MetadataProviderException(
"Unable to save descriptor file:" + descriptorPath + " for dataset:" + name, e);
} finally {
try {
Closeables.close(outputStream, threw);
} catch (IOException e) {
throw new MetadataProviderException(e);
}
}
}
/**
* Returns the correct metadata path for the given dataset.
* @param root A Path
* @param name A String dataset name
* @return the metadata Path
*/
private static Path pathForMetadata(Path root, String name) {
return new Path(pathForDataset(root, name), METADATA_DIRECTORY);
}
/**
* Returns the correct dataset path for the given name and root directory.
*
* @param root A Path
* @param name A String dataset name
* @return the correct dataset Path
*/
private static Path pathForDataset(Path root, String name) {
Preconditions.checkArgument(name != null, "Dataset name cannot be null");
// Why replace '.' here? Is this a namespacing hack?
return new Path(root, name.replace('.', Path.SEPARATOR_CHAR));
}
/**
* Precondition-style static validation that a dataset exists
*
* @param fs A FileSystem where the metadata should be stored
* @param location The Path where the metadata should be stored
* @throws com.cloudera.cdk.data.NoSuchDatasetException if the descriptor location is missing
* @throws MetadataProviderException if any IOException is thrown
*/
@SuppressWarnings("deprecation")
private static void checkExists(FileSystem fs, Path location) {
try {
if (!fs.exists(location)) {
throw new com.cloudera.cdk.data.NoSuchDatasetException("Descriptor location is missing: " +
location);
}
} catch (IOException ex) {
throw new MetadataProviderException("Cannot access descriptor location", ex);
}
}
/**
* A fluent builder to aid in the construction of {@link FileSystemMetadataProvider}
* instances.
* @since 0.8.0
*/
public static class Builder implements Supplier<FileSystemMetadataProvider> {
private Path rootDirectory;
private Configuration configuration;
/**
* The root directory for metadata files.
*
* @param path a Path to a FileSystem location
* @return this Builder for method chaining.
*/
public Builder rootDirectory(Path path) {
this.rootDirectory = path;
return this;
}
/**
* The {@link Configuration} used to find the {@link FileSystem}.
*/
public Builder configuration(Configuration configuration) {
this.configuration = configuration;
return this;
}
/**
* @deprecated will be removed in 0.11.0
*/
@Override
@Deprecated
public FileSystemMetadataProvider get() {
return build();
}
/**
* Build an instance of the configured {@link FileSystemMetadataProvider}.
*
* @since 0.9.0
*/
@SuppressWarnings("deprecation")
public FileSystemMetadataProvider build() {
return new FileSystemMetadataProvider(configuration, rootDirectory);
}
}
}