/**
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kitesdk.data.spi.filesystem;
import com.google.common.annotations.VisibleForTesting;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.DatasetExistsException;
import org.kitesdk.data.DatasetIOException;
import org.kitesdk.data.DatasetNotFoundException;
import org.kitesdk.data.impl.Accessor;
import org.kitesdk.data.spi.AbstractDatasetRepository;
import org.kitesdk.data.spi.AbstractMetadataProvider;
import org.kitesdk.data.spi.MetadataProvider;
import com.google.common.base.Charsets;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.io.Closeables;
import java.io.FileNotFoundException;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.kitesdk.data.spi.Compatibility;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
/**
 * <p>
 * A {@link MetadataProvider} that stores dataset metadata in a Hadoop
 * {@link FileSystem}.
 * </p>
 * <p>
 * When configured with a root directory, this implementation serializes the
 * information within a {@link org.kitesdk.data.DatasetDescriptor} on the provided
 * {@link FileSystem}. The metadata is stored in a directory named
 * {@code .metadata} inside the dataset's directory under the root directory.
 * For example, if the dataset name is {@code logs}, a {@code .metadata}
 * directory will be created for it, if it doesn't exist, and the dataset's
 * Avro schema will be stored there in the file {@code schema.avsc} while the
 * remaining descriptor fields (format, compression type, location, partition
 * expression and custom properties) will be stored in the file
 * {@code descriptor.properties}.
 * </p>
 */
public class FileSystemMetadataProvider extends AbstractMetadataProvider {
// Logger shared by every instance of this provider.
private static final Logger LOG =
    LoggerFactory.getLogger(FileSystemMetadataProvider.class);

// Name of the directory that holds a dataset's metadata files, and the names
// of the two files written into it.
private static final String METADATA_DIRECTORY = ".metadata";
private static final String SCHEMA_FILE_NAME = "schema.avsc";
private static final String DESCRIPTOR_FILE_NAME = "descriptor.properties";

// Keys used inside the descriptor properties file.
private static final String PARTITION_EXPRESSION_FIELD_NAME = "partitionExpression";
private static final String VERSION_FIELD_NAME = "version";
private static final String FORMAT_FIELD_NAME = "format";
private static final String LOCATION_FIELD_NAME = "location";
private static final String COMPRESSION_TYPE_FIELD_NAME = "compressionType";

// Value stored under the "version" key whenever a descriptor is written.
private static final String METADATA_VERSION = "1";

// Namespace whose datasets may also live directly under the root directory.
private static final String DEFAULT_NAMESPACE = "default";

// Keys that are never surfaced as custom descriptor properties when loading.
private static final Set<String> RESERVED_PROPERTIES = Sets.newHashSet(
    PARTITION_EXPRESSION_FIELD_NAME,
    VERSION_FIELD_NAME,
    FORMAT_FIELD_NAME,
    LOCATION_FIELD_NAME,
    COMPRESSION_TYPE_FIELD_NAME);

private final Configuration conf;

// Root directory, qualified against rootFileSystem by the constructor.
private final Path rootDirectory;

// The root directory's FileSystem, looked up once and reused afterwards.
private final transient FileSystem rootFileSystem;
public FileSystemMetadataProvider(Configuration conf, Path rootDirectory) {
Preconditions.checkNotNull(conf, "Configuration cannot be null");
Preconditions.checkNotNull(rootDirectory, "Root directory cannot be null");
this.conf = conf;
try {
this.rootFileSystem = rootDirectory.getFileSystem(conf);
this.rootDirectory = rootFileSystem.makeQualified(rootDirectory);
} catch (IOException ex) {
throw new DatasetIOException("Cannot get FileSystem for root path", ex);
}
}
/**
 * Loads the descriptor stored for the given dataset.
 *
 * Reads the descriptor properties file from the dataset's metadata
 * directory, points the descriptor at the schema file stored next to it,
 * and rebuilds a {@link DatasetDescriptor} from both.
 *
 * @param namespace the dataset namespace; must not be null
 * @param name the dataset name; must not be null
 * @return the descriptor rebuilt from the stored metadata
 * @throws DatasetNotFoundException if no metadata directory is found
 * @throws DatasetIOException if the metadata files cannot be read
 */
@Override
public DatasetDescriptor load(String namespace, String name) {
  Preconditions.checkNotNull(namespace, "Namespace cannot be null");
  Preconditions.checkNotNull(name, "Dataset name cannot be null");

  LOG.debug("Loading dataset metadata name: {}", name);

  final Path metadataDir = find(namespace, name);
  final Properties stored = new Properties();
  final DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder();
  final Path descriptorFile = new Path(metadataDir, DESCRIPTOR_FILE_NAME);

  // read the properties file; a close failure is only reported when the
  // read itself succeeded
  InputStream in = null;
  boolean failed = true;
  try {
    in = rootFileSystem.open(descriptorFile);
    stored.load(in);
    failed = false;
  } catch (IOException e) {
    throw new DatasetIOException(
        "Unable to load descriptor file:" + descriptorFile
            + " for dataset:" + name, e);
  } finally {
    try {
      Closeables.close(in, failed);
    } catch (IOException e) {
      throw new DatasetIOException("Cannot close", e);
    }
  }

  // optional reserved fields
  if (stored.containsKey(FORMAT_FIELD_NAME)) {
    final String formatName = stored.getProperty(FORMAT_FIELD_NAME);
    builder.format(Accessor.getDefault().newFormat(formatName));
  }
  if (stored.containsKey(COMPRESSION_TYPE_FIELD_NAME)) {
    final String compression = stored.getProperty(COMPRESSION_TYPE_FIELD_NAME);
    builder.compressionType(compression);
  }
  if (stored.containsKey(PARTITION_EXPRESSION_FIELD_NAME)) {
    final String expression = stored.getProperty(PARTITION_EXPRESSION_FIELD_NAME);
    builder.partitionStrategy(Accessor.getDefault().fromExpression(expression));
  }

  // the schema lives in its own file next to the properties file
  final Path schemaFile = new Path(metadataDir, SCHEMA_FILE_NAME);
  try {
    final URI schemaUri = rootFileSystem.makeQualified(schemaFile).toUri();
    builder.schemaUri(schemaUri);
  } catch (IOException e) {
    throw new DatasetIOException(
        "Unable to load schema file:" + schemaFile + " for dataset:" + name, e);
  }

  // The stored location is always written by this library and validated when
  // the descriptor is first created. Older versions did not write it, but
  // data and metadata were always co-located then, so fall back to the
  // expected dataset path.
  final Path dataLocation = stored.containsKey(LOCATION_FIELD_NAME)
      ? new Path(stored.getProperty(LOCATION_FIELD_NAME))
      : expectedPathForDataset(namespace, name);
  builder.location(dataLocation);

  // everything that is not a reserved key is a custom property
  for (String key : stored.stringPropertyNames()) {
    if (RESERVED_PROPERTIES.contains(key)) {
      continue;
    }
    builder.property(key, stored.getProperty(key));
  }

  return builder.build();
}
/**
 * Stores metadata for a new dataset.
 *
 * Only the namespaced metadata location is checked for an existing dataset;
 * the backward-compatible location is deliberately not consulted when
 * creating new datasets.
 *
 * @param namespace the dataset namespace; must not be null
 * @param name the dataset name; must not be null
 * @param descriptor the descriptor to store; must not be null
 * @return the {@code descriptor} that was passed in
 * @throws DatasetExistsException if the metadata directory already exists
 * @throws DatasetIOException if the metadata directory or files cannot be
 *           created
 */
@Override
public DatasetDescriptor create(String namespace, String name, DatasetDescriptor descriptor) {
  Preconditions.checkNotNull(namespace, "Namespace cannot be null");
  Preconditions.checkNotNull(name, "Dataset name cannot be null");
  Preconditions.checkNotNull(descriptor, "Descriptor cannot be null");

  Compatibility.checkAndWarn(namespace, name, descriptor);

  LOG.debug("Saving dataset metadata name:{} descriptor:{}", name,
      descriptor);

  // no need to check backward-compatibility when creating new datasets
  Path metadataLocation = pathForMetadata(namespace, name);

  try {
    if (rootFileSystem.exists(metadataLocation)) {
      throw new DatasetExistsException(
          "Descriptor directory already exists: " + metadataLocation);
    }
    // create the directory so that writeDescriptor can do the rest of the
    // work; a false return is reported here rather than surfacing later as a
    // misleading "does not exist" error from writeDescriptor
    if (!rootFileSystem.mkdirs(metadataLocation)) {
      throw new IOException(
          "Failed to create metadata directory: " + metadataLocation);
    }
  } catch (IOException e) {
    throw new DatasetIOException(
        "Unable to create metadata directory: " + metadataLocation +
            " for dataset: " + name, e);
  }

  writeDescriptor(rootFileSystem, metadataLocation, name, descriptor);

  return descriptor;
}
/**
 * Overwrites the stored metadata of an existing dataset.
 *
 * @param namespace the dataset namespace; must not be null
 * @param name the dataset name; must not be null
 * @param descriptor the replacement descriptor; must not be null
 * @return the {@code descriptor} that was passed in
 * @throws DatasetNotFoundException if no metadata directory is found
 * @throws DatasetIOException if the metadata files cannot be written
 */
@Override
public DatasetDescriptor update(String namespace, String name, DatasetDescriptor descriptor) {
  Preconditions.checkNotNull(namespace, "Namespace cannot be null");
  Preconditions.checkNotNull(name, "Dataset name cannot be null");
  Preconditions.checkNotNull(descriptor, "Descriptor cannot be null");

  Compatibility.checkAndWarn(namespace, name, descriptor);

  LOG.debug("Saving dataset metadata name: {} descriptor: {}", name, descriptor);

  // locate the existing metadata directory (new or legacy layout) and
  // rewrite both metadata files in place
  final Path existingMetadataDir = find(namespace, name);
  writeDescriptor(rootFileSystem, existingMetadataDir, name, descriptor);

  return descriptor;
}
@Override
public boolean delete(String namespace, String name) {
Preconditions.checkNotNull(namespace, "Namespace cannot be null");
Preconditions.checkNotNull(name, "Dataset name cannot be null");
LOG.debug("Deleting dataset metadata name: {}", name);
Path metadataDirectory;
try {
metadataDirectory = find(namespace, name);
} catch (DatasetNotFoundException _) {
return false;
}
try {
if (rootFileSystem.exists(metadataDirectory)) {
if (rootFileSystem.delete(metadataDirectory, true)) {
return true;
} else {
throw new IOException("Failed to delete metadata directory:"
+ metadataDirectory);
}
} else {
return false;
}
} catch (IOException e) {
throw new DatasetIOException(
"Unable to find or delete metadata directory:" + metadataDirectory +
" for dataset:" + name, e);
}
}
/**
 * Checks whether metadata is stored for the given dataset.
 *
 * @param namespace the dataset namespace; must not be null
 * @param name the dataset name; must not be null
 * @return {@code true} if a metadata directory is found for the dataset
 */
@Override
public boolean exists(String namespace, String name) {
  Preconditions.checkNotNull(namespace, "Namespace cannot be null");
  Preconditions.checkNotNull(name, "Dataset name cannot be null");

  boolean found;
  try {
    // find() throws when neither metadata location exists
    find(namespace, name);
    found = true;
  } catch (DatasetNotFoundException e) {
    found = false;
  }
  return found;
}
/**
 * Lists the namespaces found under the root directory.
 *
 * A non-hidden sub-directory that contains at least one dataset is reported
 * by its name. A non-hidden sub-directory that is itself a dataset (no
 * namespace directory) is reported as the default namespace. Each namespace
 * appears at most once in the result.
 *
 * @return the namespace names, without duplicates, in discovery order
 * @throws DatasetIOException if the root directory cannot be listed
 */
@SuppressWarnings("deprecation")
@Override
public List<String> namespaces() {
  // collected in a set: every un-namespaced dataset maps to the same default
  // namespace, which may also exist as a real directory
  Set<String> namespaces = Sets.newLinkedHashSet();
  try {
    FileStatus[] entries = rootFileSystem.listStatus(rootDirectory,
        PathFilters.notHidden());
    for (FileStatus entry : entries) {
      if (!entry.isDir()) {
        continue;
      }
      Path path = entry.getPath();
      // may want to add a check: !RESERVED_NAMES.contains(name)
      if (isNamespace(path)) {
        namespaces.add(path.getName());
      } else if (isDataset(path)) {
        // add the default namespace for datasets with no namespace
        namespaces.add(DEFAULT_NAMESPACE);
      }
    }
  } catch (FileNotFoundException ex) {
    // the repo hasn't created any files yet: return what was found so far
  } catch (IOException ex) {
    throw new DatasetIOException("Could not list namespaces", ex);
  }
  return Lists.newArrayList(namespaces);
}
/**
 * Lists the names of the datasets in the given namespace.
 *
 * For the default namespace, datasets stored directly under the root
 * directory (with no namespace directory) are listed as well. A name found
 * in both places appears only once in the result.
 *
 * @param namespace the namespace to list; must not be null
 * @return the dataset names, without duplicates, in discovery order
 * @throws DatasetIOException if a directory cannot be listed
 */
@Override
public List<String> datasets(String namespace) {
  Preconditions.checkNotNull(namespace, "Namespace cannot be null");

  // collected in a set so a dataset present in both the legacy and the
  // namespaced location is reported once
  Set<String> datasets = Sets.newLinkedHashSet();
  try {
    // if using the default namespace, add datasets with no namespace dir
    if (DEFAULT_NAMESPACE.equals(namespace)) {
      addDatasetNames(rootDirectory, datasets);
    }
    addDatasetNames(new Path(rootDirectory, namespace), datasets);
  } catch (FileNotFoundException ex) {
    // the root or namespace directory doesn't exist yet: there is nothing
    // more to list, so return what was found so far
  } catch (IOException ex) {
    throw new DatasetIOException("Could not list datasets", ex);
  }
  return Lists.newArrayList(datasets);
}

/**
 * Adds the name of every non-hidden sub-directory of {@code dir} that
 * contains dataset metadata to {@code names}.
 *
 * @param dir the directory to scan
 * @param names the set that receives the dataset names
 * @throws IOException if {@code dir} cannot be listed
 */
@SuppressWarnings("deprecation")
private void addDatasetNames(Path dir, Set<String> names) throws IOException {
  FileStatus[] entries = rootFileSystem.listStatus(dir, PathFilters.notHidden());
  for (FileStatus entry : entries) {
    // may want to add a check: !RESERVED_NAMES.contains(name)
    if (entry.isDir() && isDataset(entry.getPath())) {
      names.add(entry.getPath().getName());
    }
  }
}
/**
 * Tells whether {@code dir} looks like a namespace, that is, whether at
 * least one of its non-hidden direct sub-directories holds {@code Dataset}
 * metadata.
 *
 * @param dir a Path to check
 * @return {@code true} if there is a direct sub-directory with metadata
 * @throws IOException if {@code dir} cannot be listed
 */
@SuppressWarnings("deprecation")
private boolean isNamespace(Path dir) throws IOException {
  final FileStatus[] children =
      rootFileSystem.listStatus(dir, PathFilters.notHidden());
  boolean containsDataset = false;
  // stop scanning as soon as one dataset directory has been seen
  for (int i = 0; i < children.length && !containsDataset; i++) {
    final FileStatus child = children[i];
    containsDataset = child.isDir() && isDataset(child.getPath());
  }
  return containsDataset;
}
/**
 * Tells whether {@code dir} holds {@code Dataset} metadata.
 *
 * @param dir a Path to check
 * @return {@code true} if {@code dir} has a {@code .metadata} sub-directory
 * @throws IOException if the file system cannot be queried
 */
private boolean isDataset(Path dir) throws IOException {
  final Path metadataDir = new Path(dir, METADATA_DIRECTORY);
  return rootFileSystem.isDirectory(metadataDir);
}
/**
 * Gives access to the qualified root directory under which this provider
 * keeps its metadata.
 *
 * @return the Path below which {@link DatasetDescriptor}s are stored
 *
 * @since 0.8.0
 */
Path getRootDirectory() {
  return this.rootDirectory;
}
/**
 * Gives access to the file system on which this provider keeps its metadata.
 *
 * The method name is misspelled ("Sytem"); it is left as is because callers
 * reference it by this name.
 *
 * @return the FileSystem of the root directory
 *
 * @since 0.8.0
 */
FileSystem getFileSytem() {
  return this.rootFileSystem;
}
/**
 * Describes this provider by its root directory and configuration.
 *
 * @return a string containing the {@code rootDirectory} and {@code conf}
 */
@Override
public String toString() {
  final Objects.ToStringHelper description = Objects.toStringHelper(this);
  description.add("rootDirectory", rootDirectory);
  description.add("conf", conf);
  return description.toString();
}
/**
 * Computes the qualified path a dataset is expected to have under the root
 * directory. {@code load} uses it when stored metadata has no location.
 *
 * @param namespace the dataset namespace
 * @param name the dataset name
 * @return the dataset path, qualified against the root file system
 */
private Path expectedPathForDataset(String namespace, String name) {
  final Path datasetPath =
      FileSystemDatasetRepository.pathForDataset(rootDirectory, namespace, name);
  return rootFileSystem.makeQualified(datasetPath);
}
/**
 * Computes the directory in which this MetadataProvider keeps the metadata
 * of a dataset, relative to this provider's root directory.
 *
 * This does not depend on where the dataset's data is actually stored,
 * although the two are usually co-located: the provider has to be able to
 * read metadata before it knows the Dataset's location.
 *
 * @param namespace The {@link Dataset} namespace
 * @param name The {@link Dataset} name
 * @return The directory {@link Path} where metadata files will be located
 */
private Path pathForMetadata(String namespace, String name) {
  return pathForMetadata(this.rootDirectory, namespace, name);
}
/**
 * Writes the contents of a {@code Descriptor} to files.
 *
 * The schema is written to the schema file and all other fields to the
 * descriptor properties file, both inside {@code metadataLocation}. Existing
 * files are overwritten. Custom descriptor properties whose names collide
 * with a reserved field name are skipped with a warning: copying them would
 * overwrite the real field, and {@code load} never returns reserved keys as
 * custom properties anyway.
 *
 * @param fs The {@link FileSystem} where data will be stored
 * @param metadataLocation The directory {@link Path} where metadata files
 *                         will be located
 * @param name The {@link Dataset} name
 * @param descriptor The {@code Descriptor} contents to write
 *
 * @throws org.kitesdk.data.DatasetNotFoundException
 *                      If the {@code metadataLocation} does not exist.
 * @throws org.kitesdk.data.DatasetIOException
 *                      If any IOExceptions need to be propagated.
 */
@VisibleForTesting
static void writeDescriptor(
    FileSystem fs, Path metadataLocation, String name,
    DatasetDescriptor descriptor) {

  checkExists(fs, metadataLocation);

  // 1. the schema file
  FSDataOutputStream outputStream = null;
  final Path schemaPath = new Path(metadataLocation, SCHEMA_FILE_NAME);
  boolean threw = true;
  try {
    outputStream = fs.create(schemaPath, true /* overwrite */ );
    outputStream.write(descriptor.getSchema().toString(true)
        .getBytes(Charsets.UTF_8));
    outputStream.flush();
    threw = false;
  } catch (IOException e) {
    throw new DatasetIOException(
        "Unable to save schema file: " + schemaPath +
            " for dataset: " + name, e);
  } finally {
    try {
      // a close failure is only reported when the write itself succeeded
      Closeables.close(outputStream, threw);
    } catch (IOException e) {
      throw new DatasetIOException("Cannot close", e);
    }
  }

  // 2. the reserved fields of the properties file
  Properties properties = new Properties();
  properties.setProperty(VERSION_FIELD_NAME, METADATA_VERSION);
  properties.setProperty(FORMAT_FIELD_NAME, descriptor.getFormat().getName());
  properties.setProperty(COMPRESSION_TYPE_FIELD_NAME, descriptor.getCompressionType().getName());

  final URI dataLocation = descriptor.getLocation();
  if (dataLocation != null) {
    properties.setProperty(LOCATION_FIELD_NAME, dataLocation.toString());
  }

  if (descriptor.isPartitioned()) {
    properties.setProperty(PARTITION_EXPRESSION_FIELD_NAME,
        Accessor.getDefault().toExpression(descriptor.getPartitionStrategy()));
  }

  // 3. custom properties, never allowed to overwrite a reserved field
  for (String property : descriptor.listProperties()) {
    if (RESERVED_PROPERTIES.contains(property)) {
      LOG.warn("Not storing custom property with reserved name:{} for dataset:{}",
          property, name);
      continue;
    }
    properties.setProperty(property, descriptor.getProperty(property));
  }

  // 4. the properties file
  final Path descriptorPath = new Path(metadataLocation, DESCRIPTOR_FILE_NAME);
  threw = true;
  try {
    outputStream = fs.create(descriptorPath, true /* overwrite */ );
    properties.store(outputStream, "Dataset descriptor for " + name);
    outputStream.flush();
    threw = false;
  } catch (IOException e) {
    throw new DatasetIOException(
        "Unable to save descriptor file: " + descriptorPath +
            " for dataset: " + name, e);
  } finally {
    try {
      Closeables.close(outputStream, threw);
    } catch (IOException e) {
      throw new DatasetIOException("Cannot close", e);
    }
  }
}
/**
 * Computes the metadata directory of a dataset: the {@code .metadata}
 * directory inside the dataset path under {@code root}.
 *
 * @param root A Path under which datasets are stored
 * @param namespace A String dataset namespace
 * @param name A String dataset name
 * @return the metadata Path
 */
private static Path pathForMetadata(Path root, String namespace, String name) {
  final Path datasetPath =
      FileSystemDatasetRepository.pathForDataset(root, namespace, name);
  return new Path(datasetPath, METADATA_DIRECTORY);
}
/**
 * Precondition-style static check that a metadata location is present.
 *
 * @param fs A FileSystem where the metadata should be stored
 * @param location The Path where the metadata should be stored
 * @throws org.kitesdk.data.DatasetNotFoundException if the descriptor location is missing
 * @throws org.kitesdk.data.DatasetIOException if any IOException is thrown
 */
private static void checkExists(FileSystem fs, Path location) {
  final boolean present;
  try {
    present = fs.exists(location);
  } catch (IOException ex) {
    throw new DatasetIOException(
        "Cannot access descriptor location: " + location, ex);
  }

  if (!present) {
    throw new DatasetNotFoundException(
        "Descriptor location does not exist: " + location);
  }
}
/**
* This method provides backward-compatibility for finding metadata.
* <p>
* This handles the case where an existing program is opening a
* DatasetRepository by URI. For example, the DatasetSink and maven plugin do
* this. In that case, the repository URI will directly contain a directory
* named for the dataset with .metadata in it. This checks for the updated
* scheme and falls back to the old scheme if the namespace is "default".
*
* @param namespace the requested namespace.
* @param name the dataset name.
* @return a Path to the correct metadata directory
* @throws DatasetNotFoundException if neither location has metadata
*/
private Path find(String namespace, String name) {
Path expectedPath = pathForMetadata(namespace, name);
if (DEFAULT_NAMESPACE.equals(namespace)) {
// when using the default namespace, the namespace may not be in the path
try {
checkExists(rootFileSystem, expectedPath);
return expectedPath;
} catch (DatasetNotFoundException e) {
try {
Path backwardCompatiblePath = new Path(rootDirectory, new Path(
name.replace('.', Path.SEPARATOR_CHAR), METADATA_DIRECTORY));
checkExists(rootFileSystem, backwardCompatiblePath);
return backwardCompatiblePath;
} catch (DatasetNotFoundException _) {
throw e; // throw the original
}
}
} else {
// no need to check other locations
checkExists(rootFileSystem, expectedPath);
return expectedPath;
}
}
}