/**
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.cdk.data.filesystem;
import com.cloudera.cdk.data.Dataset;
import com.cloudera.cdk.data.DatasetDescriptor;
import com.cloudera.cdk.data.DatasetException;
import com.cloudera.cdk.data.DatasetReader;
import com.cloudera.cdk.data.DatasetWriter;
import com.cloudera.cdk.data.FieldPartitioner;
import com.cloudera.cdk.data.spi.Marker;
import com.cloudera.cdk.data.PartitionKey;
import com.cloudera.cdk.data.PartitionStrategy;
import com.cloudera.cdk.data.View;
import com.cloudera.cdk.data.impl.Accessor;
import com.cloudera.cdk.data.spi.AbstractDataset;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.io.IOException;
import java.util.List;
/**
 * A {@link Dataset} whose data lives under a single directory of a Hadoop
 * {@link FileSystem}.
 *
 * <p>Reading, writing, deletion and the marker-based range methods are all
 * delegated to an unbounded {@link FileSystemView} created over this dataset.
 * The {@link PartitionKey}-based partition methods are deprecated and operate
 * directly on sub-directories of the dataset directory.
 *
 * @param <E> the entity type handled by readers and writers of this dataset
 */
class FileSystemDataset<E> extends AbstractDataset<E> {

  private static final Logger logger = LoggerFactory
      .getLogger(FileSystemDataset.class);

  private final FileSystem fileSystem;
  private final Path directory;
  private final String name;
  private final DatasetDescriptor descriptor;
  // Not final only because the deprecated constructor assigns it after
  // delegating to the primary constructor.
  private PartitionKey partitionKey;
  // null when the descriptor is not partitioned
  private final PartitionStrategy partitionStrategy;
  // view over the whole dataset; most public operations delegate to it
  private final FileSystemView<E> unbounded;

  // reusable path converter, has no relevant state
  private final PathConversion convert;

  /**
   * Creates a dataset rooted at {@code directory}.
   *
   * @param fileSystem the file system that holds {@code directory}
   * @param directory  the dataset's root directory
   * @param name       the dataset name
   * @param descriptor the dataset descriptor; its partition strategy is used
   *                   when the descriptor reports itself as partitioned
   */
  FileSystemDataset(FileSystem fileSystem, Path directory, String name,
      DatasetDescriptor descriptor) {
    this.fileSystem = fileSystem;
    this.directory = directory;
    this.name = name;
    this.descriptor = descriptor;
    this.partitionStrategy =
        descriptor.isPartitioned() ? descriptor.getPartitionStrategy() : null;
    this.convert = new PathConversion();

    // The view is handed a reference to this dataset, so the fields above are
    // assigned first.
    this.unbounded = new FileSystemView<E>(this);

    // remove this.partitionKey for 0.11.0
    this.partitionKey = null;
  }

  /**
   * Creates a dataset that additionally remembers the partition key it was
   * loaded for.
   *
   * @param partitionKey the key of the partition this dataset represents, or
   *                     {@code null}
   * @deprecated will be removed in 0.11.0
   */
  @Deprecated
  FileSystemDataset(FileSystem fileSystem, Path directory, String name,
      DatasetDescriptor descriptor, @Nullable PartitionKey partitionKey) {
    this(fileSystem, directory, name, descriptor);
    this.partitionKey = partitionKey;
  }

  /** Returns the dataset name supplied at construction. */
  @Override
  public String getName() {
    return name;
  }

  /** Returns the descriptor supplied at construction. */
  @Override
  public DatasetDescriptor getDescriptor() {
    return descriptor;
  }

  /**
   * Returns the partition key this dataset was created with, or {@code null}.
   *
   * @deprecated will be removed in 0.11.0
   */
  @Deprecated
  PartitionKey getPartitionKey() {
    return partitionKey;
  }

  /** Returns the file system that holds this dataset's directory. */
  FileSystem getFileSystem() {
    return fileSystem;
  }

  /** Returns this dataset's root directory. */
  Path getDirectory() {
    return directory;
  }

  /** Returns a writer obtained from the unbounded view of this dataset. */
  @Override
  public DatasetWriter<E> newWriter() {
    logger.debug("Getting writer to dataset:{}", this);

    return unbounded.newWriter();
  }

  /** Returns a reader obtained from the unbounded view of this dataset. */
  @Override
  public DatasetReader<E> newReader() {
    logger.debug("Getting reader for dataset:{}", this);

    return unbounded.newReader();
  }

  /** Delegates to {@code deleteAll()} of the unbounded view. */
  @Override
  public boolean deleteAll() {
    return unbounded.deleteAll();
  }

  /**
   * Returns the covering partitions of the unbounded view.
   *
   * @throws IllegalStateException if the descriptor is not partitioned
   */
  @Override
  public Iterable<View<E>> getCoveringPartitions() {
    Preconditions.checkState(descriptor.isPartitioned(),
        "Attempt to get partitions on a non-partitioned dataset (name:%s)",
        name);

    return unbounded.getCoveringPartitions();
  }

  /** Returns the path iterator of the unbounded view. */
  PathIterator pathIterator() {
    return unbounded.pathIterator();
  }

  // The range methods below narrow the unbounded view; the casts rely on the
  // view returning FileSystemView instances from its own range methods.

  @Override
  public FileSystemView<E> from(Marker start) {
    return (FileSystemView<E>) unbounded.from(start);
  }

  @Override
  public FileSystemView<E> fromAfter(Marker start) {
    return (FileSystemView<E>) unbounded.fromAfter(start);
  }

  @Override
  public FileSystemView<E> to(Marker end) {
    return (FileSystemView<E>) unbounded.to(end);
  }

  @Override
  public FileSystemView<E> toBefore(Marker end) {
    return (FileSystemView<E>) unbounded.toBefore(end);
  }

  @Override
  public FileSystemView<E> of(Marker partial) {
    return (FileSystemView<E>) unbounded.of(partial);
  }

  /**
   * Loads the partition identified by {@code key} as a dataset of its own.
   *
   * <p>The partition directory is derived from the key values. The returned
   * dataset uses the sub-partition strategy that starts after the key's
   * fields and is located at the partition directory.
   *
   * @param key         the partition key; must not be {@code null}
   * @param allowCreate whether to create the partition directory when it does
   *                    not exist
   * @return the partition dataset, or {@code null} when the directory does not
   *         exist and {@code allowCreate} is {@code false}
   * @throws IllegalStateException    if the descriptor is not partitioned
   * @throws IllegalArgumentException if {@code key} is {@code null}
   * @throws DatasetException         if the file system cannot be queried or
   *                                  the directory cannot be created
   */
  @Override
  @Nullable
  @Deprecated
  public Dataset<E> getPartition(PartitionKey key, boolean allowCreate) {
    Preconditions.checkState(descriptor.isPartitioned(),
        "Attempt to get a partition on a non-partitioned dataset (name:%s)",
        name);
    // Same validation (and message) as dropPartition, instead of failing
    // later with a bare NullPointerException.
    Preconditions.checkArgument(key != null, "Partition key may not be null");

    logger.debug("Loading partition for key {}, allowCreate:{}", key,
        allowCreate);

    Path partitionDirectory = fileSystem.makeQualified(
        toDirectoryName(directory, key));

    try {
      if (!fileSystem.exists(partitionDirectory)) {
        if (allowCreate) {
          fileSystem.mkdirs(partitionDirectory);
        } else {
          return null;
        }
      }
    } catch (IOException e) {
      throw new DatasetException("Unable to locate or create dataset partition directory " + partitionDirectory, e);
    }

    int partitionDepth = key.getLength();
    PartitionStrategy subpartitionStrategy = Accessor.getDefault()
        .getSubpartitionStrategy(partitionStrategy, partitionDepth);

    return new FileSystemDataset.Builder()
        .name(name)
        .fileSystem(fileSystem)
        .descriptor(new DatasetDescriptor.Builder(descriptor)
            .location(partitionDirectory)
            .partitionStrategy(subpartitionStrategy)
            .build())
        .partitionKey(key)
        .build();
  }

  /**
   * Recursively deletes the directory of the partition identified by
   * {@code key}.
   *
   * @param key the partition key; must not be {@code null}
   * @throws IllegalStateException    if the descriptor is not partitioned
   * @throws IllegalArgumentException if {@code key} is {@code null}
   * @throws DatasetException         if the file system reports that nothing
   *                                  was deleted, or the delete fails
   */
  @Override
  @Deprecated
  public void dropPartition(PartitionKey key) {
    Preconditions.checkState(descriptor.isPartitioned(),
        "Attempt to drop a partition on a non-partitioned dataset (name:%s)",
        name);
    Preconditions.checkArgument(key != null, "Partition key may not be null");

    logger.debug("Dropping partition with key:{} dataset:{}", key, name);

    Path partitionDirectory = toDirectoryName(directory, key);

    try {
      if (!fileSystem.delete(partitionDirectory, true)) {
        throw new DatasetException("Partition directory " + partitionDirectory
            + " for key " + key + " does not exist");
      }
    } catch (IOException e) {
      throw new DatasetException("Unable to locate or drop dataset partition directory " + partitionDirectory, e);
    }
  }

  /**
   * Returns one dataset per entry directly under this dataset's directory.
   *
   * <p>Entries are listed with the {@code PathFilters.notHidden()} filter.
   * Each returned dataset is located at the entry's qualified path, carries
   * the key built from the entry's name, and uses the sub-partition strategy
   * starting at index 1.
   *
   * @throws IllegalStateException if the descriptor is not partitioned
   * @throws DatasetException      if the directory cannot be listed
   */
  @Override
  @Deprecated
  public Iterable<Dataset<E>> getPartitions() {
    Preconditions.checkState(descriptor.isPartitioned(),
        "Attempt to get partitions on a non-partitioned dataset (name:%s)",
        name);

    List<Dataset<E>> partitions = Lists.newArrayList();

    FileStatus[] fileStatuses;

    try {
      fileStatuses = fileSystem.listStatus(directory,
          PathFilters.notHidden());
    } catch (IOException e) {
      throw new DatasetException("Unable to list partition directory for directory " + directory, e);
    }

    for (FileStatus stat : fileStatuses) {
      Path p = fileSystem.makeQualified(stat.getPath());
      PartitionKey key = fromDirectoryName(p);
      PartitionStrategy subPartitionStrategy = Accessor.getDefault()
          .getSubpartitionStrategy(partitionStrategy, 1);
      Builder builder = new FileSystemDataset.Builder()
          .name(name)
          .fileSystem(fileSystem)
          .descriptor(new DatasetDescriptor.Builder(descriptor)
              .location(p)
              .partitionStrategy(subPartitionStrategy)
              .build())
          .partitionKey(key);

      partitions.add(builder.<E>build());
    }

    return partitions;
  }

  @Override
  public String toString() {
    // "dataDirectory" reports the same path as "directory"; both entries are
    // kept so the rendered string stays unchanged.
    return Objects.toStringHelper(this).add("name", name)
        .add("descriptor", descriptor).add("directory", directory)
        .add("dataDirectory", directory).add("partitionKey", partitionKey)
        .toString();
  }

  /**
   * Recursively collects the paths of all non-directory entries under
   * {@code directory} into {@code paths}, listing each level with the
   * {@code PathFilters.notHidden()} filter.
   *
   * @param directory the directory to walk
   * @param paths     the list that receives the file paths
   * @throws IOException if a directory cannot be listed
   */
  @Deprecated
  void accumulateDatafilePaths(Path directory, List<Path> paths)
      throws IOException {

    for (FileStatus status : fileSystem.listStatus(directory,
        PathFilters.notHidden())) {

      if (status.isDirectory()) {
        accumulateDatafilePaths(status.getPath(), paths);
      } else {
        paths.add(status.getPath());
      }
    }
  }

  /**
   * Appends one path component per value of {@code key} to {@code dir}. The
   * i-th component is produced from the i-th field partitioner of this
   * dataset's strategy and the i-th key value.
   */
  @SuppressWarnings("unchecked")
  private Path toDirectoryName(Path dir, PartitionKey key) {
    Path result = dir;
    for (int i = 0; i < key.getLength(); i++) {
      final FieldPartitioner fp = partitionStrategy.getFieldPartitioners().get(i);
      result = new Path(result, convert.dirnameForValue(fp, key.get(i)));
    }
    return result;
  }

  /**
   * Builds the key for a directory directly below this dataset: the values of
   * this dataset's own partition key (if it has one) followed by the value
   * parsed from the last component of {@code dir} using the first field
   * partitioner.
   */
  @SuppressWarnings("unchecked")
  private PartitionKey fromDirectoryName(Path dir) {
    final FieldPartitioner fp = partitionStrategy.getFieldPartitioners().get(0);
    final List<Object> values = Lists.newArrayList();

    if (partitionKey != null) {
      values.addAll(partitionKey.getValues());
    }

    values.add(convert.valueForDirname(fp, dir.getName()));

    return Accessor.getDefault().newPartitionKey(values.toArray());
  }

  /**
   * Builder for {@link FileSystemDataset} instances.
   *
   * <p>A name and a descriptor with a location are required, together with
   * either a {@link FileSystem} or a {@link Configuration} from which the file
   * system of the descriptor's location can be obtained.
   */
  public static class Builder {

    private Configuration conf;
    private FileSystem fileSystem;
    private String name;
    private DatasetDescriptor descriptor;
    private PartitionKey partitionKey;

    /** Sets the dataset name. */
    public Builder name(String name) {
      this.name = name;
      return this;
    }

    /** Sets the file system to use instead of resolving one at build time. */
    protected Builder fileSystem(FileSystem fs) {
      this.fileSystem = fs;
      return this;
    }

    /**
     * Sets the configuration used to obtain the file system when none was set
     * explicitly.
     */
    public Builder configuration(Configuration conf) {
      this.conf = conf;
      return this;
    }

    /**
     * Sets the dataset descriptor.
     *
     * @throws NullPointerException     if {@code descriptor} is {@code null}
     * @throws IllegalArgumentException if the descriptor has no location
     */
    public Builder descriptor(DatasetDescriptor descriptor) {
      // Still a NullPointerException for a null descriptor, but with a message.
      Preconditions.checkNotNull(descriptor,
          "Dataset descriptor cannot be null");
      Preconditions.checkArgument(descriptor.getLocation() != null,
          "Dataset location cannot be null");

      this.descriptor = descriptor;
      return this;
    }

    /** Sets the partition key recorded on the built dataset; may be null. */
    Builder partitionKey(@Nullable PartitionKey partitionKey) {
      this.partitionKey = partitionKey;
      return this;
    }

    /**
     * Builds the dataset, rooted at the descriptor's location qualified
     * against the file system in use.
     *
     * <p>This method does not modify the builder, so a builder that relies on
     * a {@link Configuration} resolves the file system again for each build
     * from the descriptor current at that time.
     *
     * @throws IllegalStateException if the name or descriptor is missing, or
     *                               neither a configuration nor a file system
     *                               was set
     * @throws DatasetException      if the file system cannot be obtained
     */
    public <E> FileSystemDataset<E> build() {
      Preconditions.checkState(this.name != null, "No dataset name defined");
      Preconditions.checkState(this.descriptor != null,
          "No dataset descriptor defined");
      Preconditions.checkState((conf != null) || (fileSystem != null),
          "Configuration or FileSystem must be set");

      final Path location = new Path(descriptor.getLocation());

      // Resolve into a local rather than the builder field so a later build
      // with a different descriptor does not reuse this file system.
      FileSystem fs = fileSystem;
      if (fs == null) {
        try {
          fs = location.getFileSystem(conf);
        } catch (IOException ex) {
          throw new DatasetException("Cannot access FileSystem", ex);
        }
      }

      Path absoluteDirectory = fs.makeQualified(location);
      return new FileSystemDataset<E>(
          fs, absoluteDirectory, name, descriptor, partitionKey);
    }
  }

}