Package org.kitesdk.data.spi.filesystem

Source Code of org.kitesdk.data.spi.filesystem.FileSystemDataset$Builder

/**
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kitesdk.data.spi.filesystem;

import com.google.common.collect.Sets;

import java.util.Iterator;
import java.util.Set;
import org.apache.hadoop.mapreduce.InputFormat;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.DatasetIOException;
import org.kitesdk.data.spi.Compatibility;
import org.kitesdk.data.spi.PartitionKey;
import org.kitesdk.data.PartitionStrategy;
import org.kitesdk.data.RefinableView;
import org.kitesdk.data.impl.Accessor;
import org.kitesdk.data.spi.AbstractDataset;
import org.kitesdk.data.spi.Constraints;
import org.kitesdk.data.spi.FieldPartitioner;
import org.kitesdk.data.spi.InputFormatAccessor;
import org.kitesdk.data.spi.LastModifiedAccessor;
import org.kitesdk.data.spi.Mergeable;
import org.kitesdk.data.spi.PartitionListener;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.kitesdk.data.spi.PartitionedDataset;
import org.kitesdk.data.spi.SizeAccessor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import java.io.IOException;
import java.net.URI;
import java.util.List;
import org.apache.avro.generic.IndexedRecord;
import org.kitesdk.data.Formats;

@SuppressWarnings("deprecation")
public class FileSystemDataset<E> extends AbstractDataset<E> implements
    Mergeable<FileSystemDataset<E>>, InputFormatAccessor<E>, LastModifiedAccessor,
    PartitionedDataset<E>, SizeAccessor {

  // Class-wide SLF4J logger.
  private static final Logger LOG = LoggerFactory
    .getLogger(FileSystemDataset.class);

  // File system used for every exists/mkdirs/delete/rename/listStatus call
  // made by this dataset.
  private final FileSystem fileSystem;
  // Root directory of this dataset; partition directories are resolved
  // against it.
  private final Path directory;
  // Namespace and name, reported to the partition listener and by toString().
  private final String namespace;
  private final String name;
  // Descriptor supplying the schema, format and (optional) partition strategy.
  private final DatasetDescriptor descriptor;
  // Not final: the partition-key constructor assigns it after delegating to
  // the primary constructor. Null when no key was supplied.
  private PartitionKey partitionKey;
  private final URI uri;

  // Null when the descriptor is not partitioned.
  private final PartitionStrategy partitionStrategy;
  // Optional callback notified when a partition directory is added; may be
  // null.
  private final PartitionListener partitionListener;

  // View over this dataset created without constraints; most public
  // operations delegate to it.
  private final FileSystemView<E> unbounded;

  // reusable path converter, has no relevant state
  private final PathConversion convert;

  FileSystemDataset(FileSystem fileSystem, Path directory,
                    String namespace, String name,
                    DatasetDescriptor descriptor, URI uri,
                    @Nullable PartitionListener partitionListener,
                    Class<E> type) {
    super(type, descriptor.getSchema());
    if (Formats.PARQUET.equals(descriptor.getFormat())) {
      Preconditions.checkArgument(IndexedRecord.class.isAssignableFrom(type) ||
          type == Object.class,
          "Parquet only supports generic and specific data models, type"
          + " parameter must implement IndexedRecord");
    }

    this.fileSystem = fileSystem;
    this.directory = directory;
    this.namespace = namespace;
    this.name = name;
    this.descriptor = descriptor;
    this.partitionStrategy =
        descriptor.isPartitioned() ? descriptor.getPartitionStrategy() : null;
    this.partitionListener = partitionListener;
    this.convert = new PathConversion(descriptor.getSchema());
    this.uri = uri;

    this.unbounded = new FileSystemView<E>(this, type);
    // remove this.partitionKey for 0.14.0
    this.partitionKey = null;
  }

  FileSystemDataset(FileSystem fileSystem, Path directory,
                    String namespace, String name,
                    DatasetDescriptor descriptor, URI uri,
                    @Nullable PartitionKey partitionKey,
                    @Nullable PartitionListener partitionListener,
                    Class<E> type) {
    this(fileSystem, directory, namespace, name, descriptor, uri,
        partitionListener, type);
    this.partitionKey = partitionKey;
  }

  @Override
  public URI getUri() {
    return uri;
  }

  public String getNamespace() {
    return namespace;
  }

  @Override
  public String getName() {
    return name;
  }

  @Override
  public DatasetDescriptor getDescriptor() {
    return descriptor;
  }

  PartitionKey getPartitionKey() {
    return partitionKey;
  }

  FileSystem getFileSystem() {
    return fileSystem;
  }

  Path getDirectory() {
    return directory;
  }

  PartitionListener getPartitionListener() {
    return partitionListener;
  }

  public boolean deleteAll() {
    // no constraints, so delete is always aligned to partition boundaries
    return unbounded.deleteAllUnsafe();
  }

  public PathIterator pathIterator() {
    return unbounded.pathIterator();
  }

  /**
   * Returns an iterator that provides all leaf-level directories in this view.
   *
   * @return leaf-directory iterator
   */
  public Iterator<Path> dirIterator() {
    return unbounded.dirIterator();
  }

  @Override
  protected RefinableView<E> asRefinableView() {
    return unbounded;
  }

  @Override
  public FileSystemView<E> filter(Constraints c) {
    return unbounded.filter(c);
  }

  @Override
  @Nullable
  @SuppressWarnings("deprecation")
  public PartitionedDataset<E> getPartition(PartitionKey key, boolean allowCreate) {
    Preconditions.checkState(descriptor.isPartitioned(),
      "Attempt to get a partition on a non-partitioned dataset (name:%s)",
      name);

    LOG.debug("Loading partition for key {}, allowCreate:{}", new Object[] {
      key, allowCreate });

    Path partitionDirectory = fileSystem.makeQualified(
        toDirectoryName(directory, key));

    try {
      if (!fileSystem.exists(partitionDirectory)) {
        if (allowCreate) {
          fileSystem.mkdirs(partitionDirectory);
          if (partitionListener != null) {
            partitionListener.partitionAdded(namespace, name,
                toRelativeDirectory(key).toString());
          }
        } else {
          return null;
        }
      }
    } catch (IOException e) {
      throw new DatasetIOException("Unable to locate or create dataset partition directory " + partitionDirectory, e);
    }

    int partitionDepth = key.getLength();
    PartitionStrategy subpartitionStrategy = Accessor.getDefault()
        .getSubpartitionStrategy(partitionStrategy, partitionDepth);

    return new FileSystemDataset.Builder<E>()
        .namespace(namespace)
        .name(name)
        .fileSystem(fileSystem)
        .uri(uri)
        .descriptor(new DatasetDescriptor.Builder(descriptor)
            .location(partitionDirectory)
            .partitionStrategy(subpartitionStrategy)
            .build())
        .type(type)
        .partitionKey(key)
        .partitionListener(partitionListener)
        .build();
  }

  @Override
  @SuppressWarnings("deprecation")
  public void dropPartition(PartitionKey key) {
    Preconditions.checkState(descriptor.isPartitioned(),
      "Attempt to drop a partition on a non-partitioned dataset (name:%s)",
      name);
    Preconditions.checkNotNull(key, "Partition key may not be null");

    LOG.debug("Dropping partition with key:{} dataset:{}", key, name);

    Path partitionDirectory = toDirectoryName(directory, key);

    try {
      if (!fileSystem.delete(partitionDirectory, true)) {
        throw new IOException("Partition directory " + partitionDirectory
          + " for key " + key + " does not exist");
      }
    } catch (IOException e) {
      throw new DatasetIOException("Unable to locate or drop dataset partition directory " + partitionDirectory, e);
    }
  }

  @Override
  @SuppressWarnings("deprecation")
  public Iterable<PartitionedDataset<E>> getPartitions() {
    Preconditions.checkState(descriptor.isPartitioned(),
      "Attempt to get partitions on a non-partitioned dataset (name:%s)",
      name);

    List<PartitionedDataset<E>> partitions = Lists.newArrayList();

    FileStatus[] fileStatuses;

    try {
      fileStatuses = fileSystem.listStatus(directory,
        PathFilters.notHidden());
    } catch (IOException e) {
      throw new DatasetIOException("Unable to list partition directory for directory " + directory, e);
    }

    for (FileStatus stat : fileStatuses) {
      Path p = fileSystem.makeQualified(stat.getPath());
      PartitionKey key = keyFromDirectory(p.getName());
      PartitionStrategy subPartitionStrategy = Accessor.getDefault()
          .getSubpartitionStrategy(partitionStrategy, 1);
      Builder<E> builder = new FileSystemDataset.Builder<E>()
          .namespace(namespace)
          .name(name)
          .fileSystem(fileSystem)
          .uri(uri)
          .descriptor(new DatasetDescriptor.Builder(descriptor)
              .location(p)
              .partitionStrategy(subPartitionStrategy)
              .build())
          .type(type)
          .partitionKey(key)
          .partitionListener(partitionListener);

      partitions.add(builder.build());
    }

    return partitions;
  }

  @Override
  public String toString() {
    return Objects.toStringHelper(this).add("name", name)
      .add("descriptor", descriptor).add("directory", directory)
      .add("dataDirectory", directory).add("partitionKey", partitionKey)
      .toString();
  }

  @Override
  public void merge(FileSystemDataset<E> update) {
    DatasetDescriptor updateDescriptor = update.getDescriptor();

    // check that the dataset's descriptor can read the update
    Compatibility.checkCompatible(updateDescriptor, descriptor);

    Set<String> addedPartitions = Sets.newHashSet();
    for (Path path : update.pathIterator()) {
      URI relativePath = update.getDirectory().toUri().relativize(path.toUri());
      Path newPath;
      if (relativePath.toString().isEmpty()) {
        newPath = directory;
      } else {
        newPath = new Path(directory, new Path(relativePath));
      }

      Path newPartitionDirectory = newPath.getParent();
      try {
        if (!fileSystem.exists(newPartitionDirectory)) {
          fileSystem.mkdirs(newPartitionDirectory);
        }
        LOG.debug("Renaming {} to {}", path, newPath);
        boolean renameOk = fileSystem.rename(path, newPath);
        if (!renameOk) {
          throw new IOException("Dataset merge failed during rename of " + path +
              " to " + newPath);
        }
      } catch (IOException e) {
        throw new DatasetIOException("Dataset merge failed", e);
      }
      if (descriptor.isPartitioned() && partitionListener != null) {
        String partition = newPartitionDirectory.toString();
        if (!addedPartitions.contains(partition)) {
          partitionListener.partitionAdded(namespace, name, partition);
          addedPartitions.add(partition);
        }
      }
    }
  }

  @Override
  public InputFormat<E, Void> getInputFormat(Configuration conf) {
    return new FileSystemViewKeyInputFormat<E>(this, conf);
  }

  @SuppressWarnings("unchecked")
  private Path toDirectoryName(@Nullable Path dir, PartitionKey key) {
    Path result = dir;
    for (int i = 0; i < key.getLength(); i++) {
      final FieldPartitioner fp = partitionStrategy.getFieldPartitioners().get(i);
      if (result != null) {
        result = new Path(result, PathConversion.dirnameForValue(fp, key.get(i)));
      } else {
        result = new Path(PathConversion.dirnameForValue(fp, key.get(i)));
      }
    }
    return result;
  }

  private Path toRelativeDirectory(PartitionKey key) {
    return toDirectoryName(null, key);
  }

  @SuppressWarnings("unchecked")
  private PartitionKey keyFromDirectory(String name) {
    final FieldPartitioner fp = partitionStrategy.getFieldPartitioners().get(0);
    final List<Object> values = Lists.newArrayList();

    if (partitionKey != null) {
      values.addAll(partitionKey.getValues());
    }

    values.add(convert.valueForDirname(fp, name));

    return new PartitionKey(values.toArray());
  }

  @SuppressWarnings("unchecked")
  public PartitionKey keyFromDirectory(Path dir) {

    Path relDir = null;
    URI relUri = directory.toUri().relativize(dir.toUri());

    if (!relUri.toString().isEmpty()) {
      relDir = new Path(relUri);
      Preconditions.checkState(!relDir.equals(dir), "Partition directory %s is not " +
          "relative to dataset directory %s", dir, directory);
    }

    List<String> pathComponents = Lists.newArrayList();
    while (relDir != null && !relDir.getName().equals("")) {
      pathComponents.add(0, relDir.getName());
      relDir = relDir.getParent();
    }

    List<FieldPartitioner> fps = partitionStrategy.getFieldPartitioners();
    Preconditions.checkState(pathComponents.size() <= fps.size(),
        "Number of components in partition directory %s (%s) exceeds number of field " +
            "partitioners %s", dir, pathComponents, partitionStrategy);

    List<Object> values = Lists.newArrayList();
    for (int i = 0; i < pathComponents.size(); i++) {
      values.add(convert.valueForDirname(fps.get(i), pathComponents.get(i)));
    }

    if (partitionKey != null) {
      values.addAll(0, partitionKey.getValues());
    }

    return new PartitionKey(values.toArray());
  }

  @Override
  public long getSize() {
    long size = 0;
    for (Iterator<Path> i = dirIterator(); i.hasNext(); ) {
      Path dir = i.next();
      try {
        for (FileStatus st : fileSystem.listStatus(dir)) {
          size += st.getLen();
        }
      } catch (IOException e) {
        throw new DatasetIOException("Cannot find size of " + dir, e);
      }
    }
    return size;
  }

  @Override
  public long getLastModified() {
    long lastMod = -1;
    for (Iterator<Path> i = dirIterator(); i.hasNext(); ) {
      Path dir = i.next();
      try {
        for (FileStatus st : fileSystem.listStatus(dir)) {
          if (lastMod < st.getModificationTime()) {
            lastMod = st.getModificationTime();
          }
        }
      } catch (IOException e) {
        throw new DatasetIOException("Cannot find last modified time of of " + dir, e);
      }
    }
    return lastMod;
  }

  @Override
  public boolean isEmpty() {
    return unbounded.isEmpty();
  }

  public static class Builder<E> {

    private Configuration conf;
    private FileSystem fileSystem;
    private Path directory;
    private String namespace;
    private String name;
    private DatasetDescriptor descriptor;
    private Class<E> type;
    private URI uri;
    private PartitionKey partitionKey;
    private PartitionListener partitionListener;

    public Builder<E> namespace(String namespace) {
      this.namespace = namespace;
      return this;
    }

    public Builder<E> name(String name) {
      this.name = name;
      return this;
    }

    protected Builder<E> fileSystem(FileSystem fs) {
      this.fileSystem = fs;
      return this;
    }

    public Builder<E> configuration(Configuration conf) {
      this.conf = conf;
      return this;
    }

     public Builder<E> descriptor(DatasetDescriptor descriptor) {
      Preconditions.checkArgument(descriptor.getLocation() != null,
          "Dataset location cannot be null");

      this.descriptor = descriptor;

      return this;
    }

    public Builder<E> type(Class<E> type) {
      Preconditions.checkNotNull(type, "Type cannot be null");

      this.type = type;

      return this;
    }

    public Builder<E> uri(URI uri) {
      this.uri = uri;
      return this;
    }

    Builder<E> partitionKey(@Nullable PartitionKey partitionKey) {
      this.partitionKey = partitionKey;
      return this;
    }

    Builder<E> partitionListener(@Nullable PartitionListener partitionListener) {
      this.partitionListener = partitionListener;
      return this;
    }

    public FileSystemDataset<E> build() {
      Preconditions.checkState(this.namespace != null, "No namespace defined");
      Preconditions.checkState(this.name != null, "No dataset name defined");
      Preconditions.checkState(this.descriptor != null,
        "No dataset descriptor defined");
      Preconditions.checkState((conf != null) || (fileSystem != null),
          "Configuration or FileSystem must be set");
      Preconditions.checkState(type != null, "No type specified");

      this.directory = new Path(descriptor.getLocation());

      if (fileSystem == null) {
        try {
          this.fileSystem = directory.getFileSystem(conf);
        } catch (IOException ex) {
          throw new DatasetIOException("Cannot access FileSystem", ex);
        }
      }

      Path absoluteDirectory = fileSystem.makeQualified(directory);
      return new FileSystemDataset<E>(
          fileSystem, absoluteDirectory, namespace, name, descriptor, uri,
          partitionKey, partitionListener, type);
    }
  }

}
TOP

Related Classes of org.kitesdk.data.spi.filesystem.FileSystemDataset$Builder

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.