/*
* Copyright 2011 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.springframework.data.hadoop.fs;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

import org.springframework.beans.factory.DisposableBean;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.core.PriorityOrdered;
import org.springframework.core.io.DefaultResourceLoader;
import org.springframework.core.io.Resource;
import org.springframework.core.io.support.PathMatchingResourcePatternResolver;
import org.springframework.core.io.support.ResourcePatternResolver;
import org.springframework.util.AntPathMatcher;
import org.springframework.util.Assert;
import org.springframework.util.PathMatcher;
import org.springframework.util.StringUtils;

/**
 * A Spring {@link org.springframework.core.io.ResourceLoader} over the Hadoop {@link FileSystem},
 * supporting {@code hdfs:} locations and Ant-style path patterns.
*
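 * <p>Example usage (an illustrative sketch; the namenode URI and the paths
 * are placeholders, not values assumed by this class):
 *
 * <pre>{@code
 * Configuration conf = new Configuration();
 * HdfsResourceLoader loader = new HdfsResourceLoader(conf, URI.create("hdfs://localhost:8020"));
 * try {
 *     Resource file = loader.getResource("hdfs:/user/test/data.txt");
 *     Resource[] parts = loader.getResources("/user/test/output/part-*");
 * } finally {
 *     loader.close();
 * }
 * }</pre>
 *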
* @author Costin Leau
* @author Janne Valkealahti
*
*/
public class HdfsResourceLoader extends DefaultResourceLoader implements ResourcePatternResolver,
PriorityOrdered, Closeable, DisposableBean, InitializingBean {
private static final Log log = LogFactory.getLog(HdfsResourceLoader.class);
/** Pseudo URL prefix for loading from the hdfs path: "hdfs:" */
private static final String HDFS_URL_PREFIX = "hdfs:";
private final FileSystem fs;
private final PathMatcher pathMatcher = new AntPathMatcher();
/** Flag indicating whether the file system was created by this class (and thus should be closed by it) */
private final boolean internalFS;
private volatile boolean useCodecs = true;
private volatile CompressionCodecFactory codecsFactory;
/** Flag indicating whether a path without a prefix is routed to HDFS */
private volatile boolean handleNoprefix = true;
/** The user to impersonate when accessing the file system, if any */
private String impersonatedUser = null;
/** Needed to fall back to the default Spring resource-pattern resolution */
private ResourcePatternResolver resourcePatternResolver;
/**
* Constructs a new <code>HdfsResourceLoader</code> instance.
*
* @param config Hadoop configuration to use.
*/
public HdfsResourceLoader(Configuration config) {
this(config, null);
}
/**
* Constructs a new <code>HdfsResourceLoader</code> instance.
*
* @param config Hadoop configuration to use.
* @param uri Hadoop file system URI.
* @param user Hadoop user for accessing the file system.
*/
@SuppressWarnings("resource")
public HdfsResourceLoader(Configuration config, URI uri, String user) {
Assert.notNull(config, "a valid configuration is required");
impersonatedUser = user;
internalFS = true;
FileSystem tempFS = null;
codecsFactory = new CompressionCodecFactory(config);
try {
if (uri == null) {
uri = FileSystem.getDefaultUri(config);
}
tempFS = (StringUtils.hasText(impersonatedUser) ? FileSystem.get(uri, config, impersonatedUser) : FileSystem.get(uri, config));
} catch (Exception ex) {
tempFS = null;
throw new IllegalStateException("Cannot create filesystem", ex);
} finally {
// the final field must be assigned exactly once, on both success and failure
fs = tempFS;
}
}
/**
* Constructs a new <code>HdfsResourceLoader</code> instance.
*
* @param config Hadoop configuration to use.
* @param uri Hadoop file system URI.
*/
public HdfsResourceLoader(Configuration config, URI uri) {
this(config, uri, null);
}
/**
* Constructs a new <code>HdfsResourceLoader</code> instance.
*
* @param fs Hadoop file system to use.
*/
public HdfsResourceLoader(FileSystem fs) {
Assert.notNull(fs, "a non-null file-system required");
this.fs = fs;
internalFS = false;
codecsFactory = new CompressionCodecFactory(fs.getConf());
}
@Override
protected Resource getResourceByPath(String path) {
if (handleNoprefix) {
return new HdfsResource(stripLeadingTilde(path), fs, codecs());
} else {
return super.getResourceByPath(path);
}
}
@Override
public Resource getResource(String location) {
// Spring's DefaultResourceLoader relies on java.net.URL throwing an
// exception before it falls back to getResourceByPath. That is not
// reliable, so explicitly check whether the location targets HDFS.
if (location.startsWith(HDFS_URL_PREFIX) || (location.indexOf(':') < 0 && handleNoprefix)) {
return getResourceByPath(location);
} else {
return super.getResource(location);
}
}
@Override
public Resource[] getResources(String locationPattern) throws IOException {
Assert.notNull(locationPattern, "Location pattern must not be null");
if (locationPattern.startsWith(HDFS_URL_PREFIX) || (locationPattern.indexOf(':') < 0 && handleNoprefix)) {
// Only look for a pattern after a prefix here
// (to not get fooled by a pattern symbol in a strange prefix).
if (pathMatcher.isPattern(stripPrefix(locationPattern))) {
// a resource pattern
return findPathMatchingResources(locationPattern);
} else {
// a single resource with the given name
return new Resource[] { getResource(stripPrefix(stripLeadingTilde(locationPattern))) };
}
} else {
return resourcePatternResolver.getResources(locationPattern);
}
}
@Override
public int getOrder() {
return PriorityOrdered.HIGHEST_PRECEDENCE;
}
@Override
public void destroy() throws IOException {
close();
}
@Override
public void close() throws IOException {
if (fs != null && internalFS) {
try {
fs.close();
} catch (NullPointerException npe) {
// swallow NPE caused by the FS being closed too early - see HADOOP-4829
}
}
}
@Override
public void afterPropertiesSet() throws Exception {
if (resourcePatternResolver == null) {
resourcePatternResolver = new PathMatchingResourcePatternResolver(this);
}
}
@Override
public ClassLoader getClassLoader() {
return fs.getConf().getClassLoader();
}
/**
 * Sets whether a location without a prefix is handled by this loader as
 * an HDFS path instead of being delegated to the default resource loading.
 *
 * @param handleNoprefix whether to route prefix-less locations to HDFS
 */
public void setHandleNoprefix(boolean handleNoprefix) {
this.handleNoprefix = handleNoprefix;
}
/**
* Returns the Hadoop file system used by this resource loader.
*
* @return the Hadoop file system in use.
*/
public FileSystem getFileSystem() {
return fs;
}
/**
 * Indicates whether the codecs found in the Hadoop configuration should
 * be used. This affects the content of the streams backing the resolved
 * resources - whether the raw content is delivered as-is or decompressed
 * on the fly (where the configuration allows it). The latter is the default.
*
* @param useCodecs whether to use any codecs defined in the Hadoop configuration
*/
public void setUseCodecs(boolean useCodecs) {
this.useCodecs = useCodecs;
}
/**
* Sets the resource pattern resolver.
*
* @param resourcePatternResolver the new resource pattern resolver
*/
public void setResourcePatternResolver(ResourcePatternResolver resourcePatternResolver) {
this.resourcePatternResolver = resourcePatternResolver;
}
/**
* Find all resources that match the given location pattern via the
* Ant-style PathMatcher.
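 * <p>For example (paths are illustrative), the location pattern
 * {@code /user/data/part-*} resolves to the root directory
 * {@code /user/data/} and the sub-pattern {@code part-*}; the directory
 * tree below the root is then searched for matches.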
*
* @param locationPattern the location pattern to match
* @return the result as Resource array
* @throws IOException in case of I/O errors
*/
protected Resource[] findPathMatchingResources(String locationPattern) throws IOException {
String rootDirPath = determineRootDir(locationPattern);
String subPattern = locationPattern.substring(rootDirPath.length());
Resource[] rootDirResources = getResources(rootDirPath);
Set<Resource> result = new LinkedHashSet<Resource>(16);
for (Resource rootDirResource : rootDirResources) {
result.addAll(doFindPathMatchingFileResources(rootDirResource, subPattern));
}
if (log.isDebugEnabled()) {
log.debug("Resolved location pattern [" + locationPattern + "] to resources " + result);
}
return result.toArray(new Resource[result.size()]);
}
/**
 * Find all resources in the HDFS file system that match the given location pattern
* via the Ant-style PathMatcher.
*
* @param rootDirResource the root directory as Resource
* @param subPattern the sub pattern to match (below the root directory)
* @return the Set of matching Resource instances
* @throws IOException in case of I/O errors
*/
protected Set<Resource> doFindPathMatchingFileResources(Resource rootDirResource, String subPattern)
throws IOException {
Path rootDir;
try {
rootDir = (rootDirResource instanceof HdfsResource ?
((HdfsResource) rootDirResource).getPath() :
new Path(rootDirResource.getURI().toString()));
} catch (IOException ex) {
if (log.isWarnEnabled()) {
log.warn("Cannot search for matching files underneath " + rootDirResource +
" because it does not correspond to a directory in the file system", ex);
}
return Collections.emptySet();
}
return doFindMatchingFileSystemResources(rootDir, subPattern);
}
/**
* Find all resources in the file system that match the given location pattern
* via the Ant-style PathMatcher.
*
* @param rootDir the root directory in the file system
* @param subPattern the sub pattern to match (below the root directory)
* @return the Set of matching Resource instances
* @throws IOException in case of I/O errors
* @see org.springframework.util.PathMatcher
*/
protected Set<Resource> doFindMatchingFileSystemResources(Path rootDir, String subPattern) throws IOException {
if (log.isDebugEnabled()) {
log.debug("Looking for matching resources in directory tree [" + rootDir.toUri().getPath() + "]");
}
Set<Path> matchingFiles = retrieveMatchingFiles(rootDir, subPattern);
Set<Resource> result = new LinkedHashSet<Resource>(matchingFiles.size());
for (Path path : matchingFiles) {
result.add(new HdfsResource(path, fs, codecs()));
}
return result;
}
/**
* Retrieve files that match the given path pattern,
* checking the given directory and its subdirectories.
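 * <p>For example (illustrative), a root directory of {@code /data} and a
 * pattern of {@code *.txt} yield the full pattern {@code /data/*.txt},
 * which is then matched against the paths found while walking the tree.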
*
* @param rootDir the directory to start from
 * @param pattern the pattern to match against, relative to the root directory
* @return the Set of matching Path instances
* @throws IOException if directory contents could not be retrieved
*/
@SuppressWarnings("deprecation")
protected Set<Path> retrieveMatchingFiles(Path rootDir, String pattern) throws IOException {
boolean exists = fs.exists(rootDir);
if (!exists) {
// Silently skip non-existing directories.
if (log.isDebugEnabled()) {
log.debug("Skipping [" + rootDir.toUri().getPath() + "] because it does not exist");
}
return Collections.emptySet();
}
// the preceding exists() check should prevent a FileNotFoundException here
FileStatus fileStatus = fs.getFileStatus(rootDir);
if (!fileStatus.isDir()) {
// Complain louder if it exists but is no directory.
if (log.isWarnEnabled()) {
log.warn("Skipping [" + rootDir.toUri().getPath() + "] because it does not denote a directory");
}
return Collections.emptySet();
}
String fullPattern = StringUtils.replace(rootDir.toUri().getPath(), File.separator, "/");
if (!pattern.startsWith("/")) {
fullPattern += "/";
}
fullPattern = fullPattern + StringUtils.replace(pattern, File.separator, "/");
Set<Path> result = new LinkedHashSet<Path>(8);
doRetrieveMatchingFiles(fullPattern, rootDir, result);
return result;
}
/**
* Recursively retrieve files that match the given pattern,
 * adding them to the given result set.
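 * <p>Subdirectories are only descended into while the pattern can still
 * match below them. For example (paths are illustrative), when matching
 * {@code /data/part-*}, a directory {@code /data/logs} is skipped because
 * {@code matchStart} rejects {@code /data/logs/}.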
*
* @param fullPattern the pattern to match against, with prepended root directory path
* @param dir the current directory
 * @param result the Set of matching Path instances to add to
* @throws IOException if directory contents could not be retrieved
*/
@SuppressWarnings("deprecation")
protected void doRetrieveMatchingFiles(String fullPattern, Path dir, Set<Path> result) throws IOException {
if (log.isDebugEnabled()) {
log.debug("Searching directory [" + dir.toUri().getPath() +
"] for files matching pattern [" + fullPattern + "]");
}
FileStatus[] dirContents = null;
try {
dirContents = fs.listStatus(dir);
} catch (IOException ex) {
// ignore (likely security exception)
}
if (dirContents == null) {
if (log.isWarnEnabled()) {
log.warn("Could not retrieve contents of directory [" + dir.toUri().getPath() + "]");
}
return;
}
for (FileStatus content : dirContents) {
String currPath = StringUtils.replace(content.getPath().toUri().getPath(), File.separator, "/");
if (content.isDir() && pathMatcher.matchStart(fullPattern, currPath + "/")) {
doRetrieveMatchingFiles(fullPattern, content.getPath(), result);
}
if (pathMatcher.match(fullPattern, currPath)) {
result.add(content.getPath());
}
}
}
/**
* Determine the root directory for the given location.
* <p>Used for determining the starting point for file matching,
 * resolving the root directory location and passing it
 * into {@code doFindPathMatchingFileResources}, with the
 * remainder of the location as the pattern.
* <p>Will return "/dir/" for the pattern "/dir/*.xml",
* for example.
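 * <p>Likewise, "/dir/sub/" is returned for "/dir/sub/file-*.txt", and a
 * location containing no pattern at all, such as "/dir/file.txt", is
 * returned unchanged.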
*
* @param location the location to check
* @return the part of the location that denotes the root directory
*/
protected String determineRootDir(String location) {
int rootDirEnd = location.length();
while (rootDirEnd > 0 && pathMatcher.isPattern(location.substring(0, rootDirEnd))) {
rootDirEnd = location.lastIndexOf('/', rootDirEnd - 2) + 1;
}
return location.substring(0, rootDirEnd);
}
/**
 * Removes the leading tilde shortcut, if one exists.
*/
private String stripLeadingTilde(String locationPattern) {
if (locationPattern.startsWith("~/")) {
return locationPattern.substring(2);
}
return locationPattern;
}
private CompressionCodecFactory codecs() {
return (useCodecs ? codecsFactory : null);
}
/**
 * Removes a URI prefix (scheme and authority) from the given path, leaving
 * a plain 'file' path - e.g. "hdfs://host:8020/user/file.txt" becomes
 * "/user/file.txt".
 */
private static String stripPrefix(String path) {
String ret = null;
try {
ret = new Path(path).toUri().getPath();
} catch (Exception e) {
// ignore - an unparsable path falls through to the checks below
}
if (ret == null && path.startsWith(HDFS_URL_PREFIX) && !path.startsWith("hdfs://")) {
// check if path is 'hdfs:myfile.txt', strip prefix and colon
ret = path.substring(5);
}
if (ret == null) {
// fall back to given path
ret = path;
}
return ret;
}
}