/*
* Copyright 2008-2009 LinkedIn, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package voldemort.store.readonly.mr.utils;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.io.StringReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;
import org.joda.time.Period;
import voldemort.cluster.Cluster;
import voldemort.serialization.json.JsonTypeDefinition;
import voldemort.serialization.json.JsonTypes;
import voldemort.store.StoreDefinition;
import voldemort.utils.ByteUtils;
import voldemort.xml.ClusterMapper;
import voldemort.xml.StoreDefinitionsMapper;
import azkaban.common.utils.Props;
import azkaban.common.utils.UndefinedPropertyException;
/**
* Helper functions for Hadoop
*
* @author jkreps
*
*/
public class HadoopUtils {
    // Any date written with the pattern below should be accepted by the regex.
    public static final String COMMON_FILE_DATE_PATTERN = "yyyy-MM-dd-HH-mm";
    public static final String COMMON_FILE_DATE_REGEX = "\\d{4}-\\d{2}-\\d{2}-\\d{2}-\\d{2}";
private static Logger logger = Logger.getLogger(HadoopUtils.class);
private static Object cachedSerializable = null;
public static FileSystem getFileSystem(Props props) {
if(!props.containsKey("hadoop.job.ugi"))
throw new RuntimeException("No parameter hadoop.job.ugi set!");
return getFileSystem(props.getString("hadoop.job.ugi"));
}
public static FileSystem getFileSystem(String user) {
Configuration conf = new Configuration();
conf.set("hadoop.job.ugi", user);
try {
return FileSystem.get(conf);
} catch(IOException e) {
throw new RuntimeException(e);
}
}
    /**
     * Write the given Serializable object to a temporary file and add that
     * file to the distributed cache for this job
     *
     * @param job The JobConf to register the cached file with
     * @param serializable A Serializable object to make available to tasks
     */
public static <T extends Serializable> void setSerializableInCache(JobConf job, T serializable) {
try {
            // TODO: MED the hard-coded /tmp prefix should be replaced by the
            // job's configured temporary directory
Path workDir = new Path(String.format("/tmp/%s/%s/_join.temporary",
job.getJobName(),
System.currentTimeMillis()));
Path tempPath = new Path(workDir, "serializable.dat");
tempPath.getFileSystem(job).deleteOnExit(tempPath);
job.set("serializables.file", tempPath.toUri().getPath());
ObjectOutputStream objectStream = new ObjectOutputStream(tempPath.getFileSystem(job)
.create(tempPath));
objectStream.writeObject(serializable);
objectStream.close();
DistributedCache.addCacheFile(new URI(tempPath.toUri().getPath() + "#"
+ tempPath.getName()),
job);
} catch(URISyntaxException e) {
throw new RuntimeException(e);
} catch(IOException e) {
throw new RuntimeException(e);
}
}
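    // A minimal usage sketch for the caching pair: setSerializableInCache above
    // and readSerializableFromCache below. The job name, the HashMap payload
    // and the task-side Configuration ("conf") are hypothetical:
    //
    //   JobConf job = new JobConf();
    //   job.setJobName("example-job");
    //   HashMap<String, String> lookupTable = new HashMap<String, String>();
    //   HadoopUtils.setSerializableInCache(job, lookupTable);
    //
    //   // later, inside a mapper/reducer configured with this job:
    //   Object restored = HadoopUtils.readSerializableFromCache(conf);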
public static Path getFilePathFromDistributedCache(String filename, Configuration conf)
throws IOException {
Path[] paths = DistributedCache.getLocalCacheFiles(conf);
Path filePath = null;
File file = new File(filename);
if(paths == null) {
            // The distributed cache is not populated in local mode, so check
            // whether the file is simply sitting on the local filesystem
if(file.exists())
filePath = new Path(file.getAbsolutePath());
} else {
for(Path path: paths)
if(path.getName().equals(file.getName()))
filePath = path;
}
return filePath;
}
    /**
     * Get a FileInputStream for the named file from the distributed cache
     *
     * @param filename The name of the cached file to open
     * @param conf The Configuration holding the cache entries
     * @return A FileInputStream for the cached file
     */
public static FileInputStream getFileInputStream(String filename, Configuration conf) {
try {
Path filePath = getFilePathFromDistributedCache(filename, conf);
if(filePath == null) {
Path[] paths = DistributedCache.getLocalCacheFiles(conf);
throw new IllegalStateException("No cache file found by the name of '" + filename
+ "', found only " + paths);
}
return new FileInputStream(filePath.toString());
} catch(IOException e) {
throw new RuntimeException(e);
}
}
    /**
     * Get the given Serializable from the distributed cache as an Object
     *
     * @param conf The Configuration holding the cache entries
     * @return The Object that is read from cache
     */
public static Object readSerializableFromCache(Configuration conf) {
/*
* Cache the results of this operation, as this function may be called
* more than once by the same process (i.e., by combiners).
*/
if(HadoopUtils.cachedSerializable != null)
return HadoopUtils.cachedSerializable;
try {
String filename = conf.get("serializables.file");
if(filename == null)
return null;
Path serializable = getFilePathFromDistributedCache(filename, conf);
if(serializable == null) {
Path[] paths = DistributedCache.getLocalCacheFiles(conf);
throw new IllegalStateException("No serializable cache file found by the name of '"
+ filename + "', found only " + paths);
}
ObjectInputStream stream = new ObjectInputStream(new FileInputStream(serializable.toString()));
Object obj = stream.readObject();
stream.close();
HadoopUtils.cachedSerializable = obj;
return obj;
} catch(IOException e) {
throw new RuntimeException(e);
} catch(ClassNotFoundException e) {
throw new RuntimeException(e);
}
}
public static Map<String, String> getMetadataFromSequenceFile(String fileName) {
Path path = new Path(fileName);
try {
return getMetadataFromSequenceFile(path.getFileSystem(new Configuration()), path);
} catch(IOException e) {
throw new RuntimeException(e);
}
}
public static Map<String, String> getMetadataFromSequenceFile(FileSystem fs, String fileName) {
return getMetadataFromSequenceFile(fs, new Path(fileName));
}
    /**
     * Read the metadata from a hadoop SequenceFile
     *
     * @param fs The filesystem to read from
     * @param path The path of the file to read from
     * @return The metadata from this file
     */
public static Map<String, String> getMetadataFromSequenceFile(FileSystem fs, Path path) {
try {
            Configuration conf = new Configuration();
            conf.setInt("io.file.buffer.size", 4096);
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
SequenceFile.Metadata meta = reader.getMetadata();
reader.close();
TreeMap<Text, Text> map = meta.getMetadata();
Map<String, String> values = new HashMap<String, String>();
for(Map.Entry<Text, Text> entry: map.entrySet())
values.put(entry.getKey().toString(), entry.getValue().toString());
return values;
} catch(IOException e) {
throw new RuntimeException(e);
}
}
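    // A usage sketch, assuming a SequenceFile at a hypothetical HDFS path that
    // was written with "key.schema"/"value.schema" metadata entries:
    //
    //   Path dataFile = new Path("/data/example/part-00000");
    //   FileSystem fs = dataFile.getFileSystem(new Configuration());
    //   Map<String, String> meta = HadoopUtils.getMetadataFromSequenceFile(fs, dataFile);
    //   String keySchema = meta.get("key.schema");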
public static JsonSchema getSchemaFromPath(Path path) throws IOException {
return getSchemaFromPath(path.getFileSystem(new Configuration()), path, true);
}
public static JsonSchema getSchemaFromPath(FileSystem fs, Path path) throws IOException {
return getSchemaFromPath(fs, path, true);
}
/**
* Pull the schema off of the given file (if it is a file). If it is a
* directory, then pull schemas off of all subfiles, and check that they are
* all the same schema. If so, return that schema, otherwise throw an
* exception
*
* @param fs The filesystem to use
* @param path The path from which to get the schema
* @param checkSameSchema boolean flag to check all files in directory for
* same schema
* @return The schema of this file or all its subfiles
* @throws IOException
*/
public static JsonSchema getSchemaFromPath(FileSystem fs, Path path, boolean checkSameSchema)
throws IOException {
try {
if(fs.isFile(path)) {
// this is a normal file, get a schema from it
Map<String, String> m = HadoopUtils.getMetadataFromSequenceFile(fs, path);
if(!m.containsKey("value.schema") || !m.containsKey("key.schema"))
throw new IllegalArgumentException("No schema found on file " + path.toString());
return new JsonSchema(JsonTypeDefinition.fromJson(m.get("key.schema")),
JsonTypeDefinition.fromJson(m.get("value.schema")));
} else {
FileStatus[] statuses = null;
if(fs.isDirectory(path)) {
// this is a directory, get schemas from all subfiles
statuses = fs.listStatus(path);
} else {
// this is wildcard path, get schemas from all matched files
statuses = fs.globStatus(path);
}
if(statuses == null || statuses.length == 0)
throw new IllegalArgumentException("No files found in path pattern "
+ path.toUri().getPath());
List<JsonSchema> schemas = new ArrayList<JsonSchema>();
for(FileStatus status: statuses) {
if(!HadoopUtils.shouldPathBeIgnored(status.getPath())) {
if(!checkSameSchema) {
// return first valid schema w/o checking all files
return getSchemaFromPath(fs, status.getPath(), checkSameSchema);
}
schemas.add(getSchemaFromPath(fs, status.getPath(), checkSameSchema));
}
}
// now check that all the schemas are the same
if(schemas.size() > 0) {
JsonSchema schema = schemas.get(0);
for(int i = 1; i < schemas.size(); i++)
if(!schema.equals(schemas.get(i)))
throw new IllegalArgumentException("The directory "
+ path.toString()
+ " contains heterogenous schemas: found both '"
+ schema.toString() + "' and '"
+ schemas.get(i).toString() + "'.");
return schema;
} else {
throw new IllegalArgumentException("No Valid metedata file found for Path:"
+ path.toString());
}
}
} catch(Exception e) {
logger.error("failed to get metadata from path:" + path);
throw new RuntimeException(e);
}
}
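    // A usage sketch for getSchemaFromPath, assuming a hypothetical directory
    // of SequenceFiles that all carry the same key/value schema metadata:
    //
    //   Path input = new Path("/data/example");
    //   FileSystem fs = input.getFileSystem(new Configuration());
    //   JsonSchema schema = HadoopUtils.getSchemaFromPath(fs, input, true);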
public static String getRequiredString(Configuration conf, String name) {
String val = conf.get(name);
if(val == null)
throw new IllegalArgumentException("Missing required parameter '" + name + "'.");
else
return val;
}
public static int getRequiredInt(Configuration conf, String name) {
return Integer.parseInt(getRequiredString(conf, name));
}
public static void copyInProps(Props props, Configuration conf, String... keys) {
for(String key: keys)
if(props.get(key) != null)
conf.set(key, props.get(key));
}
public static void copyInRequiredProps(Props props, Configuration conf, String... keys) {
for(String key: keys)
conf.set(key, props.getString(key));
}
    /**
     * Add all the properties in the Props to the given Configuration
     *
     * @param props The Props to copy from
     * @param conf The Configuration to copy into
     */
public static void copyInAllProps(Props props, Configuration conf) {
for(String key: props.keySet())
conf.set(key, props.get(key));
}
public static void copyInLocalProps(Props props, Configuration conf) {
for(String key: props.localKeySet())
conf.set(key, props.get(key));
}
public static Props loadHadoopProps(Props parent, File hadoopConfDir) {
// load hadoop properties
Configuration config = new Configuration();
config.addResource(new Path(new File(hadoopConfDir, "hadoop-default.xml").getAbsolutePath()));
config.addResource(new Path(new File(hadoopConfDir, "hadoop-site.xml").getAbsolutePath()));
// copy to props
Props props = new Props(parent);
for(Entry<String, String> entry: config)
props.put(entry.getKey(), config.get(entry.getKey()));
return props;
}
public static void setPropsInJob(Configuration conf, Props props) {
ByteArrayOutputStream output = new ByteArrayOutputStream();
try {
props.storeFlattened(output);
conf.set("azkaban.props", new String(output.toByteArray(), "UTF-8"));
} catch(IOException e) {
throw new RuntimeException("This is not possible!", e);
}
}
public static Props getPropsFromJob(Configuration conf) {
String propsString = conf.get("azkaban.props");
if(propsString == null)
throw new UndefinedPropertyException("The required property azkaban.props was not found in the Configuration.");
try {
ByteArrayInputStream input = new ByteArrayInputStream(propsString.getBytes("UTF-8"));
Properties properties = new Properties();
properties.load(input);
return new Props(null, properties);
} catch(IOException e) {
throw new RuntimeException("This is not possible!", e);
}
}
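    // setPropsInJob and getPropsFromJob form a round trip: the Props are
    // flattened into the "azkaban.props" configuration key and parsed back out
    // on the task side. A sketch with hypothetical values ("conf" is an
    // existing Configuration):
    //
    //   Props props = new Props();
    //   props.put("store.name", "example-store");
    //   HadoopUtils.setPropsInJob(conf, props);
    //   Props restored = HadoopUtils.getPropsFromJob(conf);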
public static Cluster readCluster(String clusterFile, Configuration conf) throws IOException {
return new ClusterMapper().readCluster(new StringReader(readAsString(new Path(clusterFile))));
}
public static StoreDefinition readStoreDef(String storeFile,
String storeName,
Configuration conf) throws IOException {
List<StoreDefinition> stores = new StoreDefinitionsMapper().readStoreList(new StringReader(readAsString(new Path(storeFile))));
for(StoreDefinition def: stores) {
if(def.getName().equals(storeName))
return def;
}
throw new RuntimeException("Can't find store definition for store '" + storeName + "'.");
}
public static String getFileFromCache(Configuration conf, String fileName) throws IOException {
if("local".equals(conf.get("mapred.job.tracker"))) {
            // In local mode the distributed cache is not populated, so try
            // getting the raw file path instead
URI[] uris = DistributedCache.getCacheFiles(conf);
return getFileFromURIList(uris, fileName);
} else {
            // On a distributed filesystem, use the localized cache files
Path[] pathList = DistributedCache.getLocalCacheFiles(conf);
return getFileFromPathList(pathList, fileName);
}
}
public static String getFileFromURIList(URI[] uris, String fileName) throws IOException {
for(URI uri: uris) {
if(uri.getPath().endsWith(fileName)) {
// uri matched
return uri.getPath();
}
}
return null;
}
public static String getFileFromPathList(Path[] pathList, String fileName) {
for(Path file: pathList) {
logger.info("getUriWithFragment path:" + file.toUri().getPath() + " fileName:"
+ fileName);
if(file.getName().equals(fileName)) {
logger.info("FOUND getUriWithFragment path:" + file.toUri().getPath());
return file.toUri().getPath();
}
}
return null;
}
    /**
     * Find a jar that contains a class of the same name, if any. It will return
     * a jar file, even if that is not the first thing on the class path that
     * has a class with the same name.
     *
     * @param my_class The class to find
     * @param loader The ClassLoader to search
     * @return A jar file that contains the class, or null
     */
    public static String findContainingJar(Class<?> my_class, ClassLoader loader) {
String class_file = my_class.getName().replaceAll("\\.", "/") + ".class";
return findContainingJar(class_file, loader);
}
public static List<String> getFileNames(FileStatus[] statuses) {
List<String> fileNames = new ArrayList<String>();
if(statuses == null)
return fileNames;
for(FileStatus status: statuses)
fileNames.add(status.getPath().getName());
return fileNames;
}
public static String findContainingJar(String fileName, ClassLoader loader) {
try {
            for(Enumeration<URL> itr = loader.getResources(fileName); itr.hasMoreElements();) {
                URL url = itr.nextElement();
logger.info("findContainingJar finds url:" + url);
if("jar".equals(url.getProtocol())) {
String toReturn = url.getPath();
if(toReturn.startsWith("file:")) {
toReturn = toReturn.substring("file:".length());
}
toReturn = URLDecoder.decode(toReturn, "UTF-8");
return toReturn.replaceAll("!.*$", "");
}
}
} catch(IOException e) {
throw new RuntimeException(e);
}
return null;
}
    public static void printAllClassLoaderPaths(String fileName, ClassLoader loader) {
try {
            for(Enumeration<URL> itr = loader.getResources(fileName); itr.hasMoreElements();) {
                URL url = itr.nextElement();
                logger.info("printAllClassLoaderPaths finds url:" + url);
}
} catch(IOException e) {
throw new RuntimeException(e);
}
}
    /**
     * Parse a period string such as "2M3d12h30m" into a Joda-Time Period,
     * where M = months, d = days, h = hours and m = minutes. Repeated units
     * are summed.
     */
    public static Period parsePeriod(String periodStr) {
Matcher monthsFormat = Pattern.compile("[0-9][0-9]*M").matcher(periodStr);
Matcher daysFormat = Pattern.compile("[0-9][0-9]*d").matcher(periodStr);
Matcher hoursFormat = Pattern.compile("[0-9][0-9]*h").matcher(periodStr);
Matcher minutesFormat = Pattern.compile("[0-9][0-9]*m").matcher(periodStr);
Period period = new Period();
while(monthsFormat.find()) {
period = period.plusMonths(Integer.parseInt(monthsFormat.group()
.substring(0,
monthsFormat.group()
.length() - 1)));
}
while(daysFormat.find()) {
period = period.plusDays(Integer.parseInt(daysFormat.group()
.substring(0,
daysFormat.group()
.length() - 1)));
}
while(hoursFormat.find()) {
period = period.plusHours(Integer.parseInt(hoursFormat.group()
.substring(0,
hoursFormat.group()
.length() - 1)));
}
while(minutesFormat.find()) {
period = period.plusMinutes(Integer.parseInt(minutesFormat.group()
.substring(0,
minutesFormat.group()
.length() - 1)));
}
return period;
}
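    // For example, parsePeriod("1M2d3h30m") yields a Period of one month, two
    // days, three hours and thirty minutes; repeated units (e.g. "1d2d") are
    // summed.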
public static FileSystem getFileSystem(String hdfsUrl, boolean isLocal) throws IOException {
// Initialize fs
FileSystem fs;
if(isLocal) {
fs = FileSystem.getLocal(new Configuration());
} else {
fs = new DistributedFileSystem();
try {
fs.initialize(new URI(hdfsUrl), new Configuration());
} catch(URISyntaxException e) {
throw new IllegalArgumentException(e);
}
}
return fs;
}
    /**
     * Given a directory path, return the paths of all directories in that tree
     * that have no sub-directories.
     *
     * @param fs The filesystem to search
     * @param directory The root of the tree to search
     * @param pathFilter An optional filter that accepted directories must pass
     * @return The list of lowest-level directory paths
     */
public static List<String> getLowestLevelDirectories(FileSystem fs,
Path directory,
PathFilter pathFilter) throws IOException {
List<String> lowestLevelDirectories = new ArrayList<String>();
if(hasSubDirectories(fs, directory)) {
            // recurse on each of this directory's sub-directories, ignoring
            // any files in the directory
FileStatus[] statuses = fs.listStatus(directory);
for(FileStatus status: statuses) {
if(status.isDir()) {
lowestLevelDirectories.addAll(getLowestLevelDirectories(fs,
status.getPath(),
pathFilter));
}
}
} else if(pathFilter == null || pathFilter.accept(directory)) {
            // this directory has no sub-directories, and either there is no
            // filter or it passes the filter, so add it and return
lowestLevelDirectories.add(directory.toString());
}
return lowestLevelDirectories;
}
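    // A usage sketch, assuming a hypothetical date-partitioned tree such as
    // /data/example/2009/01/01 where only the day-level directories are leaves:
    //
    //   FileSystem fs = FileSystem.get(new Configuration());
    //   List<String> leaves = HadoopUtils.getLowestLevelDirectories(fs,
    //                                                               new Path("/data/example"),
    //                                                               null);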
    /**
     * Given a directory path, check whether or not the directory has any
     * sub-directories
     *
     * @param fs The filesystem to check
     * @param directory The directory to check
     * @return true iff the directory has at least one sub-directory
     * @throws IOException
     */
private static boolean hasSubDirectories(FileSystem fs, Path directory) throws IOException {
FileStatus[] statuses = fs.listStatus(directory);
if(statuses == null)
return false;
for(FileStatus status: statuses) {
if(status != null && status.isDir() && !shouldPathBeIgnored(status.getPath())) {
// we have found a subDirectory
return true;
}
}
// we are done looping through the directory and found no subDirectories
return false;
}
public static JobConf addAllSubPaths(JobConf conf, Path path) throws IOException {
if(shouldPathBeIgnored(path)) {
throw new IllegalArgumentException(String.format("Path[%s] should be ignored.", path));
}
final FileSystem fs = path.getFileSystem(conf);
if(fs.exists(path)) {
for(FileStatus status: fs.listStatus(path)) {
if(!shouldPathBeIgnored(status.getPath())) {
if(status.isDir()) {
addAllSubPaths(conf, status.getPath());
} else {
FileInputFormat.addInputPath(conf, status.getPath());
}
}
}
}
return conf;
}
    /**
     * Check if the path should be ignored. Currently, paths whose names start
     * with "_" (such as Hadoop's "_logs" directories) are ignored.
     *
     * @param path The path to check
     * @return true iff the path should be ignored
     * @throws IOException
     */
public static boolean shouldPathBeIgnored(Path path) throws IOException {
return path.getName().startsWith("_");
}
public static Map<String, String> getMapByPrefix(Configuration conf, String prefix) {
Map<String, String> values = new HashMap<String, String>();
for(Entry<String, String> entry: conf) {
if(entry.getKey().startsWith(prefix))
values.put(entry.getKey().substring(prefix.length()), entry.getValue());
}
return values;
}
public static void saveProps(Props props, String file) throws IOException {
Path path = new Path(file);
FileSystem fs = null;
if(props.containsKey("hadoop.job.ugi")) {
fs = getFileSystem(props);
} else {
fs = path.getFileSystem(new Configuration());
}
saveProps(fs, props, file);
}
public static void saveProps(FileSystem fs, Props props, String file) throws IOException {
Path path = new Path(file);
// create directory if it does not exist.
Path parent = path.getParent();
if(!fs.exists(parent))
fs.mkdirs(parent);
// write out properties
OutputStream output = fs.create(path);
try {
props.storeFlattened(output);
} finally {
output.close();
}
}
public static Props readProps(String file) throws IOException {
Path path = new Path(file);
FileSystem fs = path.getFileSystem(new Configuration());
if(fs.exists(path)) {
InputStream input = fs.open(path);
try {
// wrap it up in another layer so that the user can override
// properties
Props p = new Props(null, input);
return new Props(p);
} finally {
input.close();
}
} else {
return new Props();
}
}
public static String readAsString(Path path) {
InputStream input = null;
try {
FileSystem fs = path.getFileSystem(new Configuration());
input = fs.open(path);
return IOUtils.toString(input);
} catch(IOException e) {
throw new RuntimeException(e);
} finally {
IOUtils.closeQuietly(input);
}
}
public static boolean mkdirs(String pathName) throws IOException {
Path path = new Path(pathName);
FileSystem fs = path.getFileSystem(new Configuration());
return fs.mkdirs(path);
}
public static void deletePathIfExists(JobConf conf, String stepOutputPath) throws IOException {
Path path = new Path(stepOutputPath);
FileSystem fs = path.getFileSystem(conf);
if(fs.exists(path)) {
fs.delete(path, true);
}
}
    /**
     * Tag the BytesWritable with an integer at the END
     *
     * @param writable The BytesWritable to tag
     * @param tag The integer tag to append
     */
public static void appendTag(BytesWritable writable, int tag) {
int size = writable.getLength();
if(writable.getCapacity() < size + 4) {
// BytesWritable preserves old values
writable.setCapacity(size + 4);
}
ByteUtils.writeInt(writable.getBytes(), tag, size);
writable.setSize(size + 4);
}
    /**
     * Read and return the integer tag from the END of the BytesWritable. The
     * tag bytes are NOT removed.
     */
public static int readTag(BytesWritable readable) {
return ByteUtils.readInt(readable.getBytes(), readable.getLength() - 4);
}
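    // A sketch of the tagging round trip above, with hypothetical values. The
    // tag occupies the last four bytes and is left in place by readTag:
    //
    //   BytesWritable record = new BytesWritable("payload".getBytes());
    //   HadoopUtils.appendTag(record, 1);
    //   int tag = HadoopUtils.readTag(record); // 1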
    /**
     * Create default data for the given schema. This is needed by
     * mappers/reducers that try to handle multiple schemas.
     *
     * Outputs<br>
     * <br>
     * Map : a map holding the default value for each subtype <br>
     * List : an empty list <br>
     * JsonTypes : 0 for numeric types, "" for strings, and so on
     */
    @SuppressWarnings("unchecked")
    public static Object createDefaultData(Object typeSchema) {
if(typeSchema instanceof List<?>) {
ArrayList<Object> list = new ArrayList<Object>(0);
return list;
} else if(typeSchema instanceof Map<?, ?>) {
HashMap<String, Object> map = new HashMap<String, Object>();
for(Map.Entry<String, Object> typeEntry: ((Map<String, Object>) typeSchema).entrySet()) {
map.put(typeEntry.getKey(), createDefaultData(typeEntry.getValue()));
}
return map;
} else if(typeSchema instanceof JsonTypes) {
return createDefaultJsonData((JsonTypes) typeSchema);
}
throw new RuntimeException("Invlaid schema type:" + typeSchema);
}
private static Object createDefaultJsonData(JsonTypes type) {
if(JsonTypes.BOOLEAN.equals(type))
return false;
else if(JsonTypes.DATE.equals(type))
return new Date();
else if(JsonTypes.FLOAT32.equals(type) || JsonTypes.FLOAT64.equals(type)
|| JsonTypes.INT8.equals(type) || JsonTypes.INT16.equals(type)
|| JsonTypes.INT32.equals(type))
return 0;
else if(JsonTypes.BYTES.equals(type)) {
byte[] data = new byte[0];
return data;
} else if(JsonTypes.STRING.equals(type)) {
return "";
}
throw new RuntimeException("Invalid JsonType:" + type);
}
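    // A usage sketch for createDefaultData, assuming a hypothetical map-typed
    // schema with a string member and a numeric member:
    //
    //   Map<String, Object> schema = new HashMap<String, Object>();
    //   schema.put("name", JsonTypes.STRING);
    //   schema.put("count", JsonTypes.INT32);
    //   Object defaults = HadoopUtils.createDefaultData(schema);
    //   // defaults is a Map with "name" -> "" and "count" -> 0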
    /**
     * Looks for the latest (the alphabetically greatest) path contained in the
     * given directory that matches the specified regex pattern.
     *
     * @param fs The file system
     * @param directory The directory that will contain the versions
     * @param acceptRegex The regex pattern that candidate names must match
     * @return The latest matching path, or null if none is found
     * @throws IOException
     */
public static Path getLatestVersionedPath(FileSystem fs, Path directory, String acceptRegex)
throws IOException {
final String pattern = acceptRegex != null ? acceptRegex : "\\S+";
PathFilter filter = new PathFilter() {
@Override
public boolean accept(Path arg0) {
return !arg0.getName().startsWith("_") && Pattern.matches(pattern, arg0.getName());
}
};
FileStatus[] statuses = fs.listStatus(directory, filter);
if(statuses == null || statuses.length == 0) {
return null;
}
Arrays.sort(statuses);
return statuses[statuses.length - 1].getPath();
}
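    // A usage sketch, assuming hypothetical version directories named with
    // COMMON_FILE_DATE_PATTERN (e.g. /data/example/2009-06-01-00-00):
    //
    //   Path latest = HadoopUtils.getLatestVersionedPath(fs,
    //                                                    new Path("/data/example"),
    //                                                    COMMON_FILE_DATE_REGEX);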
    /**
     * Looks for the latest (the alphabetically greatest) path contained in the
     * given directory, using the default pattern "\\S+", which matches any
     * non-whitespace name.
     *
     * @param fs The file system
     * @param directory The directory that will contain the versions
     * @return The latest matching path, or null if none is found
     * @throws IOException
     */
public static Path getLatestVersionedPath(FileSystem fs, Path directory) throws IOException {
return getLatestVersionedPath(fs, directory, null);
}
    /**
     * Does the same thing as getLatestVersionedPath, but checks to see if the
     * directory name ends with #LATEST. If it doesn't, it just returns what
     * was passed in.
     *
     * @param fs The file system
     * @param directory The directory to sanitize
     * @param acceptRegex The regex pattern that candidate names must match
     * @return The resolved path
     * @throws IOException
     */
public static Path getSanitizedPath(FileSystem fs, Path directory, String acceptRegex)
throws IOException {
if(directory.getName().endsWith("#LATEST")) {
            // getParent() strips off the "#LATEST" component
return getLatestVersionedPath(fs, directory.getParent(), acceptRegex);
}
return directory;
}
public static Path getSanitizedPath(Path path) throws IOException {
return getSanitizedPath(path.getFileSystem(new Configuration()), path);
}
    /**
     * Does the same thing as getLatestVersionedPath, but checks to see if the
     * directory name ends with #LATEST. If it doesn't, it just returns what
     * was passed in.
     *
     * @param fs The file system
     * @param directory The directory to sanitize
     * @return The resolved path
     * @throws IOException
     */
public static Path getSanitizedPath(FileSystem fs, Path directory) throws IOException {
if(directory.getName().endsWith("#LATEST")) {
            // getParent() strips off the "#LATEST" component
return getLatestVersionedPath(fs, directory.getParent(), null);
}
return directory;
}
    /**
     * Cleans up the oldest (alphabetically least) paths in the given directory
     * that are accepted by the regex, keeping the given number of the most
     * recent versions.
     *
     * @param fs The file system
     * @param directory The directory that will contain the versions
     * @param acceptRegex The regex pattern that candidate names must match
     * @param backupNumber The number of versions to keep; anything older is
     *        deleted
     * @throws IOException
     */
public static void cleanupOlderVersions(FileSystem fs,
Path directory,
final String acceptRegex,
int backupNumber) throws IOException {
if(backupNumber < 1) {
logger.error("Number of versions must be 1 or greater");
return;
}
PathFilter filter = new PathFilter() {
@Override
public boolean accept(Path arg0) {
return !arg0.getName().startsWith("_")
&& Pattern.matches(acceptRegex, arg0.getName());
}
};
FileStatus[] statuses = fs.listStatus(directory, filter);
if(statuses == null) {
logger.info("No backup files found");
return;
}
Arrays.sort(statuses);
int lastIndex = statuses.length - backupNumber;
for(int i = 0; i < lastIndex; ++i) {
logger.info("Deleting " + statuses[i].getPath());
fs.delete(statuses[i].getPath(), true);
}
}
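    // For example, to keep only the three most recent versions under a
    // hypothetical directory of timestamped snapshots:
    //
    //   HadoopUtils.cleanupOlderVersions(fs,
    //                                    new Path("/data/example"),
    //                                    COMMON_FILE_DATE_REGEX,
    //                                    3);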
public static void cleanupOlderVersions(FileSystem fs, Path directory, int backupNumber)
throws IOException {
cleanupOlderVersions(fs, directory, "\\S+", backupNumber);
}
/**
     * Move the file from one place to another. Unlike the raw Hadoop API this
     * will throw an exception if it fails. Like the Hadoop API it will fail if
     * a file already exists at the destination.
*
* @param fs The filesystem
* @param from The source file to move
* @param to The destination location
* @throws IOException
*/
public static void move(FileSystem fs, Path from, Path to) throws IOException {
boolean success = fs.rename(from, to);
if(!success)
throw new RuntimeException("Failed to move " + from + " to " + to);
}
/**
     * Move the given file to the given location. Delete any existing file in
* that location. Use the temp directory to make the operation as
* transactional as possible. Throws an exception if the move fails.
*
* @param fs The filesystem
* @param from The source file
* @param to The destination file
* @param temp A temp directory to use
* @throws IOException
*/
public static void replaceFile(FileSystem fs, Path from, Path to, Path temp) throws IOException {
        fs.delete(temp, true);
        // only move the existing destination aside if it actually exists
        if(fs.exists(to))
            move(fs, to, temp);
try {
move(fs, from, to);
fs.delete(temp, true);
} catch(IOException e) {
// hmm something went wrong, attempt to restore
fs.rename(temp, to);
throw e;
}
}
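    // A usage sketch for replaceFile, with hypothetical paths. The temp
    // directory lets the old file be restored if the final move fails:
    //
    //   HadoopUtils.replaceFile(fs,
    //                           new Path("/data/example/current.new"),
    //                           new Path("/data/example/current"),
    //                           new Path("/tmp/example-replace"));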
}