/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.util;
import java.io.BufferedReader;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumSet;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Stack;
import java.util.StringTokenizer;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.InvalidInputException;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileRecordReader;
/**
* A Map-reduce program to recursively copy directories between
* different file-systems.
*/
public class CopyFiles implements Tool {
/** Diagnostics log shared by the tool and its nested mapper/input format. */
private static final Log LOG = LogFactory.getLog(CopyFiles.class);
/** Usage text printed whenever the command line cannot be parsed. */
private static final String usage =
"distcp [OPTIONS] <srcurl>* <desturl>" +
"\n\nOPTIONS:" +
"\n-p Preserve status" +
"\n-i Ignore failures" +
"\n-log <logdir> Write logs to <logdir>" +
"\n-overwrite Overwrite destination" +
"\n-update Overwrite if src size different from dst size" +
"\n-f <urilist_uri> Use list at <urilist_uri> as src list" +
"\n\nNOTE: if -overwrite or -update are set, each source URI is " +
"\n interpreted as an isomorphic update to an existing directory." +
"\nFor example:" +
"\nhadoop distcp -p -update \"hdfs://A:8020/user/foo/bar\" " +
"\"hdfs://B:8020/user/foo/baz\"\n" +
"\n would update all descendants of 'baz' also in 'bar'; it would " +
"\n *not* update /user/foo/baz/bar\n";
// Target number of bytes handed to each map task (256 MB); also used as
// the byte threshold between sync markers in the src file list.
private static final long BYTES_PER_MAP = 256 * 1024 * 1024;
// Upper bound on concurrent maps scheduled per cluster node.
private static final int MAX_MAPS_PER_NODE = 20;
// Sync the src-list SequenceFile at least every SYNC_FILE_MAX entries so
// splits can start on record boundaries.
private static final int SYNC_FILE_MAX = 10;
// Job counters reported by FSCopyFilesMapper.
static enum Counter { COPY, SKIP, FAIL, BYTESCOPIED, BYTESEXPECTED }
// Job configuration; populated via setConf (Tool/Configurable contract).
private JobConf conf;
/**
 * Set the configuration used by the tool, wrapping it in a {@link JobConf}
 * when the caller did not already supply one.
 * @param conf configuration to adopt
 */
public void setConf(Configuration conf) {
  this.conf = (conf instanceof JobConf) ? (JobConf) conf : new JobConf(conf);
}
/** @return the current job configuration (Tool/Configurable contract). */
public Configuration getConf() {
return conf;
}
/** @deprecated Use {@link #CopyFiles(Configuration)} so conf is initialized. */
@Deprecated
public CopyFiles() { }
/** Construct the tool with the given configuration. */
public CopyFiles(Configuration conf) {
setConf(conf);
}
/**
* An input/output pair of filenames.
*/
static class FilePair implements Writable {
// Source file metadata; never null — readFields deserializes into it in place.
FileStatus input = new FileStatus();
// Destination path the source should be copied to.
Path output;
FilePair() { }
FilePair(FileStatus input, Path output) {
this.input = input;
this.output = output;
}
/** Deserialize: the FileStatus first, then the dst path as a Text string.
 *  Field order must mirror {@link #write(DataOutput)} exactly. */
public void readFields(DataInput in) throws IOException {
input.readFields(in);
output = new Path(Text.readString(in));
}
/** Serialize in the same field order readFields expects. */
public void write(DataOutput out) throws IOException {
input.write(out);
Text.writeString(out, output.toString());
}
}
/**
* InputFormat of a distcp job responsible for generating splits of the src
* file list.
*/
static class CopyInputFormat implements InputFormat {

  /**
   * Does nothing; the metadata consumed by getSplits is validated there.
   */
  public void validateInput(JobConf job) throws IOException { }

  /**
   * Produce splits such that each is no greater than the quotient of the
   * total size and the number of splits requested.
   * @param job The handle to the JobConf object
   * @param numSplits Number of splits requested
   * @return one FileSplit per chunk of the src file list
   * @throws IOException if the src file list cannot be read
   */
  public InputSplit[] getSplits(JobConf job, int numSplits)
      throws IOException {
    int cnfiles = job.getInt("distcp.file.count", -1);
    long cbsize = job.getLong("distcp.total.size", -1);
    String srcfilelist = job.get("distcp.src.list", "");
    if (cnfiles < 0 || cbsize < 0 || "".equals(srcfilelist)) {
      throw new RuntimeException("Invalid metadata: #files(" + cnfiles +
                                 ") total_size(" + cbsize + ") listuri(" +
                                 srcfilelist + ")");
    }
    Path src = new Path(srcfilelist);
    FileSystem fs = src.getFileSystem(job);
    FileStatus srcst = fs.getFileStatus(src);

    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    LongWritable key = new LongWritable();
    FilePair value = new FilePair();
    // BUGFIX: guard against a non-positive split request, which previously
    // caused an ArithmeticException (division by zero).
    final long targetsize = cbsize / Math.max(numSplits, 1);
    long pos = 0L;
    long last = 0L;
    long acc = 0L;
    long cbrem = srcst.getLen();
    // BUGFIX: the reader was never closed, leaking a stream on every call.
    SequenceFile.Reader sl = new SequenceFile.Reader(fs, src, job);
    try {
      for (; sl.next(key, value); last = sl.getPosition()) {
        // if adding this split would put this split past the target size,
        // cut the last split and put this next file in the next split.
        if (acc + key.get() > targetsize && acc != 0) {
          long splitsize = last - pos;
          splits.add(new FileSplit(src, pos, splitsize, job));
          cbrem -= splitsize;
          pos = last;
          acc = 0L;
        }
        acc += key.get();
      }
    } finally {
      sl.close();
    }
    // Whatever remains (including the final accumulated run) is one split.
    if (cbrem != 0) {
      splits.add(new FileSplit(src, pos, cbrem, job));
    }
    return splits.toArray(new FileSplit[splits.size()]);
  }

  /**
   * Returns a reader for this split of the src file list.
   */
  public RecordReader getRecordReader(InputSplit split, JobConf job,
      Reporter reporter) throws IOException {
    return new SequenceFileRecordReader(job, (FileSplit)split);
  }
}
/**
 * Return true if dst should be replaced by src and the update flag is set.
 * Right now, this merely checks that the src and dst len are not equal. This
 * should be improved on once modification times, CRCs, etc. can
 * be meaningful in this context.
 * @param src status of the source file
 * @param dst status of the already-existing destination file
 * @return true when lengths differ, i.e. the copy should proceed
 */
private static boolean needsUpdate(FileStatus src, FileStatus dst) {
return src.getLen() != dst.getLen();
}
/**
* FSCopyFilesMapper: The mapper for copying files between FileSystems.
*/
public static class FSCopyFilesMapper
implements Mapper<LongWritable, FilePair, WritableComparable, Text> {
// config
// Copy buffer size in bytes; overridden by "copy.buf.size" in configure().
private int sizeBuf = 128 * 1024;
// FileSystem of the destination, resolved from "copy.dest.path".
private FileSystem destFileSys = null;
// When true, copy failures are logged but do not fail the task (see close()).
private boolean ignoreReadFailures;
// When true, source replication/block size are preserved on create.
private boolean preserve_status;
// Overwrite-always policy; mutually exclusive with update (see configure()).
private boolean overwrite;
// Overwrite-if-size-differs policy ("distcp.overwrite.ifnewer").
private boolean update;
// Top-level destination path.
private Path destPath = null;
// Reusable copy buffer, allocated once in configure().
private byte[] buffer = null;
// Job configuration saved by configure() for use in map()/copy().
private JobConf job;
// stats
// Percent-complete formatter for status strings.
// NOTE(review): DecimalFormat is not thread-safe; assumed safe because each
// map task runs single-threaded — confirm before sharing more widely.
private static final DecimalFormat pcntfmt = new DecimalFormat("0.00");
private int failcount = 0;
private int skipcount = 0;
private int copycount = 0;
/** Push a cumulative copied/skipped/failed summary into the task status. */
private void updateStatus(Reporter reporter) {
  final StringBuilder status = new StringBuilder("Copied: ");
  status.append(copycount).append(" Skipped: ").append(skipcount)
        .append(" Failed: ").append(failcount);
  reporter.setStatus(status.toString());
}
/**
 * Copy a file to a destination.
 * @param srcstat src path and metadata
 * @param dstpath dst path
 * @param outc collector for SKIP/short-copy log records
 * @param reporter Hadoop progress/counter reporter
 * @throws IOException if the copy cannot be completed
 */
private void copy(FileStatus srcstat, Path dstpath,
    OutputCollector<WritableComparable, Text> outc, Reporter reporter)
    throws IOException {
  int totfiles = job.getInt("distcp.file.count", -1);
  assert totfiles >= 0 : "Invalid file count " + totfiles;

  // if a directory, ensure created even if empty
  if (srcstat.isDir()) {
    if (!destFileSys.mkdirs(dstpath)) {
      // BUGFIX: the message previously read "Failed to create/path" (no space)
      throw new IOException("Failed to create " + dstpath);
    }
    // TODO: when modification times can be set, directories should be
    // emitted to reducers so they might be preserved. Also, mkdirs does
    // not currently return an error when the directory already exists;
    // if this changes, all directory work might as well be done in reduce
    return;
  }

  Path destParent = dstpath.getParent();
  if (totfiles > 1) {
    // create directories to hold destination file
    if (destParent != null && !destFileSys.mkdirs(destParent)) {
      throw new IOException("mkdirs failed to create " + destParent);
    }
  } else {
    // Copying a single file; use dst path provided by user as destination
    // rather than destination directory
    dstpath = destParent;
  }

  long cbcopied = 0L;
  FSDataInputStream in = null;
  FSDataOutputStream out = null;
  try {
    // Honor the skip/overwrite/update policy for an existing destination.
    if (destFileSys.exists(dstpath)
        && (!overwrite && !(update
            && needsUpdate(srcstat, destFileSys.getFileStatus(dstpath))))) {
      outc.collect(null, new Text("SKIP: " + srcstat.getPath()));
      ++skipcount;
      reporter.incrCounter(Counter.SKIP, 1);
      updateStatus(reporter);
      return;
    }
    // open src file
    in = srcstat.getPath().getFileSystem(job).open(srcstat.getPath());
    final long cblen = srcstat.getLen();
    reporter.incrCounter(Counter.BYTESEXPECTED, cblen);
    // open dst file, preserving replication/block size when requested
    out = preserve_status
        ? destFileSys.create(dstpath, true, sizeBuf, srcstat.getReplication(),
            srcstat.getBlockSize(), reporter)
        : destFileSys.create(dstpath, reporter);
    // copy file
    int cbread;
    while ((cbread = in.read(buffer)) >= 0) {
      out.write(buffer, 0, cbread);
      cbcopied += cbread;
      reporter.setStatus(pcntfmt.format(100.0 * cbcopied / cblen) +
          " " + dstpath + " [ " +
          StringUtils.humanReadableInt(cbcopied) + " / " +
          StringUtils.humanReadableInt(cblen) + " ]");
    }
    // Flag (but do not fail) short copies; the byte counters stay honest.
    if (cbcopied != cblen) {
      final String badlen = "ERROR? copied " + cbcopied + " bytes (" +
          StringUtils.humanReadableInt(cbcopied) + ") expected " +
          cblen + " bytes (" + StringUtils.humanReadableInt(cblen) +
          ") from " + srcstat.getPath();
      LOG.warn(badlen);
      outc.collect(null, new Text(badlen));
    }
  } finally {
    // BUGFIX: close the output even when closing the input throws; the
    // original skipped out.close() if in.close() failed, leaking the stream.
    try {
      if (in != null)
        in.close();
    } finally {
      if (out != null)
        out.close();
    }
  }
  // report at least once for each file
  ++copycount;
  reporter.incrCounter(Counter.BYTESCOPIED, cbcopied);
  reporter.incrCounter(Counter.COPY, 1);
  updateStatus(reporter);
}
/** Mapper configuration.
 * Caches the job conf, resolves the destination file system and path,
 * allocates the copy buffer, and latches the overwrite/update/preserve
 * policies for later use in map().
 */
public void configure(JobConf jconf)
{
  this.job = jconf;
  destPath = new Path(jconf.get("copy.dest.path", "/"));
  try {
    destFileSys = destPath.getFileSystem(jconf);
  } catch (IOException ex) {
    throw new RuntimeException("Unable to get the named file system.", ex);
  }
  sizeBuf = jconf.getInt("copy.buf.size", 128 * 1024);
  buffer = new byte[sizeBuf];
  ignoreReadFailures = jconf.getBoolean("distcp.ignore.read.failures", false);
  preserve_status = jconf.getBoolean("distcp.preserve.status.info", false);
  // update and overwrite are mutually exclusive; update wins.
  update = jconf.getBoolean("distcp.overwrite.ifnewer", false);
  overwrite = !update && jconf.getBoolean("distcp.overwrite.always", false);
}
/** Map method. Copies one file from source file system to destination.
 * On failure, logs the error, bumps the FAIL counter, and makes a
 * best-effort attempt to delete the partial destination file.
 * @param key src len (unused here; consumed by the split logic)
 * @param value FilePair (FileStatus src, Path dst)
 * @param out Log of failed copies
 * @param reporter progress/counter reporter
 */
public void map(LongWritable key,
FilePair value,
OutputCollector<WritableComparable, Text> out,
Reporter reporter) throws IOException {
FileStatus srcstat = value.input;
Path dstpath = value.output;
try {
copy(srcstat, dstpath, out, reporter);
} catch (IOException e) {
// Record the failure in counters, status, and the job log output.
++failcount;
reporter.incrCounter(Counter.FAIL, 1);
updateStatus(reporter);
final String sfailure = "FAIL " + dstpath + " : " +
StringUtils.stringifyException(e);
out.collect(null, new Text(sfailure));
LOG.info(sfailure);
// Best-effort cleanup of a possibly half-written destination file:
// retry the delete up to three times, pausing between attempts.
try {
for (int i = 0; i < 3; ++i) {
try {
if (destFileSys.delete(dstpath))
break;
} catch (Throwable ex) {
// ignore, we are just cleaning up
LOG.debug("Ignoring cleanup exception", ex);
}
// update status, so we don't get timed out
updateStatus(reporter);
Thread.sleep(3 * 1000);
}
} catch (InterruptedException inte) {
// Surface the interrupt as the task failure cause.
throw (IOException)new IOException().initCause(inte);
}
} finally {
updateStatus(reporter);
}
}
/**
 * Fail the task at close time if any copy failed and failures are not
 * being ignored (-i).
 */
public void close() throws IOException {
  if (failcount != 0 && !ignoreReadFailures) {
    throw new IOException("Copied: " + copycount + " Skipped: " + skipcount +
        " Failed: " + failcount);
  }
}
}
/**
 * Read a newline-separated list of URIs from srcList and return them as
 * Paths, in file order.
 * @param conf configuration used to resolve srcList's file system
 * @param srcList path of the list file
 * @return one Path per line of the file (blank lines become Path(""),
 *         matching the original behavior)
 * @throws IOException if the list cannot be opened or read
 */
private static List<Path> fetchFileList(Configuration conf, Path srcList)
    throws IOException {
  List<Path> result = new ArrayList<Path>();
  FileSystem fs = srcList.getFileSystem(conf);
  BufferedReader input = null;
  try {
    // BUGFIX: open the stream inside the guarded region so the raw handle
    // cannot leak if reader construction fails; closing the reader closes
    // the underlying stream.
    input = new BufferedReader(new InputStreamReader(fs.open(srcList)));
    String line;
    while ((line = input.readLine()) != null) {
      result.add(new Path(line));
    }
  } finally {
    if (input != null) {
      input.close();
    }
  }
  return result;
}
/**
 * Single-source copy entry point kept for older callers.
 * @deprecated use {@link #copy(Configuration, List, Path, Path, EnumSet)}
 */
@Deprecated
public static void copy(Configuration conf, String srcPath,
                        String destPath, Path logPath,
                        boolean srcAsList, boolean ignoreReadFailures)
    throws IOException {
  final Path src = new Path(srcPath);
  final List<Path> sources = new ArrayList<Path>();
  if (srcAsList) {
    // srcPath names a file that lists the real sources
    sources.addAll(fetchFileList(conf, src));
  } else {
    sources.add(src);
  }
  final EnumSet<cpOpts> flags = EnumSet.noneOf(cpOpts.class);
  if (ignoreReadFailures) {
    flags.add(cpOpts.IGNORE_READ_FAILURES);
  }
  copy(conf, sources, new Path(destPath), logPath, flags);
}
/**
 * Driver to copy srcPath to destPath depending on required protocol.
 * Validates that every source exists, then runs the distcp map-only job,
 * cleaning up the temporary job directory afterwards.
 * @param srcPaths list of source paths
 * @param destPath Destination path
 * @param logPath Log output directory
 * @param flags Command-line flags
 * @throws InvalidInputException if any source does not exist
 */
public static void copy(Configuration conf, List<Path> srcPaths,
                        Path destPath, Path logPath,
                        EnumSet<cpOpts> flags) throws IOException {
  // Job configuration
  final JobConf job = new JobConf(conf, CopyFiles.class);
  job.setJobName("distcp");

  // Sanity check: every source must exist before anything is launched.
  final List<IOException> missing = new ArrayList<IOException>();
  for (Path p : srcPaths) {
    if (!p.getFileSystem(conf).exists(p)) {
      missing.add(new IOException("Input source " + p + " does not exist."));
    }
  }
  if (!missing.isEmpty()) {
    throw new InvalidInputException(missing);
  }

  // Initialize the mapper, run, and always clean up the temp job dir.
  try {
    setup(conf, job, srcPaths, destPath, logPath, flags);
    JobClient.runJob(job);
  } finally {
    cleanup(conf, job);
  }
}
/**
 * Command-line option flags. (Lowercase name kept for source compatibility
 * with existing callers; renaming would break the public API.)
 */
enum cpOpts { IGNORE_READ_FAILURES,
PRESERVE_STATUS,
OVERWRITE,
UPDATE }
/**
 * This is the main driver for recursively copying directories
 * across file systems. It takes at least two cmdline parameters. A source
 * URL and a destination URL. It then essentially does an "ls -lR" on the
 * source URL, and writes the output in a round-robin manner to all the map
 * input files. The mapper actually copies the files allotted to it. The
 * reduce is empty.
 * @param args command-line arguments; see {@code usage} for the grammar
 * @return 0 on success, -1 on a usage error or a failed copy
 */
public int run(String[] args) throws Exception {
  List<Path> srcPath = new ArrayList<Path>();
  Path destPath = null;
  Path logPath = null;
  EnumSet<cpOpts> flags = EnumSet.noneOf(cpOpts.class);

  for (int idx = 0; idx < args.length; idx++) {
    if ("-i".equals(args[idx])) {
      flags.add(cpOpts.IGNORE_READ_FAILURES);
    } else if ("-p".equals(args[idx])) {
      flags.add(cpOpts.PRESERVE_STATUS);
    } else if ("-overwrite".equals(args[idx])) {
      flags.add(cpOpts.OVERWRITE);
    } else if ("-update".equals(args[idx])) {
      flags.add(cpOpts.UPDATE);
    } else if ("-f".equals(args[idx])) {
      if (++idx == args.length) {
        System.out.println("urilist_uri not specified");
        System.out.println(usage);
        return -1;
      }
      srcPath.addAll(fetchFileList(conf, new Path(args[idx])));
    } else if ("-log".equals(args[idx])) {
      if (++idx == args.length) {
        System.out.println("logdir not specified");
        System.out.println(usage);
        return -1;
      }
      logPath = new Path(args[idx]);
    // BUGFIX: check the length first so an empty-string argument cannot
    // throw IndexOutOfBoundsException from codePointAt(0).
    } else if (0 < args[idx].length() && '-' == args[idx].codePointAt(0)) {
      System.out.println("Invalid switch " + args[idx]);
      System.out.println(usage);
      ToolRunner.printGenericCommandUsage(System.out);
      return -1;
    } else if (idx == args.length -1) {
      // the final positional argument is the destination
      destPath = new Path(args[idx]);
    } else {
      srcPath.add(new Path(args[idx]));
    }
  }
  // mandatory command-line parameters
  if (srcPath.isEmpty() || destPath == null) {
    System.out.println("Missing " + (destPath == null ? "dst path" : "src"));
    System.out.println(usage);
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }
  // incompatible command-line flags
  if (flags.contains(cpOpts.OVERWRITE) && flags.contains(cpOpts.UPDATE)) {
    System.out.println("Conflicting overwrite policies");
    System.out.println(usage);
    return -1;
  }
  try {
    copy(conf, srcPath, destPath, logPath, flags);
  } catch (Exception e) {
    System.err.println("With failures, global counters are inaccurate; " +
        "consider running with -i");
    System.err.println("Copy failed: " + StringUtils.stringifyException(e));
    return -1;
  }
  return 0;
}
/** CLI entry point: run distcp through ToolRunner and exit with its status. */
public static void main(String[] args) throws Exception {
  final JobConf jobConf = new JobConf(CopyFiles.class);
  final CopyFiles distcp = new CopyFiles(jobConf);
  System.exit(ToolRunner.run(distcp, args));
}
/**
 * Make a path relative with respect to a root path.
 * absPath is always assumed to descend from root.
 * Otherwise returned path is null.
 * @param root the ancestor path
 * @param absPath the absolute path to relativize (returned unchanged if
 *        not absolute)
 * @return the path of absPath relative to root, or null if root is not a
 *         prefix of absPath
 */
public static Path makeRelative(Path root, Path absPath) {
  if (!absPath.isAbsolute()) { return absPath; }
  String sRoot = root.toUri().getPath();
  String sPath = absPath.toUri().getPath();
  List<String> rList = new ArrayList<String>();
  for (StringTokenizer t = new StringTokenizer(sRoot, "/"); t.hasMoreTokens();) {
    rList.add(t.nextToken());
  }
  List<String> pList = new ArrayList<String>();
  for (StringTokenizer t = new StringTokenizer(sPath, "/"); t.hasMoreTokens();) {
    pList.add(t.nextToken());
  }
  // Every component of root must prefix absPath.
  // BUGFIX: the original called pIter.next() unconditionally, throwing
  // NoSuchElementException (instead of returning the documented null) when
  // root had more components than absPath.
  Iterator<String> pIter = pList.iterator();
  for (String rElem : rList) {
    if (!pIter.hasNext() || !rElem.equals(pIter.next())) {
      return null;
    }
  }
  // Join the remaining components with '/'.
  StringBuilder sb = new StringBuilder();
  while (pIter.hasNext()) {
    sb.append(pIter.next());
    if (pIter.hasNext()) { sb.append('/'); }
  }
  return new Path(sb.toString());
}
/**
 * Calculate how many maps to run.
 * Number of maps is bounded by a minimum of the cumulative size of the copy /
 * BYTES_PER_MAP and at most MAX_MAPS_PER_NODE * nodes in the
 * cluster.
 * @param totalBytes Count of total bytes for job
 * @param numNodes the number of nodes in cluster
 * @return Count of maps to run.
 */
private static int getMapCount(long totalBytes, int numNodes) {
  final int bySize = (int) (totalBytes / BYTES_PER_MAP);
  final int byCluster = numNodes * MAX_MAPS_PER_NODE;
  // never schedule fewer than one map
  return Math.max(Math.min(bySize, byCluster), 1);
}
/**
 * Delete the temporary dir containing the src file list.
 * @param conf The dfs/mapred configuration
 * @param jobConf The handle to the jobConf object
 * @throws IOException if the job directory cannot be removed
 */
private static void cleanup(Configuration conf, JobConf jobConf)
    throws IOException {
  // Clean up the job directory recorded by setup().
  // BUGFIX: the key was misspelled "distdp.job.dir", so the value written
  // by setup() under "distcp.job.dir" was never found and the temporary
  // directory (src file list) was never deleted.
  String jobDirName = jobConf.get("distcp.job.dir");
  if (jobDirName != null) {
    Path jobDirectory = new Path(jobDirName);
    FileSystem fs = jobDirectory.getFileSystem(jobConf);
    FileUtil.fullyDelete(fs, jobDirectory);
  }
}
/**
 * Initialize DFSCopyFileMapper specific job-configuration.
 * Writes the src file list (a SequenceFile of src-len / FilePair records)
 * into a per-run temporary job directory, and records the file count and
 * total byte size for split generation and map-count sizing.
 * @param conf : The dfs/mapred configuration.
 * @param jobConf : The handle to the jobConf object to be initialized.
 * @param srcPaths : The source URIs.
 * @param destPath : The destination URI.
 * @param logPath : Log output directory (a default under destPath is
 *        chosen when null).
 * @param flags : Command-line flags
 */
private static void setup(Configuration conf, JobConf jobConf,
List<Path> srcPaths, Path destPath,
Path logPath, EnumSet<cpOpts> flags)
throws IOException {
boolean update;
boolean overwrite;
jobConf.set("copy.dest.path", destPath.toUri().toString());
// turn off speculative execution, because DFS doesn't handle
// multiple writers to the same file.
jobConf.setSpeculativeExecution(false);
jobConf.setInputFormat(CopyInputFormat.class);
jobConf.setOutputKeyClass(Text.class);
jobConf.setOutputValueClass(Text.class);
jobConf.setMapperClass(FSCopyFilesMapper.class);
// map-only job: the mapper does all the copying
jobConf.setNumReduceTasks(0);
jobConf.setBoolean("distcp.ignore.read.failures",
flags.contains(cpOpts.IGNORE_READ_FAILURES));
jobConf.setBoolean("distcp.preserve.status.info",
flags.contains(cpOpts.PRESERVE_STATUS));
// update takes precedence over overwrite (they are mutually exclusive)
jobConf.setBoolean("distcp.overwrite.ifnewer",
update = flags.contains(cpOpts.UPDATE));
jobConf.setBoolean("distcp.overwrite.always",
overwrite = !update && flags.contains(cpOpts.OVERWRITE));
// per-run random id keeps concurrent distcp runs from colliding
Random r = new Random();
String randomId = Integer.toString(r.nextInt(Integer.MAX_VALUE), 36);
Path jobDirectory = new Path(jobConf.getSystemDir(), "distcp_" + randomId);
jobConf.set("distcp.job.dir", jobDirectory.toString());
Path srcfilelist = new Path(jobDirectory, "_distcp_src_files");
jobConf.set("distcp.src.list", srcfilelist.toString());
// default logPath
FileSystem dstfs = destPath.getFileSystem(conf);
if (logPath == null) {
String filename = "_distcp_logs_" + randomId;
if (!dstfs.exists(destPath) || !dstfs.getFileStatus(destPath).isDir()) {
// destination is a file (or absent): put logs next to it
Path parent = destPath.getParent();
dstfs.mkdirs(parent);
logPath = new Path(parent, filename);
} else {
logPath = new Path(destPath, filename);
}
}
jobConf.setOutputPath(logPath);
// create src list
SequenceFile.Writer writer = SequenceFile.createWriter(
jobDirectory.getFileSystem(jobConf), jobConf, srcfilelist,
LongWritable.class, FilePair.class,
SequenceFile.CompressionType.NONE);
int cnfiles = 0;
long cbsize = 0L;
try {
// handle the case where the destination directory doesn't exist
// and we've only a single src directory OR we're updating/overwriting
// the contents of the destination directory.
final boolean special_case =
(srcPaths.size() == 1 && !dstfs.exists(destPath))
|| update || overwrite;
int cnsyncf = 0;
long cbsyncs = 0L;
for (Path p : srcPaths) {
Path root = p.getParent();
FileSystem fs = p.getFileSystem(conf);
// in the special case, dst paths are made relative to p itself
if (special_case && fs.getFileStatus(p).isDir()) {
root = p;
}
// iterative depth-first walk of the source tree
Stack<Path> pathstack = new Stack<Path>();
pathstack.push(p);
while (!pathstack.empty()) {
for (FileStatus stat : fs.listStatus(pathstack.pop())) {
if (stat.isDir()) {
pathstack.push(stat.getPath());
} else {
++cnsyncf;
cbsyncs += stat.getLen();
++cnfiles;
cbsize += stat.getLen();
}
// periodic sync markers let getSplits cut on record boundaries
if (cnsyncf > SYNC_FILE_MAX || cbsyncs > BYTES_PER_MAP) {
writer.sync();
cnsyncf = 0;
cbsyncs = 0L;
}
// key = src len (0 for dirs), value = (src status, dst path)
writer.append(new LongWritable(stat.isDir() ? 0 : stat.getLen()),
new FilePair(stat, new Path(destPath,
makeRelative(root, stat.getPath()))));
}
}
}
} finally {
writer.close();
}
// create dest path dir if copying > 1 file
// NOTE(review): message below lacks a space after "create" — fix when
// this string is next touched.
if (cnfiles > 1 && !dstfs.mkdirs(destPath)) {
throw new IOException("Failed to create" + destPath);
}
jobConf.setInt("distcp.file.count", cnfiles);
jobConf.setLong("distcp.total.size", cbsize);
// size the map count from total bytes and cluster width
JobClient client = new JobClient(jobConf);
jobConf.setNumMapTasks(getMapCount(cbsize,
client.getClusterStatus().getTaskTrackers()));
}
}