Source Code of org.apache.hadoop.util.CopyFiles$CopyFilesMapper

/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.apache.hadoop.util;


import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.UTF8;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;


/**
 * A Map-reduce program to recursively copy directories between
 * different file-systems.
 *
 * @author Milind Bhandarkar
 */
public class CopyFiles extends MapReduceBase implements Reducer {
  
  /** Command-line synopsis printed by main when the arguments are invalid. */
  private static final String usage = "distcp <srcurl> <desturl> "+
          "[-dfs <namenode:port | local> ] [-jt <jobtracker:port | local>] " +
          "[-config <config-file.xml>]";
  
  /**
   * Bytes-per-map divisor (1L << 28 = 256 MB): main caps the number of maps
   * at totalBytes / MIN_BYTES_PER_MAP.
   */
  private static final long MIN_BYTES_PER_MAP = 1L << 28;
  /** Absolute upper bound on the number of map tasks requested by main. */
  private static final int MAX_NUM_MAPS = 10000;
  /**
   * Multiplied by the number of task trackers reported by the cluster to
   * obtain a further upper bound on the number of map tasks.
   */
  private static final int MAX_MAPS_PER_NODE = 10;
  /**
   * Mappper class for Copying files.
   */
  
  public static class CopyFilesMapper extends MapReduceBase implements Mapper {
    
    private int sizeBuf = 4096;
    private FileSystem srcFileSys = null;
    private FileSystem destFileSys = null;
    private Path srcPath = null;
    private Path destPath = null;
    private byte[] buffer = null;
    private static final long reportInterval = 1L << 25;
    private long bytesSinceLastReport = 0L;
    private long totalBytesCopied = 0L;
    
    private void copy(String src, Reporter reporter) throws IOException {
      // open source file
      Path srcFile = new Path(srcPath, src);
      FSDataInputStream in = srcFileSys.open(srcFile);
      
      // create directories to hold destination file and create destFile
      Path destFile = new Path(destPath, src);
      Path destParent = destFile.getParent();
      if (destParent != null) { destFileSys.mkdirs(destParent); }
      FSDataOutputStream out = destFileSys.create(destFile);
      
      // copy file
      while (true) {
        int nread = in.read(buffer);
        if (nread < 0) { break; }
        out.write(buffer, 0, nread);
        bytesSinceLastReport += nread;
        if (bytesSinceLastReport > reportInterval) {
            totalBytesCopied += bytesSinceLastReport;
            bytesSinceLastReport = 0L;
            reporter.setStatus("Total bytes copied: "+totalBytesCopied);
        }
      }
      
      in.close();
      out.close();
      // report at least once for each file
      totalBytesCopied += bytesSinceLastReport;
      bytesSinceLastReport = 0L;
      reporter.setStatus("Total bytes copied: "+totalBytesCopied);
    }
    
    /** Mapper configuration.
     * Extracts source and destination file system, as well as
     * top-level paths on source and destination directories.
     * Gets the named file systems, to be used later in map.
     */
    public void configure(JobConf job) {
      String srcfs = job.get("copy.src.fs", "local");
      String destfs = job.get("copy.dest.fs", "local");
      srcPath = new Path(job.get("copy.src.path", "/"));
      destPath = new Path(job.get("copy.dest.path", "/"));
      try {
        srcFileSys = FileSystem.getNamed(srcfs, job);
        destFileSys = FileSystem.getNamed(destfs, job);
      } catch (IOException ex) {
        throw new RuntimeException("Unable to get the named file system.", ex);
      }
      sizeBuf = job.getInt("copy.buf.size", 4096);
      buffer = new byte[sizeBuf];
    }
    
    /** Map method. Copies one file from source file system to destination.
     * @param key source file name
     * @param value not-used.
     * @param out not-used.
     */
    public void map(WritableComparable key,
        Writable val,
        OutputCollector out,
        Reporter reporter) throws IOException {
      String src = ((UTF8) key).toString();
      copy(src, reporter);
    }
    
    public void close() {
      // nothing
    }
  }
  
  /**
   * Reduce method. Intentionally empty: the body performs no work and
   * collects no output.
   * @param key not-used.
   * @param values not-used.
   * @param output not-used.
   * @param reporter not-used.
   */
  public void reduce(WritableComparable key,
      Iterator values,
      OutputCollector output,
      Reporter reporter) throws IOException {
    // nothing
  }
  
  private static String getFileSysName(URI url) {
    String fsname = url.getScheme();
    if ("dfs".equals(fsname)) {
      String host = url.getHost();
      int port = url.getPort();
      return (port==(-1)) ? host : (host+":"+port);
    } else {
      return "local";
    }
  }


  /**
   * Make a path relative with respect to a root path.
   * absPath is always assumed to descend from root.
   * Otherwise returned path is null.
   */
  private static Path makeRelative(Path root, Path absPath) {
    if (!absPath.isAbsolute()) { return absPath; }
    String sRoot = root.toString();
    String sPath = absPath.toString();
    Enumeration rootTokens = new StringTokenizer(sRoot, "/");
    ArrayList rList = Collections.list(rootTokens);
    Enumeration pathTokens = new StringTokenizer(sPath, "/");
    ArrayList pList = Collections.list(pathTokens);
    Iterator rIter = rList.iterator();
    Iterator pIter = pList.iterator();
    while (rIter.hasNext()) {
      String rElem = (String) rIter.next();
      String pElem = (String) pIter.next();
      if (!rElem.equals(pElem)) { return null; }
    }
    StringBuffer sb = new StringBuffer();
    while (pIter.hasNext()) {
      String pElem = (String) pIter.next();
      sb.append(pElem);
      if (pIter.hasNext()) { sb.append("/"); }
    }
    return new Path(sb.toString());
  }
  /**
   * This is the main driver for recursively copying directories
   * across file systems. It takes at least two cmdline parameters. A source
   * URL and a destination URL. It then essentially does an "ls -lR" on the
   * source URL, and writes the output in aa round-robin manner to all the map
   * input files. The mapper actually copies the files allotted to it. And
   * the reduce is empty.
   */
  public static void main(String[] args) throws IOException {


    Configuration conf = new Configuration();
    String srcPath = null;
    String destPath = null;
    
    for (int idx = 0; idx < args.length; idx++) {
        if ("-dfs".equals(args[idx])) {
            if (idx == (args.length-1)) {
                System.out.println(usage);
                return;
            }
            conf.set("fs.default.name", args[++idx]);
        } else if ("-jt".equals(args[idx])) {
            if (idx == (args.length-1)) {
                System.out.println(usage);
                return;
            }
            conf.set("mapred.job.tracker", args[++idx]);
        } else if ("-config".equals(args[idx])) {
            if (idx == (args.length-1)) {
                System.out.println(usage);
                return;
            }
            conf.addFinalResource(new Path(args[++idx]));
        } else {
            if (srcPath == null) {
                srcPath = args[idx];
            } else if (destPath == null) {
                destPath = args[idx];
            } else {
                System.out.println(usage);
                return;
            }
        }
    }
    
    // mandatory command-line parameters
    if (srcPath == null || destPath == null) {
        System.out.println(usage);
        return;
    }
    
    URI srcurl = null;
    URI desturl = null;
    try {
      srcurl = new URI(srcPath);
      desturl = new URI(destPath);
    } catch (URISyntaxException ex) {
      throw new RuntimeException("URL syntax error.", ex);
    }
    
    JobConf jobConf = new JobConf(conf, CopyFiles.class);
    jobConf.setJobName("copy-files");
    
    String srcFileSysName = getFileSysName(srcurl);
    String destFileSysName = getFileSysName(desturl);
    
    jobConf.set("copy.src.fs", srcFileSysName);
    jobConf.set("copy.dest.fs", destFileSysName);
    FileSystem srcfs;
   
    srcfs = FileSystem.getNamed(srcFileSysName, conf);
    FileSystem destfs = FileSystem.getNamed(destFileSysName, conf);
 
    srcPath = srcurl.getPath();
    if ("".equals(srcPath)) { srcPath = "/"; }
    destPath = desturl.getPath();
    if ("".equals(destPath)) { destPath = "/"; }
    
    boolean isFile = false;
    Path tmpPath = new Path(srcPath);
    Path rootPath = new Path(srcPath);
    if (srcfs.isFile(tmpPath)) {
      isFile = true;
      tmpPath = tmpPath.getParent();
      rootPath = rootPath.getParent();
      jobConf.set("copy.src.path", tmpPath.toString());
    } else {
      jobConf.set("copy.src.path", srcPath);
    }
    jobConf.set("copy.dest.path", destPath);
    
    if (!srcfs.exists(tmpPath)) {
      System.out.println(srcPath+" does not exist.");
      return;
    }
    
    // turn off speculative execution, because DFS doesn't handle
    // multiple writers to the same file.
    jobConf.setSpeculativeExecution(false);
    jobConf.setInputKeyClass(UTF8.class);
    jobConf.setInputValueClass(UTF8.class);
    jobConf.setInputFormat(SequenceFileInputFormat.class);
        
    jobConf.setOutputKeyClass(UTF8.class);
    jobConf.setOutputValueClass(UTF8.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);
    
    jobConf.setMapperClass(CopyFilesMapper.class);
    jobConf.setReducerClass(CopyFiles.class);
    
    jobConf.setNumReduceTasks(1);


    Path tmpDir = new Path("copy-files");
    Path inDir = new Path(tmpDir, "in");
    Path fakeOutDir = new Path(tmpDir, "out");
    FileSystem fileSys = FileSystem.get(jobConf);
    fileSys.delete(tmpDir);
    fileSys.mkdirs(inDir);
    
    
    jobConf.setInputPath(inDir);
    jobConf.setOutputPath(fakeOutDir);
    
    // create new sequence-files for holding paths
    ArrayList pathList = new ArrayList();
    ArrayList finalPathList = new ArrayList();
    pathList.add(new Path(srcPath));
    long totalBytes = 0;
    int part = 0;
    while(!pathList.isEmpty()) {
      Path top = (Path) pathList.remove(0);
      if (srcfs.isFile(top)) {
        totalBytes += srcfs.getLength(top);
        top = makeRelative(rootPath, top);
        finalPathList.add(top.toString());
      } else {
        Path[] paths = srcfs.listPaths(top);
        for (int idx = 0; idx < paths.length; idx++) {
          pathList.add(paths[idx]);
        }
      }
    }
    // ideal number of maps is one per file (if the map-launching overhead
    // were 0. It is limited by jobtrackers handling capacity, which lets say
    // is MAX_NUM_MAPS. It is also limited by MAX_MAPS_PER_NODE. Also for small
    // files it is better to determine number of maps by amount of data per map.
    
    int nFiles = finalPathList.size();
    int numMaps = nFiles;
    if (numMaps > MAX_NUM_MAPS) { numMaps = MAX_NUM_MAPS; }
    if (numMaps > (int) (totalBytes / MIN_BYTES_PER_MAP)) {
        numMaps = (int) (totalBytes / MIN_BYTES_PER_MAP);
    }
    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int tmpMaps = cluster.getTaskTrackers() * MAX_MAPS_PER_NODE;
    if (numMaps > tmpMaps) { numMaps = tmpMaps; }
    if (numMaps == 0) { numMaps = 1; }
    jobConf.setNumMapTasks(numMaps);
    
    for(int idx=0; idx < numMaps; ++idx) {
      Path file = new Path(inDir, "part"+idx);
      SequenceFile.Writer writer = new SequenceFile.Writer(fileSys, file, UTF8.class, UTF8.class);
      for (int ipath = idx; ipath < nFiles; ipath += numMaps) {
        String path = (String) finalPathList.get(ipath);
        writer.append(new UTF8(path), new UTF8(""));
      }
      writer.close();
    }
    finalPathList = null;
    
    try {
      JobClient.runJob(jobConf);
    } finally {
      fileSys.delete(tmpDir);
    }
  
  }
}
Source Code of org.apache.hadoop.util.CopyFiles$CopyFilesMapper

Related Classes of org.apache.hadoop.util.CopyFiles$CopyFilesMapper