List<Path> dirs = asList(status.getPath());
Text key = new Text();
Text value = new Text();
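/*
 * Stage the bucket-to-file mapping in a block-compressed SequenceFile: each record
 * appended below maps a crush output file (bucket) id to one input file path.
 */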
Writer writer = SequenceFile.createWriter(fs, job, bucketFiles, Text.class, Text.class, CompressionType.BLOCK);
int numPartitions = Integer.parseInt(job.get("mapred.reduce.tasks"));
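/*
 * The partition bucketer spreads the crush output files across at most numPartitions
 * reducer partitions; its buckets become the partition map written at the end.
 */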
Bucketer partitionBucketer = new Bucketer(numPartitions, 0, false);
partitionBucketer.reset("partition-map");
jobCounters = new Counters();
try {
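    /*
     * Breadth-first walk of the input tree: dirs holds the directories at the current
     * depth, and subdirectories discovered along the way are queued in nextLevel.
     */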
    while (!dirs.isEmpty()) {
        List<Path> nextLevel = new LinkedList<Path>();
        for (Path dir : dirs) {
            jobCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
            print(Verbosity.INFO, "\n\n" + dir.toUri().getPath());
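            /*
             * List the directory, dropping anything that matches the ignore pattern (when
             * one was configured).
             */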
            FileStatus[] contents = fs.listStatus(dir, new PathFilter() {
                @Override
                public boolean accept(Path testPath) {
                    if (ignoredFiles == null) return true;
                    ignoredFiles.reset(testPath.toUri().getPath());
                    return !ignoredFiles.matches();
                }
            });
            if (contents == null || contents.length == 0) {
                print(Verbosity.INFO, " is empty");
                jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
            } else {
                List<FileStatus> crushables = new ArrayList<FileStatus>(contents.length);
                Set<String> uncrushedFiles = new HashSet<String>(contents.length);
                long crushableBytes = 0;
                /*
                 * Queue subdirectories for subsequent inspection and examine the files in
                 * this directory.
                 */
                for (FileStatus content : contents) {
                    Path path = content.getPath();
                    if (content.isDir()) {
                        nextLevel.add(path);
                    } else {
                        boolean changed = uncrushedFiles.add(path.toUri().getPath());
                        assert changed : path.toUri().getPath();
                        long fileLength = content.getLen();
                        if (fileLength <= maxEligibleSize) {
                            crushables.add(content);
                            crushableBytes += fileLength;
                        }
                    }
                }
                /*
                 * We found a directory with data in it. Make sure we know how to name the
                 * crush output file, then add this directory's files to the found-files
                 * counter.
                 */
                if (!uncrushedFiles.isEmpty()) {
                    if (-1 == findMatcher(dir)) {
                        throw new IllegalArgumentException("Could not find matching regex for directory: " + dir);
                    }
                    jobCounters.incrCounter(MapperCounter.FILES_FOUND, uncrushedFiles.size());
                }
                if (0 == crushableBytes) {
                    print(Verbosity.INFO, " has no crushable files");
                    jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                } else {
                    /*
                     * We found files to consider for crushing.
                     */
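                    /*
                     * Number of DFS blocks needed to hold the crushable bytes: an integer
                     * ceiling of crushableBytes / dfsBlockSize.
                     */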
                    long nBlocks = crushableBytes / dfsBlockSize;
                    if (nBlocks * dfsBlockSize != crushableBytes) {
                        nBlocks++;
                    }
                    /*
                     * maxFileBlocks will be huge in v1 mode, which will lead to one bucket
                     * per directory.
                     */
                    long dirBuckets = nBlocks / maxFileBlocks;
                    if (dirBuckets * maxFileBlocks != nBlocks) {
                        dirBuckets++;
                    }
                    if (dirBuckets > Integer.MAX_VALUE) {
                        throw new AssertionError("Too many buckets: " + dirBuckets);
                    }
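                    /*
                     * Pack this directory's small files into dirBuckets buckets, each holding
                     * roughly maxFileBlocks blocks' worth of data.
                     */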
                    Bucketer directoryBucketer = new Bucketer((int) dirBuckets, excludeSingleFileDirs);
                    directoryBucketer.reset(getPathPart(dir));
                    for (FileStatus file : crushables) {
                        directoryBucketer.add(new FileStatusHasSize(file));
                    }
                    List<Bucket> crushFiles = directoryBucketer.createBuckets();
                    if (crushFiles.isEmpty()) {
                        jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                    } else {
                        nBuckets += crushFiles.size();
                        jobCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);
                        print(Verbosity.INFO, " => " + crushFiles.size() + " output files");
                        /*
                         * Write out the mapping between a bucket and a file.
                         */
                        for (Bucket crushFile : crushFiles) {
                            String bucketId = crushFile.name();
                            List<String> bucketFiles = crushFile.contents();
                            print(Verbosity.INFO, format("\n Output %s will include %,d input bytes from %,d files",
                                    bucketId, crushFile.size(), bucketFiles.size()));
                            key.set(bucketId);
                            for (String f : bucketFiles) {
                                boolean changed = uncrushedFiles.remove(f);
                                assert changed : f;
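                                /*
                                 * The path pattern is defined elsewhere; matches() is called
                                 * for its side effect of populating the groups, and group(5)
                                 * carries the portion of the path recorded as the value.
                                 */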
                                pathMatcher.reset(f);
                                pathMatcher.matches();
                                value.set(pathMatcher.group(5));
                                writer.append(key, value);
                                /*
                                 * Print the input file with four leading spaces.
                                 */
                                print(Verbosity.VERBOSE, "\n    " + f);
                            }
                            jobCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, bucketFiles.size());
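                            /*
                             * Assign the completed bucket to a reducer partition; the full
                             * partition map is written out after the walk.
                             */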
                            partitionBucketer.add(crushFile);
                        }
                    }
                }
                if (!uncrushedFiles.isEmpty()) {
                    print(Verbosity.INFO, "\n\n Skipped " + uncrushedFiles.size() + " files");
                    for (String uncrushed : uncrushedFiles) {
                        print(Verbosity.VERBOSE, "\n " + uncrushed);
                    }
                    jobCounters.incrCounter(MapperCounter.FILES_SKIPPED, uncrushedFiles.size());
                }
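                /*
                 * Anything still in uncrushedFiles was never assigned to a bucket, so record
                 * it as skipped.
                 */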
                skippedFiles.addAll(uncrushedFiles);
            }
        }
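        /*
         * Descend to the next level of the traversal.
         */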
        dirs = nextLevel;
    }
} finally {
    try {
        writer.close();
    } catch (Exception e) {
        LOG.error("Trapped exception during close: " + bucketFiles, e);
    }
}
/*
* Now that we have processed all the directories, write the partition map.
*/
List<Bucket> partitions = partitionBucketer.createBuckets();
assert partitions.size() <= numPartitions;
writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);
IntWritable partNum = new IntWritable();
try {
    for (Bucket partition : partitions) {
        String partitionName = partition.name();
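        /*
         * Partition names end in "-N"; parse off N to recover the partition (reducer)
         * number.
         */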
        partNum.set(Integer.parseInt(partitionName.substring(partitionName.lastIndexOf('-') + 1)));
        for (String bucketId : partition.contents()) {
            key.set(bucketId);
            writer.append(key, partNum);
        }
    }
} finally {
    try {
        writer.close();
    } catch (Exception e) {
        LOG.error("Trapped exception during close: " + partitionMap, e);
    }
}