/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.behemoth.util;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
import java.util.zip.GZIPInputStream;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.digitalpebble.behemoth.BehemothConfiguration;
import com.digitalpebble.behemoth.BehemothDocument;
/**
* Generates a SequenceFile containing BehemothDocuments from the files in an
* input directory. Each BehemothDocument holds the raw byte content and the
* URL of the source file; MIME-type detection and text extraction can be done
* later using the TikaProcessor.
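* <p>
* Typical invocation from the command line (the jar name below is
* illustrative):
*
* <pre>
* hadoop jar behemoth-core-job.jar com.digitalpebble.behemoth.util.CorpusGenerator \
*     -i /data/input -o /data/corpus -r true -u true -md "source=internet;label=public"
* </pre>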
*/
public class CorpusGenerator extends Configured implements Tool {
private static final Logger log = LoggerFactory
.getLogger(CorpusGenerator.class);
private Path input, output;
private Reporter reporter;
public static final String unpackParamName = "CorpusGenerator-unpack";
public enum Counters {
DOC_COUNT
}
public CorpusGenerator() {
}
public CorpusGenerator(Path input, Path output) {
setInput(input);
setOutput(output);
}
public CorpusGenerator(Path input, Path output, Reporter reporter) {
this.input = input;
this.output = output;
this.reporter = reporter;
}
public void setInput(Path input) {
this.input = input;
}
public void setOutput(Path output) {
this.output = output;
}
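/**
* Walks the input path and appends one BehemothDocument per file (or per
* archive entry when unpacking is enabled) to the output SequenceFile.
*
* @param recurse whether to descend into sub-directories
* @return the number of documents written
*/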
public long generate(boolean recurse) throws IOException {
long result = 0;
// reusable key/value pair written to the SequenceFile for each document
Text key = new Text();
BehemothDocument value = new BehemothDocument();
SequenceFile.Writer writer = null;
try {
Configuration conf = getConf();
FileSystem fs = output.getFileSystem(conf);
writer = SequenceFile.createWriter(fs, conf, output,
key.getClass(), value.getClass());
PerformanceFileFilter pff = new PerformanceFileFilter(writer, key,
value, conf, reporter);
// iterate on the files in the source dir
result = processFiles(conf, input, recurse, pff);
} finally {
IOUtils.closeStream(writer);
}
return result;
}
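/** Command-line entry point, run through the Hadoop ToolRunner. */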
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(BehemothConfiguration.create(),
new CorpusGenerator(), args);
System.exit(res);
}
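/**
* Parses the command-line options, stores the unpack flag and optional
* metadata in the configuration, then generates the corpus and logs the
* timing and document count.
*/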
public int run(String[] args) throws Exception {
Options options = new Options();
// automatically generate the help statement
HelpFormatter formatter = new HelpFormatter();
// create the parser
CommandLineParser parser = new GnuParser();
options.addOption("h", "help", false, "print this message");
options.addOption("i", "input", true, "input file or directory");
options.addOption("o", "output", true, "output Behemoth corpus");
options.addOption("r", "recurse", true,
"processes directories recursively (default true)");
options.addOption("u", "unpack", true,
"unpack content of archives (default true)");
options.addOption(
"md",
"metadata",
true,
"add document metadata separated by semicolon e.g. -md source=internet;label=public");
// parse the command line arguments
CommandLine line = null;
try {
line = parser.parse(options, args);
if (line.hasOption("help")) {
formatter.printHelp("CorpusGenerator", options);
return 0;
}
if (!line.hasOption("i")) {
formatter.printHelp("CorpusGenerator", options);
return -1;
}
if (!line.hasOption("o")) {
formatter.printHelp("CorpusGenerator", options);
return -1;
}
} catch (ParseException e) {
formatter.printHelp("CorpusGenerator", options);
return -1;
}
boolean recurse = true;
if (line.hasOption("r")
&& "false".equalsIgnoreCase(line.getOptionValue("r")))
recurse = false;
boolean unpack = true;
if (line.hasOption("u")
&& "false".equalsIgnoreCase(line.getOptionValue("u")))
unpack = false;
getConf().setBoolean(unpackParamName, unpack);
Path inputDir = new Path(line.getOptionValue("i"));
Path output = new Path(line.getOptionValue("o"));
if (line.hasOption("md")) {
String md = line.getOptionValue("md");
getConf().set("md", md);
}
setInput(inputDir);
setOutput(output);
long start = System.currentTimeMillis();
if (!inputDir.getFileSystem(getConf()).exists(inputDir)) {
log.error("Input does not exist: " + inputDir);
return -1;
}
long count = generate(recurse);
long finish = System.currentTimeMillis();
if (log.isInfoEnabled()) {
log.info("CorpusGenerator completed. Timing: " + (finish - start)
+ " ms");
}
log.info(count + " docs converted");
return 0;
}
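/**
* Lists the input path with the side-effecting filter: regular files are
* written to the SequenceFile inside PerformanceFileFilter.accept(), and
* only directories are returned, which are then recursed into.
*/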
private static long processFiles(Configuration conf, Path input,
boolean recurse, PerformanceFileFilter pff) throws IOException {
FileSystem fs = input.getFileSystem(conf);
FileStatus[] statuses = fs.listStatus(input, pff);
for (FileStatus status : statuses) {
if (recurse) {
processFiles(conf, status.getPath(), recurse, pff);
}
}
return pff.counter;
}
// Moves the work of processing files into the PathFilter itself, so that
// large directories can be processed without materialising a huge list of
// FileStatus objects.
static class PerformanceFileFilter implements PathFilter {
long counter = 0;
PathFilter defaultIgnores = new PathFilter() {
public boolean accept(Path file) {
// ignore hidden files and directories
return !file.getName().startsWith(".");
}
};
private SequenceFile.Writer writer;
private Text key;
private BehemothDocument value;
private Configuration conf;
private Reporter reporter;
public PerformanceFileFilter(SequenceFile.Writer writer, Text key,
BehemothDocument value, Configuration conf, Reporter reporter) {
this.writer = writer;
this.key = key;
this.value = value;
this.conf = conf;
this.reporter = reporter;
// add the user-specified metadata to every document
String md = conf.get("md", "");
if (!md.isEmpty()) {
for (String metadata : md.split(";")) {
String[] keyval = metadata.split("=");
Writable mdkey = new Text(keyval[0]);
Writable mdvalue;
// a key without '=value' is stored with a null value
if (keyval.length == 1) {
mdvalue = NullWritable.get();
} else {
mdvalue = new Text(keyval[1]);
}
log.info("metadata key: " + keyval[0] + "\tvalue: " + mdvalue);
value.getMetadata(true).put(mdkey, mdvalue);
}
}
}
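/**
* Called for every path returned by FileSystem.listStatus(): writes
* regular files (or each entry of a recognised archive when unpacking is
* enabled) to the SequenceFile, and returns true only for directories so
* that they can be recursed into.
*/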
public boolean accept(Path file) {
try {
FileSystem fs = file.getFileSystem(conf);
boolean unpack = conf.getBoolean(unpackParamName, true);
if (defaultIgnores.accept(file)
&& !fs.getFileStatus(file).isDir()) {
String URI = file.toUri().toString();
String uri = URI.toLowerCase(Locale.ENGLISH);
int processed = 0;
// detect whether the file is likely to be an archive based
// on its extension
if (unpack) {
if (uri.endsWith(".cpio") || uri.endsWith(".jar")
|| uri.endsWith(".dump") || uri.endsWith(".ar")
|| uri.endsWith(".tar") || uri.endsWith(".zip")
|| uri.endsWith(".tar.gz")
|| uri.endsWith(".tgz")
|| uri.endsWith(".tbz2")
|| uri.endsWith(".tbz")
|| uri.endsWith(".tar.bzip2")) {
InputStream fis = null;
try {
fis = fs.open(file);
if (uri.endsWith(".gz") || uri.endsWith(".tgz")) {
fis = new GZIPInputStream(fis);
} else if (uri.endsWith(".tbz")
|| uri.endsWith(".tbz2")
|| uri.endsWith(".bzip2")) {
fis = new BZip2CompressorInputStream(fis);
}
ArchiveInputStream input = new ArchiveStreamFactory()
.createArchiveInputStream(new BufferedInputStream(
fis));
ArchiveEntry entry = null;
while ((entry = input.getNextEntry()) != null) {
if (entry.isDirectory()) {
continue;
}
String name = entry.getName();
// the declared entry size may be unknown (-1) and a
// single read() is not guaranteed to fill the buffer,
// so copy the entry to a byte array explicitly
ByteArrayOutputStream baos = new ByteArrayOutputStream();
byte[] buffer = new byte[8192];
int read;
while ((read = input.read(buffer)) != -1) {
baos.write(buffer, 0, read);
}
byte[] content = baos.toByteArray();
// identify archive entries with the jar-URL style '!'
// separator in both the key and the document URL
key.set(URI + "!" + name);
value.setUrl(URI + "!" + name);
value.setContent(content);
writer.append(key, value);
processed++;
counter++;
if (reporter != null) {
reporter.incrCounter(
Counters.DOC_COUNT, 1);
}
}
} catch (Throwable t) {
if (processed == 0) {
log.warn("Error unpacking archive: " + file
+ ", adding as a regular file: "
+ t.toString());
} else {
log.warn("Error unpacking archive: "
+ file
+ ", processed "
+ processed
+ " entries, skipping remaining entries: "
+ t.toString());
}
} finally {
IOUtils.closeStream(fis);
}
}
}
if (processed == 0) { // not processed as an archive
// note: the whole file is read into memory and its length
// is cast to int, so files of 2GB or more are not supported
byte[] fileBArray = new byte[(int) fs.getFileStatus(
file).getLen()];
try {
FSDataInputStream fis = fs.open(file);
try {
fis.readFully(0, fileBArray);
} finally {
IOUtils.closeStream(fis);
}
key.set(URI);
// fill the values for the content object
value.setUrl(URI);
value.setContent(fileBArray);
writer.append(key, value);
counter++;
if (reporter != null) {
reporter.incrCounter(Counters.DOC_COUNT, 1);
}
} catch (FileNotFoundException e) {
log.warn("File not found " + file + ", skipping: "
+ e);
} catch (IOException e) {
log.warn("IO error reading file " + file
+ ", skipping: " + e);
}
}
}
// accept directories so that listStatus() returns them and the
// caller can recurse into them; regular files have already been
// handled above, so their return value does not matter
return fs.getFileStatus(file).isDir();
} catch (IOException e) {
log.error("Exception", e);
}
return false;
}
}
}