package ivory.core.preprocess;
import ivory.core.Constants;
import ivory.core.RetrievalEnvironment;
import ivory.core.data.dictionary.DefaultFrequencySortedDictionary;
import ivory.core.data.document.TermDocVector;
import ivory.core.data.stat.DfTableArray;
import ivory.core.data.stat.DocLengthTable;
import ivory.core.data.stat.DocLengthTable4B;
import ivory.core.tokenize.OpenNLPTokenizer;
import ivory.core.tokenize.Tokenizer;
import ivory.core.tokenize.TokenizerFactory;
import ivory.core.util.CLIRUtils;
import ivory.pwsim.score.ScoringModel;
import java.io.IOException;
import java.net.URI;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import com.google.common.collect.Maps;
import edu.umd.cloud9.io.map.HMapIFW;
import edu.umd.cloud9.io.map.HMapSFW;
import edu.umd.cloud9.mapred.NullInputFormat;
import edu.umd.cloud9.mapred.NullMapper;
import edu.umd.cloud9.mapred.NullOutputFormat;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.util.map.MapIF;
import edu.umd.hooka.Vocab;
import edu.umd.hooka.alignment.HadoopAlign;
import edu.umd.hooka.ttables.TTable_monolithic_IFAs;
/**
 * Translates term-doc vectors in a foreign language (e.g., German) into the target language
 * (e.g., English) using the CLIR technique discussed in Jianqiang Wang and Douglas W. Oard,
 * "Combining Bidirectional Translation and Synonymy for Cross-Language Information
 * Retrieval", SIGIR 2006.
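 * <p>
 * Roughly, each source-language term's tf mass is distributed over its candidate
 * translations (a sketch; the exact combination of the bidirectional translation tables
 * is implemented in {@code CLIRUtils.translateTFs}):
 * <pre>
 *   tf(e | d_f) = sum over f in d_f of tf(f, d_f) * p(e | f)
 * </pre>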
* @author ferhanture
*
*/
@SuppressWarnings("deprecation")
public class BuildTranslatedTermDocVectors extends PowerTool {
private static final Logger LOG = Logger.getLogger(BuildTranslatedTermDocVectors.class);
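// For generating a sample of the collection: the mapper emits only docnos divisible by
// SAMPLING, so SAMPLING = 1 keeps every document.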
private static int SAMPLING = 1;
protected static enum Docs { DBG, ZERO, SHORT, Total };
protected static enum DF { TransDf, NoDf }
private static class MyMapperTrans extends MapReduceBase implements
Mapper<IntWritable, TermDocVector, IntWritable, HMapSFW> {
private ScoringModel model;
// eVocabSrc is the English vocabulary for probability table e2f_Probs.
// engVocabTrgis the English vocabulary for probability table f2e_Probs.
// fVocabSrc is the German vocabulary for probability table f2e_Probs.
// fVocabTrg is the German vocabulary for probability table e2f_Probs.
static Vocab eVocabSrc, fVocabSrc, fVocabTrg, eVocabTrg;
static TTable_monolithic_IFAs f2e_Probs, e2f_Probs;
private Tokenizer tokenizer;
static float avgDocLen;
static int numDocs;
static boolean isNormalize;
private String language;
// Minimum document size (set via the JobConf): used to avoid noise in Wikipedia from
// stubs, very short articles, etc.
int MIN_SIZE = 0;
DefaultFrequencySortedDictionary dict;
DfTableArray dfTable;
public void configure(JobConf conf) {
String termsFile, termidsFile, idToTermFile, dfFile;
numDocs = conf.getInt("Ivory.CollectionDocumentCount", -1);
avgDocLen = conf.getFloat("Ivory.AvgDocLen", -1);
isNormalize = conf.getBoolean("Ivory.Normalize", false);
language = conf.get("Ivory.Lang");
LOG.debug(numDocs + " " + avgDocLen);
MIN_SIZE = conf.getInt("Ivory.MinNumTerms", 0);
try {
if (conf.get("mapred.job.tracker").equals("local")) {
// Local mode is explicitly not supported.
throw new RuntimeException("Local mode not supported!");
}
FileSystem remoteFS = FileSystem.get(conf);
RetrievalEnvironment targetEnv = new RetrievalEnvironment(conf.get(Constants.TargetIndexPath), remoteFS);
termsFile = targetEnv.getIndexTermsData();
termidsFile = targetEnv.getIndexTermIdsData();
idToTermFile = targetEnv.getIndexTermIdMappingData();
dfFile = targetEnv.getDfByIntData();
FileSystem fs = FileSystem.getLocal(conf);
Map<String, Path> pathMapping = Maps.newHashMap();
// We need to figure out which file in the DistributedCache is which...
Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
for (Path p : localFiles) {
LOG.info("In DistributedCache: " + p);
if (p.toString().contains(termsFile)) {
pathMapping.put(termsFile, p);
} else if (p.toString().contains(termidsFile)) {
pathMapping.put(termidsFile, p);
} else if (p.toString().contains(idToTermFile)) {
pathMapping.put(idToTermFile, p);
} else if (p.toString().contains(dfFile)) {
pathMapping.put(dfFile, p);
} else if (p.toString().contains(conf.get("Ivory.E_Vocab_F2E"))) {
pathMapping.put("Ivory.E_Vocab_F2E", p);
LOG.info("Ivory.E_Vocab_F2E -> " + p);
} else if (p.toString().contains(conf.get("Ivory.F_Vocab_F2E"))) {
pathMapping.put("Ivory.F_Vocab_F2E", p);
LOG.info("Ivory.F_Vocab_F2E -> " + p);
} else if (p.toString().contains(conf.get("Ivory.TTable_F2E"))) {
pathMapping.put("Ivory.TTable_F2E", p);
LOG.info("Ivory.TTable_F2E -> " + p);
} else if (p.toString().contains(conf.get("Ivory.E_Vocab_E2F"))) {
pathMapping.put("Ivory.E_Vocab_E2F", p);
LOG.info("Ivory.E_Vocab_E2F -> " + p);
} else if (p.toString().contains(conf.get("Ivory.F_Vocab_E2F"))) {
pathMapping.put("Ivory.F_Vocab_E2F", p);
LOG.info("Ivory.F_Vocab_E2Ff -> " + p);
} else if (p.toString().contains(conf.get("Ivory.TTable_E2F"))) {
pathMapping.put("Ivory.TTable_E2F", p);
LOG.info("Ivory.TTable_E2F -> " + p);
} else if (p.toString().contains(conf.get(Constants.TargetStopwordList))) {
pathMapping.put(Constants.TargetStopwordList, p);
LOG.info(Constants.TargetStopwordList + " -> " + p);
} else if (p.toString().contains(conf.get(Constants.TargetTokenizer))) {
pathMapping.put(Constants.TargetTokenizer, p);
LOG.info(Constants.TargetTokenizer + " -> " + p);
}
}
// transDfTable = CLIRUtils.readTransDfTable(pathMapping.get("transDf"), fs);
LOG.info(" - terms: " + pathMapping.get(termsFile));
LOG.info(" - id: " + pathMapping.get(termidsFile));
LOG.info(" - idToTerms: " + pathMapping.get(idToTermFile));
LOG.info(" - df data: " + pathMapping.get(dfFile));
try{
dict = new DefaultFrequencySortedDictionary(pathMapping.get(termsFile),
pathMapping.get(termidsFile), pathMapping.get(idToTermFile), FileSystem.getLocal(conf));
dfTable = new DfTableArray(pathMapping.get(dfFile), FileSystem.getLocal(conf));
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException("Error loading dictionary or df table from the local cache.");
}
eVocabTrg = HadoopAlign.loadVocab(pathMapping.get("Ivory.E_Vocab_F2E"), fs);
fVocabSrc = HadoopAlign.loadVocab(pathMapping.get("Ivory.F_Vocab_F2E"), fs);
f2e_Probs = new TTable_monolithic_IFAs(fs, pathMapping.get("Ivory.TTable_F2E"), true);
eVocabSrc = HadoopAlign.loadVocab(pathMapping.get("Ivory.E_Vocab_E2F"), fs);
fVocabTrg = HadoopAlign.loadVocab(pathMapping.get("Ivory.F_Vocab_E2F"), fs);
e2f_Probs = new TTable_monolithic_IFAs(fs, pathMapping.get("Ivory.TTable_E2F"), true);
// Used just for stopword removal in translateTFs.
tokenizer = TokenizerFactory.createTokenizer(fs, conf.get(Constants.TargetLanguage),
pathMapping.get(Constants.TargetTokenizer).toString(), false,
pathMapping.get(Constants.TargetStopwordList).toString(), null, null);
} catch (IOException e) {
throw new RuntimeException("Local cache files not read properly.");
}
try {
model = (ScoringModel) Class.forName(conf.get("Ivory.ScoringModel")).newInstance();
} catch (Exception e) {
throw new RuntimeException("Error initializing Ivory.ScoringModel!");
}
// These only need to be set once for the entire collection.
model.setDocCount(numDocs);
model.setAvgDocLength(avgDocLen);
if (conf.get("debug") != null) {
LOG.setLevel(Level.DEBUG);
}
LOG.info("# docs in collection = "+numDocs);
LOG.info("avg doc len = "+avgDocLen);
LOG.info("---------");
}
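/**
 * Translates a single (docno, source-language term-doc vector) pair into a weighted
 * term vector in the target-language term space, dropping documents that are too short
 * or that yield an empty translation.
 */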
public void map(IntWritable docno, TermDocVector doc,
OutputCollector<IntWritable, HMapSFW> output, Reporter reporter) throws IOException {
if (docno.get() % SAMPLING != 0) {
return; // For generating sample document vectors; no sampling when SAMPLING == 1.
}
if (!language.equals("english") && !language.equals("en")) {
// Shift docnos to distinguish between the two collections in the PWSim sliding-window
// algorithm.
docno.set(docno.get() + 1000000000);
}
// Translate doc vector.
TermDocVector.Reader reader = doc.getReader();
int numTerms = reader.getNumberOfTerms();
if (numTerms < MIN_SIZE) {
reporter.incrCounter(Docs.SHORT, 1);
return;
}
HMapIFW tfS = new HMapIFW();
// We simply use the source-language doc length since the ratio of doc length to average doc
// length is unlikely to change significantly (not worth complicating the pipeline)
int docLen = CLIRUtils.translateTFs(doc, tfS, eVocabSrc, eVocabTrg, fVocabSrc, fVocabTrg,
e2f_Probs, f2e_Probs, tokenizer, LOG);
// HMapSFW v = CLIRUtils.createTermDocVector(docLen, tfS, eVocabSrc, model, dict, dfTable,
// isNormalize, LOG);
HMapSFW v = CLIRUtils.createTermDocVector(docLen, tfS, eVocabTrg, model, dict, dfTable,
isNormalize, LOG);
// If no translation of any word is in the target vocab, drop the document (i.e., our
// model wasn't capable of translating it).
if (v.isEmpty()) {
reporter.incrCounter(Docs.ZERO, 1);
} else {
reporter.incrCounter(Docs.Total, 1);
output.collect(docno, v);
}
}
}
public BuildTranslatedTermDocVectors(Configuration conf) {
super(conf);
}
public static final String[] RequiredParameters = { Constants.IndexPath, Constants.TargetIndexPath, "Ivory.ScoringModel" };
public String[] getRequiredParameters() {
return RequiredParameters;
}
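/**
 * Configures and runs the translation job: reads source-language term-doc vectors from
 * the index, translates and weights them, and writes {@code HMapSFW} vectors to the
 * output directory. Returns 0 on success, including the no-op case where the output
 * already exists.
 */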
@Override
public int runTool() throws Exception {
String indexPath = getConf().get(Constants.IndexPath);
String scoringModel = getConf().get("Ivory.ScoringModel");
RetrievalEnvironment env = new RetrievalEnvironment(indexPath, FileSystem.get(getConf()));
String outputPath = env.getWeightedTermDocVectorsDirectory();
// String transDfFile = indexPath + "/transDf.dat";
String fVocab_f2e = getConf().get("Ivory.F_Vocab_F2E"); // fVocab from P(e|f)
String eVocab_f2e = getConf().get("Ivory.E_Vocab_F2E"); // eVocab from P(e|f)
String ttable_f2e = getConf().get("Ivory.TTable_F2E"); // P(e|f)
String eVocab_e2f = getConf().get("Ivory.E_Vocab_E2F"); // eVocab from P(f|e)
String fVocab_e2f = getConf().get("Ivory.F_Vocab_E2F"); // fVocab from P(f|e)
String ttable_e2f = getConf().get("Ivory.TTable_E2F"); // P(f|e)
String eStopwords = getConf().get(Constants.TargetStopwordList);
String eTokenizerModel = getConf().get(Constants.TargetTokenizer);
// createTranslatedDFFile(transDfFile);
String targetIndexPath = getConf().get(Constants.TargetIndexPath);
RetrievalEnvironment targetEnv = new RetrievalEnvironment(targetIndexPath, FileSystem.get(getConf()));
String termsFilePath = targetEnv.getIndexTermsData();
String termsIdsFilePath = targetEnv.getIndexTermIdsData();
String termIdMappingFilePath = targetEnv.getIndexTermIdMappingData();
String dfByIntFilePath = targetEnv.getDfByIntData();
JobConf conf = new JobConf(getConf(), BuildTranslatedTermDocVectors.class);
conf.setJobName("BuildTranslatedTermDocVectors");
FileSystem fs = FileSystem.get(conf);
if (fs.exists(new Path(outputPath))) {
LOG.info(outputPath + ": Translated term doc vectors already exist! Nothing to do for this job...");
return 0;
}
String collectionName = getConf().get("Ivory.CollectionName");
String inputPath = env.getTermDocVectorsDirectory();
LOG.info("Preparing to build document vectors using " + scoringModel);
LOG.info("Document vectors to be stored in " + outputPath);
LOG.info("CollectionName: " + collectionName);
LOG.info("Input path: " + inputPath);
DocLengthTable mDLTable;
try {
mDLTable = new DocLengthTable4B(env.getDoclengthsData(), fs);
} catch (IOException e1) {
throw new RuntimeException("Error initializing Doclengths file");
}
LOG.info("Average source-language document length: " + mDLTable.getAvgDocLength());
LOG.info("Number of target-language docs: " + targetEnv.readCollectionDocumentCount()
+ ". We use the target-side df table, so the scoring model's doc count is set to this value.");
/////// Configuration setup
conf.set(Constants.IndexPath, indexPath);
conf.set("Ivory.ScoringModel", scoringModel);
conf.setFloat("Ivory.AvgDocLen", mDLTable.getAvgDocLength());
conf.setInt(Constants.CollectionDocumentCount, targetEnv.readCollectionDocumentCount());
conf.set(Constants.Language, getConf().get("Ivory.Lang"));
conf.set("Ivory.Normalize", getConf().get("Ivory.Normalize"));
conf.set("Ivory.MinNumTerms", getConf().get("Ivory.MinNumTerms"));
conf.setNumMapTasks(300);
conf.setNumReduceTasks(0);
conf.set("mapred.child.java.opts", "-Xmx2048m");
conf.setInt("mapred.map.max.attempts", 10);
conf.setInt("mapred.reduce.max.attempts", 10);
conf.setInt("mapred.task.timeout", 6000000);
//////// Cache files
DistributedCache.addCacheFile(new URI(termsFilePath), conf);
DistributedCache.addCacheFile(new URI(termsIdsFilePath), conf);
DistributedCache.addCacheFile(new URI(termIdMappingFilePath), conf);
DistributedCache.addCacheFile(new URI(dfByIntFilePath), conf);
DistributedCache.addCacheFile(new URI(eVocab_f2e), conf);
DistributedCache.addCacheFile(new URI(fVocab_f2e), conf);
DistributedCache.addCacheFile(new URI(ttable_f2e), conf);
DistributedCache.addCacheFile(new URI(eVocab_e2f), conf);
DistributedCache.addCacheFile(new URI(fVocab_e2f), conf);
DistributedCache.addCacheFile(new URI(ttable_e2f), conf);
DistributedCache.addCacheFile(new URI(eStopwords), conf);
DistributedCache.addCacheFile(new URI(eTokenizerModel), conf);
FileInputFormat.setInputPaths(conf, new Path(inputPath));
FileOutputFormat.setOutputPath(conf, new Path(outputPath));
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setMapOutputKeyClass(IntWritable.class);
conf.setMapOutputValueClass(HMapSFW.class);
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(HMapSFW.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.setMapperClass(MyMapperTrans.class);
long startTime = System.currentTimeMillis();
JobClient.runJob(conf);
LOG.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
return 0;
}
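/**
 * Builds a translated df table by running {@link DataWriterMapper} as a single-mapper,
 * no-input job. Note that the call site in {@link #runTool()} is currently commented out.
 */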
private void createTranslatedDFFile(String transDfFile) {
try {
JobConf conf2 = new JobConf(getConf(), BuildTranslatedTermDocVectors.class);
conf2.setJobName("BuildTranslatedDfTable");
FileSystem fs2 = FileSystem.get(conf2);
if (fs2.exists(new Path(transDfFile))) {
LOG.info("Translated Df file already exists! Nothing to do for this job...");
} else {
LOG.info("Creating translated Df file ...");
conf2.set("mapred.child.java.opts", "-Xmx2048m");
conf2.setInt("mapred.map.max.attempts", 10);
conf2.setInt("mapred.reduce.max.attempts", 10);
conf2.setInt("mapred.task.timeout", 6000000);
conf2.set("TransDfFile", transDfFile);
conf2.setSpeculativeExecution(false);
conf2.setNumMapTasks(1);
conf2.setNumReduceTasks(0);
conf2.setInputFormat(NullInputFormat.class);
conf2.setOutputFormat(NullOutputFormat.class);
conf2.setMapperClass(DataWriterMapper.class);
JobClient.runJob(conf2);
LOG.info("Translating DF table done.");
}
} catch (IOException e) {
e.printStackTrace();
}
}
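/**
 * Translates the df table using the E2F translation table and writes the result as a
 * SequenceFile of (termid, df) pairs; runs as a single {@link NullMapper} task.
 */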
private static class DataWriterMapper extends NullMapper {
public void run(JobConf conf, Reporter reporter) throws IOException {
Logger sLogger = Logger.getLogger(DataWriterMapper.class);
sLogger.setLevel(Level.DEBUG);
String indexPath = conf.get(Constants.IndexPath);
FileSystem fs2 = FileSystem.get(conf);
RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs2);
String transDfFile = conf.get("TransDfFile");
String eFile = conf.get("Ivory.E_Vocab_E2F");
String fFile = conf.get("Ivory.F_Vocab_E2F");
String e2fttableFile = conf.get("Ivory.TTable_E2F");
String termsFile = env.getIndexTermsData();
String dfByIntFile = env.getDfByIntData();
if (!fs2.exists(new Path(fFile)) || !fs2.exists(new Path(eFile))
|| !fs2.exists(new Path(e2fttableFile)) || !fs2.exists(new Path(termsFile))
|| !fs2.exists(new Path(dfByIntFile))) {
throw new RuntimeException("Error: Translation files do not exist!");
}
Vocab eVocab_e2f = null, fVocab_e2f = null;
TTable_monolithic_IFAs en2DeProbs = null;
try {
eVocab_e2f = HadoopAlign.loadVocab(new Path(eFile), conf);
fVocab_e2f = HadoopAlign.loadVocab(new Path(fFile), conf);
en2DeProbs = new TTable_monolithic_IFAs(fs2, new Path(e2fttableFile), true);
} catch (IOException e) {
e.printStackTrace();
}
DefaultFrequencySortedDictionary dict = new DefaultFrequencySortedDictionary(
new Path(env.getIndexTermsData()), new Path(env.getIndexTermIdsData()),
new Path(env.getIndexTermIdMappingData()), fs2);
DfTableArray dfTable = new DfTableArray(new Path(dfByIntFile), fs2);
HMapIFW transDfTable = CLIRUtils.translateDFTable(eVocab_e2f, fVocab_e2f, en2DeProbs, dict, dfTable);
SequenceFile.Writer writer = SequenceFile.createWriter(fs2, conf, new Path(transDfFile), IntWritable.class, FloatWritable.class);
for (MapIF.Entry term : transDfTable.entrySet()) {
reporter.incrCounter(DF.TransDf, 1);
writer.append(new IntWritable(term.getKey()), new FloatWritable(term.getValue()));
}
writer.close();
}
}
}