package ivory.lsh.eval;
import ivory.core.util.CLIRUtils;
import java.io.IOException;
import java.net.URI;
import java.util.Iterator;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import edu.umd.cloud9.collection.wikipedia.WikipediaPage;
import edu.umd.cloud9.io.array.ArrayListOfIntsWritable;
import edu.umd.cloud9.io.map.HMapIIW;
import edu.umd.cloud9.io.pair.PairOfIntString;
import edu.umd.cloud9.io.pair.PairOfInts;
import edu.umd.cloud9.util.map.HMapIV;
/**
 * Maps pairwise-similarity (pwsim) output, which pairs documents by docno, back to
 * human-readable Wikipedia page titles: each aligned (fdocno, edocno) pair is emitted
 * as a (foreign title, English title) line. Optionally restricts output to a sampled
 * set of docnos.
 *
 * @author ferhanture
 */
public class Docnos2Titles extends Configured implements Tool {
// Class-wide log4j logger; also used by the nested MyMapper/MyReducer classes.
private static final Logger sLogger = Logger.getLogger(Docnos2Titles.class);
// Commons-CLI option set; populated in parseArgs() and read by printUsage().
private static Options options;
/**
 * Prints the formatted option help for this tool and terminates the JVM
 * with a failure exit code. Never returns.
 */
private static void printUsage() {
  new HelpFormatter().printHelp( "Docnos2Titles", options );
  System.exit(-1);
}
// Hadoop counter names. Not referenced anywhere in this file — presumably leftover
// from earlier debugging/bookkeeping; kept for compatibility with old job histories.
static enum Pairs {
COUNT, COUNT2, COUNT3, COUNTE, COUNTF, COUNT4, COUNT3x;
}
/**
 * Candidate generation.
 *
 * Map: (edocno, eWikiPage) --> (&lt;fdocno, edocno&gt;, &lt;E, eTitle&gt;)
 * Map: (fdocno, fWikiPage) --> (&lt;fdocno, edocno&gt;, &lt;F, fTitle&gt;)
 *
 * Input is the union of the source (foreign, F) and target (English, E) collections;
 * the side of each record is decided by comparing the page language to the "fLang"
 * job property. For each input page, one output pair is emitted per similar document
 * listed in the pwsim mapping, keyed so both titles of a pair meet in the reducer.
 *
 * @author ferhanture
 */
private static class MyMapper extends MapReduceBase implements
Mapper<IntWritable, WikipediaPage, PairOfInts, PairOfIntString> {
private HMapIV<ArrayListOfIntsWritable> pwsimMapping; // mapping for pwsim pairs; lazily filled on the first map() call
private JobConf mJob; // saved in configure() so map() can lazily call loadPairs()
private ArrayListOfIntsWritable similarDocnos; // docnos similar to the current input doc (borrowed from pwsimMapping)
private String srcLang; // two-letter code of the source/foreign language ("fLang" job property)
private PairOfIntString valOut; // reusable output value: (language ID, page title)
private PairOfInts keyOut; // reusable output key: (fdocno, edocno)
private HMapIIW samplesMap = null; // optional filter: docno -> 1 for sampled docs; null when no sample file given
/**
 * Reads job configuration and, when "Ivory.SampleFile" is set, loads the sampled
 * docnos from the distributed cache. The pwsim mapping itself is NOT loaded here —
 * it is loaded lazily in map() because the language ID is only known then.
 */
public void configure(JobConf job) {
sLogger.setLevel(Level.INFO);
srcLang = job.get("fLang");
mJob = job;
pwsimMapping = new HMapIV<ArrayListOfIntsWritable>();
valOut = new PairOfIntString();
keyOut = new PairOfInts();
// read doc ids of sample into vectors
String samplesFile = job.get("Ivory.SampleFile");
if (samplesFile != null) {
try {
samplesMap = readSamplesFromCache(getFilename(samplesFile), job);
} catch (NumberFormatException e) {
e.printStackTrace();
throw new RuntimeException("Incorrect format in " + samplesFile);
} catch (IOException e) {
e.printStackTrace();
throw new RuntimeException("I/O error in " + samplesFile);
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException("Error reading sample file: " + samplesFile);
}
}
}
// Strips the directory part of a path so it can be matched against local cache file names.
private static String getFilename(String s) {
return s.substring(s.lastIndexOf("/") + 1);
}
/**
 * Loads the pwsim (docno, docno) pairs from the distributed-cache SequenceFile named
 * by the "PwsimPairs" job property into {@code pwsimMapping}. The map direction
 * depends on {@code langID}: for E-side mappers it maps edocno -> [fdocno...],
 * for F-side mappers fdocno -> [edocno...]. Any failure is rethrown unchecked,
 * failing the task.
 */
private static void loadPairs(HMapIV<ArrayListOfIntsWritable> pwsimMapping, int langID, JobConf job, Reporter reporter){
try {
Path[] localFiles = DistributedCache.getLocalCacheFiles(job);
String pwsimFile = job.get("PwsimPairs");
for (Path localFile : localFiles) {
if (localFile.toString().contains(getFilename(pwsimFile))) {
SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.getLocal(job), localFile, job);
PairOfInts key = (PairOfInts) reader.getKeyClass().newInstance();
IntWritable value = (IntWritable) reader.getValueClass().newInstance();
int cnt = 0;
while (reader.next(key, value)) {
int fDocno = key.getRightElement();
int eDocno = key.getLeftElement();
// NOTE(review): debugging leftover — logs two specific hard-coded docno pairs.
if ((eDocno == 6127 && fDocno == 1000000074) || (eDocno == 6127 && fDocno == 1000000071)) {
sLogger.info(key);
}
if(langID == CLIRUtils.E){
if(!pwsimMapping.containsKey(eDocno)){
pwsimMapping.put(eDocno, new ArrayListOfIntsWritable());
}
pwsimMapping.get(eDocno).add(fDocno); // we add 1000000000 to foreign docnos to distinguish them during pwsim algo
}else{
if(!pwsimMapping.containsKey(fDocno)){
pwsimMapping.put(fDocno, new ArrayListOfIntsWritable());
}
pwsimMapping.get(fDocno).add(eDocno); // we add 1000000000 to foreign docnos to distinguish them during pwsim algo
}
cnt++;
// fresh key/value objects each iteration so entries already stored are not clobbered
key = (PairOfInts) reader.getKeyClass().newInstance();
value = (IntWritable) reader.getValueClass().newInstance();
}
reader.close();
sLogger.info(pwsimMapping.size() + "," + cnt + " pairs loaded from " + localFile);
}
}
} catch (Exception e) {
throw new RuntimeException(e);
}
}
/**
 * Reads the sampled docnos (one integer per line) from the distributed-cache file
 * whose local name contains {@code samplesFile}, returning a docno -> 1 map.
 *
 * @throws IOException on read errors
 * @throws NumberFormatException if a line is not an integer
 * @throws RuntimeException if no matching file is found in the local cache
 */
private HMapIIW readSamplesFromCache(String samplesFile, JobConf conf) throws IOException {
Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
HMapIIW samplesMap = null;
for (Path localFile : localFiles) {
if (localFile.toString().contains(samplesFile)) {
samplesMap = new HMapIIW();
LineReader reader = new LineReader(FileSystem.getLocal(conf).open(localFile));
Text t = new Text();
while (reader.readLine(t) != 0) {
int docno = Integer.parseInt(t.toString());
sLogger.info(docno + " --> sample");
samplesMap.put(docno, 1);
}
reader.close();
sLogger.info(samplesMap.size() + " sampled");
}
}
if (samplesMap == null) throw new RuntimeException("Not found in local cache: " + samplesFile);
return samplesMap;
}
/**
 * Emits (&lt;fdocno, edocno&gt;, &lt;langID, title&gt;) for every document similar to the
 * input page. Foreign docnos are shifted by 1,000,000,000 to match the convention
 * used by the pwsim algorithm. When a sample map is present, non-sampled pairs are
 * skipped (F-side checked here, E-side checked per similar docno below).
 */
public void map(IntWritable docnoKey, WikipediaPage p, OutputCollector<PairOfInts, PairOfIntString> output, Reporter reporter) throws IOException {
int docno = docnoKey.get();
String title = p.getTitle();
String lang = p.getLanguage();
int langID = lang.equals(srcLang) ? CLIRUtils.F : CLIRUtils.E;
if (langID == CLIRUtils.F ) {
docno += 1000000000;
if (samplesMap != null && !samplesMap.containsKey(docno)) {
return;
}
}
// we only load the mapping once, during the first map() call of a mapper.
// this works b/c all input kv pairs of a given mapper will have same lang id (reason explained above)
if (pwsimMapping.isEmpty()) {
loadPairs(pwsimMapping, langID, mJob, reporter);
sLogger.info("Mapping loaded: " + pwsimMapping.size());
}
// if no similar docs for docno, return
if (pwsimMapping.containsKey(docno)) {
similarDocnos = pwsimMapping.get(docno);
}else{
return;
}
for (int similarDocno : similarDocnos) {
if (langID == CLIRUtils.E) {
if (samplesMap != null && !samplesMap.containsKey(similarDocno)) {
continue;
}
keyOut.set(similarDocno, docno);
}else {
keyOut.set(docno, similarDocno);
}
valOut.set(langID, title);
output.collect(keyOut, valOut);
}
}
}
/**
 * Joins the two titles emitted for each aligned (fdocno, edocno) key and writes
 * them out as a (foreign title, English title) record. A pair is emitted only
 * when BOTH the E-side and the F-side title were seen for the key; incomplete
 * keys are logged and dropped.
 */
private static class MyReducer extends MapReduceBase implements
    Reducer<PairOfInts, PairOfIntString, Text, Text> {
  private Text fTitle, eTitle;  // reusable output buffers

  public void configure(JobConf job) {
    fTitle = new Text();
    eTitle = new Text();
  }

  @Override
  public void reduce(PairOfInts docnoPair, Iterator<PairOfIntString> titles,
      OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
    eTitle.clear();
    fTitle.clear();
    sLogger.info(docnoPair);
    // Track each side separately. The previous implementation counted values and
    // tested (cnt == 2), which would also emit a pair when the SAME language ID
    // arrived twice and the other side was missing — producing an empty title.
    boolean hasE = false;
    boolean hasF = false;
    while (titles.hasNext()) {
      PairOfIntString title = titles.next();
      sLogger.info(title);
      if (title.getLeftElement() == CLIRUtils.E) {
        eTitle.set(title.getRightElement());
        hasE = true;
      } else if (title.getLeftElement() == CLIRUtils.F) {
        fTitle.set(title.getRightElement());
        hasF = true;
      } else {
        throw new RuntimeException("Unknown language ID: " + title.getLeftElement());
      }
    }
    if (hasE && hasF) {
      output.collect(fTitle, eTitle);
    } else {
      sLogger.info("Incomplete data for " + docnoPair + ":" + fTitle + "," + eTitle);
    }
  }
}
/**
 * Runs this tool: configures and submits the Hadoop job that converts pwsim
 * docno pairs into title pairs.
 *
 * @param args command-line arguments (see {@link #parseArgs(String[])})
 * @return 0 on successful job completion
 * @throws Exception on argument, URI, or job-submission errors
 */
@SuppressWarnings("deprecation")
public int run(String[] args) throws Exception {
  JobConf job = new JobConf(getConf(), Docnos2Titles.class);

  // Read commandline arguments; printUsage() exits the JVM, so cmdline is
  // non-null past this point.
  CommandLine cmdline = parseArgs(args);
  if (cmdline == null) {
    printUsage();
  }
  String eCollectionPath = cmdline.getOptionValue(ECOLLECTION_OPTION);
  String fCollectionPath = cmdline.getOptionValue(FCOLLECTION_OPTION);
  String pwsimOutputPath = cmdline.getOptionValue(PWSIM_OPTION);
  String titlePairsPath = cmdline.getOptionValue(OUTPUT_PATH_OPTION);
  String eLang = cmdline.getOptionValue(ELANG_OPTION);
  String fLang = cmdline.getOptionValue(FLANG_OPTION);
  String samplesFile = cmdline.getOptionValue(SAMPLEDOCNOS_OPTION);  // optional; may be null

  job.setJobName("Docnos2Titles_" + fLang + "-" + eLang);
  FileInputFormat.addInputPaths(job, eCollectionPath);
  FileInputFormat.addInputPaths(job, fCollectionPath);
  FileOutputFormat.setOutputPath(job, new Path(titlePairsPath));
  DistributedCache.addCacheFile(new URI(pwsimOutputPath), job);

  // BUG FIX: the sample-docnos option is optional (see parseArgs), but the cache
  // registration and job.set() were previously unconditional, throwing an NPE
  // when the option was omitted. Only register the file when it was supplied;
  // MyMapper already treats a missing "Ivory.SampleFile" property as "no filter".
  if (samplesFile != null) {
    DistributedCache.addCacheFile(new URI(samplesFile), job);
    job.set("Ivory.SampleFile", samplesFile);
  }

  job.set("eLang", eLang);
  job.set("fLang", fLang);
  job.set("PwsimPairs", pwsimOutputPath);

  job.setInt("mapred.task.timeout", 60000000);
  job.set("mapreduce.map.memory.mb", "3000");
  job.set("mapreduce.map.java.opts", "-Xmx3000m");
  job.setBoolean("mapred.map.tasks.speculative.execution", false);
  job.setBoolean("mapred.reduce.tasks.speculative.execution", false);
  job.setNumMapTasks(100);
  job.setNumReduceTasks(1);
  job.setInt("mapred.min.split.size", 2000000000);
  // Delay reducers until most maps finish: a single reducer would otherwise idle.
  job.setFloat("mapred.reduce.slowstart.completed.maps", 0.9f);

  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(TextOutputFormat.class);
  job.setMapOutputKeyClass(PairOfInts.class);
  job.setMapOutputValueClass(PairOfIntString.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setMapperClass(MyMapper.class);
  job.setReducerClass(MyReducer.class);

  sLogger.info("Running job " + job.getJobName() + "...");
  sLogger.info("E-collection path: " + eCollectionPath);
  sLogger.info("F-collection path: " + fCollectionPath);
  sLogger.info("Pwsim output path: " + pwsimOutputPath);
  sLogger.info("Output path: " + titlePairsPath);
  sLogger.info("Sample file?: " + ((samplesFile != null) ? samplesFile : "none"));

  long startTime = System.currentTimeMillis();
  JobClient.runJob(job);
  System.out.println("Job finished in "
      + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
  return 0;
}
// Command-line option names (see parseArgs for descriptions and arity).
private static final String FCOLLECTION_OPTION = "f_collection"; // source-side (foreign) collection path
private static final String ECOLLECTION_OPTION = "e_collection"; // target-side (English) collection path
private static final String FLANG_OPTION = "f_lang"; // two-letter source-language code
private static final String ELANG_OPTION = "e_lang"; // two-letter target-language code
private static final String PWSIM_OPTION = "pwsim_output"; // output of the pwsim algorithm (docno pairs)
private static final String OUTPUT_PATH_OPTION = "output"; // where to write title pairs
private static final String SAMPLEDOCNOS_OPTION = "docnos"; // optional: restrict to these sampled docnos
private static final String LIBJARS_OPTION = "libjars"; // standard Hadoop pass-through for extra jars
/**
 * Builds this tool's option set (stored in the static {@code options} field so
 * {@code printUsage()} can render it) and parses {@code args} against it.
 *
 * @param args raw command-line arguments
 * @return the parsed command line, or {@code null} after printing the parse error
 */
@SuppressWarnings("static-access")
private CommandLine parseArgs(String[] args) throws Exception {
  options = new Options();
  options.addOption(OptionBuilder.withDescription("path to output of pwsim algorithm")
      .withArgName("path").hasArg().isRequired().create(PWSIM_OPTION));
  options.addOption(OptionBuilder.withDescription("path to output")
      .withArgName("path").hasArg().isRequired().create(OUTPUT_PATH_OPTION));
  options.addOption(OptionBuilder.withDescription("source-side raw collection path")
      .withArgName("path").hasArg().isRequired().create(FCOLLECTION_OPTION));
  options.addOption(OptionBuilder.withDescription("target-side raw collection path")
      .withArgName("path").hasArg().isRequired().create(ECOLLECTION_OPTION));
  options.addOption(OptionBuilder.withDescription("two-letter code for f-language")
      .withArgName("en|de|tr|cs|zh|ar|es").hasArg().isRequired().create(FLANG_OPTION));
  options.addOption(OptionBuilder.withDescription("two-letter code for e-language")
      .withArgName("en|de|tr|cs|zh|ar|es").hasArg().isRequired().create(ELANG_OPTION));
  // Optional filter — note: no isRequired().
  options.addOption(OptionBuilder.withDescription("only keep pairs that match these docnos")
      .withArgName("path to sample docnos file").hasArg().create(SAMPLEDOCNOS_OPTION));
  options.addOption(OptionBuilder.withDescription("Hadoop option to load external jars")
      .withArgName("jar packages").hasArg().create(LIBJARS_OPTION));

  try {
    return new GnuParser().parse(options, args);
  } catch (ParseException e) {
    System.err.println("Error parsing command line: " + e.getMessage());
    return null;
  }
}
/**
 * Dispatches command-line arguments to the tool via the
 * <code>ToolRunner</code> and exits with the tool's return code.
 */
public static void main(String[] args) throws Exception {
  System.exit(ToolRunner.run(new Docnos2Titles(), args));
}
}