// Source code of ivory.pwsim.PCP

package ivory.pwsim;


import ivory.core.RetrievalEnvironment;
import ivory.core.data.index.Posting;
import ivory.core.data.index.PostingsList;
import ivory.core.data.index.PostingsReader;
import ivory.core.data.stat.DocLengthTable;
import ivory.core.data.stat.DocLengthTable2B;
import ivory.pwsim.score.ScoringModel;


import java.io.IOException;
import java.util.Iterator;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;




import edu.umd.cloud9.io.map.HMapIFW;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.util.map.MapIF;


/**
 * <p>
 * Computing pairwise document similarity given a document-sorted inverted
 * index. This implementation is based on the algorithms described in the
 * following papers:
 * </p>
 * 
 * <ul>
 * 
 * <li>Tamer Elsayed, Jimmy Lin, and Douglas Oard. <b><a
 * href="http://www.aclweb.org/anthology/P/P08/P08-2067.pdf">Pairwise Document
 * Similarity in Large Collections with MapReduce.</a></b> Proceedings of the
 * 46th Annual Meeting of the Association for Computational Linguistics (ACL
 * 2008), Companion Volume, pages 265-268, June 2008, Columbus, Ohio.
 * 
 * <li>Jimmy Lin. <b><a
 * href="http://portal.acm.org/citation.cfm?id=1571941.1571970">Brute Force and
 * Indexed Approaches to Pairwise Document Similarity Comparisons with
 * MapReduce.</a></b> Proceedings of the 32nd Annual International ACM SIGIR
 * Conference on Research and Development in Information Retrieval (SIGIR 2009),
 * pages 155-162, July 2009, Boston, Massachusetts.
 * 
 * </ul>
 * 
 * @author Tamer Elsayed
 * @author Jimmy Lin
 * 
 */
public class PCP extends PowerTool {


  private static final Logger sLogger = Logger.getLogger(PCP.class);
  {
    sLogger.setLevel(Level.INFO);
  }


  private static class MyMapper extends MapReduceBase implements
      Mapper<IntWritable, PostingsList, IntWritable, HMapIFW> {


    // table that contains length of all document, to be used in computing
    // similarity
    private DocLengthTable mDocLengthTable;


    // similarity measure
    private ScoringModel mModel;


    // threshold to filter common terms that don't contribute much in
    // similarities
    private int dfCut;


    // starting row (in similarity matrix) to be computed
    private int mBlockStart;


    // ending row (in similarity matrix) to be computed
    private int mBlockEnd;


    // collection size
    private int mCollectionDocCount;


    public void configure(JobConf job) {
      mCollectionDocCount = job.getInt("Ivory.CollectionDocumentCount", -1);


      try {
        if (job.get("mapred.job.tracker").equals("local")) {
          FileSystem fs = FileSystem.getLocal(job);
          RetrievalEnvironment re = new RetrievalEnvironment(job.get("Ivory.IndexPath"),
              fs);
          Path path = re.getDoclengthsData();
          sLogger.debug("Reading doclengths: " + path);
          mDocLengthTable = new DocLengthTable2B(path, fs);
        } else {
          Path[] localFiles = DistributedCache.getLocalCacheFiles(job);
          mDocLengthTable = new DocLengthTable2B(localFiles[0], FileSystem.getLocal(job));
        }
      } catch (Exception e) {
        throw new RuntimeException("Error initializing DocLengthTable!");
      }


      dfCut = job.getInt("Ivory.DfCut", -1);
      mBlockStart = job.getInt("Ivory.BlockStart", -1);
      mBlockEnd = job.getInt("Ivory.BlockEnd", -1);
      if (dfCut <= 0 || mBlockStart < 0 || mBlockEnd <= 0)
        throw new RuntimeException("Invalid config parameter(s): dfCut=" + dfCut
            + ", blockStart=" + mBlockStart + ", blockEnd=" + mBlockEnd);


      try {
        mModel = (ScoringModel) Class.forName(job.get("Ivory.ScoringModel")).newInstance();
      } catch (Exception e) {
        throw new RuntimeException("Mappers failed to initialize!");
      }
      // this only needs to be set once for the entire collection
      mModel.setDocCount(mDocLengthTable.getDocCount());
      mModel.setAvgDocLength(mDocLengthTable.getAvgDocLength());


    }


    Posting e1 = new Posting();
    Posting e2 = new Posting();


    public void map(IntWritable key, PostingsList postings,
        OutputCollector<IntWritable, HMapIFW> output, Reporter reporter) throws IOException {


      sLogger.debug(mCollectionDocCount);
      postings.setCollectionDocumentCount(mCollectionDocCount);


      PostingsReader reader1 = postings.getPostingsReader();


      if (reader1.getNumberOfPostings() > dfCut)
        return;


      // set per postings list
      mModel.setDF(reader1.getNumberOfPostings());


      // performing PCP
      while (reader1.nextPosting(e1)) {


        // Here's a hidden dependency: How we do the blocking depends on
        // how the postings are sorted. If the postings are sorted in
        // ascending docno, then we can break out of the loop after
        // we've gone past the block bounds (as in code below).
        // Otherwise (say, if postings are sorted by tf), we have to go
        // through all postings.
        // -- Jimmy, 2008/09/03
        //


        if (e1.getDocno() < mBlockStart)
          continue;
        if (e1.getDocno() >= mBlockEnd)
          break;


        HMapIFW map = new HMapIFW();


        sLogger.debug(key + ": " + e1);


        PostingsReader reader2 = postings.getPostingsReader();


        while (reader2.nextPosting(e2)) {


          sLogger.debug(key + ": " + e1 + ", " + e2);


          if (e1.getDocno() == e2.getDocno())
            continue;


          // compute partial score of similarity for a pair of
          // documents
          float weight = mModel.computeScore(e1.getScore(), e2.getScore(),
              mDocLengthTable.getDocLength(e1.getDocno()), mDocLengthTable
                  .getDocLength(e2.getDocno()));


          map.put(e2.getDocno(), weight);
        }
        output.collect(new IntWritable(e1.getDocno()), map);
      }
    }
  }


  private static class MyReducer extends MapReduceBase implements
      Reducer<IntWritable, HMapIFW, IntWritable, HMapIFW> {


    HMapIFW map = new HMapIFW();
    HMapIFW newMap = new HMapIFW();
    int topN = -1;


    public void configure(JobConf job) {
      topN = job.getInt("Ivory.TopN", -1);
    }


    public void reduce(IntWritable doc, Iterator<HMapIFW> values,
        OutputCollector<IntWritable, HMapIFW> output, Reporter reporter) throws IOException {


      map.clear();
      while (values.hasNext()) {
        map.plus(values.next());
      }
      newMap.clear();
      if (topN > 0) {
        // get only top N similar documents
        int i = 0;
        for (MapIF.Entry e : map.getEntriesSortedByValue()) {
          if (i >= topN)
            break;
          newMap.put(e.getKey(), e.getValue());
          i++;
        }
      } else {
        for (MapIF.Entry e : map.getEntriesSortedByValue())
          newMap.put(e.getKey(), e.getValue());
      }


      // note: output is not sorted but will only include top N if needed
      output.collect(doc, newMap);
    }
  }


  /**
   * Creates the tool; the configuration is passed unchanged to the superclass
   * constructor.
   *
   * @param conf configuration forwarded to the superclass
   */
  public PCP(Configuration conf) {
    super(conf);
  }


  /**
   * Names of the configuration properties this tool declares as required.
   * The array is returned as-is by {@code getRequiredParameters()}, and every
   * key listed here is read from the configuration in {@code runTool()}.
   */
  public static final String[] RequiredParameters = {
      "Ivory.IndexPath",
      "Ivory.OutputPath",
      "Ivory.NumMapTasks",
      "Ivory.NumReduceTasks",
      "Ivory.ScoringModel",
      "Ivory.DfCut",
      "Ivory.BlockSize",
      "Ivory.TopN"
  };


  /**
   * Returns the names of the required configuration properties.
   *
   * @return the shared {@code RequiredParameters} array itself (not a copy)
   */
  public String[] getRequiredParameters() {
    return RequiredParameters;
  }


  public int runTool() throws Exception {
    String indexPath = getConf().get("Ivory.IndexPath");
    String outputPath = getConf().get("Ivory.OutputPath");


    int mapTasks = getConf().getInt("Ivory.NumMapTasks", 0);
    int reduceTasks = getConf().getInt("Ivory.NumReduceTasks", 0);
    int dfCut = getConf().getInt("Ivory.DfCut", -1);
    int blockSize = getConf().getInt("Ivory.BlockSize", -1);
    int topN = getConf().getInt("Ivory.TopN", -1);


    FileSystem fs = FileSystem.get(getConf());


    RetrievalEnvironment re = new RetrievalEnvironment(indexPath, fs);


    String collectionName = re.readCollectionName();
    int numDocs = re.readCollectionDocumentCount();
    Path docLengthPath = re.getDoclengthsData();
    String scoringModel = getConf().get("Ivory.ScoringModel");


    sLogger.info("Characteristics of the collection:");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info("Characteristics of the job:");
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - NumReduceTasks: " + reduceTasks);
    sLogger.info(" - DfCut: " + getConf().getInt("Ivory.DfCut", 0));
    sLogger.info(" - BlockSize: " + blockSize);
    sLogger.info(" - ScoringModel: " + scoringModel);
    sLogger.info(" - topN: " + topN);
    sLogger.info(" - OutputPath: " + outputPath);


    getConf().setInt("Ivory.CollectionDocumentCount", numDocs);


    if (fs.exists(new Path(outputPath))) {
      System.out.println("PCP output path already exists!");
      return 0;
    }


    int numBlocks = numDocs / blockSize + 1;


    for (int i = 0; i < numBlocks; i++) {
      int start = blockSize * i;
      int end = i == numBlocks - 1 ? numDocs : blockSize * (i + 1);


      JobConf conf = new JobConf(getConf(), PCP.class);
      DistributedCache.addCacheFile(docLengthPath.toUri(), conf);


      sLogger.info("block " + i + ": " + start + "-" + end);


      conf.setInt("Ivory.BlockStart", start);
      conf.setInt("Ivory.BlockEnd", end);


      conf.setJobName("PCP:" + collectionName + "-dfCut=" + dfCut
          + (topN > 0 ? "-topN" + topN : "-all") + ":Block #" + i);


      conf.setNumMapTasks(mapTasks);
      conf.setNumReduceTasks(reduceTasks);


      String currentOutputPath = outputPath + "/block" + i;


      FileInputFormat.setInputPaths(conf, new Path(re.getPostingsDirectory()));
      FileOutputFormat.setOutputPath(conf, new Path(currentOutputPath));


      conf.setInputFormat(SequenceFileInputFormat.class);
      conf.setOutputKeyClass(IntWritable.class);
      conf.setOutputValueClass(HMapIFW.class);
      conf.setOutputFormat(SequenceFileOutputFormat.class);


      conf.setMapperClass(MyMapper.class);
      conf.setCombinerClass(IdentityReducer.class);
      conf.setReducerClass(MyReducer.class);


      JobClient.runJob(conf);
    }


    return 0;
  }


}
// Source code of ivory.pwsim.PCP

// Related classes of ivory.pwsim.PCP