Package ivory.core.index

Source Code of ivory.core.index.DistributeGlobalStatsToPostings$MyMapper

package ivory.core.index;


import ivory.core.RetrievalEnvironment;
import ivory.core.data.dictionary.DefaultCachedFrequencySortedDictionary;
import ivory.core.data.index.PostingsList;
import ivory.core.data.index.PostingsListDocSortedPositional;
import ivory.core.data.stat.PrefixEncodedGlobalStats;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.log4j.Logger;


import edu.umd.cloud9.io.pair.PairOfIntLong;
import edu.umd.cloud9.util.PowerTool;

public class DistributeGlobalStatsToPostings extends PowerTool {
  private static final Logger sLogger = Logger.getLogger(DistributeGlobalStatsToPostings.class);

  private static class MyMapper extends MapReduceBase implements
      Mapper<IntWritable, PostingsList, IntWritable, PostingsList> {

    private PrefixEncodedGlobalStats gs;

    private DefaultCachedFrequencySortedDictionary mTermIdMap;

    public void configure(JobConf job) {
      try {
        Path[] localFiles = DistributedCache.getLocalCacheFiles(job);

        sLogger.info("0: " + localFiles[0]);
        sLogger.info("1: " + localFiles[1]);
        sLogger.info("2: " + localFiles[2]);
        sLogger.info("3: " + localFiles[3]);
        sLogger.info("4: " + localFiles[4]);
        sLogger.info("5: " + localFiles[5]);

        FileSystem fs = FileSystem.getLocal(job);

        gs = new PrefixEncodedGlobalStats(localFiles[0], fs);
        gs.loadDFStats(localFiles[1], fs);
        gs.loadCFStats(localFiles[2], fs);

        String indexPath = job.get("Ivory.IndexPath");
        sLogger.info("loading TermIdMap from " + indexPath);
        mTermIdMap = new DefaultCachedFrequencySortedDictionary(localFiles[3], localFiles[4],
            localFiles[5], 0.2f, fs);
      } catch (IOException e) {
        e.printStackTrace();
        throw new RuntimeException("Error loading global term stats!");
      }
    }

    public void map(IntWritable key, PostingsList p,
        OutputCollector<IntWritable, PostingsList> output, Reporter reporter)
        throws IOException {

      // map from the id back to text
      // sLogger.info("termid: " + key);
      String term = mTermIdMap.getTerm(key.get());
      // sLogger.info("term: " + term);
      PairOfIntLong pair = gs.getStats(term);

      if (pair == null) {
        p.setCf(-1);
        p.setDf(-1);
      } else {
        p.setCf(pair.getRightElement());
        p.setDf(pair.getLeftElement());
      }

      output.collect(key, p);
    }
  }

  public static final String[] RequiredParameters = { "Ivory.IndexPath", "Ivory.GlobalStatsPath",
      "Ivory.NumMapTasks" };

  public String[] getRequiredParameters() {
    return RequiredParameters;
  }

  public DistributeGlobalStatsToPostings(Configuration conf) {
    super(conf);
  }

  public int runTool() throws Exception {
    sLogger.info("Distributing df/cf stats...");

    JobConf conf = new JobConf(getConf(), DistributeGlobalStatsToPostings.class);
    FileSystem fs = FileSystem.get(conf);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int reduceTasks = conf.getInt("Ivory.NumReduceTasks", 1);

    String indexPath = conf.get("Ivory.IndexPath");
    String statsPath = conf.get("Ivory.GlobalStatsPath");

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    String collectionName = env.readCollectionName();

    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - NumReduceTasks: " + reduceTasks);

    // back up old stats
    Path p1 = new Path(indexPath + "/property.CollectionDocumentCount");
    Path p2 = new Path(indexPath + "/property.CollectionDocumentCount.local");

    if (!fs.exists(p2)) {
      sLogger.info("preserving local " + p1.getName());
      fs.rename(p1, p2);
    }

    p1 = new Path(indexPath + "/property.CollectionAverageDocumentLength");
    p2 = new Path(indexPath + "/property.CollectionAverageDocumentLength.local");

    if (!fs.exists(p2)) {
      sLogger.info("preserving local " + p1.getName());
      fs.rename(p1, p2);
    }

    p1 = new Path(indexPath + "/property.CollectionLength");
    p2 = new Path(indexPath + "/property.CollectionLength.local");

    if (!fs.exists(p2)) {
      sLogger.info("preserving local " + p1.getName());
      fs.rename(p1, p2);
    }

    // distribute global stats
    RetrievalEnvironment genv = new RetrievalEnvironment(statsPath, fs);
    long collectionLength = genv.readCollectionLength();
    int docCount = genv.readCollectionDocumentCount();
    float avgdl = genv.readCollectionAverageDocumentLength();

    sLogger.info("writing global stats from all index segments: ");
    sLogger.info(" - CollectionLength: " + collectionLength);
    sLogger.info(" - CollectionDocumentCount: " + docCount);
    sLogger.info(" - AverageDocumentLength: " + avgdl);

    env.writeCollectionLength(collectionLength);
    env.writeCollectionDocumentCount(docCount);
    env.writeCollectionAverageDocumentLength(avgdl);

    // preserve old postings
    Path postingsPath1 = new Path(indexPath + "/postings/");
    Path postingsPath2 = new Path(indexPath + "/postings.old/");

    if (fs.exists(postingsPath1)) {
      sLogger.info("renaming " + postingsPath1.getName() + " to " + postingsPath2.getName());
      fs.rename(postingsPath1, postingsPath2);
    }

    conf.setJobName("DistributeGlobalStatsToPostings:" + collectionName);

    FileInputFormat.setInputPaths(conf, postingsPath2);
    FileOutputFormat.setOutputPath(conf, postingsPath1);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);

    DistributedCache.addCacheFile(new URI(statsPath + "/dict.terms"), conf);
    DistributedCache.addCacheFile(new URI(statsPath + "/dict.df"), conf);
    DistributedCache.addCacheFile(new URI(statsPath + "/dict.cf"), conf);

    DistributedCache.addCacheFile(new URI(env.getIndexTermsData()), conf);
    DistributedCache.addCacheFile(new URI(env.getIndexTermIdsData()), conf);
    DistributedCache.addCacheFile(new URI(env.getIndexTermIdMappingData()), conf);

    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("mapred.map.max.attempts", 10);
    conf.setInt("mapred.reduce.max.attempts", 10);
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PostingsListDocSortedPositional.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    JobClient.runJob(conf);

    return 0;
  }

  public static void main(String[] args) throws Exception {
    if (args.length != 2) {
      System.out.println("usage: [global-stats] [index-path]");
      System.exit(-1);
    }

    Configuration conf = new Configuration();

    String gsPath = args[0];
    String indexPath = args[1];

    conf.set("Ivory.IndexPath", indexPath);
    conf.set("Ivory.GlobalStatsPath", gsPath);
    conf.setInt("Ivory.NumMapTasks", 100);

    sLogger.info("Distributing global statistics to " + indexPath);
    new DistributeGlobalStatsToPostings(conf).run();
  }
}
TOP

Related Classes of ivory.core.index.DistributeGlobalStatsToPostings$MyMapper

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.