Package ivory.core.index

Source Code of ivory.core.index.BuildIntPostingsForwardIndex$MyMapper

/*
* Ivory: A Hadoop toolkit for web-scale information retrieval
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package ivory.core.index;

import ivory.core.Constants;
import ivory.core.RetrievalEnvironment;
import ivory.core.data.document.IntDocVector;
import ivory.core.preprocess.PositionalSequenceFileRecordReader;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.log4j.Logger;

import edu.umd.cloud9.util.PowerTool;

public class BuildIntPostingsForwardIndex extends PowerTool {
  private static final Logger LOG = Logger.getLogger(BuildIntPostingsForwardIndex.class);

  public static final long BIG_LONG_NUMBER = 1000000000;

  protected static enum Dictionary { Size };

  private static class MyMapper
      extends Mapper<IntWritable, IntDocVector, IntWritable, Text> {
    @Override
    public void run(Context context) throws IOException, InterruptedException {
      String file = ((FileSplit) context.getInputSplit()).getPath().getName();
      LOG.info("Input file: " + file);

      IntWritable key = new IntWritable();
      Text outputValue = new Text();

      PositionalSequenceFileRecordReader<IntWritable, IntDocVector> reader =
          new PositionalSequenceFileRecordReader<IntWritable, IntDocVector>();
      reader.initialize(context.getInputSplit(), context);

      int fileNo = Integer.parseInt(file.substring(file.lastIndexOf("-") + 1));
      long pos = reader.getPosition();
      while (reader.nextKeyValue()) {
        key = reader.getCurrentKey();
        outputValue.set(fileNo + "\t" + pos);

        context.write(key, outputValue);
        context.getCounter(Dictionary.Size).increment(1);

        pos = reader.getPosition();
      }
      reader.close();
    }
  }

  private static class MyReducer extends Reducer<IntWritable, Text, Text, Text> {
    private String positionsFile;
    private int collectionTermCount;
    private FSDataOutputStream out;
    private int curKeyIndex = 0;

    @Override
    public void setup(Context context) {
      Configuration conf = context.getConfiguration();
      FileSystem fs;
      try {
        fs = FileSystem.get(conf);
      } catch (Exception e) {
        throw new RuntimeException("Error opening the FileSystem!");
      }

      String indexPath = conf.get(Constants.IndexPath);

      RetrievalEnvironment env = null;
      try {
        env = new RetrievalEnvironment(indexPath, fs);
      } catch (IOException e) {
        throw new RuntimeException("Unable to create RetrievalEnvironment!");
      }

      positionsFile = env.getPostingsIndexData();
      collectionTermCount = env.readCollectionTermCount();

      LOG.info("Ivory.PostingsPositionsFile: " + positionsFile);
      LOG.info("Ivory.CollectionTermCount: " + collectionTermCount);

      try {
        out = fs.create(new Path(positionsFile), true);
        out.writeInt(collectionTermCount);
      } catch (Exception e) {
        throw new RuntimeException("Error in creating files!");
      }
    }

    @Override
    public void reduce(IntWritable key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
      Iterator<Text> iter = values.iterator();
      String[] s = iter.next().toString().split("\\s+");

      if (iter.hasNext()) {
        throw new RuntimeException("There shouldn't be more than one value, key=" + key);
      }

      int fileNo = Integer.parseInt(s[0]);
      long filePos = Long.parseLong(s[1]);
      long pos = BIG_LONG_NUMBER * fileNo + filePos;

      curKeyIndex++;

      // This is subtle point: Ivory.CollectionTermCount specifies the number of terms in the
      // collection, computed by BuildTermIdMap. However, this number is sometimes greater than the
      // number of postings that are in the index (as is the case for ClueWeb09). Here's what
      // happens: when creating TermDocVectors, the vocabulary gets populated with tokens that
      // contain special symbols. The vocabulary then gets compressed with front-coding. However,
      // for whatever reason, the current implementation cannot properly handle these special
      // characters. So when we convert from TermDocVectors to IntDocVectors, we can't find the term
      // id for these special tokens. As a result, no postings list get created for them. So there
      // are assigned term ids for which there are no postings. Since IntPostingsForwardIndex
      // assumes consecutive term ids (since it loads position offsets into an array), we must
      // insert "padding".
      // - Jimmy, 5/29/2010

      while (curKeyIndex < key.get()) {
        out.writeLong(-1);
        curKeyIndex++;
      }

      out.writeLong(pos);
    }

    @Override
    public void cleanup(Context context) throws IOException {
      // Insert padding at the end.
      while (curKeyIndex < collectionTermCount) {
        out.writeLong(-1);
        curKeyIndex++;
      }

      out.close();

      if (curKeyIndex != collectionTermCount) {
        throw new IOException(String.format("Expected %d terms, actually got %d terms!",
            collectionTermCount, curKeyIndex));
      }
    }
  }

  public BuildIntPostingsForwardIndex(Configuration conf) {
    super(conf);
  }

  public static final String[] RequiredParameters = { Constants.IndexPath };

  public String[] getRequiredParameters() {
    return RequiredParameters;
  }

  public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    int minSplitSize = conf.getInt(Constants.MinSplitSize, 0);
    String indexPath = conf.get(Constants.IndexPath);

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    String collectionName = env.readCollectionName();

    LOG.info("Tool: " + BuildIntPostingsForwardIndex.class.getCanonicalName());
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));

    Job job = new Job(getConf(),
        BuildIntPostingsForwardIndex.class.getSimpleName() + ":" + collectionName);
    job.setJarByClass(BuildIntPostingsForwardIndex.class);

    Path inputPath = new Path(env.getPostingsDirectory());
    FileInputFormat.setInputPaths(job, inputPath);

    Path postingsIndexPath = new Path(env.getPostingsIndexData());

    if (fs.exists(postingsIndexPath)) {
      LOG.info("Postings forward index path already exists!");
      return 0;
    }
    job.setNumReduceTasks(1);

    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(NullOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);

    return 0;
  }
}
TOP

Related Classes of ivory.core.index.BuildIntPostingsForwardIndex$MyMapper

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.