Source Code of ivory.core.preprocess.BuildDictionary$MyReducer

/*
* Ivory: A Hadoop toolkit for web-scale information retrieval
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package ivory.core.preprocess;

import it.unimi.dsi.sux4j.mph.TwoStepsLcpMonotoneMinimalPerfectHashFunction;
import it.unimi.dsi.util.FrontCodedStringList;
import it.unimi.dsi.util.ShiftAddXorSignedStringMap;
import ivory.core.Constants;
import ivory.core.RetrievalEnvironment;
import ivory.core.data.dictionary.DictionaryTransformationStrategy;
import ivory.core.util.QuickSort;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.log4j.Logger;

import com.google.common.collect.Lists;

import edu.umd.cloud9.io.pair.PairOfIntLong;
import edu.umd.cloud9.util.PowerTool;

public class BuildDictionary extends PowerTool {
  private static final Logger LOG = Logger.getLogger(BuildDictionary.class);

  protected static enum Terms { Total }

  private static class MyReducer
      extends Reducer<Text, PairOfIntLong, NullWritable, NullWritable> {
    private FSDataOutputStream termsOut, idsOut, idsToTermOut,
        dfByTermOut, cfByTermOut, dfByIntOut, cfByIntOut;
    private int numTerms;
    private int[] seqNums = null;
    private int[] dfs = null;
    private long[] cfs = null;
    private int curKeyIndex = 0;

    private String[] terms;

    @Override
    public void setup(Reducer<Text, PairOfIntLong, NullWritable, NullWritable>.Context context)
        throws IOException {
      LOG.info("Starting setup.");
      Configuration conf = context.getConfiguration();
      FileSystem fs = FileSystem.get(conf);
      RetrievalEnvironment env = new RetrievalEnvironment(conf.get(Constants.IndexPath), fs);

      numTerms = conf.getInt(Constants.CollectionTermCount, 0);

      terms = new String[numTerms];
      seqNums = new int[numTerms];
      dfs = new int[numTerms];
      cfs = new long[numTerms];

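      // One output stream per dictionary component; every stream except termsOut
      // begins with the term count (the terms data itself is written in cleanup()).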
      termsOut = fs.create(new Path(env.getIndexTermsData()), true);
      //termsOut.writeInt(numTerms);

      idsOut = fs.create(new Path(env.getIndexTermIdsData()), true);
      idsOut.writeInt(numTerms);

      idsToTermOut = fs.create(new Path(env.getIndexTermIdMappingData()), true);
      idsToTermOut.writeInt(numTerms);

      dfByTermOut = fs.create(new Path(env.getDfByTermData()), true);
      dfByTermOut.writeInt(numTerms);

      cfByTermOut = fs.create(new Path(env.getCfByTermData()), true);
      cfByTermOut.writeInt(numTerms);

      dfByIntOut = fs.create(new Path(env.getDfByIntData()), true);
      dfByIntOut.writeInt(numTerms);

      cfByIntOut = fs.create(new Path(env.getCfByIntData()), true);
      cfByIntOut.writeInt(numTerms);
      LOG.info("Finished setup.");
    }

    @Override
    public void reduce(Text key, Iterable<PairOfIntLong> values, Context context)
        throws IOException, InterruptedException {
      String term = key.toString();
      Iterator<PairOfIntLong> iter = values.iterator();
      PairOfIntLong p = iter.next();
      int df = p.getLeftElement();
      long cf = p.getRightElement();
      WritableUtils.writeVInt(dfByTermOut, df);
      WritableUtils.writeVLong(cfByTermOut, cf);

      if (iter.hasNext()) {
        throw new RuntimeException("More than one record for term: " + term);
      }

      //termsOut.writeUTF(term);

      terms[curKeyIndex] = term;
      seqNums[curKeyIndex] = curKeyIndex;
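      // Store df negated so that the ascending sort in cleanup() orders terms by descending df.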
      dfs[curKeyIndex] = -df;
      cfs[curKeyIndex] = cf;
      curKeyIndex++;

      context.getCounter(Terms.Total).increment(1);
    }

    @Override
    public void cleanup(
        Reducer<Text, PairOfIntLong, NullWritable, NullWritable>.Context context)
        throws IOException {
      LOG.info("Starting cleanup.");
      if (curKeyIndex != numTerms) {
        throw new RuntimeException("Total expected Terms: " + numTerms +
            ", Total observed terms: " + curKeyIndex + "!");
      }
      // Sort based on df and change seqNums accordingly.
      QuickSort.quicksortWithSecondary(seqNums, dfs, cfs, 0, numTerms - 1);
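      // After sorting, position i holds the term with the (i + 1)-th largest df;
      // seqNums[i] is that term's position in the reducer's input order.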

      // Write dfs and cfs keyed by term id (i.e., in descending-df order).
      for (int i = 0; i < numTerms; i++) {
        WritableUtils.writeVInt(dfByIntOut, -dfs[i]);
        WritableUtils.writeVLong(cfByIntOut, cfs[i]);
      }
      cfs = null;

      // Reuse the dfs array to hold term ids: the term at sorted position i gets id i + 1,
      // so the most frequent term has id 1.
      for (int i = 0; i < numTerms; i++) {
        dfs[i] = i + 1;
      }

      // Write seqNums: the i-th entry maps term id i + 1 to that term's index in the terms array.
      for (int i = 0; i < numTerms; i++)
        idsToTermOut.writeInt(seqNums[i]);

      // Re-sort with seqNums as the key so that dfs (now holding ids) follows the original term order.
      QuickSort.quicksort(dfs, seqNums, 0, numTerms - 1);
      for (int i = 0; i < numTerms; i++) {
        idsOut.writeInt(dfs[i]);
      }

      ByteArrayOutputStream bytesOut;
      ObjectOutputStream objOut;
      byte[] bytes;

      List<String> termList = Lists.newArrayList(terms);
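      // Front-code the sorted terms (a full string kept every 8 entries, UTF-8)
      // and serialize the compressed list into the terms file.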
      FrontCodedStringList frontcodedList = new FrontCodedStringList(termList, 8, true);

      bytesOut = new ByteArrayOutputStream();
      objOut = new ObjectOutputStream(bytesOut);
      objOut.writeObject(frontcodedList);
      objOut.close();

      bytes = bytesOut.toByteArray();
      termsOut.writeInt(bytes.length);
      termsOut.write(bytes);

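      // Build a signed, monotone minimal perfect hash mapping each term to its position in the
      // sorted list; the signatures let lookups reject most strings not in the dictionary.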
      ShiftAddXorSignedStringMap dict = new ShiftAddXorSignedStringMap(termList.iterator(),
          new TwoStepsLcpMonotoneMinimalPerfectHashFunction<CharSequence>(termList,
              new DictionaryTransformationStrategy(true)));

      bytesOut = new ByteArrayOutputStream();
      objOut = new ObjectOutputStream(bytesOut);
      objOut.writeObject(dict);
      objOut.close();

      bytes = bytesOut.toByteArray();
      termsOut.writeInt(bytes.length);
      termsOut.write(bytes);

      termsOut.close();
      idsOut.close();
      idsToTermOut.close();
      dfByTermOut.close();
      cfByTermOut.close();
      dfByIntOut.close();
      cfByIntOut.close();
      LOG.info("Finished cleanup.");
    }
  }

  public static final String[] RequiredParameters = {
      Constants.CollectionName, Constants.IndexPath };

  public String[] getRequiredParameters() {
    return RequiredParameters;
  }

  public BuildDictionary(Configuration conf) {
    super(conf);
  }

  public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    String collectionName = conf.get(Constants.CollectionName);

    LOG.info("PowerTool: " + BuildDictionary.class.getCanonicalName());
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    if (!fs.exists(new Path(indexPath))) {
      LOG.error("index path doesn't exist: skipping!");
      return 0;
    }

    if (fs.exists(new Path(env.getIndexTermsData())) &&
        fs.exists(new Path(env.getIndexTermIdsData())) &&
        fs.exists(new Path(env.getIndexTermIdMappingData())) &&
        fs.exists(new Path(env.getDfByTermData())) &&
        fs.exists(new Path(env.getCfByTermData())) &&
        fs.exists(new Path(env.getDfByIntData())) &&
        fs.exists(new Path(env.getCfByIntData()))) {
      LOG.info("term and term id data exist: skipping!");
      return 0;
    }

    conf.setInt(Constants.CollectionTermCount, (int) env.readCollectionTermCount());
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    Path tmpPath = new Path(env.getTempDirectory());
    fs.delete(tmpPath, true);

    Job job = new Job(conf,
        BuildDictionary.class.getSimpleName() + ":" + collectionName);

    job.setJarByClass(BuildDictionary.class);
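    // A single reducer sees all terms, so it can assign globally consistent term ids.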
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(env.getTermDfCfDirectory()));
    FileOutputFormat.setOutputPath(job, tmpPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfIntLong.class);
    job.setOutputKeyClass(Text.class);
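    // Sort map output with the dictionary's transformation strategy so the reducer
    // receives terms in dictionary order.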
    job.setSortComparatorClass(DictionaryTransformationStrategy.WritableComparator.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    fs.delete(tmpPath, true);

    return 0;
  }
}
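
A driver might configure and launch the tool roughly as follows. This is a hypothetical sketch: the index path and collection name are placeholders, and it assumes (as with other cloud9 PowerTool subclasses) that run() validates the RequiredParameters before delegating to runTool().

Configuration conf = new Configuration();
conf.set(Constants.IndexPath, "/path/to/index");      // placeholder
conf.set(Constants.CollectionName, "myCollection");   // placeholder
int exitCode = new BuildDictionary(conf).run();       // assumption: PowerTool.run() checks params, then calls runTool()

The id assignment in cleanup() is the least obvious part of the reducer: dfs is negated so that an ascending sort yields descending document frequency, the sorted position determines the term id, and seqNums carries the permutation needed to write the id-to-term and term-to-id mappings. The sketch below illustrates the same idea on a toy term list using plain Java instead of Ivory's QuickSort; the class and variable names are illustrative only.

import java.util.Arrays;
import java.util.Comparator;

// Illustrative only: mirrors the id-assignment trick in MyReducer.cleanup().
public class TermIdDemo {
  public static void main(String[] args) {
    String[] terms = { "apple", "banana", "cherry" };  // reducer input order (sorted by the dictionary comparator)
    int[] dfs = { 5, 9, 2 };                           // document frequencies for those terms

    int n = terms.length;
    Integer[] seqNums = new Integer[n];
    for (int i = 0; i < n; i++) {
      seqNums[i] = i;
    }

    // Sort sequence numbers by descending df; the reducer gets the same effect
    // by negating dfs and sorting ascending.
    Arrays.sort(seqNums, Comparator.comparingInt((Integer i) -> -dfs[i]));

    // Position i in the sorted order corresponds to term id i + 1,
    // so the most frequent term receives id 1.
    int[] ids = new int[n];  // term position -> term id (what idsOut stores)
    for (int i = 0; i < n; i++) {
      ids[seqNums[i]] = i + 1;
    }

    // seqNums itself is the id -> term-position mapping (what idsToTermOut stores).
    for (int i = 0; i < n; i++) {
      System.out.println(terms[i] + " -> id " + ids[i]);
    }
    // Prints: apple -> id 2, banana -> id 1, cherry -> id 3
  }
}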