Source Code of de.jungblut.nlp.mr.TfIdfCalculatorJob$DocumentVectorizerReducer

package de.jungblut.nlp.mr;

import java.io.IOException;

import org.apache.commons.math3.util.FastMath;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import de.jungblut.math.sparse.SparseDoubleVector;
import de.jungblut.writable.VectorWritable;

/**
 * Job that will calculate TF-IDF based on the output of the
 * {@link WordCorpusFrequencyJob}.
 *
 * @author thomas.jungblut
 *
 */
public class TfIdfCalculatorJob {

  public static final String NUMBER_OF_DOCUMENTS_KEY = "documents.num";
  public static final String NUMBER_OF_TOKENS_KEY = "tokens.num";
  public static final String SPAM_DOCUMENT_PERCENTAGE_KEY = "spam.percentage";
  public static final String WORD_COUNT_OUTPUT_KEY = "wordcount.output";

  /**
   * Calculate the sparse vector with TF-IDF.
   */
  public static class DocumentVectorizerReducer extends
      Reducer<Text, TextIntIntIntWritable, Text, VectorWritable> {

    private long numDocs;           // total number of documents in the corpus
    private long documentThreshold; // document frequency above which a token is skipped
    private int numTokens;          // vocabulary size, the dimension of the output vector
    private boolean wordCount;      // if true, emit raw term frequencies instead of TF-IDF

    @Override
    protected void setup(Context context) throws IOException,
        InterruptedException {
      numDocs = context.getConfiguration().getLong(NUMBER_OF_DOCUMENTS_KEY, 1);
      numTokens = context.getConfiguration().getInt(NUMBER_OF_TOKENS_KEY, 1);
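      // tokens that occur in more than this fraction of all documents are
      // considered spam/stopwords and will be skipped in reduce()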
      documentThreshold = (long) (numDocs * context.getConfiguration()
          .getFloat(SPAM_DOCUMENT_PERCENTAGE_KEY, 0.5f));
      wordCount = context.getConfiguration().getBoolean(WORD_COUNT_OUTPUT_KEY,
          false);
    }

    /**
     * Input is the document ID with several (token, document frequency, term
     * frequency, token index) pairs.
     */
    @Override
    protected void reduce(Text key, Iterable<TextIntIntIntWritable> values,
        Context context) throws IOException, InterruptedException {

      SparseDoubleVector vector = new SparseDoubleVector(numTokens);
      for (TextIntIntIntWritable pair : values) {
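        // only keep tokens whose document frequency is below the spam threshold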
        if (documentThreshold > pair.getSecond().get()) {
          double val = 0d;
          if (wordCount) {
            val = pair.getThird().get();
          } else {
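            // tf-idf = tf * (log(N) - log(df)) = tf * log(N / df)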
            val = pair.getThird().get()
                * (FastMath.log(numDocs) - FastMath.log(pair.getSecond().get()));
          }
          vector.set(pair.getFourth().get(), val);
        }
      }

      context.write(key, new VectorWritable(vector));

    }

  }

  /**
   * Calculates TF-IDF vectors from text input in the following format:<br/>
   *
   * <pre>
   * documentid \t corpus
   * </pre>
   *
   * <br/>
   * <br/>
   *
   * It runs two jobs: the first ({@link WordCorpusFrequencyJob}) determines
   * the document frequency of each token as well as its index in the
   * resulting vector; the second computes the actual TF-IDF vectors. The
   * output is a {@link SequenceFile} with {@link Text} as key and
   * {@link VectorWritable} as value.
   */
  public static void main(String[] args) throws Exception {
    if (args.length != 4) {
      System.out
          .println("Usage: <comma separated input paths> <intermediate output path> <output path> <dictionary output path>");
      System.exit(1);
    }
    Configuration conf = new Configuration();
    Job job = WordCorpusFrequencyJob.createJob(args[0], args[1], args[3], conf);
    if (!job.waitForCompletion(true)) {
      System.exit(1);
    }
    long numDocs = WordCorpusFrequencyJob.getNumberOfDocuments(job);
    long numTokens = WordCorpusFrequencyJob.getNumberOfTokens(job);
    conf = new Configuration();
    Job tfIdfJob = createJob(args[1], args[2], conf, numDocs, numTokens);
    tfIdfJob.waitForCompletion(true);
  }

  /**
   * Creates a TF-IDF job.
   *
   * @param in the input path, the output of the {@link WordCorpusFrequencyJob}.
   * @param out the output directory.
   * @param conf the configuration.
   * @param numberOfDocuments the number of documents in the corpus. (The map
   *          input counter value of {@link WordCorpusFrequencyJob}.)
   * @param numberOfTokens the number of tokens in the corpus. (The reduce
   *          input group counter value of {@link WordCorpusFrequencyJob}.)
   * @return a job with the configured properties like name, key/value classes
   *         and sequence file input/output formats.
   */
  public static Job createJob(String in, String out, Configuration conf,
      long numberOfDocuments, long numberOfTokens) throws IOException {

    conf.setLong(NUMBER_OF_DOCUMENTS_KEY, numberOfDocuments);
    conf.setLong(NUMBER_OF_TOKENS_KEY, numberOfTokens);

    Job job = Job.getInstance(conf, "TF-IDF Calculator");

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, new Path(out));

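    // the base Mapper class is the identity function: (documentId, tuple)
    // pairs from the first job are passed straight through to the reducer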
    job.setMapperClass(Mapper.class);
    job.setReducerClass(DocumentVectorizerReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(TextIntIntIntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

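    // a single reduce task writes all document vectors into one output file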
    job.setNumReduceTasks(1);
    return job;
  }

}
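
Below is a minimal sketch of how the resulting vectors could be read back,
reusing the imports from the listing above. It assumes the second job wrote its
SequenceFile output to an "out" directory, that the part file is named
"part-r-00000" (a hypothetical name), and that VectorWritable exposes the
wrapped vector via a getVector() accessor (an assumption about
de.jungblut.writable.VectorWritable):

    Configuration conf = new Configuration();
    // hypothetical part file written by the single reducer
    Path output = new Path("out/part-r-00000");
    try (SequenceFile.Reader reader = new SequenceFile.Reader(conf,
        SequenceFile.Reader.file(output))) {
      Text documentId = new Text();
      VectorWritable tfIdf = new VectorWritable();
      // each record maps a document id to its sparse TF-IDF vector
      while (reader.next(documentId, tfIdf)) {
        System.out.println(documentId + " -> " + tfIdf.getVector());
      }
    }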