/**
* Copyright [2012] [Datasalt Systems S.L.]
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.datasalt.pangool.examples.avro;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import org.apache.avro.Schema.Type;
import org.apache.avro.generic.GenericData.Record;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.codehaus.jackson.map.ObjectMapper;
import com.datasalt.pangool.PangoolRuntimeException;
import com.datasalt.pangool.examples.BaseExampleJob;
import com.datasalt.pangool.examples.topicalwordcount.TopicalWordCount;
import com.datasalt.pangool.io.Fields;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.io.Schema.Field;
import com.datasalt.pangool.io.Tuple;
import com.datasalt.pangool.tuplemr.Criteria.Order;
import com.datasalt.pangool.tuplemr.OrderBy;
import com.datasalt.pangool.tuplemr.TupleMRBuilder;
import com.datasalt.pangool.tuplemr.TupleMRException;
import com.datasalt.pangool.tuplemr.TupleMapper;
import com.datasalt.pangool.tuplemr.TupleReducer;
import com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat;
import com.datasalt.pangool.tuplemr.serialization.AvroFieldSerialization;
import com.datasalt.pangool.tuplemr.serialization.AvroFieldSerialization.AvroFieldDeserializer;
/**
* This is an advanced example to illustrate the usage of custom serializers and custom comparators.<br>
*
* In this example the intermediate schema contains just a single Avro Record, whose avro schema is
* "topic:int, word:string, count:int".<br>
* The custom serialization used is defined in {@link AvroFieldSerialization}. In addition to this, a custom comparator
* {@link MyAvroComparator} is used to compare and group by just "topic, word". <br>
* The behaviour is identical to the example {@link TopicalWordCount}, but using the external serialization provided by
* Avro.
*/
public class AvroTopicalWordCount extends BaseExampleJob {
@SuppressWarnings("serial")
public static class TokenizeMapper extends TupleMapper<LongWritable, Text> {

	// Reused across map() calls to avoid per-record allocations.
	protected Tuple tuple;
	protected Record record;
	protected ObjectMapper mapper;

	/**
	 * Creates the reusable Tuple and Avro Record. The Record is placed into the Tuple once
	 * here and then mutated in place for every emitted word.
	 */
	@Override
	public void setup(TupleMRContext context, Collector collector) throws IOException,
			InterruptedException {
		this.mapper = new ObjectMapper();
		tuple = new Tuple(context.getTupleMRConfig().getIntermediateSchema(0));
		record = new Record(getAvroSchema());
		tuple.set("my_avro", record);
	}

	/**
	 * Parses each input line as a JSON document (expected keys: "topicId" -> Integer,
	 * "text" -> String) and emits one tuple per whitespace-delimited token with count = 1.
	 */
	@SuppressWarnings("rawtypes")
	@Override
	public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
			throws IOException, InterruptedException {
		Map document = mapper.readValue(value.toString(), Map.class);
		record.put("topic", (Integer) document.get("topicId"));
		record.put("count", 1);
		StringTokenizer itr = new StringTokenizer((String) document.get("text"));
		while(itr.hasMoreTokens()) {
			record.put("word", itr.nextToken());
			// The tuple already references this same record (set in setup()), so there is
			// no need to re-set it on every token.
			collector.write(tuple);
		}
	}
}
@SuppressWarnings("serial")
public static class CountReducer extends TupleReducer<ITuple, NullWritable> {

	/**
	 * Sums the "count" field of all Avro records in the group and emits a single tuple with
	 * the aggregated count. The last tuple/record instance seen while iterating is reused as
	 * the output carrier (also valid as a combiner, since input and output schemas match).
	 */
	@Override
	public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context, Collector collector)
			throws IOException, InterruptedException, TupleMRException {
		int totalCount = 0;
		ITuple lastTuple = null;
		Record lastRecord = null;
		for(ITuple currentTuple : tuples) {
			Record currentRecord = (Record) currentTuple.get("my_avro");
			totalCount += (Integer) currentRecord.get("count");
			lastTuple = currentTuple;
			lastRecord = currentRecord;
		}
		lastRecord.put("count", totalCount);
		lastTuple.set("my_avro", lastRecord);
		collector.write(lastTuple, NullWritable.get());
	}
}
/** Registers the usage help message shown to the user when wrong arguments are supplied. */
public AvroTopicalWordCount() {
super("Usage: AvroTopicalWordCount [input_path] [output_path]");
}
/**
 * Builds the Pangool intermediate schema: a single non-nullable Avro field named "my_avro"
 * whose payload conforms to {@link #getAvroSchema()}.
 */
static Schema getSchema() {
	List<Field> schemaFields = new ArrayList<Field>();
	schemaFields.add(Fields.createAvroField("my_avro", getAvroSchema(), false));
	return new Schema("schema", schemaFields);
}
/**
 * Builds the Avro record schema "word:string, topic:int, count:int" used as the payload of
 * the single intermediate tuple field.
 */
static org.apache.avro.Schema getAvroSchema() {
	org.apache.avro.Schema recordSchema = org.apache.avro.Schema
			.createRecord("avro_schema", null, null, false);
	recordSchema.setFields(Arrays.asList(
			new org.apache.avro.Schema.Field("word", org.apache.avro.Schema.create(Type.STRING), null, null),
			new org.apache.avro.Schema.Field("topic", org.apache.avro.Schema.create(Type.INT), null, null),
			new org.apache.avro.Schema.Field("count", org.apache.avro.Schema.create(Type.INT), null, null)));
	return recordSchema;
}
/**
* A custom comparator that deserializes bytes to Avro {@link Record} instances, and then compares by "topic" and
* "word" fields.
*
*/
@SuppressWarnings("serial")
public static class MyAvroComparator implements RawComparator<Record>, Serializable {
// MyAvroComparator must be serializable so this must be transient
private transient AvroFieldDeserializer<Record> deser;
private transient Record record1, record2;
private transient DataInputBuffer inputBuffer;
private String avroSchema;
private String[] fields;
public MyAvroComparator(org.apache.avro.Schema avroSchema, String... fields) {
this.avroSchema = avroSchema.toString();
this.fields = fields;
}
// lazy loading of deserializer and buffers
private void init() {
if(deser == null) {
deser = new AvroFieldDeserializer<Record>(org.apache.avro.Schema.parse(avroSchema), false);
}
if(inputBuffer == null) {
inputBuffer = new DataInputBuffer();
}
}
@Override
@SuppressWarnings({ "unchecked", "rawtypes" })
public int compare(Record record1, Record record2) {
for(String field : fields) {
int comparison = ((Comparable) record1.get(field)).compareTo(record2.get(field));
if(comparison != 0) {
return comparison;
}
}
return 0;
}
@Override
public int compare(byte[] b1, int o1, int l1, byte[] b2, int o2, int l2) {
init();
try {
inputBuffer.reset(b1, o1, l1);
deser.open(inputBuffer);
record1 = deser.deserialize(record1);
deser.close();
inputBuffer.reset(b2, o2, l2);
deser.open(inputBuffer);
record2 = deser.deserialize(record2);
deser.close();
return compare(record1, record2);
} catch(IOException e) {
throw new PangoolRuntimeException(e);
}
}
}
/**
 * Configures and runs the job: JSON text input is tokenized into Avro-serialized tuples,
 * grouped/sorted with {@link MyAvroComparator} on (topic, word), and counts are aggregated by
 * {@link CountReducer} (also used as combiner).
 *
 * @param args [input_path, output_path]
 * @return 0 on success, -1 on wrong arguments or job failure (per the Tool.run() contract)
 */
@Override
public int run(String[] args) throws Exception {
	if(args.length != 2) {
		failArguments("Wrong number of arguments");
		return -1;
	}
	delete(args[1]);
	TupleMRBuilder mr = new TupleMRBuilder(conf, "Pangool Topical Word Count");
	mr.addIntermediateSchema(getSchema());
	mr.setGroupByFields("my_avro");
	// here the custom comparator that groups by "topic,word" is used.
	MyAvroComparator customComp = new MyAvroComparator(getAvroSchema(), "topic", "word");
	mr.setOrderBy(new OrderBy().add("my_avro", Order.ASC, customComp));
	mr.addInput(new Path(args[0]), new HadoopInputFormat(TextInputFormat.class), new TokenizeMapper());
	// We'll use a TupleOutputFormat with the same schema than the intermediate schema
	mr.setTupleOutput(new Path(args[1]), getSchema());
	mr.setTupleReducer(new CountReducer());
	mr.setTupleCombiner(new CountReducer());
	try {
		// The Tool.run() contract uses 0 for success and non-zero for failure; the original
		// code returned 1 unconditionally, reporting failure even when the job succeeded.
		return mr.createJob().waitForCompletion(true) ? 0 : -1;
	} finally {
		mr.cleanUpInstanceFiles();
	}
}
/** Command-line entry point; delegates argument/conf handling to Hadoop's ToolRunner. */
public static void main(String[] args) throws Exception {
// NOTE(review): the exit code returned by ToolRunner.run() is discarded, so the JVM exits 0
// regardless of the job outcome — confirm whether calling scripts rely on the exit status.
ToolRunner.run(new AvroTopicalWordCount(), args);
}
}