/*******************************************************************************
* Copyright 2012 The Infinit.e Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package com.ikanow.infinit.e.hadoop.processing;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.log4j.Logger;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.ToolRunner;
import org.bson.BSONObject;
import com.ikanow.infinit.e.data_model.store.MongoDbUtil;
//import com.ikanow.infinit.e.data_model.store.document.CompressedFullTextPojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
//import com.ikanow.infinit.e.data_model.store.feature.association.AssociationFeaturePojo;
//import com.ikanow.infinit.e.data_model.store.feature.entity.EntityFeaturePojo;
import com.ikanow.infinit.e.hadoop.configuration.InfiniteProcessingEngineConfig;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
import com.mongodb.hadoop.io.BSONWritable;
import com.mongodb.hadoop.util.MongoTool;
public class InfiniteProcessingEngine extends MongoTool {
// You can use this logger to output to the Plugin manager GUI - run in debug mode and "info" messages will appear
// (in standalone Hadoop mode, error messages will also appear)
static Logger _logger = Logger.getLogger("com.ikanow.infinit.e.hadoop.processing.InfiniteProcessingEngine");
static boolean _logMessages = true; // (set to false for improved performance)
// MAPPER
//TODO: pick the output key/value classes
// (if they differ from those of the reducer then you must use the $mapper_key_class and $mapper_value_class overrides in the query)
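// (illustrative example of those overrides inside the query object - the exact
//  syntax/placement depends on your Infinit.e version:
//    "$mapper_key_class": "org.apache.hadoop.io.Text",
//    "$mapper_value_class": "com.mongodb.hadoop.io.BSONWritable"
// )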
public static class InfiniteMapper extends Mapper<Object, BSONObject, Text, BSONWritable> {
// State:
InfiniteProcessingEngineConfig _config;
// Folder, if you want it:
InfiniteFolder _folder = null;
@Override
protected void setup(Context context) {
String args = context.getConfiguration().get("arguments");
if (null != args) {
_config = InfiniteProcessingEngineConfig.fromApi(args, InfiniteProcessingEngineConfig.class);
}
else {
_config = new InfiniteProcessingEngineConfig(); // (all defaults)
}
if (_logMessages) _logger.info("CONFIGURATION = " + _config.toApi());
//TODO: remove this if you are not using the folder
_folder = new InfiniteFolder();
synchronized (InfiniteProcessingEngine.class) {
//TODO: anything that needs to be synchronized across multiple mappers/reducers in a JVM
}
}
@Override
public void map( Object key, BSONObject value, Context context ) throws IOException, InterruptedException
{
// 1] Get object
//TODO: optionally serialize into one of the pojos, depending on input type
// (this is a trade-off between performance and maintainability, you can
// also just access "value" directly)
//DocumentPojo doc = DocumentPojo.fromDb( (BasicDBObject) value, DocumentPojo.class );
//EntityFeaturePojo ent = EntityFeaturePojo.fromDb( (BasicDBObject) value, EntityFeaturePojo.class );
//AssociationFeaturePojo assoc = AssociationFeaturePojo.fromDb( (BasicDBObject) value, AssociationFeaturePojo.class );
//CompressedFullTextPojo fullText = CompressedFullTextPojo.fromDb( (BasicDBObject) value, CompressedFullTextPojo.class );
// (or direct access as described above)
BasicDBObject record = (BasicDBObject)value;
String docUrl = record.getString(DocumentPojo.url_, null);
String docTitle = record.getString(DocumentPojo.title_, null);
BasicDBList tags = (BasicDBList) record.get(DocumentPojo.tags_);
// 2] Processing and output/folding
//TODO: now do whatever processing you want to and emit (as many times as you want) as follows:
// example, count tags
if (null != tags) {
for (Object tagObj: tags) {
if (tagObj instanceof String) {
BSONWritable outVal = new BSONWritable();
outVal.put(DocumentPojo.url_, docUrl);
outVal.put(DocumentPojo.title_, docTitle);
if (_config.fold) { // fold, can be faster for simpler aggregations
_folder.fold((String) tagObj, outVal, context);
}
else {
// Write directly, don't fold (the default):
context.write(new Text((String) tagObj), outVal);
}
//DEBUG:
if (_logMessages) _logger.info("MAPOUT " + tagObj + ": " + MongoDbUtil.convert(outVal).toString());
}
}//(end loop over tags)
}//(end if tags specified)
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
//TODO: 1] flush the folder's remaining state, if using the folder
if (null != _folder) {
_folder.cleanup(context);
}
// 2] Any user specific cleanup
// (none)
}
}
// FOLDER OPTIONS
// A folder is a convenience utility for simple statistical operations:
// it lets you "reduce" on the fly within a mapper, so that the minimum number of
// objects is exported to the combiner/reducer
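// (usage pattern, as shown in InfiniteMapper above: call fold(key, value, context)
//  from map() for each output record, then call cleanup(context) from the mapper's
//  cleanup() so any remaining state is flushed)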
public static class InfiniteFolder {
protected HashMap<String, BSONWritable> _folderState = new HashMap<String, BSONWritable>();
public void fold(String key, BSONWritable value, @SuppressWarnings("rawtypes") org.apache.hadoop.mapreduce.Mapper.Context context) throws IOException, InterruptedException {
// 1] Get current state, or create if not there
BSONWritable currVal = _folderState.get(key);
if (null == currVal) {
currVal = new BSONWritable();
_folderState.put(key, currVal);
}
// 2] Processing logic
//TODO: write your logic here - the key/val can be anything you want, since you also control the map code
// Eg: count the docs
Integer currDocs = (Integer) currVal.get("count");
if (null == currDocs) {
currDocs = 0;
}
currDocs++;
currVal.put("count", currDocs);
//DEBUG:
if (_logMessages) _logger.info("FOLD_IN " + key + ": " + MongoDbUtil.convert(value) + "->" + MongoDbUtil.convert(currVal).toString());
// 3] Emit early if the in-memory state is getting too large
//TODO: tune (or remove) this threshold - if you are worried about _folderState growing
// too large, this is where to check its size and flush (hence "context" is passed in), eg:
if (_folderState.size() > 1000) {
cleanup(context);
}
}
@SuppressWarnings("unchecked")
public void cleanup(@SuppressWarnings("rawtypes") org.apache.hadoop.mapreduce.Mapper.Context context) throws IOException, InterruptedException {
//TODO: ensure the classes here are correct
for (Map.Entry<String, BSONWritable> it: _folderState.entrySet()) {
context.write(new Text(it.getKey()), it.getValue());
//DEBUG:
if (_logMessages) _logger.info("FOLD_OUT " + it.getKey() + ": " + MongoDbUtil.convert(it.getValue()).toString());
}
_folderState.clear();
}
}
// COMBINER OPTIONS
// There are 4 options:
// 1) Use no combiner - configure this from the API
// 2) Have a combiner that just passes data directly through - see NullCombiner
// 3) Use the reducer as a combiner - configure this from the API
// 4) Have a custom combiner - see InfiniteCustomCombiner
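// (in plain Hadoop terms: (1) is equivalent to never calling job.setCombinerClass(...),
//  (3) to job.setCombinerClass(InfiniteReducer.class) - here the Infinit.e API performs
//  the equivalent configuration for you, based on the job specification)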
//TODO: ensure these key/value classes are correct (input and output)
public static class InfiniteNullCombiner extends Reducer<Text, BSONWritable, Text, BSONWritable>
{
@Override
public void reduce( Text key, Iterable<BSONWritable> values, Context context )
throws IOException, InterruptedException
{
for (BSONWritable value: values) {
context.write(key, value);
}
}
}
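// (a pass-through combiner like this is mainly useful for testing - it exercises the
//  combiner code path without changing the data that reaches the reducer)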
//TODO: ensure these key/value classes are correct (input and output)
public static class InfiniteCustomCombiner extends Reducer<Text, BSONWritable, Text, BSONWritable>
{
@Override
public void reduce( Text key, Iterable<BSONWritable> values, Context context )
throws IOException, InterruptedException
{
// Do whatever processing you want to and output the result, eg:
//TODO job specific processing
// eg count the docs for each tag
int numDocs = 0;
for (BSONWritable value: values) {
if (!value.containsField("count")) { // (if directly from the mapper)
//DEBUG
if (_logMessages && (0 == numDocs)) _logger.info("COMBINE_IN_FROM_MAPPER, eg " + key + ": " + MongoDbUtil.convert(value));
numDocs++;
}
else {
//DEBUG
if (_logMessages) _logger.info("COMBINE_IN_FROM_FOLDER " + key + ": " + MongoDbUtil.convert(value));
Integer docs = (Integer) value.get("count"); // (if from the folder)
numDocs += docs;
}
}
BSONWritable outVal = new BSONWritable();
outVal.put("count", numDocs);
context.write(key, outVal);
//DEBUG
if (_logMessages) _logger.info("COMBINE_OUT " + key + ": " + MongoDbUtil.convert(outVal));
}
}
// REDUCER
//TODO: ensure these key/value classes are correct (input and output)
public static class InfiniteReducer extends Reducer<Text, BSONWritable, Text, BSONWritable>
{
@Override
public void reduce( Text key, Iterable<BSONWritable> values, Context context )
throws IOException, InterruptedException
{
// Do whatever processing you want to and output the result, eg:
//TODO job specific processing
// eg count the docs for each tag
int numDocs = 0;
for (BSONWritable value: values) {
if (!value.containsField("count")) { // (if directly from the mapper OR from the null combiner)
//DEBUG
if (_logMessages && (0 == numDocs)) _logger.info("REDUCE_IN_FROM_MAPPER, eg " + key + ": " + MongoDbUtil.convert(value));
numDocs++;
}
else {
//DEBUG
if (_logMessages) _logger.info("REDUCE_IN_FROM_COMBO " + key + ": " + MongoDbUtil.convert(value));
Integer docs = (Integer) value.get("count"); // (if from the custom combiner OR directly from the folder)
numDocs += docs;
}
}
BSONWritable outVal = new BSONWritable();
outVal.put("count", numDocs);
context.write(key, outVal);
//DEBUG
if (_logMessages) _logger.info("REDUCE_OUT " + key + ": " + MongoDbUtil.convert(outVal));
}
}
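// (with the example logic above, each reducer output pair is (tag, {"count": n});
//  how that maps onto the output collection - eg whether the key becomes "_id" -
//  depends on the configured output format)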
/**
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
final int exitCode = ToolRunner.run( new InfiniteProcessingEngine(), args );
System.exit( exitCode );
}
}