Source Code of com.ikanow.infinit.e.hadoop.processing.InfiniteProcessingEngine$InfiniteReducer

/*******************************************************************************
* Copyright 2012 The Infinit.e Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/

package com.ikanow.infinit.e.hadoop.processing;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.log4j.Logger;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.ToolRunner;
import org.bson.BSONObject;

import com.ikanow.infinit.e.data_model.store.MongoDbUtil;
//import com.ikanow.infinit.e.data_model.store.document.CompressedFullTextPojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
//import com.ikanow.infinit.e.data_model.store.feature.association.AssociationFeaturePojo;
//import com.ikanow.infinit.e.data_model.store.feature.entity.EntityFeaturePojo;
import com.ikanow.infinit.e.hadoop.configuration.InfiniteProcessingEngineConfig;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
import com.mongodb.hadoop.io.BSONWritable;
import com.mongodb.hadoop.util.MongoTool;

public class InfiniteProcessingEngine extends MongoTool {

  // You can use this logger to output to the Plugin manager GUI - run in debug mode and "info" messages will appear
  // (in standalone Hadoop mode, error messages will also appear)
  static Logger _logger = Logger.getLogger("com.ikanow.infinit.e.hadoop.processing.InfiniteProcessingEngine");
  static boolean _logMessages = true; // (set to false for improved performance)
 
  // MAPPER
 
  //TODO: pick the output key/value classes
  // (if they are different from those of the reducer then you must use the $mapper_key_class and $mapper_value_class overrides in the query)
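  // For example, if this mapper's output classes differed from the reducer's, the
  // query might include (hypothetical snippet, using the override names above):
  //   { ..., "$mapper_key_class": "org.apache.hadoop.io.Text",
  //          "$mapper_value_class": "com.mongodb.hadoop.io.BSONWritable" }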
  public static class InfiniteMapper extends Mapper<Object, BSONObject, Text, BSONWritable> {
   
    // State:
    InfiniteProcessingEngineConfig _config;
   
    // Folder, if you want it:
    InfiniteFolder _folder = null;
   
    @Override
    protected void setup(Context context) {
      String args = context.getConfiguration().get("arguments");
      if (null != args) {
        _config = InfiniteProcessingEngineConfig.fromApi(args, InfiniteProcessingEngineConfig.class);
      }
      else {
        _config = new InfiniteProcessingEngineConfig(); // (all defaults)
      }
      if (_logMessages) _logger.info("CONFIGURATION = " + _config.toApi());
     
      //TODO: if using the folder
      _folder = new InfiniteFolder();
     
      synchronized (InfiniteProcessingEngine.class) {
        //TODO: anything that needs to be synchronized across multiple mappers/reducers in a JVM
      }
    }
    @Override
    public void map( Object key, BSONObject value, Context context ) throws IOException, InterruptedException
    {
      // 1] Get object
     
      //TODO: optionally serialize into one of the pojos, depending on input type
      // (this is a trade-off between performance and maintainability, you can
      //  also just access "value" directly)
      //DocumentPojo doc = DocumentPojo.fromDb( (BasicDBObject) value, DocumentPojo.class );
      //EntityFeaturePojo ent = EntityFeaturePojo.fromDb( (BasicDBObject) value, EntityFeaturePojo.class );
      //AssociationFeaturePojo assoc = AssociationFeaturePojo.fromDb( (BasicDBObject) value, AssociationFeaturePojo.class );
      //CompressedFullTextPojo fullText = CompressedFullTextPojo.fromDb( (BasicDBObject) value, CompressedFullTextPojo.class );
     
      // (or direct access as described above)
      BasicDBObject record = (BasicDBObject)value;
      String docUrl = record.getString(DocumentPojo.url_, null);
      String docTitle = record.getString(DocumentPojo.title_, null);
      BasicDBList tags = (BasicDBList) record.get(DocumentPojo.tags_);
     
      // 2] Processing and output/folding
     
      //TODO: now do whatever processing you want to and emit (as many times as you want) as follows:
      // example, count tags
      if (null != tags) {
        for (Object tagObj: tags) {
          if (tagObj instanceof String) {
            BSONWritable outVal = new BSONWritable();
            outVal.put(DocumentPojo.url_, docUrl);           
            outVal.put(DocumentPojo.title_, docTitle);
            if (_config.fold) { // fold, can be faster for simpler aggregations
              _folder.fold((String) tagObj, outVal, context);
            }
            else {
              // Write directly rather than folding (the default):
              context.write(new Text((String) tagObj), outVal);
            }           
           
            //DEBUG:
            if (_logMessages) _logger.info("MAPOUT " + tagObj + ": " + MongoDbUtil.convert(outVal).toString());           
          }
        }//(end loop over tags)
      }//(end if tags specified)
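      // (for example, a doc tagged ["news", "sport"] emits two records from the
      //  loop above, keyed "news" and "sport", each carrying the doc's url/title)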
     
    }
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
      //TODO: 1] emit folder cleanup if using
      if (null != _folder) {
        _folder.cleanup(context);
      }
     
      // 2] Any user specific cleanup
      // (none)
    }
   
  }
  // FOLDER OPTIONS
 
  // A folder is a convenient utility for simple statistical operations:
  // it lets you "reduce" on the fly within a mapper, so that a minimal number of
  // objects is exported to the combiner/reducer.
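  // (e.g. rather than emitting one record per document per tag, the folder below
  //  accumulates a per-tag count in memory and emits a single { count: N } record
  //  per tag when flushed via cleanup())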
 
  public static class InfiniteFolder {
   
    protected HashMap<String, BSONWritable> _folderState = new HashMap<String, BSONWritable>();
   
    public void fold(String key, BSONWritable value, @SuppressWarnings("rawtypes") org.apache.hadoop.mapreduce.Mapper.Context context) throws IOException, InterruptedException {
     
      // 1] Get current state, or create if not there
     
      BSONWritable currVal = _folderState.get(key);
      if (null == currVal) {
        currVal = new BSONWritable();
        _folderState.put(key, currVal);
      }
     
      // 2] Processing logic
           
      //TODO: write your logic in here, the key/val can be anything you want, since you control the map code also
      // Eg: count the docs
      Integer currDocs = (Integer) currVal.get("count");
      if (null == currDocs) {
        currDocs = 0;
      }
      currDocs++;
      currVal.put("count", currDocs);
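      // (e.g. after folding three docs under the key "news", _folderState maps
      //  "news" -> { count: 3 }, so at most one record per key is held in memory)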
         
      //DEBUG:
      if (_logMessages) _logger.info("FOLD_IN " + key + ": " + MongoDbUtil.convert(value) + "->" + MongoDbUtil.convert(currVal).toString());           
           
      // 3] Emit if the in-memory state is getting too large; this is why
      // "context" is passed in.
      //TODO: tune this size threshold to your job, eg:
      if (_folderState.size() > 1000) {
        cleanup(context);
      }
    }
   
    @SuppressWarnings("unchecked")
    public void cleanup(@SuppressWarnings("rawtypes") org.apache.hadoop.mapreduce.Mapper.Context context) throws IOException, InterruptedException {
      //TODO: ensure the classes here are correct
      for (Map.Entry<String, BSONWritable> it: _folderState.entrySet()) {
        context.write(new Text(it.getKey()), it.getValue());
       
        //DEBUG:
        if (_logMessages) _logger.info("FOLD_OUT " + it.getKey() + ": " + MongoDbUtil.convert(it.getValue()).toString());           
      }
      _folderState.clear();
    }
  } 
 
  // COMBINER OPTIONS
 
  // There are 4 options:
  // 1) Use no combiner - configure this from the API
  // 2) Have a combiner that just passes data directly through - see InfiniteNullCombiner
  // 3) Use the reducer as a combiner - configure this from the API
  // 4) Have a custom combiner - see InfiniteCustomCombiner
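  // (For reference: in vanilla Hadoop, options 2-4 would be wired into the job
  //  driver with something like "job.setCombinerClass(InfiniteCustomCombiner.class);"
  //  - hypothetical driver code, since here the combiner is configured via the API)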
 
  //TODO need to ensure these key/value classes are correct (input and output)
  public static class InfiniteNullCombiner extends Reducer<Text, BSONWritable, Text, BSONWritable>
  {
    public void reduce( Text key, Iterable<BSONWritable> values, Context context )
    throws IOException, InterruptedException
    {
      for (BSONWritable value: values) {
        context.write(key, value);
      }
    }
  }
 
  //TODO need to ensure these key/value classes are correct (input and output)
  public static class InfiniteCustomCombiner extends Reducer<Text, BSONWritable, Text, BSONWritable>
  {
    public void reduce( Text key, Iterable<BSONWritable> values, Context context )
    throws IOException, InterruptedException
    {
      // Do whatever processing you want to and output the result, eg:
      //TODO job specific processing
      // eg count the docs for each tag
      int numDocs = 0;
      for (BSONWritable value: values) {
       
        if (!value.containsField("count")) { // (if directly from the mapper)
          //DEBUG
          if (_logMessages && (0 == numDocs)) _logger.info("COMBINE_IN_FROM_MAPPER, eg " + key + ": " + MongoDbUtil.convert(value));
          numDocs++;
        }
        else {
          //DEBUG
          if (_logMessages) _logger.info("COMBINE_IN_FROM_FOLDER " + key + ": " + MongoDbUtil.convert(value));
         
          Integer docs = (Integer) value.get("count"); // (if from the folder)
          numDocs += docs;
        }
      }
      BSONWritable outVal = new BSONWritable();
      outVal.put("count", numDocs);
      context.write(key, outVal);

      //DEBUG
      if (_logMessages) _logger.info("COMBINE_OUT " + key + ": " + MongoDbUtil.convert(outVal));
    }
  }
   
  // REDUCER

  //TODO need to ensure these key/value classes are correct (input and output)
  public static class InfiniteReducer extends Reducer<Text, BSONWritable, Text, BSONWritable>
  {
    public void reduce( Text key, Iterable<BSONWritable> values, Context context )
    throws IOException, InterruptedException
    {
      // Do whatever processing you want to and output the result, eg:
      //TODO job specific processing
      // eg count the docs for each tag
      int numDocs = 0;
      for (BSONWritable value: values) {       
        if (!value.containsField("count")) { // (if directly from the mapper OR from the null combiner)
          //DEBUG
          if (_logMessages && (0 == numDocs)) _logger.info("REDUCE_IN_FROM_MAPPER, eg " + key + ": " + MongoDbUtil.convert(value));
         
          numDocs++;
        }
        else {
          //DEBUG
          if (_logMessages) _logger.info("REDUCE_IN_FROM_COMBO " + key + ": " + MongoDbUtil.convert(value));
         
          Integer docs = (Integer) value.get("count"); // (if from the custom combiner OR directly from the folder)
          numDocs += docs;
        }
      }
      BSONWritable outVal = new BSONWritable();
      outVal.put("count", numDocs);
      context.write(key, outVal);

      //DEBUG
      if (_logMessages) _logger.info("REDUCE_OUT " + key + ": " + MongoDbUtil.convert(outVal));
    }
  } 
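  // For standalone testing, the engine would typically be launched via the standard
  // Hadoop runner, along these lines (hypothetical jar name and arguments):
  //   hadoop jar infinite-hadoop-plugin.jar \
  //     com.ikanow.infinit.e.hadoop.processing.InfiniteProcessingEngine <args>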
 
  /**
   * Standard entry point, e.g. for running the plugin in standalone Hadoop mode.
   * @param args command line arguments, passed through to ToolRunner
   * @throws Exception passed up from ToolRunner if the job fails
   */
  public static void main(String[] args) throws Exception {
    final int exitCode = ToolRunner.run( new InfiniteProcessingEngine(), args );
    System.exit( exitCode );
  }

}