// Source Code of com.ikanow.infinit.e.processing.custom.output.CustomOutputIndexingEngine

/*******************************************************************************
 * Copyright 2012, The Infinit.e Open Source Project.
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package com.ikanow.infinit.e.processing.custom.output;


import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;


import org.bson.types.ObjectId;
import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.ImmutableSettings.Builder;
import org.elasticsearch.index.query.FilterBuilders;
import org.elasticsearch.index.query.QueryBuilders;


import com.ikanow.infinit.e.data_model.index.ElasticSearchManager;
import com.ikanow.infinit.e.data_model.index.IndexManager;
import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.MongoDbManager;
import com.ikanow.infinit.e.data_model.store.custom.mapreduce.CustomMapReduceJobPojo;
import com.ikanow.infinit.e.data_model.utils.ThreadSafeSimpleDateFormat;
import com.ikanow.infinit.e.processing.custom.utils.InfiniteHadoopUtils;
import com.mongodb.BasicDBObject;
import com.mongodb.CommandResult;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;


public class CustomOutputIndexingEngine {


  ////////////////////////////////////////////////////////////////////////////////////
  
  // TOP LEVEL LOGIC


  //TODO: at some point for performance, it should probably do the indexing from within the custom job output format class
  // we can set up all the initialization from "prepareOutputCollection" (use the size of the previous iteration)
  // and then just sort out the alias swapping from here
  
  /**
   * Tidy up the output collection _before_ it is written into
   */
  public static void prepareOutputCollection(CustomMapReduceJobPojo job) {
    // (currently nothing to do here) 
  }
  
  public static void completeOutput(CustomMapReduceJobPojo job, Date ageOutTime, ObjectId newIdThreshold) {
    
    BasicDBObject outputSettings = new BasicDBObject();
    if (!isIndexed(job, outputSettings)) {
      return;
    }
    
    if ((null == job.appendResults) || !job.appendResults) {
      
      // delete/re-create index, fill with data, swap the aliases, delete the old index 
      
      ElasticSearchManager customIndex = createNewIndex(job, true);
      transferData(customIndex, job, newIdThreshold);
      swapAliases(customIndex, job.communityIds, false);
      // remove old index      
      String indexToRemove = new StringBuilder("custom_").append(job.outputCollection).toString();      
      if (IndexManager.pingIndex(indexToRemove)) { // delete if it exists...
        IndexManager.getIndex(indexToRemove).deleteMe();
      }


    }
    else { // append mode
      //TODO: note that if swapping from non-append to append mode, then might run into...
      // ...shards-too-small issue: will live with this for now (preferred option is to use aliases)
      
      // create index (and add aliases), fill with data, remove aged-out data
      ElasticSearchManager customIndex = getExistingIndex(job);
      if (null == customIndex) { // easy case, create index 
        customIndex = createNewIndex(job, false);
        swapAliases(customIndex, job.communityIds, false);
      }
      transferData(customIndex, job, newIdThreshold);
      deleteOldCustomRecords(customIndex, ageOutTime);
    }
  }//TODO TOTEST (append mode), TESTED (api mode) 
  
  // (Called when the job is completed)
  public static void deleteOutput(CustomMapReduceJobPojo job) {
    String indexToRemove = new StringBuilder("custom_").append(job.outputCollection).toString();      
    if (IndexManager.pingIndex(indexToRemove)) { // delete if it exists...
      IndexManager.getIndex(indexToRemove).deleteMe();
    }
    indexToRemove = new StringBuilder("custom_").append(job.outputCollectionTemp).toString();      
    if (IndexManager.pingIndex(indexToRemove)) { // delete if it exists...
      IndexManager.getIndex(indexToRemove).deleteMe();
    }    
  }//TESTED (by hand via API)
  
  ////////////////////////////////////////////////////////////////////////////////////
  
  // UTILITY CODE
  
  private static boolean isIndexed(CustomMapReduceJobPojo job, BasicDBObject settings) {
    BasicDBObject postProcObject = (BasicDBObject) com.mongodb.util.JSON.parse(InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.POSTPROC));
    if (null == postProcObject) {
      return false;
    }
    if (null != settings) {
      settings.putAll((DBObject)postProcObject);
    }    
    String indexMode = postProcObject.getString("indexMode", null);
    
    return indexMode != null;
  }//TESTED (by hand via API) 
  
  public static boolean isIndexed(CustomMapReduceJobPojo job) {
    return isIndexed(job, null);
  }//TESTED (by hand via API)
  
  public static ElasticSearchManager getExistingIndex(CustomMapReduceJobPojo job) {
    if (!isIndexed(job)) {
      return null;
    }
    boolean appendMode = ((null == job.appendResults) || !job.appendResults);
    String outputCollectionToUse = appendMode ? job.outputCollection : job.outputCollection;
    String indexToBuild = new StringBuilder("custom_").append(outputCollectionToUse).toString();
    
    if (!IndexManager.pingIndex(indexToBuild)) { 
      return null;
    }
    return IndexManager.getIndex(indexToBuild);
  }//TESTED (by hand - via changing the communities in updateJob)
  
  private static ElasticSearchManager createNewIndex(CustomMapReduceJobPojo job, boolean deleteExisting) {
    boolean appendMode = !((null == job.appendResults) || !job.appendResults);
    String outputCollectionToUse = appendMode ? job.outputCollection : job.outputCollectionTemp;
    String indexToBuild = new StringBuilder("custom_").append(outputCollectionToUse).toString();
    
    if (deleteExisting) {
      if (IndexManager.pingIndex(indexToBuild)) { // delete if it exists...
        IndexManager.getIndex(indexToBuild).deleteMe();
      }
    }
    
    // Recreate the index, size will depend on the size of the collection 
    
    DBCollection outputCollection = DbManager.getCollection(job.getOutputDatabase(), outputCollectionToUse);
    long collectionSize = 1L;
    CommandResult outputCollectionStats = outputCollection.getStats();
    if (null != outputCollectionStats) {
      collectionSize = outputCollectionStats.getLong("size", 1L);
    }
    // Let's say 1 shard/500MB
    final long SHARD_SIZE = 500L*1024L*1024L;
    int numShards = 1 + (int)(collectionSize/SHARD_SIZE);
    if (appendMode) { // increase the size for the future shards=1 + 2*(initial size + 1)
      numShards++;
      numShards *= 2;
      numShards++; // (somewhat arbitrary equation, 1 shard -> 5 was pretty much the justification...)
    }    
    
    Builder localSettingsEvent = ImmutableSettings.settingsBuilder();
    localSettingsEvent.put("number_of_shards", numShards).put("number_of_replicas", 0);
    ElasticSearchManager customIndex = IndexManager.createIndex(indexToBuild, "custom", false, null, null, localSettingsEvent);
      // (don't need to set the mapping - that happens automatically because I've registered a template)
    
    return customIndex;    
  }//TESTED (basic functionality)
  //TODO: totest (different shard sizes)
  
    private static ThreadSafeSimpleDateFormat _dateFormat = new ThreadSafeSimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
  
  private static void transferData(ElasticSearchManager customIndex, CustomMapReduceJobPojo job, ObjectId newIdThreshold) {
    //(lots of scope for performance improvements here, but the intention is to move all this into the job itself, so it's probably moot)
    
    boolean appendMode = !((null == job.appendResults) || !job.appendResults);
    String outputCollectionToUse = appendMode ? job.outputCollection : job.outputCollectionTemp;
    DBCollection outputCollection = DbManager.getCollection(job.getOutputDatabase(), outputCollectionToUse);
    
    final int BATCH_SIZE = 1000;
    BasicDBObject query = new BasicDBObject();
    if (null != newIdThreshold) {
      query.put("_id", new BasicDBObject(MongoDbManager.gt_, newIdThreshold));
    }
    DBCursor dbc = outputCollection.find(query).batchSize(BATCH_SIZE);
    Iterator<DBObject> it = dbc.iterator();
    ArrayList<DBObject> toEsList = new ArrayList<DBObject>(BATCH_SIZE);
    String jobtitle = new StringBuilder("custom:").append(job.jobtitle).toString();
    while (it.hasNext()) {
      DBObject dbo = it.next();
      // Object transforms:
      // 1) _id needs to be a string
      ObjectId _id = (ObjectId) dbo.get("_id");
      dbo.put("_id", _id.toString());
      // 2) set timestamp if not set by user:
      Object ts = dbo.get("@timestamp");
      if (null == ts) {
        dbo.put("@timestamp", _dateFormat.format(new Date(_id.getTime())));
      }
      // 3) always overwrite sourceKey with job title
      dbo.put("sourceKey", jobtitle);
      Object message = dbo.get("message");
      // 4) add a message field if one doesn't already exist...
      if (null == message) {
        dbo.put("message", dbo.get("key").toString());
      }
      // 5) Support for incremental mode
      Object updateId = dbo.get("_updateId");
      if (null != updateId) {
        dbo.put("_id", updateId.toString());
      }
      toEsList.add(dbo);
      
      if (toEsList.size() >= BATCH_SIZE) {
        customIndex.bulkAddDocuments(toEsList, "_id", null, true);
        toEsList.clear();
      }
    }
    if (toEsList.size() > 0) {
      customIndex.bulkAddDocuments(toEsList, "_id", null, true);
    }
  }//TESTED (by hand - non-append mode: timestamp, _id, key)
  //TODO: totest .. updateId, message ... append mode
  
  public static void swapAliases(ElasticSearchManager customIndex, List<ObjectId> newCommIds, boolean deleteOldAliases) {
    // Add new aliases as sets of communities (at most 10 per alias)
    TreeSet<ObjectId> newTreeSet = new TreeSet<ObjectId>();
    HashSet<String> aliasSet = new HashSet<String>();
    if (null != newCommIds) {
      newTreeSet.addAll(newCommIds);
      int added = 0;
      StringBuffer sb = new StringBuffer("customs");
      for (ObjectId commId: newCommIds) {
        sb.append("_").append(commId.toString());
        added++;
        if (added >= 10) {
          customIndex.createAlias(sb.toString());
          aliasSet.add(sb.toString());          
          added = 0;
          sb = new StringBuffer("customs");
        }//TESTED
      }
      if (added > 0) {
        customIndex.createAlias(sb.toString());
        aliasSet.add(sb.toString());        
      }//TESTED
    }
    
    if (deleteOldAliases) {
      // Remove all old aliases (unless we already have them)
      ClusterStateResponse retVal = customIndex.getRawClient().admin().cluster().prepareState()
          .setIndices(customIndex.getIndexName())
          .setRoutingTable(false).setNodes(false).setListenerThreaded(false).get();
  
      for (IndexMetaData indexMetadata: retVal.getState().getMetaData()) {
        Iterator<String> aliasIt = indexMetadata.aliases().keysIt();
        while (aliasIt.hasNext()) {
          String alias = aliasIt.next();
          if (!alias.equals(customIndex.getIndexName()) && !aliasSet.contains(alias)) {
            customIndex.removeAlias(alias);          
          }
        }
      }//TESTED
    }
  }//TESTED
  
  @SuppressWarnings("deprecation")
  private static void deleteOldCustomRecords(ElasticSearchManager customIndex, Date deleteThreshold) {
    if (null == deleteThreshold) {
      return;
    }
    customIndex.doDeleteByQuery(QueryBuilders.constantScoreQuery(FilterBuilders.numericRangeFilter("@timestamp").to(deleteThreshold.getTime())));
  }
}
// Source Code of com.ikanow.infinit.e.processing.custom.output.CustomOutputIndexingEngine

// Related Classes of com.ikanow.infinit.e.processing.custom.output.CustomOutputIndexingEngine