// Package: com.ikanow.infinit.e.data_model.custom
//
// Source code of com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter

/*******************************************************************************
* Copyright 2012 The Infinit.e Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package com.ikanow.infinit.e.data_model.custom;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NavigableSet;
import java.util.TreeSet;

import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.log4j.Logger;
import org.bson.BasicBSONObject;
import org.bson.types.ObjectId;

import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.MongoDbManager;
import com.ikanow.infinit.e.data_model.store.config.source.SourceHarvestStatusPojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.MongoURI;
import com.mongodb.hadoop.input.MongoInputSplit;
import com.mongodb.hadoop.util.MongoSplitter;

public class InfiniteMongoSplitter
{
  /** Sentinel value for "max splits" used to detect standalone mode (chunk-based splitting is bypassed). */
  public static final int MAX_SPLITS = 10000000;
 
  // Class-wide logger (log4j, matching the rest of the data model package)
  private static Logger _logger = Logger.getLogger(InfiniteMongoSplitter.class);     
 
  /**
   * Top-level split calculation: decides which splitting strategy to use based on the
   * collection being queried, the query shape, and the cluster's sharding scheme.
   *
   * Checks if the new params MAX_SPLITS and MAX_DOCS_PER_SPLIT are set
   * in the config.  If they are it will use those to do splits via limit/skip
   * otherwise it will call the previous chunking splitter in MongoSplitter
   * (delegated via calculateSplits_phase2).
   *
   * @param conf the job configuration wrapper (query, input URI, split limits, debug limit)
   * @return the list of input splits for the map/reduce job (possibly empty, never null)
   */
 
  public static List<InputSplit> calculateSplits(InfiniteMongoConfig conf)
  {
    // First off: What is our sharding scheme?
   
    boolean shardingPolicyNew = false;
    try {
      // The "new" sharding policy is indicated by a compound shard key (size > 1) on the
      // doc_metadata.metadata collection, read from the cluster's config DB.
      BasicDBObject shardQuery = new BasicDBObject("_id", "doc_metadata.metadata");
      BasicDBObject shardInfo = (BasicDBObject) DbManager.getCollection("config", "collections").findOne(shardQuery);
      if (null != shardInfo) {
        BasicDBObject shardInfoKey = (BasicDBObject) shardInfo.get("key");
        if (null != shardInfoKey) {
          shardingPolicyNew = (shardInfoKey.size() > 1);
        }
      }
    }//TESTED (new and old)
    catch (Exception e) {} // stick with the old sharding, it's probably going to die soon after though, honestly
   
    // conf.getQuery returns a new copy of the query, so get once and use everywhere...
    BasicDBObject confQuery = (BasicDBObject) conf.getQuery();
   
    BasicDBObject srcTagsQuery = (BasicDBObject) conf.getSourceTags();
   
    String collection = conf.getInputURI().getCollection();
    if (!collection.equals(DbManager.getDocument().getContent().getName()) && !collection.equals(DbManager.getDocument().getMetadata().getName()))
    {
      // Case 1: feature table or custom table
      // Just run legacy code
      return calculateSplits_phase2(conf, confQuery, false, false, null);     
    }
    else { // complex cases...
      boolean simpleOtherIndex = false;
      // Check whether a simple query has been performed on a different indexed field     
      if (null == srcTagsQuery) { // (if srcTags specified, then going to want to use sourceKey as the index)
        for (String s: Arrays.asList(EntityPojo.docQuery_index_, DocumentPojo.url_)) {
          Object selector = confQuery.get(s);
          if (selector instanceof String) {
            simpleOtherIndex = true;
            break;
          }
          else if (selector instanceof DBObject) {
            DBObject selectorDbo = (DBObject)selector;
            if (selectorDbo.containsField(DbManager.in_)) {
              simpleOtherIndex = true;
              break;
            }
          }
        }//TESTED (both types, plus check complex indexes don't work)     
        // ALLOWED: {"entities.index": { "$in": [ "xxx", "yyy"] }, {"entities.index": "xxx" }, ditto for "url"
        // NOT ALLOWED: { "entities.index": { "$ne": "xxx" } }
      }
      //TESTED check ignored if eg entity_index specified
     
      if (simpleOtherIndex) {
        // Case 2: we have a simple query on an indexed field
        // Just run legacy code
       
        return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);         
      }//TESTED
      else if (conf.getLimit() > 0) { // debug
        //Case 3: Ensure we have small sets of sources to search over
        BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery, srcTagsQuery, conf.getMaxDocsPerSplit());
        final List<InputSplit> splits = new ArrayList<InputSplit>();
       
        boolean queryNonTrivial = isQueryNonTrivial(confQuery);
        if (!queryNonTrivial) {
          //Case 3a: query is trivial, so can just create splits directly from the split pre-calcs
          // (each pre-calc entry carries a per-source doc count, so no DB round trip is needed)
          int toProcess = conf.getLimit();
          Iterator<Object> itSplit = collectionOfSplits.iterator();
          while ((toProcess > 0) && (itSplit.hasNext())) {
            BasicDBObject split = (BasicDBObject) itSplit.next();

            int docCount = (int)split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
            int toGet = (docCount > toProcess) ? toProcess : docCount;
            // (modQuery is null if the source key doesn't intersect the original query)
            BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
            if (null != modQuery) {
              splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(), modQuery, conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
              toProcess -= docCount;
            }
          }//TESTED
        }
        else {
          // Case 3b: annoying, some extra query terms, gonna need to do it the hard way...
          // (can't trust the pre-calculated doc counts, so count matching docs per source via getCount)
          int toProcess = conf.getLimit();
          Iterator<Object> itSplit = collectionOfSplits.iterator();
          DBCollection coll = InfiniteMongoConfigUtil.getCollection(conf.getInputURI());
          while ((toProcess > 0) && (itSplit.hasNext())) {
            BasicDBObject split = (BasicDBObject) itSplit.next();
           
            BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
            if (null != modQuery) {
              int docsCounted = (int) coll.getCount(modQuery, null, toProcess, 0);
              int toGet = (docsCounted > toProcess) ? toProcess : docsCounted;
              if (docsCounted > 0) {
                splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(), modQuery, conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
                toProcess -= docsCounted;
              }
            }//TESTED
          }
        }//TESTED
       
        return splits;
      }
      else { // More complex cases:
       
        if (shardingPolicyNew) {
          // Case 4a: NEW SHARDING SCHEME
         
          // Always fetch the new sources, eg convert communityId to sourceKeys
          try {         
            splitPrecalculations_newShardScheme(confQuery, srcTagsQuery); // (modifies confQuery if returns true)       
            boolean queryNonTrivial = isQueryNonTrivial(confQuery);
           
            return calculateSplits_phase2(conf, confQuery, !queryNonTrivial, shardingPolicyNew, null);

              // (ie trivial query => always use chunks, bypass skip/limit test)
          }//TESTED (trivial + non-trivial)
          catch (Exception e) { // Didn't match any sources, no problem
            return new ArrayList<InputSplit>();
          }//TESTED
         
        }//TESTED
        else {

          BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery, srcTagsQuery, conf.getMaxDocsPerSplit());
         
          if (null == collectionOfSplits) {
            // Case 4b: OLD SHARDING SCHEME can't get a partition by source keys, just back off to old code
            return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);           
          }//TESTED (old code)
          else {
            conf.setMaxDocsPerSplit(2*conf.getMaxDocsPerSplit());
              // (because we stop creating splits when they exceed the size)
           
            // Case 4c: OLD SHARDING SCHEME, have a source key partition
            int nMaxCount = 1 + conf.getMaxDocsPerSplit()*conf.getMaxSplits();
            boolean queryNonTrivial = isQueryNonTrivial(confQuery);
            final List<InputSplit> splits = new ArrayList<InputSplit>();
           
            // (keep a reference to the unmodified query - confQuery itself is mutated below for the "big split" case)
            BasicDBObject savedQuery = confQuery;
           
            Iterator<Object> itSplit = collectionOfSplits.iterator();
            BasicDBList bigSplit = null;
            while (itSplit.hasNext()) {
              BasicDBObject split = (BasicDBObject) itSplit.next();
              int docCount = (int)split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
              if (docCount < nMaxCount) { // small split, will use skip/limit
                BasicDBObject modQuery = convertQuery(savedQuery, split.get(DocumentPojo.sourceKey_));
                if (null != modQuery) {

                  final int SPLIT_THRESHOLD = 3;
                  // A few cases:
                  if ((docCount < (SPLIT_THRESHOLD*conf.getMaxDocsPerSplit())) || !queryNonTrivial) {
                    splits.addAll(calculateSplits_phase2(conf, modQuery, false, shardingPolicyNew, (Integer)docCount));
                  }//TESTED (based on limit, based on query)
                  else {
                    // My guess at the point at which you might as well as do the full query in the hope you're going
                    // to save some (empty) splits
                    splits.addAll(calculateSplits_phase2(conf, modQuery, false, shardingPolicyNew, null));
                  }//TESTED
                }//TESTED
              }
              else { // large split, combine all these guys into an array of source keys
                if (null == bigSplit) {
                  bigSplit = new BasicDBList();
                }
                bigSplit.add(split.get(DocumentPojo.sourceKey_));
                  // (guaranteed to be a single element)
              }
            }//(end loop over collections)
           
            if (null != bigSplit) {
             
              // If we have a big left over community then create a set of splits for that - always chunks if query trivial
              if (1 == bigSplit.size()) {
                confQuery.put(DocumentPojo.sourceKey_, bigSplit.iterator().next());               
              }
              else {
                confQuery.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, bigSplit));
              }
              splits.addAll(calculateSplits_phase2(conf, confQuery, !queryNonTrivial, shardingPolicyNew, null));
            }//TESTED: singleton+trivial (sandy), array+trivial (sentiment/enron), array+non-trivial (sentiment/enron, docGeo), singleton+non-trivial (sandy, docGeo)

            return splits;
           
          }//TESTED: end if Cases 4a, 4b, 4c
         
        }//(end if old vs new sharding policy)
       
      }//(non-debug case)
    }//(content or metadata table are most complex)
  }

  @SuppressWarnings("unchecked")
  public static List<InputSplit> calculateSplits_phase2(InfiniteMongoConfig conf, BasicDBObject confQuery, boolean alwaysUseChunks, boolean newShardScheme, Integer splitDocCount)
  {
    alwaysUseChunks &= (conf.getMaxSplits() != MAX_SPLITS);
      // (in standalone mode, never use chunks)
   
    MongoURI uri = conf.getInputURI();
    DBCollection coll = InfiniteMongoConfigUtil.getCollection(uri);
    if (conf.getLimit() > 0) {
      return calculateManualSplits(conf, confQuery, 1, conf.getLimit(), coll);     
    }
    else
    {
      if (!alwaysUseChunks) {
        int nMaxCount = 1 + conf.getMaxDocsPerSplit()*conf.getMaxSplits();
        int count = 0;
        if (null == splitDocCount) {
          if (nMaxCount <= 1) {
            nMaxCount = 0;
          }
          else {
            //DEBUG
            //System.out.println(coll.find(confQuery).limit(1).explain());
           
            count = (int) coll.getCount(confQuery, null, nMaxCount, 0);
            if (0 == count) {
              return new ArrayList<InputSplit>();
            }
          }//TESTED
        }
        else {
          count = splitDocCount;
        }
       
        //if maxdocssplit and maxsplits is set and there are less documents than splits*docspersplit then use the new splitter
        //otherwise use the old splitter
        if ( conf.getMaxDocsPerSplit() > 0 && conf.getMaxSplits() > 0 && ( count < nMaxCount ) )
        {
          _logger.debug("Calculating splits manually");
          int splits_needed = (count/conf.getMaxDocsPerSplit()) + 1;
         
          return calculateManualSplits(conf, confQuery, splits_needed, conf.getMaxDocsPerSplit(), coll);
        }//TESTED
      }         
      if (newShardScheme && !confQuery.containsField(DocumentPojo.sourceKey_)) {
        // OK if we're going to do the sharded version then we will want to calculate
        splitPrecalculations_newShardScheme(confQuery, null); // (modifies confQuery if returns true)       
      }//TESTED: checked did nothing when had sourceKey, added sourceKey when necessary (eg entities.index case)
     
      if (!newShardScheme) { // unlike new sharding scheme, in this case the query is fixed, so overwrite now:
        conf.setQuery(confQuery);
      }
     
      List<InputSplit> splits = MongoSplitter.calculateSplits(conf);
        // (unless manually set, like above, runs with the _original_ query)
      int initialSplitSize  = splits.size();
     
      // We have the MongoDB-calculated splits, now calculate their intersection vs the query
      @SuppressWarnings("rawtypes")
      Map<String, TreeSet<Comparable>> orderedArraySet = new HashMap<String, TreeSet<Comparable>>();
      @SuppressWarnings("rawtypes")
      Map<String, NavigableSet<Comparable>> orderedArraySet_afterMin = new HashMap<String, NavigableSet<Comparable>>();
      BasicDBObject originalQuery = confQuery;
     
     
      ArrayList<InputSplit> newsplits = new ArrayList<InputSplit>(splits.size());
      Iterator<InputSplit> splitIt = splits.iterator();
      while (splitIt.hasNext()) {
        try {
          orderedArraySet_afterMin.clear();
         
          MongoInputSplit mongoSplit = (MongoInputSplit)splitIt.next();
          BasicDBObject min = (BasicDBObject) mongoSplit.getQuerySpec().get("$min");
          BasicDBObject max = (BasicDBObject) mongoSplit.getQuerySpec().get("$max");
         
          //DEBUG
          //_logger.info("+----------------- NEW SPLIT ----------------: " + min + " /" + max);
          //System.out.println("+----------------- NEW SPLIT ----------------: " + min + " /" + max);
         
          if (null != min) { // How does the min fit in with the general query
            try {
              if (compareFields(-1, originalQuery, min, max, orderedArraySet, orderedArraySet_afterMin) < 0) {
                splitIt.remove();
                continue;
              }
            }
            catch (Exception e) {} // do nothing probably just some comparable issue
          }//TESTED
         
          if (null != max) { // How does the min fit in with the general query
            try {
              if (compareFields(1, originalQuery, max, min, orderedArraySet, orderedArraySet_afterMin) > 0) {
                splitIt.remove();
                continue;
              }
            }
            catch (Exception e) {} // do nothing probably just some comparable issue
          }//TESTED
         
          //DEBUG
          //_logger.info("(retained split)");
          //System.out.println("(retained split)");
         
          // (don't worry about edge cases, won't happen very often and will just result in a spurious empty mapper)
         
          ////////////////////////////////
         
          // Now some infinit.e specific processing...
         
          if (newShardScheme) {
            @SuppressWarnings("rawtypes")
            TreeSet<Comparable> sourceKeyOrderedArray = orderedArraySet.get(DocumentPojo.sourceKey_);
            if ((null != sourceKeyOrderedArray) && !sourceKeyOrderedArray.isEmpty()) {
              @SuppressWarnings("rawtypes")
              Comparable minSourceKey = null;
              Object minSourceKeyObj = (null == min) ? null : min.get(DocumentPojo.sourceKey_);
              if (minSourceKeyObj instanceof String) {
                minSourceKey = (String)minSourceKeyObj;
              }
              if (null == minSourceKey) {
                minSourceKey = sourceKeyOrderedArray.first();
              }//TESTED
              @SuppressWarnings("rawtypes")
              Comparable maxSourceKey = null;
              Object maxSourceKeyObj = (null == max) ? null : max.get(DocumentPojo.sourceKey_);
              if (maxSourceKeyObj instanceof String) {
                maxSourceKey = (String)maxSourceKeyObj;
              }
              if (null == maxSourceKey) {
                maxSourceKey = sourceKeyOrderedArray.last();
              }//TESTED
             
              DBObject splitQuery = mongoSplit.getQuerySpec();
              BasicDBObject splitQueryQuery = new BasicDBObject((BasicBSONObject) splitQuery.get("$query"));             
              if (0 == minSourceKey.compareTo(maxSourceKey)) { // single matching sourceKEy
                splitQueryQuery.put(DocumentPojo.sourceKey_, maxSourceKey);
              }//TESTED (array of sources, only one matches)
              else { // multiple matching source keys
                splitQueryQuery.put(DocumentPojo.sourceKey_,
                    new BasicDBObject(DbManager.in_, sourceKeyOrderedArray.subSet(minSourceKey, true, maxSourceKey, true)));
              }//TESTED (array of sources, multiple match)         
              newsplits.add(new InfiniteMongoInputSplit(mongoSplit, splitQueryQuery, conf.isNoTimeout()));                             
            }
            else { // original query is of sufficient simplicity
              newsplits.add(new InfiniteMongoInputSplit(mongoSplit, originalQuery, conf.isNoTimeout()));             
            }//TESTED (no change to existing source)
           
          }//TESTED
          else { // old sharding scheme, remove min/max and replace with normal _id based query where possible
           
            DBObject splitQuery = mongoSplit.getQuerySpec();
            // Step 1: create a query range for _id:
            BasicDBObject idRange = null;
            Object idMin = (min == null) ? null : min.get(DocumentPojo._id_);
            Object idMax = (max == null) ? null : max.get(DocumentPojo._id_);
            if (!(idMin instanceof ObjectId))
              idMin = null;
            if (!(idMax instanceof ObjectId))
              idMax = null;
           
            if ((null != idMin) || (null != idMax)) {
              idRange = new BasicDBObject();
              if (null != idMin) {
                idRange.put(DbManager.gte_, idMin);
              }
              if (null != idMax) {
                idRange.put(DbManager.lt_, idMax);
              }
            }//TESTED           
           
            // Step 2: merge with whatever we have at the moment:
            if (null != idRange) {
              BasicDBObject splitQueryQuery = new BasicDBObject((BasicBSONObject) splitQuery.get("$query"))
              Object idQueryElement = splitQueryQuery.get(DocumentPojo._id_);
              boolean convertedAwayFromMinMax = false;
              if (null == idQueryElement) { // nice and easy, add _id range
                splitQueryQuery.put(DocumentPojo._id_, idRange);
                convertedAwayFromMinMax = true;
              }//TESTED
              else if (! splitQueryQuery.containsField(DbManager.and_)) { // OK we're going to just going to make life easy
                splitQueryQuery.remove(DocumentPojo._id_);
                splitQueryQuery.put(DbManager.and_, Arrays.asList(
                    new BasicDBObject(DocumentPojo._id_, idQueryElement),
                    new BasicDBObject(DocumentPojo._id_, idRange)));
                convertedAwayFromMinMax = true;             
              }//TESTED
              // (else stick with min/max)
             
              if (convertedAwayFromMinMax) { // can construct an _id query
                splitQuery.removeField("$min");
                splitQuery.removeField("$max");
              }//TESTED
              splitQuery.put("$query", splitQueryQuery);
            }
            newsplits.add(new InfiniteMongoInputSplit(mongoSplit, conf.isNoTimeout()));
          }//TESTED     
        }
        catch (Exception e) {
          //DEBUG
          //e.printStackTrace();
        } // do nothing must be some other type of input split
      }//TESTED
     
      //DEBUG
      //System.out.println("Calculating splits via mongo-hadoop: " + initialSplitSize + " reduced to " + splits.size());

      _logger.info("Calculating (converted) splits via mongo-hadoop: " + initialSplitSize + " reduced to " + newsplits.size());
      return newsplits;
    }
  }//TESTED
 
  /**
   * Creates numSplits amount of splits with limit items in each split
   * using limit and skip to determine the sets
   *
   * @param conf
   * @param numSplits
   * @param count
   * @param coll
   * @return
   */
  private static List<InputSplit> calculateManualSplits(InfiniteMongoConfig conf, BasicDBObject confQuery, int numSplits, int limit, DBCollection coll)
  {
    final List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
    _logger.debug("using a limit of " + limit + " for "+numSplits+" splits");
    for ( int i = 0; i < numSplits; i++ )
    {
      splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(), confQuery, conf.getFields(), conf.getSort(), limit, i*limit, conf.isNoTimeout()));
    }
    return splits;
  }   

  ///////////////////////////////////////////////////////////
 
  // UTILITY CODE
 
  // Comparison code to calculate if there is a non-zero intersection between the query and the chunk
  // Note that (eg) if you have [key:A, _id:B] as your min (/max)
  // then _id>B only applies if key==A ... if key>A then the entire _id space is allowed
   
  @SuppressWarnings({ "unchecked", "rawtypes" })
  private static int compareFields(int direction, BasicDBObject query, BasicDBObject minOrMax, BasicDBObject maxOrMin,
                    Map<String, TreeSet<Comparable>> orderedArraySet, Map<String, NavigableSet<Comparable>> orderedArraySet_afterMin)
  {
    for (String field: minOrMax.keySet()) {
      //DEBUG
      //System.out.println("1] Compare: " + field + ": " + direction);
     
      try {
        Object queryOfThisField = query.get(field);
        Object minField = minOrMax.get(field);
        if ((null != queryOfThisField) && (minField instanceof Comparable)){
          int result = 0;
          Comparable comparableMinOrMaxElement = (Comparable)minField;
          if (queryOfThisField instanceof BasicDBObject) {
            result = compareComplexObject(field, direction, (BasicDBObject) queryOfThisField, comparableMinOrMaxElement, orderedArraySet, orderedArraySet_afterMin);
          }//TESTED
          else { // -1 if comparableQueryElement < comparableMinOrMaxElement
            Comparable comparableQueryElement = (Comparable)queryOfThisField;
            result = comparableQueryElement.compareTo(comparableMinOrMaxElement);
            //DEBUG
            //System.out.println("3] Vals: " + comparableQueryElement + " vs " + comparableMinOrMaxElement + " = " + result);
          }//TESTED   
          if (result != 0) { // if we ever get a strict inequality then stop checking fields..
            if ((result == direction) || !minOrMax.equals(maxOrMin)) {
              // (fail)                 (pass but min/max keys different so not point checking any more)
              return result; 
            }//TESTED
          }
          // else equality, pass but keep checking fields
        }
      }
      catch (Exception e) {
        //DEBUG
        //e.printStackTrace();
      } // do nothing probably some odd comparable issue
    }
    return -direction; // (ie pass by default)
  }//TESTED
 
  /**
   * Compares one "complex" query element (an operator object such as {$in:[...]}, {$gt:x},
   * {$gte:x}, {$lt:x}, {$lte:x}) for a single field against one boundary of a chunk, to decide
   * whether the chunk can possibly contain matching documents.
   *
   * Return contract (per the inline comments on the return statements below):
   * returns {@code direction} to fail immediately (the chunk can never match),
   * {@code -direction} to pass without checking further fields, or 0 to pass but check further
   * fields. In practice it never returns 0, because it's not trivial to work out exact equality
   * with complex operators.
   * NOTE(review): the original header comment had pass/fail the other way round; the inline
   * comments match the actual code paths, so they are taken as authoritative here.
   *
   * @param parentField the document field this operator object applies to
   * @param direction -1 when minOrMaxElement is the chunk's min boundary, +1 when it is the max
   * @param complexQueryElement the operator object for parentField ($in/$gt/$gte/$lt/$lte handled)
   * @param minOrMaxElement the chunk boundary value being compared against
   * @param orderedArraySet per-field cache of $in arrays sorted into TreeSets (built lazily here)
   * @param orderedArraySet_afterMin per-field cache of the $in tail-set that survived the min
   *        check, so the subsequent max check (direction == 1) only scans surviving candidates
   */
  @SuppressWarnings({ "rawtypes", "unchecked" })
  private static int compareComplexObject(String parentField, int direction, BasicDBObject complexQueryElement, Comparable minOrMaxElement,
                        Map<String, TreeSet<Comparable>> orderedArraySet, Map<String, NavigableSet<Comparable>> orderedArraySet_afterMin)
  {
    for (String field: complexQueryElement.keySet()) {
      //DEBUG
      //System.out.println("2] Compare operator: " + field + ", vs " + minOrMaxElement);
     
      if (field.equals(MongoDbManager.in_)) {
       
        NavigableSet<Comparable> orderedArray = null;
        if (1 == direction) { // try orderedArraySet_afterMin first...
          // (the earlier min pass may already have pruned this field's $in to elements >= chunk min)
          orderedArray = orderedArraySet_afterMin.get(parentField);
          //DEBUG
          //System.out.println("2.0] Found orderered sub-array for: " + parentField + ", size= " + orderedArray.size());
        }//TESTED
        if (null == orderedArray) { // (min, or max but min didn't set a sub-array)
          orderedArray = orderedArraySet.get(parentField);
          if (null == orderedArray) {
            // First time for this field, order the $in for easy comparison
            orderedArray = new TreeSet<Comparable>();
            Collection queryList = (Collection)complexQueryElement.get(MongoDbManager.in_);
            for (Object o: queryList) {
              Comparable c = (Comparable)o;
              orderedArray.add(c);
            }
            //DEBUG
            //System.out.println("2.1] Created orderered array for: " + parentField + ", size= " + orderedArray.size());
           
            //DEBUG:
//            if (!orderedArray.isEmpty()) {
//              System.out.println("2.1.1] Head: " + orderedArray.iterator().next());         
//              System.out.println("2.1.2] Tail: " + orderedArray.descendingIterator().next());         
//            }
           
            orderedArraySet.put(parentField, (TreeSet<Comparable>)orderedArray);
              // (know this cast is valid by construction)
          }//TESTED
        }       
        if (-1 == direction) { // comparing vs min
          //DEBUG
          //System.out.println("2.2] tailSet: " + orderedArray.tailSet(minOrMaxElement, true).size());
          NavigableSet<Comparable> minElements = orderedArray.tailSet(minOrMaxElement, true);
          if (minElements.isEmpty()) { // (elements >= minElement)
            return direction; // will always fail
          }
          else {
            // Cache the surviving tail-set so the max pass only has to scan these candidates
            orderedArraySet_afterMin.put(parentField, minElements);
          }//TESTED
        }//TESTED
        else if (1 == direction) { // comparing vs max
          //DEBUG
          //System.out.println("2.2] headSet: " + orderedArray.headSet(minOrMaxElement, true).size());
         
          if (orderedArray.headSet(minOrMaxElement, true).isEmpty()) { // (elements <= maxElement)
            return direction; // will always fail
          }         
        }//TESTED
      }
      else if (field.equals(MongoDbManager.gt_) || field.equals(MongoDbManager.gte_)) { // (don't worry about the boundaries, just results in spurious empty chunks)
        if (1 == direction) { // can't do anything about $gt vs min
          Comparable comparableQueryElement = (Comparable)complexQueryElement.get(field);
          //DEBUG
          //System.out.println("2.3.1] GT Vals: " + comparableQueryElement + " vs " + minOrMaxElement + " = " + comparableQueryElement.compareTo(minOrMaxElement));
         
          if (comparableQueryElement.compareTo(minOrMaxElement) > 0) // ie query _lower_ limit > chunk max
            return direction; // ie fail
        }
      }//TESTED
      else if (field.equals(MongoDbManager.lt_) || field.equals(MongoDbManager.lte_)) { // (don't worry about the boundaries, just results in spurious empty chunks)
        if (-1 == direction) { // can't do anything about $lt vs max
          Comparable comparableQueryElement = (Comparable)complexQueryElement.get(field);
          //DEBUG
          //System.out.println("2.3.2] LT Vals: " + comparableQueryElement + " vs " + minOrMaxElement + " = " + comparableQueryElement.compareTo(minOrMaxElement));
         
          if (comparableQueryElement.compareTo(minOrMaxElement) < 0) // ie query upper limit < chunk min
            return direction; // ie fail
        }
      }//TESTED
    }
    return -direction; // (ie pass by default, don't check other fields unless they have the same min/max)
  }//TESTED (tested $in, $gte?, $lte?, $gte?/$lte? combinations)
 
  //TEST INFO:
  // shardKey = { sourceKey:1, _id: 1 }
  // FIRST TESTED  AGAINST $in 114 different keys starting with jdbc*
  // THEN query={"sourceKey": "jdbc.oracle.thin.@ec2-54-205-223-166.compute-1.amazonaws.com.152.1438"} ... left 226 chunks, hand checked
  // THEN query={"sourceKey": "jdbc.oracle.thin.@ec2-54-205-223-166.compute-1.amazonaws.com.152.1438"}, _id: { $oid: "52702a06e4b0b912ee0615f1" } ... left 1 chunk, hand checked
  // THEN query={"sourceKey": "jdbc.oracle.thin.@ec2-54-205-223-166.compute-1.amazonaws.com.152.1438"}, _id: {"$gte": {"$oid": "52702a06e4b0b912ee0615f0"}, "$lt": {"$oid":  "52753c1fe4b019e585827285"} } ...  left 3 chunks, hand checked
  // THEN query={_id: {"$gte": {"$oid": "52702a06e4b0b912ee0615f0"}, "$lt": {"$oid":  "52753c1fe4b019e585827285"} } ...  left 89 chunks, hand checked a few

  ////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////

  // Util - returns a list of shards  
 
  @SuppressWarnings("unchecked")
  public static boolean splitPrecalculations_newShardScheme(BasicDBObject query, BasicDBObject srcTagsQuery) {
    // Get the communityIds from the query
    Collection<ObjectId> communityIds = null;
    try {
      BasicDBObject communityIdsIn = (BasicDBObject)query.get(DocumentPojo.communityId_);
      communityIds = (Collection<ObjectId>) communityIdsIn.get(DbManager.in_);
      if (null == communityIds) {
        return false;
      }
    }
    catch (Exception e) {
      //DEBUG
      //e.printStackTrace();
     
      return false; // back out
    }
   
    BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_, new BasicDBObject(DbManager.in_, communityIds));
    BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1);
    keyFields.put(SourceHarvestStatusPojo.sourceQuery_doccount_, 1);

    // Get and remove the sourceKey information, incorporate into source query,
    // so it's nice and simple by the time it gets to the actual query
    Object sourceKeyQueryTerm = query.get(DocumentPojo.sourceKey_);
   
    if (null != srcTagsQuery) { // Simpler case: src tags specified, so going to get a list of all the sources regardless
      if (null != sourceKeyQueryTerm) {
        keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm);
      }
      keyQuery.put(SourcePojo.tags_, srcTagsQuery.get(SourcePojo.tags_));
    }//TESTED (including $all to test that "$srctags":{"$all": ["tagtest","db"]} matches on tags: ["tagtest","db", "tagtest2" ]
    else if (null != sourceKeyQueryTerm) {
      boolean sourceKeyQueryComplex = false;
     
      if (sourceKeyQueryTerm instanceof BasicDBObject) {
        BasicDBObject sourceKeyQueryTermDbo = (BasicDBObject) sourceKeyQueryTerm;
        if (sourceKeyQueryTermDbo.size() <= 2) { // every term must be lt/lte/gt/gte
          for (String sourceKeyQueryTermEl: sourceKeyQueryTermDbo.keySet()) {
            if (!sourceKeyQueryTermEl.equals(DbManager.in_) &&
                !sourceKeyQueryTermEl.equals(DbManager.lt_) && !sourceKeyQueryTermEl.equals(DbManager.lte_) &&
                !sourceKeyQueryTermEl.equals(DbManager.gt_) && !sourceKeyQueryTermEl.equals(DbManager.gte_))
            {
              sourceKeyQueryComplex = true;
              break;
            }//TESTED (eg ne)
            else if (sourceKeyQueryTermEl.equals(DbManager.in_) && (1 != sourceKeyQueryTermDbo.size())) {
              sourceKeyQueryComplex = true;
              break;             
            }//TESTED ((lt,in))
          }
        }//TESTED: (in, (gte,lt), ne)
        else {
          sourceKeyQueryComplex = true;         
        }//TESTED ({ "sourceKey": { "$in": ["test"], "$gt": "alex", "$lte":"test" } })
      }
      else if (sourceKeyQueryTerm instanceof java.util.regex.Pattern) { // probably a
        sourceKeyQueryComplex = true;         
      }
      //TESTED ($regex)
     
      if (sourceKeyQueryComplex) {
        keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm); // ie we'll simplify it below
      }
      else {
        return false; // already have a perfectly good source key specification
      }
    }//TESTED (See combinations above)
       
    DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields).sort(keyFields);
    int count = dbc.count();
   
    if (count > 5000) {
      // (too many source keys to process, just going to leave well alone... note will mean $srctags will fail open)
      return false;
    }
    else {
      ArrayList<String> sources = new ArrayList<String>(count);
      while (dbc.hasNext()) {
        BasicDBObject dbo = (BasicDBObject)dbc.next();
        String sourceKey = (String) dbo.get(SourcePojo.key_);
        sources.add(sourceKey);
      }
      if (sources.isEmpty()) {
        throw new RuntimeException(); // will just return no splits at all, no problem
      }//TESTED
      if (1 == sources.size()) {
        query.put(DocumentPojo.sourceKey_, sources.get(0));
      }//TESTED
      else {
        query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sources));
      }//TESTED
     
      return true;
    }   
  }//TESTED (See combinations above)
 
  // Util for creating a useful object containing source info (old sharding, _id - or new sharding but debug mode)
 
  @SuppressWarnings("unchecked")
  public static BasicDBList splitPrecalculations_oldShardSchemeOrDebug(BasicDBObject query, BasicDBObject srcTagsQuery, int maxCountPerTask) {
    // Get the communityIds from the query
    Collection<ObjectId> communityIds = null;
    try {
      BasicDBObject communityIdsIn = (BasicDBObject)query.get(DocumentPojo.communityId_);
      communityIds = (Collection<ObjectId>) communityIdsIn.get(DbManager.in_);
      if (null == communityIds) {
        return null;
      }
    }
    catch (Exception e) {
      return null; // back out
    }
   
    BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_, new BasicDBObject(DbManager.in_, communityIds));
    BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1);
    keyFields.put(SourceHarvestStatusPojo.sourceQuery_doccount_, 1);

    // Get and remove the sourceKey information, incorporate into source query:
    Object sourceKeyQueryTerm = query.get(DocumentPojo.sourceKey_);
    if (null != sourceKeyQueryTerm) {
      keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm);
    }//TESTED
    if (null != srcTagsQuery) {
      keyQuery.put(SourcePojo.tags_, srcTagsQuery.get(SourcePojo.tags_));
    }//TESTED
   
    DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields).sort(keyFields);
    if (dbc.count() > 5000) {
      // (too many source keys to process, just going to leave well alone... note this means $srctags will fail open)
      return null;
    }
    else {
      //TreeMap<String, Long> sourceKeys = new TreeMap<String, Long>();
      // Build collections of objects of format { sourceKey: string or [], totalDocs }
      BasicDBList sourceKeyListCollection = new BasicDBList();
      BasicDBList sourceKeyList = null;
      int runningDocs = 0;
      int runningSources = 0;
      while (dbc.hasNext()) {
        BasicDBObject dbo = (BasicDBObject)dbc.next();
        String sourceKey = (String) dbo.get(SourcePojo.key_);
        if (null != sourceKey) {
          long docCount = 0L;
          try {
            BasicDBObject harvestStatus = (BasicDBObject) dbo.get(SourcePojo.harvest_);
            if (null != harvestStatus) {
              docCount = harvestStatus.getLong(SourceHarvestStatusPojo.doccount_, 0L);
            }
          }
          catch (Exception e) {}
         
          //DEBUG
          //System.out.println("SOURCE=" + sourceKey + " DOC_COUNT=" + docCount + " RUNNING=" + runningDocs +"," + runningSources + ": " + sourceKeyList);
         
          if (docCount > maxCountPerTask) { // source is large enough by itself
            // Create collection
            BasicDBObject collection = new BasicDBObject();
            collection.put(DocumentPojo.sourceKey_, sourceKey);
            collection.put(SourceHarvestStatusPojo.doccount_, docCount);
            sourceKeyListCollection.add(collection);
            // (leaving running* alone, can keep building that)
          }//TESTED (by eye, system community of demo cluster)
          else if ((runningDocs + docCount) > maxCountPerTask) { // have now got a large enough collection of sources
            if (null == sourceKeyList) {
              sourceKeyList = new BasicDBList();
            }
            sourceKeyList.add(sourceKey);
            // Create collection
            BasicDBObject collection = new BasicDBObject();
            collection.put(DocumentPojo.sourceKey_, sourceKeyList);
            collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount);
            sourceKeyListCollection.add(collection);     
            sourceKeyList = null;
            runningDocs = 0;
            runningSources = 0;
          }//TESTED (by eye, system community of demo cluster)
          else if (runningSources >= 15) { // have a limit on the number of sources per query, to keep the queries manageable
            sourceKeyList.add(sourceKey);
            // Create collection
            BasicDBObject collection = new BasicDBObject();
            collection.put(DocumentPojo.sourceKey_, sourceKeyList);
            collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount);
            sourceKeyListCollection.add(collection);     
            sourceKeyList = null;
            runningDocs = 0;
            runningSources = 0;           
          }//TESTED (by eye, system community of demo cluster)
          else { // (keep) build(ing) list
            if (null == sourceKeyList) {
              sourceKeyList = new BasicDBList();
            }
            sourceKeyList.add(sourceKey);
            runningDocs += docCount;
            runningSources++;           
          }//TESTED (by eye, system community of demo cluster)
        } //(end if has source key)
      }//(end loop over cursor)

      // Finish off:
      if (null != sourceKeyList) {       
        // Create collection
        BasicDBObject collection = new BasicDBObject();
        collection.put(DocumentPojo.sourceKey_, sourceKeyList);
        collection.put(SourceHarvestStatusPojo.doccount_, runningDocs);
        sourceKeyListCollection.add(collection);     
      }//TESTED (by eye, system community of demo cluster)
     
      if (sourceKeyListCollection.isEmpty()) { // query returns empty
        throw new RuntimeException("Communities contain no sources");
      }
      return sourceKeyListCollection;

    } // (end if too many source keys across the communities)
  }//TESTED

  // Utility - has user specified fields other than community Id, index, or sourceKey
 
  private static boolean isQueryNonTrivial(BasicDBObject query) {
    if ((query.size() > 3) || ((query.size() > 2) && !query.containsField(DocumentPojo.sourceKey_))) {
      return true;
    }
    return false;
  }//TESTED
 
  // Utility - create new query with overwritten sourceKey
 
  private static BasicDBObject convertQuery(BasicDBObject originalQuery, Object sourceObj) {
    BasicDBObject modQuery = null;
    if (null != sourceObj) {
      if (sourceObj instanceof Collection) {
        modQuery = new BasicDBObject(originalQuery.toMap());
        @SuppressWarnings("rawtypes")
        Collection sources = (Collection)sourceObj;
        modQuery.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sources));
      }//TESTED
      else if (sourceObj instanceof String) {
        modQuery = new BasicDBObject(originalQuery.toMap());
        String source = (String)sourceObj;
        modQuery.put(DocumentPojo.sourceKey_, source);             
      }//TESTED
    }
    return modQuery;
  }//TESTED 
}