// Package com.ikanow.infinit.e.api.knowledge.processing
//
// Source Code of com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils_MultiCommunity$Community_LocalInfo

/*******************************************************************************
* Copyright 2012, The Infinit.e Open Source Project.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package com.ikanow.infinit.e.api.knowledge.processing;

import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;

import org.bson.types.ObjectId;

import com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils.EntSigHolder;
import com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils.TempDocBucket;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;

class ScoringUtils_MultiCommunity {

  ///////////////////////////////////////////////////////////////////////////////////////////////////////////// 
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////// 

  //APPROXIMATE PAN-COMMUNITY SIGNIFICANCE CALCULATIONS

  // Classes:

  // Holds the list of all possible communities that could be returned (and whether they've been seen before)
  static class Community_GlobalInfo {
    Community_GlobalInfo(int nIndex_) { nIndex = nIndex_; bInDataset = false; nEntityCount = 0; }
    public int nIndex;
    public boolean bInDataset;   
    public long nEntityCount;
  }
  // Holds the relative doc counts between community pairs, used to estimate doc counts from unobserved communities
  // (for a specific entity, although obviously these stats are across all entities)
  static class Community_GlobalCorrelation {
    Community_GlobalCorrelation(int nId1, int nId2) {
      community1_id = nId1; community2_id = nId2;
    }
    private int community1_id;
    private int community2_id;
    private long community1_docCount = 0;
    private long community2_docCount = 0;
    private long overlapping_entities = 0;
    public int getCommunity1_id() { return community1_id; }
    public int getCommunity2_id() { return community2_id; }
    public long getCommunity1_docCount() { return community1_docCount; }
    public long getCommunity2_docCount() { return community2_docCount; }
    public long getOverlapping_entities() { return overlapping_entities; }
    public void updateCorrelation(int nCommIdA, int nCommIdB, long nDocCountA, long nDocCountB) {
      if ((0 != nDocCountA) && (0 != nDocCountB)) { // (new entity)
        overlapping_entities++;
      }
      if (community1_id == nCommIdA) {
        community1_docCount += nDocCountA;
        community2_docCount += nDocCountB;
      }
      else {
        community1_docCount += nDocCountB;
        community2_docCount +=nDocCountA;       
      }     
    }
  }
  // Just holds the matching vs total counts for each community per entity
  static class Community_LocalInfo {
    public long datasetCount;   // docs containing this entity within the query result subset, for one community
    public long communityCount; // total docs containing this entity across the entire community
  }
  // Holds per entity extensions for multi community scoring
  static class Community_EntityExtensions {
    public int singleId; // (only kick complex code in if needed - initialized when an instance of this class is created)
    public Community_LocalInfo multiInfo[] = null; // array of per-community counts, allocated when few communities (only used if needed)
    public Map<Integer, Community_LocalInfo> multiInfoMap = null; // map of per-community counts, allocated when many communities (only used if *really* needed)
    public int observedCount = 1; // number of communities this entity has been observed in (ie must be 1 if this even exists)   
  }

  ///////////////////////////////////////////////

  // State:

  private ObjectId[] _community_ids = null; // (taken from calling function)
  private HashSet<ObjectId> _community_ids_lookupSet = null; // (annoyingly required to handle multi-community (query) "queues" and "buckets" (user queues))
 
  private Map<ObjectId, Community_GlobalInfo> _community_globalInfo = null; // community id -> global info (created on demand in community_setupMetadata())
  private Map<Integer, Community_GlobalInfo> _community_globalInfo_observed = null; // subset of the above actually seen in the dataset, keyed/ordered by index (created on the fly)
  private Community_GlobalInfo[] _community_globalInfoTable = null; // (same as above, but with faster lookup given id)
  private int _community_nObserved = 0; // number of distinct communities seen so far (0 == nothing processed yet)
  private Community_GlobalCorrelation _community_correlationTable[] = null; // packed upper-triangular pairwise table //= new GlobalCommunityCorrelation[(n*(n-1))/2];
  private ObjectId _community_cachedValue = null; // (last community observed from document)
  private int _community_nCachedId = -1; // index of the last community observed, or -1 while in deferred "single community" mode
  private int _community_nPreMultiId = -1; // index of the first community seen before multi-mode kicked in (_community_nCachedId is -1 until we setup the metadata...
    // ...which we defer as long as possible in order to not to do it if we don't have to,
    // as a result, a bunch of EntSigHolder objects will have an incorrect community_singleId==-1)

  // Optimization thresholds: <=10 communities use the array only, 10<n<=25 use both array and map, >25 use the map only
  private static final int _community_nUpperLimitForArrays = 25;
  private static final int _community_nLowerLimitForMaps = 10;
 
  ///////////////////////////////////////////////
 
  // External Utility
 
  ScoringUtils_MultiCommunity(String[] communityIds) {
    _community_ids_lookupSet = new HashSet<ObjectId>();
    _community_ids = new ObjectId[communityIds.length];
    for (int i = 0; i < communityIds.length; ++i) {
      _community_ids[i] = new ObjectId(communityIds[i]);
      _community_ids_lookupSet.add(_community_ids[i]);
    }
  }   

  ///////////////////////////////////////////////
 
  // Get converted community Ids
 
  ObjectId[] getCommunityIds() {
    // Returns the communities supplied to the constructor, converted to ObjectIds (shared, not copied)
    return _community_ids;
  }
  ///////////////////////////////////////////////

  // External Utility
 
  // Tells the delegating class whether the multi community code has been activated yet
 
  boolean isActive() {
    return (-1 != _community_nCachedId);
  }
 
  ///////////////////////////////////////////////
 
  // External Utility
 
  // Initalize a new entity
 
  void initializeEntity(EntSigHolder ent) {
    ent.community = new Community_EntityExtensions();
    ent.community.singleId = _community_nCachedId;
    if (-1 != _community_nCachedId) {
      _community_globalInfoTable[ent.community.singleId].nEntityCount++;
    }
  }
 
  ///////////////////////////////////////////////

  // Internal Utility

  final private int community_getCorrelationIndex(int nId1, int nId2) {
    // each value of nId1 "uses up" (N-nId1) elements
    // so the start of the nId2s for a given nId1 (nId2>nId1) is:
    // sigma(s=1,s<=nId1)[N - s] = N*nId1 - (nId1*(nId1+1))/2
    // and then to get the (nId1,nId2) position, you add (nId2-nId1-1)
    // and the equation below drops out:   
    if (nId1 < nId2) {
      return (nId1*(2*_community_ids.length - nId1 - 3))/2 + (nId2 - 1);
    }
    else {
      return (nId2*(2*_community_ids.length - nId2 - 3))/2 + (nId1 - 1);
    }
  }//TESTED

  ///////////////////////////////////////////////

  // Top level check to determine the community Id and do some initialization where needed

  // Returns the table index for the document's community, Integer.MIN_VALUE if the
  // community is not accessible in this query context, or -1 while still in deferred
  // "single community" mode (ie before any second community has been observed).
  // Lazily builds the global metadata the first time a second community appears.
  int community_getIdAndInitialize(ObjectId communityId, Map<String, EntSigHolder> entitiesInDataset) {
    if (0 == _community_nObserved) { // First time through, still don't know if we need any of this horrible logic
      if (!_community_ids_lookupSet.contains(communityId)) { // (first doc seen is not in an accessible community)
        return Integer.MIN_VALUE; // (this doc cannot be viewed with this community context)
      }//TESTED
     
      _community_nObserved = 1;
      _community_cachedValue = communityId;
        // (leave _community_nCachedId as -1 until we know we're going to need it... then we'll use
        //  _community_nCachedIdMap to update all the old local values as needed)
    }//TESTED
    else if (!_community_cachedValue.equals(communityId)) {
      // Not the same community as last time, is it a new community?

      Community_GlobalInfo global = null;
      if (null == this._community_globalInfo) { // Setup metadata if not already done so
        community_setupMetadata();
        global = _community_globalInfo.get(_community_cachedValue);
       
        if (null == global) { // (this should not be possible)
          throw new RuntimeException("Internal Logic Error: first document seen was in inaccessible community");
        }//TESTED       
       
        global.bInDataset = true;
        global.nEntityCount = entitiesInDataset.size();
          // (Update observed entities for original community)
       
        // Remember the pre-multi-mode community so deferred singleId==-1 entities can be resolved later
        _community_nPreMultiId = global.nIndex;
        _community_globalInfo_observed.put(global.nIndex, global);
      }//TESTED
      global = _community_globalInfo.get(communityId);
        // this should always return an object, see setupCommunityMetadata()
      if (null == global) {
        return Integer.MIN_VALUE; // (this doc cannot be viewed with this community context)
      }       

      if (!global.bInDataset) {
        // First document from this community: mark it observed
        global.bInDataset = true;
        _community_globalInfo_observed.put(global.nIndex, global); // (preserves order)
        _community_nObserved++;
      }//TESTED
      // Cache for the (common) case where consecutive docs share a community
      _community_cachedValue = communityId;
      _community_nCachedId = global.nIndex;

    }//TESTED (both above clauses seen in true and false states)

    // (else nothing to do - same community as the previous document)

    return _community_nCachedId;
  }//TESTED

  ///////////////////////////////////////////////

  // Initialization function - only need be called if >1 community observed

  // Builds the per-community lookup structures and the packed pairwise correlation
  // table. Only called once, lazily, from community_getIdAndInitialize() when a
  // second community is first observed.
  void community_setupMetadata() {
    _community_globalInfo = new TreeMap<ObjectId, Community_GlobalInfo>();
    _community_globalInfo_observed = new TreeMap<Integer, Community_GlobalInfo>();
    int n = _community_ids.length;
    // Table size == index of the last pair (n-2, n-1) plus one, ie n*(n-1)/2 entries
    _community_correlationTable = new Community_GlobalCorrelation[1 + community_getCorrelationIndex(n - 2, n - 1)];
    _community_globalInfoTable = new Community_GlobalInfo[n];

    int nPosInTable = 0;
    for (int i = 0; i < _community_ids.length; ++i) {
      Community_GlobalInfo commInfo = new Community_GlobalInfo(i);
      _community_globalInfo.put(_community_ids[i], commInfo);
      _community_globalInfoTable[i] = commInfo;
      for (int j = i + 1; j < _community_ids.length; ++j) {
        if (0 == i) { // First time through, also setting up communities
          // (j runs 1..n-1 when i==0, so together with the outer put every community gets an entry)
          Community_GlobalInfo subCommInfo = new Community_GlobalInfo(j);         
          _community_globalInfo.put(_community_ids[j], subCommInfo);
        }
        // Pairs are created in the same (i,j) order that community_getCorrelationIndex() assumes
        Community_GlobalCorrelation correlation = new Community_GlobalCorrelation(i, j);
        _community_correlationTable[nPosInTable] = correlation;
        nPosInTable++;
      }
    }
  }//TESTED

  /////////////////////////////////////////////////////////////////////////////////////////////////////

  // Per entity function

  // Called once per (entity, document) occurrence after multi-mode is active.
  // Maintains the entity's per-community counts (multiInfo/multiInfoMap) and the
  // global pairwise correlation table. Four cases, in order:
  //  1. deferred entity, same community as pre-multi-mode: just resolve singleId
  //  2. entity still only seen in one community: nothing to do here
  //  3. entity crosses into a second community for the first time: allocate the
  //     multi-community structures and seed both communities' counts
  //  4. entity already multi-community: update counts/correlations incrementally
  // (sIndex is only used by the commented-out debug prints.)
  void community_updateCorrelations(EntSigHolder ent, long nTotalDocCount, String sIndex) {   
    int newCommunityId = _community_nCachedId;
   
    boolean bMultiCommunityNotInitialized = (null == ent.community.multiInfo) && (null == ent.community.multiInfoMap);

    if ((-1 == ent.community.singleId) && (newCommunityId == _community_nPreMultiId)) {
      //CASE 1: First time through in "multi mode", not a new community though

      ent.community.singleId = newCommunityId; // (can't be -1 because if we're here it's because we're in "multi mode")
     
      //DEBUG PRINT:
      //System.out.println("ScoringUtils.community_updateCorrelations#1: " + sIndex
      //    + " / updated deferred single community id" + newCommunityId);     
      return;
    }//TESTED (case1)
    else if (bMultiCommunityNotInitialized && (newCommunityId == ent.community.singleId)) {
      // CASE 2: STILL ONLY SEEN ONE COMMUNITY

      // (nothing to do, handled by existing "single community" code)
      return;
    }//TESTED (case2)
    else if (bMultiCommunityNotInitialized) {
      // CASE 3: MULTIPLE COMMUNITIES FOR THE FIRST TIME

      if (-1 == ent.community.singleId) {
        // Entity was created in deferred mode; its first community was the pre-multi one
        ent.community.singleId = _community_nPreMultiId;

        //DEBUG PRINT:
        //System.out.println("ScoringUtils.community_updateCorrelations#3a: " + sIndex
        //    + " / updating deferred cache " + _community_nCachedIdMap);

      }//TESTED

      // Some ugly optimization logic (also elsewhere in this function):
      // - <=10 communities, just use "ent.multipleCommunityInfo"
      // - 10<.<=25 communities, use both array and map
      // - >25 communities, just use map
      if (_community_ids.length <= _community_nUpperLimitForArrays) {
        ent.community.multiInfo = new Community_LocalInfo[_community_ids.length];
      }
      if (_community_ids.length > _community_nLowerLimitForMaps) {
        ent.community.multiInfoMap = new TreeMap<Integer,Community_LocalInfo>();
      }
      // Information from previously-sole community (counts accumulated by the single-community code):
      Community_LocalInfo infoForCommunity = new Community_LocalInfo();
      infoForCommunity.communityCount = ent.nTotalDocCount;
      infoForCommunity.datasetCount = ent.nDocCountInQuerySubset;
      if (null != ent.community.multiInfo) {
        ent.community.multiInfo[ent.community.singleId] = infoForCommunity;
      }
      if (null != ent.community.multiInfoMap) {
        ent.community.multiInfoMap.put(ent.community.singleId, infoForCommunity);
      }

      // Information from new community (this document is its first sighting):
      Community_LocalInfo infoForCommunity2 = new Community_LocalInfo();
      infoForCommunity2.communityCount = nTotalDocCount;
      infoForCommunity2.datasetCount = 1;
      if (null != ent.community.multiInfo) {
        ent.community.multiInfo[newCommunityId] = infoForCommunity2;
      }
      if (null != ent.community.multiInfoMap) {
        ent.community.multiInfoMap.put(newCommunityId, infoForCommunity2);
      }
      //(update global and local entity and community counts)
      _community_globalInfoTable[newCommunityId].nEntityCount++;
      ent.community.observedCount++;

      // Update global correlation (update both sides, unusually):

      Community_GlobalCorrelation correlation =
        _community_correlationTable[community_getCorrelationIndex(newCommunityId, ent.community.singleId)];
      correlation.updateCorrelation(newCommunityId,  ent.community.singleId, nTotalDocCount, ent.nTotalDocCount);

      //DEBUG PRINT:
      //System.out.println("ScoringUtils.community_updateCorrelations#3c: "  + sIndex
      //    + " / " + newCommunityId + " prev " + ent.community.singleId);
      //System.out.println("ScoringUtils.community_updateCorrelations#3c, correlation: " + new com.google.gson.Gson().toJson(correlation));
      //System.out.println("ScoringUtils.community_updateCorrelations#3c, local old: " + new com.google.gson.Gson().toJson(infoForCommunity));
      //System.out.println("ScoringUtils.community_updateCorrelations#3c, local new: " + new com.google.gson.Gson().toJson(infoForCommunity2));

    }//TESTED(case3)
    else {
      // CASE 4: WORKING WITH MULTIPLE COMMUNITIES

      Community_LocalInfo infoForCommunity;
      if (null != ent.community.multiInfo) { // (obviously try array lookup first, much faster)
        infoForCommunity = ent.community.multiInfo[newCommunityId];
      }
      else { // then the info map must be non-null
        infoForCommunity = ent.community.multiInfoMap.get(newCommunityId);
      }
      if (null == infoForCommunity) { // New community, add to local list and update global communities
        infoForCommunity = new Community_LocalInfo();
        infoForCommunity.communityCount = nTotalDocCount;

        //DEBUG PRINT:
        //System.out.println("ScoringUtils.community_updateCorrelations#4a: " + sIndex
        //    + " / " + newCommunityId);

        // Correlate the new community against every community this entity has already been seen in
        community_updateCorrelations_innerLoop(ent, nTotalDocCount, newCommunityId, false);

        // Update local info:

        if (null != ent.community.multiInfo) {
          ent.community.multiInfo[newCommunityId] = infoForCommunity;
        }
        if (null != ent.community.multiInfoMap) {
          ent.community.multiInfoMap.put(newCommunityId, infoForCommunity);          
        }
        //(update global and local entity and community counts)
        _community_globalInfoTable[newCommunityId].nEntityCount++;
        ent.community.observedCount++;

        //DEBUG PRINT:
        //System.out.println("ScoringUtils.community_updateCorrelations#4a, local new: " + newCommunityId +": "
        //    + new com.google.gson.Gson().toJson(infoForCommunity)
        //    + " / global=" + _community_globalInfoTable[newCommunityId] .nEntityCount + ", local_comms=" + ent.community.observedCount
        //    + " (" + ent.community.multiInfo  + "/" + ent.community.multiInfoMap + ")");

      }//(end if new community for this entity)
      else {       
        // Just check if the DC is up-to-date:
        if (nTotalDocCount > infoForCommunity.communityCount) {
          //DEBUG PRINT:
          //System.out.println("ScoringUtils.community_updateCorrelations#4b, update local old: " + sIndex + " , " + new com.google.gson.Gson().toJson(infoForCommunity)
          //    + " (" + ent.community.multiInfo  + " to " + nTotalDocCount + ")");

          // (Update all the correlation tables if the difference is significant...)
          long nDiff = nTotalDocCount - infoForCommunity.communityCount; 
          if (((nDiff*100)/nTotalDocCount) > 15) { // say >15%
            // bUpdateNewOnly==true: only add the delta for the already-seen community
            community_updateCorrelations_innerLoop(ent, nDiff, newCommunityId, true);
          }
        }
      }//TESTED, including >15% case (modified by hand)

      // In either case, update the dataset count
      infoForCommunity.datasetCount++;

    }//TESTED (case4)

  }//TESTED

  ///////////////////////////////////////////////

  // Utility function for community_updateCorrelations()

  // Folds nTotalDocCount for newCommunityId into the global correlation table
  // against every community this entity has already been observed in.
  // When bUpdateNewOnly is true, only the new community's side of each pair is
  // incremented (the existing side contributes 0, and the pair (new,new) is skipped).
  private void community_updateCorrelations_innerLoop(EntSigHolder ent, long nTotalDocCount, int newCommunityId, boolean bUpdateNewOnly)
  {
    // Update global correlations:
    // (Note slightly unpleasant due to performance-related use of both array and map)
    // (Also note in !bUpdateNewOnly haven't inserted new object yet so can happily loop over collection without worrying about that)
    if (null != ent.community.multiInfoMap) { // Loop over the map, faster than the array
      for (Map.Entry<Integer,Community_LocalInfo> existingCommunityEntry: ent.community.multiInfoMap.entrySet()) {
        int nExistingCommunityId = existingCommunityEntry.getKey();
        if (bUpdateNewOnly && (newCommunityId == nExistingCommunityId)) {
          // (self-pair has no correlation table entry)
          continue;
        }
        long nExistingTotalDocCount = existingCommunityEntry.getValue().communityCount;
        Community_GlobalCorrelation correlation =
          _community_correlationTable[community_getCorrelationIndex(newCommunityId, nExistingCommunityId)];

        if (bUpdateNewOnly) {
          // Delta update: the existing community's total was already counted previously
          nExistingTotalDocCount = 0;
        }
        correlation.updateCorrelation(newCommunityId,  nExistingCommunityId, nTotalDocCount, nExistingTotalDocCount);
        //DEBUG PRINT:
        //System.out.println("ScoringUtils.community_updateCorrelations#4xa, correlation: " + new com.google.gson.Gson().toJson(correlation));
      }//TESTED

      //(~ cut and pasted below)     
    }
    else { // Loop over the array, faster than the map

      //(~ cut and paste from above)
      for (int i = 0; i < ent.community.multiInfo.length; ++i) {
        if (bUpdateNewOnly && (newCommunityId == i)) {
          continue;
        }
        Community_LocalInfo existingCommunity = ent.community.multiInfo[i];
        if (null != existingCommunity) { // (can be null, means we haven't seen it yet)             
          long nExistingTotalDocCount = existingCommunity.communityCount;

          Community_GlobalCorrelation correlation =
            _community_correlationTable[community_getCorrelationIndex(newCommunityId, i)];

          if (bUpdateNewOnly) {
            nExistingTotalDocCount = 0;
          }
          correlation.updateCorrelation(newCommunityId,  i, nTotalDocCount, nExistingTotalDocCount);
          //DEBUG PRINT:
          //System.out.println("ScoringUtils.community_updateCorrelations#4xb, correlation: " + new com.google.gson.Gson().toJson(correlation));
        }           
      }//TESTED
    }       
  }//TESTED

  //////////////////////////////////////////////////////////////////////////////////////////////////////////////
  //////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
  // Use the correlation coefficients calculated during stage1_/community_ to estimate any missing doc counts - top level logic

  // For every observed community in which this entity was NOT seen, estimates the
  // doc count it "would" have had (via the pairwise correlations) and returns the
  // sum of those estimates. Returns 0 when the entity was seen in every observed
  // community (or multi-mode never activated).
  double community_estimateAnyMissingDocCounts(EntSigHolder ent) {
   
    double estimatedDocCount = 0;

    // Check first if we can immediately return:
    if ((null == _community_globalInfo_observed) || (ent.community.observedCount == _community_globalInfo_observed.size())) {
     
      //DEBUG PRINT:
      //System.out.println("community_estimateAnyMissingDocCounts: Escaping unneeded calcs: " + ent.entityInstances.get(0).dbo.getString("index") +
      //    " / " + ent.community.observedCount + " , " + new com.google.gson.Gson().toJson(_community_globalInfo_observed));
     
      return 0;
    }//TESTED
   
    // If not, need to find the missing communities and estimate their total doc counts:
   
    if (null == ent.community.multiInfoMap) { // This is just a really easy case - small number of communities - so let's knock it off first
     
      // Scan every observed community and test directly whether the entity was seen there
      Iterator<Map.Entry<Integer, Community_GlobalInfo>> globalItQuick = _community_globalInfo_observed.entrySet().iterator();
      while (globalItQuick.hasNext()) {
        Map.Entry<Integer, Community_GlobalInfo> globalMeta = globalItQuick.next();
        int nEntityIt = globalMeta.getKey();
       
        boolean bMissingEntity = false;
        if (null != ent.community.multiInfo) {
          Community_LocalInfo local =  ent.community.multiInfo[nEntityIt];
          bMissingEntity = (null == local) || (0 == local.datasetCount);
        }
        else { // Special case, entity only ever saw 1 community
          // (missing unless this observed community is the entity's sole community,
          //  falling back to the pre-multi-mode id if singleId was never resolved)
          bMissingEntity = !((-1 == ent.community.singleId) ? (nEntityIt == _community_nPreMultiId) : (nEntityIt == ent.community.singleId));
        }
        if (bMissingEntity) { // Missing
          estimatedDocCount += community_estimateMissingDocCount(ent, nEntityIt);

          //DEBUG PRINT:
          //System.out.println("community_estimateAnyMissingDocCounts: Update1a: " + ent.entityInstances.get(0).dbo.getString("index") + ": "
          //    + nEntityIt + " , " + ent.community.observedCount + " = " + estimatedDocCount);
        }
      }
      return estimatedDocCount;
    }//TESTED (both singleId case and (null != ent.community.multiInfo) case)
   
    // If we've got here then there are enough communities that we need to loop over only observed ones
    // Both maps are TreeMaps keyed by community index, so the two iterators can advance in "lock step":
    // any observed community skipped over while catching up to the entity's next community is missing.
   
    Iterator<Map.Entry<Integer, Community_GlobalInfo>> globalIt = _community_globalInfo_observed.entrySet().iterator();
    Iterator<Map.Entry<Integer, Community_LocalInfo>> entityIt = ent.community.multiInfoMap.entrySet().iterator();
   
    while (entityIt.hasNext()) {
      Map.Entry<Integer, Community_LocalInfo> entityInfo = entityIt.next();
     
      ObjectId entityCommunityId = _community_ids[entityInfo.getKey()];
      Map.Entry<Integer, Community_GlobalInfo> globalInfo = globalIt.next(); // (if entityIt hasNext then so must globalIt)
     
      while (true) {
        ObjectId globalCommunityId = _community_ids[globalInfo.getKey()];
       
        if (globalCommunityId.equals(entityCommunityId)) { // Lock step
          break;
        }
        else {
          // Observed community with no entry for this entity => estimate it
          estimatedDocCount += community_estimateMissingDocCount(ent, globalInfo.getKey());
         
          //DEBUG PRINT:
          //System.out.println("community_estimateAnyMissingDocCounts: Update2: " + ent.entityInstances.get(0).dbo.getString("index") + ": "
          //    + globalInfo.getKey() + " , " + ent.community.observedCount + " = " + estimatedDocCount);
         
          if (globalIt.hasNext()) {
            globalInfo = globalIt.next();
          }
          else {
            break;
          }
        }
      }// (end "lockstep" loop over all observed global communities)
     
    }//(end "lockstep" loop over communities observed in entity)
    //TESTED
   
    // Any observed communities left over after the entity's list is exhausted are also missing
    while (globalIt.hasNext()) {
      Map.Entry<Integer, Community_GlobalInfo> globalInfo = globalIt.next();
      estimatedDocCount += community_estimateMissingDocCount(ent, globalInfo.getKey());
      //DEBUG PRINT:
      //System.out.println("community_estimateAnyMissingDocCounts: " + ent.entityInstances.get(0).dbo.getString("index")
      //    + ": add missing counts from " + globalInfo.getKey() + ": " + estimatedDocCount);
    }
    //TESTED   
   
    return estimatedDocCount;
  }//TESTED
 
  //////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
  // Use the correlation coefficients calculated during stage1_/community_ to estimate any missing doc counts - actual calculation

  double community_estimateMissingDocCount(EntSigHolder ent, int nMissingCommunityId) {

    double estimatedDocCount = 0;
   
    if (null != ent.community.multiInfoMap) { // Loop over the map, faster than the array
     
      //DEBUG PRINT:
      //System.out.println("community_estimateMissingDocCount 1: loop over ent.community.multiInfoMap");
     
      for (Map.Entry<Integer,Community_LocalInfo> existingCommunityEntry: ent.community.multiInfoMap.entrySet()) {
        int nExistingCommunityId = existingCommunityEntry.getKey();
        Community_LocalInfo existingCommunity = existingCommunityEntry.getValue();
          // (note can't be == to nMissingCommunityId otherwise it wouldn't be missing!)

        Community_GlobalCorrelation correlation =
          _community_correlationTable[community_getCorrelationIndex(nMissingCommunityId, nExistingCommunityId)];

        estimatedDocCount += community_estimateMissingDocCount_innerLoop(nMissingCommunityId, nExistingCommunityId, existingCommunity.communityCount, correlation);

      }// (end loop over community against which I can correlate)
     
      //(~ cut and pasted below)     
    }//TESTED
    else if (null != ent.community.multiInfo) { // Loop over the array, faster than the map
      //(~ cut and paste from above)
     
      //DEBUG PRINT:
      //System.out.println("community_estimateMissingDocCount 2: loop over ent.community.multiInfo");
     
      for (int i = 0; i < ent.community.multiInfo.length; ++i) {
        Community_LocalInfo existingCommunity = ent.community.multiInfo[i];
       
        if (null != existingCommunity) {
          Community_GlobalCorrelation correlation =
            _community_correlationTable[community_getCorrelationIndex(nMissingCommunityId, i)];
 
          estimatedDocCount += community_estimateMissingDocCount_innerLoop(nMissingCommunityId, i, existingCommunity.communityCount, correlation);
        }
      }
    }//TESTED (end performance-oriented choice over which collection to loop over)
    else {
      // In this case, the entity has never been intiialized, ie only has 1 entity to compare against:

      //DEBUG PRINT:
      //System.out.println("community_estimateMissingDocCount 3: no loop needed, single id: " + this._community_nPreMultiId + "/" + ent.community.singleId);
     
      int nExistingCommunity = (-1 == ent.community.singleId) this._community_nPreMultiId : ent.community.singleId;
     
      Community_GlobalCorrelation correlation =
        _community_correlationTable[community_getCorrelationIndex(nMissingCommunityId, nExistingCommunity)];
     
      estimatedDocCount += community_estimateMissingDocCount_innerLoop(nMissingCommunityId, nExistingCommunity, ent.nTotalDocCount, correlation);
    }//(TESTED)
   
    return estimatedDocCount;   
  }//TESTED (all 3 cases above)

  ///////////////////////////////////////////////

  // Utility function for community_estimateMissingDocCount()

 
  private static Double[] frequencyAdjustment = 0.218, 0.357, 0.433, 0.477, 0.497, 0.497, 0.477, 0.433, 0.357, 0.218  };
    // (for high query overlaps, the "significance" (statistical) of an entity not overlapping is high, so suppress somewhat the calculated frequency)
    // (for low query overlaps, suppress the calculated frequency to take into account prob that entity simply doesn't occur in both communities)
      // (so in "conclusion" pre-calculate table of sqrt((1 - t)t) for 10 buckets, 0-9,10-19,etc)
 
  // Single-pair estimate: scales the existing community's doc count by the ratio of
  // the pair's accumulated overlap doc counts, damped by a frequency adjustment
  // bucketed on the entity-overlap fraction. Returns 0 when no overlap data exists.
  private double community_estimateMissingDocCount_innerLoop(int nMissingCommunityId, int nExistingCommunityId,
                                long nExistingCommunityCount, Community_GlobalCorrelation correlation) {
    long nMissingCommunityOverlapCount;
    long nExistingCommunityOverlapCount;
    if (correlation.getOverlapping_entities() > 0) {
     
      long nMissingEntityCount = this._community_globalInfoTable[nMissingCommunityId].nEntityCount;
      long nExistingEntityCount = this._community_globalInfoTable[nExistingCommunityId].nEntityCount;
      double freqAdjustment = 0.109; // (a default just in case nXxxEntityCount == correlation, == 0.5*frequencyAdjustment[0])
      // Bucket the overlap fraction (0..9) against the smaller community's entity count
      if (nMissingEntityCount < nExistingEntityCount) { // (use the smaller of the 2)
        if (nMissingEntityCount > correlation.getOverlapping_entities()) { // (just in case!)
          freqAdjustment = frequencyAdjustment[(int) ((correlation.getOverlapping_entities()*10)/nMissingEntityCount)];
        }
      }
      else {
        if (nExistingEntityCount > correlation.getOverlapping_entities()) { // (just in case!)
          freqAdjustment = frequencyAdjustment[(int) ((correlation.getOverlapping_entities()*10)/nExistingEntityCount)];
        }
      }
      // Orient the pair's accumulated doc counts relative to the missing community
      if (nMissingCommunityId == correlation.getCommunity1_id()) {
        nMissingCommunityOverlapCount = correlation.getCommunity1_docCount();
        nExistingCommunityOverlapCount = correlation.getCommunity2_docCount();           
      }
      else {
        nMissingCommunityOverlapCount = correlation.getCommunity2_docCount();
        nExistingCommunityOverlapCount = correlation.getCommunity1_docCount();           
      }
      if (nExistingCommunityOverlapCount > 0) {

        //DEBUG PRINT:
        //System.out.println("community_estimateMissingDocCount_innerLoop ("+nMissingCommunityId+","+nExistingCommunityId+"): " +
        //    freqAdjustment +" * " +
        //    (nExistingCommunityCount*nMissingCommunityOverlapCount)/nExistingCommunityOverlapCount+ " << "
        //    + nMissingCommunityOverlapCount +" , " + nExistingCommunityOverlapCount + " ; " + nExistingCommunityCount
        //    + " / " + nMissingEntityCount + " , " + nExistingEntityCount + " ; " + correlation.getOverlapping_entities());       
       
        // estimate = adjustment * existingCount * (missingOverlap / existingOverlap)
        return (freqAdjustment*(nExistingCommunityCount*nMissingCommunityOverlapCount)/nExistingCommunityOverlapCount);
      }
    }
    return 0;
  }//TESTED
 
  //////////////////////////////////////////////////////////////////////////////////////////////////////////////
  //////////////////////////////////////////////////////////////////////////////////////////////////////////////
  //////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
  // Deduplication logic
 
  // Heuristic duplicate test for two documents: same URL, and (where both sides have
  // entity/association lists) matching list sizes plus a spot-check on the first
  // entity's index. Returns true for probable duplicates.
  // NOTE(review): when exactly one side has a null entity (or association) list the
  // XOR guard skips that comparison entirely rather than returning false — presumably
  // a deliberate leniency for partially-enriched docs; confirm before tightening.
  // NOTE(review): assumes lhs.url is non-null — TODO confirm callers guarantee this.
  static boolean community_areDuplicates(TempDocBucket lhs, TempDocBucket rhs) {
    if (lhs.url.equals(rhs.url)) {
      // These are probably duplicates, we'll do some more tests...
      BasicDBList lhsEnts = (BasicDBList) lhs.dbo.get(DocumentPojo.entities_);
      BasicDBList rhsEnts = (BasicDBList) rhs.dbo.get(DocumentPojo.entities_);
     
      if (!((null != lhsEnts) ^ (null != rhsEnts))) // ie both null or neither null
      {
        if (null != lhsEnts) {
          if (lhsEnts.size() != rhsEnts.size()) {
            return false;
          }//TESTED
          if (lhsEnts.size() > 0) {
            // Spot-check: compare the index of the first entity on each side
            BasicDBObject lhsFirstEnt = (BasicDBObject) lhsEnts.get(0);
            BasicDBObject rhsFirstEnt = (BasicDBObject) rhsEnts.get(0);
            String lhsIndex = lhsFirstEnt.getString(EntityPojo.index_);
            String rhsIndex = rhsFirstEnt.getString(EntityPojo.index_);
            if ((null != lhsIndex) && (null != rhsIndex)) {
              if (!lhsIndex.equals(rhsIndex)) {
                return false;
              }
            }
          }//(end "random" entity test)
          //TESTED
        }
      } // (end entity count test)
     
      // Finally we'll just count the events:
      BasicDBList lhsEvents = (BasicDBList) lhs.dbo.get(DocumentPojo.associations_);
      BasicDBList rhsEvents = (BasicDBList) rhs.dbo.get(DocumentPojo.associations_);
      if (!((null != lhsEvents) ^ (null != rhsEvents))) // ie both null or neither null
      {
        if (null != lhsEvents) {
          if (lhsEvents.size() != rhsEvents.size()) {
            return false;
          }//TESTED         
        }
      }//(end event count test)
           
      return true;
    }
    return false;   
  }//TESTED
 
  //////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
  // Combine duplicate documents
 
  static void community_combineDuplicateDocs(TempDocBucket masterDoc) {
    //(source key)
    BasicDBList masterSourceKeyList = (BasicDBList) masterDoc.dbo.get(DocumentPojo.sourceKey_);
    String masterSourceKey = (masterSourceKeyList.size() > 0) ? (String)masterSourceKeyList.get(0) : null;
    HashSet<String> sourceKeySet = null;
    //(community Id)
    BasicDBList masterCommunityList = (BasicDBList) masterDoc.dbo.get(DocumentPojo.communityId_);
    ObjectId masterCommunityId = (masterCommunityList.size() > 0) ? (ObjectId)masterCommunityList.get(0) : null;
    HashSet<ObjectId> communityIdSet = null;
    //(source title)
    BasicDBList masterSourceList = null;
    String masterSource = null;
    HashSet<String> sourceSet = null;
    //(mediaType)
    BasicDBList masterTypeList = null;
    String masterType = null;
    HashSet<String> typeSet = null;
    //(tags)
    BasicDBList masterTagList = null;
    String masterTag = null;
    HashSet<String> tagSet = null;
   
    for (TempDocBucket slave = masterDoc.dupList; null != slave; slave = slave.dupList) {
      String sourceKey = slave.dbo.getString(DocumentPojo.sourceKey_);
      int nCommunityPos = sourceKey.indexOf('#');
      if ((nCommunityPos > 0) && (null != sourceKey)) {
        sourceKey = sourceKey.substring(0, nCommunityPos);
      }//TESTED
     
      // Always combine communities:
      ObjectId communityIdObj = (ObjectId) slave.dbo.get(DocumentPojo.communityId_);
      communityIdSet = combineElements(communityIdSet, TypeErasureWorkaround.Type.OBJECTID, masterCommunityList, masterCommunityId, slave, null, communityIdObj);
     
      if ((null != masterSourceKey) && (null != sourceKey) && sourceKey.equals(masterSourceKey)) {
        continue;
      }//TESTED
     
      // else fall through to...
     
      // Combine source keys:
      sourceKeySet = combineElements(sourceKeySet, TypeErasureWorkaround.Type.STRING, masterSourceKeyList, masterSourceKey, slave, null, sourceKey);
     
      // Combine source titles:
      masterSourceList = (BasicDBList) masterDoc.dbo.get(DocumentPojo.source_);
      masterSource = (masterSourceList.size() > 0) ? (String)masterSourceList.get(0) : null;
      String slaveSource =  slave.dbo.getString(DocumentPojo.source_);
      sourceSet = combineElements(sourceSet, TypeErasureWorkaround.Type.STRING, masterSourceList, masterSource, slave, null, slaveSource);
     
      // Combine media types:
      masterTypeList = (BasicDBList) masterDoc.dbo.get(DocumentPojo.mediaType_);
      masterType = (masterTypeList.size() > 0) ? (String)masterTypeList.get(0) : null;
      String slaveType =  slave.dbo.getString(DocumentPojo.mediaType_);
      typeSet = combineElements(typeSet, TypeErasureWorkaround.Type.STRING, masterTypeList, masterType, slave, null, slaveType);
     
      // Combine tags:
      masterTagList = (BasicDBList) masterDoc.dbo.get(DocumentPojo.tags_);
      masterTag = (masterTagList.size() > 0) ? (String)masterTagList.get(0) : null;
      BasicDBList slaveTagList =  (BasicDBList) slave.dbo.get(DocumentPojo.tags_);
      tagSet = combineElements(tagSet, TypeErasureWorkaround.Type.STRING, masterTagList, masterTag, slave, slaveTagList, null);
     
    } // (end loop over slaves)
   
    // Now use any sets created to fill in
   
    if (null != communityIdSet) {
      masterCommunityList.clear();
      for (ObjectId id: communityIdSet) {   
        masterCommunityList.add(id);       
      }
    }//TESTED
   
    if (null != sourceKeySet) {
      masterSourceKeyList.clear();
      for (String key: sourceKeySet) {   
        masterSourceKeyList.add(key);       
      }
    }//TESTED
   
    if (null != sourceSet) {
      masterSourceList.clear();
      for (String src: sourceSet) {   
        masterSourceList.add(src);       
      }
    }//TESTED
   
    if (null != typeSet) {
      masterTypeList.clear();
      for (String type: typeSet) {   
        masterTypeList.add(type);       
      }
    }//TESTED
   
    if (null != tagSet) {
      masterTagList.clear();
      for (String tag: tagSet) {   
        masterTagList.add(tag);       
      }
    }//TESTED
   
  }//TESTED
 
  ///////////////////////////////////////////////
 
  // Utilities
 
  private static class TypeErasureWorkaround {
    public enum Type { OBJECTID, STRING }
    public static HashSet<?> newSet(Type type) {
      if (Type.OBJECTID == type) {
        return new HashSet<ObjectId>();     
      }
      else {
        return new HashSet<String>();
      }
    }
  }//TESTED
 
  ///////////////////////////////////////////////

  // Combine the elements from the different documents into arrays (preserving uniqueness)
 
  // Merges one slave element (or element list) into the master's element list,
  // preserving uniqueness. Two paths:
  //  - Fast path (no accumulator yet, single duplicate, master list has <2 entries):
  //    appends unique values directly into masterEls.
  //  - Slow path: lazily creates a HashSet accumulator, folds master + slave values
  //    into it, and returns it; the caller copies it back into masterEls afterwards.
  //
  // Params:
  //  combinorSet - accumulator carried across loop iterations, or null on first call
  //  type        - runtime element-type tag used to construct the accumulator set
  //  masterEls   - master document's element list; element 0 is assumed == masterEl
  //  masterEl    - first element of masterEls, or null if the list is empty
  //  slave       - duplicate being merged; a non-null slave.dupList (more duplicates
  //                to come) forces the slow path
  //  slaveEls    - slave's element list, or null when slaveEl is passed directly
  //  slaveEl     - slave's single element, or null to take element 0 of slaveEls
  // Returns: the accumulator set (null if the fast path handled everything in place).
  //
  // NOTE(review): exactly one of slaveEls/slaveEl is expected to be non-null; if both
  // are null this NPEs on slaveEls.size() - callers appear to guarantee this. Also,
  // on the slow path a null slaveEl/masterEl can end up inside the set - TODO confirm
  // that is acceptable downstream.
  @SuppressWarnings("unchecked")
  private static <T> HashSet<T> combineElements(HashSet<T> combinorSet, TypeErasureWorkaround.Type type, 
                            BasicDBList masterEls, T masterEl,
                            TempDocBucket slave, BasicDBList slaveEls, T slaveEl)
  {
    int nSlaveEls = 0;
    // If no single element was supplied, take the head of the slave's list.
    if (null == slaveEl) {
      slaveEl = ((nSlaveEls = slaveEls.size()) > 0) ? (T)slaveEls.get(0) : null;
    }
    if ((null == combinorSet) && (null == slave.dupList) && (masterEls.size() < 2)) { // most of the time, only combining 2 objects
      //DEBUG PRINT:
      //System.out.println("community_combineDuplicateDocs.combineElements#1: " + slaveEl.toString() + " vs " +  masterEl.toString());
     
      // Fast path: append slave value(s) straight onto masterEls, skipping any
      // that equal the (single) existing master element.
      if (null != slaveEl) {
        if ((masterEl == null) || !slaveEl.equals(masterEl)) {
          masterEls.add(slaveEl);
        }
        if (nSlaveEls > 1) { // (pretty inelegant way of doing this never mind, it's only in 2 places)
          for (int i = 1; i < nSlaveEls; ++i) {
            slaveEl = (T) slaveEls.get(i);
            if ((masterEl == null) || !slaveEl.equals(masterEl)) {
              masterEls.add(slaveEl);
            }
          }
        }//TESTED
      }
    }//TESTED
    else { // Slower version, needed because >1 duplicate so it gets a bit messy...
      // Lazily create the accumulator, seeded with masterEl plus the rest of
      // masterEls (index 0 is masterEl itself, hence the loop starts at 1).
      if (null == combinorSet) {
        combinorSet = (HashSet<T>) TypeErasureWorkaround.newSet(type);
        combinorSet.add(masterEl);
        int nMasterEls = masterEls.size();
        for (int i = 1; i < nMasterEls; ++i) {
          combinorSet.add((T) masterEls.get(i));         
        }       
      }//TESTED
      // Fold in the slave value(s); the set de-duplicates for us.
      combinorSet.add(slaveEl);
      if (nSlaveEls > 1) { // (pretty inelegant way of doing this never mind, it's only in 2 places)
        for (int i = 1; i < nSlaveEls; ++i) {
          combinorSet.add((T) slaveEls.get(i));
        }
      }//TESTED
      //DEBUG PRINT:
      //System.out.println("community_combineDuplicateDocs.combineElements#2: " + slaveEl.toString() + " vs " +  masterEl.toString());
    }//TESTED   
    return combinorSet;
  }//TESTED
}
TOP

Related Classes of com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils_MultiCommunity$Community_LocalInfo

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.
', 'auto'); ga('send', 'pageview');