Package com.ikanow.infinit.e.processing.custom.utils

Source Code of com.ikanow.infinit.e.processing.custom.utils.InfiniteElasticsearchMongoInputFormat

/*******************************************************************************
* Copyright 2012 The Infinit.e Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package com.ikanow.infinit.e.processing.custom.utils;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.bson.types.ObjectId;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.action.search.SearchRequestBuilder;
import org.elasticsearch.search.SearchHit;

import com.ikanow.infinit.e.api.knowledge.QueryHandler;
import com.ikanow.infinit.e.data_model.api.knowledge.AdvancedQueryPojo;
import com.ikanow.infinit.e.data_model.custom.InfiniteMongoConfig;
import com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat;
import com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputSplit;
import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.mongodb.BasicDBObject;

public class InfiniteElasticsearchMongoInputFormat extends InfiniteMongoInputFormat
  @Override
  public List<InputSplit> getSplits(JobContext context)
  {
    final Configuration hadoopConfiguration = context.getConfiguration();
    final InfiniteMongoConfig conf = new InfiniteMongoConfig( hadoopConfiguration );
    String queryStr = hadoopConfiguration.get("mongo.input.query");
    String userIdStr = hadoopConfiguration.get("infinit.e.userid");
    AdvancedQueryPojo query = AdvancedQueryPojo.fromApi(queryStr, AdvancedQueryPojo.class);
    return calculateSplits(query, userIdStr, conf);   
 
 
  static List<InputSplit> calculateSplits(AdvancedQueryPojo query, String userIdStr, InfiniteMongoConfig conf) {
    // Community IDs, get from QUERY?
    // user id str - get from configuration?
   
    QueryHandler qh = new QueryHandler();
   
    LinkedList<InputSplit> outList = new LinkedList<InputSplit>();
   
    String[] communityIdStrs = new String[query.communityIds.size()];
    int i = 0;
    for (ObjectId commId: query.communityIds) {
      communityIdStrs[i++] = commId.toString();
    }
   
    QueryHandler.QueryInfo queryInfo = qh.convertInfiniteQuery(query, communityIdStrs, userIdStr);
    if (null != queryInfo) {
      int numDocs = 0;
      int maxDocs = conf.getLimit();
      if (0 == maxDocs) {
        maxDocs = Integer.MAX_VALUE;
      }
      int docsPerScroll = 500; // (note this is multiplied by the number of primary shards, will normally be 5 per community)     
      if (maxDocs < docsPerScroll) {
        docsPerScroll = maxDocs; // (can't do any better than this because we don't know what the distribution across shards will be)
      }
     
      SearchRequestBuilder searchOptions = queryInfo.indexMgr.getSearchOptions();
      searchOptions.setSize(docsPerScroll);
      searchOptions.setSearchType(SearchType.SCAN);
      searchOptions.setScroll("1m");
     
      SearchResponse rsp = queryInfo.indexMgr.doQuery(queryInfo.queryObj, searchOptions);
      String scrollId = rsp.getScrollId();
     
      final int SPLIT_SIZE = 4000;
      ArrayList<ObjectId> idList = null;
      for (;;) { // Until no more hits
       
        rsp = queryInfo.indexMgr.doScrollingQuery(scrollId, "1m");
        SearchHit[] docs = rsp.getHits().getHits();
        scrollId = rsp.getScrollId(); // (for next time)
       
        if ((null == docs) || (0 == docs.length)) {
          break;
        }         
        //DEBUG
        //System.out.println("SCROLL! " + docs.length + " : " + docs[0].getId());
               
        for (SearchHit hit: docs)  {
          if (null == idList) {
            idList = new ArrayList<ObjectId>(SPLIT_SIZE);
          }
          String idStr = hit.getId();
          try {
            idList.add(new ObjectId(idStr));
          }
          catch (Exception e) {} // carry on...
          if (SPLIT_SIZE == idList.size()) {
            //DEBUG
            //System.out.println("SPLIT! " + idList.size() + " " + idList.get(0) + " , " + outList.size());
                   
            BasicDBObject mongoQuery = new BasicDBObject(DocumentPojo._id_,
                            new BasicDBObject(DbManager.in_, idList));
            outList.add((InputSplit)new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(),
                    mongoQuery, conf.getFields(), conf.getSort(), 0, 0, conf.isNoTimeout()));
            idList = null;
          }//TESTED
          numDocs++;
          if (numDocs >= maxDocs) {
            break;
          }//TESTED
        }
        if (numDocs >= maxDocs) {
          break;
        }//TESTED
      }//(end loop over hits)
      if (null != idList) {
        //DEBUG
        //System.out.println("SPLIT! " + idList.size() + " " + idList.get(0) + " , " + outList.size());
       
        BasicDBObject mongoQuery = new BasicDBObject(DocumentPojo._id_,
                      new BasicDBObject(DbManager.in_, idList));
        outList.add((InputSplit)new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(),
                mongoQuery, conf.getFields(), conf.getSort(), 0, 0, conf.isNoTimeout()));
      }//TESTED
    }
    return outList;
  }//TESTED
}
TOP

Related Classes of com.ikanow.infinit.e.processing.custom.utils.InfiniteElasticsearchMongoInputFormat

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.