In [8]:

    
import elasticsearch;
import random;
import math;
import time;

# search the user_id based on the given name and the nameId list
def _searchUserIdByName(_name, _nameIdList):
    _uid = '';
    if (_nameIdList):
        for _tuple in _nameIdList:
            if (_tuple['name']==_name):
                _uid = _tuple['id'];
                break;
    return _uid;

def _fetchJeymartItems(_e2, _hit, _docObj):
    _res = _e2.search(
        index='odt_jeymart_items',
        doc_type='doc',
        body=
        {
            "query": {
                "function_score": {
                  "query": {
                    "bool": {
                      "must": [
                        {
                          "exists": {
                            "field": "hf_price_suggested"
                          }
                        },
                        {
                          "match": {
                            "t_description": _hit['t_description']
                          }
                        },
                        {
                          "terms": {
                            "k_category": _hit['k_category']
                          }
                        },
                        {
                          "match": {
                            "k_photo": _hit['k_photo']
                          }
                        }
                      ]
                    }
                  },
                  "functions": [
                    {
                      "random_score": {
                        "field": "hf_price_suggested"
                      }
                    }
                  ],
                  "score_mode": "sum"
                }
              },
          "sort": [
            {
              "_score": {
                "order": "asc"
              }
            }
          ]    
        }        
    );
    
    _len=len(_res['hits']['hits']);
    _i = int(math.floor(random.random()*_len));
    if (_i >= _len):
        if (_len>0):
            _i=_i-1;
        else:
            _i=-1;
        
    #while (_i >= len(_res['hits']['hits'])):
    #    _i = int(math.floor(random.random()*len(_res['hits']['hits'])));
    
    if (_i!=-1):
        _chosenHit=_res['hits']['hits'][_i];
        _docObj['s_brand_name']=_chosenHit['_source']['s_brand_name'];
        _docObj['hf_price_suggested']=_chosenHit['_source']['hf_price_suggested'];
        
    return _docObj;

def _runBulkInBatches(_es, _bList):
    _res = {};
    _len = len(_bList);
    _threshold = 50000;
    _parts=[];
    # some problem here...
    _partsTotal = int(round(_len/_threshold+0.5, 0));
    _startIdx=0;
    
    for _i in range(_partsTotal):
        if (_startIdx<_len):
            _parts[_i]=_bList[_startIdx:_threshold];
            _startIdx+=_threshold;
        else:
            break;
    
    # run bulk and sleep for 1minute
    for _partBulk in _parts:
        _es.bulk(body=_partBulk);
        time.sleep(60);
        
    return _res;


#_choice = input("do you want to run the fix?");
_choice="yes";
if (_choice.lower()=="yes"):
    _es = elasticsearch.Elasticsearch( [ "http://localhost:9201" ], requestTimeout="600000" );

    _res = _es.search(
        index = "odt_jeymart_user_trx",
        doc_type = "doc",
        body = {
            "size": 0,
            "aggs": {
                "NAME": {
                  "terms": {
                    "field": "user.s_full_name.keyword",
                    "size": 10000
                  }
                }
            }
        }
    );
    # create user_id based on the names
    _names = _res['aggregations']['NAME']['buckets'];
    _uids = [];
    _nameId = [];
    
    for _i in range(len(_names)):
        _uid = int(round(random.random()*1000000, 0));
        # check if _uid exists or not
        while (_uid in _uids):
            _uid = int(round(random.random()*1000000, 0));

        _uids.append(_uid);
    # associate the name and user_id together
    for _j in range(len(_names)):
        _nameId.append({
            'name': _names[_j]['key'],
            'id': _uids[_j] 
        });

    #print(_nameId[0]['name']);
    #print(_nameId[0]['id']);    
    #print(_nameId);
    
    # ----------------------------------------
    # - scrolling of the odt_jeymart_user_trx
    # ----------------------------------------
    _e2 = elasticsearch.Elasticsearch([ "http://localhost:9201" ]);
    
    _res = _es.search(
        index = "odt_jeymart_user_trx",
        doc_type = "doc",
        body = {
            "size": 1000,
            "query": {
                "match_all": {}
            }
        },
        scroll = "5m"
    );
    #print(len(_res['hits']['hits']));
    #print(_res);
    
    # close the cursor
    #_es.clear_scroll( scroll_id = _res['_scroll_id']);

    _acc_hits=0;
    _bulkLst = [];
    while (len(_res['hits']['hits'])>0):
        # loop each hits
        for _hit in _res['hits']['hits']:
            _docCtx = {};
            _src = _hit['_source'];
            
            # 1. (k_user_id) search and replace the value of this doc
            _uid_1 = _searchUserIdByName(_src['user']['s_full_name'], _nameId);
            _docCtx['k_user_id'] = _uid_1;
            
            # 2. run another query against the "odt_jeymart_items"
            _docCtx = _fetchJeymartItems(_e2, _hit['_source'], _docCtx);
            
            # 3. update the _bulkLst for _bulk operation
            _bulkLst.append({ "update": { "_index": "odt_jeymart_user_trx", "_type": "doc", "_id": _hit['_id'] } });
            _bulkLst.append({ "doc": _docCtx });
            
        _res = _es.scroll(
            scroll_id = _res['_scroll_id'],
            scroll = "5m"
        );
        # counter info
        _acc_hits+=len(_res['hits']['hits']);
        print(_acc_hits);
        
    #_size=len(_bulkLst);
    print(len(_bulkLst));
    #print(_bulkLst[_size-1]);
    #print(_bulkLst[_size-2]);    
    
    # 4. run _bulk
    # too large, divide into 4 batches... play safe
    
    
    _res = _es.bulk(body=_bulkLst);
    print(_res);
    
    # close the cursor
    #_es.clear_scroll( scroll_id = _res['_scroll_id']);

    
    del _es;
    del _e2;
else:
    print('** so you are supposed to run the following **');









    



1000
2000
3000






    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-8-35e676410ddd> in <module>()
    182 
    183             # 2. run another query against the "odt_jeymart_items"
--> 184             _docCtx = _fetchJeymartItems(_e2, _hit['_source'], _docCtx);
    185 
    186             # 3. update the _bulkLst for _bulk operation

<ipython-input-8-35e676410ddd> in _fetchJeymartItems(_e2, _hit, _docObj)
     60             {
     61               "_score": {
---> 62                 "order": "asc"
     63               }
     64             }

/Users/jason.wong/anaconda/lib/python3.6/site-packages/elasticsearch/client/utils.py in _wrapped(*args, **kwargs)
     74                 if p in kwargs:
     75                     params[p] = kwargs.pop(p)
---> 76             return func(*args, params=params, **kwargs)
     77         return _wrapped
     78     return _wrapper

/Users/jason.wong/anaconda/lib/python3.6/site-packages/elasticsearch/client/__init__.py in search(self, index, doc_type, body, params)
    634             index = '_all'
    635         return self.transport.perform_request('GET', _make_path(index,
--> 636             doc_type, '_search'), params=params, body=body)
    637 
    638     @query_params('_source', '_source_exclude', '_source_include',

/Users/jason.wong/anaconda/lib/python3.6/site-packages/elasticsearch/client/utils.py in _make_path(*parts)
     48     return '/' + '/'.join(
     49         # preserve ',' and '*' in url for nicer URLs in logs
---> 50         quote_plus(_escape(p), b',*') for p in parts if p not in SKIP_IN_PATH)
     51 
     52 # parameters that apply to all methods

/Users/jason.wong/anaconda/lib/python3.6/site-packages/elasticsearch/client/utils.py in <genexpr>(.0)
     48     return '/' + '/'.join(
     49         # preserve ',' and '*' in url for nicer URLs in logs
---> 50         quote_plus(_escape(p), b',*') for p in parts if p not in SKIP_IN_PATH)
     51 
     52 # parameters that apply to all methods

/Users/jason.wong/anaconda/lib/python3.6/urllib/parse.py in quote_plus(string, safe, encoding, errors)
    782     # Check if ' ' in string, where string may either be a str or bytes.  If
    783     # there are no spaces, the regular quote will produce the right answer.
--> 784     if ((isinstance(string, str) and ' ' not in string) or
    785         (isinstance(string, bytes) and b' ' not in string)):
    786         return quote(string, safe, encoding, errors)

KeyboardInterrupt: