Quick intro to map/reduce

Aaron Gonzales


In [ ]:


In [1]:
import asm_utils as util

In [3]:
data = util.get_collection(collection='samples')
all_data = util.get_collection()

In [4]:
data.count()


Out[4]:
10865

In [124]:


In [153]:
## Gets average number of unique assembly instruction counts for all classes
map = '''function() { 
             emit(this.class, this.asm_info.num_uniq_instr);
        };        
        '''

reduce = '''
            function(key, value) {
                return Array.sum(value)/value.length; 
            }
            '''

In [154]:
data.map_reduce(map, reduce, {'inline':1})


Out[154]:
{'counts': {'emit': 435, 'input': 435, 'output': 9, 'reduce': 18},
 'ok': 1.0,
 'results': [{'_id': '1', 'value': 62.89846153846154},
  {'_id': '2', 'value': 59.46},
  {'_id': '3', 'value': 20.966299019607845},
  {'_id': '4', 'value': 70.66433747412009},
  {'_id': '5', 'value': 37.857142857142854},
  {'_id': '6', 'value': 119.35714285714286},
  {'_id': '7', 'value': 18.244301994301996},
  {'_id': '8', 'value': 67.9},
  {'_id': '9', 'value': 33.73148148148149}],
 'timeMillis': 97}

In [151]:
## Gets average number of unique assembly instruction counts for all classes
map = '''function() { 
             emit(this.class, {uniq: this.asm_info.num_uniq_instr, tot: this.asm_info.num_instr});
        };        
        '''

reduce = '''
            function(key, value) {
                var res = { uniqs:0, tots: 0 }
                var count = 0;
                value.forEach(function(value) {
                    res.uniqs += value.uniq;
                    res.tots += value.tot;
                    count += 1;
                    })
                res.uniqs = res.uniqs/count;
                res.tots = res.tots/count;
                return res;
            }
            '''

In [152]:
data.map_reduce(map, reduce, {'inline':1})


Out[152]:
{'counts': {'emit': 435, 'input': 435, 'output': 9, 'reduce': 18},
 'ok': 1.0,
 'results': [{'_id': '1', 'value': {'tots': nan, 'uniqs': nan}},
  {'_id': '2', 'value': {'tots': 7324.64, 'uniqs': 59.46}},
  {'_id': '3', 'value': {'tots': nan, 'uniqs': nan}},
  {'_id': '4', 'value': {'tots': nan, 'uniqs': nan}},
  {'_id': '5', 'value': {'tots': 2504.0, 'uniqs': 37.857142857142854}},
  {'_id': '6', 'value': {'tots': nan, 'uniqs': nan}},
  {'_id': '7', 'value': {'tots': nan, 'uniqs': nan}},
  {'_id': '8', 'value': {'tots': 4277.38, 'uniqs': 67.9}},
  {'_id': '9', 'value': {'tots': nan, 'uniqs': nan}}],
 'timeMillis': 124}

In [182]:
# This one gets the counts of all classes. 

map = '''function() { 
             emit(this.class, 1);
        };        
        '''

reduce = '''
            function(key, value) {
                return Array.sum(value); 
            }
            '''

In [183]:
result = data.map_reduce(map, reduce, {'inline':1})
result


Out[183]:
{'counts': {'emit': 435, 'input': 435, 'output': 9, 'reduce': 18},
 'ok': 1.0,
 'results': [{'_id': '1', 'value': 50.0},
  {'_id': '2', 'value': 50.0},
  {'_id': '3', 'value': 50.0},
  {'_id': '4', 'value': 50.0},
  {'_id': '5', 'value': 35.0},
  {'_id': '6', 'value': 50.0},
  {'_id': '7', 'value': 50.0},
  {'_id': '8', 'value': 50.0},
  {'_id': '9', 'value': 50.0}],
 'timeMillis': 77}

In [157]:
result = all_data.map_reduce(map, reduce, {'inline':1})

In [158]:
result


Out[158]:
{'counts': {'emit': 10865, 'input': 10865, 'output': 9, 'reduce': 263},
 'ok': 1.0,
 'results': [{'_id': '1', 'value': 1540.0},
  {'_id': '2', 'value': 2477.0},
  {'_id': '3', 'value': 2942.0},
  {'_id': '4', 'value': 475.0},
  {'_id': '5', 'value': 42.0},
  {'_id': '6', 'value': 751.0},
  {'_id': '7', 'value': 398.0},
  {'_id': '8', 'value': 1228.0},
  {'_id': '9', 'value': 1012.0}],
 'timeMillis': 27737}

In [5]:
# This one gets the counts of all classes. 

map = '''function() { 
             emit(this.class, 1);
        };        
        '''

reduce = '''
            function(key, value) {
                return Array.sum(value); 
            }
            '''

In [8]:
result = all_data.map_reduce(map, reduce, {'inline':1}, query={'dlls': []})

In [9]:
result


Out[9]:
{'counts': {'emit': 304, 'input': 304, 'output': 8, 'reduce': 16},
 'ok': 1.0,
 'results': [{'_id': '1', 'value': 7.0},
  {'_id': '2', 'value': 8.0},
  {'_id': '3', 'value': 5.0},
  {'_id': '4', 'value': 230.0},
  {'_id': '6', 'value': 1.0},
  {'_id': '7', 'value': 11.0},
  {'_id': '8', 'value': 39.0},
  {'_id': '9', 'value': 3.0}],
 'timeMillis': 75}

In [168]:
comments = [['word', 'comment'],['wdofas', 'dafsdf']]
comments


Out[168]:
[['word', 'comment'], ['wdofas', 'dafsdf']]

In [161]:
data.find_one().keys()


Out[161]:
dict_keys(['_id', 'calls', 'id', 'dlls', 'ida_comments', 'class', 'asm_info'])

In [197]:
comments = all_data.find({"class": '5'})

In [201]:
coms = [com['ida_comments'] for com in all_data.find()]

In [200]:
len(coms)


Out[200]:
42

In [ ]: