In [3]:
import src.utils.utils as utils
import bytecode_query as bq
%load_ext autoreload
%autoreload 2
import hash_bytecode as hb


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [5]:
from sklearn.feature_extraction import DictVectorizer

In [ ]:
%pylab inline

In [ ]:


In [4]:
db  = utils.get_mongodb(username='populator')

In [6]:
tester = db.samples.find_one({'class': '4'})

In [7]:
tester


Out[7]:
{'_id': ObjectId('5513a805127d27664d47bcb7'),
 'asm_info': {'num_instr': 0,
  'num_uniq_instr': 0,
  'seq': [],
  'uniq_instr': []},
 'calls': {'calls': [],
  'cdecl': [],
  'cdecl_count': 0,
  'fastcall': [],
  'fastcall_count': 0,
  'stdcall': [],
  'stdcall_count': 0,
  'thiscall': [],
  'thiscall_count': 0,
  'total_calls': 0},
 'class': '4',
 'dlls': [],
 'hexcode': {'bigrams': {'AA 69': 0,
   '3C 4A': 0,
   '9C C1': 0,
   'A2 91': 0,
   '88 17': 0,
   'D8 2C': 0,
   '51 C6': 0,
   '7D FA': 0,
   '56 A4': 0,
   '21 FA': 0,
   'BD C8': 0,
   'AE 82': 0,
   '2C C9': 0,
   'B9 20': 0,
   'E6 82': 0,
   '69 FC': 0,
   'A6 43': 1,
   '18 6E': 0,
   'D9 CA': 0,
   'F7 23': 0,
   'DB 2C': 0,
   '30 8B': 0,
   'D5 5A': 0,
   '4D 43': 0,
   'E5 39': 0,
   '91 D3': 0,
   '86 2F': 0,
   'A5 EB': 0,
   '32 D6': 1,
   '72 2D': 0,
   'A3 E4': 0,
   '6D EA': 0,
   'E2 81': 0,
   'E3 4E': 0,
   '6A E4': 0,
   'E7 AE': 0,
   '24 D4': 1,
   '27 A8': 0,
   'D6 42': 0,
   'D9 8A': 0,
   '9F 85': 0,
   '71 95': 0,
   '11 86': 1,
   'BB 1B': 0,
   '6E 7E': 0,
   '69 56': 0,
   '48 60': 0,
   'FB 86': 0,
   'D1 F4': 0,
   '12 EB': 2,
   '23 81': 0,
   'DE D9': 0,
   '10 3E': 0,
   '39 BE': 0,
   '42 75': 0,
   'FC D0': 0,
   'BF 9E': 0,
   '21 E7': 0,
   '40 AD': 0,
   'C1 A8': 0,
   'F4 A8': 0,
   '36 DB': 0,
   '80 20': 0,
   '90 C1': 0,
   'CD C7': 0,
   '19 C3': 0,
   '18 EC': 0,
   '92 82': 0,
   '59 E9': 0,
   '2F C3': 1,
   '13 C2': 0,
   '24 43': 0,
   '23 39': 0,
   '6C A1': 1,
   'B1 39': 0,
   '66 A1': 0,
   'CA 44': 1,
   'AF ??': 0,
   'D4 92': 0,
   '6C C6': 0,
   'A4 BC': 0,
   '9A 2B': 0,
   '39 17': 1,
   'CC 43': 0,
   '9D 1A': 0,
   '2F 3B': 0,
   'F4 40': 1,
   '3C BD': 0,
   'BA 79': 0,
   'CA 2F': 0,
   '12 52': 0,
   'A0 63': 1,
   '51 7E': 0,
   'AB C6': 1,
   'F2 59': 0,
   '64 6D': 1,
   'DC 95': 1,
   '78 FE': 0,
   'E5 A9': 0,
   'A8 C2': 0,
   '8B C6': 0,
   'E1 D5': 0,
   'AA F2': 0,
   'FE 76': 1,
   'EC 6D': 0,
   '5B A3': 0,
   'C8 9D': 0,
   'A5 BE': 0,
   '46 AB': 0,
   '3C 43': 0,
   '60 14': 1,
   'F9 A1': 0,
   'E2 6A': 0,
   '33 4B': 0,
   '16 86': 0,
   'EB E6': 0,
   '58 C6': 0,
   '53 78': 0,
   '37 A3': 0,
   'C1 B1': 1,
   'D9 56': 0,
   'E3 6B': 0,
   '50 34': 0,
   'B1 63': 1,
   '34 42': 0,
   '?? 75': 0,
   '83 1C': 0,
   'EE A3': 0,
   '28 91': 0,
   'DF 88': 0,
   '93 E8': 1,
   '74 4A': 0,
   'FD 88': 1,
   'A5 B3': 0,
   'D8 C9': 1,
   'A7 29': 0,
   'E6 88': 0,
   '63 2D': 0,
   'C3 BD': 0,
   'B8 3F': 0,
   '4A 25': 0,
   '4A F4': 0,
   '6B E1': 1,
   'E3 2D': 1,
   '6F 6B': 0,
   '84 BA': 0,
   '85 26': 0,
   '7C 17': 0,
   '37 2F': 1,
   'E0 D8': 0,
   'C0 C3': 0,
   '4F 3A': 0,
   '7E 20': 0,
   'B4 C0': 0,
   'CE D0': 1,
   '53 AD': 0,
   'AD E8': 0,
   'E5 70': 0,
   '3F 49': 0,
   'DB E2': 0,
   '99 1F': 0,
   'AA 1B': 0,
   '75 BC': 0,
   '37 D2': 0,
   '2F DB': 0,
   '24 61': 0,
   '65 FA': 0,
   'F1 BB': 0,
   '?? 80': 0,
   '3E 4D': 1,
   'EE 93': 0,
   'FA D9': 0,
   '61 2A': 0,
   'CF 6F': 1,
   'D8 C2': 0,
   '7D 62': 0,
   '96 CB': 0,
   'B8 43': 0,
   '58 F1': 0,
   'F8 20': 0,
   'B5 15': 0,
   'F8 AD': 0,
   '7B B1': 0,
   '2E D3': 0,
   '92 2D': 0,
   '66 BA': 0,
   'AA BC': 0,
   '38 20': 0,
   'D5 FA': 0,
   '3A 98': 0,
   '51 27': 1,
   'ED 2E': 0,
   'B5 AE': 1,
   '66 41': 0,
   'C1 4D': 0,
   '70 C9': 0,
   '2E FC': 0,
   '9B 26': 0,
   'D8 7A': 1,
   '72 57': 0,
   'AD EC': 0,
   'DB D3': 0,
   'FB 65': 0,
   '49 86': 0,
   '9B 36': 1,
   '65 B0': 0,
   '39 16': 0,
   'A8 72': 1,
   '8D 85': 0,
   'FD FF': 3,
   '8E B6': 1,
   'DD F6': 0,
   '2E ??': 0,
   '8B 82': 0,
   'B8 C3': 0,
   '38 43': 1,
   '70 BD': 0,
   'DD 1F': 0,
   '6A EC': 1,
   '43 E9': 0,
   '96 D3': 1,
   'A2 81': 0,
   'A4 91': 0,
   '6B 48': 0,
   '4E AB': 1,
   '6C 9D': 0,
   '4B F5': 1,
   'DD E1': 0,
   '6D 50': 0,
   'BF 5A': 0,
   '3D B7': 0,
   '80 14': 0,
   'CE 36': 0,
   '7B A0': 0,
   '68 56': 0,
   '94 18': 0,
   '45 C3': 0,
   '54 76': 0,
   'B6 FE': 0,
   '27 52': 0,
   '1E 20': 0,
   'E8 9C': 1,
   'A9 C8': 0,
   'F4 CD': 0,
   '12 A3': 0,
   'B4 C4': 0,
   '78 94': 0,
   'AC F2': 0,
   '40 1A': 0,
   '5F C8': 0,
   'DA D6': 0,
   '71 94': 0,
   '24 5B': 0,
   '22 CB': 0,
   '73 E9': 1,
   '99 EE': 0,
   '46 3C': 0,
   '5C 2C': 0,
   '6B C6': 0,
   '7A E9': 1,
   'B8 15': 0,
   'DA 5E': 0,
   'EE 14': 0,
   '6A 79': 0,
   'C7 4B': 0,
   'B8 F9': 0,
   'E2 E4': 0,
   'FC 1B': 1,
   '8D 75': 0,
   '64 59': 0,
   'A2 43': 0,
   'F5 7C': 1,
   'F1 AA': 0,
   'EC F7': 0,
   '38 32': 0,
   'E2 F2': 0,
   '2B 73': 0,
   '1F BE': 0,
   'E4 7C': 0,
   '91 68': 0,
   '22 70': 0,
   '7D 3F': 0,
   '5B 28': 0,
   '41 D2': 0,
   'BC C9': 0,
   '28 35': 0,
   '91 23': 0,
   'EE CC': 0,
   'CE 2A': 0,
   '3A F6': 0,
   'BD 32': 1,
   'B5 39': 3,
   'C9 CE': 1,
   '29 90': 0,
   'D0 C1': 0,
   'FD C6': 0,
   '65 67': 0,
   '17 AC': 1,
   '67 98': 0,
   '91 F6': 0,
   '5C 8B': 1,
   '3D 14': 0,
   '2E B3': 0,
   '7B 66': 0,
   'CD E5': 0,
   'D2 64': 0,
   'B2 56': 0,
   'F2 C3': 0,
   '44 14': 0,
   'D4 82': 1,
   '86 27': 0,
   '3E 37': 0,
   '34 F4': 0,
   'F1 C4': 0,
   'AF 85': 0,
   '70 C0': 0,
   '9B 8D': 1,
   'D5 D0': 0,
   'FF 96': 1,
   'D7 CC': 0,
   'E4 5B': 0,
   'D8 83': 0,
   'C8 D3': 0,
   '3D 58': 0,
   '2F 10': 0,
   'C5 E3': 0,
   '7C 4B': 0,
   '1C DA': 0,
   'EB EC': 0,
   '97 B9': 0,
   '78 63': 0,
   'DC 54': 0,
   '94 53': 1,
   '8A 24': 1,
   'BB 39': 0,
   'CD ??': 0,
   '1D A4': 0,
   'F4 97': 0,
   '16 DB': 0,
   '61 EB': 1,
   'F4 F9': 0,
   'B7 21': 0,
   'D2 DC': 1,
   '2C 71': 0,
   '76 B8': 0,
   '96 6A': 0,
   '21 32': 0,
   '2C FE': 1,
   '32 9A': 0,
   'D1 52': 1,
   '1D D8': 0,
   '2B 78': 0,
   '19 DE': 1,
   'AC A9': 0,
   '34 B2': 0,
   '6B 22': 0,
   '80 8C': 0,
   'E9 E1': 0,
   '69 F1': 0,
   '55 9C': 0,
   'D9 70': 0,
   '89 69': 0,
   '87 C8': 0,
   'FA 16': 0,
   'CA E1': 0,
   '1C 79': 1,
   '88 55': 0,
   '4D F4': 0,
   '75 95': 1,
   'AF DE': 0,
   '5A D7': 0,
   'C6 EB': 0,
   'D9 44': 0,
   '29 A3': 0,
   '82 AF': 0,
   'BD 5D': 0,
   'F6 70': 0,
   'F3 FF': 0,
   'E4 47': 0,
   '44 2C': 0,
   '82 E5': 0,
   'ED C3': 0,
   '99 7E': 0,
   '2C BB': 0,
   '6A EB': 0,
   '2A 3B': 0,
   '78 A3': 0,
   'A8 C7': 0,
   '43 2B': 2,
   '13 DB': 0,
   'F2 53': 0,
   'AE 5B': 0,
   'B0 F9': 0,
   '44 62': 0,
   'EE D6': 1,
   '32 5F': 0,
   '3D C3': 0,
   'FB 2A': 0,
   '71 F2': 0,
   'B7 DD': 0,
   '70 59': 0,
   'C0 B6': 0,
   '75 B8': 0,
   'B8 77': 0,
   '6B 51': 0,
   '3B 3A': 0,
   '27 17': 1,
   '45 8C': 0,
   '8B 3E': 0,
   'CC 1E': 0,
   '7F 8D': 0,
   '4E 55': 0,
   '5D 41': 1,
   '1C 22': 0,
   'C9 CF': 3,
   '9F DB': 0,
   '47 5F': 0,
   'DD 23': 0,
   '94 63': 1,
   '91 4A': 0,
   '9B CA': 1,
   '6B 4B': 0,
   '39 5F': 0,
   '36 4F': 0,
   '8F BD': 0,
   '7F AE': 0,
   'ED 5A': 0,
   '79 5D': 0,
   'B5 D7': 0,
   '36 29': 0,
   '52 3D': 0,
   '96 EC': 0,
   '58 A5': 0,
   'B4 3E': 0,
   'E6 F6': 0,
   'D1 C5': 1,
   '11 30': 0,
   'E8 FC': 0,
   'AF 74': 1,
   'A3 21': 0,
   'E6 37': 0,
   '1E 97': 0,
   '2E 74': 0,
   '63 1D': 0,
   '1C D2': 0,
   '41 59': 0,
   'FC A3': 0,
   '7C 14': 0,
   '36 5D': 0,
   '90 AC': 0,
   'F9 58': 0,
   'AB 5D': 0,
   '28 E0': 0,
   'C8 60': 0,
   '77 CB': 0,
   '8E FD': 0,
   '60 B6': 1,
   '6C B3': 0,
   '4B 31': 0,
   '89 52': 0,
   '53 EA': 0,
   '1E 45': 0,
   '86 12': 0,
   '64 1E': 0,
   'D8 B7': 0,
   '72 5D': 0,
   'F4 DC': 0,
   '9A 23': 0,
   'A6 2E': 0,
   'F6 C5': 0,
   'F9 EF': 0,
   '47 6F': 0,
   'AF 23': 1,
   '78 83': 0,
   'C2 4A': 0,
   'D5 73': 0,
   'B7 AE': 0,
   '8D 23': 0,
   '4D 9B': 0,
   '82 70': 1,
   '91 26': 1,
   '86 3F': 0,
   'E8 91': 0,
   '74 62': 0,
   '7A 65': 0,
   'C8 92': 1,
   'DC 82': 0,
   'AF 17': 0,
   '1B D2': 0,
   'F3 75': 0,
   '99 B5': 0,
   '93 B9': 0,
   'A2 4D': 0,
   'B5 52': 0,
   '90 40': 0,
   '69 2D': 1,
   '1C 6B': 0,
   '33 DA': 0,
   '47 43': 1,
   '61 28': 1,
   'C8 6B': 0,
   'A0 A2': 0,
   'DB B5': 0,
   '45 AC': 0,
   'C0 A6': 0,
   '35 65': 0,
   '30 78': 0,
   'DC 5C': 0,
   '2D CA': 0,
   'D9 FC': 0,
   'D5 2B': 0,
   'AE 7A': 0,
   'E9 83': 0,
   '8A 22': 0,
   '22 B6': 0,
   'EC 44': 0,
   '5D D9': 0,
   '51 F6': 0,
   '8D BC': 0,
   '4D B4': 0,
   'BE 87': 1,
   'AC 34': 0,
   'FA F0': 0,
   'D8 CA': 2,
   '86 1C': 0,
   '41 F6': 2,
   'B2 16': 1,
   '68 AE': 1,
   '49 4C': 0,
   'AA 8C': 0,
   '28 B9': 0,
   'A5 94': 0,
   'CB DF': 0,
   '3F 76': 1,
   '91 73': 0,
   '94 F9': 0,
   '7D B5': 0,
   '53 F8': 0,
   'B7 DC': 0,
   '40 BA': 0,
   '95 96': 0,
   '34 A2': 1,
   '76 CC': 0,
   'AA 6E': 0,
   '41 42': 0,
   'D1 F8': 0,
   'C1 63': 0,
   '9E A5': 0,
   '38 82': 0,
   '4A 8A': 0,
   '3A 84': 0,
   'DA 23': 0,
   '87 2C': 1,
   'AD 32': 0,
   'FE C3': 0,
   '64 96': 0,
   'FC EF': 0,
   '26 72': 1,
   '22 4D': 0,
   '4F DC': 1,
   '47 6E': 0,
   '24 57': 0,
   '80 6F': 1,
   '17 4F': 0,
   '59 99': 0,
   '4D 9A': 0,
   '91 C5': 0,
   '31 81': 0,
   'E4 C8': 1,
   '11 62': 0,
   'AD 70': 1,
   '5E DF': 0,
   'DF A3': 1,
   'DB E8': 0,
   '18 27': 1,
   '43 46': 0,
   'D0 85': 0,
   '1F 79': 0,
   'D8 FF': 0,
   '52 1B': 0,
   'FB B3': 1,
   '44 57': 0,
   '7C BF': 0,
   'B4 8D': 0,
   '98 C3': 1,
   'C7 89': 0,
   '21 EC': 0,
   'C6 70': 1,
   'FE D5': 0,
   'F5 41': 0,
   '39 94': 1,
   '9A 8C': 1,
   '5E F2': 0,
   '6C B5': 0,
   'F8 81': 0,
   '23 BB': 0,
   'D4 88': 0,
   '98 14': 0,
   '11 EE': 0,
   '7B 36': 0,
   'B6 6F': 0,
   '32 BB': 0,
   '25 78': 0,
   '86 E0': 0,
   '3D 3E': 0,
   '6C 21': 0,
   'B2 4B': 0,
   'EB 29': 0,
   'E5 5B': 0,
   '4B 25': 0,
   '6E 34': 0,
   'D7 ??': 0,
   'B7 99': 6,
   '62 AF': 0,
   '3B 3D': 1,
   'B8 7A': 1,
   'FE 56': 0,
   '27 5E': 0,
   '88 F7': 0,
   '44 7D': 0,
   '58 C9': 0,
   'BF A6': 0,
   '68 44': 1,
   '54 7A': 0,
   '71 C9': 0,
   '60 D8': 0,
   '63 74': 0,
   'AE 75': 0,
   '80 7F': 1,
   '5F 99': 0,
   '39 D8': 0,
   '46 34': 0,
   '2E F0': 0,
   '6C 5A': 0,
   'EB B6': 0,
   '35 63': 0,
   'A7 65': 0,
   'B5 F0': 0,
   '1D E6': 0,
   'FD 40': 0,
   'B7 13': 0,
   'F1 77': 0,
   '89 31': 0,
   'EB 44': 0,
   '9F D4': 0,
   '8A 80': 1,
   'B2 42': 0,
   'A4 5D': 0,
   '8B 8E': 0,
   '91 1C': 0,
   '73 E4': 0,
   '90 B3': 0,
   '52 55': 1,
   '4E 19': 0,
   'DA D8': 0,
   'B3 E7': 0,
   '18 35': 0,
   'D4 73': 0,
   '92 ED': 0,
   '1E 52': 0,
   'A7 44': 0,
   '19 A4': 0,
   'ED 6A': 0,
   '49 BB': 0,
   'F9 A2': 0,
   '8F F1': 0,
   '19 E8': 0,
   '11 57': 0,
   '12 7E': 0,
   '95 DB': 0,
   'E1 61': 0,
   'AE ED': 0,
   'C3 97': 0,
   'EC AD': 0,
   '33 B8': 0,
   'A2 8A': 0,
   'EC D1': 0,
   '3F 37': 0,
   '36 C2': 0,
   'C7 9D': 0,
   '67 20': 0,
   '27 FF': 0,
   '59 F3': 0,
   '77 2B': 0,
   'BD 99': 0,
   '33 7A': 0,
   'C9 A5': 0,
   '12 89': 0,
   '59 F5': 0,
   'EF BA': 0,
   'C7 ED': 0,
   'E0 7F': 0,
   'B8 57': 0,
   '7A E4': 0,
   '3D 92': 0,
   'F2 55': 1,
   '83 56': 1,
   '82 50': 1,
   '9D E4': 0,
   '53 3A': 0,
   'A8 90': 1,
   '7A E6': 0,
   '27 13': 0,
   'AF 65': 0,
   'B5 D5': 0,
   '84 DC': 1,
   'AB 00': 0,
   '10 D7': 0,
   '44 FF': 0,
   'E8 42': 1,
   '8E E4': 0,
   'DD BF': 0,
   '35 3B': 0,
   '59 60': 0,
   'EC BA': 0,
   '99 50': 0,
   'A4 66': 0,
   '67 10': 0,
   'DC E8': 2,
   'B7 8B': 0,
   '75 DD': 0,
   '2F 14': 0,
   '9C EA': 0,
   '2C D7': 0,
   '95 90': 0,
   'CC DF': 0,
   '2C ED': 0,
   '53 A3': 0,
   '34 9A': 0,
   'BC 30': 0,
   '3C 56': 1,
   '65 DE': 0,
   'DA A3': 0,
   'A8 80': 0,
   'B0 C5': 0,
   '15 47': 0,
   '47 55': 0,
   '97 94': 0,
   '90 65': 1,
   'B1 81': 0,
   '3C 65': 0,
   '?? 53': 0,
   '4C C2': 0,
   '5F 41': 0,
   '17 BD': 0,
   'EC EE': 1,
   'BE C7': 0,
   'A7 3A': 1,
   'AC B8': 0,
   '4C 60': 0,
   '13 A2': 0,
   'D8 9A': 0,
   'BF 96': 0,
   '31 44': 0,
   'D1 CE': 0,
   'FF 22': 0,
   '58 4B': 0,
   '81 29': 0,
   '28 4F': 0,
   '52 C1': 0,
   'CA 73': 0,
   '68 77': 0,
   'E4 24': 0,
   'D4 D5': 0,
   'CF B5': 0,
   'CD 96': 0,
   '84 62': 0,
   'DA E8': 1,
   '3E 18': 3,
   '4F EB': 0,
   '67 13': 0,
   '23 66': 0,
   'B9 51': 0,
   '5B 3A': 0,
   '1A 9C': 0,
   'C0 38': 0,
   '1F B5': 1,
   'F9 BF': 0,
   '73 E0': 0,
   'AF 63': 0,
   'DD C4': 0,
   '53 2F': 0,
   '6E 32': 0,
   '95 A7': 0,
   '4D 8F': 0,
   'E2 8A': 1,
   'E0 44': 0,
   '17 EE': 0,
   'A3 72': 0,
   'F0 9F': 0,
   'EA 29': 0,
   '54 AE': 1,
   'A8 3A': 0,
   '33 12': 1,
   'F8 A5': 0,
   'B0 A8': 0,
   '?? 39': 0,
   '63 A9': 0,
   'FF 77': 2,
   'D7 EB': 0,
   '9F AE': 0,
   'B8 46': 0,
   '19 72': 0,
   'F3 95': 0,
   '37 D5': 0,
   '45 E7': 0,
   'F8 C5': 0,
   'E8 45': 0,
   '7F BA': 0,
   'F8 68': 0,
   '46 A0': 0,
   'FB 14': 1,
   '7C EA': 0,
   'A4 B2': 0,
   '3E BC': 0,
   '74 5C': 0,
   '94 A6': 0,
   '88 B7': 0,
   '51 21': 1,
   '9C ED': 0,
   'A2 C6': 0,
   'A4 36': 0,
   '8A 8C': 2,
   '26 81': 1,
   '7A 91': 0,
   '8A 39': 0,
   'EE F1': 0,
   'E3 DE': 1,
   '1E 34': 0,
   '72 7D': 0,
   'DD 36': 0,
   '88 A6': 0,
   '15 66': 0,
   '10 ED': 0,
   '1B B3': 0,
   'DE AD': 0,
   '4C 6B': 2,
   'AB F4': 0,
   '3B 91': 0,
   '9C 38': 0,
   '68 3A': 0,
   '95 F3': 1,
   '1D 3B': 0,
   '90 E5': 0,
   '1B B1': 0,
   '6C B6': 0,
   'B8 83': 0,
   'FD EB': 0,
   '25 36': 0,
   '60 73': 0,
   'AB AE': 1,
   'D8 F8': 0,
   '15 81': 0,
   '43 59': 0,
   '9C 32': 0,
   '22 C8': 1,
   '51 E6': 1,
   '15 8C': 0,
   '75 96': 0,
   '64 ??': 0,
   '7B 46': 0,
   'C0 94': 0,
   '5B 85': 0,
   '63 E6': 0,
   '9E E5': 0,
   '1F 68': 0,
   '29 2F': 1,
   '76 7D': 1,
   'A4 FE': 0,
   '13 9E': 0,
   '15 F0': 4,
   'A5 E2': 0,
   '37 A0': 0,
   '72 18': 0,
   'F4 BC': 1,
   '17 CF': 0,
   'E0 32': 0,
   '5F F7': 0,
   '00 65': 1,
   '60 EB': 0,
   '8B 7D': 1,
   '35 22': 0,
   'A4 A7': 0,
   '21 18': 0,
   '23 24': 0,
   'B3 3B': 0,
   '62 36': 0,
   'A9 DE': 2,
   '56 B5': 0,
   'F7 20': 0,
   '91 FE': 0,
   'E2 A3': 0,
   'DE FD': 1,
   '5E 9E': 0,
   '54 E9': 0,
   '00 8F': 0,
   '90 E8': 0,
   'FB D1': 0,
   'D7 56': 0,
   '71 10': 0,
   '37 28': 0,
   'AE 6E': 0,
   '9D 23': 1,
   '83 FF': 0,
   '20 C8': 0,
   'F4 86': 0,
   'BD ED': 0,
   'EB 7A': 0,
   '20 8F': 1,
   '36 7A': 1,
   'B1 5C': 0,
   '5B C1': 0,
   '37 8A': 0,
   'F3 31': 0,
   '94 E8': 0,
   '64 51': 0,
   '70 64': 0,
   '29 BD': 0,
   'E8 B0': 0,
   '74 E5': 0,
   '4B E2': 0,
   '29 64': 0,
   '5E 91': 0,
   '92 4C': 0,
   '35 96': 0,
   '9C 53': 0,
   'D7 D0': 1,
   '15 C2': 0,
   'F6 DE': 0,
   '9B A0': 0,
   '35 D1': 0,
   '6C 9C': 0,
   '8B 10': 0,
   'C3 96': 0,
   '18 A9': 0,
   '6C A3': 0,
   'E4 16': 0,
   'B5 ??': 0,
   '6E DC': 0,
   'EB 8F': 0,
   '8A DF': 0,
   '16 36': 0,
   '38 53': 0,
   '59 CA': 0,
   'B6 AD': 0,
   'A1 C5': 0,
   '19 9B': 0,
   'D5 E9': 2,
   'B7 CC': 0,
   '2D AE': 0,
   '85 C7': 0,
   '12 9C': 1,
   '1F 5F': 0,
   '37 6D': 0,
   '5F 15': 0,
   '16 35': 1,
   '37 70': 0,
   'AC 25': 0,
   'AE C9': 0,
   '27 65': 0,
   'F2 66': 0,
   'B1 9B': 0,
   '26 F2': 0,
   'EE 4B': 0,
   '47 70': 0,
   '86 E4': 0,
   '1F 6A': 0,
   '7A 76': 0,
   'F0 56': 0,
   'D4 FC': 0,
   'A7 6D': 0,
   '3A 6F': 0,
   '87 EB': 0,
   'B6 A7': 0,
   'F2 94': 0,
   '5E 58': 0,
   '29 55': 0,
   '78 5F': 0,
   '3E 3C': 0,
   '8C ED': 0,
   'D6 E8': 0,
   'E8 85': 0,
   '8C BE': 1,
   '5A E8': 0,
   'D7 94': 0,
   '61 7D': 1,
   'D6 5D': 0,
   '4D FA': 0,
   '88 80': 0,
   '9E 81': 0,
   '80 7E': 0,
   '8E D3': 0,
   'BE 6A': 0,
   'C6 6F': 0,
   '54 DB': 0,
   '86 18': 0,
   '6E 8F': 0,
   'E5 2C': 0,
   '34 41': 0,
   'EA B6': 0,
   ...},
  'unigrams': {'00': 182,
   '10': 73,
   '11': 105,
   '12': 86,
   '13': 99,
   '14': 125,
   '15': 103,
   '16': 100,
   '17': 125,
   '18': 92,
   '19': 79,
   '1A': 85,
   '1B': 99,
   '1C': 72,
   '1D': 87,
   '1E': 67,
   '1F': 96,
   '20': 86,
   '21': 96,
   '22': 76,
   '23': 76,
   '24': 76,
   '25': 80,
   '26': 87,
   '27': 89,
   '28': 91,
   '29': 84,
   '2A': 91,
   '2B': 92,
   '2C': 104,
   '2D': 95,
   '2E': 92,
   '2F': 107,
   '30': 76,
   '31': 118,
   '32': 80,
   '33': 79,
   '34': 92,
   '35': 82,
   '36': 88,
   '37': 114,
   '38': 150,
   '39': 109,
   '3A': 100,
   '3B': 78,
   '3C': 80,
   '3D': 86,
   '3E': 170,
   '3F': 92,
   '40': 74,
   '41': 87,
   '42': 97,
   '43': 80,
   '44': 82,
   '45': 76,
   '46': 86,
   '47': 105,
   '48': 109,
   '49': 118,
   '4A': 90,
   '4B': 89,
   '4C': 78,
   '4D': 83,
   '4E': 89,
   '4F': 93,
   '50': 98,
   '51': 90,
   '52': 95,
   '53': 73,
   '54': 88,
   '55': 86,
   '56': 86,
   '57': 90,
   '58': 98,
   '59': 94,
   '5A': 91,
   '5B': 94,
   '5C': 76,
   '5D': 80,
   '5E': 91,
   '5F': 85,
   '60': 93,
   '61': 82,
   '62': 68,
   '63': 104,
   '64': 94,
   '65': 88,
   '66': 70,
   '67': 87,
   '68': 80,
   '69': 73,
   '6A': 102,
   '6B': 91,
   '6C': 95,
   '6D': 100,
   '6E': 117,
   '6F': 82,
   '70': 101,
   '71': 92,
   '72': 136,
   '73': 78,
   '74': 172,
   '75': 90,
   '76': 193,
   '77': 96,
   '78': 99,
   '79': 77,
   '7A': 100,
   '7B': 113,
   '7C': 102,
   '7D': 87,
   '7E': 99,
   '7F': 84,
   '80': 88,
   '81': 87,
   '82': 114,
   '83': 120,
   '84': 101,
   '85': 89,
   '86': 84,
   '87': 102,
   '88': 100,
   '89': 94,
   '8A': 124,
   '8B': 154,
   '8C': 130,
   '8D': 121,
   '8E': 79,
   '8F': 84,
   '90': 96,
   '91': 117,
   '92': 120,
   '93': 148,
   '94': 83,
   '95': 93,
   '96': 101,
   '97': 92,
   '98': 99,
   '99': 173,
   '9A': 128,
   '9B': 126,
   '9C': 98,
   '9D': 102,
   '9E': 129,
   '9F': 104,
   '??': 0,
   'A0': 83,
   'A1': 72,
   'A2': 89,
   'A3': 99,
   'A4': 80,
   'A5': 76,
   'A6': 105,
   'A7': 121,
   'A8': 102,
   'A9': 93,
   'AA': 86,
   'AB': 120,
   'AC': 110,
   'AD': 76,
   'AE': 96,
   'AF': 108,
   'B0': 99,
   'B1': 103,
   'B2': 97,
   'B3': 133,
   'B4': 94,
   'B5': 89,
   'B6': 122,
   'B7': 125,
   'B8': 80,
   'B9': 83,
   'BA': 108,
   'BB': 192,
   'BC': 118,
   'BD': 104,
   'BE': 121,
   'BF': 85,
   'C0': 96,
   'C1': 87,
   'C2': 107,
   'C3': 102,
   'C4': 92,
   'C5': 98,
   'C6': 96,
   'C7': 110,
   'C8': 98,
   'C9': 107,
   'CA': 104,
   'CB': 102,
   'CC': 96,
   'CD': 94,
   'CE': 105,
   'CF': 100,
   'D0': 91,
   'D1': 85,
   'D2': 148,
   'D3': 98,
   'D4': 90,
   'D5': 80,
   'D6': 144,
   'D7': 89,
   'D8': 103,
   'D9': 95,
   'DA': 102,
   'DB': 309,
   'DC': 96,
   'DD': 85,
   'DE': 92,
   'DF': 90,
   'E0': 87,
   'E1': 108,
   'E2': 115,
   'E3': 104,
   'E4': 86,
   'E5': 87,
   'E6': 98,
   'E7': 109,
   'E8': 88,
   'E9': 93,
   'EA': 106,
   'EB': 110,
   'EC': 74,
   'ED': 108,
   'EE': 93,
   'EF': 95,
   'F0': 152,
   'F1': 93,
   'F2': 92,
   'F3': 88,
   'F4': 120,
   'F5': 85,
   'F6': 115,
   'F7': 132,
   'F8': 124,
   'F9': 113,
   'FA': 121,
   'FB': 141,
   'FC': 107,
   'FD': 131,
   'FE': 183,
   'FF': 3574}},
 'id': 'HaTioeY3kbvJW2LXtOwF',
 'ida_comments': ['++',
  '|   This filehas been generated by The Interactive Disassembler (IDA)    |',
  '|      Copyright(c) 2013 Hexrays.com>    |',
  '| License info:                    |',
  '|   Microsoft    |',
  '++',
  'Format      :Binary file',
  'Base Address:0000h Range: 0000h - 895Eh Loaded length: 895Eh',
  'Segment type:Pure code'],
 'rand_id': 8340,
 'strings': {'dlls': [],
  'function_calls': [],
  'raw': ['/y\\0',
   'iKot',
   'pB,^',
   '0Vj=',
   '*qT+',
   '1#<B$',
   'QY7z+z',
   'A\tu~|',
   '^ 1^m',
   'y!Y+G',
   ')WZ`',
   'Gmp7',
   's&6$T',
   '4](R',
   '}%$~',
   'FB%:`I',
   'j{)e',
   'GCs0W',
   '-CPV{',
   "'Y2.",
   'Y_/Y(z',
   'Awdx[',
   'a+9(HO',
   'sdAd',
   ',9df',
   'k:jY',
   'X35d',
   '(2\t4',
   '0(nq^',
   'j=T0',
   'rR\tZ',
   '!kqz',
   'FN8(',
   'ZZW,{D',
   'zJe8',
   'k>)k',
   '1&1\tqh.x',
   'wdm:',
   '5` M',
   '_TuFg',
   'I17#@',
   'GK/d',
   '..A5',
   "p'p%b",
   'wLGmh3',
   '7k&-',
   'f}Rgj',
   'S&ne',
   'myX$',
   '=Md5',
   'X.?f(',
   '?=Oo',
   'H&1[E?',
   '6~:j',
   'A\tApx',
   ':^~Yp',
   'uqP.5',
   'u+AZ',
   '9ZoK',
   'w3sq',
   "2HxX'",
   '<W}q',
   '.B1y',
   'aY7',
   'oW1{',
   'Pqbh}&O',
   '9+bP/R<',
   'UuN#',
   '"V\\}Jd',
   'Li`',
   '4$7',
   "iQ'h",
   '|Rrt',
   'l"lg',
   '\\?{k8',
   '&*+IFH:8Dx',
   '3&\tU',
   'd7i%@',
   "'<.n1",
   '/\\A$s',
   'jUoU"',
   '>r\tl^',
   't9(}',
   '[+hO',
   'r=S^',
   '}r/7',
   ')X,!',
   ')%i2',
   'LW]5',
   'clt~',
   ',G0]$',
   '>^!=',
   "['*{v",
   'BrDa',
   '<;F6ntaP2/',
   'SjR.',
   ',xi-',
   ',T>-',
   'Cc,(',
   'OH;5',
   ']jn_h>:',
   '|z22j',
   '>Xe#',
   'a?[y',
   'vM/2@(%',
   'Orj&',
   ']}OU',
   'ixWd+,x',
   'RNfn',
   '|2W$',
   '[*Bx',
   'HjH>k',
   'V=OsK',
   'i*-p',
   '-@ep',
   'nP$',
   '\\uXh',
   '7e`t',
   '~xFr_LNI',
   ';kzd',
   'H>&C',
   '6-hp',
   '0Y,FR',
   '%!=PW',
   '1pJ]',
   ']tEg',
   'tV+h',
   'xKQ+',
   'c=n1',
   'vH1`6',
   'p7I_Q',
   '+Omw',
   '?P4/Rr',
   'fty`',
   "{'8R",
   'O-8e',
   '5" g',
   '-%#hF3',
   '.ponN',
   '.;Rzu',
   ';X\t:',
   '{5-=`X-"K',
   'K`q&',
   'xeZ3',
   '5dNO',
   '4M2?',
   'p1rq',
   'J;73',
   '}&:O',
   '6X;),B',
   '-<VPDI',
   '\\G`w',
   '/Qr"',
   '+v\\ivW',
   'r\t)"\\',
   'Nc,qY',
   '=L"}',
   'akE,]',
   '{-JQ',
   ']:0M',
   '{r7P',
   'pv& Swm7R',
   'nH-VJ',
   'DTq#0k',
   'Lkru',
   '^*Ig-',
   'o]cU',
   'cm\\',
   'XODg',
   'I?>/',
   'C{>r',
   'ZQ g<',
   'hHsm',
   '#x^aHFo',
   '2iDO',
   'p7s+C',
   'w.u^',
   '+Ztn',
   'MODB\\',
   '+APB2',
   'a:IK',
   'ja=c',
   'm]\\7',
   'way6',
   '8&"B!W',
   'Y\too',
   'db9p',
   'd$AM',
   'Z[>.',
   'pl##',
   'vGJ~',
   '^e1{',
   'EFkhYZ',
   "o'Gs-",
   "';9!",
   ']n4p',
   '7 E$',
   'c9dA',
   '2e*',
   'Fp U',
   'w91p',
   'YuB[i',
   'n4n0',
   'QRkx',
   '-]6r',
   'EC>E',
   'pwt8$!',
   'f .N',
   'Nvcv;',
   'PtJF8)j$',
   'V$y.<',
   'Fws9',
   'a~H-P@',
   '{m4q',
   '4)m8',
   '/9r9',
   'l/&G',
   'QQ[S`',
   '*!efw',
   'L^;0',
   ':7xwU',
   'iQ~l',
   'T?.m',
   'cOIf',
   '(<<U',
   'Cc)~7KV#M',
   'kCb-z9',
   'y9rE1',
   '<=2*~',
   'Pj#C',
   'IdH"o(7Y/',
   'sFzy',
   '<2/.',
   ';Nr9P',
   'B5pWx',
   'WR+l<',
   'bd@K',
   ',$/@',
   '=`aH',
   'xQ#3&__',
   'JW/:',
   '(1YH7',
   'O[4^',
   'h>XUG',
   '^KZ(',
   'J`~3',
   '"3\tv',
   '1X){P',
   '[lz6|',
   '@\\9.',
   'm^WVHS',
   '\\~-(',
   'X?}xhZ',
   'Ay2sL',
   'IKP?',
   'exrrz',
   'jVv^',
   '|!/d',
   'VYp%',
   '6wBN9',
   '.HMc',
   ':kNj',
   '70Mx',
   '_!BX@',
   'w&_w',
   'RT\\T',
   '~^[\\',
   '&/z}',
   'S.66',
   '?Hd#T1',
   'r?W]',
   'Tsv[Ka',
   'l%w]',
   'Q5P3',
   'OiYZq',
   'W}/9',
   "R'E/|",
   'XG`p',
   'uDlt',
   '{rs7',
   'yBUH',
   'n/r4',
   's!~Qxb',
   'w~\tLP',
   '=a]>',
   '^0&E\\l',
   'rC=Y',
   '8v7r',
   '8v7r',
   '`?~>',
   '`?~>',
   '8v/v',
   'v/v).']}}

In [9]:
bq.upack_doc(tester, 'bigrams')


Out[9]:
('HaTioeY3kbvJW2LXtOwF',
 4,
 (0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  3,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  3,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  3,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  2,
  0,
  2,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  6,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  3,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  2,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  4,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  ...))

In [15]:
bq.upack_query([tester], 'bigrams', numpy=True)


Out[15]:
(array(['HaTioeY3kbvJW2LXtOwF'], 
       dtype='<U20'), array([4]), array([[0, 0, 0, ..., 0, 0, 0]]))

In [25]:
docs = db.samples.find({'rand_id': {"$exists": True}}, {'id':1, 'class':1, 'hexcode':1})

In [26]:
docs.count()


Out[26]:
10865

In [27]:
ids, classes, unigrams = bq.upack_query(docs, 'unigrams', numpy=True)

In [21]:
classes


Out[21]:
array([2, 8, 9, 9, 1, 6, 2, 2, 2, 6])

In [47]:
from sklearn.ensemble import RandomForestClassifier

import sklearn.cross_validation as cv
import sklearn.metrics as met

In [33]:
rf = RandomForestClassifier(n_estimators=1500, max_depth=15, n_jobs=-1)

In [40]:
uni_train, uni_test, uni_train_lab, uni_test_lab = cv.train_test_split(
        unigrams, classes, test_size=0.33, random_state=42)

In [43]:
%%time
rf.fit(uni_train, uni_train_lab)


CPU times: user 1min 36s, sys: 245 ms, total: 1min 37s
Wall time: 7.71 s
Out[43]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1500, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [46]:
predictions = rf.predict(uni_test)

In [50]:
print(met.classification_report(predictions, uni_test_lab))


             precision    recall  f1-score   support

          1       0.99      0.96      0.97       534
          2       1.00      0.99      1.00       807
          3       1.00      1.00      1.00       980
          4       0.99      0.96      0.98       151
          5       0.83      0.77      0.80        13
          6       0.94      0.95      0.95       233
          7       0.99      1.00      1.00       110
          8       0.93      0.98      0.95       413
          9       0.98      0.99      0.99       345

avg / total       0.98      0.98      0.98      3586


In [51]:
docs = db.samples.find({'rand_id': {"$exists": True}}, {'id':1, 'class':1, 'hexcode':1})
ids, classes, unigrams = bq.upack_query(docs, 'bigrams', numpy=True)

In [135]:
bigrams = unigrams

In [ ]:


In [ ]:
cv.KFold = cv.KFold(n=ids.shape[0], n_folds=5, shuffle=False, random_state=12)

In [ ]:
# this is obnoxious
rf_predicted = []
for train_index, test_index in kf:
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    rf.fit(X_train, y_train)

In [52]:
rf_predicted = cv.cross_val_predict(rf, unigrams, classes, cv=5)

In [ ]:


In [53]:
print(met.classification_report(classes, rf_predicted))


             precision    recall  f1-score   support

          1       0.97      0.99      0.98      1540
          2       1.00      1.00      1.00      2477
          3       1.00      1.00      1.00      2942
          4       0.95      1.00      0.97       475
          5       1.00      0.02      0.05        42
          6       0.88      0.97      0.93       751
          7       0.99      0.95      0.97       398
          8       0.98      0.94      0.96      1228
          9       0.99      0.98      0.98      1012

avg / total       0.98      0.98      0.98     10865


In [ ]:


In [140]:
rf_bigram = RandomForestClassifier(n_estimators=1500, n_jobs=-1, max_depth=15)

In [141]:
rf_bigram_predicted = cv.cross_val_predict(rf_bigram, bigrams, classes, cv=5)

In [142]:
print(met.classification_report(classes, rf_bigram_predicted))


             precision    recall  f1-score   support

          1       0.97      0.99      0.98      1540
          2       1.00      1.00      1.00      2477
          3       1.00      1.00      1.00      2942
          4       0.95      1.00      0.97       475
          5       0.00      0.00      0.00        42
          6       0.88      0.97      0.93       751
          7       0.99      0.95      0.97       398
          8       0.98      0.94      0.96      1228
          9       0.99      0.98      0.98      1012

avg / total       0.98      0.98      0.98     10865


In [143]:
met.confusion_matrix(classes, rf_bigram_predicted)


Out[143]:
array([[1522,    0,    0,    1,    0,   16,    0,    0,    1],
       [   3, 2468,    0,    0,    0,    0,    1,    4,    1],
       [   0,    0, 2942,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,  473,    0,    0,    1,    0,    1],
       [   0,    0,    0,    6,    0,   33,    0,    3,    0],
       [   3,    1,    0,    6,    0,  732,    0,    8,    1],
       [   3,    0,    1,    0,    0,   16,  377,    0,    1],
       [  29,    1,    5,   12,    0,   26,    0, 1149,    6],
       [   7,    1,    0,    0,    0,    7,    0,    5,  992]])

In [ ]:
met.confusion_matrix(classes, rf_bigram.predict(bigrams))

In [ ]:


In [133]:
# unigram testing set
docs = db.test_samples.find({'id': {"$exists": True}}, {'id':1, 'hexcode':1})
docs.count()


Out[133]:
10873

In [ ]:
test_ids, _, test_unigrams = bq.upack_query(docs, 'unigrams', numpy=True, test=True)

In [ ]:


In [ ]:


In [60]:
unigram_predicted_probs = rf.predict_proba(test_unigrams)

In [67]:
import numpy as np

In [72]:
def predict_and_save(X, model, ids, create_submission=False, filename='submission.txt'):
    """
    Predicts a set of 9 probabilities per malware sample, that
    correspond to the 9 malware classes. If `create_submission`
    is True, then a text file named `filename` is created for
    submission into Kaggle.

    Arguments:
    `X`: The data in which predictions will be made.
    `create_submission`: Indicates whether a submission file should
    be created or not.
    `filename`: The file that will contain the submission.
    """
    predicted_prob = model.predict_proba(X)
    if create_submission:
        to_print = np.column_stack((ids,
                                    predicted_prob))
        np.savetxt(filename, to_print, header=','.join(['"Id"'] + \
                            ['"Prediction%d"' % x for x in range(1, 10)]), \
                               fmt='%s', delimiter=',')
    return predicted_prob

In [132]:
ps = predict_and_save(test_unigrams, rf, test_ids, create_submission=True, filename='unigram_rf.txt')

In [145]:



Out[145]:
(10873, 9)

In [ ]:


In [ ]:
docs = db.test_samples.find({'id': {"$exists": True}}, {'id':1, 'hexcode':1})
test_ids, _, test_bigrams = bq.upack_query(docs, 'bigrams', numpy=True, test=True)

In [78]:


In [80]:


In [92]:


In [93]:



Out[93]:
10870

In [85]:


In [87]:


In [ ]:


In [ ]:


In [113]:


In [114]:



Out[114]:
4

In [100]:


In [110]:


In [ ]: