In [ ]:


In [ ]:


In [ ]:


In [1]:
import src.utils.utils as utils


hello

In [ ]:


In [205]:
print(utils.get_mongodb.__doc__)


    Returns the mongodb root database you specify from mongo
    Args:
        db_address (str): url for the machine on which the mongo instance
                    lives; defaults to the dept vm. passing 'andres' will go to
                    andres' machines
        username (string): username for mongo
        Password (string): default to the password
        collection ('string'): the collection you want.
    

In [206]:
malware = utils.get_mongodb(username='populator')

In [ ]:


In [6]:
sampleset = malware.samples.find_one({'class': '5'})

In [4]:
malware.collection_names()


Out[4]:
['reduced', 'samples', 'system.indexes', 'test_samples']

In [44]:
with open('./windows_fnames.dict', 'r') as f:
    fnames = {name.strip() for name in f.readlines()}

In [40]:
'PathCchAppend' in fnames


Out[40]:
True

In [24]:
test_string_filt = malware.samples.find_one({'id': 'cIojVJGQOtrL0S1ApeDY'})

In [25]:
test_string_filt['strings'].keys()


Out[25]:
dict_keys(['raw', 'function_calls'])

In [22]:
import insert_strings as strings

In [23]:
strings.read_strings_file(test_string_filt['id'])


Out[23]:
[]

In [26]:
test_string_filt['strings']


Out[26]:
{'function_calls': [],
 'raw': ['40 $',
  '5$55',
  '! 10',
  '!0%!',
  '*j`*',
  'J`(@`',
  '"(*J',
  '`*`j',
  'B*Bb((',
  '`Bbb',
  '*( @',
  'b``"',
  ':";)',
  '@PHP',
  '800',
  'SSRTW@C',
  'TRVU',
  'DPFWWTDQP',
  'USAA',
  'VUPPBFWWAC',
  'GRBEF',
  'USRTGD',
  'DF@V',
  'TQBR',
  'BCAG',
  'ADGDGB',
  'SQGQ',
  'TBPEV',
  'RB@FFU@',
  'WFVP',
  'GDPTTU',
  'ASFCGQ',
  'VS@S',
  'PCBB',
  'FEQSS',
  '@BB@ND',
  'DLJJJD',
  'NJ@JD',
  'DHL@',
  'NFDJL',
  'FLFJBDN',
  'JHN@',
  'FJDJDD',
  'F@F@NJLH',
  'H@LH',
  'BDNDNF',
  'HDD@',
  'DJDDH@NJFNB@F',
  '@LBL',
  'DDNJJH',
  '@JBN',
  'F@JJ',
  'HHLNN@',
  'DDNBD',
  'DHNBJL',
  '',
  '',
  '',
  '"rd"d',
  '$15$D`',
  '&Q3c',
  'cB\'"',
  '1suv',
  'Q0ug',
  'qv4uc',
  '&dTtq',
  "'sr%",
  '90 8',
  '0991',
  '9 9!9',
  '(918',
  '0 !)',
  '001',
  'K\tCI',
  '@HKI',
  '((0(',
  '($$8',
  'D@Ll`d@H(L$@L',
  '(d(`',
  'dDD`$HDL`d$l(L@H',
  '(,`L`H(h',
  '`llL',
  'HL`@HDD`',
  'hlll',
  'D` @',
  'd@L,`LHl',
  'h(L$Dl',
  'D$,d(lh$h@',
  'D$D@llH(`( $@',
  'h$@@',
  '`,DD',
  '@HlDlLd',
  'l  `hL`LDhhd$ `',
  '@dl(',
  'HD@l$',
  'd hH$',
  'lH$d$h',
  '$Ll$((Ll(l',
  ',h,@l',
  ',h` dH',
  '(d(l',
  '$,d l',
  '(l(D',
  'L@d`$',
  'D dL',
  'h`,D',
  'l`  (h`DD',
  'l(D',
  'HL@H`',
  'h,@hL(DH @``lDH',
  'DD@@$D',
  ',`d(',
  '(LlH,lldh`D',
  'h@,,`(',
  '(`ddH$l@H',
  'Dh,lHd@`@@',
  ', D,$DD`',
  'dA@@',
  '@A@P@@',
  '@PPQ',
  'P@@P',
  'QQPP',
  'P@@P',
  'Q@AA',
  'QAAQ@AAQ@@',
  'A@@P',
  'QQPQ@Q',
  'Q@QQ',
  'QPAQPQAPQP@',
  '@@QQQ',
  '@@QQA',
  'A@PAA',
  'P@QAQP',
  'PTTT',
  '@T@DPP',
  'DDPT',
  'TDTT',
  'DT@TD@@DP',
  'PTPD@',
  '@TT@',
  'PT@DD@',
  '@T@@@D',
  '@DT@',
  'D@DT@',
  'PPPTP',
  'T@P@PP',
  '@T@P@T',
  'TPDPD',
  'j5jbj',
  'jhj8je',
  'jWj0j8',
  "j'jjj",
  'j0jYjL',
  'jLj?',
  'joj)j,',
  'jnjTj}',
  'jqjpjw',
  'jmj=j',
  'jjjbj|',
  'jfjvjo',
  'jcjzjG',
  'j+js',
  'j4jbjc',
  'jhj j_',
  'jajCjm',
  'jrjB',
  'jkjSjj',
  'j5jIjg',
  'jxj7',
  'jZj{',
  'j+j*j',
  'jHjmjm',
  'j7j\\j4',
  'jmj0jA',
  'jHj(j^',
  'jEj3j+',
  'jtjkj?',
  'j~jGjG',
  'j2j\\jY',
  'j^jxj`',
  'jsj%jU',
  'jMj9',
  'jFjVjZ',
  'j7j8',
  'j{j8j"',
  'j8j[',
  'jnj3j',
  'j)jt',
  'jijgjx',
  'j>jY',
  'jZj{j',
  'j^j;jW',
  'j.j+j&',
  "j'j&j2",
  'j"j4',
  'j_j:',
  'jfj6',
  'jVjAjL',
  'jBjHjL',
  'jwj[',
  'j6j[jN',
  'jDj(j"',
  'j{j2jO',
  'j\tjA',
  'j~j~j"',
  'j~jQ',
  'jnjNj>',
  'jmj8j',
  'jBjFjA',
  'j=j}j',
  'jBjtj8',
  'jQjmj',
  'jaj*',
  'jGj]j8',
  'jvj~jW',
  'j"j+',
  'jHjXjZ',
  'jujIjp',
  'jLj}j*',
  'jCj~j',
  'jOjdj]',
  'j#jUj+',
  'jdjCj',
  'jQj\\jj',
  'j(jSjU',
  'j^j*jg',
  'j6j;jg',
  'j\tjpj',
  'j;j1',
  'j\tjxj',
  'j~j{',
  'j j"',
  'j8jWj:',
  'jgjyj_',
  'jRjQjV',
  'jjj7',
  'j+jxj',
  'jcj>j',
  'j\\jojC',
  'j3jJj<',
  'j%juj',
  'j/j]je',
  'j!j:j',
  'jHjQjE',
  'jaj"j$',
  'j@j\\',
  'jNjXj9',
  'jgj#j',
  'j,jqjC',
  'jhj_jj',
  'jXj,j',
  'jpj8',
  'jXj-j>',
  'j|jej',
  'jbjPj',
  'jVj`j/',
  'j,j\\j3',
  'jcj.j',
  'jkj_jE',
  'jqjejI',
  'jUj&j\\',
  'jYj\\',
  "j#jcj'",
  'j=jGj>',
  'jbjOjl',
  'jij,j>',
  'jujS',
  'joj2jH',
  'jsjtj',
  'jfjQ',
  'j7j&jv',
  'j"j{',
  'jZj|j',
  'j~jYj',
  'jEj$ju',
  'j.jaj2',
  'jqjb',
  'jOj%',
  'jnjIjn',
  'j=jKj$',
  'jdj,j*',
  'jwjnj?',
  'j6jiju',
  'mZiU',
  '0Vaw',
  'L^Vh',
  'XPh',
  'pQR3',
  'QRhBi',
  'Vh,A',
  'F)5q',
  'lQR3',
  'QRh,',
  'hmn+',
  'Vh,A',
  'pQR3',
  'PQRh',
  'hnyr^h',
  'O)=z',
  '}P0B',
  ',;f!',
  '+=cKD',
  '%h_s',
  'WPPPPPPP',
  'Wh,A',
  'Rh A',
  'Phq`',
  'nu^V',
  '2[iYQ',
  ')XPh',
  'Wh/B',
  ";5'lD",
  '3=f D',
  'InternetAttemptConnect',
  'HttpSendRequestW',
  'InternetReadFile',
  'InternetQueryOptionW',
  'InternetSetOptionW',
  'InternetAutodial',
  'InternetCloseHandle',
  'HttpQueryInfoW',
  'InternetOpenW',
  'InternetOpenUrlW',
  'WININET.dll',
  'LocalAlloc',
  'GetCurrentProcess',
  'GetCurrentThread',
  'GetFileAttributesW',
  'GetDateFormatW',
  'GetTimeFormatW',
  'GetStartupInfoW',
  'LoadLibraryA',
  'ExpandEnvironmentStringsA',
  'LocalFree',
  'OpenMutexW',
  'DosDateTimeToFileTime',
  'lstrcmpiW',
  'CreateDirectoryW',
  'lstrlenW',
  'GetProcessHeap',
  'GetModuleHandleW',
  'CreateMutexW',
  'SetUnhandledExceptionFilter',
  'GetTickCount',
  'GetCurrentThreadId',
  'IsDebuggerPresent',
  'FindResourceW',
  'LoadResource',
  'WaitForSingleObject',
  'GetCurrentProcessId',
  'GetSystemTime',
  'GetModuleFileNameW',
  'GetExitCodeProcess',
  'QueryPerformanceCounter',
  'GetLastError',
  'CreateEventW',
  'LoadLibraryW',
  'Sleep',
  'lstrlenA',
  'GetTempPathW',
  'FreeLibrary',
  'SetEvent',
  'GetLocaleInfoW',
  'GetVersionExW',
  'MultiByteToWideChar',
  'GetLocalTime',
  'GetVersion',
  'GetProcAddress',
  'KERNEL32.dll',
  'SetTimer',
  'LoadIconW',
  'GetForegroundWindow',
  'GetFocus',
  'IsWindow',
  'CopyRect',
  'SetCapture',
  'GetActiveWindow',
  'MessageBoxW',
  'GetCursorPos',
  'SetWindowLongW',
  'SetCursor',
  'LoadCursorW',
  'GetSystemMetrics',
  'SetForegroundWindow',
  'PostMessageW',
  'FindWindowW',
  'SendMessageW',
  'EnableWindow',
  'InvalidateRect',
  'GetDesktopWindow',
  'LoadImageW',
  'OffsetRect',
  'DestroyIcon',
  'IsChild',
  'MessageBeep',
  'PostMessageA',
  'PeekMessageA',
  'GetKeyboardLayout',
  'GetCapture',
  'RegisterWindowMessageA',
  'AdjustWindowRect',
  'GetKeyboardType',
  'ActivateKeyboardLayout',
  'EndMenu',
  'GetClassInfoExW',
  'SetDlgItemInt',
  'CreateWindowExA',
  'CreateDialogParamW',
  'CharPrevA',
  'CharPrevW',
  'CreateAcceleratorTableW',
  'SetWindowPos',
  'GetMenuStringA',
  'EmptyClipboard',
  'WinHelpW',
  'GetWindowRgn',
  'RegisterWindowMessageW',
  'RegisterClassExA',
  'MonitorFromPoint',
  'EnumClipboardFormats',
  'LoadIconA',
  'GetTopWindow',
  'CreateDialogParamA',
  'DestroyCursor',
  'MonitorFromWindow',
  'GetDlgItemInt',
  'RemoveMenu',
  'MessageBoxIndirectW',
  'ShowCaret',
  'SetActiveWindow',
  'InsertMenuItemW',
  'CreatePopupMenu',
  'InsertMenuA',
  'GetMenu',
  'GetScrollPos',
  'GetMenuItemID',
  'SendDlgItemMessageW',
  'CharNextA',
  'PeekMessageW',
  'EndDialog',
  'UpdateLayeredWindow',
  'USER32.dll',
  'CreateFontIndirectW',
  'CreatePen',
  'CreateCompatibleDC',
  'CreateSolidBrush',
  'GetStockObject',
  'CreatePalette',
  'StretchDIBits',
  'DeleteObject',
  'GetTextExtentPointA',
  'CreateMetaFileW',
  'CreateICA',
  'SetEnhMetaFileBits',
  'GdiGetBatchLimit',
  'GetRasterizerCaps',
  'SetMetaFileBitsEx',
  'CreatePolyPolygonRgn',
  'CreateBitmapIndirect',
  'CreateColorSpaceW',
  'CreateEllipticRgn',
  'CreateFontA',
  'GDI32.dll',
  'GetOpenFileNameW',
  'COMDLG32.dll',
  'ImpersonateSelf',
  'OpenThreadToken',
  'OpenProcessToken',
  'AllocateAndInitializeSid',
  'InitializeSecurityDescriptor',
  'GetLengthSid',
  'InitializeAcl',
  'AddAccessAllowedAce',
  'SetSecurityDescriptorDacl',
  'SetSecurityDescriptorGroup',
  'FileEncryptionStatusW',
  'FreeSid',
  'SetSecurityDescriptorOwner',
  'IsValidSecurityDescriptor',
  'RegOpenKeyExA',
  'RegQueryValueExA',
  'RegDeleteValueW',
  'RegOpenKeyExW',
  'RegQueryValueExW',
  'RegCreateKeyExW',
  'RegSetValueExW',
  'RegCloseKey',
  'AccessCheck',
  'RevertToSelf',
  'DecryptFileW',
  'ADVAPI32.dll',
  'OLEAUT32.dll',
  'ShellExecuteW',
  'SHELL32.dll',
  'NetGetJoinableOUs',
  'netapi32.dll',
  'RasSetEapUserInfo',
  'rasman.dll',
  'PSetupCreateMonitorInfo',
  'PSetupDriverInfoFromName',
  'PSetupInstallMonitor',
  'ntprint.dll',
  '`XxP',
  '!@Y()P',
  'ih0\typ',
  '8Qq9Hi!',
  'dadAe',
  'e DdEA !EdD% !`',
  '%DAAa$%E',
  '`@E!ED',
  '$` A%$!',
  '`!e`!$%',
  'a`E$',
  'E!$e$EE@`',
  '@d$@',
  'AAD !',
  '`@a`$daAA',
  '`D`eA$',
  '@ A',
  'EDaD',
  '$aa@aD',
  'a@%$e',
  'a``E',
  'd@@a',
  'D edaEaEEAE%AE',
  'aE!$d A!A ae',
  '+.\t)',
  '% ,!',
  '@C `!',
  '#!#A',
  '`b!AB',
  '!a#BA` #b`` Ab@#',
  'cb!a"A',
  'ba',
  'CB#cB @Ca CcaA##@',
  'C"@ `#a c!',
  '@CcbA',
  'CBAC`',
  'Bab"',
  '"CCC"A',
  '" B"!A""BA',
  '" B @@',
  'AaCCBa !"@b@" "B#C',
  'AB#`',
  'C`A B`@CCacc@A@aC',
  '!aBA',
  'B"c#',
  'b"a"',
  'C` ` A "',
  '`CAA',
  '6&04',
  '74"%',
  '!075',
  '33"" 232',
  '1407!',
  'c##a',
  '(',
  '(((',
  '((',
  '( ((((',
  '( (',
  '((',
  '( (',
  '(',
  'RJBRR',
  'BNH@',
  'DDDBLN',
  'HBNH',
  'F@LBB@',
  'JNBLB',
  'HN@@B',
  'LJHL',
  'FNNBL@B',
  'EBFGA',
  'AGCD',
  '@DBD',
  'BDFAGDE',
  'DG@@F',
  '@ABGG',
  'FDAC@ADF',
  'BGAE',
  'GFADFD',
  'FB@@E',
  'CGCD',
  'AAAK',
  'PP@@@',
  'PPP@P',
  'PPPP@P@P',
  'PP@@PP',
  'PP@@',
  'PP@P',
  '@PP@PP',
  'p0p`R""',
  'r RRBp@`0Bp',
  '22b0"ppB"``',
  'B2pb2r@',
  'rB2@RB',
  '"0P2',
  '2r" @bp',
  '`PP2 @2Pp"Bp 0B@RP',
  'P Brb',
  'R prr"00',
  '`P2`B"',
  '`R @@rr',
  'rBBB',
  'b@``p`"P',
  'P Bp',
  '22"0p',
  'p`PP',
  '`r0Br B@Bpb',
  'b"@BP',
  'Bbb"0',
  '2 RR0\t)',
  '*(\t"',
  ')#*)*(*',
  '"!"))(',
  '#\t")!',
  '!(\t#)',
  '! !+',
  '"**',
  '!+"") )!',
  'DJBB',
  'B@DJ',
  'w"R&',
  '?B4K',
  'F`"F',
  'gH!',
  '"\'S]',
  '0V|x',
  'u6YoCf*8V',
  'TaOO',
  "_Q.'",
  '-59r',
  'Y\\E*(',
  '5CwM',
  'uu?_',
  '~/~~X',
  'eZS0',
  'X\tSTtTw',
  'nx{N',
  't=S',
  '#^f>',
  'UiF<RP',
  'z#-MHI',
  'j.E=',
  'P-lbM',
  'bD>p',
  '5 +a',
  '[v~S',
  'cQ|@',
  '9h+be;(',
  'hT{v',
  'RxFj',
  'ts1',
  ',l[|',
  'rC}Y)',
  'Qye|',
  'q4r]f',
  '\\UL(`',
  'oo[J)',
  'bsKrf',
  '4jH*Z',
  ']Dch',
  '+^9|?9"C',
  'AwNmC',
  'mcx8',
  '"[|y',
  '|{H3',
  'Y /Y',
  '=J^8',
  ',Hya7',
  'vz4eR~v',
  'a1EO',
  'B8V"',
  'yg.%',
  'Vy2m5Z',
  'X MX`',
  'h:nN',
  '"dmwn',
  ':BV)n)s',
  'a(;tP',
  'jQpsP',
  '$~(V',
  'mx?s',
  '&TFs',
  'Lo}#',
  'N\\|#',
  ']?S=',
  'CZ C',
  'JzlJ',
  'feuP$',
  '00Ps',
  ':ii4',
  '}ReeA',
  "WXZ'=z",
  'I")F',
  '8wJp',
  'h|8}',
  '\\uPz',
  '+%:gKB',
  '|&N,W',
  'V"/H',
  '2r-f',
  'IIk]',
  'EHO*f',
  ':b&)',
  'OiQe',
  '_&<4j',
  '9T*!',
  'v:*6',
  'S)C!^',
  '4vB`+R',
  'w2Pm',
  'ema',
  'V}/R',
  ';x8i',
  'e_#l',
  'O|x_5:',
  '%Gs?',
  '/1TA',
  'C\\|mr',
  'SC)op',
  ')&Y~z',
  'vP<s]',
  '2-a>',
  'BN\tE',
  'p"\\)',
  '{h-9',
  'E@yte',
  'A7[u',
  '#c#_',
  '2TAx:!',
  'jRMk',
  't"?z',
  'nc**',
  'gkqUHk',
  '/Oj,J',
  '&%;"y',
  'aTBWf',
  'f{\tDX',
  'AS2]0b',
  '60d',
  "0%(0&O'",
  'E6qf',
  'S0#1',
  '\\<#"c',
  "'}Mo",
  'yBxt',
  'th#k',
  "/'(e[Q",
  '6H9-3',
  '@$C$Mr',
  '5T:7',
  '`*rE`g',
  'fwOO2',
  '`WJ|\tr',
  'tPRK',
  'D/y"',
  'q!q4',
  '~dxy',
  'sr7_',
  'e]{!',
  '#P*S',
  'UY@,',
  '}ua.<',
  ';.4=',
  '!*|T',
  'TR*FP',
  'B4S`',
  '-\tU@^LL',
  '0&Y/i',
  'Fz(+',
  '_G4A',
  '5L=Y',
  '+NOE',
  "9'}`*",
  'vT#a',
  '$}r^',
  '/g]7u',
  'IZuH',
  '"#_&',
  'NU%5',
  '0Q6X',
  'sdU%8',
  'Pa_C~6@',
  'wN{)#k',
  'th.,',
  ';<gYB',
  'ON,~',
  "q!x&Q'",
  '4{@0"',
  'jEyA',
  '&:/@',
  'j/"\\',
  '--"',
  'WQJ^',
  'S8F{',
  'z?$.',
  '[MKh',
  '*:*\\',
  'J:V=',
  '}fK',
  '6Q{f5',
  '1{MTyN',
  "sP'T",
  'xZM)',
  ':_XiD?',
  'V (rbh',
  'XuQn',
  'QRI"',
  'Va1VU',
  'Qlc,',
  ':|6*',
  '5^hE',
  'cGd.',
  'wD+E',
  '78#G',
  "W.',",
  'wfwL',
  'eaKQ|',
  'uloQ',
  ']@\tb',
  'i4TY9',
  'S[OF',
  'H({y',
  '/#k8o',
  'O\\@zL',
  ">'c$",
  ':rae',
  '4#{F',
  ')YF#',
  'Vmn1',
  'naa~d|',
  'eA1g',
  'Rc!=',
  's;4l',
  '%P]F',
  'le48?',
  'MWW',
  '|:e`',
  'tZgj',
  'pqFK#',
  '#WvOc',
  '1SSOG',
  "X'iT",
  '\\@G]',
  'RAhA',
  'TI\\l',
  'aXbg',
  'u>o&o',
  ')?r(1',
  'xV,27',
  '{+6ud',
  '/`aV',
  '$gJ',
  "R.ea'",
  'krzM',
  'TB)D,0nx',
  'Q9/B',
  'I}<L',
  'p;i',
  'Y_)(',
  '\\K*W@',
  '1fERN',
  'lzRq',
  'vYhC$4',
  'oVHa',
  '@1Kg',
  'xR-#N',
  'V+;w',
  '"p[,',
  'Z7$a\\',
  'xV`!',
  '%vYCk',
  'FH+:*',
  'XlAEe',
  '0a JE(',
  '~[GF',
  'lPK(',
  '(lax',
  'T_f#)',
  'h3i:',
  'Cd};',
  '@#Pn\tQe',
  'b9{$',
  'QKo7',
  '}f/b',
  '1l9+',
  'v\\\tq',
  'NhcQ',
  'ro8-#',
  '\\$=55',
  '!r|4',
  '_JHU',
  '~:k/',
  'EZ7=.',
  'R %V',
  ")B'_/",
  'vEmc',
  '6tN',
  'F3K4t',
  'R?!9',
  '(cQ?',
  ':ct[',
  'Cu=f',
  'i~aM',
  'gLe',
  'u=i3$>',
  'Z_\\Jb3W',
  '\'%Q1"8',
  '^|~v',
  'Avoc',
  '+]PL',
  '^l^p',
  'wSMN',
  'iPr=}',
  '{1<r',
  '@lAF/',
  '.\\%t',
  'ywq]',
  'K%`aq',
  'NNU>W',
  '!|?G1',
  '9Sm{',
  '@_EG',
  'xGGG',
  'Jt%^',
  'rHpFT',
  '?Mv,',
  '"1%#',
  '7F.C',
  '#-*9+',
  ']UcHn',
  'X@bX',
  'Wa:yx',
  '>qWL',
  '%|SG',
  'H_@]0J',
  'Y1sJ',
  "ADH'",
  '\\8Er',
  'KX 9',
  '%7{p',
  'C$xNGV)',
  '^qCk',
  'g,WF',
  'A5T(',
  ',Xe?',
  '}Xh@*',
  "'2\\OZ",
  'Ih&@',
  'KUgH',
  ')D@dl(',
  'b=Hu1',
  'Zc(u',
  'WxwN\\',
  'cZ?C',
  'A#Jj',
  'L`(=',
  '#pK',
  ',0h$h)',
  'rJlw',
  'S+sD',
  'h/-VE',
  'mm7j',
  'G>/3',
  'z@53',
  ',Yl&',
  "!Tx'C",
  ',;7&)BPr',
  '3/Ko',
  '~Prv',
  '7mlP0',
  '1KS',
  'GShM?ZY',
  'E0cW',
  'I:H.',
  '=J/',
  's(^l',
  'I@pf',
  '0Qj8',
  'U>GVD',
  'r#+n',
  'G3#S/a6',
  'm=AN',
  'x(6/Ce',
  'rp"H[',
  'tJZW',
  '>aNE"',
  'n[:<',
  '|P8F',
  "1/g'",
  'xTj$',
  "'uqf",
  'bA!',
  'bve2',
  'P.tUJ5',
  ')D)D',
  '/kY]K',
  'Vu~=L',
  ...]}

In [27]:
test_string_filt['strings']['function_calls'] = [name for name in test_string_filt['strings']['raw'] if name in fnames]

In [36]:
testlist = [name.lower() for name in test_string_filt['strings']['raw'] if 'dll' in name or 'DLL' in name]
testlist


Out[36]:
['wininet.dll',
 'kernel32.dll',
 'user32.dll',
 'gdi32.dll',
 'comdlg32.dll',
 'advapi32.dll',
 'oleaut32.dll',
 'shell32.dll',
 'netapi32.dll',
 'rasman.dll',
 'ntprint.dll']

In [18]:
test_string_filt['strings']['function_calls']


Out[18]:
[]

In [45]:
'InitializeSecurityDescriptor' in fnames


Out[45]:
False

In [51]:
import re

In [155]:
f_stopwords = {
    '#',
    '@',
    '!',
    '%',
    '$',
    '`',
    ' ',
    '*',
    '?',
    '<',
    '>',
    ':',
    '/',
    '\\'
    ',',
    '"',
    '\t',
    "'",
    ')',
    '(',
    '\\\\',
    '&',
    '[',
    ']',
    '_',
    '=',
    '-',
    ';',
    '^',
    ".",
    ",",
    ".",
    '+',
    '{',
    '}',
    '|'
    }

In [ ]:


In [93]:



Out[93]:
['>', '%', '*', '$', '#', '!', '@', ' ', '`', '?', '<']

In [99]:
camel_case = re.compile('([A-Z][a-z]+)')
cam_filter = filter(camel_case.search, test_string_filt['strings']['raw'])

In [136]:
camel_case = re.compile('(^[A-Z][a-z]+)')
nums = re.compile('^[a-zA-Z]+')


cam_filter = filter(camel_case.search, test_string_filt['strings']['raw'])
# [word for stop in f_stopwords for word in cam_filter if stop not in word]

In [162]:
def filter_raw_string(raw_string):
    cam_filter = filter(camel_case.search, raw_string)
    num_filter = filter(nums.search, cam_filter)
    len_filter = filter(lambda x: len(x) > 5, num_filter)
    fin =[]
    for word in len_filter:
        flag = False
        for s in f_stopwords:
            if s in word:
               # print('flag is true')
                flag = True
                break
        if flag == False:
            fin.append(word)
    return fin

In [163]:
filter_raw_string(test_string_filt['strings']['raw'])


Out[163]:
['InternetAttemptConnect',
 'HttpSendRequestW',
 'InternetReadFile',
 'InternetQueryOptionW',
 'InternetSetOptionW',
 'InternetAutodial',
 'InternetCloseHandle',
 'HttpQueryInfoW',
 'InternetOpenW',
 'InternetOpenUrlW',
 'LocalAlloc',
 'GetCurrentProcess',
 'GetCurrentThread',
 'GetFileAttributesW',
 'GetDateFormatW',
 'GetTimeFormatW',
 'GetStartupInfoW',
 'LoadLibraryA',
 'ExpandEnvironmentStringsA',
 'LocalFree',
 'OpenMutexW',
 'DosDateTimeToFileTime',
 'CreateDirectoryW',
 'GetProcessHeap',
 'GetModuleHandleW',
 'CreateMutexW',
 'SetUnhandledExceptionFilter',
 'GetTickCount',
 'GetCurrentThreadId',
 'IsDebuggerPresent',
 'FindResourceW',
 'LoadResource',
 'WaitForSingleObject',
 'GetCurrentProcessId',
 'GetSystemTime',
 'GetModuleFileNameW',
 'GetExitCodeProcess',
 'QueryPerformanceCounter',
 'GetLastError',
 'CreateEventW',
 'LoadLibraryW',
 'GetTempPathW',
 'FreeLibrary',
 'SetEvent',
 'GetLocaleInfoW',
 'GetVersionExW',
 'MultiByteToWideChar',
 'GetLocalTime',
 'GetVersion',
 'GetProcAddress',
 'SetTimer',
 'LoadIconW',
 'GetForegroundWindow',
 'GetFocus',
 'IsWindow',
 'CopyRect',
 'SetCapture',
 'GetActiveWindow',
 'MessageBoxW',
 'GetCursorPos',
 'SetWindowLongW',
 'SetCursor',
 'LoadCursorW',
 'GetSystemMetrics',
 'SetForegroundWindow',
 'PostMessageW',
 'FindWindowW',
 'SendMessageW',
 'EnableWindow',
 'InvalidateRect',
 'GetDesktopWindow',
 'LoadImageW',
 'OffsetRect',
 'DestroyIcon',
 'IsChild',
 'MessageBeep',
 'PostMessageA',
 'PeekMessageA',
 'GetKeyboardLayout',
 'GetCapture',
 'RegisterWindowMessageA',
 'AdjustWindowRect',
 'GetKeyboardType',
 'ActivateKeyboardLayout',
 'EndMenu',
 'GetClassInfoExW',
 'SetDlgItemInt',
 'CreateWindowExA',
 'CreateDialogParamW',
 'CharPrevA',
 'CharPrevW',
 'CreateAcceleratorTableW',
 'SetWindowPos',
 'GetMenuStringA',
 'EmptyClipboard',
 'WinHelpW',
 'GetWindowRgn',
 'RegisterWindowMessageW',
 'RegisterClassExA',
 'MonitorFromPoint',
 'EnumClipboardFormats',
 'LoadIconA',
 'GetTopWindow',
 'CreateDialogParamA',
 'DestroyCursor',
 'MonitorFromWindow',
 'GetDlgItemInt',
 'RemoveMenu',
 'MessageBoxIndirectW',
 'ShowCaret',
 'SetActiveWindow',
 'InsertMenuItemW',
 'CreatePopupMenu',
 'InsertMenuA',
 'GetMenu',
 'GetScrollPos',
 'GetMenuItemID',
 'SendDlgItemMessageW',
 'CharNextA',
 'PeekMessageW',
 'EndDialog',
 'UpdateLayeredWindow',
 'CreateFontIndirectW',
 'CreatePen',
 'CreateCompatibleDC',
 'CreateSolidBrush',
 'GetStockObject',
 'CreatePalette',
 'StretchDIBits',
 'DeleteObject',
 'GetTextExtentPointA',
 'CreateMetaFileW',
 'CreateICA',
 'SetEnhMetaFileBits',
 'GdiGetBatchLimit',
 'GetRasterizerCaps',
 'SetMetaFileBitsEx',
 'CreatePolyPolygonRgn',
 'CreateBitmapIndirect',
 'CreateColorSpaceW',
 'CreateEllipticRgn',
 'CreateFontA',
 'GetOpenFileNameW',
 'ImpersonateSelf',
 'OpenThreadToken',
 'OpenProcessToken',
 'AllocateAndInitializeSid',
 'InitializeSecurityDescriptor',
 'GetLengthSid',
 'InitializeAcl',
 'AddAccessAllowedAce',
 'SetSecurityDescriptorDacl',
 'SetSecurityDescriptorGroup',
 'FileEncryptionStatusW',
 'FreeSid',
 'SetSecurityDescriptorOwner',
 'IsValidSecurityDescriptor',
 'RegOpenKeyExA',
 'RegQueryValueExA',
 'RegDeleteValueW',
 'RegOpenKeyExW',
 'RegQueryValueExW',
 'RegCreateKeyExW',
 'RegSetValueExW',
 'RegCloseKey',
 'AccessCheck',
 'RevertToSelf',
 'DecryptFileW',
 'ShellExecuteW',
 'NetGetJoinableOUs',
 'RasSetEapUserInfo',
 'Vy2m5Z',
 'MmxxSgZfMiZgToU',
 'Xociqegurukovav',
 'Bupifik',
 'Fycasuhosujub',
 'Megukacahytu',
 'Jywomu',
 'IkKWeQy',
 'YbjLaxF1l']

In [110]:
w = ['B*Bb((',
 'B*Bb((',
 'B*Bb((',
 '`Bbb',
 '`Bbb',
 '`Bbb',
 '`Bbb']
[a for a in w if a not in f_stopwords]


Out[110]:
['B*Bb((', 'B*Bb((', 'B*Bb((', '`Bbb', '`Bbb', '`Bbb', '`Bbb']

In [111]:
'*' in w[0]


Out[111]:
True

In [ ]:


In [115]:
fin =[]
for word in w:
    flag = False
    for s in f_stopwords:
        if s in word:
            flag = True
    if flag == False:
        fin.append(word)
        
        
    
fin


Out[115]:
[]

In [189]:
t = malware.samples.find_one({'this.strings.function_names.length > 1'})


---------------------------------------------------------------------------
InvalidDocument                           Traceback (most recent call last)
<ipython-input-189-234225bb4798> in <module>()
----> 1 t = malware.samples.find_one({'this.strings.function_names.length > 1'})

/usr/local/lib/python3.4/site-packages/pymongo/collection.py in find_one(self, spec_or_id, *args, **kwargs)
    722                            *args, **kwargs).max_time_ms(max_time_ms)
    723 
--> 724         for result in cursor.limit(-1):
    725             return result
    726         return None

/usr/local/lib/python3.4/site-packages/pymongo/cursor.py in __next__(self)
   1074             raise StopIteration
   1075         db = self.__collection.database
-> 1076         if len(self.__data) or self._refresh():
   1077             if self.__manipulate:
   1078                 return db._fix_outgoing(self.__data.popleft(),

/usr/local/lib/python3.4/site-packages/pymongo/cursor.py in _refresh(self)
   1018                               self.__skip, ntoreturn,
   1019                               self.__query_spec(), self.__fields,
-> 1020                               self.__uuid_subtype))
   1021             if not self.__id:
   1022                 self.__killed = True

InvalidDocument: Cannot encode object: {'this.strings.function_names.length > 1'}

In [ ]:
test = malware.samples.find({'id': 'K70AVwRbXjTFZv9g463D'})

In [211]:
t = malware.samples.find( {'$where' : 'this.strings.raw.length > 1'} )

In [195]:
t = malware.samples.find_one({'class':'3'})

In [208]:
t.count()


Out[208]:
6383

In [212]:
[len(d['strings']['function_calls']) for d in t[0:100]]


Out[212]:
[0,
 163,
 156,
 163,
 165,
 158,
 92,
 156,
 158,
 165,
 53,
 157,
 162,
 167,
 152,
 37,
 162,
 166,
 161,
 158,
 165,
 164,
 168,
 160,
 67,
 163,
 164,
 162,
 159,
 31,
 31,
 73,
 162,
 160,
 92,
 107,
 160,
 156,
 153,
 165,
 166,
 57,
 163,
 163,
 156,
 31,
 164,
 115,
 163,
 164,
 93,
 159,
 164,
 170,
 162,
 185,
 163,
 157,
 164,
 216,
 165,
 161,
 162,
 160,
 155,
 31,
 101,
 32,
 159,
 88,
 130,
 212,
 159,
 161,
 166,
 169,
 160,
 1054,
 189,
 76,
 165,
 159,
 161,
 166,
 162,
 160,
 164,
 152,
 168,
 164,
 195,
 240,
 121,
 168,
 240,
 32,
 39,
 31,
 167,
 162]

In [ ]:


In [234]:
no_strings = malware.samples.find_one( {'strings' : {'$exists': True} } )

In [235]:
no_strings['strings']


Out[235]:
{'dlls': [], 'function_calls': [], 'raw': []}

In [243]:
no_strings = malware.samples.find( {'strings.raw' : [] } )

In [244]:
no_strings.count()


Out[244]:
4482

In [ ]:


In [245]:
no_string_filenames = [doc['id']+'.asm' for doc in no_strings]


---------------------------------------------------------------------------
CursorNotFound                            Traceback (most recent call last)
<ipython-input-245-d318f6f292ab> in <module>()
----> 1 no_string_filenames = [doc['id']+'.asm' for doc in no_strings]

<ipython-input-245-d318f6f292ab> in <listcomp>(.0)
----> 1 no_string_filenames = [doc['id']+'.asm' for doc in no_strings]

/usr/local/lib/python3.4/site-packages/pymongo/cursor.py in __next__(self)
   1074             raise StopIteration
   1075         db = self.__collection.database
-> 1076         if len(self.__data) or self._refresh():
   1077             if self.__manipulate:
   1078                 return db._fix_outgoing(self.__data.popleft(),

/usr/local/lib/python3.4/site-packages/pymongo/cursor.py in _refresh(self)
   1035                 self.__send_message(
   1036                     message.get_more(self.__collection.full_name,
-> 1037                                      limit, self.__id))
   1038 
   1039         else:  # Cursor id is zero nothing else to return

/usr/local/lib/python3.4/site-packages/pymongo/cursor.py in __send_message(self, message)
    956                                                 self.__tz_aware,
    957                                                 self.__uuid_subtype,
--> 958                                                 self.__compile_re)
    959         except OperationFailure:
    960             self.__killed = True

/usr/local/lib/python3.4/site-packages/pymongo/helpers.py in _unpack_response(response, cursor_id, as_class, tz_aware, uuid_subtype, compile_re)
     99 
    100         raise CursorNotFound("cursor id '%s' not valid at server" %
--> 101                              cursor_id)
    102     elif response_flag & 2:
    103         error_object = bson.BSON(response[20:]).decode()

CursorNotFound: cursor id '9400156971' not valid at server

In [ ]:


In [ ]:
with open('no_string_files', 'w') as f:
    [f.write(line) for line in no_string_filenames]

In [ ]:


In [282]:
import glob
import gzip

In [246]:
with open ('../../../data/sample/0A32eTdBKayjCWhZqDOQ.bytes', 'rb') as f:
    byte = f.readlines()

In [344]:
def get_file(data_path='../../../data/sample/'):
    files = glob.glob(data_path + '*.bytes')
    for doc in files:
        with open(doc, 'rb') as f:
            f = [line.strip().decode('utf-8').split(' ') for line in f.readlines()]
            yield f

In [345]:
file_stream = get_file()

In [280]:
import itertools
import os.path
import re
import tarfile
import time

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams

from sklearn.externals.six.moves import html_parser
from sklearn.externals.six.moves import urllib
from sklearn.datasets import get_data_home
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import MultinomialNB

In [317]:
vectorizer = HashingVectorizer(decode_error='ignore', n_features=2 ** 18,
                               non_negative=True)

In [346]:
a = next(file_stream)

In [369]:



mike sucks
aaron sucks
andres sucks

In [370]:



mike sucks
aaron sucks
andres sucks
Out[370]:
[None, None, None]

In [320]:
x_train = []
for doc in file_stream:
    x_train = vectorizer.fit_transform(doc)

In [325]:
x_train.shape


Out[325]:
(98816, 262144)

In [330]:
print(x_train[2000:2500, 200:2500])


  (21, 1746)	0.19245008973
  (316, 606)	0.185695338177

In [333]:
from sklearn.decomposition import SparsePCA

sp = SparsePCA()

In [335]:
sp


Out[335]:
SparsePCA(U_init=None, V_init=None, alpha=1, max_iter=1000, method='lars',
     n_components=None, n_jobs=1, random_state=None, ridge_alpha=0.01,
     tol=1e-08, verbose=False)

In [364]:
from gensim.models import word2vec
workers = 4
context = 10

In [366]:
mod = word2vec.Word2Vec(a, workers=workers, window=10)

In [367]:
mod.most_similar('EF')


Out[367]:
[('5F', 0.774681806564331),
 ('BF', 0.7711033821105957),
 ('AF', 0.7676060199737549),
 ('DF', 0.7602571249008179),
 ('C3', 0.7544677257537842),
 ('F8', 0.7522661089897156),
 ('FB', 0.740742564201355),
 ('A2', 0.7403229475021362),
 ('DE', 0.7361049652099609),
 ('F7', 0.7353725433349609)]

In [368]:
mod.syn0.shape


Out[368]:
(257, 100)

In [371]:
[0]*16**2


Out[371]:
[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [378]:


In [382]:



Out[382]:
{'0',
 '1',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '1a',
 '1b',
 '1c',
 '1d',
 '1e',
 '1f',
 '2',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '2a',
 '2b',
 '2c',
 '2d',
 '2e',
 '2f',
 '3',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '3a',
 '3b',
 '3c',
 '3d',
 '3e',
 '3f',
 '4',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '4a',
 '4b',
 '4c',
 '4d',
 '4e',
 '4f',
 '5',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '5a',
 '5b',
 '5c',
 '5d',
 '5e',
 '5f',
 '6',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '6a',
 '6b',
 '6c',
 '6d',
 '6e',
 '6f',
 '7',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '7a',
 '7b',
 '7c',
 '7d',
 '7e',
 '7f',
 '8',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '8a',
 '8b',
 '8c',
 '8d',
 '8e',
 '8f',
 '9',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99',
 '9a',
 '9b',
 '9c',
 '9d',
 '9e',
 '9f',
 'a',
 'a0',
 'a1',
 'a2',
 'a3',
 'a4',
 'a5',
 'a6',
 'a7',
 'a8',
 'a9',
 'aa',
 'ab',
 'ac',
 'ad',
 'ae',
 'af',
 'b',
 'b0',
 'b1',
 'b2',
 'b3',
 'b4',
 'b5',
 'b6',
 'b7',
 'b8',
 'b9',
 'ba',
 'bb',
 'bc',
 'bd',
 'be',
 'bf',
 'c',
 'c0',
 'c1',
 'c2',
 'c3',
 'c4',
 'c5',
 'c6',
 'c7',
 'c8',
 'c9',
 'ca',
 'cb',
 'cc',
 'cd',
 'ce',
 'cf',
 'd',
 'd0',
 'd1',
 'd2',
 'd3',
 'd4',
 'd5',
 'd6',
 'd7',
 'd8',
 'd9',
 'da',
 'db',
 'dc',
 'dd',
 'de',
 'df',
 'e',
 'e0',
 'e1',
 'e2',
 'e3',
 'e4',
 'e5',
 'e6',
 'e7',
 'e8',
 'e9',
 'ea',
 'eb',
 'ec',
 'ed',
 'ee',
 'ef',
 'f',
 'f0',
 'f1',
 'f2',
 'f3',
 'f4',
 'f5',
 'f6',
 'f7',
 'f8',
 'f9',
 'fa',
 'fb',
 'fc',
 'fd',
 'fe',
 'ff'}

In [ ]: