In [1]:
import src.utils.utils as utils
import gensim
from gensim.models.word2vec import Word2Vec
import sklearn.cross_validation as cv
from sklearn.ensemble import RandomForestClassifier
import numpy as np # Make sure that numpy is imported
hello
In [2]:
# Obtain the project's MongoDB handle from the local utils helper.
# NOTE(review): presumably a database object, since later cells access
# `db.samples` — confirm against src/utils/utils.py.
db = utils.get_mongodb()
In [14]:
# Select the documents in `samples` whose 'class' field is the string '5'
# (the label is matched as a string, not as an integer).
# NOTE(review): presumably a lazy pymongo-style cursor rather than a list —
# confirm; if so it can only be iterated once.
class5 = db.samples.find({'class':'5'})
In [15]:
# Collect the list stored under doc['calls']['calls'] for every class-5
# document.
# `class5` is a query cursor that is exhausted after one pass; rewinding it
# first keeps this cell idempotent, so re-running it never silently yields
# an empty list.  (Assumes a pymongo Cursor — TODO confirm.)
class5.rewind()
_comments = [doc['calls']['calls'] for doc in class5]
In [6]:
from gensim import corpora, models, similarities
In [7]:
# Build one token list per sample for gensim: every call signature of a
# sample is split on whitespace and the pieces are concatenated, because
# corpora.Dictionary expects an iterable of documents, each a list of tokens.
# The for-clauses of a comprehension run left to right, so the loop that
# binds `line` must come before the `line.split()` clause that uses it.
comments = [
    [word for line in comment for word in line.split()]
    for comment in _comments
]
dictionary = corpora.Dictionary(comments)
In [23]:
# Display the call-signature list extracted for the first class-5 sample.
_comments[0]
Out[23]:
['HANDLE __stdcall GetCurrentProcess()',
'UINT __stdcall GetSystemDirectoryW(LPWSTR lpBuffer, UINT uSize)',
'BOOL __stdcall GetVersionExA(LPOSVERSIONINFOAlpVersionInformation)',
'HANDLE __stdcall GetProcessHeap()',
'void __stdcall GetStartupInfoW(LPSTARTUPINFOWlpStartupInfo)',
'LPTOP_LEVEL_EXCEPTION_FILTER __stdcall SetUnhandledExceptionFilter(LPTOP_LEVEL_EXCEPTION_FILTER lpTopLevelExceptionFilter)',
'BOOL __stdcall IsDebuggerPresent()',
'BOOL __stdcall GetCPInfo(UINTCodePage, LPCPINFO lpCPInfo)',
'UINT __stdcall GetACP()',
'UINT __stdcall GetOEMCP()',
'BOOL __stdcall IsValidCodePage(UINT CodePage)',
'HMODULE __stdcall GetModuleHandleA(LPCSTR lpModuleName)',
'HANDLE __stdcall HeapCreate(DWORD flOptions, SIZE_T dwInitialSize, SIZE_T dwMaximumSize)',
'void __stdcall ExitProcess(UINT uExitCode)',
'void __stdcall GetStartupInfoA(LPSTARTUPINFOAlpStartupInfo)',
'BOOL __stdcall QueryPerformanceCounter(LARGE_INTEGER *lpPerformanceCount)',
'BOOL __stdcall GetStringTypeA(LCID Locale, DWORD dwInfoType, LPCSTR lpSrcStr,int cchSrc, LPWORD lpCharType)',
'BOOL __stdcall GetStringTypeW(DWORD dwInfoType, LPCWSTR lpSrcStr, intcchSrc,LPWORD lpCharType)',
'void __stdcall Sleep(DWORD dwMilliseconds)',
'void __stdcall InitializeCriticalSection(LPCRITICAL_SECTION lpCriticalSection)',
'HMODULE __stdcall LoadLibraryA(LPCSTRlpLibFileName)',
'HANDLE __stdcall CreateMutexA(LPSECURITY_ATTRIBUTES lpMutexAttributes, BOOL bInitialOwner, LPCSTR lpName)',
'FARPROC __stdcall GetProcAddress(HMODULE hModule, LPCSTR lpProcName)',
'UINT __stdcall EnumClipboardFormats(UINT format)',
'UINT_PTR __stdcall SetTimer(HWND hWnd, UINT_PTR nIDEvent, UINT uElapse, TIMERPROC lpTimerFunc)',
'UINT __stdcall RegisterWindowMessageA(LPCSTR lpString)',
'ATOM __stdcall RegisterClassExW(constWNDCLASSEXW *)',
'HBITMAP __stdcall LoadBitmapW(HINSTANCE hInstance, LPCWSTR lpBitmapName)',
'BOOL __stdcall OpenClipboard(HWND hWndNewOwner)',
'INT_PTR __stdcall DialogBoxIndirectParamA(HINSTANCE hInstance, LPCDLGTEMPLATEA hDialogTemplate, HWND hWndParent, DLGPROC lpDialogFunc, LPARAMdwInitParam)',
'HANDLE __stdcall LoadImageA(HINSTANCEhInst, LPCSTR name, UINT type, int cx, int cy, UINT fuLoad)',
'BOOL __stdcall AppendMenuW(HMENU hMenu, UINT uFlags, UINT_PTRuIDNewItem, LPCWSTR lpNewItem)',
'UINT __stdcall GetMenuState(HMENU hMenu, UINTuId, UINT uFlags)',
'INT_PTR __stdcall DialogBoxParamW(HINSTANCE hInstance, LPCWSTR lpTemplateName, HWND hWndParent, DLGPROC lpDialogFunc,LPARAM dwInitParam)',
'HCURSOR __stdcall SetCursor(HCURSOR hCursor)',
'BOOL __stdcall InvalidateRect(HWND hWnd, const RECT *lpRect, BOOL bErase)',
'BOOL __stdcall DestroyMenu(HMENU hMenu)',
'BOOL __stdcall IsChild(HWND hWndParent, HWND hWnd)',
'BOOL __stdcall GetMenuItemInfoA(HMENUhmenu, UINT item, BOOL fByPosition, LPMENUITEMINFOA lpmii)',
'HWND __stdcall SetParent(HWNDhWndChild, HWNDhWndNewParent)',
'INT_PTR __stdcall DialogBoxParamA(HINSTANCE hInstance, LPCSTRlpTemplateName,HWND hWndParent, DLGPROC lpDialogFunc, LPARAM dwInitParam)',
'HACCEL __stdcall CreateAcceleratorTableA(LPACCEL paccel, int cAccel)',
'BOOL __stdcall MessageBeep(UINT uType)',
'HWND __stdcall GetActiveWindow()',
'LSTATUS __stdcall RegSetValueExW(HKEYhKey, LPCWSTR lpValueName, DWORD Reserved, DWORD dwType, const BYTE *lpData, DWORD cbData)',
'LSTATUS __stdcall RegCloseKey(HKEY hKey)',
'LSTATUS __stdcall RegCreateKeyExW(HKEY hKey, LPCWSTR lpSubKey, DWORD Reserved, LPWSTRlpClass, DWORD dwOptions, REGSAM samDesired, const LPSECURITY_ATTRIBUTES lpSecurityAttributes, PHKEY phkResult,LPDWORDlpdwDisposition)',
'LSTATUS __stdcall RegOpenKeyExW(HKEY hKey, LPCWSTR lpSubKey, DWORD ulOptions,REGSAM samDesired, PHKEY phkResult)',
'LSTATUS __stdcall RegDeleteKeyW(HKEY hKey, LPCWSTR lpSubKey)',
'BOOL __stdcall CloseServiceHandle(SC_HANDLE hSCObject)',
'BOOL __stdcall StartServiceW(SC_HANDLE hService, DWORD dwNumServiceArgs, LPCWSTR *lpServiceArgVectors)',
'SC_HANDLE __stdcall OpenServiceW(SC_HANDLE hSCManager, LPCWSTR lpServiceName,DWORD dwDesiredAccess)',
'SC_HANDLE __stdcall OpenSCManagerW(LPCWSTR lpMachineName, LPCWSTR lpDatabaseName, DWORD dwDesiredAccess)',
'HPALETTE __stdcall CreatePalette(const LOGPALETTE *plpal)',
'BOOL __stdcall RemoveFontResourceExA(LPCSTR name, DWORD fl, PVOID pdv)',
'HENHMETAFILE __stdcall SetEnhMetaFileBits(UINT nSize,const BYTE *pb)',
'HMETAFILE __stdcall GetMetaFileA(LPCSTR lpName)',
'HRGN __stdcall CreateRectRgn(int x1, int y1, int x2, int y2)',
'HRGN __stdcall CreatePolygonRgn(constPOINT *pptl, int cPoint, int iMode)']
In [35]:
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-35-a2c2645c7bcb> in <module>()
----> 1 comments = [word for word in line.split() for line in comment for comment in _comments]
NameError: name 'line' is not defined
In [33]:
Out[33]:
['int __cdecl sub_401000(SIZE_T dwBytes)',
'int __cdecl sub_401018(LPVOID lpMem, size_t)',
'int __cdecl sub_401516(HANDLE hNamedPipe, LPOVERLAPPEDlpOverlapped)',
'int __cdecl sub_401CBB(void *)',
'int __stdcall WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine, int nShowCmd)',
'int __cdecl sub_403000(void *,void *)',
'int __stdcall sub_40301C(LPCSTR lpProcName, int)',
'BOOL __stdcall GetTokenInformation(HANDLE TokenHandle, TOKEN_INFORMATION_CLASS TokenInformationClass,LPVOID TokenInformation, DWORD TokenInformationLength, PDWORD ReturnLength)',
'BOOL __stdcall InitializeSecurityDescriptor(PSECURITY_DESCRIPTOR pSecurityDescriptor,DWORD dwRevision)',
'LSTATUS __stdcall RegCloseKey(HKEY hKey)',
'LSTATUS __stdcall RegCreateKeyExA(HKEY hKey, LPCSTR lpSubKey,DWORD Reserved,LPSTR lpClass, DWORD dwOptions,REGSAM samDesired, const LPSECURITY_ATTRIBUTES lpSecurityAttributes, PHKEY phkResult, LPDWORD lpdwDisposition)',
'LSTATUS __stdcall RegEnumValueA(HKEY hKey, DWORD dwIndex, LPSTR lpValueName, LPDWORD lpcchValueName, LPDWORD lpReserved, LPDWORD lpType, LPBYTE lpData, LPDWORD lpcbData)',
'LSTATUS __stdcall RegQueryValueExA(HKEY hKey,LPCSTR lpValueName, LPDWORD lpReserved,LPDWORDlpType,LPBYTE lpData, LPDWORD lpcbData)',
'LSTATUS __stdcall RegOpenKeyA(HKEY hKey, LPCSTR lpSubKey, PHKEY phkResult)',
'LSTATUS __stdcall RegNotifyChangeKeyValue(HKEY hKey, BOOL bWatchSubtree, DWORD dwNotifyFilter, HANDLEhEvent,BOOL fAsynchronous)',
'BOOL __stdcall AllocateAndInitializeSid(PSID_IDENTIFIER_AUTHORITY pIdentifierAuthority, BYTE nSubAuthorityCount, DWORD nSubAuthority0, DWORD nSubAuthority1, DWORD nSubAuthority2, DWORD nSubAuthority3, DWORD nSubAuthority4, DWORD nSubAuthority5, DWORD nSubAuthority6, DWORD nSubAuthority7, PSID*pSid)',
'BOOL __stdcall ConnectNamedPipe(HANDLE hNamedPipe, LPOVERLAPPED lpOverlapped)',
'void __stdcall OutputDebugStringA(LPCSTR lpOutputString)',
'BOOL __stdcall IsBadWritePtr(LPVOID lp, UINT_PTR ucb)',
'BOOL __stdcall IsBadStringPtrA(LPCSTRlpsz, UINT_PTR ucchMax)',
'BOOL __stdcall IsBadReadPtr(const void *lp, UINT_PTR ucb)',
'BOOL __stdcall GetVersionExA(LPOSVERSIONINFOAlpVersionInformation)',
'void __stdcall GetSystemInfo(LPSYSTEM_INFO lpSystemInfo)',
'void __stdcall GetStartupInfoA(LPSTARTUPINFOAlpStartupInfo)',
'void __stdcall GetLocalTime(LPSYSTEMTIME lpSystemTime)',
'BOOL __stdcall GetHandleInformation(HANDLE hObject, LPDWORD lpdwFlags)',
'BOOL __stdcall GetFileTime(HANDLE hFile, LPFILETIME lpCreationTime, LPFILETIME lpLastAccessTime, LPFILETIME lpLastWriteTime)',
'BOOL __stdcall GetComputerNameA(LPSTRlpBuffer, LPDWORD nSize)',
'UINT __stdcall GetACP()',
'BOOL __stdcall FreeLibrary(HMODULE hLibModule)',
'HMODULE __stdcall LoadLibraryA(LPCSTRlpLibFileName)',
'BOOL __stdcall SetEnvironmentVariableA(LPCSTRlpName,LPCSTR lpValue)',
'BOOL __stdcall FreeEnvironmentStringsA(LPCH)',
'LPCH __stdcall GetEnvironmentStrings()',
'BOOL __stdcall SetTimeZoneInformation(const TIME_ZONE_INFORMATION *lpTimeZoneInformation)',
'void __stdcall InitializeCriticalSection(LPCRITICAL_SECTION lpCriticalSection)',
'void __stdcall DeleteCriticalSection(LPCRITICAL_SECTION lpCriticalSection)',
'void __stdcall LeaveCriticalSection(LPCRITICAL_SECTION lpCriticalSection)',
'void __stdcall EnterCriticalSection(LPCRITICAL_SECTION lpCriticalSection)',
'LONG __stdcall InterlockedCompareExchange(volatile LONG *Destination,LONG Exchange, LONG Comperand)',
'LONG __stdcall InterlockedIncrement(volatile LONG *lpAddend)',
'BOOL __stdcall GlobalUnlock(HGLOBAL hMem)',
'BOOL __stdcall SetEvent(HANDLE hEvent)',
'HGLOBAL __stdcall GlobalReAlloc(HGLOBAL hMem,SIZE_T dwBytes,UINT uFlags)',
'void __stdcall ExitThread(DWORD dwExitCode)',
'void __stdcall ExitProcess(UINT uExitCode)',
'HANDLE __stdcall CreateThread(LPSECURITY_ATTRIBUTES lpThreadAttributes, SIZE_T dwStackSize, LPTHREAD_START_ROUTINE lpStartAddress, LPVOID lpParameter, DWORD dwCreationFlags,LPDWORDlpThreadId)',
'ATOM __stdcall GlobalFindAtomA(LPCSTRlpString)',
'ATOM __stdcall GlobalDeleteAtom(ATOM nAtom)',
'ATOM __stdcall GlobalAddAtomA(LPCSTR lpString)',
'ATOM __stdcall DeleteAtom(ATOM nAtom)',
'ATOM __stdcall AddAtomA(LPCSTR lpString)',
'UINT __stdcall GetWindowsDirectoryA(LPSTR lpBuffer, UINT uSize)',
'BOOL __stdcall DeleteFileA(LPCSTR lpFileName)',
'BOOL __stdcall FlushFileBuffers(HANDLE hFile)',
'BOOL __stdcall CloseHandle(HANDLE hObject)',
'HANDLE __stdcall CreateFileA(LPCSTR lpFileName, DWORDdwDesiredAccess, DWORD dwShareMode, LPSECURITY_ATTRIBUTES lpSecurityAttributes,DWORD dwCreationDisposition, DWORD dwFlagsAndAttributes, HANDLEhTemplateFile)',
'BOOL __stdcall CopyFileA(LPCSTR lpExistingFileName, LPCSTR lpNewFileName, BOOL bFailIfExists)',
'BOOL __stdcall SetThreadPriority(HANDLE hThread, int nPriority)',
'BOOL __stdcall ResetEvent(HANDLE hEvent)',
'HANDLE __stdcall OpenEventA(DWORD dwDesiredAccess, BOOL bInheritHandle, LPCSTR lpName)',
'BOOL __stdcall MoveFileA(LPCSTR lpExistingFileName, LPCSTR lpNewFileName)',
'LPVOID __stdcall HeapReAlloc(HANDLE hHeap, DWORD dwFlags, LPVOID lpMem, SIZE_T dwBytes)',
'LPVOID __stdcall TlsGetValue(DWORD dwTlsIndex)',
'BOOL __stdcall TlsSetValue(DWORD dwTlsIndex, LPVOID lpTlsValue)',
'BOOL __stdcall TlsFree(DWORD dwTlsIndex)',
'HMODULE __stdcall GetModuleHandleA(LPCSTR lpModuleName)',
'BOOL __stdcall GetSystemTimeAdjustment(PDWORDlpTimeAdjustment, PDWORD lpTimeIncrement, PBOOLlpTimeAdjustmentDisabled)',
'BOOL __stdcall GetStringTypeA(LCID Locale, DWORD dwInfoType, LPCSTR lpSrcStr,int cchSrc, LPWORD lpCharType)',
'LPVOID __stdcall VirtualAlloc(LPVOID lpAddress, SIZE_T dwSize, DWORD flAllocationType, DWORD flProtect)',
'UINT __stdcall GetOEMCP()',
'BOOL __stdcall GetCPInfo(UINTCodePage, LPCPINFO lpCPInfo)',
'BOOL __stdcall DisconnectNamedPipe(HANDLE hNamedPipe)',
'HANDLE __stdcall CreateEventA(LPSECURITY_ATTRIBUTES lpEventAttributes, BOOL bManualReset, BOOL bInitialState,LPCSTR lpName)',
'HANDLE __stdcall CreateNamedPipeA(LPCSTR lpName, DWORD dwOpenMode, DWORD dwPipeMode, DWORD nMaxInstances, DWORD nOutBufferSize, DWORDnInBufferSize, DWORD nDefaultTimeOut, LPSECURITY_ATTRIBUTES lpSecurityAttributes)',
'BOOL __stdcall GetOverlappedResult(HANDLE hFile, LPOVERLAPPEDlpOverlapped, LPDWORD lpNumberOfBytesTransferred, BOOL bWait)',
'BOOL __stdcall ReadFile(HANDLE hFile,LPVOID lpBuffer, DWORD nNumberOfBytesToRead, LPDWORD lpNumberOfBytesRead, LPOVERLAPPED lpOverlapped)',
'BOOL __stdcall WriteFile(HANDLE hFile, LPCVOID lpBuffer, DWORD nNumberOfBytesToWrite,LPDWORDlpNumberOfBytesWritten,LPOVERLAPPED lpOverlapped)',
'BOOL __stdcall HeapFree(HANDLE hHeap,DWORD dwFlags, LPVOID lpMem)',
'HANDLE __stdcall GetProcessHeap()',
'LPVOID __stdcall GlobalLock(HGLOBAL hMem)',
'LPVOID __stdcall HeapAlloc(HANDLE hHeap, DWORD dwFlags, SIZE_T dwBytes)',
'void __stdcall RtlUnwind(PVOID TargetFrame, PVOID TargetIp, PEXCEPTION_RECORDExceptionRecord, PVOID ReturnValue)',
'BOOL __stdcall VirtualFree(LPVOID lpAddress, SIZE_T dwSize, DWORD dwFreeType)',
'HANDLE __stdcall HeapCreate(DWORD flOptions, SIZE_T dwInitialSize, SIZE_T dwMaximumSize)',
'BOOL __stdcall HeapDestroy(HANDLE hHeap)',
'HANDLE __stdcall GetStdHandle(DWORD nStdHandle)',
'UINT __stdcall SetHandleCount(UINT uNumber)',
'FARPROC __stdcall GetProcAddress(HMODULE hModule, LPCSTR lpProcName)',
'BOOL __stdcall TerminateProcess(HANDLE hProcess, UINTuExitCode)',
'HANDLE __stdcall GetCurrentProcess()',
'LONG __stdcall UnhandledExceptionFilter(struct _EXCEPTION_POINTERS *ExceptionInfo)',
'BOOL __stdcall FreeEnvironmentStringsW(LPWCH)',
'BOOL __stdcall GetStringTypeW(DWORD dwInfoType, LPCWSTR lpSrcStr, intcchSrc,LPWORD lpCharType)',
'BOOL __stdcall KillTimer(HWNDhWnd, UINT_PTR uIDEvent)',
'UINT_PTR __stdcall SetTimer(HWND hWnd, UINT_PTR nIDEvent, UINT uElapse, TIMERPROC lpTimerFunc)',
'HWND __stdcall GetParent(HWNDhWnd)',
'void __stdcall CoUninitialize()',
'HRESULT __stdcall CoInitialize(LPVOIDpvReserved)',
'HRESULT __stdcall OleRun(LPUNKNOWN pUnknown)',
'HRESULT __stdcall CoGetMalloc(DWORD dwMemContext, LPMALLOC *ppMalloc)']
In [ ]:
Content source: xysmas/microsoft_malware_challenge
Similar notebooks: