In [1]:
import src.utils.utils as utils
import gensim
from gensim.models.word2vec import Word2Vec
import sklearn.cross_validation as cv
from sklearn.ensemble import RandomForestClassifier

import numpy as np  # Make sure that numpy is imported


hello

In [2]:
# Get the database handle from the project helper in src.utils.utils.
# NOTE(review): presumably a pymongo Database (later cells call
# db.samples.find on it) — confirm against the helper's implementation.
db = utils.get_mongodb()

In [14]:
# Query the `samples` collection for documents whose 'class' field equals the
# string '5' (a string, not the integer 5).
# NOTE(review): if `db` is a pymongo database, find() returns a single-pass
# cursor — once a later cell iterates it, it is exhausted, so this cell must be
# re-run before iterating `class5` again. Confirm.
class5 = db.samples.find({'class':'5'})

In [15]:
# One entry per document yielded by `class5`: the value stored under
# doc['calls']['calls'] (the displayed sample shows a list of call-signature
# strings). Iterating consumes `class5`.
_comments = list(map(lambda sample: sample['calls']['calls'], class5))

In [6]:
from gensim import corpora, models, similarities

In [7]:
# Build the gensim vocabulary over the call signatures of every sample.
#
# `comments` was never defined anywhere in this notebook (only `_comments` is),
# so on a fresh kernel this cell raised NameError. It is now built here.
# corpora.Dictionary expects an iterable of documents, each a list of string
# tokens, so every sample becomes ONE token list: all of its call-signature
# strings split on whitespace and concatenated in order.
comments = [
    [token for signature in sample_calls for token in signature.split()]
    for sample_calls in _comments
]
dictionary = corpora.Dictionary(comments)

In [23]:
# Display the first entry of `_comments` (the calls collected from the first
# document returned by the class-5 query) as a sanity check of the data shape.
_comments[0]


Out[23]:
['HANDLE __stdcall GetCurrentProcess()',
 'UINT __stdcall GetSystemDirectoryW(LPWSTR lpBuffer, UINT uSize)',
 'BOOL __stdcall GetVersionExA(LPOSVERSIONINFOAlpVersionInformation)',
 'HANDLE __stdcall GetProcessHeap()',
 'void __stdcall GetStartupInfoW(LPSTARTUPINFOWlpStartupInfo)',
 'LPTOP_LEVEL_EXCEPTION_FILTER __stdcall SetUnhandledExceptionFilter(LPTOP_LEVEL_EXCEPTION_FILTER lpTopLevelExceptionFilter)',
 'BOOL __stdcall IsDebuggerPresent()',
 'BOOL __stdcall GetCPInfo(UINTCodePage, LPCPINFO lpCPInfo)',
 'UINT __stdcall GetACP()',
 'UINT __stdcall GetOEMCP()',
 'BOOL __stdcall IsValidCodePage(UINT CodePage)',
 'HMODULE __stdcall GetModuleHandleA(LPCSTR lpModuleName)',
 'HANDLE __stdcall HeapCreate(DWORD flOptions, SIZE_T dwInitialSize, SIZE_T dwMaximumSize)',
 'void __stdcall ExitProcess(UINT uExitCode)',
 'void __stdcall GetStartupInfoA(LPSTARTUPINFOAlpStartupInfo)',
 'BOOL __stdcall QueryPerformanceCounter(LARGE_INTEGER *lpPerformanceCount)',
 'BOOL __stdcall GetStringTypeA(LCID Locale, DWORD dwInfoType, LPCSTR lpSrcStr,int cchSrc, LPWORD lpCharType)',
 'BOOL __stdcall GetStringTypeW(DWORD dwInfoType, LPCWSTR lpSrcStr, intcchSrc,LPWORD lpCharType)',
 'void __stdcall Sleep(DWORD dwMilliseconds)',
 'void __stdcall InitializeCriticalSection(LPCRITICAL_SECTION lpCriticalSection)',
 'HMODULE __stdcall LoadLibraryA(LPCSTRlpLibFileName)',
 'HANDLE __stdcall CreateMutexA(LPSECURITY_ATTRIBUTES lpMutexAttributes, BOOL bInitialOwner, LPCSTR lpName)',
 'FARPROC __stdcall GetProcAddress(HMODULE hModule, LPCSTR lpProcName)',
 'UINT __stdcall EnumClipboardFormats(UINT format)',
 'UINT_PTR __stdcall SetTimer(HWND hWnd, UINT_PTR nIDEvent, UINT uElapse, TIMERPROC lpTimerFunc)',
 'UINT __stdcall RegisterWindowMessageA(LPCSTR lpString)',
 'ATOM __stdcall RegisterClassExW(constWNDCLASSEXW *)',
 'HBITMAP __stdcall LoadBitmapW(HINSTANCE hInstance, LPCWSTR lpBitmapName)',
 'BOOL __stdcall OpenClipboard(HWND hWndNewOwner)',
 'INT_PTR __stdcall DialogBoxIndirectParamA(HINSTANCE hInstance, LPCDLGTEMPLATEA hDialogTemplate, HWND hWndParent, DLGPROC lpDialogFunc, LPARAMdwInitParam)',
 'HANDLE __stdcall LoadImageA(HINSTANCEhInst, LPCSTR name, UINT type, int cx, int cy, UINT fuLoad)',
 'BOOL __stdcall AppendMenuW(HMENU hMenu, UINT uFlags, UINT_PTRuIDNewItem, LPCWSTR lpNewItem)',
 'UINT __stdcall GetMenuState(HMENU hMenu, UINTuId, UINT uFlags)',
 'INT_PTR __stdcall DialogBoxParamW(HINSTANCE hInstance, LPCWSTR lpTemplateName, HWND hWndParent, DLGPROC lpDialogFunc,LPARAM dwInitParam)',
 'HCURSOR __stdcall SetCursor(HCURSOR hCursor)',
 'BOOL __stdcall InvalidateRect(HWND hWnd, const RECT *lpRect, BOOL bErase)',
 'BOOL __stdcall DestroyMenu(HMENU hMenu)',
 'BOOL __stdcall IsChild(HWND hWndParent, HWND hWnd)',
 'BOOL __stdcall GetMenuItemInfoA(HMENUhmenu, UINT item, BOOL fByPosition, LPMENUITEMINFOA lpmii)',
 'HWND __stdcall SetParent(HWNDhWndChild, HWNDhWndNewParent)',
 'INT_PTR __stdcall DialogBoxParamA(HINSTANCE hInstance, LPCSTRlpTemplateName,HWND hWndParent, DLGPROC lpDialogFunc, LPARAM dwInitParam)',
 'HACCEL __stdcall CreateAcceleratorTableA(LPACCEL paccel, int cAccel)',
 'BOOL __stdcall MessageBeep(UINT uType)',
 'HWND __stdcall GetActiveWindow()',
 'LSTATUS __stdcall RegSetValueExW(HKEYhKey, LPCWSTR lpValueName, DWORD Reserved, DWORD dwType, const BYTE *lpData, DWORD cbData)',
 'LSTATUS __stdcall RegCloseKey(HKEY hKey)',
 'LSTATUS __stdcall RegCreateKeyExW(HKEY hKey, LPCWSTR lpSubKey, DWORD Reserved, LPWSTRlpClass, DWORD dwOptions, REGSAM samDesired, const LPSECURITY_ATTRIBUTES lpSecurityAttributes, PHKEY phkResult,LPDWORDlpdwDisposition)',
 'LSTATUS __stdcall RegOpenKeyExW(HKEY hKey, LPCWSTR lpSubKey, DWORD ulOptions,REGSAM samDesired, PHKEY phkResult)',
 'LSTATUS __stdcall RegDeleteKeyW(HKEY hKey, LPCWSTR lpSubKey)',
 'BOOL __stdcall CloseServiceHandle(SC_HANDLE hSCObject)',
 'BOOL __stdcall StartServiceW(SC_HANDLE hService, DWORD dwNumServiceArgs, LPCWSTR *lpServiceArgVectors)',
 'SC_HANDLE __stdcall OpenServiceW(SC_HANDLE hSCManager, LPCWSTR lpServiceName,DWORD dwDesiredAccess)',
 'SC_HANDLE __stdcall OpenSCManagerW(LPCWSTR lpMachineName, LPCWSTR lpDatabaseName, DWORD dwDesiredAccess)',
 'HPALETTE __stdcall CreatePalette(const LOGPALETTE *plpal)',
 'BOOL __stdcall RemoveFontResourceExA(LPCSTR name, DWORD fl, PVOID pdv)',
 'HENHMETAFILE __stdcall SetEnhMetaFileBits(UINT nSize,const BYTE *pb)',
 'HMETAFILE __stdcall GetMetaFileA(LPCSTR lpName)',
 'HRGN __stdcall CreateRectRgn(int x1, int y1, int x2, int y2)',
 'HRGN __stdcall CreatePolygonRgn(constPOINT *pptl, int cPoint, int iMode)']

In [35]:



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-35-a2c2645c7bcb> in <module>()
----> 1 comments = [word for word in line.split() for line in comment for comment in _comments]

NameError: name 'line' is not defined

In [33]:



Out[33]:
['int __cdecl sub_401000(SIZE_T dwBytes)',
 'int __cdecl sub_401018(LPVOID lpMem, size_t)',
 'int __cdecl sub_401516(HANDLE hNamedPipe, LPOVERLAPPEDlpOverlapped)',
 'int __cdecl sub_401CBB(void *)',
 'int __stdcall WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine, int nShowCmd)',
 'int __cdecl sub_403000(void *,void *)',
 'int __stdcall sub_40301C(LPCSTR lpProcName, int)',
 'BOOL __stdcall GetTokenInformation(HANDLE TokenHandle, TOKEN_INFORMATION_CLASS TokenInformationClass,LPVOID TokenInformation, DWORD TokenInformationLength, PDWORD ReturnLength)',
 'BOOL __stdcall InitializeSecurityDescriptor(PSECURITY_DESCRIPTOR pSecurityDescriptor,DWORD dwRevision)',
 'LSTATUS __stdcall RegCloseKey(HKEY hKey)',
 'LSTATUS __stdcall RegCreateKeyExA(HKEY hKey, LPCSTR lpSubKey,DWORD Reserved,LPSTR lpClass, DWORD dwOptions,REGSAM samDesired, const LPSECURITY_ATTRIBUTES lpSecurityAttributes, PHKEY phkResult, LPDWORD lpdwDisposition)',
 'LSTATUS __stdcall RegEnumValueA(HKEY hKey, DWORD dwIndex, LPSTR lpValueName, LPDWORD lpcchValueName, LPDWORD lpReserved, LPDWORD lpType, LPBYTE lpData, LPDWORD lpcbData)',
 'LSTATUS __stdcall RegQueryValueExA(HKEY hKey,LPCSTR lpValueName, LPDWORD lpReserved,LPDWORDlpType,LPBYTE lpData, LPDWORD lpcbData)',
 'LSTATUS __stdcall RegOpenKeyA(HKEY hKey, LPCSTR lpSubKey, PHKEY phkResult)',
 'LSTATUS __stdcall RegNotifyChangeKeyValue(HKEY hKey, BOOL bWatchSubtree, DWORD dwNotifyFilter, HANDLEhEvent,BOOL fAsynchronous)',
 'BOOL __stdcall AllocateAndInitializeSid(PSID_IDENTIFIER_AUTHORITY pIdentifierAuthority, BYTE nSubAuthorityCount, DWORD nSubAuthority0, DWORD nSubAuthority1, DWORD nSubAuthority2, DWORD nSubAuthority3, DWORD nSubAuthority4, DWORD nSubAuthority5, DWORD nSubAuthority6, DWORD nSubAuthority7, PSID*pSid)',
 'BOOL __stdcall ConnectNamedPipe(HANDLE hNamedPipe, LPOVERLAPPED lpOverlapped)',
 'void __stdcall OutputDebugStringA(LPCSTR lpOutputString)',
 'BOOL __stdcall IsBadWritePtr(LPVOID lp, UINT_PTR ucb)',
 'BOOL __stdcall IsBadStringPtrA(LPCSTRlpsz, UINT_PTR ucchMax)',
 'BOOL __stdcall IsBadReadPtr(const void *lp, UINT_PTR ucb)',
 'BOOL __stdcall GetVersionExA(LPOSVERSIONINFOAlpVersionInformation)',
 'void __stdcall GetSystemInfo(LPSYSTEM_INFO lpSystemInfo)',
 'void __stdcall GetStartupInfoA(LPSTARTUPINFOAlpStartupInfo)',
 'void __stdcall GetLocalTime(LPSYSTEMTIME lpSystemTime)',
 'BOOL __stdcall GetHandleInformation(HANDLE hObject, LPDWORD lpdwFlags)',
 'BOOL __stdcall GetFileTime(HANDLE hFile, LPFILETIME lpCreationTime, LPFILETIME lpLastAccessTime, LPFILETIME lpLastWriteTime)',
 'BOOL __stdcall GetComputerNameA(LPSTRlpBuffer, LPDWORD nSize)',
 'UINT __stdcall GetACP()',
 'BOOL __stdcall FreeLibrary(HMODULE hLibModule)',
 'HMODULE __stdcall LoadLibraryA(LPCSTRlpLibFileName)',
 'BOOL __stdcall SetEnvironmentVariableA(LPCSTRlpName,LPCSTR lpValue)',
 'BOOL __stdcall FreeEnvironmentStringsA(LPCH)',
 'LPCH __stdcall GetEnvironmentStrings()',
 'BOOL __stdcall SetTimeZoneInformation(const TIME_ZONE_INFORMATION *lpTimeZoneInformation)',
 'void __stdcall InitializeCriticalSection(LPCRITICAL_SECTION lpCriticalSection)',
 'void __stdcall DeleteCriticalSection(LPCRITICAL_SECTION lpCriticalSection)',
 'void __stdcall LeaveCriticalSection(LPCRITICAL_SECTION lpCriticalSection)',
 'void __stdcall EnterCriticalSection(LPCRITICAL_SECTION lpCriticalSection)',
 'LONG __stdcall InterlockedCompareExchange(volatile LONG *Destination,LONG Exchange, LONG Comperand)',
 'LONG __stdcall InterlockedIncrement(volatile LONG *lpAddend)',
 'BOOL __stdcall GlobalUnlock(HGLOBAL hMem)',
 'BOOL __stdcall SetEvent(HANDLE hEvent)',
 'HGLOBAL __stdcall GlobalReAlloc(HGLOBAL hMem,SIZE_T dwBytes,UINT uFlags)',
 'void __stdcall ExitThread(DWORD dwExitCode)',
 'void __stdcall ExitProcess(UINT uExitCode)',
 'HANDLE __stdcall CreateThread(LPSECURITY_ATTRIBUTES lpThreadAttributes, SIZE_T dwStackSize, LPTHREAD_START_ROUTINE lpStartAddress, LPVOID lpParameter, DWORD dwCreationFlags,LPDWORDlpThreadId)',
 'ATOM __stdcall GlobalFindAtomA(LPCSTRlpString)',
 'ATOM __stdcall GlobalDeleteAtom(ATOM nAtom)',
 'ATOM __stdcall GlobalAddAtomA(LPCSTR lpString)',
 'ATOM __stdcall DeleteAtom(ATOM nAtom)',
 'ATOM __stdcall AddAtomA(LPCSTR lpString)',
 'UINT __stdcall GetWindowsDirectoryA(LPSTR lpBuffer, UINT uSize)',
 'BOOL __stdcall DeleteFileA(LPCSTR lpFileName)',
 'BOOL __stdcall FlushFileBuffers(HANDLE hFile)',
 'BOOL __stdcall CloseHandle(HANDLE hObject)',
 'HANDLE __stdcall CreateFileA(LPCSTR lpFileName, DWORDdwDesiredAccess, DWORD dwShareMode, LPSECURITY_ATTRIBUTES lpSecurityAttributes,DWORD dwCreationDisposition, DWORD dwFlagsAndAttributes, HANDLEhTemplateFile)',
 'BOOL __stdcall CopyFileA(LPCSTR lpExistingFileName, LPCSTR lpNewFileName, BOOL bFailIfExists)',
 'BOOL __stdcall SetThreadPriority(HANDLE hThread, int nPriority)',
 'BOOL __stdcall ResetEvent(HANDLE hEvent)',
 'HANDLE __stdcall OpenEventA(DWORD dwDesiredAccess, BOOL bInheritHandle, LPCSTR lpName)',
 'BOOL __stdcall MoveFileA(LPCSTR lpExistingFileName, LPCSTR lpNewFileName)',
 'LPVOID __stdcall HeapReAlloc(HANDLE hHeap, DWORD dwFlags, LPVOID lpMem, SIZE_T dwBytes)',
 'LPVOID __stdcall TlsGetValue(DWORD dwTlsIndex)',
 'BOOL __stdcall TlsSetValue(DWORD dwTlsIndex, LPVOID lpTlsValue)',
 'BOOL __stdcall TlsFree(DWORD dwTlsIndex)',
 'HMODULE __stdcall GetModuleHandleA(LPCSTR lpModuleName)',
 'BOOL __stdcall GetSystemTimeAdjustment(PDWORDlpTimeAdjustment, PDWORD lpTimeIncrement, PBOOLlpTimeAdjustmentDisabled)',
 'BOOL __stdcall GetStringTypeA(LCID Locale, DWORD dwInfoType, LPCSTR lpSrcStr,int cchSrc, LPWORD lpCharType)',
 'LPVOID __stdcall VirtualAlloc(LPVOID lpAddress, SIZE_T dwSize, DWORD flAllocationType, DWORD flProtect)',
 'UINT __stdcall GetOEMCP()',
 'BOOL __stdcall GetCPInfo(UINTCodePage, LPCPINFO lpCPInfo)',
 'BOOL __stdcall DisconnectNamedPipe(HANDLE hNamedPipe)',
 'HANDLE __stdcall CreateEventA(LPSECURITY_ATTRIBUTES lpEventAttributes, BOOL bManualReset, BOOL bInitialState,LPCSTR lpName)',
 'HANDLE __stdcall CreateNamedPipeA(LPCSTR lpName, DWORD dwOpenMode, DWORD dwPipeMode, DWORD nMaxInstances, DWORD nOutBufferSize, DWORDnInBufferSize, DWORD nDefaultTimeOut, LPSECURITY_ATTRIBUTES lpSecurityAttributes)',
 'BOOL __stdcall GetOverlappedResult(HANDLE hFile, LPOVERLAPPEDlpOverlapped, LPDWORD lpNumberOfBytesTransferred, BOOL bWait)',
 'BOOL __stdcall ReadFile(HANDLE hFile,LPVOID lpBuffer, DWORD nNumberOfBytesToRead, LPDWORD lpNumberOfBytesRead, LPOVERLAPPED lpOverlapped)',
 'BOOL __stdcall WriteFile(HANDLE hFile, LPCVOID lpBuffer, DWORD nNumberOfBytesToWrite,LPDWORDlpNumberOfBytesWritten,LPOVERLAPPED lpOverlapped)',
 'BOOL __stdcall HeapFree(HANDLE hHeap,DWORD dwFlags, LPVOID lpMem)',
 'HANDLE __stdcall GetProcessHeap()',
 'LPVOID __stdcall GlobalLock(HGLOBAL hMem)',
 'LPVOID __stdcall HeapAlloc(HANDLE hHeap, DWORD dwFlags, SIZE_T dwBytes)',
 'void __stdcall RtlUnwind(PVOID TargetFrame, PVOID TargetIp, PEXCEPTION_RECORDExceptionRecord, PVOID ReturnValue)',
 'BOOL __stdcall VirtualFree(LPVOID lpAddress, SIZE_T dwSize, DWORD dwFreeType)',
 'HANDLE __stdcall HeapCreate(DWORD flOptions, SIZE_T dwInitialSize, SIZE_T dwMaximumSize)',
 'BOOL __stdcall HeapDestroy(HANDLE hHeap)',
 'HANDLE __stdcall GetStdHandle(DWORD nStdHandle)',
 'UINT __stdcall SetHandleCount(UINT uNumber)',
 'FARPROC __stdcall GetProcAddress(HMODULE hModule, LPCSTR lpProcName)',
 'BOOL __stdcall TerminateProcess(HANDLE hProcess, UINTuExitCode)',
 'HANDLE __stdcall GetCurrentProcess()',
 'LONG __stdcall UnhandledExceptionFilter(struct _EXCEPTION_POINTERS *ExceptionInfo)',
 'BOOL __stdcall FreeEnvironmentStringsW(LPWCH)',
 'BOOL __stdcall GetStringTypeW(DWORD dwInfoType, LPCWSTR lpSrcStr, intcchSrc,LPWORD lpCharType)',
 'BOOL __stdcall KillTimer(HWNDhWnd, UINT_PTR uIDEvent)',
 'UINT_PTR __stdcall SetTimer(HWND hWnd, UINT_PTR nIDEvent, UINT uElapse, TIMERPROC lpTimerFunc)',
 'HWND __stdcall GetParent(HWNDhWnd)',
 'void __stdcall CoUninitialize()',
 'HRESULT __stdcall CoInitialize(LPVOIDpvReserved)',
 'HRESULT __stdcall OleRun(LPUNKNOWN pUnknown)',
 'HRESULT __stdcall CoGetMalloc(DWORD dwMemContext, LPMALLOC *ppMalloc)']

In [ ]: