The goal of this competition is to predict a Windows machine’s probability of getting infected by various families of malware, based on different properties of that machine. It is really important to find out whether the computer is infected and cure it.
We have a huge dataset in which most features are categorical, so I think that correct mean encoding should be important. The number of columns is also quite high, so it could be tempting to apply some automatic processing to all of them. I personally think that it is important to analyze each variable individually, because that helps to process it better.
In this kernel I'll do a detailed EDA, feature engineering and modelling.
In [ ]:
#libraries
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import lightgbm as lgb
import xgboost as xgb
import time
import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import gc
from catboost import CatBoostClassifier
from tqdm import tqdm_notebook
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import warnings
warnings.filterwarnings("ignore")
import logging
logging.basicConfig(filename='log.txt',level=logging.DEBUG, format='%(asctime)s %(message)s')
# Use fully qualified option names. The bare 'max_columns' / 'max_rows'
# patterns also match 'styler.render.max_columns' / 'styler.render.max_rows'
# in pandas versions that define those options, and pd.set_option raises
# OptionError when a pattern matches more than one key.
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)
import os
print(os.listdir("../input/microsoft-malware-prediction"))
In [ ]:
#https://www.kaggle.com/theoviel/load-the-totality-of-the-data
# Column name -> dtype string passed to pd.read_csv so the CSV is parsed
# straight into compact dtypes. Every column mapped to a dtype outside the
# `numerics` list defined further down ends up in `categorical_columns`.
# NOTE(review): float16 has an 11-bit significand, so not every integer above
# 2048 is representable; identifier-like columns stored as float16 may merge
# distinct ids if their values exceed that range - TODO confirm against data.
dtypes = {
'MachineIdentifier': 'category',
'ProductName': 'category',
'EngineVersion': 'category',
'AppVersion': 'category',
'AvSigVersion': 'category',
'IsBeta': 'int8',
'RtpStateBitfield': 'float16',
'IsSxsPassiveMode': 'int8',
'DefaultBrowsersIdentifier': 'float16',
'AVProductStatesIdentifier': 'float32',
'AVProductsInstalled': 'float16',
'AVProductsEnabled': 'float16',
'HasTpm': 'int8',
'CountryIdentifier': 'int16',
'CityIdentifier': 'float32',
'OrganizationIdentifier': 'float16',
'GeoNameIdentifier': 'float16',
'LocaleEnglishNameIdentifier': 'int8',
'Platform': 'category',
'Processor': 'category',
'OsVer': 'category',
'OsBuild': 'int16',
'OsSuite': 'int16',
'OsPlatformSubRelease': 'category',
'OsBuildLab': 'category',
'SkuEdition': 'category',
'IsProtected': 'float16',
'AutoSampleOptIn': 'int8',
'PuaMode': 'category',
'SMode': 'float16',
'IeVerIdentifier': 'float16',
'SmartScreen': 'category',
'Firewall': 'float16',
'UacLuaenable': 'float32',
'Census_MDC2FormFactor': 'category',
'Census_DeviceFamily': 'category',
'Census_OEMNameIdentifier': 'float16',
'Census_OEMModelIdentifier': 'float32',
'Census_ProcessorCoreCount': 'float16',
'Census_ProcessorManufacturerIdentifier': 'float16',
'Census_ProcessorModelIdentifier': 'float16',
'Census_ProcessorClass': 'category',
'Census_PrimaryDiskTotalCapacity': 'float32',
'Census_PrimaryDiskTypeName': 'category',
'Census_SystemVolumeTotalCapacity': 'float32',
'Census_HasOpticalDiskDrive': 'int8',
'Census_TotalPhysicalRAM': 'float32',
'Census_ChassisTypeName': 'category',
'Census_InternalPrimaryDiagonalDisplaySizeInInches': 'float16',
'Census_InternalPrimaryDisplayResolutionHorizontal': 'float16',
'Census_InternalPrimaryDisplayResolutionVertical': 'float16',
'Census_PowerPlatformRoleName': 'category',
'Census_InternalBatteryType': 'category',
'Census_InternalBatteryNumberOfCharges': 'float32',
'Census_OSVersion': 'category',
'Census_OSArchitecture': 'category',
'Census_OSBranch': 'category',
'Census_OSBuildNumber': 'int16',
'Census_OSBuildRevision': 'int32',
'Census_OSEdition': 'category',
'Census_OSSkuName': 'category',
'Census_OSInstallTypeName': 'category',
'Census_OSInstallLanguageIdentifier': 'float16',
'Census_OSUILocaleIdentifier': 'int16',
'Census_OSWUAutoUpdateOptionsName': 'category',
'Census_IsPortableOperatingSystem': 'int8',
'Census_GenuineStateName': 'category',
'Census_ActivationChannel': 'category',
'Census_IsFlightingInternal': 'float16',
'Census_IsFlightsDisabled': 'float16',
'Census_FlightRing': 'category',
'Census_ThresholdOptIn': 'float16',
'Census_FirmwareManufacturerIdentifier': 'float16',
'Census_FirmwareVersionIdentifier': 'float32',
'Census_IsSecureBootEnabled': 'int8',
'Census_IsWIMBootEnabled': 'float16',
'Census_IsVirtualDevice': 'float16',
'Census_IsTouchEnabled': 'int8',
'Census_IsPenCapable': 'int8',
'Census_IsAlwaysOnAlwaysConnectedCapable': 'float16',
'Wdft_IsGamer': 'float16',
'Wdft_RegionIdentifier': 'float16',
# Target column; only present in the training file.
'HasDetections': 'int8'
}
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype covering their range.

    Columns whose dtype is int16/int32/int64 or float16/float32/float64 are
    re-cast, in place on ``df``, to the smallest integer (int8..int64) or
    float (float16, float32, else float64) type whose limits contain the
    column's min and max. Other columns are left untouched.

    The range check is inclusive, so a column spanning exactly the limits of a
    type (e.g. -128..127) is stored in that type.

    Note: the float check only looks at the value range. float16 keeps an
    11-bit significand, so a float column downcast to float16 can lose
    precision even though its range fits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; modified in place.
    verbose : bool
        When true, print the resulting memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type) not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type).startswith('int'):
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range does not fit float32 (or min/max are NaN): keep full width.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df
In [ ]:
# Split the declared columns by whether their dtype string is numeric.
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_columns = [name for name in dtypes if dtypes[name] in numerics]
categorical_columns = [name for name in dtypes if dtypes[name] not in numerics]
In [ ]:
# The file stores a pickled dict inside a 0-d object array (hence `.item()`).
# NumPy >= 1.16.3 refuses to unpickle unless allow_pickle=True is passed.
# NOTE(review): unpickling executes arbitrary code - only load this file from
# a trusted source.
all_dates = np.load('../input/malware/all_dates_v2.npy', allow_pickle=True).item()
In [ ]:
# Keys of all_dates whose stored value is the empty string.
sig_drop = [version for version, release_date in all_dates.items() if release_date == '']
len(sig_drop)
In [ ]:
%%time
# Load the first 8,000,000 training rows using the compact dtypes declared above.
train_csv_path = '../input/microsoft-malware-prediction/train.csv'
train = pd.read_csv(train_csv_path, nrows=8000000, dtype=dtypes)
In [ ]:
# Downcast numeric columns and report the saving.
train = reduce_mem_usage(train, verbose=True)
We can see several interesting things here:
All columns except Census_SystemVolumeTotalCapacity are categorical. There are also 3 columns in which most of the values are missing; let's drop them.
In [ ]:
# Keep a column unless a single value (NaN included) accounts for more than 90%
# of its rows; the columns listed in `always_keep` are retained regardless.
always_keep = ['OsVer','ProductName', 'DefaultBrowsersIdentifier','Firewall', 'IsProtected', 'AVProductsEnabled', 'Census_GenuineStateName']
good_cols = []
for col in train.columns:
    rate = train[col].value_counts(normalize=True, dropna=False).iloc[0]
    if rate <= 0.9 or col in always_keep:
        good_cols.append(col)
In [ ]:
# Restrict the training frame to the retained columns.
train = train.loc[:, good_cols]
Now we can read test data.
In [ ]:
# Read only the retained feature columns from the test file. The target is
# excluded by name instead of by position, so this no longer depends on
# 'HasDetections' being the last entry of good_cols.
test_cols = [c for c in good_cols if c != 'HasDetections']
wanted = set(test_cols)
test_dtypes = {k: v for k, v in dtypes.items() if k in wanted}
test = pd.read_csv('../input/microsoft-malware-prediction/test.csv', dtype=test_dtypes, usecols=test_cols)
# Overwrite OsBuildLab for row label 6529507 with a fixed build string.
# NOTE(review): presumably patches a malformed value in the raw file - confirm.
test.loc[6529507, 'OsBuildLab'] = '17134.1.amd64fre.rs4_release.180410-1804'
test = reduce_mem_usage(test)
In [ ]:
# Preview the first rows of the filtered training frame.
train.head(n=5)
In [ ]:
# Register a placeholder build string as a category and use it for missing
# OsBuildLab values in both frames.
osbuildlab_placeholder = '0.0.0.0.0-0'
for frame in (train, test):
    with_placeholder = frame['OsBuildLab'].cat.add_categories([osbuildlab_placeholder])
    frame['OsBuildLab'] = with_placeholder
    frame['OsBuildLab'] = frame['OsBuildLab'].fillna(osbuildlab_placeholder)
In [ ]:
%%time
# Count occurrences of a fixed set of character n-grams in OsBuildLab and
# attach them to train as int8 columns named 'OsBuildLab_countvec_<ngram>'.
vocab = ['_rele', 'c_e', 'relea', '5.x86', '_es', '0-18', 'amd', 's4_rel', '.winb']
vec = CountVectorizer(analyzer='char', ngram_range=(3, 6), vocabulary=vocab, dtype=np.int8)
count_vecs = vec.fit_transform(train['OsBuildLab']).toarray()
train_count_vec = pd.DataFrame(
    count_vecs,
    index=train.index,
    columns=['OsBuildLab_countvec_' + ngram for ngram in vocab],
)
train = train.join(train_count_vec)
del count_vecs, train_count_vec
gc.collect()
In [ ]:
%%time
# Same n-gram counts for the test frame, then release the vectorizer objects.
count_vecs = vec.fit_transform(test['OsBuildLab']).toarray()
test_count_vec = pd.DataFrame(
    count_vecs,
    index=test.index,
    columns=['OsBuildLab_countvec_' + ngram for ngram in vocab],
)
test = test.join(test_count_vec)
del vocab, vec, count_vecs, test_count_vec
gc.collect()
In [ ]:
# Class counts of the target.
train.HasDetections.value_counts()
The target is balanced, which is nice.
In [ ]:
# Whole-value replacements applied to the lower-cased SmartScreen strings.
smartscreen_fixes = {"promt":"prompt",
                     "promprt":"prompt",
                     "00000000":"0",
                     "enabled":"on",
                     "of":"off" ,
                     "deny":"0" , # just one
                     "requiredadmin":"requireadmin"
                     }


def _normalize_smartscreen(values):
    """Lower-case SmartScreen values, collapse known variants, return a categorical Series."""
    # The result of replace() is used directly instead of calling
    # `frame.SmartScreen.replace(..., inplace=True)`: that chained in-place
    # call mutates a temporary and is not written back to the frame under
    # pandas copy-on-write.
    return values.str.lower().replace(smartscreen_fixes).astype("category")


train['SmartScreen'] = _normalize_smartscreen(train['SmartScreen'])
test['SmartScreen'] = _normalize_smartscreen(test['SmartScreen'])
In [ ]:
# Numeric columns that are identifiers rather than quantities: cast each of
# them to 'category' in both frames, in the listed order.
identifier_like_columns = [
    'AVProductStatesIdentifier',
    'CountryIdentifier',
    'CityIdentifier',
    'OrganizationIdentifier',
    'GeoNameIdentifier',
    'LocaleEnglishNameIdentifier',
    'IeVerIdentifier',
    'Census_OEMNameIdentifier',
    'Census_OEMModelIdentifier',
    'Census_ProcessorModelIdentifier',
    'Census_InternalBatteryNumberOfCharges',
    'Census_OSBuildNumber',
    'Census_OSBuildRevision',
    'Census_FirmwareManufacturerIdentifier',
    'Census_FirmwareVersionIdentifier',
    'OsBuild',
]
for column_name in identifier_like_columns:
    train[column_name] = train[column_name].astype('category')
    test[column_name] = test[column_name].astype('category')
In [ ]:
# https://www.kaggle.com/youhanlee/my-eda-i-want-to-see-all
# grouping battery types by name
def group_battery(x):
    """Return 1 if the battery type name contains 'li' (case-insensitive), else 0.

    Non-string input (e.g. a NaN for a missing value) is returned unchanged
    instead of raising AttributeError on ``.lower()``.
    """
    if not isinstance(x, str):
        return x
    return 1 if 'li' in x.lower() else 0
# Replace the battery type name by the group_battery flag in both frames.
for frame in (train, test):
    frame['Census_InternalBatteryType'] = frame['Census_InternalBatteryType'].apply(group_battery)
In [ ]:
def rename_edition(x):
    """Map an edition/SKU name to a coarse family label.

    The lower-cased name is checked against the keywords below in order; the
    label of the first keyword it contains is returned. A name containing none
    of them is returned lower-cased.
    """
    lowered = x.lower()
    keyword_labels = (
        ('core', 'Core'),
        ('pro', 'pro'),
        ('enterprise', 'Enterprise'),
        ('server', 'Server'),
        ('home', 'Home'),
        ('education', 'Education'),
        ('cloud', 'Cloud'),
    )
    for keyword, label in keyword_labels:
        if keyword in lowered:
            return label
    return lowered
In [ ]:
# Collapse the OS edition and SKU names to their family labels in both frames:
# stringify, map through rename_edition, then store as a categorical again.
for column_name in ('Census_OSEdition', 'Census_OSSkuName'):
    for frame in (train, test):
        as_text = frame[column_name].astype(str)
        frame[column_name] = as_text.apply(rename_edition).astype('category')
In [ ]:
# Further identifier-like columns cast to 'category' in both frames.
for column_name in ('Census_OSInstallLanguageIdentifier', 'Census_OSUILocaleIdentifier', 'OsSuite'):
    train[column_name] = train[column_name].astype('category')
    test[column_name] = test[column_name].astype('category')
In [ ]:
# train['Wdft_RegionIdentifier'] = train['Wdft_RegionIdentifier'].astype('category')
# test['Wdft_RegionIdentifier'] = test['Wdft_RegionIdentifier'].astype('category')
In [ ]:
def add_factor_sort(df, col, add_new_col=True):
    """Encode dotted version strings as integer ranks in numeric version order.

    Each value of ``df[col]`` is stringified (the literal '1.23.1144.0' is
    first replaced by '1.273.1144.0'), split on '.', and compared as a tuple
    of floats; the code of a row is the rank of its tuple among the distinct
    tuples present in ``df``. Codes are therefore relative to the values in
    this frame only: encoding two frames separately gives codes that are not
    comparable between them.

    Each distinct string is parsed once, and the ranking uses plain Python
    sorting rather than handing a list of tuples to ``pd.factorize``.

    Parameters
    ----------
    df : pandas.DataFrame
    col : str
        Column holding the version strings.
    add_new_col : bool
        If exactly ``True``, store the codes in ``df[f'{col}_sort']`` and
        return ``df``; otherwise return the code array.
    """
    versions = df[col].astype(str).replace('1.23.1144.0','1.273.1144.0')
    parsed = {text: tuple(float(part) for part in text.split('.')) for text in versions.unique()}
    rank_of = {key: rank for rank, key in enumerate(sorted(set(parsed.values())))}
    code_of = {text: rank_of[key] for text, key in parsed.items()}
    val = versions.map(code_of).to_numpy(dtype=np.int64)
    if add_new_col is True:
        df[f'{col}_sort'] = val
        return df
    return val
In [ ]:
# Add a '<col>_sort' rank column for each version column, then downcast.
version_columns = ['OsVer']
for col in version_columns:
    print(col)
    train = add_factor_sort(train, col)
    test = add_factor_sort(test, col)
train = reduce_mem_usage(train, verbose=False)
test = reduce_mem_usage(test, verbose=False)
In [ ]:
# Build an integer 'ID' per distinct (AvSigVersion, AppVersion, EngineVersion)
# combination over train and test together.
# Step 1: stack the identifying columns of both frames (test has no
# HasDetections column, so it is missing for test rows after the concat) and
# sort the rows by the three version columns.
# NOTE(review): the sort uses whatever ordering pandas applies to these
# columns after concat, presumably plain string order rather than numeric
# version order - confirm.
ID = (pd.concat([train[['MachineIdentifier',
'AvSigVersion',
'AppVersion',
'EngineVersion',
'HasDetections']],
test[['MachineIdentifier',
'AvSigVersion',
'AppVersion',
'EngineVersion']]],
axis=0, sort=False)
.reset_index(drop=True)
.sort_values(['AvSigVersion', 'AppVersion', 'EngineVersion'])
.reset_index(drop=True))
# Step 2: number the distinct version triples in their sorted order and attach
# that number to every row as column 'ID'.
ID = pd.merge(ID, (ID[['AvSigVersion',
'AppVersion',
'EngineVersion']].drop_duplicates()
.reset_index(drop=True)
.reset_index()
.rename({'index':'ID'}, axis=1)),
on=['AvSigVersion',
'AppVersion',
'EngineVersion'], how='left')
# Step 3: copy the IDs back. Rows are split by whether HasDetections is
# present, sorted by MachineIdentifier and re-indexed 0..n-1; the assignment
# then aligns on index labels.
# NOTE(review): this presumably relies on train/test already being ordered by
# MachineIdentifier with a default 0..n-1 index - TODO confirm.
train['ID'] = (ID[ID.HasDetections.notnull()]
.sort_values(['MachineIdentifier'])
.reset_index(drop=True))['ID']
test['ID'] = (ID[ID.HasDetections.isnull()]
.sort_values(['MachineIdentifier'])
.reset_index(drop=True))['ID']
# Only train is re-ordered by ID; test keeps its row order.
train = train.sort_values(['ID']).reset_index(drop=True)
In [ ]:
# train['OsBuildLab'] = train['OsBuildLab'].cat.add_categories(['0.0.0.0.0-0'])
# train['OsBuildLab'] = train['OsBuildLab'].fillna('0.0.0.0.0-0')
# test['OsBuildLab'] = test['OsBuildLab'].cat.add_categories(['0.0.0.0.0-0'])
# test['OsBuildLab'] = test['OsBuildLab'].fillna('0.0.0.0.0-0')
train = reduce_mem_usage(train, verbose=False)
test = reduce_mem_usage(test, verbose=False)
# Keep the 10 most frequent RAM sizes seen in train (NaN counted as a value)
# and overwrite every other RAM size with 1024 in both frames.
ram_shares = train['Census_TotalPhysicalRAM'].value_counts(dropna=False, normalize=True)
top_10 = ram_shares.cumsum().index[:10]
train.loc[~train['Census_TotalPhysicalRAM'].isin(top_10), 'Census_TotalPhysicalRAM'] = 1024
test.loc[~test['Census_TotalPhysicalRAM'].isin(top_10), 'Census_TotalPhysicalRAM'] = 1024
del ram_shares
del top_10, ID
gc.collect()
In [ ]:
def add_num_feats(df, numerical_cols):
    """Add per-'ID' aggregate columns for the given columns.

    For every column in ``numerical_cols`` a new column '<col>_count' is added,
    holding the number of non-null values of that column within the row's
    'ID' group. The frame is then downcast with reduce_mem_usage and returned.
    """
    per_id = df.groupby('ID')
    aggregations = ['count']
    for column_name in numerical_cols:
        for agg_name in aggregations:
            df[column_name + '_' + agg_name] = per_id[column_name].transform(agg_name)
    return reduce_mem_usage(df, False)
In [ ]:
def _add_group_count(df, keys, counted, new_name):
    """Left-merge onto ``df`` the number of non-null ``counted`` values per ``keys`` group."""
    gp = (df[keys + [counted]]
          .groupby(by=keys, sort=False)[[counted]]
          .count()
          .reset_index()
          .rename(columns={counted: new_name}))
    df = df.merge(gp, on=keys, how='left')
    del gp
    gc.collect()
    return reduce_mem_usage(df)


def _version_part(versions, position):
    """Return the dot-separated component at ``position`` of each string, as a categorical."""
    return versions.apply(lambda x: x.split('.')[position]).astype('category')


def fe(df):
    """Add the engineered features to ``df`` and return the resulting frame.

    Adds, in this order: two group-size counts over country/organization/OS
    install type, components of the EngineVersion / AppVersion / AvSigVersion /
    OsBuildLab strings, several 0/1 indicator columns, disk and RAM ratios, and
    per-'ID' counts (via add_num_feats). It also fills missing values of a few
    flag columns and casts some columns to 'category'.

    All statistics (group counts, the top-20 AVProductStatesIdentifier values)
    are computed from ``df`` itself. Reads the module-level ``sig_drop`` list.
    """
    print('grouping combination...')
    df = _add_group_count(
        df,
        ['CountryIdentifier', 'OrganizationIdentifier', 'Census_OSInstallTypeName'],
        'SmartScreen',
        'cnt_cnt_org_os_sc')
    print('grouping combination...')
    df = _add_group_count(
        df,
        ['CountryIdentifier', 'OrganizationIdentifier'],
        'Census_OSInstallTypeName',
        'cnt_cnt_org_os')
    print(gc.collect())
    print('Cooking Pointless Things....')
    df['one_less_AVproductInstalled'] = df['AVProductsInstalled'] - 1
    # Version string components, created in the original column order.
    for source, positions in (('EngineVersion', (2, 3)),
                              ('AppVersion', (1, 2, 3)),
                              ('AvSigVersion', (0, 1, 2))):
        for position in positions:
            df[f'{source}_{position}'] = _version_part(df[source], position)
    df = reduce_mem_usage(df)
    for position in range(4):
        df[f'OsBuildLab_{position}'] = _version_part(df['OsBuildLab'], position)
    df['OsBuild_exact'] = df['OsBuildLab'].apply(lambda x: x.split('.')[0] +'.'+ x.split('.')[1])
    df['OsBuild_exact'] = df['OsBuild_exact'].astype('category')
    df = reduce_mem_usage(df)
    # Same values as AvSigVersion_1 / AvSigVersion_2 (AvSigVersion has not
    # changed since they were computed), so reuse them instead of re-splitting.
    df['AvSigVersion_minor'] = df['AvSigVersion_1']
    df['AvSigVersion_build'] = df['AvSigVersion_2']
    # Literal (non-regex) replacement, requested explicitly because the
    # default of `regex` differs between pandas versions.
    fixed_sig = df['AvSigVersion'].str.replace('1.23.1144.0','1.273.1144.0', regex=False)
    df['AvSigVersion_minor_build'] = fixed_sig.apply(lambda x: float((x.split('.')[1]) +'.'+(x.split('.')[2]))).astype('float32')
    df['not_protected_at_all'] = 0
    df.loc[df['IsProtected'].isnull(), 'not_protected_at_all'] = 1
    df['not_genuine_user'] = 0
    df.loc[~df['Census_GenuineStateName'].isin(['IS_GENUINE']), 'not_genuine_user'] = 1
    df['AvSigVersion_sum'] = fixed_sig.apply(lambda x: float(x.split('.')[1]) + float(x.split('.')[2])).astype(int).values
    del fixed_sig
    df['AvSigVersion'] = df['AvSigVersion'].astype('category')
    # The 20 most frequent AVProductStatesIdentifier values (NaN counted as a value).
    top_20 = df['AVProductStatesIdentifier'].value_counts(dropna=False, normalize=True).index[:20]
    df['magic_4'] = 0
    df.loc[df['AVProductStatesIdentifier'].isin(top_20), 'magic_4'] = 1
    df['no_release_dates'] = 0
    df.loc[df['AvSigVersion'].isin(sig_drop), 'no_release_dates'] = 1
    df['smartscreen_not_set'] = 0
    df.loc[df['SmartScreen'].isin(['existsnotset']), 'smartscreen_not_set'] = 1
    df = reduce_mem_usage(df)
    df['primary_drive_c_ratio'] = df['Census_SystemVolumeTotalCapacity']/ df['Census_PrimaryDiskTotalCapacity']
    df['non_primary_drive_MB'] = df['Census_PrimaryDiskTotalCapacity'] - df['Census_SystemVolumeTotalCapacity']
    df['ram_per_processor'] = df['Census_TotalPhysicalRAM']/ df['Census_ProcessorCoreCount']
    df['physical_cores'] = df['Census_ProcessorCoreCount'] / 2
    df['Census_IsFlightingInternal'] = df['Census_IsFlightingInternal'].fillna(1)
    df['Census_ThresholdOptIn'] = df['Census_ThresholdOptIn'].fillna(1)
    df['Census_IsWIMBootEnabled'] = df['Census_IsWIMBootEnabled'].fillna(1)
    df['Wdft_IsGamer'] = df['Wdft_IsGamer'].fillna(0)
    df['EngineVersion'] = df['EngineVersion'].astype('category')
    df = add_num_feats(df, ['OsVer_sort', 'AVProductsInstalled','Firewall'])
    df['Wdft_RegionIdentifier'] = df['Wdft_RegionIdentifier'].astype('category')
    df = reduce_mem_usage(df)
    print('Done...!!!')
    return df
In [ ]:
# Engineer features for each frame separately: fe() derives its group counts
# and top-value lists from the frame it is given. `train` is rebound before
# fe(test) runs.
train = fe(train)
test = fe(test)
In [ ]:
del sig_drop, all_dates
gc.collect()
# Every 'category'-typed training column except the ones named below.
not_candidates = ['MachineIdentifier', 'Census_SystemVolumeTotalCapacity', 'HasDetections']
cat_cols = []
for col in train.columns:
    if col in not_candidates:
        continue
    if str(train[col].dtype) == 'category':
        cat_cols.append(col)
len(cat_cols)
In [ ]:
# Pairwise interaction features: for every ordered pair of distinct columns in
# add_cat_feats, concatenate their string forms into a new categorical column
# named '<col1>__<col2>' in both frames.
add_cat_feats = [
    'Census_OSBuildRevision',
    'OsBuildLab',
    'SmartScreen']
ordered_pairs = [(first, second)
                 for first in add_cat_feats
                 for second in add_cat_feats
                 if first != second]
more_cat_cols = []
for col1, col2 in ordered_pairs:
    pair_name = col1 + '__' + col2
    train[pair_name] = (train[col1].astype(str) + train[col2].astype(str)).astype('category')
    test[pair_name] = (test[col1].astype(str) + test[col2].astype(str)).astype('category')
    more_cat_cols.append(pair_name)
cat_cols = cat_cols + more_cat_cols
In [ ]:
%%time
# Give missing values of every categorical column an explicit label.
# The fill happens BEFORE the conversion to str: calling astype('str') first
# turns NaN into the literal text 'nan', which makes a later fillna a no-op.
fillna_const = 'MISSING_VALUE'
for cc in tqdm_notebook(cat_cols):
    for frame in (train, test):
        raw_values = frame[cc].astype(object)
        filled = raw_values.where(raw_values.notna(), fillna_const)
        frame[cc] = filled.astype('str').astype('category')
In [ ]:
# High-cardinality categoricals (more than 1000 distinct training values).
to_encode = []
for col in cat_cols:
    distinct_values = train[col].nunique()
    if distinct_values > 1000:
        print(col, distinct_values)
        to_encode.append(col)
From this kernel: https://www.kaggle.com/fabiendaniel/detecting-malwares-with-lgbm
I do frequency and label encoding on the full dataset, because I think I'll get more correct values this way.
In [ ]:
# Downcast both frames (with the memory report) before encoding.
train = reduce_mem_usage(train, verbose=True)
test = reduce_mem_usage(test, verbose=True)
gc.collect()
In [ ]:
def frequency_encoding(variable, frames=None):
    """Map each value of a column to its frequency rank over several frames.

    The values of ``variable`` from all ``frames`` are pooled and counted.
    Values are ranked by descending count (0 = most frequent). Every value
    occurring exactly once shares a single label equal to the largest rank
    among the other values plus one; in that case all labels are floats.

    This avoids ``value_counts().reset_index()``, whose column names differ
    between pandas versions.

    Parameters
    ----------
    variable : str
        Column name present in every frame.
    frames : sequence of pandas.DataFrame, optional
        Frames to pool. Defaults to the module-level ``(train, test)``.

    Returns
    -------
    dict
        value -> rank label.
    """
    if frames is None:
        frames = (train, test)
    counts = pd.concat([frame[variable] for frame in frames]).value_counts()
    ranks = np.arange(len(counts))
    is_single = counts.to_numpy() == 1
    if is_single.any():
        kept = ranks[~is_single]
        # With no repeated value at all there is no rank to extend: NaN.
        shared_label = kept.max() + 1 if kept.size else np.nan
        ranks = np.where(is_single, shared_label, ranks).astype('float64')
    return dict(zip(counts.index, ranks.tolist()))
In [ ]:
# Replace each high-cardinality column by its frequency rank (NaN for values
# missing from the mapping) and stop treating it as categorical.
for col in tqdm_notebook(to_encode):
    freq_enc_dict = frequency_encoding(col)
    lookup = freq_enc_dict.get
    train[col] = train[col].map(lambda x: lookup(x, np.nan))
    test[col] = test[col].map(lambda x: lookup(x, np.nan))
    cat_cols.remove(col)
In [ ]:
%%time
# Label-encode the remaining categoricals with the distinct values seen in
# train; get_indexer returns -1 for values that are not in that index.
indexer = {col: pd.factorize(train[col].astype(str))[1] for col in cat_cols}
for col in tqdm_notebook(cat_cols):
    train_labels = train[col].astype(str)
    test_labels = test[col].astype(str)
    train[col] = indexer[col].get_indexer(train_labels)
    test[col] = indexer[col].get_indexer(test_labels)
train = reduce_mem_usage(train, verbose=False)
test = reduce_mem_usage(test, verbose=False)
In [ ]:
# The per-column label indexes are no longer needed; free them.
del indexer
gc.collect()
In [ ]:
# Preview the encoded training frame.
train.head(n=5)
In [ ]:
# Separate the target and drop the machine id from both frames.
y = train['HasDetections']
train = train.drop(columns=['HasDetections', 'MachineIdentifier'])
test = test.drop(columns=['MachineIdentifier'])
gc.collect()
# Two consecutive training halves: rows [0, 4000000) become train1 / y1 and
# rows [4000000, 8000000) stay in train / y.
first_half_end = 4000000
second_half_end = 8000000
train1 = train.iloc[:first_half_end]
train = train.iloc[first_half_end:second_half_end]
y1 = y.iloc[:first_half_end]
y = y.iloc[first_half_end:second_half_end]
In [ ]:
# Shuffled, stratified 5-fold split with a fixed seed.
n_fold = 5
folds = StratifiedKFold(n_splits=n_fold, random_state=15, shuffle=True)
# folds = TimeSeriesSplit(n_splits=5)
In [ ]:
from numba import jit
# fast roc_auc computation: https://www.kaggle.com/c/microsoft-malware-prediction/discussion/76013
def fast_auc(y_true, y_prob):
    """Compute ROC AUC for binary labels with a single sort.

    Labels are ordered by ascending score; every positive contributes the
    number of negatives ranked before it, and the total is divided by
    (#negatives * #positives). Tied scores are not treated specially.

    Implemented with NumPy array operations, so it needs no JIT compilation
    and accepts lists, arrays or pandas Series.

    Parameters
    ----------
    y_true : array-like of 0/1 labels
    y_prob : array-like of scores, same length as ``y_true``

    Returns
    -------
    float
        The AUC, or NaN when it is undefined (empty input, or only one class
        present, which would otherwise divide by zero).
    """
    labels = np.asarray(y_true, dtype=np.float64)
    n = labels.shape[0]
    if n == 0:
        return float('nan')
    labels = labels[np.argsort(y_prob)]
    # negatives_before[i] = number of negatives among the i+1 lowest scores.
    negatives_before = np.cumsum(1.0 - labels)
    nfalse = negatives_before[-1]
    denominator = nfalse * (n - nfalse)
    if denominator == 0:
        return float('nan')
    return float(np.dot(labels, negatives_before) / denominator)
def eval_auc(preds, dtrain):
    """Custom eval function: return ('auc', score, True), the score coming from fast_auc."""
    score = fast_auc(dtrain.get_label(), preds)
    return 'auc', score, True
# idea from this kernel: https://www.kaggle.com/fabiendaniel/detecting-malwares-with-lgbm
def predict_chunk(model, test, chunk_size=1000000):
    """Predict ``test`` in consecutive row chunks to bound peak memory.

    Parameters
    ----------
    model : object with a ``predict`` method
        If the model has a ``best_iteration`` attribute it is forwarded as
        ``num_iteration``; models without that attribute are called with the
        data only.
    test : pandas.DataFrame
        Rows to score; sliced positionally with ``.iloc``.
    chunk_size : int
        Maximum number of rows per ``predict`` call (default 1,000,000).

    Returns
    -------
    numpy.ndarray
        One prediction per row of ``test``.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be at least 1')
    n_rows = test.shape[0]
    current_pred = np.zeros(n_rows)
    predict_kwargs = {}
    if hasattr(model, 'best_iteration'):
        predict_kwargs['num_iteration'] = model.best_iteration
    for start in range(0, n_rows, chunk_size):
        stop = min(start + chunk_size, n_rows)
        current_pred[start:stop] = model.predict(test.iloc[start:stop], **predict_kwargs)
    return current_pred
def train_model(X=train, X_test=test, y=y, params=None, folds=folds, model_type='lgb', plot_feature_importance=True, averaging='usual', make_oof=False):
    """Cross-validate one model type and average its test predictions over the folds.

    Note that the defaults for ``X``, ``X_test``, ``y`` and ``folds`` are the
    objects those module-level names referred to when this function was
    defined; pass them explicitly to use other data.

    Parameters
    ----------
    X, y : pandas.DataFrame, pandas.Series
        Training features and target (indexed positionally via ``.iloc``).
    X_test : pandas.DataFrame
        Rows to predict.
    params : dict, optional
        Parameters forwarded to the underlying library.
    folds : splitter with a ``split(X, y)`` method
    model_type : {'lgb', 'xgb', 'lcv', 'cat'}
        LightGBM, XGBoost, LogisticRegressionCV or CatBoost.
    plot_feature_importance : bool
        For 'lgb', log and plot the 50 most important features.
    averaging : {'usual', 'rank'}
        'usual' averages the fold predictions; 'rank' averages their ranks.
    make_oof : bool
        Also return the out-of-fold predictions.

    Returns
    -------
    dict
        'prediction' (averaged test prediction); 'oof' when ``make_oof``;
        'feature_importance' for 'lgb'.

    Raises
    ------
    ValueError
        For an unknown ``model_type`` or ``averaging``.
    """
    if model_type not in ('lgb', 'xgb', 'lcv', 'cat'):
        raise ValueError('Unknown model_type: {!r}'.format(model_type))
    if averaging not in ('usual', 'rank'):
        raise ValueError('Unknown averaging: {!r}'.format(averaging))
    result_dict = {}
    if make_oof:
        oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    scores = []
    feature_importance = pd.DataFrame()
    # Count the folds actually run: the averages below must match the splitter
    # that was passed in, not a module-level fold count.
    n_done = 0
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        gc.collect()
        print('Fold', fold_n + 1, 'started at', time.ctime())
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        if model_type == 'lgb':
            train_data = lgb.Dataset(X_train, label=y_train, categorical_feature = cat_cols)
            valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature = cat_cols)
            model = lgb.train(params,
                              train_data,
                              num_boost_round=10000,
                              valid_sets = [train_data, valid_data],
                              verbose_eval=100,
                              early_stopping_rounds = 500,
                              feval=eval_auc)
            del train_data, valid_data
            y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
            del X_valid
            gc.collect()
            y_pred = predict_chunk(model, X_test)
        elif model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid)
            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=20000, evals=watchlist, early_stopping_rounds=200, verbose_eval=500, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid), ntree_limit=model.best_ntree_limit)
            # predict_chunk slices with .iloc, which a DMatrix does not offer,
            # so the booster is called directly here.
            y_pred = model.predict(xgb.DMatrix(X_test), ntree_limit=model.best_ntree_limit)
        elif model_type == 'lcv':
            model = LogisticRegressionCV(scoring='roc_auc', cv=3)
            model.fit(X_train, y_train)
            # Positive-class probabilities, so AUC is scored on a ranking
            # rather than on hard 0/1 labels.
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(X_test)[:, 1]
        else:  # 'cat'
            # CatBoostClassifier is the CatBoost class this file imports.
            model = CatBoostClassifier(iterations=20000, eval_metric='AUC', **(params or {}))
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True, verbose=False)
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(X_test)[:, 1]
        if make_oof:
            oof[valid_index] = y_pred_valid.reshape(-1,)
        scores.append(fast_auc(y_valid, y_pred_valid))
        print('Fold roc_auc:', roc_auc_score(y_valid, y_pred_valid))
        print('')
        if averaging == 'usual':
            prediction += y_pred
        else:  # 'rank'
            prediction += pd.Series(y_pred).rank().values
        if model_type == 'lgb':
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X.columns
            fold_importance["importance"] = model.feature_importance()
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
        n_done += 1
    if n_done:
        prediction /= n_done
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    if model_type == 'lgb':
        if plot_feature_importance:
            # The per-fold importances are scaled by the fold count before the
            # per-feature mean is taken (kept from the original behaviour).
            feature_importance["importance"] /= n_done
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index
            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]
            logging.info('Top features')
            for f in best_features.sort_values(by="importance", ascending=False)['feature'].values:
                logging.info(f)
            plt.figure(figsize=(16, 12));
            sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False));
            plt.title('LGB Features (avg over folds)');
        result_dict['feature_importance'] = feature_importance
    result_dict['prediction'] = prediction
    if make_oof:
        result_dict['oof'] = oof
    return result_dict
In [ ]:
# LightGBM parameters shared by both training runs.
params = dict(
    num_leaves=128,
    min_data_in_leaf=60,
    objective='binary',
    max_depth=-1,
    learning_rate=0.1,
    boosting="gbdt",
    feature_fraction=0.8,
    bagging_freq=5,
    bagging_fraction=0.8,
    bagging_seed=11,
    lambda_l1=0.1,
    lambda_l2=0.1,
    random_state=42,
    verbosity=-1,
)
In [ ]:
# Cross-validated LightGBM run on train1 / y1, rank-averaged over folds.
result_dict1 = train_model(
    X=train1,
    X_test=test,
    y=y1,
    params=params,
    model_type='lgb',
    plot_feature_importance=True,
    averaging='rank',
)
In [ ]:
# Drop the names of the first training half before the second run.
del train1, y1
In [ ]:
# result_dict2 = train_model(X=train2, X_test=test, y=y2, params=params, model_type='lgb', plot_feature_importance=False, averaging='rank')
In [ ]:
# del train2, y2
In [ ]:
# Cross-validated LightGBM run on train / y, rank-averaged over folds.
result_dict = train_model(
    X=train,
    X_test=test,
    y=y,
    params=params,
    model_type='lgb',
    plot_feature_importance=True,
    averaging='rank',
)
In [ ]:
# Average the two runs' predictions and write the submission file.
submission = pd.read_csv('../input/microsoft-malware-prediction/sample_submission.csv')
run_predictions = [result_dict['prediction'], result_dict1['prediction']]
submission['HasDetections'] = (run_predictions[0] + run_predictions[1]) / 2
submission.to_csv('lgb_rank.csv', index=False)
I'll try blending my solution with the output of one of the top kernels. https://www.kaggle.com/roydatascience/new-blend
In [ ]:
import os
# Predictions of the external blend used for the final average.
external_blend_path = "../input/new-blend/super_blend.csv"
add_sub = pd.read_csv(external_blend_path)
In [ ]:
# Average the current submission values with the ranks of the external
# predictions, then write the blended file.
external_ranks = add_sub['HasDetections'].rank().values
submission['HasDetections'] = (submission['HasDetections'] + external_ranks) / 2
submission.to_csv('blend.csv', index=False)