xdeepfm-baseline



In [1]:
# Download ctrNet-tool (source: https://github.com/guoday/ctrNet-tool) and unpack it
# into the working directory so that `ctrNet.py`, `models/` and `src/` sit next to
# this notebook and can be imported directly.
# Only the checked-out files are kept, so a shallow clone is enough.
!git clone --depth 1 https://github.com/guoday/ctrNet-tool.git
!cp -r ctrNet-tool/* ./
# `ctrNet-tool/*` does not match dotfiles, so ./.git is normally absent here;
# -f keeps rm from reporting an error for paths that do not exist.
!rm -rf ctrNet-tool data .git
!ls -al


Cloning into 'ctrNet-tool'...
remote: Enumerating objects: 153, done.
remote: Counting objects: 100% (153/153), done.
remote: Compressing objects: 100% (114/114), done.
remote: Total 153 (delta 77), reused 86 (delta 36), pack-reused 0
Receiving objects: 100% (153/153), 8.41 MiB | 0 bytes/s, done.
Resolving deltas: 100% (77/77), done.
rm: cannot remove '.git': No such file or directory
total 40
drwxr-xr-x 4 root root  4096 Feb 17 18:50 .
drwxr-xr-x 6 root root  4096 Feb 17 18:50 ..
-rw-r--r-- 1 root root  2268 Feb 17 18:50 README.md
-rw-r--r-- 1 root root 11707 Feb 17 18:50 __notebook__.ipynb
-rw-r--r-- 1 root root   271 Feb 17 18:50 __output__.json
-rw-r--r-- 1 root root   772 Feb 17 18:50 ctrNet.py
drwxr-xr-x 2 root root  4096 Feb 17 18:50 models
drwxr-xr-x 2 root root  4096 Feb 17 18:50 src

In [2]:
import ctrNet
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from src import misc_utils as utils
import os
import gc
import random

Loading Dataset


In [3]:
# Column name -> dtype mapping handed to `pd.read_csv` for both train.csv and
# test.csv, so that every column is parsed directly into a compact dtype
# ('category' for string-like columns, narrow int/float types for numeric ones).
# NOTE(review): float16 represents integers exactly only up to 2048 and int8 tops
# out at 127. Several identifier-like columns use these dtypes (for example
# 'Census_OEMNameIdentifier', 'Census_ProcessorModelIdentifier',
# 'LocaleEnglishNameIdentifier') -- confirm against the data that their values
# stay within range, otherwise distinct ids may collapse or overflow.
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }
print('Loading Train and Test Data.\n')


def _load_split(csv_path):
    """Read one competition CSV and replace its MachineIdentifier by the row number.

    The file is parsed with the shared `dtypes` mapping; the original
    MachineIdentifier values are then overwritten with the frame's index cast to
    uint32, so the column keeps its position but holds a compact integer id.
    """
    frame = pd.read_csv(csv_path, dtype=dtypes, low_memory=True)
    frame['MachineIdentifier'] = frame.index.astype('uint32')
    return frame


train = _load_split('../input/train.csv')
test = _load_split('../input/test.csv')
# The test frame gets an all-zero HasDetections column so that it exposes the same
# label column name as the training frame.
test['HasDetections'] = [0] * len(test)


Loading Train and Test Data.


In [4]:
def make_bucket(data,num=10):
    """Compute `num` quantile-style bin edges from `data`.

    The values are sorted in ascending order and the i-th edge (i = 1..num) is the
    element at position ``len(data) * i // num - 1`` of the sorted values, so the
    last edge is always the maximum.

    Args:
        data: iterable of mutually comparable values. It is not modified; a sorted
            copy is used internally.
        num: number of edges to return.

    Returns:
        A list of `num` edges in non-decreasing order.

    Raises:
        IndexError: if `data` is empty while `num` is positive.
    """
    ordered = sorted(data)
    size = len(ordered)
    # When there are fewer values than requested edges the raw position is -1 for
    # the first few edges; clamp it to 0 so it cannot wrap around to the maximum
    # and break the non-decreasing order of the result.
    return [ordered[max(size * (i + 1) // num - 1, 0)] for i in range(num)]
# Continuous capacity columns that are replaced by a bucket number.
float_features = [
    'Census_SystemVolumeTotalCapacity',
    'Census_PrimaryDiskTotalCapacity',
]
for f in float_features:
    # Missing values are replaced by the sentinel 1e10 before bucketing.
    for frame in (train, test):
        frame[f] = frame[f].fillna(1e10)
    # The bin edges are computed over train and test values together, so both
    # frames are discretised with the same 50 edges.
    data = list(train[f]) + list(test[f])
    bins = make_bucket(data, num=50)
    for frame in (train, test):
        frame[f] = np.digitize(frame[f], bins=bins)

# Hold out 2% of the rows as `dev`. The label column is passed as a second array
# and its two split halves are discarded.
train, dev, _, _ = train_test_split(
    train,
    train['HasDetections'],
    test_size=0.02,
    random_state=2019,
)
# Model inputs: every column except the first and the last one.
all_columns = train.columns.tolist()
features = all_columns[1:-1]

Creating hparams


In [5]:
# Hyper-parameters for the ctrNet model, collected in a plain dict first and then
# wrapped into an HParams object; `ctrNet.build_model` receives that object later.
hparam_settings = {
    # model selection and network layout
    'model': 'xdeepfm',
    'norm': True,
    'batch_norm_decay': 0.9,
    'hidden_size': [128, 128],
    'cross_layer_sizes': [128, 128, 128],
    'k': 8,
    'hash_ids': int(2e5),
    # optimisation
    'batch_size': 1024,
    'optimizer': "adam",
    'learning_rate': 0.001,
    # logging / evaluation cadence and schedule
    'num_display_steps': 1000,
    'num_eval_steps': 1000,
    'epoch': 1,
    'metric': 'auc',
    # activations and initialisation
    'activation': ['relu', 'relu', 'relu'],
    'cross_activation': 'identity',
    'init_method': 'uniform',
    'init_value': 0.1,
    # derived from the data / used by the cross-validation loop
    'feature_nums': len(features),
    'kfold': 5,
}
hparam = tf.contrib.training.HParams(**hparam_settings)
utils.print_hparams(hparam)


  activation=['relu', 'relu', 'relu']
  batch_norm_decay=0.9
  batch_size=1024
  cross_activation=identity
  cross_layer_sizes=[128, 128, 128]
  epoch=1
  feature_nums=81
  hash_ids=200000
  hidden_size=[128, 128]
  init_method=uniform
  init_value=0.1
  k=8
  kfold=5
  learning_rate=0.001
  metric=auc
  model=xdeepfm
  norm=True
  num_display_steps=1000
  num_eval_steps=1000
  optimizer=adam

Training model


In [6]:
# --- Build the K folds -------------------------------------------------------
# A dedicated, seeded generator makes the fold assignment reproducible without
# touching the global `random` state (model weight initialisation is not
# affected by this seed).
fold_rng = random.Random(2019)

# Shuffle all row positions once and cut the permutation into `kfold` consecutive
# slices. This yields the same fold sizes as repeatedly sampling from a shrinking
# set, but avoids calling random.sample() on a set (a TypeError on Python >= 3.11)
# and avoids repeated set subtraction over millions of indices.
n_rows = train.shape[0]
shuffled_rows = list(range(n_rows))
fold_rng.shuffle(shuffled_rows)
fold_size = int(1.0/hparam.kfold*n_rows)

K_fold=[]
for i in range(hparam.kfold):
    start = i*fold_size
    # The last fold also takes the rows left over by the integer division.
    stop = n_rows if i == hparam.kfold-1 else start+fold_size
    fold_rows = shuffled_rows[start:stop]
    print("Number:",len(fold_rows))
    K_fold.append(fold_rows)


# --- Train one model per fold and average the test predictions ----------------
preds = None
test_data = (test[features], test['HasDetections'])
for i in range(hparam.kfold):
    print("Fold",i)
    # Evaluate during training on a random 10% of the held-out fold only.
    dev_index = fold_rng.sample(K_fold[i], int(0.1*len(K_fold[i])))
    # Train on the rows of every other fold.
    train_index=[]
    for j in range(hparam.kfold):
        if j!=i:
            train_index.extend(K_fold[j])
    model=ctrNet.build_model(hparam)
    train_rows = train.iloc[train_index]
    dev_rows = train.iloc[dev_index]
    model.train(train_data=(train_rows[features], train_rows['HasDetections']),
                dev_data=(dev_rows[features], dev_rows['HasDetections']))
    print("Training Done! Inference...")
    # Each fold contributes 1/kfold of the final prediction.
    fold_preds = model.infer(dev_data=test_data)/hparam.kfold
    preds = fold_preds if preds is None else preds + fold_preds


Number: 1748610
Number: 1748610
Number: 1748610
Number: 1748610
Number: 1748613
Fold 0
# Trainable variables
  emb_v1:0, (200000, 1), 
  emb_v2:0, (200000, 8), 
  Variable:0, (648, 128), 
  norm_0/beta:0, (128,), 
  norm_0/gamma:0, (128,), 
  Variable_1:0, (128, 128), 
  norm_1/beta:0, (128,), 
  norm_1/gamma:0, (128,), 
  Variable_2:0, (128, 1), 
  exfm_part/f_0:0, (1, 6561, 128), 
  exfm_part/f_1:0, (1, 5184, 128), 
  exfm_part/f_2:0, (1, 5184, 128), 
  exfm_part/w_nn_output:0, (256, 1), 
  exfm_part/b_nn_output:0, (1,), 
  epoch 0 step 1000 lr 0.001 logloss 0.623389 gN 0.30, Sun Feb 17 19:03:26 2019
# Epcho-time 334.06s Eval AUC 0.719803. Best AUC 0.719803.
  epoch 0 step 2000 lr 0.001 logloss 0.607780 gN 0.23, Sun Feb 17 19:09:33 2019
# Epcho-time 700.82s Eval AUC 0.728620. Best AUC 0.728620.
  epoch 0 step 3000 lr 0.001 logloss 0.602428 gN 0.22, Sun Feb 17 19:15:39 2019
# Epcho-time 1067.16s Eval AUC 0.732318. Best AUC 0.732318.
  epoch 0 step 4000 lr 0.001 logloss 0.599856 gN 0.21, Sun Feb 17 19:21:46 2019
# Epcho-time 1433.89s Eval AUC 0.734359. Best AUC 0.734359.
  epoch 0 step 5000 lr 0.001 logloss 0.598311 gN 0.20, Sun Feb 17 19:27:51 2019
# Epcho-time 1799.32s Eval AUC 0.736238. Best AUC 0.736238.
  epoch 0 step 6000 lr 0.001 logloss 0.598146 gN 0.19, Sun Feb 17 19:33:58 2019
# Epcho-time 2166.09s Eval AUC 0.736650. Best AUC 0.736650.
# Epcho-time 2476.73s Eval AUC 0.738504. Best AUC 0.738504.
INFO:tensorflow:Restoring parameters from model_tmp/model
# Epcho-time 2511.79s Eval AUC 0.738510. Best AUC 0.738510.
Training Done! Inference...
Fold 1
# Trainable variables
  emb_v1:0, (200000, 1), 
  emb_v2:0, (200000, 8), 
  Variable:0, (648, 128), 
  norm_0/beta:0, (128,), 
  norm_0/gamma:0, (128,), 
  Variable_1:0, (128, 128), 
  norm_1/beta:0, (128,), 
  norm_1/gamma:0, (128,), 
  Variable_2:0, (128, 1), 
  exfm_part/f_0:0, (1, 6561, 128), 
  exfm_part/f_1:0, (1, 5184, 128), 
  exfm_part/f_2:0, (1, 5184, 128), 
  exfm_part/w_nn_output:0, (256, 1), 
  exfm_part/b_nn_output:0, (1,), 
  epoch 0 step 1000 lr 0.001 logloss 0.622524 gN 0.29, Sun Feb 17 20:13:05 2019
# Epcho-time 333.64s Eval AUC 0.718071. Best AUC 0.718071.
  epoch 0 step 2000 lr 0.001 logloss 0.606822 gN 0.23, Sun Feb 17 20:19:16 2019
# Epcho-time 704.54s Eval AUC 0.727319. Best AUC 0.727319.
  epoch 0 step 3000 lr 0.001 logloss 0.602220 gN 0.21, Sun Feb 17 20:25:24 2019
# Epcho-time 1073.12s Eval AUC 0.730443. Best AUC 0.730443.
  epoch 0 step 4000 lr 0.001 logloss 0.599879 gN 0.20, Sun Feb 17 20:31:34 2019
# Epcho-time 1443.31s Eval AUC 0.733629. Best AUC 0.733629.
  epoch 0 step 5000 lr 0.001 logloss 0.598172 gN 0.19, Sun Feb 17 20:37:41 2019
# Epcho-time 1810.29s Eval AUC 0.735065. Best AUC 0.735065.
  epoch 0 step 6000 lr 0.001 logloss 0.598176 gN 0.19, Sun Feb 17 20:43:46 2019
# Epcho-time 2174.49s Eval AUC 0.735806. Best AUC 0.735806.
# Epcho-time 2481.74s Eval AUC 0.737359. Best AUC 0.737359.
INFO:tensorflow:Restoring parameters from model_tmp/model
# Epcho-time 2515.76s Eval AUC 0.737365. Best AUC 0.737365.
Training Done! Inference...
Fold 2
# Trainable variables
  emb_v1:0, (200000, 1), 
  emb_v2:0, (200000, 8), 
  Variable:0, (648, 128), 
  norm_0/beta:0, (128,), 
  norm_0/gamma:0, (128,), 
  Variable_1:0, (128, 128), 
  norm_1/beta:0, (128,), 
  norm_1/gamma:0, (128,), 
  Variable_2:0, (128, 1), 
  exfm_part/f_0:0, (1, 6561, 128), 
  exfm_part/f_1:0, (1, 5184, 128), 
  exfm_part/f_2:0, (1, 5184, 128), 
  exfm_part/w_nn_output:0, (256, 1), 
  exfm_part/b_nn_output:0, (1,), 
  epoch 0 step 1000 lr 0.001 logloss 0.622527 gN 0.30, Sun Feb 17 21:21:11 2019
# Epcho-time 326.99s Eval AUC 0.720199. Best AUC 0.720199.
  epoch 0 step 2000 lr 0.001 logloss 0.606505 gN 0.23, Sun Feb 17 21:27:11 2019
# Epcho-time 687.31s Eval AUC 0.727623. Best AUC 0.727623.
  epoch 0 step 3000 lr 0.001 logloss 0.602725 gN 0.21, Sun Feb 17 21:33:15 2019
# Epcho-time 1051.09s Eval AUC 0.731835. Best AUC 0.731835.
  epoch 0 step 4000 lr 0.001 logloss 0.600107 gN 0.20, Sun Feb 17 21:39:13 2019
# Epcho-time 1409.39s Eval AUC 0.733906. Best AUC 0.733906.
  epoch 0 step 5000 lr 0.001 logloss 0.598279 gN 0.20, Sun Feb 17 21:45:11 2019
# Epcho-time 1767.28s Eval AUC 0.735214. Best AUC 0.735214.
  epoch 0 step 6000 lr 0.001 logloss 0.598155 gN 0.19, Sun Feb 17 21:51:17 2019
# Epcho-time 2132.57s Eval AUC 0.735627. Best AUC 0.735627.
# Epcho-time 2446.68s Eval AUC 0.737078. Best AUC 0.737078.
INFO:tensorflow:Restoring parameters from model_tmp/model
# Epcho-time 2480.48s Eval AUC 0.737082. Best AUC 0.737082.
Training Done! Inference...
Fold 3
# Trainable variables
  emb_v1:0, (200000, 1), 
  emb_v2:0, (200000, 8), 
  Variable:0, (648, 128), 
  norm_0/beta:0, (128,), 
  norm_0/gamma:0, (128,), 
  Variable_1:0, (128, 128), 
  norm_1/beta:0, (128,), 
  norm_1/gamma:0, (128,), 
  Variable_2:0, (128, 1), 
  exfm_part/f_0:0, (1, 6561, 128), 
  exfm_part/f_1:0, (1, 5184, 128), 
  exfm_part/f_2:0, (1, 5184, 128), 
  exfm_part/w_nn_output:0, (256, 1), 
  exfm_part/b_nn_output:0, (1,), 
  epoch 0 step 1000 lr 0.001 logloss 0.621309 gN 0.29, Sun Feb 17 22:28:59 2019
# Epcho-time 323.99s Eval AUC 0.721527. Best AUC 0.721527.
  epoch 0 step 2000 lr 0.001 logloss 0.605902 gN 0.23, Sun Feb 17 22:35:00 2019
# Epcho-time 684.81s Eval AUC 0.728373. Best AUC 0.728373.
  epoch 0 step 3000 lr 0.001 logloss 0.602622 gN 0.21, Sun Feb 17 22:40:57 2019
# Epcho-time 1041.68s Eval AUC 0.732476. Best AUC 0.732476.
  epoch 0 step 4000 lr 0.001 logloss 0.600202 gN 0.20, Sun Feb 17 22:46:53 2019
# Epcho-time 1398.34s Eval AUC 0.734133. Best AUC 0.734133.
  epoch 0 step 5000 lr 0.001 logloss 0.598160 gN 0.20, Sun Feb 17 22:52:49 2019
# Epcho-time 1754.60s Eval AUC 0.735932. Best AUC 0.735932.
  epoch 0 step 6000 lr 0.001 logloss 0.598076 gN 0.19, Sun Feb 17 22:58:47 2019
# Epcho-time 2112.40s Eval AUC 0.736718. Best AUC 0.736718.
# Epcho-time 2420.66s Eval AUC 0.738356. Best AUC 0.738356.
INFO:tensorflow:Restoring parameters from model_tmp/model
# Epcho-time 2455.45s Eval AUC 0.738354. Best AUC 0.738356.
Training Done! Inference...
Fold 4
# Trainable variables
  emb_v1:0, (200000, 1), 
  emb_v2:0, (200000, 8), 
  Variable:0, (648, 128), 
  norm_0/beta:0, (128,), 
  norm_0/gamma:0, (128,), 
  Variable_1:0, (128, 128), 
  norm_1/beta:0, (128,), 
  norm_1/gamma:0, (128,), 
  Variable_2:0, (128, 1), 
  exfm_part/f_0:0, (1, 6561, 128), 
  exfm_part/f_1:0, (1, 5184, 128), 
  exfm_part/f_2:0, (1, 5184, 128), 
  exfm_part/w_nn_output:0, (256, 1), 
  exfm_part/b_nn_output:0, (1,), 
  epoch 0 step 1000 lr 0.001 logloss 0.623812 gN 0.31, Sun Feb 17 23:36:38 2019
# Epcho-time 325.19s Eval AUC 0.721055. Best AUC 0.721055.
  epoch 0 step 2000 lr 0.001 logloss 0.606656 gN 0.24, Sun Feb 17 23:42:32 2019
# Epcho-time 678.98s Eval AUC 0.729268. Best AUC 0.729268.
  epoch 0 step 3000 lr 0.001 logloss 0.602976 gN 0.22, Sun Feb 17 23:48:26 2019
# Epcho-time 1033.59s Eval AUC 0.734373. Best AUC 0.734373.
  epoch 0 step 4000 lr 0.001 logloss 0.600411 gN 0.21, Sun Feb 17 23:54:21 2019
# Epcho-time 1388.40s Eval AUC 0.735906. Best AUC 0.735906.
  epoch 0 step 5000 lr 0.001 logloss 0.598301 gN 0.20, Mon Feb 18 00:00:22 2019
# Epcho-time 1749.56s Eval AUC 0.738506. Best AUC 0.738506.
  epoch 0 step 6000 lr 0.001 logloss 0.597047 gN 0.19, Mon Feb 18 00:06:20 2019
# Epcho-time 2107.55s Eval AUC 0.738437. Best AUC 0.738506.
# Epcho-time 2407.95s Eval AUC 0.738910. Best AUC 0.738910.
INFO:tensorflow:Restoring parameters from model_tmp/model
# Epcho-time 2440.96s Eval AUC 0.738909. Best AUC 0.738910.
Training Done! Inference...

Inference


In [7]:
# Take the sample submission as a template, overwrite its HasDetections column
# with the fold-averaged test predictions, preview the first rows and save it.
sample_path = '../input/sample_submission.csv'
output_path = 'nffm_submission.csv'

submission = pd.read_csv(sample_path)
submission['HasDetections'] = preds
preview = submission['HasDetections'].head()
print(preview)
submission.to_csv(output_path, index=False)


0    0.509475
1    0.612248
2    0.469438
3    0.326553
4    0.489298
Name: HasDetections, dtype: float32