14 - Kaggle Competition

Fraud Detection

https://inclass.kaggle.com/c/easy-ml-class

version 0.1, May 2016

Part of the class Machine Learning for Security Informatics

This notebook is licensed under a [Creative Commons Attribution-ShareAlike 3.0 Unported License]

pip install tqdm

Fraud Detection



In [1]:

    
import pandas as pd
import zipfile
with zipfile.ZipFile('../datasets/fraud_transactions_kaggle.csv.zip', 'r') as z:
    f = z.open('fraud_transactions_kaggle.csv')
    data = pd.read_csv(f, index_col=0)



In [2]:

    
data.head()









    Out[2]:






  
    
      
      date
      card_number
      type
      merchant
      amount
      fraud
    
    
      ID
      
      
      
      
      
      
    
  
  
    
      0
      2011-01-01 08:00:06
      1942
      2
      8328
      65.16
      0.0
    
    
      1
      2011-01-01 08:00:16
      5629
      2
      42588
      260.84
      0.0
    
    
      2
      2011-01-01 08:01:28
      408
      2
      15622
      6010.05
      0.0
    
    
      3
      2011-01-01 08:01:43
      859
      2
      45192
      348.46
      0.0
    
    
      4
      2011-01-01 08:01:48
      3786
      2
      35549
      1160.35
      0.0



In [3]:

    
data.tail()









    Out[3]:






  
    
      
      date
      card_number
      type
      merchant
      amount
      fraud
    
    
      ID
      
      
      
      
      
      
    
  
  
    
      199995
      2012-12-31 17:04:18
      4069
      2
      35828
      91.22
      NaN
    
    
      199996
      2012-12-31 17:04:51
      9
      2
      46923
      390.95
      NaN
    
    
      199997
      2012-12-31 17:05:38
      1481
      1
      -1
      0.65
      NaN
    
    
      199998
      2012-12-31 17:05:55
      1481
      1
      4535
      390.04
      NaN
    
    
      199999
      2012-12-31 17:25:02
      0
      1
      8322
      308.44
      NaN



In [4]:

    
data.fraud.value_counts(dropna=False)









    Out[4]:





 0.0    171048
NaN      27909
 1.0      1043
Name: fraud, dtype: int64

Estimate aggregated features



In [5]:

    
from datetime import datetime, timedelta
from tqdm import tqdm

Split for each account and create the date as index



In [6]:

    
card_numbers = data['card_number'].unique()
data['trx_id'] = data.index
data.index = pd.DatetimeIndex(data['date'])

data_ = []
for card_number in tqdm(card_numbers):
    data_.append(data.query('card_number == ' + str(card_number)))









    



100%|██████████| 8087/8087 [00:20<00:00, 390.15it/s]

Create Aggregated Features for one account



In [7]:

    
res_agg = pd.DataFrame(index=data['trx_id'].values, 
                       columns=['Trx_sum_7D', 'Trx_count_1D'])



In [8]:

    
trx = data_[0]

for i in range(trx.shape[0]):
    date = trx.index[i]
    trx_id = int(trx.ix[i, 'trx_id'])
    # Sum 7 D
    agg_ = trx[date-pd.datetools.to_offset('7D').delta:date-timedelta(0,0,1)]
    res_agg.loc[trx_id, 'Trx_sum_7D'] = agg_['amount'].sum()
    # Count 1D
    agg_ = trx[date-pd.datetools.to_offset('1D').delta:date-timedelta(0,0,1)]
    res_agg.loc[trx_id, 'Trx_count_1D'] = agg_['amount'].shape[0]



In [9]:

    
res_agg.mean()









    Out[9]:





Trx_sum_7D      1054.881429
Trx_count_1D       0.640693
dtype: float64

All accounts



In [10]:

    
for trx in tqdm(data_):
    for i in range(trx.shape[0]):
        date = trx.index[i]
        trx_id = int(trx.ix[i, 'trx_id'])
        # Sum 7 D
        agg_ = trx[date-pd.datetools.to_offset('7D').delta:date-timedelta(0,0,1)]
        res_agg.loc[trx_id, 'Trx_sum_7D'] = agg_['amount'].sum()
        # Count 1D
        agg_ = trx[date-pd.datetools.to_offset('1D').delta:date-timedelta(0,0,1)]
        res_agg.loc[trx_id, 'Trx_count_1D'] = agg_['amount'].shape[0]









    



100%|██████████| 8087/8087 [04:26<00:00, 30.33it/s]



In [13]:

    
res_agg.head()









    Out[13]:






  
    
      
      Trx_sum_7D
      Trx_count_1D
    
  
  
    
      0
      0
      0
    
    
      1
      0
      0
    
    
      2
      0
      0
    
    
      3
      0
      0
    
    
      4
      0
      0



In [15]:

    
data.index = data.trx_id



In [16]:

    
data = data.join(res_agg)



In [19]:

    
data.sample(15, random_state=42).sort_index()









    Out[19]:






  
    
      
      date
      card_number
      type
      merchant
      amount
      fraud
      trx_id
      Trx_sum_7D
      Trx_count_1D
    
    
      trx_id
      
      
      
      
      
      
      
      
      
    
  
  
    
      4082
      2011-01-16 16:26:53
      3558
      2
      13505
      528.82
      0.0
      4082
      307.85
      0
    
    
      23677
      2011-04-04 08:13:41
      1162
      2
      9417
      117.29
      0.0
      23677
      0
      0
    
    
      30074
      2011-04-29 13:09:07
      0
      1
      56997
      21.29
      0.0
      30074
      14171.9
      2
    
    
      65426
      2011-09-09 10:11:24
      4420
      2
      57849
      29.70
      0.0
      65426
      0
      0
    
    
      72272
      2011-10-04 10:43:00
      2114
      2
      5109
      2170.65
      0.0
      72272
      131020
      7
    
    
      74456
      2011-10-11 17:17:22
      2148
      2
      1341
      2150.19
      0.0
      74456
      0
      0
    
    
      84660
      2011-11-19 17:06:58
      1521
      1
      35294
      651.59
      0.0
      84660
      0
      0
    
    
      117167
      2012-04-01 12:33:33
      1471
      1
      -1
      650.94
      0.0
      117167
      4381.21
      1
    
    
      119737
      2012-04-09 14:27:12
      2723
      1
      38616
      13.03
      0.0
      119737
      13614.2
      0
    
    
      132467
      2012-05-27 16:43:11
      4857
      2
      45373
      41.70
      0.0
      132467
      634.13
      10
    
    
      134858
      2012-06-03 17:05:21
      2114
      1
      18692
      26.06
      0.0
      134858
      175202
      7
    
    
      142133
      2012-06-29 16:21:37
      7588
      2
      35991
      92.53
      0.0
      142133
      1151.21
      4
    
    
      158154
      2012-08-20 10:55:23
      4420
      2
      53353
      182.65
      0.0
      158154
      121.77
      0
    
    
      176418
      2012-10-16 14:23:04
      1595
      2
      25985
      15397.58
      NaN
      176418
      0
      0
    
    
      186433
      2012-11-20 11:04:00
      4923
      2
      36010
      217.89
      NaN
      186433
      573.4
      0

Split train and test



In [20]:

    
X = data.loc[~data.fraud.isnull()]



In [21]:

    
y = X.fraud



In [22]:

    
X = X.drop(['fraud', 'date', 'card_number'], axis=1)



In [28]:

    
X_kaggle = data.loc[data.fraud.isnull()]
X_kaggle = X_kaggle.drop(['fraud', 'date', 'card_number'], axis=1)



In [29]:

    
X_kaggle.head()









    Out[29]:






  
    
      
      type
      merchant
      amount
      trx_id
      Trx_sum_7D
      Trx_count_1D
    
    
      trx_id
      
      
      
      
      
      
    
  
  
    
      172091
      2
      13273
      208.51
      172091
      120165
      14
    
    
      172092
      2
      34472
      525.05
      172092
      71042.4
      0
    
    
      172093
      2
      37909
      802.24
      172093
      120374
      15
    
    
      172094
      2
      35167
      130.32
      172094
      90638.1
      9
    
    
      172095
      2
      35073
      9696.96
      172095
      0
      0

Simple Random Forest



In [25]:

    
from sklearn.ensemble import RandomForestClassifier



In [26]:

    
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, class_weight='balanced')



In [27]:

    
from sklearn.metrics import fbeta_score

KFold cross-validation



In [31]:

    
from sklearn.cross_validation import KFold



In [35]:

    
kf = KFold(X.shape[0], n_folds=5)
res = []
for train, test in kf:
    X_train, X_test, y_train, y_test = X.iloc[train], X.iloc[test], y.iloc[train], y.iloc[test]
    clf.fit(X_train, y_train)
    y_pred_proba = clf.predict_proba(X_test)[:, 1]
    y_pred = (y_pred_proba>0.05).astype(int)
    res.append(fbeta_score(y_test, y_pred, beta=2))



In [37]:

    
pd.Series(res).describe()









    Out[37]:





count    5.000000
mean     0.078145
std      0.032472
min      0.054945
25%      0.057692
50%      0.062500
75%      0.082713
max      0.132877
dtype: float64

Train with all

Predict and send to Kaggle



In [38]:

    
clf.fit(X, y)









    Out[38]:





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)



In [40]:

    
y_pred = clf.predict_proba(X_kaggle)[:, 1]



In [41]:

    
y_pred = (y_pred>0.05).astype(int)



In [42]:

    
y_pred = pd.Series(y_pred,name='fraud', index=X_kaggle.index)



In [43]:

    
y_pred.head(10)









    Out[43]:





trx_id
172091    0
172092    1
172093    1
172094    0
172095    1
172096    1
172097    1
172098    0
172099    1
172100    0
Name: fraud, dtype: int64



In [108]:

    
y_pred.to_csv('fraud_transactions_kaggle_1.csv', header=True, index_label='ID')

Main Issues

Class imbalance
Feature creation
Model selection
Threshold selection

	date	card_number	type	merchant	amount	fraud
ID
0	2011-01-01 08:00:06	1942	2	8328	65.16	0.0
1	2011-01-01 08:00:16	5629	2	42588	260.84	0.0
2	2011-01-01 08:01:28	408	2	15622	6010.05	0.0
3	2011-01-01 08:01:43	859	2	45192	348.46	0.0
4	2011-01-01 08:01:48	3786	2	35549	1160.35	0.0

	date	card_number	type	merchant	amount	fraud
ID
199995	2012-12-31 17:04:18	4069	2	35828	91.22	NaN
199996	2012-12-31 17:04:51	9	2	46923	390.95	NaN
199997	2012-12-31 17:05:38	1481	1	-1	0.65	NaN
199998	2012-12-31 17:05:55	1481	1	4535	390.04	NaN
199999	2012-12-31 17:25:02	0	1	8322	308.44	NaN

	date	card_number	type	merchant	amount	fraud	trx_id	Trx_sum_7D	Trx_count_1D
trx_id
4082	2011-01-16 16:26:53	3558	2	13505	528.82	0.0	4082	307.85	0
23677	2011-04-04 08:13:41	1162	2	9417	117.29	0.0	23677	0	0
30074	2011-04-29 13:09:07	0	1	56997	21.29	0.0	30074	14171.9	2
65426	2011-09-09 10:11:24	4420	2	57849	29.70	0.0	65426	0	0
72272	2011-10-04 10:43:00	2114	2	5109	2170.65	0.0	72272	131020	7
74456	2011-10-11 17:17:22	2148	2	1341	2150.19	0.0	74456	0	0
84660	2011-11-19 17:06:58	1521	1	35294	651.59	0.0	84660	0	0
117167	2012-04-01 12:33:33	1471	1	-1	650.94	0.0	117167	4381.21	1
119737	2012-04-09 14:27:12	2723	1	38616	13.03	0.0	119737	13614.2	0
132467	2012-05-27 16:43:11	4857	2	45373	41.70	0.0	132467	634.13	10
134858	2012-06-03 17:05:21	2114	1	18692	26.06	0.0	134858	175202	7
142133	2012-06-29 16:21:37	7588	2	35991	92.53	0.0	142133	1151.21	4
158154	2012-08-20 10:55:23	4420	2	53353	182.65	0.0	158154	121.77	0
176418	2012-10-16 14:23:04	1595	2	25985	15397.58	NaN	176418	0	0
186433	2012-11-20 11:04:00	4923	2	36010	217.89	NaN	186433	573.4	0

	type	merchant	amount	trx_id	Trx_sum_7D	Trx_count_1D
trx_id
172091	2	13273	208.51	172091	120165	14
172092	2	34472	525.05	172092	71042.4	0
172093	2	37909	802.24	172093	120374	15
172094	2	35167	130.32	172094	90638.1	9
172095	2	35073	9696.96	172095	0	0