In [19]:
# https://www.kaggle.com/mlg-ulb/creditcardfraud
# For example, here are several helpful packages to load.

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from imblearn.over_sampling import SMOTE

# plotting libraries
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import average_precision_score
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance, to_graphviz
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.


creditcard.csv


In [2]:
df1 = pd.read_csv("../input/creditcard.csv")

In [3]:
df1.head()


Out[3]:
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 0.098698 0.363787 ... -0.018307 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 149.62 0
1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 0.085102 -0.255425 ... -0.225775 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 2.69 0
2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 0.247676 -1.514654 ... 0.247998 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 378.66 0
3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 0.377436 -1.387024 ... -0.108300 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 123.50 0
4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 -0.270533 0.817739 ... -0.009431 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 69.99 0

5 rows × 31 columns


In [4]:
print(len(df1))
print(df1['Class'].sum())


284807
492

In [5]:
limit = len(df1)

def plotStrip(x, y, hue, figsize = (14, 9)):
    # Helper for strip plots of a feature split by class (defined here but not
    # used below). Keyword arguments to stripplot keep newer seaborn happy, and
    # the legend uses the real hue labels (the original hard-coded
    # 'Transfer'/'Cash out' labels were left over from a different dataset).
    fig = plt.figure(figsize = figsize)
    colours = plt.cm.tab10(np.linspace(0, 1, 9))
    with sns.axes_style('ticks'):
        ax = sns.stripplot(x = x, y = y, hue = hue, jitter = 0.4,
                           marker = '.', size = 4, palette = colours)
        ax.set_xlabel('')
        ax.set_xticklabels(['genuine', 'fraudulent'], size = 16)
        for axis in ['top', 'bottom', 'left', 'right']:
            ax.spines[axis].set_linewidth(2)

        handles, labels = ax.get_legend_handles_labels()
        plt.legend(handles, labels, bbox_to_anchor = (1, 1),
                   loc = 2, borderaxespad = 0, fontsize = 16);
    return ax

In [6]:
X = df1  # note: X is a reference to df1, so the column drop below mutates df1
Y = X['Class']
del X['Class']

In [7]:
print('skew = {}'.format(sum(Y) / float(len(X))))  # fraction of fraudulent transactions (~0.17%)


skew = 0.001727485630620034

In [8]:
randomState = 5
np.random.seed(randomState)
trainX, testX, trainY, testY = train_test_split(X, Y, test_size = 0.2, random_state = randomState)

In [63]:
# I got to this point with mostly manual tuning of XGBoost.
# scale_pos_weight = (#genuine / #fraudulent) counteracts the class imbalance.
weights = (Y == 0).sum() / (1.0 * (Y == 1).sum())
clf = XGBClassifier(learning_rate = 0.3, max_depth = 7, nthread = 6, scale_pos_weight = weights)
probabilities = clf.fit(trainX, trainY).predict_proba(testX)
print('AUPRC = {}'.format(average_precision_score(testY, probabilities[:, 1])))


AUPRC = 0.8572058473004869
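
The tuning above was manual; here is a minimal sketch of doing it more systematically with sklearn's GridSearchCV (the grid below is illustrative, not tuned, and this is slow on the full training set):

In [ ]:
from sklearn.model_selection import GridSearchCV

# Small illustrative grid; scoring uses AUPRC to match the evaluation above.
paramGrid = {'max_depth': [4, 7, 10], 'learning_rate': [0.1, 0.3]}
search = GridSearchCV(XGBClassifier(nthread = 6, scale_pos_weight = weights),
                      paramGrid, scoring = 'average_precision', cv = 3)
search.fit(trainX, trainY)
print(search.best_params_, search.best_score_)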

In [62]:
fig = plt.figure(figsize = (14, 9))
ax = fig.add_subplot(111)

colours = plt.cm.Set1(np.linspace(0, 1, 9))

# importance_type = 'cover' ranks features by the average number of samples
# affected by splits on each feature
ax = plot_importance(clf, height = 1, color = colours, grid = False, importance_type = 'cover', ax = ax);
for axis in ['top', 'bottom', 'left', 'right']:
    ax.spines[axis].set_linewidth(2)

ax.set_xlabel('importance score', size = 16);
ax.set_ylabel('features', size = 16);
ax.tick_params(axis = 'y', labelsize = 12);
ax.set_title('Ordering of features by importance to the model learnt', size = 20);



In [34]:
# Long computation in this cell (~6 minutes)

trainSizes, trainScores, crossValScores = learning_curve(
    XGBClassifier(max_depth = 7, scale_pos_weight = weights, nthread = 4),
    trainX, trainY, scoring = 'average_precision')

In [35]:
trainScoresMean = np.mean(trainScores, axis=1)
trainScoresStd = np.std(trainScores, axis=1)
crossValScoresMean = np.mean(crossValScores, axis=1)
crossValScoresStd = np.std(crossValScores, axis=1)

colours = plt.cm.tab10(np.linspace(0, 1, 9))

fig = plt.figure(figsize = (14, 9))
plt.fill_between(trainSizes, trainScoresMean - trainScoresStd,
    trainScoresMean + trainScoresStd, alpha=0.1, color=colours[0])
plt.fill_between(trainSizes, crossValScoresMean - crossValScoresStd,
    crossValScoresMean + crossValScoresStd, alpha=0.1, color=colours[1])
plt.plot(trainSizes, trainScoresMean, 'o-', label = 'train', color = colours[0])
plt.plot(trainSizes, crossValScoresMean, 'o-', label = 'cross-val', color = colours[1])

ax = plt.gca()
for axis in ['top','bottom','left','right']:
    ax.spines[axis].set_linewidth(2)

handles, labels = ax.get_legend_handles_labels()
plt.legend(handles, ['train', 'cross-val'], bbox_to_anchor=(0.8, 0.15), \
               loc=2, borderaxespad=0, fontsize = 16);
plt.xlabel('training set size', size = 16); 
plt.ylabel('AUPRC', size = 16)
plt.title('Learning curves indicate underfit model', size = 20);


Okay, not bad, but not great either. Next I'll try SMOTE to see whether oversampling the minority class gives better results.


In [36]:
# sampling_strategy = 1.0 oversamples the minority class until both classes are
# the same size (the old float 'ratio' argument is the deprecated spelling).
# Note SMOTE is applied to the training split only; the test set stays untouched.
sm = SMOTE(random_state = 12, sampling_strategy = 1.0)
x_train_res, y_train_res = sm.fit_resample(trainX, trainY)
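
A quick sanity check on the resampling (a sketch; fit_resample returns the labels as a NumPy array of 0/1):

In [ ]:
# With sampling_strategy = 1.0 the two classes should now be the same size.
print(np.bincount(y_train_res.astype(int)))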


In [45]:
# SMOTE hands back NumPy arrays instead of DataFrames, so convert the original
# splits to arrays as well for consistency (pandas' as_matrix() is deprecated;
# to_numpy() is its replacement).
trainXnp = trainX.to_numpy()
trainYnp = trainY.to_numpy()
testXnp = testX.to_numpy()
testYnp = testY.to_numpy()

In [38]:
#clf_rf = RandomForestClassifier(n_estimators=25, random_state=12)
#clf_rf.fit(x_train_res, y_train_res)
# Note: the resampled training set is already balanced, so keeping
# scale_pos_weight at the original imbalance ratio double-corrects for it.
weights = (Y == 0).sum() / (1.0 * (Y == 1).sum())
clf_xgb = XGBClassifier(max_depth = 7, scale_pos_weight = weights).fit(x_train_res, y_train_res)

In [43]:
# Evaluate the SMOTE-trained model on the untouched (original-distribution) test set.
probabilities = clf_xgb.predict_proba(testXnp)
print('AUPRC = {}'.format(average_precision_score(testYnp, probabilities[:, 1])))


AUPRC = 0.7765930974602815

In [46]:
# Learning curve on the original (un-resampled) training split again, this time
# with the NumPy versions of the arrays.
trainSizes, trainScores, crossValScores = learning_curve(
    XGBClassifier(max_depth = 7, scale_pos_weight = weights, nthread = 4),
    trainXnp, trainYnp, scoring = 'average_precision')

In [47]:
trainScoresMean = np.mean(trainScores, axis=1)
trainScoresStd = np.std(trainScores, axis=1)
crossValScoresMean = np.mean(crossValScores, axis=1)
crossValScoresStd = np.std(crossValScores, axis=1)

colours = plt.cm.tab10(np.linspace(0, 1, 9))

fig = plt.figure(figsize = (14, 9))
plt.fill_between(trainSizes, trainScoresMean - trainScoresStd,
    trainScoresMean + trainScoresStd, alpha=0.1, color=colours[0])
plt.fill_between(trainSizes, crossValScoresMean - crossValScoresStd,
    crossValScoresMean + crossValScoresStd, alpha=0.1, color=colours[1])
plt.plot(trainSizes, trainScoresMean, 'o-', label = 'train', color = colours[0])
plt.plot(trainSizes, crossValScoresMean, 'o-', label = 'cross-val', color = colours[1])

ax = plt.gca()
for axis in ['top','bottom','left','right']:
    ax.spines[axis].set_linewidth(2)

handles, labels = ax.get_legend_handles_labels()
plt.legend(handles, ['train', 'cross-val'], bbox_to_anchor=(0.8, 0.15), \
               loc=2, borderaxespad=0, fontsize = 16);
plt.xlabel('training set size', size = 16); 
plt.ylabel('AUPRC', size = 16)
plt.title('Learning curves indicate underfit model', size = 20);


So the oversampling + XGBoost combination actually performed worse than XGBoost alone. The synthetic minority samples were probably not perfectly representative, and XGBoost's scale_pos_weight handled the heavy imbalance better than it had any right to. I'm using the area under the precision-recall curve (AUPRC) instead of the ROC curve because the dataset is heavily skewed: classifying everything as non-fraudulent would score roughly 99.8% accuracy while telling us nothing useful.
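
To make that concrete, here is a sketch of the trivial all-negative baseline (using sklearn's accuracy_score; for a constant score the AUPRC collapses to the positive-class rate, roughly 0.0017 here):

In [ ]:
from sklearn.metrics import accuracy_score

# Predict "genuine" for every test transaction.
allNegative = np.zeros(len(testY), dtype = int)
print('accuracy = {}'.format(accuracy_score(testY, allNegative)))
print('AUPRC = {}'.format(average_precision_score(testY, allNegative)))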

Obviously this isn't the best we can do (it's just a mildly tuned XGBoost), but an AUPRC of about 0.86 isn't bad for an hour's work, and I mainly wanted to show that I know the difference between ROC and PR curves.
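
For completeness, a sketch comparing the two metrics on the most recent model's held-out probabilities (sklearn's roc_auc_score; on data this skewed, ROC-AUC typically sits far closer to 1 than AUPRC, because the flood of true negatives flatters it):

In [ ]:
from sklearn.metrics import roc_auc_score

print('ROC AUC = {}'.format(roc_auc_score(testY, probabilities[:, 1])))
print('AUPRC = {}'.format(average_precision_score(testY, probabilities[:, 1])))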