notebook.community

Edit and run

Using this credit card fraud dataset develop an algorithm to predict fraud. Prioritize correctly finding fraud rather than correctly labeling non-fraudulent transactions.



In [96]:

    
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import math
from matplotlib.mlab import PCA as mlabPCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
import seaborn as sns
import scipy.stats as stats
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, KFold
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.decomposition import PCA as sklearn_pca
import locale
from locale import atof
import warnings
from IPython.display import display
from sklearn import linear_model
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_selection import f_regression
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import xlrd
from sklearn import ensemble
import time
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error



In [97]:

    
# Read and import data
creditcard = pd.read_csv('creditcard.csv')
creditcard.head()









    Out[97]:







  
    
      
      Time
      V1
      V2
      V3
      V4
      V5
      V6
      V7
      V8
      V9
      ...
      V21
      V22
      V23
      V24
      V25
      V26
      V27
      V28
      Amount
      Class
    
  
  
    
      0
      0.0
      -1.359807
      -0.072781
      2.536347
      1.378155
      -0.338321
      0.462388
      0.239599
      0.098698
      0.363787
      ...
      -0.018307
      0.277838
      -0.110474
      0.066928
      0.128539
      -0.189115
      0.133558
      -0.021053
      149.62
      0
    
    
      1
      0.0
      1.191857
      0.266151
      0.166480
      0.448154
      0.060018
      -0.082361
      -0.078803
      0.085102
      -0.255425
      ...
      -0.225775
      -0.638672
      0.101288
      -0.339846
      0.167170
      0.125895
      -0.008983
      0.014724
      2.69
      0
    
    
      2
      1.0
      -1.358354
      -1.340163
      1.773209
      0.379780
      -0.503198
      1.800499
      0.791461
      0.247676
      -1.514654
      ...
      0.247998
      0.771679
      0.909412
      -0.689281
      -0.327642
      -0.139097
      -0.055353
      -0.059752
      378.66
      0
    
    
      3
      1.0
      -0.966272
      -0.185226
      1.792993
      -0.863291
      -0.010309
      1.247203
      0.237609
      0.377436
      -1.387024
      ...
      -0.108300
      0.005274
      -0.190321
      -1.175575
      0.647376
      -0.221929
      0.062723
      0.061458
      123.50
      0
    
    
      4
      2.0
      -1.158233
      0.877737
      1.548718
      0.403034
      -0.407193
      0.095921
      0.592941
      -0.270533
      0.817739
      ...
      -0.009431
      0.798278
      -0.137458
      0.141267
      -0.206010
      0.502292
      0.219422
      0.215153
      69.99
      0
    
  

5 rows × 31 columns



In [98]:

    
#Analyse the data and number of datapoints in each category
creditcard['Class'].value_counts()









    Out[98]:





0    284315
1       492
Name: Class, dtype: int64

Balancing the dataset



In [99]:

    
#Upsample the majority class

# Separate majority and minority classes
creditcard_majority = creditcard[creditcard.Class==0]
creditcard_minority = creditcard[creditcard.Class==1]
 
# Upsample minority class
creditcard_minority_upsampled = resample(creditcard_minority, replace=True, n_samples=284315, random_state=123) 
 
# Combine majority class with upsampled minority class
creditcard_upsampled = pd.concat([creditcard_majority, creditcard_minority_upsampled])
 
# Display new class counts
creditcard_upsampled.Class.value_counts()









    Out[99]:





1    284315
0    284315
Name: Class, dtype: int64



In [100]:

    
#Downsample majority class

# Separate majority and minority classes
creditcard_majority = creditcard[creditcard.Class==0]
creditcard_minority = creditcard[creditcard.Class==1]
 
# Downsample majority class
creditcard_majority_downsampled = resample(creditcard_majority, replace=False, n_samples=492, random_state=123) 
 
# Combine minority class with downsampled majority class
creditcard_downsampled = pd.concat([creditcard_majority_downsampled, creditcard_minority])
 
# Display new class counts
creditcard_downsampled.Class.value_counts()









    Out[100]:





1    492
0    492
Name: Class, dtype: int64



In [101]:

    
#Define Outcome & Predictors

y = creditcard_downsampled['Class']
X = creditcard_downsampled.drop(creditcard_downsampled[['Class','Time']],axis=1)

#Scale the data
names = X.columns
X = pd.DataFrame(preprocessing.scale(X), columns = names)

#Split into test and train sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

#KFOld
kf = KFold(20)

Logistic Regression



In [102]:

    
# Initialize and fit the model.
lr = LogisticRegression()
lr.fit(X_train,y_train)
lr.fit(X_test,y_test)

# Predict on training set
predtrain_y = lr.predict(X_train)
predtest_y = lr.predict(X_test)



In [103]:

    
#Training Scores
target_names = ['0', '1']
print(classification_report(y_train, predtrain_y, target_names=target_names))
cnf = confusion_matrix(y_train, predtrain_y)
print(cnf)

# Accuracy tables.
table_train = pd.crosstab(y_train, predtrain_y, margins=True)

train_tI_errors = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors = table_train.loc[1.0,0.0] / table_train.loc['All','All']

print((
    'Training set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}\n\n'
).format(train_tI_errors, train_tII_errors))









    



             precision    recall  f1-score   support

          0       0.90      0.99      0.94       328
          1       0.99      0.89      0.94       331

avg / total       0.94      0.94      0.94       659

[[324   4]
 [ 36 295]]
Training set accuracy:
Percent Type I errors: 0.006069802731411229
Percent Type II errors: 0.054628224582701064



In [104]:

    
#Testing Scores
target_names = ['0', '1']
print(classification_report(y_test, predtest_y, target_names=target_names))
cnf = confusion_matrix(y_test, predtest_y)
print(cnf)

table_test = pd.crosstab(y_test, predtest_y, margins=True)

test_tI_errors = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0]/table_test.loc['All','All']

print((
    'Test set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}'
).format(test_tI_errors, test_tII_errors))
print(cross_val_score(lr,X,y,cv=kf))
print(cross_val_score(lr,X,y,cv=kf).mean())









    



             precision    recall  f1-score   support

          0       0.92      0.99      0.95       164
          1       0.99      0.91      0.94       161

avg / total       0.95      0.95      0.95       325

[[162   2]
 [ 15 146]]
Test set accuracy:
Percent Type I errors: 0.006153846153846154
Percent Type II errors: 0.046153846153846156
[ 0.94        0.98        1.          0.98        0.97959184  0.97959184
  0.97959184  0.97959184  0.97959184  0.95918367  0.93877551  0.89795918
  1.          0.7755102   0.7755102   0.91836735  0.95918367  0.85714286
  0.93877551  0.79591837]
0.930714285714

Support Vector Machine



In [105]:

    
# Train and fit the model
clf1 = SVC(kernel='linear', 
            class_weight='balanced',
            probability=True)
 
clf1.fit(X_train, y_train)
clf1.fit(X_test,y_test)
 
# Predict on training set
predtrainclf_y = clf1.predict(X_train)
predtestclf_y = clf1.predict(X_test)



In [106]:

    
#Training Scores
target_names = ['0', '1']
print(classification_report(y_train, predtrainclf_y, target_names=target_names))
cnf = confusion_matrix(y_train, predtrainclf_y)
print(cnf)

# Accuracy tables.
table_train = pd.crosstab(y_train, predtrainclf_y, margins=True)

train_tI_errors = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors = table_train.loc[1.0,0.0] / table_train.loc['All','All']

print((
    'Training set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}\n\n'
).format(train_tI_errors, train_tII_errors))









    



             precision    recall  f1-score   support

          0       0.90      0.98      0.94       328
          1       0.97      0.89      0.93       331

avg / total       0.94      0.93      0.93       659

[[320   8]
 [ 36 295]]
Training set accuracy:
Percent Type I errors: 0.012139605462822459
Percent Type II errors: 0.054628224582701064



In [107]:

    
#Testing Scores
target_names = ['0', '1']
print(classification_report(y_test, predtestclf_y, target_names=target_names))
cnf = confusion_matrix(y_test, predtestclf_y)
print(cnf)

table_test = pd.crosstab(y_test, predtestclf_y, margins=True)

test_tI_errors = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0]/table_test.loc['All','All']

print((
    'Test set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}'
).format(test_tI_errors, test_tII_errors))
print(cross_val_score(clf1,X,y,cv=kf))
print(cross_val_score(clf1,X,y,cv=kf).mean())









    



             precision    recall  f1-score   support

          0       0.92      0.99      0.96       164
          1       0.99      0.91      0.95       161

avg / total       0.96      0.95      0.95       325

[[163   1]
 [ 14 147]]
Test set accuracy:
Percent Type I errors: 0.003076923076923077
Percent Type II errors: 0.043076923076923075
[ 0.96        0.98        1.          0.98        0.95918367  0.97959184
  0.97959184  0.95918367  0.97959184  0.95918367  0.93877551  0.85714286
  0.97959184  0.79591837  0.79591837  0.87755102  0.95918367  0.85714286
  0.93877551  0.79591837]
0.926612244898

Random Forest



In [108]:

    
# Train model
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf.fit(X_test, y_test)

# Predict on training set
predtrainrf_y = rf.predict(X_train)
predtestrf_y = rf.predict(X_test)



In [109]:

    
#Training Scores
target_names = ['0', '1']
print(classification_report(y_train, predtrainrf_y, target_names=target_names))
cnf = confusion_matrix(y_train, predtrainrf_y)
print(cnf)

# Accuracy tables.
table_train = pd.crosstab(y_train, predtrainrf_y, margins=True)

train_tI_errors = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors = table_train.loc[1.0,0.0] / table_train.loc['All','All']

print((
    'Training set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}\n\n'
).format(train_tI_errors, train_tII_errors))









    



             precision    recall  f1-score   support

          0       0.88      0.98      0.93       328
          1       0.97      0.87      0.92       331

avg / total       0.93      0.92      0.92       659

[[320   8]
 [ 43 288]]
Training set accuracy:
Percent Type I errors: 0.012139605462822459
Percent Type II errors: 0.06525037936267071



In [110]:

    
#Test Scores
target_names = ['0', '1']
print(classification_report(y_test, predtestrf_y, target_names=target_names))
cnf = confusion_matrix(y_test, predtestrf_y)
print(cnf)

table_test = pd.crosstab(y_test, predtestrf_y, margins=True)

test_tI_errors = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0]/table_test.loc['All','All']

print((
    'Test set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}'
).format(test_tI_errors, test_tII_errors))
print(cross_val_score(rf,X,y,cv=kf))
print(cross_val_score(rf,X,y,cv=kf).mean())









    



             precision    recall  f1-score   support

          0       0.99      1.00      0.99       164
          1       1.00      0.99      0.99       161

avg / total       0.99      0.99      0.99       325

[[164   0]
 [  2 159]]
Test set accuracy:
Percent Type I errors: 0.0
Percent Type II errors: 0.006153846153846154
[ 0.96        0.98        1.          0.94        0.97959184  0.95918367
  1.          1.          0.93877551  1.          0.93877551  0.95918367
  0.97959184  0.75510204  0.79591837  0.85714286  0.95918367  0.91836735
  0.89795918  0.81632653]
0.935775510204

Gradient Boosting



In [111]:

    
# We'll make 500 iterations, use 2-deep trees, and set our loss function.
params = {'n_estimators': 500,
          'max_depth': 2,
          'loss': 'deviance'}

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)
clf.fit(X_test, y_test)

# Predict on training set
predtrainclf_y = clf.predict(X_train)
predtestclf_y = clf.predict(X_test)



In [112]:

    
#Training Scores
target_names = ['0', '1']
print(classification_report(y_train, predtrainclf_y, target_names=target_names))
cnf = confusion_matrix(y_train, predtrainclf_y)
print(cnf)

# Accuracy tables.
table_train = pd.crosstab(y_train, predtrainclf_y, margins=True)

train_tI_errors = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors = table_train.loc[1.0,0.0] / table_train.loc['All','All']

print((
    'Training set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}\n\n'
).format(train_tI_errors, train_tII_errors))









    



             precision    recall  f1-score   support

          0       0.91      0.96      0.93       328
          1       0.96      0.90      0.93       331

avg / total       0.93      0.93      0.93       659

[[316  12]
 [ 32 299]]
Training set accuracy:
Percent Type I errors: 0.018209408194233688
Percent Type II errors: 0.048558421851289835



In [113]:

    
#Test Scores
target_names = ['0', '1']
print(classification_report(y_test, predtestclf_y, target_names=target_names))
cnf = confusion_matrix(y_test, predtestclf_y)
print(cnf)

table_test = pd.crosstab(y_test, predtestclf_y, margins=True)

test_tI_errors = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0]/table_test.loc['All','All']

print((
    'Test set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}'
).format(test_tI_errors, test_tII_errors))
print(cross_val_score(clf,X,y,cv=kf))
print(cross_val_score(clf,X,y,cv=kf).mean())









    



             precision    recall  f1-score   support

          0       1.00      1.00      1.00       164
          1       1.00      1.00      1.00       161

avg / total       1.00      1.00      1.00       325

[[164   0]
 [  0 161]]
Test set accuracy:
Percent Type I errors: 0.0
Percent Type II errors: 0.0
[ 0.98        0.96        1.          0.94        0.93877551  1.
  0.91836735  0.97959184  0.89795918  0.95918367  0.97959184  0.97959184
  1.          0.79591837  0.79591837  0.89795918  0.97959184  0.87755102
  0.91836735  0.83673469]
0.931755102041

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	...	V21	V22	V23	V24	V25	V26	V27	V28	Amount
0	0.0	-1.359807	-0.072781	2.536347	1.378155	-0.338321	0.462388	0.239599	0.098698	0.363787	...	-0.018307	0.277838	-0.110474	0.066928	0.128539	-0.189115	0.133558	-0.021053	149.62
1	0.0	1.191857	0.266151	0.166480	0.448154	0.060018	-0.082361	-0.078803	0.085102	-0.255425	...	-0.225775	-0.638672	0.101288	-0.339846	0.167170	0.125895	-0.008983	0.014724	2.69
2	1.0	-1.358354	-1.340163	1.773209	0.379780	-0.503198	1.800499	0.791461	0.247676	-1.514654	...	0.247998	0.771679	0.909412	-0.689281	-0.327642	-0.139097	-0.055353	-0.059752	378.66
3	1.0	-0.966272	-0.185226	1.792993	-0.863291	-0.010309	1.247203	0.237609	0.377436	-1.387024	...	-0.108300	0.005274	-0.190321	-1.175575	0.647376	-0.221929	0.062723	0.061458	123.50
4	2.0	-1.158233	0.877737	1.548718	0.403034	-0.407193	0.095921	0.592941	-0.270533	0.817739	...	-0.009431	0.798278	-0.137458	0.141267	-0.206010	0.502292	0.219422	0.215153	69.99