In [ ]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot import TPOTClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from scipy import interp
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score
%matplotlib inline
In [ ]:
# Load the pre-processed training set; the first row of the CSV holds the
# column names.
data = pd.read_csv(
    '../input/training_set _processed.csv',
    header=0,
)
# Preview the first five rows.
data.head(n=5)
In [ ]:
# Display the column labels of the loaded frame.
data.columns
In [ ]:
# Keep only the modelling columns. The 'fraud' label is carried along here
# and separated from the features after one-hot encoding.
X_train = data.loc[
    :,
    [
        'amount',
        'sc_ic_same',
        'shopper_country',
        'issuer_country',
        'time_diff_usage',
        'card_person_used_before',
        'is_cc',
        'fraud',
    ],
]
In [ ]:
# Preview the first five rows of the selected columns.
X_train.head(n=5)
In [ ]:
# One-hot encode the two country columns; all other columns pass through
# unchanged.
X_train_ohe = pd.get_dummies(
    data=X_train,
    columns=[
        'shopper_country',
        'issuer_country',
    ],
)
In [ ]:
# Preview the first five rows of the one-hot encoded frame.
X_train_ohe.head(n=5)
In [ ]:
# Separate the 'fraud' label from the feature columns.
y = X_train_ohe['fraud']
X = X_train_ohe.drop(columns='fraud')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)
In [ ]:
# Run a TPOT pipeline search on the training split, optimising accuracy.
tpot = TPOTClassifier(
    generations=10,
    population_size=50,
    scoring='accuracy',
    verbosity=2,
)
tpot.fit(X_train, y_train)

# Report the score of the best pipeline found on the held-out split.
print(tpot.score(X_test, y_test))

# Write the best pipeline out as a standalone Python script.
tpot.export('adyen_fraud.py')
In [ ]:
exported_pipeline = make_pipeline(
StackingEstimator(estimator=RandomForestClassifier(bootstrap=True, criterion="entropy", max_features=0.15000000000000002, min_samples_leaf=10, min_samples_split=20, n_estimators=100)),
RandomForestClassifier(bootstrap=True, criterion="entropy", max_features=0.05, min_samples_leaf=1, min_samples_split=12, n_estimators=100)
)
exported_pipeline.fit(X,y)
results = exported_pipeline.predict(X)
In [ ]:
accuracy_score(y,results)
In [ ]:
#AUC_ROC
roc_auc_score(y,results)
In [ ]:
average_precision_score(y,results)
In [ ]:
false_positive_rate, true_positive_rate, thresholds = roc_curve(y, results)
roc_auc = auc(false_positive_rate, true_positive_rate)
In [ ]:
# Draw the ROC curve in blue, with the AUC value shown in the legend.
plt.plot(
    false_positive_rate,
    true_positive_rate,
    'b',
    label='AUC = %0.2f' % roc_auc,
)
plt.legend(loc='lower right')

# Dashed red diagonal: the reference line for a random classifier.
plt.plot([0, 1], [0, 1], 'r--')

# Leave a margin around the unit square so the curve's end points are visible.
plt.xlim(-0.1, 1.2)
plt.ylim(-0.1, 1.2)

plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()