version 0.1, May 2016
This notebook is licensed under a [Creative Commons Attribution-ShareAlike 3.0 Unported License](http://creativecommons.org/licenses/by-sa/3.0/).
%pip install tqdm
In [1]:
import pandas as pd
import zipfile

# Read the Kaggle fraud-transactions CSV directly out of the zip archive.
# Nested `with` blocks ensure both the archive and the inner member handle
# are closed (the original left `f` open).
with zipfile.ZipFile('../datasets/fraud_transactions_kaggle.csv.zip', 'r') as z:
    with z.open('fraud_transactions_kaggle.csv') as f:
        data = pd.read_csv(f, index_col=0)
In [2]:
# Peek at the first rows to sanity-check columns and parsing.
data.head()
Out[2]:
In [3]:
# Peek at the last rows (the unlabeled Kaggle rows appear at the end).
data.tail()
Out[3]:
In [4]:
# Class balance; NaN counts correspond to rows without a label
# (these are split off later as the Kaggle submission set).
data.fraud.value_counts(dropna=False)
Out[4]:
In [5]:
from datetime import datetime, timedelta
from tqdm import tqdm
Split the transactions per account (card number) and use the transaction date as the index
In [6]:
# Build one sub-frame per card, indexed by transaction date, so that
# time-window slicing (e.g. trx[start:end]) works per account.
card_numbers = data['card_number'].unique()
data['trx_id'] = data.index                      # keep the original row id before re-indexing
data.index = pd.DatetimeIndex(data['date'])
data_ = []
for card_number in tqdm(card_numbers):
    # Boolean indexing instead of a string-concatenated .query():
    # equivalent result, but faster and not fragile to formatting.
    data_.append(data[data['card_number'] == card_number])
Create Aggregated Features for one account
In [7]:
# Pre-allocate the feature table: one row per transaction id,
# one column per windowed aggregate (filled in the loops below).
feature_columns = ['Trx_sum_7D', 'Trx_count_1D']
res_agg = pd.DataFrame(index=data['trx_id'].values, columns=feature_columns)
In [8]:
# Windowed aggregates for the first account: for each transaction, look back
# over a window ending 1 microsecond before its timestamp, so the current
# transaction itself is excluded from its own features.
# Modernized: `trx.ix` and `pd.datetools` were removed from pandas;
# use positional `.iloc` and `pd.Timedelta` instead.
trx = data_[0]
for i in range(trx.shape[0]):
    date = trx.index[i]
    trx_id = int(trx['trx_id'].iloc[i])
    window_end = date - timedelta(microseconds=1)  # exclude the current transaction
    # Sum of amounts over the previous 7 days
    agg_ = trx[date - pd.Timedelta('7D'):window_end]
    res_agg.loc[trx_id, 'Trx_sum_7D'] = agg_['amount'].sum()
    # Number of transactions over the previous day
    agg_ = trx[date - pd.Timedelta('1D'):window_end]
    res_agg.loc[trx_id, 'Trx_count_1D'] = agg_['amount'].shape[0]
In [9]:
# Quick sanity check: mean of the aggregates computed so far
# (only the first account is filled in at this point).
res_agg.mean()
Out[9]:
All accounts
In [10]:
# Same windowed aggregation as above, now for every account.
# Modernized: `trx.ix` and `pd.datetools` were removed from pandas;
# use positional `.iloc` and `pd.Timedelta` instead.
# NOTE(review): this nested loop does one pandas slice per transaction;
# acceptable for a first pass, but a rolling-window aggregation would be
# much faster on large data.
for trx in tqdm(data_):
    for i in range(trx.shape[0]):
        date = trx.index[i]
        trx_id = int(trx['trx_id'].iloc[i])
        window_end = date - timedelta(microseconds=1)  # exclude the current transaction
        # Sum of amounts over the previous 7 days
        agg_ = trx[date - pd.Timedelta('7D'):window_end]
        res_agg.loc[trx_id, 'Trx_sum_7D'] = agg_['amount'].sum()
        # Number of transactions over the previous day
        agg_ = trx[date - pd.Timedelta('1D'):window_end]
        res_agg.loc[trx_id, 'Trx_count_1D'] = agg_['amount'].shape[0]
In [13]:
# Inspect the computed features.
res_agg.head()
Out[13]:
In [15]:
# Restore trx_id as the index so the join with res_agg aligns row-by-row.
data.index = data.trx_id
In [16]:
# Attach the aggregated features to the main frame (aligned on trx_id).
data = data.join(res_agg)
In [19]:
# Visual spot-check of the joined frame on a fixed random sample
# (random_state pinned for reproducibility).
data.sample(15, random_state=42).sort_index()
Out[19]:
In [20]:
# Keep only the labeled rows for training (NaN fraud = Kaggle test set).
X = data.loc[~data.fraud.isnull()]
In [21]:
# Target vector.
y = X.fraud
In [22]:
# Drop the label and identifier columns; everything left is a model feature.
X = X.drop(['fraud', 'date', 'card_number'], axis=1)
In [28]:
# The unlabeled rows (fraud is NaN) form the Kaggle submission set;
# drop the same non-feature columns as for the training matrix.
unlabeled = data.fraud.isnull()
X_kaggle = data.loc[unlabeled].drop(['fraud', 'date', 'card_number'], axis=1)
In [29]:
# Sanity-check the submission feature matrix.
X_kaggle.head()
Out[29]:
In [25]:
from sklearn.ensemble import RandomForestClassifier
In [26]:
# class_weight='balanced' reweights classes inversely to their frequency,
# to counter the class imbalance seen in the value_counts above.
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, class_weight='balanced')
In [27]:
from sklearn.metrics import fbeta_score
KFold cross-validation
In [31]:
from sklearn.cross_validation import KFold
In [35]:
# 5-fold CV of the F2 score at a fixed 0.05 probability threshold.
# Modernized: sklearn.cross_validation was removed in scikit-learn 0.20;
# the import below shadows the stale one in the cell above and uses the
# model_selection API (n_splits + kf.split(X)) instead of KFold(n, n_folds).
from sklearn.model_selection import KFold

kf = KFold(n_splits=5)
res = []
for train_idx, test_idx in kf.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    clf.fit(X_train, y_train)
    y_pred_proba = clf.predict_proba(X_test)[:, 1]
    # Low threshold favours recall; F2 weights recall over precision.
    y_pred = (y_pred_proba > 0.05).astype(int)
    res.append(fbeta_score(y_test, y_pred, beta=2))
In [37]:
# Distribution of the per-fold F2 scores.
pd.Series(res).describe()
Out[37]:
Train on all labeled data
In [38]:
# Refit on the full labeled set before predicting the submission rows.
clf.fit(X, y)
Out[38]:
In [40]:
# Fraud probability for each unlabeled transaction.
y_pred = clf.predict_proba(X_kaggle)[:, 1]
In [41]:
# Binarize at the same 0.05 threshold used during cross-validation.
y_pred = (y_pred>0.05).astype(int)
In [42]:
# Wrap the predictions as a named Series keyed by transaction id,
# ready to be written out as a submission file.
y_pred = pd.Series(y_pred, name='fraud', index=X_kaggle.index)
In [43]:
# Eyeball the first predictions.
y_pred.head(10)
Out[43]:
In [108]:
# Write the Kaggle submission file (ID, fraud).
y_pred.to_csv('fraud_transactions_kaggle_1.csv', header=True, index_label='ID')