In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from datetime import datetime, timedelta
import utils_data
from os.path import join
from IPython.display import display
In [ ]:
dates_2016 = [datetime(2016, 1, 1) + timedelta(days=i) for i in range(366)]
In [ ]:
dataset01, dataset0, dataset1 = utils_data.get_real_dataset()
datasets = [dataset0, dataset1]
out_folder = utils_data.FOLDER_REAL_DATA_ANALYSIS
In [ ]:
print(dataset01.head())
In [ ]:
data_stats = utils_data.get_real_data_stats()
data_stats.to_csv(join(utils_data.FOLDER_SIMULATOR_INPUT, 'aggregated_data.csv'))
display(data_stats)
Percentage of fraudulent cards also in genuine transactions:
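A minimal sketch of how this percentage can be computed (assuming, as above, that dataset0 holds the genuine and dataset1 the fraudulent transactions):
In [ ]:
# fraction of fraudulent cards that also appear in the genuine transactions
fraud_cards = dataset1['CardID'].unique()
genuine_cards = dataset0['CardID'].unique()
shared_cards = np.intersect1d(fraud_cards, genuine_cards)
print('fraudulent cards also seen in genuine transactions: {:.2f}%'.format(100 * len(shared_cards) / len(fraud_cards)))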
In [ ]:
most_used_card = dataset0['CardID'].value_counts().index[0]
print("Card (ID) with most transactions: ", most_used_card)
In [ ]:
plt.figure(figsize=(15, 5))
plt_idx = 1
for d in datasets:
plt.subplot(1, 2, plt_idx)
trans_dates = d["Global_Date"].apply(lambda date: date.date())
all_trans = trans_dates.value_counts().sort_index()
date_num = matplotlib.dates.date2num(all_trans.index)
plt.plot(date_num, all_trans.values, 'k.', label='num trans.')
plt.plot(date_num, np.zeros(len(date_num))+np.sum(all_trans)/366, 'g--',label='average')
plt_idx += 1
plt.title(d.name, size=20)
plt.xlabel('days (1.1.16 - 31.12.16)', size=15)
plt.xticks([])
plt.xlim(matplotlib.dates.date2num([datetime(2016,1,1), datetime(2016,12,31)]))
if plt_idx == 2:
plt.ylabel('num transactions', size=15)
plt.legend(fontsize=15)
plt.tight_layout()
plt.savefig(join(utils_data.FOLDER_REAL_DATA_ANALYSIS, 'time_day-in-year'))
plt.show()
Analysis:
In [ ]:
monthdays_2016 = np.unique([dates_2016[i].day for i in range(366)], return_counts=True)
monthdays_2016 = monthdays_2016[1][monthdays_2016[0]-1]
plt.figure(figsize=(12, 5))
plt_idx = 1
monthday_frac = np.zeros((31, 2))
idx = 0
for d in datasets:
# get the average number of transactions per day in a month
monthday = d["Local_Date"].apply(lambda date: date.day).value_counts().sort_index()
monthday /= monthdays_2016
if idx > -1:
monthday_frac[:, idx] = monthday.values / np.sum(monthday.values, axis=0)
idx += 1
plt.subplot(1, 2, plt_idx)
plt.plot(monthday.index, monthday.values, 'ko')
plt.plot(monthday.index, monthday.values, 'k-', markersize=0.1)
plt.plot(monthday.index, np.zeros(31)+np.sum(monthday)/31, 'g--', label='average')
plt.title(d.name, size=20)
plt.xlabel('day in month', size=15)
if plt_idx == 1:
plt.ylabel('avg. num transactions', size=15)
plt_idx += 1
plt.tight_layout()
plt.savefig(join(utils_data.FOLDER_REAL_DATA_ANALYSIS, 'time_day-in-month'))
plt.show()
# save the resulting data
np.save(join(utils_data.FOLDER_SIMULATOR_INPUT, 'monthday_frac'), monthday_frac)
Analysis:
In [ ]:
weekdays_2016 = np.unique([dates_2016[i].weekday() for i in range(366)], return_counts=True)
weekdays_2016 = weekdays_2016[1][weekdays_2016[0]]
plt.figure(figsize=(12, 5))
plt_idx = 1
weekday_frac = np.zeros((7, 2))
idx = 0
for d in datasets:
weekday = d["Local_Date"].apply(lambda date: date.weekday()).value_counts().sort_index()
weekday /= weekdays_2016
if idx > -1:
weekday_frac[:, idx] = weekday.values / np.sum(weekday.values, axis=0)
idx += 1
plt.subplot(1, 2, plt_idx)
plt.plot(weekday.index, weekday.values, 'ko')
plt.plot(weekday.index, weekday.values, 'k-', markersize=0.1)
plt.plot(weekday.index, np.zeros(7)+np.sum(weekday)/7, 'g--', label='average')
plt.title(d.name, size=20)
plt.xlabel('weekday', size=15)
plt.xticks(range(7), ['Mo', 'Tu', 'We', 'Th', 'Fr', 'Sa', 'Su'])
if plt_idx == 1:
plt.ylabel('avg. num transactions', size=15)
plt_idx += 1
plt.tight_layout()
plt.savefig(join(utils_data.FOLDER_REAL_DATA_ANALYSIS, 'time_day-in-week'))
plt.show()
# save the resulting data
np.save(join(utils_data.FOLDER_SIMULATOR_INPUT, 'weekday_frac'), weekday_frac)
Analysis:
In [ ]:
monthdays = np.array([31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31])
plt.figure(figsize=(12, 5))
plt_idx = 1
month_frac = np.zeros((12, 2))
idx = 0
for d in datasets:
month = d["Local_Date"].apply(lambda date: date.month).value_counts().sort_index()
# correct for different number of days in a month
month = month / monthdays[month.index.values-1] * np.mean(monthdays[month.index.values-1])
if idx > -1:
month_frac[month.index-1, idx] = month.values / np.sum(month.values, axis=0)
idx += 1
plt.subplot(1, 2, plt_idx)
plt.plot(month.index, month.values, 'ko')
plt.plot(month.index, month.values, 'k-', markersize=0.1)
plt.plot(range(1,13), np.zeros(12)+np.sum(month)/12, 'g--', label='average')
plt.title(d.name, size=20)
plt.xlabel('month', size=15)
plt.xticks(range(1, 13), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
if plt_idx == 1:
plt.ylabel('num transactions', size=15)
plt_idx += 1
plt.tight_layout()
plt.savefig(join(utils_data.FOLDER_REAL_DATA_ANALYSIS, 'time_month-in-year'))
plt.show()
# save the resulting data
np.save(join(utils_data.FOLDER_SIMULATOR_INPUT, 'month_frac'), month_frac)
Analysis:
In [ ]:
plt.figure(figsize=(12, 5))
plt_idx = 1
hour_frac = np.zeros((24, 2))
idx = 0
for d in datasets:
hours = d["Local_Date"].apply(lambda date: date.hour).value_counts().sort_index()
hours /= 366
if idx > -1:
hour_frac[hours.index.values, idx] = hours.values / np.sum(hours.values, axis=0)
idx += 1
plt.subplot(1, 2, plt_idx)
plt.plot(hours.index, hours.values, 'ko')
plt.plot(hours.index, hours.values, 'k-', markersize=0.1, label='transactions')
plt.plot(range(24), np.zeros(24)+np.sum(hours)/24, 'g--', label='average')
plt.title(d.name, size=20)
plt.xlabel('hour', size=15)
# plt.xticks([])
if plt_idx == 1:
plt.ylabel('avg. num transactions', size=15)
plt_idx += 1
plt.tight_layout()
plt.savefig(join(utils_data.FOLDER_REAL_DATA_ANALYSIS, 'time_hour-in-day'))
plt.show()
# save the resulting data
np.save(join(utils_data.FOLDER_SIMULATOR_INPUT, 'hour_frac'), hour_frac)
Analysis:
In [ ]:
# count transactions per hour of the year (timestamps truncated to the full hour)
date_hour_counts = dataset0["Local_Date"].apply(lambda d: d.replace(minute=0, second=0)).value_counts(sort=False)
hours = np.array(list(map(lambda d: d.hour, list(date_hour_counts.index))))
counts = date_hour_counts.values
hour_mean = np.zeros(24)
hour_min = np.zeros(24)
hour_max = np.zeros(24)
hour_std = np.zeros(24)
for h in range(24):
hour_mean[h] = np.mean(counts[hours==h])
hour_min[h] = np.min(counts[hours==h])
hour_max[h] = np.max(counts[hours==h])
hour_std[h] = np.std(counts[hours==h])
print(np.vstack((range(24), hour_min, hour_max, hour_mean, hour_std)).T)
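For readability, the same per-hour statistics can also be collected in a labelled table; a minimal sketch using the arrays computed above:
In [ ]:
# labelled summary of transaction counts per hour of the day
hour_stats = pd.DataFrame({'min': hour_min, 'max': hour_max, 'mean': hour_mean, 'std': hour_std}, index=pd.Index(range(24), name='hour'))
display(hour_stats)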
In [ ]:
# total number of transactions we want in one year
aggregated_data = pd.read_csv(join(utils_data.FOLDER_SIMULATOR_INPUT, 'aggregated_data.csv'), index_col=0)
trans_per_year = np.array(aggregated_data.loc['transactions'].values, dtype=float)[1:]
# transactions per day in a month
frac_monthday = np.load(join(utils_data.FOLDER_SIMULATOR_INPUT, 'monthday_frac.npy'))
# transactions per day in a week
frac_weekday = np.load(join(utils_data.FOLDER_SIMULATOR_INPUT, 'weekday_frac.npy'))
# transactions per month in a year
frac_month = np.load(join(utils_data.FOLDER_SIMULATOR_INPUT, 'month_frac.npy'))
# transactions per hour in a day
frac_hour = np.load(join(utils_data.FOLDER_SIMULATOR_INPUT, 'hour_frac.npy'))
cust_idx = 0
std_transactions = 1000
num_customers = 200
# get the probability of a transaction in a given hour
curr_date = datetime(2016, 1, 1)
num_trans = 0
for i in range(366*24):
new_trans = float(trans_per_year[cust_idx])
new_trans *= frac_month[curr_date.month-1, cust_idx]
new_trans *= frac_monthday[curr_date.day-1, cust_idx]
new_trans *= 7 * frac_weekday[curr_date.weekday(), cust_idx]
new_trans *= frac_hour[curr_date.hour, cust_idx]
num_trans += new_trans
curr_date += timedelta(hours=1)
print(curr_date)
print(trans_per_year[cust_idx])
print(num_trans)
print("")
# The difference arises because months have different numbers of days;
# we did not want to scale up the transactions on day 31 because that would be unrealistic.
curr_date = datetime(2016, 1, 1)
num_trans = 0
for i in range(366*24):
for c in range(num_customers):
# cust_trans is the expected number of transactions this customer makes in this hour;
# we assume there are enough customers that each customer makes at most one transaction per hour
cust_trans = float(trans_per_year[cust_idx])
cust_trans += np.random.normal(0, std_transactions, 1)[0]
cust_trans /= num_customers
cust_trans *= frac_month[curr_date.month-1, cust_idx]
cust_trans *= frac_monthday[curr_date.day-1, cust_idx]
cust_trans *= 7 * frac_weekday[curr_date.weekday(), cust_idx]
cust_trans *= frac_hour[curr_date.hour, cust_idx]
cust_trans += np.random.normal(0, 0.01, 1)[0]
if cust_trans > np.random.uniform(0, 1, 1)[0]:
num_trans += 1
curr_date += timedelta(hours=1)
print(curr_date)
print(trans_per_year[cust_idx])
print(num_trans)
print("")
In [ ]:
country_counts = pd.concat([d['Country'].value_counts() for d in datasets], axis=1)
country_counts.fillna(0, inplace=True)
country_counts.columns = ['non-fraud', 'fraud']
country_counts[['non-fraud', 'fraud']] /= country_counts.sum(axis=0)
# save the resulting data
country_counts.to_csv(join(utils_data.FOLDER_SIMULATOR_INPUT, 'country_frac.csv'))
countries_large = []
for c in ['non-fraud', 'fraud']:
countries_large.extend(country_counts.loc[country_counts[c] > 0.05].index)
countries_large = np.unique(countries_large)
countries_large_counts = []
for c in countries_large:
countries_large_counts.append(country_counts.loc[c, 'non-fraud'])
countries_large = [countries_large[j] for j in np.argsort(countries_large_counts)[::-1]]
plt.figure(figsize=(10,5))
bottoms = np.zeros(3)
for i in range(len(countries_large)):
c = countries_large[i]
plt.bar((0, 1, 2), np.concatenate((country_counts.loc[c], [0])), label=c, bottom=bottoms)
bottoms += np.concatenate((country_counts.loc[c], [0]))
# fill up the rest
plt.bar((0, 1), 1-bottoms[:-1], bottom=bottoms[:-1], label='rest')
plt.legend(fontsize=20)
plt.xticks([0, 1], ['non-fraud', 'fraud'], size=15)
plt.ylabel('fraction transactions made', size=15)
plt.tight_layout()
plt.savefig(join(utils_data.FOLDER_REAL_DATA_ANALYSIS, 'country_distribution'))
plt.show()
In [ ]:
currency_counts = pd.concat([d['Currency'].value_counts() for d in datasets], axis=1)
currency_counts.fillna(0, inplace=True)
currency_counts.columns = ['non-fraud', 'fraud']
currency_counts[['non-fraud', 'fraud']] /= currency_counts.sum(axis=0)
currencies_large = []
for c in ['non-fraud', 'fraud']:
currencies_large.extend(currency_counts.loc[currency_counts[c] > 0].index)
currencies_large = np.unique(currencies_large)
currencies_large_counts = []
for c in currencies_large:
currencies_large_counts.append(currency_counts.loc[c, 'non-fraud'])
currencies_large = [currencies_large[j] for j in np.argsort(currencies_large_counts)[::-1]]
plt.figure(figsize=(10,5))
bottoms = np.zeros(3)
for i in range(len(currencies_large)):
c = currencies_large[i]
plt.bar((0, 1, 2), np.concatenate((currency_counts.loc[c], [0])), label=c, bottom=bottoms)
bottoms += np.concatenate((currency_counts.loc[c], [0]))
plt.legend(fontsize=20)
plt.xticks([0, 1], ['non-fraud', 'fraud'], size=15)
plt.ylabel('fraction of total transactions made', size=15)
plt.tight_layout()
plt.savefig(join(utils_data.FOLDER_REAL_DATA_ANALYSIS, 'currency_distribution'))
plt.show()
Check how many cards make purchases in several currencies:
In [ ]:
curr_per_cust = dataset0[['CardID', 'Currency']].groupby('CardID')['Currency'].value_counts().index.get_level_values(0)
print(len(curr_per_cust))
print(len(curr_per_cust.unique()))
print(len(curr_per_cust) - len(curr_per_cust.unique()))
CONCLUSION: Only 243 cards out of 54,000 purchased things in several currencies.
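A more direct way to obtain this count is to look at the number of distinct currencies per card; a minimal sketch:
In [ ]:
# count cards that appear with more than one currency
multi_currency_cards = (dataset0.groupby('CardID')['Currency'].nunique() > 1).sum()
print('cards with more than one currency:', multi_currency_cards)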
Estimate the probability of selecting a currency, given a country:
In [ ]:
curr_per_country0 = dataset0.groupby(['Country'])['Currency'].value_counts(normalize=True)
curr_per_country1 = dataset1.groupby(['Country'])['Currency'].value_counts(normalize=True)
curr_per_country0.to_csv(join(utils_data.FOLDER_SIMULATOR_INPUT, 'currency_per_country0.csv'))
curr_per_country1.to_csv(join(utils_data.FOLDER_SIMULATOR_INPUT, 'currency_per_country1.csv'))
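For the simulator, a currency can then be drawn conditionally on a country; a minimal sketch using the conditional distributions computed above (the example country is simply the first country appearing in the genuine data):
In [ ]:
def sample_currency(curr_per_country, country):
    # curr_per_country is indexed by (Country, Currency) and holds normalised frequencies per country
    probs = curr_per_country.loc[country]
    return np.random.choice(probs.index.values, p=probs.values)
example_country = dataset0['Country'].iloc[0]
print(example_country, '->', sample_currency(curr_per_country0, example_country))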
In [ ]:
plt.figure(figsize=(7,5))
currencies = dataset01['Currency'].unique()
merchants = dataset01['MerchantID'].unique()
for curr_idx in range(len(currencies)):
for merch_idx in range(len(merchants)):
plt.plot(range(len(currencies)), np.zeros(len(currencies))+merch_idx, 'r-', linewidth=0.2)
if currencies[curr_idx] in dataset01.loc[dataset01['MerchantID'] == merch_idx, 'Currency'].values:
plt.plot(curr_idx, merch_idx, 'ko')
plt.xticks(range(len(currencies)), currencies)
plt.ylabel('Merchant ID', size=15)
plt.xlabel('Currency', size=15)
plt.tight_layout()
plt.savefig(join(utils_data.FOLDER_REAL_DATA_ANALYSIS, 'currency_per_merchant'))
plt.show()
We conclude from this that most merchants only sell in one currency; thus, we will let each customer select the merchant given the customer's (unique) currency.
Estimate the probability of selecting a merchant, given the currency:
In [ ]:
merch_per_curr0 = dataset0.groupby(['Currency'])['MerchantID'].value_counts(normalize=True)
merch_per_curr1 = dataset1.groupby(['Currency'])['MerchantID'].value_counts(normalize=True)
merch_per_curr0.to_csv(join(utils_data.FOLDER_SIMULATOR_INPUT, 'merchant_per_currency0.csv'))
merch_per_curr1.to_csv(join(utils_data.FOLDER_SIMULATOR_INPUT, 'merchant_per_currency1.csv'))
In [ ]:
merchant_count0 = dataset0['MerchantID'].value_counts().sort_index()
merchant_count1 = dataset1['MerchantID'].value_counts().sort_index()
plt.figure(figsize=(15,10))
ax = plt.subplot(2, 1, 1)
ax.bar(merchant_count0.index.values, merchant_count0.values)
rects = ax.patches
for rect, label in zip(rects, merchant_count0.values):
height = rect.get_height()
ax.text(rect.get_x() + rect.get_width()/2, height, label, ha='center', va='bottom')
plt.ylabel('num transactions')
plt.xticks([])
plt.xlim([-0.5, data_stats.loc['num merchants', 'all']+0.5])
ax = plt.subplot(2, 1, 2)
ax.bar(merchant_count1.index.values, merchant_count1.values)
rects = ax.patches
for rect, label in zip(rects, merchant_count1.values):
height = rect.get_height()
ax.text(rect.get_x() + rect.get_width()/2, height, label, ha='center', va='bottom')
plt.ylabel('num transactions')
plt.xlabel('Merchant ID')
plt.xlim([-0.5, data_stats.loc['num merchants', 'all']+0.5])
plt.tight_layout()
plt.show()
In [ ]:
plt.figure(figsize=(12, 10))
plt_idx = 1
for d in datasets:
plt.subplot(2, 1, plt_idx)
plt.plot(range(d.shape[0]), d['Amount'], 'k.')
# plt.plot(date_num, amount, 'k.', label='num trans.')
# plt.plot(date_num, np.zeros(len(date_num))+np.mean(all_trans), 'g',label='average')
plt_idx += 1
# plt.title(d.name, size=20)
plt.xlabel('transactions', size=15)
plt.xticks([])
if plt_idx == 2:
plt.ylabel('amount', size=15)
plt.legend(fontsize=15)
plt.tight_layout()
plt.savefig(join(utils_data.FOLDER_REAL_DATA_ANALYSIS, 'amount_day-in-year'))
plt.show()
In [ ]:
print(dataset0.loc[dataset0['Amount'] == 5472.53,['Local_Date', 'CardID', 'MerchantID', 'Amount', 'Currency', 'Country']])
In [ ]:
plt.figure(figsize=(10,5))
bins = [0, 5, 25, 50, 100, 1000, 11000]
plt_idx = 1
for d in datasets:
amount_counts, loc = np.histogram(d["Amount"], bins=bins)
amount_counts = np.array(amount_counts, dtype=float)
amount_counts /= np.sum(amount_counts)
plt.subplot(1, 2, plt_idx)
am_bot = 0
for i in range(len(amount_counts)):
plt.bar(plt_idx, amount_counts[i], bottom=am_bot, label='{}-{}'.format(bins[i], bins[i+1]))
am_bot += amount_counts[i]
plt_idx += 1
plt.ylim([0, 1.01])
plt.legend()
# plt.title("Amount distribution")
plt_idx += 1
plt.show()
In [ ]:
plt.figure(figsize=(12, 10))
plt_idx = 1
for d in datasets:
plt.subplot(2, 1, plt_idx)
min_amount = min(d['Amount'])
max_amount = max(d['Amount'])
plt.plot(range(d.shape[0]), np.sort(d['Amount']), 'k.', label='transaction')
# plt.plot(date_num, amount, 'k.', label='num trans.')
plt.plot(np.linspace(0, d.shape[0], 100), np.zeros(100)+np.mean(d['Amount']), 'g--',label='average')
plt_idx += 1
plt.title(d.name, size=20)
plt.ylabel('amount', size=15)
if plt_idx == 3:
plt.xlabel('transactions', size=15)
else:
plt.legend(fontsize=15)
plt.tight_layout()
plt.savefig(join(utils_data.FOLDER_REAL_DATA_ANALYSIS, 'amount_day-in-year'))
plt.show()
For each merchant, we will have a probability distribution over the amount spent.
In [ ]:
from scipy.optimize import curve_fit
def sigmoid(x, x0, k):
y = 1 / (1 + np.exp(-k * (x - x0)))
return y
num_merchants = data_stats.loc['num merchants', 'all']
num_bins = 20
merchant_amount_distr = np.zeros((2, num_merchants, 2*num_bins+1))
plt.figure(figsize=(15, 5))
plt_idx = 1
for dataset in [dataset0, dataset1]:
for m in dataset0['MerchantID'].unique():
# get all transactions from this merchant
trans_merch = dataset.loc[dataset['MerchantID']==m]
num_transactions = trans_merch.shape[0]
if num_transactions > 0:
# get the amounts paid for the transactions with this merchant
amounts = trans_merch['Amount']
bins_height, bins_edges = np.histogram(amounts, bins=num_bins)
bins_height = np.array(bins_height, dtype=float)
bins_height /= np.sum(bins_height)
merchant_amount_distr[int(plt_idx > 7), (plt_idx-1)%7, :] = np.concatenate((bins_height, bins_edges))  # indexing assumes 7 merchants per dataset
plt.subplot(2, num_merchants, plt_idx)
plt.hist(amounts, bins=num_bins)
plt_idx += 1
plt.tight_layout()
plt.show()
np.save(join(utils_data.FOLDER_SIMULATOR_INPUT,'merchant_amount_distr'), merchant_amount_distr)
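A minimal sketch of how an amount could later be drawn from one of these stored histograms, assuming the layout used above (the first num_bins entries are the normalised bin heights, the remaining num_bins+1 entries the bin edges):
In [ ]:
def sample_amount_from_histogram(distr_row, num_bins=20):
    heights = distr_row[:num_bins]   # normalised bin heights (sum to 1)
    edges = distr_row[num_bins:]     # num_bins + 1 bin edges
    bin_idx = np.random.choice(num_bins, p=heights)
    # draw uniformly within the selected bin
    return np.random.uniform(edges[bin_idx], edges[bin_idx + 1])
print(sample_amount_from_histogram(merchant_amount_distr[0, 0]))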
In [ ]:
from scipy.optimize import curve_fit
def sigmoid(x, x0, k):
y = 1 / (1 + np.exp(-k * (x - x0)))
return y
num_merchants = data_stats.loc['num merchants', 'all']
merchant_amount_parameters = np.zeros((2, num_merchants, 4))
plt.figure(figsize=(15, 5))
plt_idx = 1
for dataset in [dataset0, dataset1]:
for m in dataset0['MerchantID'].unique():
# get all transactions from this merchant
trans_merch = dataset.loc[dataset['MerchantID']==m]
num_transactions = trans_merch.shape[0]
if num_transactions > 0:
# get the amounts paid for the transactions with this merchant
amounts = np.sort(trans_merch['Amount'])
min_amount = min(amounts)
max_amount = max(amounts)
amounts_normalised = (amounts - min_amount) / (max_amount - min_amount)
plt.subplot(2, num_merchants, plt_idx)
plt.plot(np.linspace(0, 1, num_transactions), amounts, '.')
# fit sigmoid
x_vals = np.linspace(0, 1, 100)
try:
p_sigmoid, _ = curve_fit(sigmoid, np.linspace(0, 1, num_transactions), amounts_normalised)
amounts_predict = sigmoid(x_vals, *p_sigmoid)
amounts_predict_denormalised = amounts_predict * (max_amount - min_amount) + min_amount
plt.plot(x_vals, amounts_predict_denormalised)
except RuntimeError:  # the sigmoid fit did not converge
# fit polynomial
p_poly = np.polyfit(np.linspace(0, 1, num_transactions), amounts_normalised, 2)
amounts_predict = np.polyval(p_poly, x_vals)
p_sigmoid, _ = curve_fit(sigmoid, x_vals, amounts_predict)
amounts_predict = sigmoid(x_vals, *p_sigmoid)
amounts_predict_denormalised = amounts_predict * (max_amount - min_amount) + min_amount
plt.plot(x_vals, amounts_predict_denormalised)
merchant_amount_parameters[int(plt_idx > 7), (plt_idx-1)%7] = [min_amount, max_amount, p_sigmoid[0], p_sigmoid[1]]
plt_idx += 1
plt.tight_layout()
plt.show()
np.save(join(utils_data.FOLDER_SIMULATOR_INPUT,'merchant_amount_parameters'), merchant_amount_parameters)
print(merchant_amount_parameters)
We conclude that the normal customers and fraudsters follow roughly the same distribution, so we will only have one distribution per merchant, irrespective of whether a genuine or fraudulent customer is making the transaction.
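A minimal sketch of how the stored parameters [min_amount, max_amount, x0, k] could be used to draw an amount, treating the fitted sigmoid as the sorted-amount curve and sampling its argument uniformly:
In [ ]:
def sample_amount_from_sigmoid(params):
    min_amount, max_amount, x0, k = params
    u = np.random.uniform(0, 1)
    return sigmoid(u, x0, k) * (max_amount - min_amount) + min_amount
print(sample_amount_from_sigmoid(merchant_amount_parameters[0, 0]))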
In [ ]:
from scipy.optimize import curve_fit
def sigmoid(x, x0, k):
y = 1 / (1 + np.exp(-k * (x - x0)))
return y
num_merchants = data_stats.loc['num merchants', 'all']
merchant_amount_parameters = np.zeros((2, num_merchants, 4))
plt.figure(figsize=(6, 3))
plt_idx = 1
dataset = dataset0
m = dataset0['MerchantID'].unique()[0]
# get all transactions from this merchant
trans_merch = dataset.loc[dataset['MerchantID']==m]
num_transactions = trans_merch.shape[0]
# get the amounts paid for the transactions with this merchant
amounts = np.sort(trans_merch['Amount'])
min_amount = min(amounts)
max_amount = max(amounts)
amounts_normalised = (amounts - min_amount) / (max_amount - min_amount)
plt.plot(range(num_transactions), amounts, 'k-', linewidth=2, label='real')
# fit sigmoid
x_vals = np.linspace(0, 1, 100)
x = np.linspace(0, 1, num_transactions)
p_sigmoid, _ = curve_fit(sigmoid, np.linspace(0, 1, num_transactions), amounts_normalised)
amounts_predict = sigmoid(x_vals, *p_sigmoid)
amounts_predict_denormalised = amounts_predict * (max_amount - min_amount) + min_amount
plt.plot(np.linspace(0, num_transactions, 100), amounts_predict_denormalised, 'm--', linewidth=3, label='approx')
merchant_amount_parameters[int(plt_idx > 7), (plt_idx-1)%7] = [min_amount, max_amount, p_sigmoid[0], p_sigmoid[1]]
plt.xlabel('transaction count', fontsize=20)
plt.ylabel('price', fontsize=20)
plt.legend(fontsize=15)
plt.tight_layout()
plt.savefig(join(utils_data.FOLDER_REAL_DATA_ANALYSIS, 'merchant_price_sigmoid_fit'))
plt.show()
Here we want to find out for how long customers/fraudsters keep returning, i.e., how often the same credit card is reused over time.
In [ ]:
plt.figure(figsize=(15, 30))
plt_idx = 1
dist_transactions = [[], []]
for d in datasets:
# d = d.loc[d['Date'].apply(lambda date: date.month) < 7]
# d = d.loc[d['Date'].apply(lambda date: date.month) > 3]
plt.subplot(1, 2, plt_idx)
trans_idx = 0
for card in dataset01['CardID'].unique():
card_times = d.loc[d['CardID'] == card, 'Global_Date']
dist_transactions[plt_idx-1].extend([(card_times.iloc[i+1] - card_times.iloc[i]).days for i in range(len(card_times)-1)])
if plt_idx == 2:
num_c = 2
else:
num_c = 10
if len(card_times) > num_c:
card_times = card_times.apply(lambda date: date.date())
card_times = matplotlib.dates.date2num(card_times)
plt.plot(card_times, np.zeros(len(card_times)) + trans_idx, 'k.', markersize=1)
plt.plot(card_times, np.zeros(len(card_times)) + trans_idx, 'k-', linewidth=0.2)
trans_idx += 1
min_date = matplotlib.dates.date2num(min(dataset01['Global_Date']).date())
max_date = matplotlib.dates.date2num(max(dataset01['Global_Date']).date())
# plt.xlim([min_date, max_date])
plt.xticks([])
for m in range(1,13):
datenum = matplotlib.dates.date2num(datetime(2016, m, 1))
plt.plot(np.zeros(2)+datenum, [-1, 1000], 'r-', linewidth=0.5)
if plt_idx == 1:
plt.ylim([0,300])
else:
plt.ylim([0, 50])
plt_idx += 1
plt.show()
In [ ]:
# average number of days between consecutive transactions with the same card
print(np.mean(dist_transactions[0]))
print(np.mean(dist_transactions[1]))
At a given transaction, estimate the probability of doing another transaction with the same card.
In [ ]:
prob_stay = np.zeros(2)
for k in range(2):
dataset = [dataset0, dataset1][k]
creditcards = dataset.loc[dataset['Global_Date'].apply(lambda d: d.month) > 3]
creditcards = creditcards.loc[creditcards['Global_Date'].apply(lambda d: d.month) < 6]
creditcard_counts = creditcards['CardID'].value_counts()
creditcardIDs = creditcards['CardID']
data = dataset.loc[dataset['Global_Date'].apply(lambda d: d.month) > 3]
single = 0
multi = 0
for i in range(len(creditcards)):
cc = creditcards.iloc[i]['CardID']
dd = creditcards.iloc[i]['Global_Date']
cond1 = data['CardID'] == cc
cond2 = data['Global_Date'] > dd
if len(data.loc[np.logical_and(cond1, cond2)]) == 0:
single += 1
else:
multi += 1
prob_stay[k] = multi/(single+multi)
print('probability of doing another transaction:', prob_stay[k], '{}'.format(['non-fraud', 'fraud'][k]))
np.save(join(utils_data.FOLDER_SIMULATOR_INPUT, 'prob_stay'), prob_stay)
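In the simulator, this probability can be used as a Bernoulli draw after each transaction to decide whether the same card will be used again; a minimal sketch (the is_fraudster flag is hypothetical):
In [ ]:
def card_returns(is_fraudster):
    # prob_stay[0]: genuine customers, prob_stay[1]: fraudsters
    return np.random.uniform(0, 1) < prob_stay[1 if is_fraudster else 0]
print(card_returns(is_fraudster=False))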
In [ ]:
cards0 = dataset0['CardID'].unique()
cards1 = dataset1['CardID'].unique()
print('cards total:', len(np.union1d(cards0, cards1)))
print('fraud cards:', len(cards1))
print('intersection:', len(np.intersect1d(cards0, cards1)))
# go through the cards that were in both sets
cards0_1 = []
cards1_0 = []
cards010 = []
for cib in np.intersect1d(cards0, cards1):
date0 = dataset0.loc[dataset0['CardID']==cib].iloc[0]['Global_Date']
date1 = dataset1.loc[dataset1['CardID']==cib].iloc[0]['Global_Date']
if date0 < date1:
cards0_1.append(cib)
# genuine purchases after fraud
dates00 = dataset0.loc[dataset0['CardID']==cib].iloc[1:]['Global_Date']
if len(dates00)>0:
if sum(dates00>date1)>0:
cards010.append(cib)
else:
cards1_0.append(cib)
print('first genuine then fraud: ', len(cards0_1))
print('first fraud then genuine: ', len(cards1_0))
print('genuine again after fraud: ', len(cards010))
prob_stay_after_fraud = len(cards010)/len(cards0_1)
print('prob of purchase after fraud: ', prob_stay_after_fraud)
np.save(join(utils_data.FOLDER_SIMULATOR_INPUT, 'prob_stay_after_fraud'), prob_stay_after_fraud )
In [ ]:
plt.figure(figsize=(10, 25))
dist_transactions = []
trans_idx = 0
data_compromised = dataset01.loc[dataset01['CardID'].isin(np.intersect1d(cards0, cards1))]
no_trans_after_fraud = 0
trans_after_fraud = 0
for card in data_compromised['CardID'].unique():
cards_used = data_compromised.loc[data_compromised['CardID'] == card, ['Global_Date', 'Target']]
dist_transactions.extend([(cards_used.iloc[i+1, 0] - cards_used.iloc[i, 0]).days for i in range(len(cards_used)-1)])
card_times = cards_used['Global_Date'].apply(lambda date: date.date())
card_times = matplotlib.dates.date2num(card_times)
plt.plot(card_times, np.zeros(len(card_times)) + trans_idx, 'k-', linewidth=0.9)
cond0 = cards_used['Target'] == 0
plt.plot(card_times[cond0], np.zeros(len(card_times[cond0])) + trans_idx, 'g.', markersize=5)
cond1 = cards_used['Target'] == 1
plt.plot(card_times[cond1], np.zeros(len(card_times[cond1])) + trans_idx, 'r.', markersize=5)
if max(cards_used.loc[cards_used['Target']==0, 'Global_Date']) > max(cards_used.loc[cards_used['Target']==1, 'Global_Date']):
trans_after_fraud += 1
else:
no_trans_after_fraud += 1
trans_idx += 1
min_date = matplotlib.dates.date2num(min(dataset01['Global_Date']).date())
max_date = matplotlib.dates.date2num(max(dataset01['Global_Date']).date())
plt.xticks([])
plt.ylim([0, trans_idx])
# print lines for months
for m in range(1,13):
datenum = matplotlib.dates.date2num(datetime(2016, m, 1))
plt.plot(np.zeros(2)+datenum, [-1, 1000], 'r-', linewidth=0.5)
plt.show()
print("genuine transactions after fraud: ", trans_after_fraud)
print("fraud is the last transaction: ", no_trans_after_fraud)
When a fraudster uses an existing card, are the country and currency always the same?
In [ ]:
plt.figure(figsize=(10, 25))
dist_transactions = []
trans_idx = 0
for card in data_compromised['CardID'].unique():
cards_used = data_compromised.loc[data_compromised['CardID'] == card, ['Global_Date', 'Target', 'Country', 'Currency']]
if len(cards_used['Country'].unique()) > 1 or len(cards_used['Currency'].unique()) > 1:
print(cards_used)
print("")
In [ ]: