First, the exercise:
Now let us load our standard libraries.
In [249]:
import numpy as np
import pandas as pd
Let us load the credit card dataset and extract a small dataframe of numerical features to test on.
In [250]:
# Raw UCI "default of credit card clients" dataset; the relative path assumes
# the CSV sits next to the notebook — TODO confirm file provenance/version.
big_df = pd.read_csv("UCI_Credit_Card.csv")
In [251]:
big_df.head()
Out[251]:
In [252]:
len(big_df)
Out[252]:
In [253]:
len(big_df.dropna())
Out[253]:
In [254]:
# Drop the row-identifier column; it carries no predictive signal.
df = big_df.drop(columns=['ID'])
In [255]:
# Separate the target column from the feature frame. Reassigning instead of
# mutating with inplace=True avoids hidden-state surprises on notebook re-runs
# and keeps the frame's lineage explicit.
labels = df['default.payment.next.month']
df = df.drop(columns=['default.payment.next.month'])
In [256]:
# Number of rows used for training; all rows at or past this index are held
# out as the test split (see the train/test slicing cells below).
num_samples = 25000
In [257]:
# First num_samples rows form the training split. .iloc makes the positional
# slice explicit (df[0:n] relies on pandas' positional-slice fallback, which
# is ambiguous for integer-labeled indexes).
train_x, train_y = df.iloc[0:num_samples], labels.iloc[0:num_samples]
In [258]:
# Remaining rows form the held-out test split; .iloc for explicit positional
# slicing, matching the training-split cell.
test_x, test_y = df.iloc[num_samples:], labels.iloc[num_samples:]
In [259]:
test_x.head()
Out[259]:
In [260]:
train_y.head()
Out[260]:
Now let us write our transformation class, which binarizes each numeric feature by thresholding it at its empirical quantiles.
In [264]:
class bin_transformer(object):
    """Binarize numeric columns by thresholding at empirical quantiles.

    Fitting computes the interior quantiles (levels 1/k, ..., (k-1)/k for
    k = num_quantiles) of the frame given to __init__; transform() then emits
    one boolean column per (source column, quantile level) pair.
    """

    def __init__(self, df, num_quantiles = 2):
        # Interior quantile levels only — the 0 and 1 quantiles would give
        # degenerate all-True/all-False columns.
        self.quantiles = df.quantile(np.linspace(1./num_quantiles, 1.-1./num_quantiles,num_quantiles-1))

    def transform(self, df):
        """Return (binarized frame, dict of column name -> (source column, fn)).

        Output column '<col><level>' holds df[col] >= fitted quantile; the fn
        reproduces that same thresholding on any frame with a '<col>' column.
        """
        new = pd.DataFrame()
        fns = {}
        for col_name in df.columns:
            for ix, q in self.quantiles.iterrows():
                quart = q[col_name]
                key = col_name + str(ix)
                new[key] = (df[col_name] >= quart)
                # BUG FIX: bind col_name/quart through default arguments.
                # A plain closure is late-binding, so every stored lambda
                # would use the LAST column and threshold of the loops.
                fns[key] = (col_name, lambda x, c=col_name, t=quart: x[c] >= t)
        return new, fns
In [265]:
# Fit quintile thresholds (levels 0.2, 0.4, 0.6, 0.8) per feature column.
# NOTE(review): quantiles are fit on the FULL frame, including held-out rows —
# mild information leakage; consider fitting on train_x only.
transformer = bin_transformer(df,5)
In [266]:
# Binarize the training features with the fitted quantile thresholds.
train_x_t, tr_fns = transformer.transform(train_x)
In [267]:
# Apply the same fitted thresholds to the test features.
test_x_t, test_fns = transformer.transform(test_x)
In [268]:
train_x_t.head()
Out[268]:
In [269]:
tr_fns
Out[269]:
Now let us build some simple loss functions for 1d labels.
In [270]:
def bdd_cross_entropy(pred, label):
    """Bounded cross-entropy: -mean(label * log(pred)), with a tiny additive
    constant inside the log so that pred == 0 cannot produce -inf."""
    eps = 10**(-20)
    return -np.mean(label * np.log(pred + eps))
In [271]:
def MSE(pred,label):
    """Mean squared error between predictions and labels."""
    diff = pred - label
    return np.mean(diff * diff)
In [272]:
def acc(pred,label):
    """Fraction of samples where thresholding pred at 0.5 agrees with label == 1."""
    predicted_positive = pred >= 0.5
    actual_positive = label == 1
    return np.mean(predicted_positive == actual_positive)
Now let us define the `find_split` function, which searches for the single boolean column whose split most reduces a given loss.
In [273]:
def find_split(x, y, loss, verbose = False):
    """Find the boolean column of x whose True/False split of y most reduces loss.

    Parameters
    ----------
    x : DataFrame of boolean feature columns.
    y : labels aligned with the rows of x.
    loss : callable(prediction, labels) -> scalar, where prediction is the
        mean label of a group; lower is better.
    verbose : if truthy, print the loss improvement of every candidate column.

    Returns
    -------
    (best_column, best_loss) — best_column is None when no split improves on
    predicting the global mean of y.
    """
    min_ax = None
    base_loss = loss(np.mean(y), y)  # loss of predicting the global mean
    min_loss = base_loss
    N = len(x)
    for col_name in x.columns:
        mask = x[col_name]
        num_pos = np.sum(mask)
        num_neg = N - num_pos
        # BUG FIX: skip degenerate all-True/all-False columns — they would
        # take np.mean of an empty slice (nan + RuntimeWarning) and can never
        # be a valid split anyway.
        if num_pos == 0 or num_neg == 0:
            continue
        pos_y = np.mean(y[mask])
        neg_y = np.mean(y[~mask])
        # Size-weighted average of the two branch losses.
        l = (num_pos*loss(pos_y, y[mask]) + num_neg*loss(neg_y, y[~mask]))/N
        if verbose:
            print("Column {0} split has improved loss {1}".format(col_name, base_loss-l))
        if l < min_loss:
            min_loss = l
            min_ax = col_name
    return min_ax, min_loss
In [278]:
find_split(train_x_t, train_y, MSE, verbose = True)
Out[278]:
In [279]:
# verbose=False (not 0) for consistency with find_split's boolean default.
find_split(train_x_t, train_y, bdd_cross_entropy, verbose = False)
Out[279]:
In [280]:
# verbose=False (not 0) for consistency with find_split's boolean default.
# NOTE(review): find_split MINIMIZES its criterion, but accuracy is
# higher-is-better — as written this selects the split with the WORST
# accuracy. Confirm intent (perhaps pass 1 - acc instead).
find_split(train_x_t, train_y, acc, verbose = False)
Out[280]:
In [281]:
# Default rate (mean of the 0/1 target) among training rows at or above the
# 0.8 quantile threshold of PAY_0.
np.mean(train_y[train_x_t['PAY_00.8']])
Out[281]:
In [283]:
# Default rate for the complement: training rows below that PAY_0 threshold.
np.mean(train_y[~train_x_t['PAY_00.8']])
Out[283]:
In [284]:
# Default rate among training rows at or above the 0.2 quantile of AGE.
np.mean(train_y[train_x_t['AGE0.2']])
Out[284]:
In [285]:
# Default rate for the complement: training rows below the 0.2 AGE quantile.
np.mean(train_y[~train_x_t['AGE0.2']])
Out[285]: