In [1]:
import numpy as np
import pandas as pd
import matplotlib
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
Let's generate some fake data:
In [2]:
def gimme_data():
    n = 500
    k = 100
    df = pd.DataFrame()
    ids = np.arange(n)
    np.random.shuffle(ids)
    df['entity_id'] = ids

    # ===== Underlying features ======
    # age is random
    df['age'] = np.random.randn(len(df)) * 15 + 45
    df.loc[df.age < 0, 'age'] = 1.0
    # gender is random
    df['gender_female'] = np.random.binomial(1, 0.6, len(df))
    # incident rate is random, but bi-modal
    switch = (np.random.randn(len(df)) > 0.2).astype(float)
    df['incident_rate'] = np.random.beta(5, 2, len(df)) * switch + np.random.beta(1, 5, len(df)) * (1 - switch)
    # some people have a rare mutation
    df['random_feature_1'] = (np.random.randn(len(df)) > .5).astype(float)

    # ====== Actual risk ======
    # we have a latent score that determines actual risk
    df['latent_score'] = 0.0
    # every 30 years of age adds a risk point
    df.loc[:, 'latent_score'] += df['age'] / 30
    # being male adds 2 risk points
    df.loc[df.gender_female == 0, 'latent_score'] = df.loc[df.gender_female == 0, 'latent_score'] + 2
    # incident rate * 2 for risk
    df['latent_score'] = df['latent_score'] + df['incident_rate'] * 2
    # if you're young, female, and have the rare mutation, you're at very high risk
    small_group = (df.age < 25) & (df.gender_female == 1) & (df.random_feature_1 == 1)
    df.loc[small_group, 'latent_score'] = df.loc[small_group, 'latent_score'] + 25
    # remember a cutoff - above this, you're positive (90th percentile)
    cutoff = np.percentile(df.latent_score, q=90)
    # make some noooooise
    df['latent_score'] = df['latent_score'] + np.random.randn(len(df)) * 1
    # convert latent score to true label
    df['true_label'] = df['latent_score'] > cutoff
    del df['latent_score']

    # ===== Distracting Features =====
    # some binary ones
    for idx in range(2, 15):
        df['random_feature_%d' % idx] = ((np.random.randn(len(df)) * np.random.rand() * 5 + np.random.randn() * 5) > 2).astype(float)
    # some continuous ones
    for idx in range(16, 30):
        df['random_feature_%d' % idx] = np.random.randn(len(df)) * np.random.rand() * 5 + np.random.randn() * 5
    # some boring correlates of age
    for idx in range(31, 60):
        df['random_feature_%d' % idx] = df['age'] + np.random.rand() * 20 + np.random.rand() * 100 + \
            np.random.randn(len(df)) * 10
    # some boring correlates of gender
    for idx in range(61, 90):
        df['random_feature_%d' % idx] = df['gender_female'] + np.random.randn(len(df)) * np.random.rand() * 5
    df = df[['entity_id', 'age', 'gender_female', 'incident_rate', 'true_label'] +
            [c for c in df.columns if c.startswith('random')]]
    return df
In [3]:
train = gimme_data()
test = gimme_data()
In [4]:
train.true_label.value_counts()
Out[4]:
In [5]:
test.true_label.value_counts()
Out[5]:
In [6]:
small_group = (train.age<25)&(train.gender_female==1)&(train.random_feature_1==1)
train[small_group].true_label.value_counts()
Out[6]:
In [7]:
small_group = (test.age<25)&(test.gender_female==1)&(test.random_feature_1==1)
test[small_group].true_label.value_counts()
Out[7]:
Yup - by construction, the +25 bump on the latent score pushes the young/female/mutation group well past the 90th-percentile cutoff, so that small group is almost entirely positive in both train and test.
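To put a rough number on it, here is a quick sketch (not part of the original notebook; the helper name is just for illustration) comparing the subgroup's positive rate to the overall base rate:

# Sketch: compare the subgroup's positive rate to the overall base rate.
# Assumes `train` and `test` from the cells above.
def subgroup_rate(df):
    group = (df.age < 25) & (df.gender_female == 1) & (df.random_feature_1 == 1)
    return df.loc[group, 'true_label'].mean(), df['true_label'].mean(), group.sum()

for name, df in [('train', train), ('test', test)]:
    grp_rate, base_rate, n_grp = subgroup_rate(df)
    print('%s: subgroup positive rate %.2f (n=%d) vs. base rate %.2f'
          % (name, grp_rate, n_grp, base_rate))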
In [8]:
rf = RandomForestClassifier(n_estimators=200)
rf.fit(X=train.drop(columns=['entity_id', 'true_label']),
       y=train['true_label'])
Out[8]:
In [9]:
preds = rf.predict_proba(test.drop(columns=['entity_id', 'true_label']))
In [10]:
fpr, tpr, _ = roc_curve(y_true=test.true_label, y_score=preds[:,1])
In [11]:
roc_auc = auc(fpr, tpr)
In [12]:
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
Looking alright.
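A single overall AUC can hide how the model treats a small subgroup, though. One sanity check (a sketch added here, not from the original notebook; the top-10% cutoff is an assumption) is to count how many members of the high-risk subgroup actually score in the top 10%:

# Sketch: how many of the young/female/mutation group land in the top 10% of scores?
# Assumes `test` and `preds` from the cells above (test is still in its original row order).
group = (test.age < 25) & (test.gender_female == 1) & (test.random_feature_1 == 1)
scores = pd.Series(preds[:, 1], index=test.index)
threshold = scores.quantile(0.9)  # score cutoff for the top 10%
print('subgroup members scoring in the top 10%%: %d of %d'
      % ((scores[group] >= threshold).sum(), group.sum()))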
In [13]:
test['score'] = preds[:,1]
In [14]:
test = test.sort_values(by='score', ascending=False)
In [15]:
test.head(10)
Out[15]:
But the model didn't pick up on the interaction term - the top-ranked rows aren't dominated by the young/female/mutation group, even though that group is almost entirely positive.
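One way to see why (a sketch, not in the original notebook): the rare-mutation flag is buried among dozens of distracting features, so we would expect the forest to give it little weight. Feature importances make that visible:

# Sketch: inspect which features the forest leaned on.
# Assumes `rf` and `train` from the cells above; column order matches the training matrix.
feature_names = train.drop(columns=['entity_id', 'true_label']).columns
importances = pd.Series(rf.feature_importances_, index=feature_names)
print(importances.sort_values(ascending=False).head(10))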
In [16]:
# rearrange columns a bit
test = test[['entity_id','true_label','score'] + [c for c in test.columns if c not in ['entity_id','true_label','score']]]
In [31]:
test.to_csv('test_data.csv', index=False)
In [21]:
test[['entity_id','score','true_label','gender_female','age']].to_csv('bias_exercise.csv',index=False)
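The exported file is meant for a bias exercise. As a hedged sketch of the kind of audit it supports (the top-10% flagging rule here is an assumption for illustration, not the exercise's prescribed method), one could compare true positive rates by gender:

# Sketch: true positive rate by gender if we flag the top 10% of scores.
# Reads back the file written above; the 10% threshold is an illustrative assumption.
audit = pd.read_csv('bias_exercise.csv')
audit['flagged'] = audit['score'] >= audit['score'].quantile(0.9)
tpr_by_gender = audit[audit.true_label].groupby('gender_female')['flagged'].mean()
print(tpr_by_gender)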