# Let's create some simple fake data

``````

In [1]:

import numpy as np
import pandas as pd
import matplotlib
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

``````

## For the Crosstabs Example

Let's generate some fake data:

``````

In [2]:

def gimme_data():

n = 500
k = 100
df = pd.DataFrame()
ids = np.arange(n)
np.random.shuffle(ids)
df['entity_id'] = ids

# ===== Underlying features ======

# age is random
df['age'] = np.random.randn(len(df)) * 15 + 45
df.loc[df.age<0,'age'] = 1.0

# gender is random
df['gender_female'] = np.random.binomial(1, 0.6, len(df))

# incident rate is random, but bi-modal
switch = (np.random.randn(len(df)) > 0.2).astype(float)
df['incident_rate'] = np.random.beta(5,2,len(df)) * switch + np.random.beta(1,5,len(df)) * (1-switch)

# some people have a rare mutation
df['random_feature_1'] = (np.random.randn(len(df)) > .5).astype(float)

# ====== Actual risk ======

# we have a latent score that determines actual risk
df['latent_score'] = 0

# every 30 years of age adds a risk point
df.loc[:,'latent_score'] += df['age'] / 30

# being male adds 1 risk points
df.loc[df.gender_female==0, 'latent_score'] = df.loc[df.gender_female==0, 'latent_score'] + 2

# incident rate * 2 for risk
df['latent_score'] = df['latent_score'] + df['incident_rate'] * 2

# if you're young, female, and have the rare mutation, you're at very high risk
small_group = (df.age<25)&(df.gender_female==1)&(df.random_feature_1==1)
df.loc[small_group, 'latent_score'] = df.loc[small_group, 'latent_score'] + 25

# remember a cutoff - above this, you're positive (90th percentile)
cutoff = np.percentile(df.latent_score, q=90)

# make some noooooise
df['latent_score'] = df['latent_score'] + np.random.randn(len(df)) * 1

# convert latent score to true label
df['true_label'] = df['latent_score'] > cutoff
del df['latent_score']

# ===== Distracting Features =====

for idx in range(2, 15):
df['random_feature_%d'%idx] = ((np.random.randn(len(df)) * np.random.rand()*5 + np.random.randn()*5) > 2).astype(float)

# some binary ones
for idx in range(16, 30):
df['random_feature_%d'%idx] = np.random.randn(len(df)) * np.random.rand()*5 + np.random.randn()*5

# some boring correlates of age
for idx in range(31, 60):
df['random_feature_%d'%idx] = df['age'] + np.random.rand()*20 + np.random.rand()*100 +\
np.random.randn(len(df))*10
# some boring correlates of gender
for idx in range(61, 90):
df['random_feature_%d'%idx] = df['gender_female'] + np.random.randn(len(df)) * np.random.rand()*5

df = df[['entity_id','age','gender_female','incident_rate','true_label'] +
[c for c in df.columns if c.startswith('random')]]

return df

``````

#### Generate train and test set

``````

In [3]:

train = gimme_data()
test = gimme_data()

``````
``````

In [4]:

train.true_label.value_counts()

``````
``````

Out[4]:

False    441
True      59
Name: true_label, dtype: int64

``````
``````

In [5]:

test.true_label.value_counts()

``````
``````

Out[5]:

False    434
True      66
Name: true_label, dtype: int64

``````

#### Let's see if the interaction term made it through:

``````

In [6]:

small_group = (train.age<25)&(train.gender_female==1)&(train.random_feature_1==1)
train[small_group].true_label.value_counts()

``````
``````

Out[6]:

True    12
Name: true_label, dtype: int64

``````
``````

In [7]:

small_group = (test.age<25)&(test.gender_female==1)&(test.random_feature_1==1)
test[small_group].true_label.value_counts()

``````
``````

Out[7]:

True    11
Name: true_label, dtype: int64

``````

Yup!

### Fit and Predict with an RF

``````

In [8]:

rf = RandomForestClassifier(n_estimators=200)
rf.fit(X=train.drop(['entity_id','true_label'],1),
y=train['true_label'])

``````
``````

Out[8]:

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_split=1e-07, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
verbose=0, warm_start=False)

``````
``````

In [9]:

preds = rf.predict_proba(test.drop(['entity_id','true_label'],1))

``````

#### Just making sure that the RF can do something sensible:

``````

In [10]:

fpr, tpr, _ = roc_curve(y_true=test.true_label, y_score=preds[:,1])

``````
``````

In [11]:

roc_auc = auc(fpr, tpr)

``````
``````

In [12]:

plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()

``````
``````

``````

Looking alright.

``````

In [13]:

test['score'] = preds[:,1]

``````
``````

In [14]:

test = test.sort_values(by='score', ascending=False)

``````
``````

In [15]:

``````
``````

Out[15]:

entity_id
age
gender_female
incident_rate
true_label
random_feature_1
random_feature_2
random_feature_3
random_feature_4
random_feature_5
...
random_feature_81
random_feature_82
random_feature_83
random_feature_84
random_feature_85
random_feature_86
random_feature_87
random_feature_88
random_feature_89
score

465
478
63.227958
0
0.818133
True
0.0
1.0
0.0
1.0
1.0
...
-1.766104
4.528153
-0.402657
-0.284959
2.653418
2.360303
10.957686
3.624214
0.276944
0.615

180
174
66.557592
0
0.412870
False
0.0
0.0
0.0
0.0
0.0
...
0.808632
2.705668
-0.495744
-0.390279
0.966705
-3.702808
-2.249557
0.864319
-0.984309
0.585

397
192
81.583693
0
0.354826
False
1.0
0.0
0.0
1.0
0.0
...
-3.531336
-6.141112
-1.981490
-0.471732
0.057509
1.610588
2.705318
2.498322
0.766655
0.580

215
15
57.108930
0
0.527284
False
0.0
0.0
0.0
1.0
0.0
...
-1.400378
4.954661
0.156229
-0.321477
-1.323733
0.830541
-1.726930
11.488804
1.389767
0.575

368
142
51.996117
0
0.768863
True
0.0
1.0
0.0
1.0
1.0
...
-3.840975
0.122851
-1.033036
0.071066
-4.006721
-3.869528
5.631649
3.828006
0.996846
0.565

104
126
57.526542
0
0.898845
True
0.0
0.0
0.0
1.0
0.0
...
-1.693623
1.874155
-1.421506
-0.204338
3.209767
-1.363325
-2.548176
-5.714439
-1.685581
0.565

328
375
44.509787
0
0.586849
False
0.0
0.0
0.0
1.0
1.0
...
-0.501809
3.066686
-0.485009
0.100575
3.096863
0.061533
-2.806147
-2.001836
-1.398406
0.565

99
370
67.807486
0
0.561012
False
0.0
0.0
0.0
1.0
0.0
...
1.671493
4.571253
0.137551
-0.379693
2.269276
-4.799680
5.481302
-4.811187
2.561597
0.560

225
41
65.159263
0
0.673079
False
0.0
0.0
0.0
1.0
0.0
...
-1.037459
1.810626
-0.490970
-0.545345
-1.355171
-0.018782
6.332184
-2.682034
2.491792
0.560

198
37
55.379742
0
0.923638
True
0.0
0.0
1.0
1.0
0.0
...
-0.416719
1.484297
1.548530
-0.031410
-1.359145
-0.330880
-0.239769
-6.869188
-2.498499
0.560

10 rows × 92 columns

``````

But didn't pick up on the interaction term.

``````

In [16]:

# rearrange columns a bit
test = test[['entity_id','true_label','score'] + [c for c in test.columns if c not in ['entity_id','true_label','score']]]

``````
``````

In [31]:

test.to_csv('test_data.csv', index=False)

``````

### For the bias exercise

``````

In [21]:

test[['entity_id','score','true_label','gender_female','age']].to_csv('bias_exercise.csv',index=False)

``````