In [110]:
!date
import numpy as np, matplotlib.pyplot as plt, pandas as pd, seaborn as sns
%matplotlib inline
sns.set_context('talk')
sns.set_style('darkgrid')


Sun Mar  8 16:22:36 PDT 2015

In [111]:
# set random seed for reproducibility
np.random.seed(12345)

Attribute Selection

Remember our good old noisy sine wave?


In [112]:
x_true = np.linspace(0,4,1000)
y_true = np.sin(x_true)

obs = np.random.choice(range(1000), size=100, replace=False)
x_obs = x_true[obs]
y_obs = np.random.normal(y_true[obs], .3)

plt.plot(x_true, y_true, label='True')
plt.plot(x_obs, y_obs, 's', label='Observed')
plt.legend(loc=(1,.1))


Out[112]:
<matplotlib.legend.Legend at 0x7ffc20d78710>

And here are a few ways to fit this:


In [113]:
import sklearn.linear_model, sklearn.svm, sklearn.ensemble

In [114]:
m1 = sklearn.linear_model.LinearRegression()
m2 = sklearn.svm.SVR()
m3 = sklearn.ensemble.RandomForestRegressor()

In [115]:
X = x_obs[:,None]
y = y_obs

In [116]:
m1.fit(X,y)


Out[116]:
LinearRegression(copy_X=True, fit_intercept=True, normalize=False)

In [117]:
m2.fit(X,y)


Out[117]:
SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [118]:
m3.fit(X,y)


Out[118]:
RandomForestRegressor(bootstrap=True, compute_importances=None,
           criterion='mse', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
           min_samples_split=2, n_estimators=10, n_jobs=1, oob_score=False,
           random_state=None, verbose=0)

In [119]:
def plot_pred(m,label):
    X = x_true[:,None]
    y = m.predict(X)
    plt.plot(X, y, label=label)

What do you think the predictions from m1, m2, and m3 are going to look like?


In [120]:
plt.plot(x_true, y_true, label='True')
plt.plot(x_obs, y_obs, 's', label='Observed')

plot_pred(m1, 'Linear Regression')
plot_pred(m2, 'SVM')
plot_pred(m3, 'Random Forest')

plt.legend(loc=(1,.1))


Out[120]:
<matplotlib.legend.Legend at 0x7ffc20c5f290>

Add random noise as a predictor


In [121]:
X = np.hstack((X, np.random.normal(size=(100,1))))

In [122]:
m1.fit(X,y)
m2.fit(X,y)
m3.fit(X,y)


Out[122]:
RandomForestRegressor(bootstrap=True, compute_importances=None,
           criterion='mse', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
           min_samples_split=2, n_estimators=10, n_jobs=1, oob_score=False,
           random_state=None, verbose=0)

In [123]:
def plot_pred_w_noise(m,label):
    X = x_true[:,None]
    X = np.hstack((X, np.random.normal(size=(1000,1))))
    y = m.predict(X)
    plt.plot(X[:,0], y, label=label, alpha=.75)
    
plt.plot(x_true, y_true, label='True')
plt.plot(x_obs, y_obs, 's', label='Observed')

plot_pred_w_noise(m1, 'Linear Regression')
plot_pred_w_noise(m2, 'SVM')
plot_pred_w_noise(m3, 'Random Forest')

plt.legend(loc=(1,.1))


Out[123]:
<matplotlib.legend.Legend at 0x7ffc20b51ed0>

Work through one round of forward selection


In [124]:
X = x_obs[:,None]
y = y_obs
X = np.hstack((X, np.random.normal(size=(100,10))))

In [125]:
# how does forward selection determine which column of X to include first?

# choose the model to use
m = m1

# loop through columns, see which one is most informative
n,p = X.shape
for j in range(p):
    Xj = X[:,[j]]
    m.fit(Xj, y)
    
    # should we gauge information in-sample or out-of-sample?
    y_pred = m.predict(Xj)
    
    print 'j=',j,'in-sample MSE=',np.round(np.mean((y - y_pred)**2),2)


j= 0 in-sample MSE= 0.24
j= 1 in-sample MSE= 0.36
j= 2 in-sample MSE= 0.36
j= 3 in-sample MSE= 0.36
j= 4 in-sample MSE= 0.36
j= 5 in-sample MSE= 0.36
j= 6 in-sample MSE= 0.36
j= 7 in-sample MSE= 0.35
j= 8 in-sample MSE= 0.36
j= 9 in-sample MSE= 0.36
j= 10 in-sample MSE= 0.36

So the first column to include is column zero! No surprise...
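
To push this past one round: here is a minimal sketch of a greedy forward-selection loop that keeps adding whichever remaining column gives the lowest in-sample MSE. It reuses m, X, y, and p from the cells above; in practice you would use cross-validated error and stop once the improvement becomes negligible.

# greedy forward selection, sketched with in-sample MSE as the criterion
selected = []
remaining = list(range(p))
while remaining:
    best_j, best_mse = None, np.inf
    for j in remaining:
        m.fit(X[:, selected + [j]], y)
        mse = np.mean((y - m.predict(X[:, selected + [j]]))**2)
        if mse < best_mse:
            best_j, best_mse = j, mse
    selected.append(best_j)
    remaining.remove(best_j)
    print('added column %d, in-sample MSE now %.2f' % (best_j, best_mse))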

Work through one round of backwards elimination


In [126]:
X = x_obs[:,None]
y = y_obs
X = np.hstack((X, np.random.normal(size=(100,10))))

In [127]:
# how does backwards elimination determine which column of X to remove first?

# choose the model to use
m = m1

# see how well the whole thing predicts; let's use out-of-sample scores this time
# (note: cross_val_score uses R^2 by default for regressors, so the "increase in MSE"
#  printed below is really the drop in cross-validated R^2 when column j is removed)
import sklearn.cross_validation
full_score = sklearn.cross_validation.cross_val_score(m, X, y)

# loop through columns, see which one is most informative
n,p = X.shape
for j in range(p):
    X_wo_j = X[:, range(0,j)+range(j+1,p)]
    score_wo_j = sklearn.cross_validation.cross_val_score(m, X_wo_j, y)
    
    print 'j=',j,'out-of-sample increase in MSE=', np.round(np.mean(full_score - score_wo_j),2)


j= 0 out-of-sample increase in MSE= 0.41
j= 1 out-of-sample increase in MSE= -0.01
j= 2 out-of-sample increase in MSE= -0.01
j= 3 out-of-sample increase in MSE= -0.02
j= 4 out-of-sample increase in MSE= -0.01
j= 5 out-of-sample increase in MSE= -0.01
j= 6 out-of-sample increase in MSE= -0.05
j= 7 out-of-sample increase in MSE= -0.0
j= 8 out-of-sample increase in MSE= -0.06
j= 9 out-of-sample increase in MSE= 0.01
j= 10 out-of-sample increase in MSE= -0.02

In [128]:
# so the first one to kick out is j=8, although nothing besides j=0 seems important

If you need this in your project, look into sklearn's version of backwards elimination, recursive feature elimination: sklearn.feature_selection.RFE and RFECV.


In [129]:
import sklearn.feature_selection

In [130]:
r = sklearn.feature_selection.RFE(sklearn.linear_model.LinearRegression(), n_features_to_select=2, step=1)
r.fit(X,y)


Out[130]:
RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, normalize=False),
  estimator_params={}, n_features_to_select=2, step=1, verbose=0)

In [131]:
# features included
r.support_


Out[131]:
array([ True,  True, False, False, False, False, False, False, False,
       False, False], dtype=bool)
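
RFECV does the same recursive elimination, but uses cross-validation to choose how many features to keep; a minimal sketch with the same estimator as above:

# RFECV: recursive feature elimination, with the number of features chosen by CV
r_cv = sklearn.feature_selection.RFECV(sklearn.linear_model.LinearRegression(), step=1)
r_cv.fit(X, y)
r_cv.n_features_, r_cv.support_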

Discretization

An example from Verbal Autopsy: symptom duration


In [132]:
# load va data from Week 4
df = pd.read_csv('/homes/abie/ML4HM/Week_4/IHME_PHMRC_VA_DATA_ADULT_Y2013M09D11_0.csv', low_memory=False)

We are going to look at questions about the cough symptom here: a2_32 and a2_33


In [133]:
# did deceased have cough?
df.a2_32.head()


Out[133]:
0     No
1    Yes
2    Yes
3    Yes
4     No
Name: a2_32, dtype: object

In [134]:
# tabulate that: did deceased have cough?
df.a2_32.value_counts()


Out[134]:
No                   5054
Yes                  2766
Don't Know             19
Refused to Answer       1
dtype: int64

In [135]:
# how long did the cough last (in days)?
df.a2_33.head()


Out[135]:
0     0
1    60
2    60
3    30
4     0
Name: a2_33, dtype: int64

In [136]:
sns.distplot(np.log(df.a2_33+1))
plt.xlabel('log(duration+1)')
plt.ylabel('probability density')
plt.axis(xmin=0)


Out[136]:
(0, 12.0, 0.0, 3.5)

In [137]:
# it is getting annoying to remember that silly name...
df['cough'] = df.a2_32 == 'Yes'
df['duration'] = df.a2_33.map(float)

In [138]:
df[df.cough].duration.describe(percentiles=[.025,.25,.50,.75,.975])


Out[138]:
count     2766.000000
mean        85.345264
std        457.400185
min          0.000000
2.5%         0.000000
25%          5.000000
50%         15.000000
75%         60.000000
97.5%      360.000000
max      14400.000000
Name: duration, dtype: float64

In [139]:
14400 / 30. / 12.


Out[139]:
40.0

Duration ranges from 0 to 14400 days.

Now: discretize the duration variable into $K$ equal-width bins, with $K = \sqrt{2766}$.


In [140]:
np.sqrt(2766)


Out[140]:
52.592775169218825

In [141]:
pd.cut(df.duration, bins=53)


Out[141]:
0     (-14.4, 271.698]
1     (-14.4, 271.698]
2     (-14.4, 271.698]
3     (-14.4, 271.698]
4     (-14.4, 271.698]
5     (-14.4, 271.698]
6     (-14.4, 271.698]
7     (-14.4, 271.698]
8     (-14.4, 271.698]
9     (-14.4, 271.698]
10    (-14.4, 271.698]
11    (-14.4, 271.698]
12    (-14.4, 271.698]
13    (-14.4, 271.698]
14    (-14.4, 271.698]
...
7826    (-14.4, 271.698]
7827    (-14.4, 271.698]
7828    (-14.4, 271.698]
7829    (-14.4, 271.698]
7830    (-14.4, 271.698]
7831    (-14.4, 271.698]
7832    (-14.4, 271.698]
7833    (-14.4, 271.698]
7834    (-14.4, 271.698]
7835    (-14.4, 271.698]
7836    (-14.4, 271.698]
7837    (-14.4, 271.698]
7838    (-14.4, 271.698]
7839    (-14.4, 271.698]
7840    (-14.4, 271.698]
Name: duration, Length: 7841, dtype: category
Categories (53, object): [(-14.4, 271.698] < (271.698, 543.396] < (543.396, 815.0943] < (815.0943, 1086.792] ... (13313.208, 13584.906] < (13584.906, 13856.604] < (13856.604, 14128.302] < (14128.302, 14400]]

In [142]:
# this is "fixed width" discretization, hence the name fw_disc:
df['fw_disc_duration'] = pd.cut(df.duration, bins=53)

And how do you make this discretized covariate something suitable for binary classifiers? One-Hot encoding, of course.


In [143]:
disc_duration = pd.cut(df.duration, bins=53)

In [144]:
# one hot encode them
for dur in disc_duration.unique():
    df['fw_disc_duration_%s'%dur] = (disc_duration == dur)
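
The loop above does the job; pd.get_dummies is a shorter route to the same kind of indicator columns (in recent pandas, at least), handing back a DataFrame with one column per bin:

# equivalent one-hot encoding via pandas (a sketch)
fw_dummies = pd.get_dummies(disc_duration, prefix='fw_disc_duration')
fw_dummies.shape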

How about fixed frequency discretization?


In [145]:
bins = df.duration.order().iloc[(len(df.duration)*np.linspace(0,1,53,endpoint=False)).astype(int)]
bins = np.unique(bins)

df['ff_disc_duration'] = pd.cut(df.duration, bins)

for dur in df.ff_disc_duration.unique():
    df['duration_in_bin_%s'%dur] = (df.ff_disc_duration == dur)
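
pd.qcut is the built-in way to do quantile (fixed-frequency) binning, but with this many tied durations it tends to fail on duplicate bin edges, which is exactly why the cell above deduplicates the boundaries first. A sketch using only the cough == True rows and a handful of bins:

# quantile binning with pandas; heavy ties can make qcut complain about
# duplicate bin edges, so keep the number of bins small here
pd.qcut(df[df.cough].duration, 4).value_counts()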

Or Thermometer encoding:


In [146]:
for lb in bins:
    df['duration_in_bin_at_least_%.2f'%lb] = (df.duration >= lb)

Another approach uses only the rows where cough == True to get the distribution of duration:


In [147]:
ordered_durations = np.array(df[df.cough].duration.order())
ordered_durations[np.linspace(0,2766,52, endpoint=False).astype(int)]


Out[147]:
array([   0.,    0.,    0.,    0.,    1.,    1.,    2.,    2.,    3.,
          3.,    3.,    4.,    4.,    5.,    6.,    7.,    7.,    7.,
          7.,    7.,    9.,   10.,   14.,   14.,   15.,   15.,   15.,
         20.,   24.,   30.,   30.,   30.,   30.,   30.,   30.,   30.,
         60.,   60.,   60.,   60.,   60.,   90.,   90.,   90.,  120.,
        150.,  180.,  180.,  180.,  240.,  360.,  450.])

In [148]:
df['ff_disc_duration'] = [np.sum(df.duration[i] >= ordered_durations) for i in df.index]

What we ended up doing is looking at the mean duration stratified by cause:


In [149]:
df.groupby('gs_text34').duration.mean().order(ascending=False)


Out[149]:
gs_text34
TB                                 156.108696
Acute Myocardial Infarction         74.345000
Lung Cancer                         56.566038
Pneumonia                           55.201852
AIDS                                47.519920
Esophageal Cancer                   45.125000
COPD                                43.222222
Other Cardiovascular Diseases       32.021635
Leukemia/Lymphomas                  29.602564
Breast Cancer                       28.661538
Poisonings                          28.058140
Malaria                             26.210000
Falls                               25.745665
Asthma                              25.553191
Other Non-communicable Diseases     24.020033
Diarrhea/Dysentery                  22.118421
Prostate Cancer                     21.687500
Colorectal Cancer                   19.080808
Stomach Cancer                      16.322581
Renal Failure                       15.451923
Stroke                              15.314286
Diabetes                            15.007246
Cirrhosis                           14.070288
Road Traffic                        10.643564
Other Infectious Diseases           10.148289
Suicide                              8.661290
Cervical Cancer                      8.006452
Epilepsy                             6.479167
Homicide                             5.251497
Other Injuries                       2.766990
Maternal                             2.636752
Fires                                1.631148
Drowning                             0.396226
Bite of Venomous Animal              0.303030
Name: duration, dtype: float64

In [150]:
sns.distplot(df.groupby('gs_text34').duration.mean())


Out[150]:
<matplotlib.axes.AxesSubplot at 0x7ffc209b2b50>

In [151]:
df['cough_duration_is_long'] = (df.duration > 60)
df['cough_duration_is_really_long'] = (df.duration > 90)

Projections

Not too much more to say, since a lot of last week was really on this. You've got PCA, MDS, Factor Analysis, and Random Tree Encodings in your Exercise 8 notebook, if you need any of that fancy stuff.

If you need partial least squares regression in sklearn, you can find it here:


In [152]:
import sklearn.cross_decomposition

m4 = sklearn.cross_decomposition.PLSRegression()
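
m4 is never used below, but it has the usual fit/predict interface; a quick sketch on the X and y left over from the selection examples above (PLS is happiest with a 2-D y, hence the reshape):

# sketch: fit PLS (2 components by default) on the noisy sine wave plus noise columns
m4.fit(X, y[:, None])
y_pls = m4.predict(X)
np.round(np.mean((y - y_pls.ravel())**2), 2)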

So for our exercise, just two little examples of "semantic projections".

One from VA, wherein dates can be subtracted to make a more meaningful feature:


In [153]:
# year of birth
df.g1_01y.head()


Out[153]:
0    1958
1     NaN
2     NaN
3    1923
4    1933
Name: g1_01y, dtype: object

In [154]:
# year of death
df.g1_06y.head()


Out[154]:
0    2009
1    2008
2    2008
3    2009
4    2009
Name: g1_06y, dtype: float64

It is much more meaningful to have the age of the deceased (which is actually g1_07a in the database, but let's pretend it is not).


In [155]:
def float_or_nan(x):
    try:
        return float(x)
    except ValueError:
        return np.nan

In [156]:
age_at_death = df.g1_06y - df.g1_01y.map(float_or_nan)
age_at_death


Out[156]:
0     51
1    NaN
2    NaN
3     86
4     76
5     16
6     65
7    NaN
8     68
9     32
10    35
11    17
12   NaN
13   NaN
14    20
...
7826    18
7827    51
7828    68
7829    66
7830    29
7831    43
7832    52
7833    49
7834    67
7835   NaN
7836    52
7837    78
7838   NaN
7839    22
7840    10
Length: 7841, dtype: float64

In [157]:
# for use later:
reported_age_at_death = df.g1_07a

Another place this comes up: time series, like Ebola case counts: http://institutefordiseasemodeling.github.io/EVD/
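
For a cumulative case-count series, the natural semantic projection is differencing: turn cumulative totals into new cases per reporting period. A tiny sketch with made-up numbers (not real Ebola counts):

# hypothetical cumulative case counts, weekly
cum_cases = pd.Series([10, 14, 21, 30, 30, 45],
                      index=pd.date_range('2014-09-07', periods=6, freq='W'))
new_cases = cum_cases.diff()  # new cases reported each week
new_cases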

Sampling

Sample 50,000 numbers from 0, 1, ..., 1e7 - 1, with replacement:


In [158]:
sampled_ints = np.random.choice(range(int(1e7)), 50000)
len(sampled_ints)


Out[158]:
50000

In [159]:
np.min(sampled_ints), 1./1000 * 1e7


Out[159]:
(61, 10000.0)

In [160]:
np.mean(sampled_ints), 1e7 /2


Out[160]:
(4993943.4337600004, 5000000.0)

In [161]:
np.unique(sampled_ints).shape


Out[161]:
(49885,)

Again, without replacement:


In [162]:
sampled_ints=np.random.choice(np.arange(1e7), size=50000, replace=False)

In [163]:
np.unique(sampled_ints).shape


Out[163]:
(50000,)

Speed is a good reason to do this sort of sampling. Let us revisit the noisy sine wave to see what I mean:


In [164]:
np.random.seed(12345)

x_true = np.linspace(0,4,1000)
y_true = np.sin(x_true)

x_obs = np.linspace(0,4,1000*1000)
y_obs = np.random.normal(np.sin(x_obs), .3)

plt.plot(x_obs, y_obs, ',', label='Observed', alpha=.5)
plt.plot(x_true, y_true, label='True', linewidth=10)
plt.legend(loc=(1,.1))


Out[164]:
<matplotlib.legend.Legend at 0x7ffc20a7f9d0>

In [165]:
m = sklearn.ensemble.RandomForestRegressor(max_depth=10)
%time m.fit(x_obs[:,None], y_obs)


CPU times: user 10.5 s, sys: 6.53 ms, total: 10.5 s
Wall time: 10.5 s
Out[165]:
RandomForestRegressor(bootstrap=True, compute_importances=None,
           criterion='mse', max_depth=10, max_features='auto',
           max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
           min_samples_split=2, n_estimators=10, n_jobs=1, oob_score=False,
           random_state=None, verbose=0)

In [166]:
plt.plot(x_obs, y_obs, ',', label='Observed', alpha=.5)
plt.plot(x_true, y_true, label='True', linewidth=4)
plot_pred(m, 'Random Forest')
plt.legend(loc=(1,.1))


Out[166]:
<matplotlib.legend.Legend at 0x7ffc208c5110>

In [167]:
import time

In [168]:
time.time()


Out[168]:
1425856981.181412

In [169]:
n = len(x_obs)

In [170]:
t = {}
for n_sample in [10,50,100,500,1000,5000,10*1000,50*1000,100*1000]:
    random_sample = np.random.choice(range(n), size=n_sample, replace=False)
    
    t[n_sample] = time.time()
    %time m.fit(x_obs[random_sample,None], y_obs[random_sample])
    t[n_sample] = time.time() - t[n_sample]


CPU times: user 11.7 ms, sys: 71 µs, total: 11.8 ms
Wall time: 7.8 ms
CPU times: user 7.3 ms, sys: 36 µs, total: 7.34 ms
Wall time: 6.14 ms
CPU times: user 5.32 ms, sys: 0 ns, total: 5.32 ms
Wall time: 5.35 ms
CPU times: user 7.57 ms, sys: 0 ns, total: 7.57 ms
Wall time: 7.57 ms
CPU times: user 16.9 ms, sys: 0 ns, total: 16.9 ms
Wall time: 13.1 ms
CPU times: user 36.6 ms, sys: 0 ns, total: 36.6 ms
Wall time: 36.7 ms
CPU times: user 74.2 ms, sys: 33 µs, total: 74.2 ms
Wall time: 74.3 ms
CPU times: user 425 ms, sys: 0 ns, total: 425 ms
Wall time: 426 ms
CPU times: user 935 ms, sys: 0 ns, total: 935 ms
Wall time: 936 ms

In [171]:
t = pd.Series(t)
plt.loglog(t.index, t, 's-')
plt.ylabel('Time (seconds)')
plt.xlabel('Rows of training data (n)')


Out[171]:
<matplotlib.text.Text at 0x7ffc2086f750>

Now what you should also do is have a look at the prediction accuracy as a function of the number of samples, to decide how much data is worth your time. (Evaluate out-of-sample, of course!)
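
Here is a minimal sketch of that check, reusing the subsampling loop above but scoring each fit on a held-out set (strictly, the holdout should be excluded from the training pool, but with a million rows the overlap is negligible):

# out-of-sample MSE as a function of training sample size (a sketch; m is still the RF)
holdout = np.random.choice(range(n), size=10000, replace=False)

mse = {}
for n_sample in [10, 100, 1000, 10*1000, 100*1000]:
    random_sample = np.random.choice(range(n), size=n_sample, replace=False)
    m.fit(x_obs[random_sample, None], y_obs[random_sample])
    y_pred = m.predict(x_obs[holdout, None])
    mse[n_sample] = np.mean((y_obs[holdout] - y_pred)**2)

mse = pd.Series(mse)
plt.loglog(mse.index, mse, 's-')
plt.ylabel('Out-of-sample MSE')
plt.xlabel('Rows of training data (n)')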

Cleansing

How long is the index of the age_at_death series we made in our semantic projection part of the exercise?


In [172]:
len(age_at_death.index)


Out[172]:
7841

And how much of it is missing?


In [173]:
age_at_death.isnull().sum()


Out[173]:
2020

We should fill it in somehow. What does the non-missing part look like?


In [174]:
age_at_death.describe()


Out[174]:
count    5821.00000
mean       49.68459
std        20.33336
min        -1.00000
25%        33.00000
50%        50.00000
75%        65.00000
max       101.00000
dtype: float64

Negative ages are not good...


In [175]:
plt.plot(reported_age_at_death.map(float_or_nan), age_at_death, 'o')
plt.xlabel('Reported Age at Death (years)')
plt.ylabel('Reported year of death minus year of birth')


Out[175]:
<matplotlib.text.Text at 0x7ffc2070c8d0>

Cleaning up a mess like this is something with which I suspect many of my students are familiar.
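
One simple cleanup rule, as a sketch: treat impossible (negative) differences as missing, and fall back to the reported age when the year-based calculation is unavailable.

# drop impossible negative ages, then fill gaps with the reported age at death
clean_age = age_at_death.copy()
clean_age[clean_age < 0] = np.nan
clean_age = clean_age.fillna(reported_age_at_death.map(float_or_nan))
clean_age.isnull().sum()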

Hidden in this section is a very interesting approach which DM calls "generating artificial data". This is a way to use any supervised learning algorithm to do unsupervised work. We will experiment with it using Nick's example data, and I hope he will push this further as part of his project:


In [176]:
df = pd.read_csv('/homes/abie/ML4HM/Week_9/Nick_project_data_example_ZWE_2010_DHS_cleaned.csv')

In [177]:
df.head()


Out[177]:
posteriormean time_to_watersource electricity radio tv refrigerator bicycle motorcycle car rooms_sleeping ... cookfuel_type11 cookfuel_type12 cookfuel_type13 watersource_location1 watersource_location2 watersource_location3 cooking_location1 cooking_location2 cooking_location3 cooking_location4
0 0.947429 0 1 0 1 0 0 0 0 1 ... 0 0 0 0 0 0 0 1 0 0
1 2.523223 0 1 0 1 1 1 0 1 3 ... 0 0 0 0 0 0 0 1 0 0
2 2.073207 0 1 0 1 1 0 0 1 4 ... 0 0 0 0 0 1 0 1 0 0
3 2.442217 15 1 0 1 1 0 0 1 3 ... 0 0 0 1 0 0 0 1 0 0
4 1.319935 0 1 1 1 1 0 0 0 3 ... 0 0 0 0 0 0 0 1 0 0

5 rows × 100 columns

Start with the feature vectors:


In [178]:
X_real = np.array(df.iloc[:, 1:])

Label it all as real:


In [179]:
n,p = X_real.shape

In [180]:
y_real = np.array(['real']*n)

Then make fake data, which has the same univariate distributions, but none of the correlation structure:


In [181]:
X_fake = np.empty((n,p))
for j in range(p):
    X_fake[:,j] = np.random.choice(X_real[:,j], size=n, replace=False)

Label this as fake:


In [182]:
y_fake = np.array(['fake']*n)

Stick them together and you have the labeled examples for supervised learning:


In [183]:
X = np.vstack((X_real, X_fake))
y = np.hstack((y_real, y_fake))

In [184]:
X_real.shape


Out[184]:
(9756, 99)

In [185]:
y_real.shape


Out[185]:
(9756,)

In [186]:
c1 = sklearn.linear_model.LogisticRegression()
c1.fit(X,y)
c1.score(X,y)


Out[186]:
0.5

In [187]:
c2 = sklearn.ensemble.RandomForestClassifier()
c2.fit(X,y)
c2.score(X,y)


Out[187]:
0.99882123821238211

In [188]:
y_pred = c2.predict(X)

In [189]:
np.mean(y==y_pred)


Out[189]:
0.99882123821238211

In [190]:
import sklearn.cross_validation

In [191]:
sklearn.cross_validation.cross_val_score(c2, X, y)


Out[191]:
array([ 0.97217097,  0.97201722,  0.96417589])

So the question for Nick is, how does RF do it?
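
One way to start answering that: the fitted forest exposes feature_importances_, which shows which columns it leaned on to separate real rows from permuted ones (the columns of X_real line up with df.columns[1:]):

# which features let the random forest tell real rows from permuted ones?
importances = pd.Series(c2.feature_importances_, index=df.columns[1:])
importances.order(ascending=False).head(10)  # order(), as elsewhere here; newer pandas calls it sort_values()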

Transforming Multiple Classes to Binary Classes

A sketch of the ways with the VA data:


In [192]:
# load va data from Week 4
df = pd.read_csv('/homes/abie/ML4HM/Week_4/IHME_PHMRC_VA_DATA_ADULT_Y2013M09D11_0.csv', low_memory=False)

In [193]:
df.gs_text34.value_counts()


Out[193]:
Stroke                             630
Other Non-communicable Diseases    599
Pneumonia                          540
AIDS                               502
Maternal                           468
Renal Failure                      416
Other Cardiovascular Diseases      416
Diabetes                           414
Acute Myocardial Infarction        400
Cirrhosis                          313
TB                                 276
Other Infectious Diseases          263
Diarrhea/Dysentery                 228
Road Traffic                       202
Breast Cancer                      195
Falls                              173
COPD                               171
Homicide                           167
Leukemia/Lymphomas                 156
Cervical Cancer                    155
Suicide                            124
Fires                              122
Drowning                           106
Lung Cancer                        106
Other Injuries                     103
Malaria                            100
Colorectal Cancer                   99
Poisonings                          86
Bite of Venomous Animal             66
Stomach Cancer                      62
Epilepsy                            48
Prostate Cancer                     48
Asthma                              47
Esophageal Cancer                   40
dtype: int64

In "one-vs-all" approach, for each multiclass label, make a new problem of classifying as "is it this label?"


In [194]:
y = (df.gs_text34 == 'Asthma')

In [195]:
pd.Series(y).value_counts()


Out[195]:
False    7794
True       47
dtype: int64

In "one-vs-one approach, for each pair of labels, make a new problem with instances just for that pair:


In [196]:
rows = df.gs_text34.isin(['Stroke', 'Asthma'])
y = df[rows].gs_text34

In [197]:
y.value_counts()


Out[197]:
Stroke    630
Asthma     47
dtype: int64
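
If you would rather not build these binary problems by hand, sklearn can wrap any binary classifier for you; a sketch of the relevant objects (fitting them needs a numeric feature matrix, which we have not built for the VA data here):

import sklearn.multiclass

# one binary classifier per label vs. the rest
ovr = sklearn.multiclass.OneVsRestClassifier(sklearn.linear_model.LogisticRegression())

# one binary classifier per pair of labels
ovo = sklearn.multiclass.OneVsOneClassifier(sklearn.linear_model.LogisticRegression())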

Calibration

Is anyone thinking about predicting probabilities for their projects at this point? It can be very cool, but also hard. Returning to the stuff from Nick's project:


In [198]:
X = np.vstack((X_real, X_fake))
y = np.hstack((y_real, y_fake))

In [199]:
train = np.random.choice(range(len(X)), size=len(X)/2, replace=False)
test = list(set(range(len(X))) - set(train))

c = sklearn.ensemble.RandomForestClassifier()
c.fit(X[train], y[train])


Out[199]:
RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)

We can get probability predictions from most classifiers in sklearn:


In [200]:
c.predict_proba(X[test[0]])


Out[200]:
array([[ 0.,  1.]])

Did that tend towards the right answer?


In [201]:
y[test[0]]


Out[201]:
'real'

Can't tell from that, actually...


In [202]:
c.classes_


Out[202]:
array(['fake', 'real'], 
      dtype='|S4')

So yes... How does it do, overall, on things it says have an 80% chance of being real?


In [203]:
y_pr_pred = c.predict_proba(X[test])

How many things like this are there?


In [204]:
np.sum(y_pr_pred[:,1]==.8)


Out[204]:
489

In [205]:
# in general, better to have a little wiggle room
np.sum((y_pr_pred[:,1] >= .79)&(y_pr_pred[:,1] < .81))


Out[205]:
489

In [206]:
rows = np.where(y_pr_pred[:,1]==.8)

In [207]:
test = np.array(test)
np.mean(y[test[rows]] == 'real')


Out[207]:
0.99386503067484666

So these predictions are a bit conservative! What about some low-probability examples?


In [208]:
np.sum(y_pr_pred[:,1]==0)


Out[208]:
1634

In [209]:
rows = np.where(y_pr_pred[:,1]==0)

In [210]:
np.mean(y[test[rows]] == 'real')


Out[210]:
0.0

Perhaps with more trees things would be better. But random forests are not really designed for probability prediction, so we may need to work on it a bit ourselves if calibrated probabilities are what we want. Got any ideas for how to do so?


In [211]:
import sklearn.linear_model, patsy

In [212]:
cal = sklearn.linear_model.LogisticRegression(C=1e4)
X_pr_pred = patsy.dmatrix('bs(x,10)', {'x':y_pr_pred[:,1]})
cal.fit(X_pr_pred,y[test])


Out[212]:
LogisticRegression(C=10000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, penalty='l2',
          random_state=None, tol=0.0001)

In [213]:
y_cal_pr = cal.predict_proba(X_pr_pred)

In [214]:
# in general, better to have a little wiggle room
np.sum((y_cal_pr[:,1] >= .79)&(y_cal_pr[:,1] < .81))


Out[214]:
217

In [215]:
rows = (y_cal_pr[:,1] >= .79)&(y_cal_pr[:,1] < .81)

In [216]:
np.mean(y[test[rows]] == 'real')


Out[216]:
0.82027649769585254

Quite a bit closer... how to check this more rigorously is something I can tell you more about sometime.
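
A more rigorous check is a reliability (calibration) plot: bin the calibrated predicted probabilities and compare each bin's average prediction against the empirical fraction of "real" examples. A sketch:

# reliability plot: predicted probability vs. observed frequency of 'real'
prob_bins = np.linspace(0, 1, 11)
bin_index = np.digitize(y_cal_pr[:, 1], prob_bins) - 1
bin_index = np.minimum(bin_index, len(prob_bins) - 2)  # put p=1.0 in the last bin

pred_mean, obs_freq = [], []
for b in range(len(prob_bins) - 1):
    in_bin = (bin_index == b)
    if in_bin.sum() > 0:
        pred_mean.append(y_cal_pr[in_bin, 1].mean())
        obs_freq.append(np.mean(y[test][in_bin] == 'real'))

plt.plot(pred_mean, obs_freq, 's-', label='calibrated RF')
plt.plot([0, 1], [0, 1], '--', label='perfect calibration')
plt.xlabel('predicted probability of "real"')
plt.ylabel('observed fraction "real"')
plt.legend(loc='upper left')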