In [ ]:
!date
import numpy as np, matplotlib.pyplot as plt, pandas as pd, seaborn as sns
%matplotlib inline
sns.set_context('talk')
sns.set_style('darkgrid')
In [ ]:
# set random seed for reproducibility
np.random.seed(12345)
In [ ]:
x_true = np.linspace(0,4,1000)
y_true = np.sin(x_true)
obs = np.random.choice(range(1000), size=100, replace=False)
x_obs = x_true[obs]
y_obs = np.random.normal(y_true[obs], .3)
plt.plot(x_true, y_true, label='True')
plt.plot(x_obs, y_obs, 's', label='Observed')
plt.legend(loc=(1,.1))
And here are a few ways to fit this:
In [ ]:
import sklearn.linear_model, sklearn.svm, sklearn.ensemble
In [ ]:
m1 = sklearn.linear_model.LinearRegression()
m2 = sklearn.svm.SVR()
m3 = sklearn.ensemble.RandomForestRegressor()
In [ ]:
X = x_obs[:,None]
y = y_obs
In [ ]:
m1.fit(X,y)
In [ ]:
m2.fit(X,y)
In [ ]:
m3.fit(X,y)
In [ ]:
def plot_pred(m, label):
    X = x_true[:,None]
    y = m.predict(X)
    plt.plot(X, y, label=label)
What do you think the predictions from m1, m2, and m3 are going to look like?
In [ ]:
plt.plot(x_true, y_true, label='True')
plt.plot(x_obs, y_obs, 's', label='Observed')
# plot_pred(m1, 'Linear Regression')
# plot_pred(m2, 'SVM')
# plot_pred(m3, 'Random Forest')
plt.legend(loc=(1,.1))
Add random noise as a predictor
In [ ]:
X = np.hstack((X, np.random.normal(size=(100,1))))
In [ ]:
m1.fit(X,y)
m2.fit(X,y)
m3.fit(X,y)
In [ ]:
def plot_pred_w_noise(m, label):
    X = x_true[:,None]
    X = np.hstack((X, np.random.normal(size=(1000,1))))
    y = m.predict(X)
    plt.plot(X[:,0], y, label=label, alpha=.75)
plt.plot(x_true, y_true, label='True')
plt.plot(x_obs, y_obs, 's', label='Observed')
plot_pred_w_noise(m1, 'Linear Regression')
plot_pred_w_noise(m2, 'SVM')
plot_pred_w_noise(m3, 'Random Forest')
plt.legend(loc=(1,.1))
Work through one round of forward selection
In [ ]:
X = x_obs[:,None]
y = y_obs
X = np.hstack((X, np.random.normal(size=(100,10))))
In [ ]:
# how does forward selection determine which column of X to include first?
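One way to work through it, as a minimal sketch (assuming we score each single-column model by its in-sample $R^2$; cross-validated scores would also work):
In [ ]:
# forward selection, round one: fit a one-column model for each candidate
# column and keep the column with the best score
scores = {}
for j in range(X.shape[1]):
    m = sklearn.linear_model.LinearRegression()
    m.fit(X[:, [j]], y)
    scores[j] = m.score(X[:, [j]], y)
pd.Series(scores).sort_values(ascending=False).head()  # column 0 (x_obs) should come out on top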
Work through one round of backwards elimination
In [ ]:
X = x_obs[:,None]
y = y_obs
X = np.hstack((X, np.random.normal(size=(100,10))))
In [ ]:
# how does backwards elimination determine which column of X to remove first?
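And a matching sketch for backwards elimination (again assuming in-sample $R^2$ as the criterion): refit the model with each column left out in turn, and remove the column whose absence hurts the fit least.
In [ ]:
# backwards elimination, round one: drop each column in turn and refit;
# the column whose removal leaves the highest score is the one to eliminate
scores = {}
for j in range(X.shape[1]):
    cols = [k for k in range(X.shape[1]) if k != j]
    m = sklearn.linear_model.LinearRegression()
    m.fit(X[:, cols], y)
    scores[j] = m.score(X[:, cols], y)
pd.Series(scores).sort_values(ascending=False).head()  # a noise column should top this list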
If you need to use this in your project, consider learning more about the backwards elimination method in sklearn with sklearn.feature_selection.RFE and .RFECV.
In [ ]:
import sklearn.feature_selection
In [ ]:
r = sklearn.feature_selection.RFE(sklearn.linear_model.LinearRegression(), n_features_to_select=2, step=1)
r.fit(X,y)
In [ ]:
# features included
r.support_
In [ ]:
# load va data from Week 4
df = pd.read_csv('../Week_4/IHME_PHMRC_VA_DATA_ADULT_Y2013M09D11_0.csv', low_memory=False)
We are going to look at questions about the cough symptom here: a2_32 and a2_33
In [ ]:
# did deceased have cough?
df.a2_32.head()
In [ ]:
# tabulate that: did deceased have cough?
df.a2_32.value_counts()
In [ ]:
# how long did the cough last (in days)?
df.a2_33.head()
In [ ]:
sns.distplot(np.log(df.a2_33+1))
plt.xlabel('log(duration+1)')
plt.ylabel('probability density')
plt.axis(xmin=0)
In [ ]:
# it is getting annoying to remember that silly name...
df['cough'] = df.a2_32 == 'Yes'
df['duration'] = df.a2_33.map(float)
In [ ]:
df[df.cough].duration.describe(percentiles=[.025,.25,.50,.75,.975])
Duration ranges from 0 to 14400 days.
Now: discretize the duration variable into $K$ equal-width bins, for $K = \lfloor\sqrt{2766}\rfloor = 52$.
In [ ]:
# this is "fixed width" discretization, hence the name fw_disc:
df['fw_disc_duration'] = ________________________________
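One possible way to fill in that blank, assuming $K = 52$ and pd.cut for the equal-width binning (rows with missing duration come out as NaN, which the loops below would need to skip, e.g. via .dropna()):
In [ ]:
# hypothetical fill: 52 equal-width bins, labeled 0..51
K = 52
df['fw_disc_duration'] = pd.cut(df.duration, bins=K, labels=False)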
And how do you make this discretized covariate something suitable for binary classifiers? One-Hot encoding, of course.
In [ ]:
for val in df.fw_disc_duration.unique():
    df['duration_in_bin_%d'%val] = (df.fw_disc_duration == val)
Or Thermometer encoding:
In [ ]:
for val in df.fw_disc_duration.unique():
    df['duration_in_bin_at_least_%d'%val] = ___________________________________
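A possible fill for the thermometer blank, assuming "at least bin $v$" means the bin index is $\geq v$ (note the .dropna() to skip missing durations):
In [ ]:
# hypothetical fill for the thermometer encoding
for val in sorted(df.fw_disc_duration.dropna().unique()):
    df['duration_in_bin_at_least_%d'%val] = (df.fw_disc_duration >= val)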
How about fixed frequency discretization?
In [ ]:
ordered_durations = np.array(df[df.cough].duration.sort_values())
ordered_durations[np.linspace(0,2766,52, endpoint=False).astype(int)]
In [ ]:
df['ff_disc_duration'] = _________________________________
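A possible fill here, assuming pd.qcut does the fixed-frequency (equal-count) binning; duplicates='drop' is needed because many durations are tied:
In [ ]:
# hypothetical fill: roughly 52 equal-frequency bins
df['ff_disc_duration'] = pd.qcut(df.duration, q=52, labels=False, duplicates='drop')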
What we ended up doing is looking at the mean duration stratified by cause:
In [ ]:
df.groupby('gs_text34').duration.mean().sort_values(ascending=False)
In [ ]:
sns.distplot(df.groupby('gs_text34').duration.mean())
In [ ]:
df['cough_duration_is_long'] = (df.duration > ____________________)
df['cough_duration_is_really_long'] = (df.duration > ______________________________)
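The thresholds in those blanks are a judgment call; one hypothetical choice, based on the distribution of cause-specific mean durations plotted above, is to cut at its median and at its 90th percentile:
In [ ]:
# hypothetical thresholds taken from the cause-specific mean durations
cause_means = df.groupby('gs_text34').duration.mean()
df['cough_duration_is_long'] = (df.duration > cause_means.quantile(.5))
df['cough_duration_is_really_long'] = (df.duration > cause_means.quantile(.9))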
In [ ]:
import sklearn.cross_decomposition
m4 = sklearn.cross_decomposition.PLSRegression()
So for our exercise, just two little examples of "semantic projections".
One from VA, wherein dates can be subtracted to make a more meaningful feature:
In [ ]:
# year of birth
df.g1_01y.head()
In [ ]:
# year of death
df.g1_06y.head()
Much more meaningful to have the age of the deceased (which is g1_07a in the database, but let's pretend it is not).
In [ ]:
age_at_death = _________________________________
age_at_death
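One possible fill, assuming the two year columns are numeric: subtract year of birth from year of death (any special "don't know" codes would need cleaning first, which is exactly the mess we look at below).
In [ ]:
# hypothetical fill: year of death minus year of birth
age_at_death = df.g1_06y - df.g1_01y
age_at_death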
In [ ]:
# for use later:
reported_age_at_death = df.g1_07a
Another place this comes up: time series, like Ebola case counts: http://institutefordiseasemodeling.github.io/EVD/
In [ ]:
np.random.choice(____________________________)
Again, without replacement:
In [ ]:
np.random.choice(______________________________)
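Possible fills for the two blanks above, assuming we are just drawing a quick sample of rows from the VA dataframe: with replacement by default, and without replacement when replace=False.
In [ ]:
# hypothetical fills: sample 10 row indices, with and then without replacement
np.random.choice(df.index, size=10)
np.random.choice(df.index, size=10, replace=False)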
Speed is a good reason to do this sort of sampling. Let us revisit the noisy sine wave to see what I mean:
In [ ]:
np.random.seed(12345)
x_true = np.linspace(0,4,1000)
y_true = np.sin(x_true)
x_obs = np.linspace(0,4,1000*1000)
y_obs = np.random.normal(np.sin(x_obs), .3)
plt.plot(x_obs, y_obs, ',', label='Observed', alpha=.5)
plt.plot(x_true, y_true, label='True', linewidth=10)
plt.legend(loc=(1,.1))
In [ ]:
m = sklearn.ensemble.RandomForestRegressor()
%time m.fit(x_obs[:,None], y_obs)
In [ ]:
plt.plot(x_obs, y_obs, ',', label='Observed', alpha=.5)
plot_pred(m, 'Random Forest')
plt.plot(x_true, y_true, label='True', linewidth=4)
plt.legend(loc=(1,.1))
In [ ]:
import time
In [ ]:
time.time()
In [ ]:
t = {}
for n_sample in [10,50,100,500,1000,5000,10*1000,50*1000,100*1000]:
random_sample = np.random.choice(_____________________________)
t[n_sample] = time.time()
%time m.fit(x_obs[random_sample,None], y_obs[random_sample])
t[n_sample] = time.time() - t[n_sample]
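A possible fill for the random_sample blank in that loop, assuming we draw n_sample distinct row indices from the million noisy observations (shown here for a single value of n_sample):
In [ ]:
# hypothetical fill for the blank above, for one value of n_sample
n_sample = 1000
random_sample = np.random.choice(len(x_obs), size=n_sample, replace=False)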
In [ ]:
t = pd.Series(t)
plt.loglog(t.index, t, 's-')
plt.ylabel('Time (seconds)')
plt.xlabel('Rows of training data (n)')
Now what you should also do is have a look at the prediction accuracy as a function of the number of samples, to decide how much data is worth your time. (Evaluate out-of-sample, of course!)
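Here is a rough sketch of that idea, under some assumptions of my own: hold out a fixed test set, train on random subsets of increasing size, and track the held-out $R^2$:
In [ ]:
# sketch: held-out R^2 as a function of training-set size
all_idx = np.arange(len(x_obs))
test_idx = np.random.choice(all_idx, size=10*1000, replace=False)
train_pool = np.setdiff1d(all_idx, test_idx)
acc = {}
for n_sample in [100, 1000, 10*1000, 100*1000]:
    train_idx = np.random.choice(train_pool, size=n_sample, replace=False)
    m.fit(x_obs[train_idx, None], y_obs[train_idx])
    acc[n_sample] = m.score(x_obs[test_idx, None], y_obs[test_idx])
pd.Series(acc)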
How long is the index of the age_at_death series we made in our semantic projection part of the exercise?
In [ ]:
len(age_at_death.index)
And how much of it is missing?
In [ ]:
age_at_death.isnull().sum()
We should fill it in somehow. What does the non-missing part look like?
In [ ]:
age_at_death.describe()
Negative ages are not good...
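The helper float_or_nan used below is not defined in this section; here is a minimal version of what it presumably does:
In [ ]:
def float_or_nan(x):
    # convert to float, returning NaN for anything that cannot be parsed
    try:
        return float(x)
    except (TypeError, ValueError):
        return np.nan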
In [ ]:
plt.plot(reported_age_at_death.map(float_or_nan), age_at_death, 'o')
plt.xlabel('Reported Age at Death (years)')
plt.ylabel('Reported year of death minus year of birth')
Cleaning up a mess like this is something with which I suspect many of my students are familiar.
Hidden in this section is a very interesting approach, which DM calls "generating artificial data". This is a way to use any supervised learning algorithm to do unsupervised work. We will experiment with it on Nick's example data, and I hope he will push this further as part of his project:
In [ ]:
df = pd.read_csv('Nick_project_data_example_ZWE_2010_DHS_cleaned.csv')
In [ ]:
df.head()
Start with the feature vectors:
In [ ]:
X_real = np.array(df.iloc[:, 1:])
Label it all as real:
In [ ]:
n,p = X_real.shape
In [ ]:
y_real = np.array(['real']*n)
Then make fake data, which has the same univariate distributions, but none of the correlation structure:
In [ ]:
X_fake = np.empty((n,p))
for j in range(p):
    X_fake[:,j] = np.random.choice(____________________________________)
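A possible fill for the blank: resample each column of X_real independently, with replacement, which keeps each column's marginal distribution but scrambles the joint structure.
In [ ]:
# hypothetical fill for the loop above, written out in full
X_fake = np.empty((n,p))
for j in range(p):
    X_fake[:,j] = np.random.choice(X_real[:,j], size=n, replace=True)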
Label it all as fake:
In [ ]:
y_fake = np.array(['fake']*n)
Stick them together and you have the labeled examples for supervised learning:
In [ ]:
X = np.vstack((X_real, X_fake))
y = np.hstack((y_real, y_fake))
In [ ]:
X_real.shape
In [ ]:
y_real.shape
In [ ]:
c1 = sklearn.linear_model.LogisticRegression()
c1.fit(X,y)
c1.score(X,y)
In [ ]:
c2 = sklearn.ensemble.RandomForestClassifier()
c2.fit(X,y)
c2.score(X,y)
In [ ]:
import sklearn.model_selection
In [ ]:
sklearn.model_selection.cross_val_score(c2, X, y)
So the question for Nick is, how does RF do it?
In [ ]:
# load va data from Week 4
df = pd.read_csv('../Week_4/IHME_PHMRC_VA_DATA_ADULT_Y2013M09D11_0.csv', low_memory=False)
In [ ]:
df.gs_text34.value_counts()
In "one-vs-all" approach, for each multiclass label, make a new problem of classifying as "is it this label?"
In [ ]:
y = ____________________________
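One hypothetical fill, picking a single cause (here 'Stroke', but any label from the value_counts above would do):
In [ ]:
# hypothetical one-vs-all target: is the cause 'Stroke' or not?
y = (df.gs_text34 == 'Stroke')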
In [ ]:
y.value_counts()
In "one-vs-one approach, for each pair of labels, make a new problem with instances just for that pair:
In [ ]:
rows = ____________________________________
y = df[rows].gs_text34
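A hypothetical fill for the rows blank in that cell, again using example labels ('Stroke' vs 'Pneumonia'):
In [ ]:
# hypothetical one-vs-one subset: keep only the instances with one of two causes
rows = df.gs_text34.isin(['Stroke', 'Pneumonia'])
y = df[rows].gs_text34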
In [ ]:
y.value_counts()
In [ ]:
X = np.vstack((X_real, X_fake))
y = np.hstack((y_real, y_fake))
In [ ]:
train = np.random.choice(range(len(X)), size=len(X)//2, replace=False)
test = list(set(range(len(X))) - set(train))
c = sklearn.ensemble.RandomForestClassifier()
c.fit(X[train], y[train])
We can make probability predictions for most classifiers in sklearn:
In [ ]:
c.predict_proba(X[[test[0]]])  # double brackets: predict_proba expects a 2-d array
Did that tend towards the right answer?
In [ ]:
y[test[0]]
Can't tell from that, actually...
In [ ]:
c.classes_
So yes... How does it do, overall, on the things it says have an 80% chance of being real?
In [ ]:
y_pr_pred = c.predict_proba(X[test])
How many things like this are there?
In [ ]:
np.sum(y_pr_pred[:,1]==.8)
In [ ]:
# in general, better to have a little wiggle room
np.sum((y_pr_pred[:,1] >= .79)&(y_pr_pred[:,1] < .81))
In [ ]:
rows = np.where(y_pr_pred[:,1]==.8)
In [ ]:
test = np.array(test)
np.mean(y[test[rows]] == 'real')
So this is a bit conservative! What about for some low probability examples?
In [ ]:
np.sum(y_pr_pred[:,1]==0)
In [ ]:
rows = np.where(y_pr_pred[:,1]==0)
In [ ]:
np.mean(y[test[rows]] == 'real')
Perhaps with more trees, things would be better. But Random Forests are not really designed for calibrated probability prediction, so perhaps we need to work on it a little ourselves if this is what is desired. Got any ideas for how to do so?
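One idea, as a hedged sketch: grow more trees and wrap the forest in sklearn's probability calibration (sklearn.calibration.CalibratedClassifierCV), then repeat the spot checks above on the calibrated probabilities.
In [ ]:
# sketch: a bigger forest with cross-validated probability calibration
import sklearn.calibration
c3 = sklearn.calibration.CalibratedClassifierCV(
    sklearn.ensemble.RandomForestClassifier(n_estimators=100),
    method='isotonic', cv=5)
c3.fit(X[train], y[train])
y_pr_calibrated = c3.predict_proba(X[test])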