In [11]:
!date
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline
sns.set_context('poster')
sns.set_style('darkgrid')
In [12]:
# set random seed, for reproducibility
np.random.seed(12345)
# funny little var for making notebook executable
__________________ = None
Load, clean, and prepare DHS asset ownership data:
In [13]:
df = pd.read_csv('RWA_DHS6_2010_2011_HH_ASSETS.CSV', index_col=0)
# have a look at what is in this data frame
df
Out[13]:
In [14]:
cb = pd.read_csv('RWA_DHS6_2010_2011_HH_ASSETS_codebook.CSV', index_col=0)
# cb stands for codebook. have a look at what the funny column names mean
cb
Out[14]:
Wouldn't it be nice if the column names were descriptive, instead of codes?
In [15]:
# find a dictionary mapping codes to descriptions
# is it simply cb.to_dict?
cb.to_dict()
Out[15]:
In [16]:
# no, not quite. but it is in there:
cb.to_dict().get('full name')
Out[16]:
In [17]:
# you can use pd.Series.map to translate all the names,
# but df.columns is an Index, not a Series, so you
# have to wrap it in a Series first
pd.Series(df.columns).map(cb.to_dict()['full name'])
Out[17]:
In [18]:
df.columns = pd.Series(df.columns).map(cb.to_dict()['full name'])
# did we get that right?
df
Out[18]:
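For the record, pandas can also do this renaming in one step: df.rename accepts a dict of old-to-new names. Here is a sketch on a fresh read of the file, so we don't double-rename the frame above:
In [ ]:
# equivalent, more idiomatic renaming with df.rename
df2 = pd.read_csv('RWA_DHS6_2010_2011_HH_ASSETS.CSV', index_col=0)
df2 = df2.rename(columns=cb.to_dict()['full name'])
df2.columns.equals(df.columns)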
In [19]:
# have a look at the survey results:
(100*df.mean()).sort_values().plot(kind='barh')
plt.xlabel('Percent endorsed')
Out[19]:
Now make an array of feature vectors and a corresponding array of labels:
In [20]:
X = np.array(df.drop('has mobile telephone', axis=1))
y = np.array(df['has mobile telephone'])
And split the data into training and test sets (we'll talk more about this next week!)
In [21]:
train = np.random.choice(range(len(df.index)), size=int(len(df.index)*.8), replace=False)
test = sorted(set(range(len(df.index))) - set(train))
In [22]:
X_test = X[test]
y_test = y[test]
X_train = X[train]
y_train = y[train]
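Aside: scikit-learn has a helper that does this split in one line. A minimal sketch (the _alt names are just to avoid clobbering the split used below; older sklearn versions keep train_test_split in sklearn.cross_validation instead):
In [ ]:
# equivalent split using scikit-learn's helper
from sklearn.model_selection import train_test_split
X_train_alt, X_test_alt, y_train_alt, y_test_alt = train_test_split(
    X, y, test_size=.2, random_state=12345)
len(X_train_alt), len(X_test_alt)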
Does it look reasonable?
In [23]:
len(X_test), len(X_train)
Out[23]:
In [24]:
y_test.mean(), y_train.mean()
Out[24]:
In [25]:
X_test.mean(axis=0).round(2)
Out[25]:
In [26]:
X_train.mean(axis=0).round(2)
Out[26]:
In [27]:
import sklearn.naive_bayes
clf = sklearn.naive_bayes.BernoulliNB()
clf.fit(X_train, y_train)
Out[27]:
In [29]:
y_pred = clf.predict(X_test)
In [30]:
np.mean(y_pred == y_test)
Out[30]:
Is that good?
In [31]:
y_random = np.random.choice([0,1], size=len(y_test))
np.mean(y_random == y_test)
Out[31]:
Better than random, worse than perfect...
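Another baseline worth having: always guess the majority class from the training labels. Any classifier should at least beat this.
In [ ]:
# baseline 2: always predict whichever class is more common in training
majority = int(y_train.mean() >= .5)
np.mean(y_test == majority)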
In [32]:
import sklearn.linear_model
clf = sklearn.linear_model.LinearRegression()
clf.fit(X_train, y_train)
Out[32]:
In [33]:
y_pred = clf.predict(X_test)
In [34]:
# turn the numeric predictions into 0/1 labels by thresholding at .5
y_pred = (y_pred >= .5)
In [35]:
np.mean(y_pred == y_test)
Out[35]:
Just about as good as naive Bayes.
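Accuracy alone hides where the mistakes are. A quick sketch with sklearn.metrics.confusion_matrix shows the breakdown for the current predictions:
In [ ]:
import sklearn.metrics
# rows are true classes, columns are predicted classes
sklearn.metrics.confusion_matrix(y_test, y_pred)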
In [36]:
import sklearn.linear_model
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train, y_train)
Out[36]:
In [37]:
y_pred = clf.predict(X_test)
In [38]:
np.mean(y_pred == y_test)
Out[38]:
Again, about the same...
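One nice thing about logistic regression is interpretability. Pairing clf.coef_ with the (renamed) column names shows which assets push the prediction up or down; a sketch, assuming clf is still the fitted LogisticRegression:
In [ ]:
# which assets most strongly predict phone ownership?
features = df.drop('has mobile telephone', axis=1).columns
pd.Series(clf.coef_[0], index=features).sort_values()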
In [39]:
import sklearn.linear_model
clf = sklearn.linear_model.Perceptron()
clf.fit(X_train, y_train)
Out[39]:
In [40]:
y_pred = clf.predict(X_test)
In [41]:
np.mean(y_pred == y_test)
Out[41]:
So it is possible to do worse. Bonus challenge: I think you can change the parameters to get this up to 75% concordance. Can you?
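A place to start on the challenge (these particular settings are just a guess to explore, not a known answer; max_iter was called n_iter in older sklearn versions):
In [ ]:
# more passes over the data, plus L2 regularization; tweak from here
clf = sklearn.linear_model.Perceptron(max_iter=100, penalty='l2', alpha=1e-4)
clf.fit(X_train, y_train)
np.mean(clf.predict(X_test) == y_test)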
In [42]:
import sklearn.tree
clf = sklearn.tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)
Out[42]:
In [45]:
y_pred = clf.predict(X_test)
In [47]:
np.mean(y_pred == y_test)
Out[47]:
A tiny improvement!
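Decision trees grown to full depth tend to overfit. Limiting max_depth sometimes helps out-of-sample; whether it does here is worth checking rather than assuming (max_depth=5 is an arbitrary choice):
In [ ]:
# a shallower tree may generalize better
clf = sklearn.tree.DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)
np.mean(clf.predict(X_test) == y_test)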
Now refactor this process into a function: given any sklearn classifier, it should report out-of-sample predictive accuracy on cell phone ownership:
In [48]:
def oos_accuracy(clf, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
""" Calculate out-of-sample predictive accuracy of cell phone ownership
prediction
Parameters
----------
clf : sklearn classifier
X_train, y_train, X_test, y_test : training and test data and labels
Returns
-------
out-of-sample accuracy; clf is fit in place as a side effect
"""
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
return np.mean(y_pred == y_test)
Figure out a way to test it:
In [49]:
oos_accuracy(sklearn.naive_bayes.BernoulliNB()) # should be .765
Out[49]:
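One way to make the test automatic; the expected value .765 comes from the run above and depends on the random seed, so treat it as an assumption of this particular run:
In [ ]:
# automatic sanity check against the value observed above
assert abs(oos_accuracy(sklearn.naive_bayes.BernoulliNB()) - .765) < .01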
Bonus challenge: Figure out a way to make it work for the linear regression predictor.
(This is tricky because we had to round its numeric predictions.)
In [50]:
oos_accuracy(sklearn.linear_model.LinearRegression()) # got .763 before
Out[50]:
In [53]:
def fixed_oos_accuracy(clf, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
""" Calculate out-of-sample predictive accuracy of cell phone ownership
prediction
Parameters
----------
clf : sklearn classifier
X_train, y_train, X_test, y_test : training and test data and labels
Returns
-------
out-of-sample accuracy; clf is fit in place as a side effect
"""
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# HACK: turn numeric values into categorical predictions
y_pred = (y_pred >= .5)
return np.mean(y_pred == y_test)
In [54]:
fixed_oos_accuracy(sklearn.linear_model.LinearRegression()) # got .763 before
Out[54]:
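With the fixed version, comparing everything from this session is a short loop (exact numbers will depend on the split and the sklearn version):
In [ ]:
# compare all the classifiers we tried, using the rounding-tolerant version
for c in [sklearn.naive_bayes.BernoulliNB(),
          sklearn.linear_model.LinearRegression(),
          sklearn.linear_model.LogisticRegression(),
          sklearn.linear_model.Perceptron(),
          sklearn.tree.DecisionTreeClassifier()]:
    print(c.__class__.__name__, fixed_oos_accuracy(c))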
In [ ]: