In [11]:
!date
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline
sns.set_context('poster')
sns.set_style('darkgrid')


Tue Jan 20 17:12:54 PST 2015

In [12]:
# set random seed, for reproducibility
np.random.seed(12345)

# funny little var for making notebook executable
__________________ = None

Load, clean, and prepare DHS asset ownership data:


In [13]:
df = pd.read_csv('RWA_DHS6_2010_2011_HH_ASSETS.CSV', index_col=0)

# have a look at what is in this data frame
df


Out[13]:
sh110g hv247 hv246 hv227 sh118f hv243a hv243c hv243b hv225 hv209 hv208 hv243d hv207 hv206 hv212 hv221 hv210 hv211
0 0 1 0 1 0 1 0 1 0 0 1 0 1 1 0 0 0 0
1 0 1 0 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0
2 0 1 0 0 0 1 0 1 1 0 0 0 1 0 0 0 0 1
3 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
4 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
5 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0
6 1 1 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0
7 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0
8 0 1 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0
9 0 0 0 1 0 1 0 1 1 0 0 0 1 1 0 0 0 0
10 1 1 0 1 0 1 0 1 1 0 1 0 1 1 0 0 0 0
11 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0
12 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0
13 0 1 1 0 0 1 0 0 1 0 1 0 1 1 0 0 0 0
14 0 0 0 1 0 1 0 0 1 0 0 0 1 1 0 0 0 0
15 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0
16 0 1 1 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0
17 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0
18 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0
19 0 1 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0
20 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0
21 0 1 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0
22 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0
23 0 1 0 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0
24 0 1 1 0 0 1 0 0 1 0 1 0 1 1 0 0 0 0
25 0 1 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0
26 0 1 0 1 0 1 0 1 1 0 1 0 1 1 0 0 0 0
27 0 1 0 1 0 1 0 1 1 0 1 0 1 1 0 0 0 0
28 0 1 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0
29 0 1 0 1 0 1 0 0 1 0 0 0 1 1 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
12510 0 0 1 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0
12511 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0
12512 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0
12513 0 1 1 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0
12514 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
12515 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0
12516 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0
12517 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
12518 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0
12519 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0
12520 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12521 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12522 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0
12523 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0
12524 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0
12525 0 0 1 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0
12526 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12527 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
12528 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0
12529 0 1 1 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0
12530 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12531 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12532 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
12533 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
12534 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
12535 0 0 1 1 0 1 0 1 1 0 0 0 1 0 0 0 1 0
12536 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0
12537 0 1 1 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0
12538 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
12539 0 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0

12317 rows × 18 columns


In [14]:
cb = pd.read_csv('RWA_DHS6_2010_2011_HH_ASSETS_codebook.CSV', index_col=0)

# cb stands for codebook.  have a look at what the funny column names mean
cb


Out[14]:
full name
hv206 has electricity
hv207 has radio
hv208 has television
hv209 has refrigerator
hv210 has bicycle
hv211 has motorcycle/scooter
hv212 has car/truck
hv221 has telephone (landline)
hv225 share toilet with other households
hv227 has mosquito bed net for sleeping
hv243a has mobile telephone
hv243b has watch
hv243c has animal-drawn cart
hv243d has boat with a motor
hv246 owns livestock, herds, or farm animals
hv247 has bank account
sh110g computer
sh118f boat without motor

Wouldn't it be nice if the column names were descriptive, instead of codes?


In [15]:
# find a dictionary mapping codes to descriptions

# is it simply cb.to_dict?
cb.to_dict()


Out[15]:
{'full name': {'hv206': 'has electricity',
  'hv207': 'has radio',
  'hv208': 'has television',
  'hv209': 'has refrigerator',
  'hv210': 'has bicycle',
  'hv211': 'has motorcycle/scooter',
  'hv212': 'has car/truck',
  'hv221': 'has telephone (landline)',
  'hv225': 'share toilet with other households',
  'hv227': 'has mosquito bed net for sleeping',
  'hv243a': 'has mobile telephone',
  'hv243b': 'has watch',
  'hv243c': 'has animal-drawn cart',
  'hv243d': 'has boat with a motor',
  'hv246': 'owns livestock, herds, or farm animals',
  'hv247': 'has bank account',
  'sh110g': 'computer',
  'sh118f': 'boat without motor'}}

In [16]:
# No, not quite.  But the mapping we want is in there:

cb.to_dict().get('full name')


Out[16]:
{'hv206': 'has electricity',
 'hv207': 'has radio',
 'hv208': 'has television',
 'hv209': 'has refrigerator',
 'hv210': 'has bicycle',
 'hv211': 'has motorcycle/scooter',
 'hv212': 'has car/truck',
 'hv221': 'has telephone (landline)',
 'hv225': 'share toilet with other households',
 'hv227': 'has mosquito bed net for sleeping',
 'hv243a': 'has mobile telephone',
 'hv243b': 'has watch',
 'hv243c': 'has animal-drawn cart',
 'hv243d': 'has boat with a motor',
 'hv246': 'owns livestock, herds, or farm animals',
 'hv247': 'has bank account',
 'sh110g': 'computer',
 'sh118f': 'boat without motor'}
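
(An aside, not part of the original notebook: you can get the same dictionary in one step by selecting the codebook column as a Series first.)

# equivalent one-liner: pull out the 'full name' column, then convert that Series to a dict
cb['full name'].to_dict()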

In [17]:
# you can use pd.Series.map to translate all of the column names at once,
# but you cannot call map on a plain list of columns directly, so wrap df.columns in a Series

# too bad...

pd.Series(df.columns).map(cb.to_dict()['full name'])


Out[17]:
0                                   computer
1                           has bank account
2     owns livestock, herds, or farm animals
3          has mosquito bed net for sleeping
4                         boat without motor
5                       has mobile telephone
6                      has animal-drawn cart
7                                  has watch
8         share toilet with other households
9                           has refrigerator
10                            has television
11                     has boat with a motor
12                                 has radio
13                           has electricity
14                             has car/truck
15                  has telephone (landline)
16                               has bicycle
17                    has motorcycle/scooter
dtype: object
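
(A hedged alternative, not used below: DataFrames also have a rename method that accepts a columns mapping, which avoids replacing the columns attribute wholesale.)

# same renaming via DataFrame.rename; returns a new frame, df itself is untouched
df_named = df.rename(columns=cb['full name'].to_dict())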

In [18]:
df.columns = pd.Series(df.columns).map(cb.to_dict()['full name'])

# did we get that right?
df


Out[18]:
computer has bank account owns livestock, herds, or farm animals has mosquito bed net for sleeping boat without motor has mobile telephone has animal-drawn cart has watch share toilet with other households has refrigerator has television has boat with a motor has radio has electricity has car/truck has telephone (landline) has bicycle has motorcycle/scooter
0 0 1 0 1 0 1 0 1 0 0 1 0 1 1 0 0 0 0
1 0 1 0 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0
2 0 1 0 0 0 1 0 1 1 0 0 0 1 0 0 0 0 1
3 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
4 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
5 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0
6 1 1 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0
7 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0
8 0 1 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0
9 0 0 0 1 0 1 0 1 1 0 0 0 1 1 0 0 0 0
10 1 1 0 1 0 1 0 1 1 0 1 0 1 1 0 0 0 0
11 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0
12 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0
13 0 1 1 0 0 1 0 0 1 0 1 0 1 1 0 0 0 0
14 0 0 0 1 0 1 0 0 1 0 0 0 1 1 0 0 0 0
15 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0
16 0 1 1 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0
17 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0
18 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0
19 0 1 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0
20 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0
21 0 1 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0
22 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0
23 0 1 0 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0
24 0 1 1 0 0 1 0 0 1 0 1 0 1 1 0 0 0 0
25 0 1 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0
26 0 1 0 1 0 1 0 1 1 0 1 0 1 1 0 0 0 0
27 0 1 0 1 0 1 0 1 1 0 1 0 1 1 0 0 0 0
28 0 1 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0
29 0 1 0 1 0 1 0 0 1 0 0 0 1 1 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
12510 0 0 1 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0
12511 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0
12512 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0
12513 0 1 1 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0
12514 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
12515 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0
12516 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0
12517 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
12518 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0
12519 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0
12520 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12521 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12522 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0
12523 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0
12524 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0
12525 0 0 1 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0
12526 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12527 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
12528 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0
12529 0 1 1 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0
12530 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12531 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12532 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
12533 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
12534 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
12535 0 0 1 1 0 1 0 1 1 0 0 0 1 0 0 0 1 0
12536 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0
12537 0 1 1 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0
12538 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
12539 0 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0

12317 rows × 18 columns


In [19]:
# have a look at the survey results:

(100*df.mean()).order().plot(kind='barh')
plt.xlabel('Percent endorsed')


Out[19]:
<matplotlib.text.Text at 0x7f77d95bbf90>

Now make an array of feature vectors and a corresponding array of labels:


In [20]:
X = np.array(df.drop('has mobile telephone', axis=1))
y = np.array(df['has mobile telephone'])

And split the data into training and test sets (we'll talk more about this next week!)


In [21]:
# hold out a random 80% of households for training (size must be an integer)
train = np.random.choice(range(len(df.index)), size=int(len(df.index)*.8), replace=False)
test = [i for i in set(range(len(df.index))) - set(train)]

In [22]:
X_test = X[test]
y_test = y[test]

X_train = X[train]
y_train = y[train]
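
(As an aside, not in the original notebook: scikit-learn ships a helper that does this split in one call; depending on your version it lives in sklearn.cross_validation or sklearn.model_selection. A minimal sketch with the newer module path:)

from sklearn.model_selection import train_test_split

# hold out 20% of households, with a fixed seed for reproducibility
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.2, random_state=12345)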

Does it look reasonable?


In [23]:
len(X_test), len(X_train)


Out[23]:
(2464, 9853)

In [24]:
y_test.mean(), y_train.mean()


Out[24]:
(0.42288961038961037, 0.41530498325383131)

In [25]:
X_test.mean(axis=0).round(2)


Out[25]:
array([ 0.02,  0.34,  0.57,  0.84,  0.  ,  0.  ,  0.24,  0.22,  0.02,
        0.06,  0.  ,  0.63,  0.11,  0.01,  0.  ,  0.15,  0.01])

In [26]:
X_train.mean(axis=0).round(2)


Out[26]:
array([ 0.02,  0.34,  0.57,  0.82,  0.  ,  0.  ,  0.23,  0.23,  0.02,
        0.06,  0.  ,  0.64,  0.11,  0.01,  0.  ,  0.15,  0.01])

Now, let us try a range of prediction methods:

Naïve Bayes


In [27]:
import sklearn.naive_bayes
clf = sklearn.naive_bayes.BernoulliNB()
clf.fit(X_train, y_train)


Out[27]:
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [29]:
y_pred = clf.predict(X_test)

In [30]:
np.mean(y_pred == y_test)


Out[30]:
0.76501623376623373

Is that good?


In [31]:
y_random = np.random.choice([0,1], size=len(y_test))

np.mean(y_random == y_test)


Out[31]:
0.50121753246753242

Better than random, worse than perfect...
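
(Another useful baseline, also not from the original notebook: always predict the majority class, i.e. "no mobile telephone". Since about 42% of test households own a phone, that baseline scores roughly 58%, so Naïve Bayes is clearly adding something.)

# accuracy of always predicting 0 ("no mobile telephone")
np.mean(y_test == 0)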

Linear regression based prediction


In [32]:
import sklearn.linear_model
clf = sklearn.linear_model.LinearRegression()
clf.fit(X_train, y_train)


Out[32]:
LinearRegression(copy_X=True, fit_intercept=True, normalize=False)

In [33]:
y_pred = clf.predict(X_test)

In [34]:
y_pred = (y_pred >= .5)

In [35]:
np.mean(y_pred == y_test)


Out[35]:
0.7633928571428571

Actually just about as good as Naïve Bayes.

Logistic regression


In [36]:
import sklearn.linear_model
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train, y_train)


Out[36]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [37]:
y_pred = clf.predict(X_test)

In [38]:
np.mean(y_pred == y_test)


Out[38]:
0.76420454545454541

Again, about the same...
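
(A small aside, not in the original notebook: since clf is still the fitted logistic regression here, you can peek at which assets carry the most weight. coef_ has one row per binary problem, so take the first row.)

# pair each coefficient with its asset name; large positive values predict phone ownership
feature_names = df.drop('has mobile telephone', axis=1).columns
pd.Series(clf.coef_[0], index=feature_names)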

Did you read about the Perceptron?


In [39]:
import sklearn.linear_model
clf = sklearn.linear_model.Perceptron()
clf.fit(X_train, y_train)


Out[39]:
Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=False,
      verbose=0, warm_start=False)

In [40]:
y_pred = clf.predict(X_test)

In [41]:
np.mean(y_pred == y_test)


Out[41]:
0.70941558441558439

So it is possible to do worse. Bonus challenge: I think you can change the parameters to get this up to 75% concordance. Can you?
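
(A hedged sketch for the bonus challenge: sweep a few Perceptron settings and keep the best test accuracy. The specific values here are guesses, and note that the n_iter parameter was renamed max_iter in later scikit-learn releases.)

best = 0
for n_iter in [5, 10, 25, 50]:
    clf = sklearn.linear_model.Perceptron(n_iter=n_iter, shuffle=True, random_state=0)
    clf.fit(X_train, y_train)
    best = max(best, np.mean(clf.predict(X_test) == y_test))
best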

Decision Trees


In [42]:
import sklearn.tree
clf = sklearn.tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)


Out[42]:
DecisionTreeClassifier(compute_importances=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            random_state=None, splitter='best')

In [45]:
y_pred = clf.predict(X_test)

In [47]:
np.mean(y_pred == y_test)


Out[47]:
0.76379870129870131

A tiny improvement!
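
(One more aside, not from the original notebook: a fully grown tree can memorize the training data, so it is worth checking whether a depth limit helps; max_depth=5 here is an arbitrary guess.)

clf = sklearn.tree.DecisionTreeClassifier(max_depth=5, random_state=0)
clf.fit(X_train, y_train)
np.mean(clf.predict(X_test) == y_test)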

Now refactor this process, so that for any sklearn classifier you can call a function to find its out-of-sample predictive accuracy on cell phone ownership:


In [48]:
def oos_accuracy(clf, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    """ Calculate out-of-sample predictive accuracy of cell phone ownership
    prediction
    
    Parameters
    ----------
    clf : sklearn classifier
    X_train, y_train, X_test, y_test : training and test data and labels
    
    Returns
    -------
    accuracy : float
        Out-of-sample predictive accuracy; clf is fit in place as a side effect.
    """
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return np.mean(y_pred == y_test)

Figure out a way to test it:


In [49]:
oos_accuracy(sklearn.naive_bayes.BernoulliNB())  # should be .765


Out[49]:
0.76501623376623373

Bonus challenge: Figure out a way to make it work for the linear regression predictor

(This is harder because the linear regression produces numeric predictions that must be thresholded at 0.5 first.)


In [50]:
oos_accuracy(sklearn.linear_model.LinearRegression())  # got .763 before


Out[50]:
0.0

In [53]:
def fixed_oos_accuracy(clf, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    """ Calculate out-of-sample predictive accuracy of cell phone ownership
    prediction
    
    Parameters
    ----------
    clf : sklearn classifier
    X_train, y_train, X_test, y_test : training and test data and labels
    
    Returns
    -------
    accuracy : float
        Out-of-sample predictive accuracy; clf is fit in place as a side effect.
    """
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    # HACK: threshold the numeric predictions at 0.5 to turn them into 0/1 labels
    y_pred = (y_pred >= .5)
    return np.mean(y_pred == y_test)

In [54]:
fixed_oos_accuracy(sklearn.linear_model.LinearRegression())  # got .763 before


Out[54]:
0.7633928571428571
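
(Putting the pieces together, a sketch not part of the original notebook: loop over the classifiers tried above with the refactored helpers and print their out-of-sample accuracies side by side.)

for name, clf in [('Naive Bayes', sklearn.naive_bayes.BernoulliNB()),
                  ('Logistic regression', sklearn.linear_model.LogisticRegression()),
                  ('Perceptron', sklearn.linear_model.Perceptron()),
                  ('Decision tree', sklearn.tree.DecisionTreeClassifier())]:
    print('%s: %.3f' % (name, oos_accuracy(clf)))
print('Linear regression: %.3f' % fixed_oos_accuracy(sklearn.linear_model.LinearRegression()))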

In [ ]: