In [11]:
!date
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline
sns.set_context('poster')
sns.set_style('darkgrid')


Tue Jan 20 17:12:54 PST 2015

In [12]:
# set random seed, for reproducibility
np.random.seed(12345)

# funny little var for making notebook executable
__________________ = None

Load, clean, and prepare DHS asset ownership data:


In [13]:
df = pd.read_csv('RWA_DHS6_2010_2011_HH_ASSETS.CSV', index_col=0)

# have a look at what is in this data frame
df


Out[13]:
sh110g hv247 hv246 hv227 sh118f hv243a hv243c hv243b hv225 hv209 hv208 hv243d hv207 hv206 hv212 hv221 hv210 hv211
0 0 1 0 1 0 1 0 1 0 0 1 0 1 1 0 0 0 0
1 0 1 0 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0
2 0 1 0 0 0 1 0 1 1 0 0 0 1 0 0 0 0 1
3 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
4 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
5 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0
6 1 1 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0
7 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0
8 0 1 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0
9 0 0 0 1 0 1 0 1 1 0 0 0 1 1 0 0 0 0
10 1 1 0 1 0 1 0 1 1 0 1 0 1 1 0 0 0 0
11 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0
12 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0
13 0 1 1 0 0 1 0 0 1 0 1 0 1 1 0 0 0 0
14 0 0 0 1 0 1 0 0 1 0 0 0 1 1 0 0 0 0
15 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0
16 0 1 1 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0
17 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0
18 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0
19 0 1 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0
20 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0
21 0 1 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0
22 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0
23 0 1 0 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0
24 0 1 1 0 0 1 0 0 1 0 1 0 1 1 0 0 0 0
25 0 1 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0
26 0 1 0 1 0 1 0 1 1 0 1 0 1 1 0 0 0 0
27 0 1 0 1 0 1 0 1 1 0 1 0 1 1 0 0 0 0
28 0 1 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0
29 0 1 0 1 0 1 0 0 1 0 0 0 1 1 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
12510 0 0 1 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0
12511 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0
12512 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0
12513 0 1 1 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0
12514 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
12515 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0
12516 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0
12517 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
12518 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0
12519 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0
12520 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12521 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12522 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0
12523 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0
12524 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0
12525 0 0 1 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0
12526 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12527 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
12528 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0
12529 0 1 1 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0
12530 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12531 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12532 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
12533 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
12534 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
12535 0 0 1 1 0 1 0 1 1 0 0 0 1 0 0 0 1 0
12536 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0
12537 0 1 1 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0
12538 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
12539 0 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0

12317 rows × 18 columns


In [14]:
cb = pd.read_csv('RWA_DHS6_2010_2011_HH_ASSETS_codebook.CSV', index_col=0)

# cb stands for codebook.  have a look at what the funny column names mean
cb


Out[14]:
full name
hv206 has electricity
hv207 has radio
hv208 has television
hv209 has refrigerator
hv210 has bicycle
hv211 has motorcycle/scooter
hv212 has car/truck
hv221 has telephone (landline)
hv225 share toilet with other households
hv227 has mosquito bed net for sleeping
hv243a has mobile telephone
hv243b has watch
hv243c has animal-drawn cart
hv243d has boat with a motor
hv246 owns livestock, herds, or farm animals
hv247 has bank account
sh110g computer
sh118f boat without motor

Wouldn't it be nice if the column names were descriptive, instead of codes?


In [15]:
# find a dictionary mapping codes to descriptions

# is it simply cb.to_dict?
cb.to_dict()


Out[15]:
{'full name': {'hv206': 'has electricity',
  'hv207': 'has radio',
  'hv208': 'has television',
  'hv209': 'has refrigerator',
  'hv210': 'has bicycle',
  'hv211': 'has motorcycle/scooter',
  'hv212': 'has car/truck',
  'hv221': 'has telephone (landline)',
  'hv225': 'share toilet with other households',
  'hv227': 'has mosquito bed net for sleeping',
  'hv243a': 'has mobile telephone',
  'hv243b': 'has watch',
  'hv243c': 'has animal-drawn cart',
  'hv243d': 'has boat with a motor',
  'hv246': 'owns livestock, herds, or farm animals',
  'hv247': 'has bank account',
  'sh110g': 'computer',
  'sh118f': 'boat without motor'}}

In [16]:
# No, not quite.  But the mapping we want is in there:

cb.to_dict().get('full name')


Out[16]:
{'hv206': 'has electricity',
 'hv207': 'has radio',
 'hv208': 'has television',
 'hv209': 'has refrigerator',
 'hv210': 'has bicycle',
 'hv211': 'has motorcycle/scooter',
 'hv212': 'has car/truck',
 'hv221': 'has telephone (landline)',
 'hv225': 'share toilet with other households',
 'hv227': 'has mosquito bed net for sleeping',
 'hv243a': 'has mobile telephone',
 'hv243b': 'has watch',
 'hv243c': 'has animal-drawn cart',
 'hv243d': 'has boat with a motor',
 'hv246': 'owns livestock, herds, or farm animals',
 'hv247': 'has bank account',
 'sh110g': 'computer',
 'sh118f': 'boat without motor'}
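
(An aside, not part of the original notebook: you can get the same dictionary in one step by selecting the codebook column as a Series first.)

# equivalent one-liner: pull out the 'full name' column, then convert that Series to a dict
cb['full name'].to_dict()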

In [17]:
# you can use pd.Series.map to translate all of the column names at once,
# but you cannot call map on a plain list of columns directly, so wrap df.columns in a Series

# too bad...

pd.Series(df.columns).map(cb.to_dict()['full name'])


Out[17]:
0                                   computer
1                           has bank account
2     owns livestock, herds, or farm animals
3          has mosquito bed net for sleeping
4                         boat without motor
5                       has mobile telephone
6                      has animal-drawn cart
7                                  has watch
8         share toilet with other households
9                           has refrigerator
10                            has television
11                     has boat with a motor
12                                 has radio
13                           has electricity
14                             has car/truck
15                  has telephone (landline)
16                               has bicycle
17                    has motorcycle/scooter
dtype: object
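
(A hedged alternative, not used below: DataFrames also have a rename method that accepts a columns mapping, which avoids replacing the columns attribute wholesale.)

# same renaming via DataFrame.rename; returns a new frame, df itself is untouched
df_named = df.rename(columns=cb['full name'].to_dict())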

In [18]:
df.columns = pd.Series(df.columns).map(cb.to_dict()['full name'])

# did we get that right?
df


Out[18]:
computer has bank account owns livestock, herds, or farm animals has mosquito bed net for sleeping boat without motor has mobile telephone has animal-drawn cart has watch share toilet with other households has refrigerator has television has boat with a motor has radio has electricity has car/truck has telephone (landline) has bicycle has motorcycle/scooter
0 0 1 0 1 0 1 0 1 0 0 1 0 1 1 0 0 0 0
1 0 1 0 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0
2 0 1 0 0 0 1 0 1 1 0 0 0 1 0 0 0 0 1
3 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
4 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
5 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0
6 1 1 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0
7 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0
8 0 1 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0
9 0 0 0 1 0 1 0 1 1 0 0 0 1 1 0 0 0 0
10 1 1 0 1 0 1 0 1 1 0 1 0 1 1 0 0 0 0
11 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0
12 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0
13 0 1 1 0 0 1 0 0 1 0 1 0 1 1 0 0 0 0
14 0 0 0 1 0 1 0 0 1 0 0 0 1 1 0 0 0 0
15 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0
16 0 1 1 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0
17 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0
18 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0
19 0 1 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0
20 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0
21 0 1 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0
22 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0
23 0 1 0 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0
24 0 1 1 0 0 1 0 0 1 0 1 0 1 1 0 0 0 0
25 0 1 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0
26 0 1 0 1 0 1 0 1 1 0 1 0 1 1 0 0 0 0
27 0 1 0 1 0 1 0 1 1 0 1 0 1 1 0 0 0 0
28 0 1 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0
29 0 1 0 1 0 1 0 0 1 0 0 0 1 1 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
12510 0 0 1 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0
12511 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0
12512 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0
12513 0 1 1 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0
12514 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
12515 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0
12516 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0
12517 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
12518 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0
12519 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0
12520 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12521 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12522 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0
12523 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0
12524 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0
12525 0 0 1 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0
12526 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12527 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
12528 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0
12529 0 1 1 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0
12530 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12531 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12532 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
12533 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
12534 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
12535 0 0 1 1 0 1 0 1 1 0 0 0 1 0 0 0 1 0
12536 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0
12537 0 1 1 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0
12538 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
12539 0 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0

12317 rows × 18 columns


In [19]:
# have a look at the survey results:

(100*df.mean()).order().plot(kind='barh')
plt.xlabel('Percent endorsed')


Out[19]:
<matplotlib.text.Text at 0x7f77d95bbf90>

Now make an array of feature vectors and a corresponding array of labels:


In [20]:
X = np.array(df.drop('has mobile telephone', axis=1))
y = np.array(df['has mobile telephone'])

And split the data into training and test sets (we'll talk more about this next week!)


In [21]:
# hold out a random 80% of households for training (size must be an integer)
train = np.random.choice(range(len(df.index)), size=int(len(df.index)*.8), replace=False)
test = [i for i in set(range(len(df.index))) - set(train)]

In [22]:
X_test = X[test]
y_test = y[test]

X_train = X[train]
y_train = y[train]
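
(As an aside, not in the original notebook: scikit-learn ships a helper that does this split in one call; depending on your version it lives in sklearn.cross_validation or sklearn.model_selection. A minimal sketch with the newer module path:)

from sklearn.model_selection import train_test_split

# hold out 20% of households, with a fixed seed for reproducibility
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.2, random_state=12345)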

Does it look reasonable?


In [23]:
len(X_test), len(X_train)


Out[23]:
(2464, 9853)

In [24]:
y_test.mean(), y_train.mean()


Out[24]:
(0.42288961038961037, 0.41530498325383131)

In [25]:
X_test.mean(axis=0).round(2)


Out[25]:
array([ 0.02,  0.34,  0.57,  0.84,  0.  ,  0.  ,  0.24,  0.22,  0.02,
        0.06,  0.  ,  0.63,  0.11,  0.01,  0.  ,  0.15,  0.01])

In [26]:
X_train.mean(axis=0).round(2)


Out[26]:
array([ 0.02,  0.34,  0.57,  0.82,  0.  ,  0.  ,  0.23,  0.23,  0.02,
        0.06,  0.  ,  0.64,  0.11,  0.01,  0.  ,  0.15,  0.01])

Now, let us try a range of prediction methods:

Naïve Bayes


In [27]:
import sklearn.naive_bayes
clf = sklearn.naive_bayes.BernoulliNB()
clf.fit(X_train, y_train)


Out[27]:
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [29]:
y_pred = clf.predict(X_test)

In [30]:
np.mean(y_pred == y_test)


Out[30]:
0.76501623376623373

Is that good?


In [31]:
y_random = np.random.choice([0,1], size=len(y_test))

np.mean(y_random == y_test)


Out[31]:
0.50121753246753242

Better than random, worse than perfect...
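
(Another useful baseline, also not from the original notebook: always predict the majority class, i.e. "no mobile telephone". Since about 42% of test households own a phone, that baseline scores roughly 58%, so Naïve Bayes is clearly adding something.)

# accuracy of always predicting 0 ("no mobile telephone")
np.mean(y_test == 0)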

Linear regression based prediction


In [32]:
import sklearn.linear_model
clf = sklearn.linear_model.LinearRegression()
clf.fit(X_train, y_train)


Out[32]:
LinearRegression(copy_X=True, fit_intercept=True, normalize=False)

In [33]:
y_pred = clf.predict(X_test)

In [34]:
y_pred = (y_pred >= .5)

In [35]:
np.mean(y_pred == y_test)


Out[35]:
0.7633928571428571

Actually just about as good as Naïve Bayes.

Logistic regression


In [36]:
import sklearn.linear_model
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train, y_train)


Out[36]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [37]:
y_pred = clf.predict(X_test)

In [38]:
np.mean(y_pred == y_test)


Out[38]:
0.76420454545454541

Again, about the same...
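
(A small aside, not in the original notebook: since clf is still the fitted logistic regression here, you can peek at which assets carry the most weight. coef_ has one row per binary problem, so take the first row.)

# pair each coefficient with its asset name; large positive values predict phone ownership
feature_names = df.drop('has mobile telephone', axis=1).columns
pd.Series(clf.coef_[0], index=feature_names)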

Did you read about the Perceptron?


In [39]:
import sklearn.linear_model
clf = sklearn.linear_model.Perceptron()
clf.fit(X_train, y_train)


Out[39]:
Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=False,
      verbose=0, warm_start=False)

In [40]:
y_pred = clf.predict(X_test)

In [41]:
np.mean(y_pred == y_test)


Out[41]:
0.70941558441558439

So it is possible to do worse. Bonus challenge: I think you can change the parameters to get this up to 75% concordance. Can you?
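
(A hedged sketch for the bonus challenge: sweep a few Perceptron settings and keep the best test accuracy. The specific values here are guesses, and note that the n_iter parameter was renamed max_iter in later scikit-learn releases.)

best = 0
for n_iter in [5, 10, 25, 50]:
    clf = sklearn.linear_model.Perceptron(n_iter=n_iter, shuffle=True, random_state=0)
    clf.fit(X_train, y_train)
    best = max(best, np.mean(clf.predict(X_test) == y_test))
best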

Decision Trees


In [42]:
import sklearn.tree
clf = sklearn.tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)


Out[42]:
DecisionTreeClassifier(compute_importances=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            random_state=None, splitter='best')

In [45]:
y_pred = clf.predict(X_test)

In [47]:
np.mean(y_pred == y_test)


Out[47]:
0.76379870129870131

A tiny improvement!
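
(One more aside, not from the original notebook: a fully grown tree can memorize the training data, so it is worth checking whether a depth limit helps; max_depth=5 here is an arbitrary guess.)

clf = sklearn.tree.DecisionTreeClassifier(max_depth=5, random_state=0)
clf.fit(X_train, y_train)
np.mean(clf.predict(X_test) == y_test)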

Now refactor this process, so that for any sklearn classifier you can call a function to find its out-of-sample predictive accuracy on cell phone ownership:


In [48]:
def oos_accuracy(clf, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    """ Calculate out-of-sample predictive accuracy of cell phone ownership
    prediction
    
    Parameters
    ----------
    clf : sklearn classifier
    X_train, y_train, X_test, y_test : training and test data and labels
    
    Returns
    -------
    accuracy : float
        Out-of-sample predictive accuracy; clf is fit in place as a side effect.
    """
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return np.mean(y_pred == y_test)

Figure out a way to test it:


In [49]:
oos_accuracy(sklearn.naive_bayes.BernoulliNB())  # should be .765


Out[49]:
0.76501623376623373

Bonus challenge: Figure out a way to make it work for the linear regression predictor

(This is harder because the linear regression produces numeric predictions that must be thresholded at 0.5 first.)


In [50]:
oos_accuracy(sklearn.linear_model.LinearRegression())  # got .763 before


Out[50]:
0.0

In [53]:
def fixed_oos_accuracy(clf, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    """ Calculate out-of-sample predictive accuracy of cell phone ownership
    prediction
    
    Parameters
    ----------
    clf : sklearn classifier
    X_train, y_train, X_test, y_test : training and test data and labels
    
    Returns
    -------
    accuracy : float
        Out-of-sample predictive accuracy; clf is fit in place as a side effect.
    """
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    # HACK: threshold the numeric predictions at 0.5 to turn them into 0/1 labels
    y_pred = (y_pred >= .5)
    return np.mean(y_pred == y_test)

In [54]:
fixed_oos_accuracy(sklearn.linear_model.LinearRegression())  # got .763 before


Out[54]:
0.7633928571428571
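
(Putting the pieces together, a sketch not part of the original notebook: loop over the classifiers tried above with the refactored helpers and print their out-of-sample accuracies side by side.)

for name, clf in [('Naive Bayes', sklearn.naive_bayes.BernoulliNB()),
                  ('Logistic regression', sklearn.linear_model.LogisticRegression()),
                  ('Perceptron', sklearn.linear_model.Perceptron()),
                  ('Decision tree', sklearn.tree.DecisionTreeClassifier())]:
    print('%s: %.3f' % (name, oos_accuracy(clf)))
print('Linear regression: %.3f' % fixed_oos_accuracy(sklearn.linear_model.LinearRegression()))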

In [ ]: