In [1]:
!date
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline
sns.set_context('poster')
sns.set_style('darkgrid')


Sun Jan 25 21:30:05 PST 2015

In [2]:
# set random seed, for reproducibility
np.random.seed(12345)

Load and clean the PHMRC verbal autopsy (VA) data:


In [3]:
df = pd.read_csv('IHME_PHMRC_VA_DATA_ADULT_Y2013M09D11_0.csv')


/snfs2/HOME/abie/pandas/pandas/io/parsers.py:1150: DtypeWarning: Columns (18,29,38,41,60,96) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)

What do you think we should do about that warning?


In [4]:
# just do what the warning recommends
df = pd.read_csv('IHME_PHMRC_VA_DATA_ADULT_Y2013M09D11_0.csv', low_memory=False)
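
The warning also suggests a second route: pass explicit dtypes for the mixed-type columns. A toy sketch of that option (the column name mirrors the real data, but the values here are made up):

```python
import io
import pandas as pd

# toy CSV with a mixed-type column, like the ones the warning complained about
csv = io.StringIO("newid,g1_07a\n1,34\n2,Don't Know\n")

# forcing the column to str sidesteps pandas' mixed-type guessing entirely
df_toy = pd.read_csv(csv, dtype={'g1_07a': str})
print(df_toy['g1_07a'].tolist())
```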

In [5]:
# take a look at what is in the DataFrame
df.head()


Out[5]:
site module gs_code34 gs_text34 va34 gs_code46 gs_text46 va46 gs_code55 gs_text55 ... word_woman word_womb word_worri word_wors word_worsen word_worst word_wound word_xray word_yellow newid
0 Mexico Adult K71 Cirrhosis 6 K71 Cirrhosis 8 K71 Cirrhosis ... 0 0 0 0 0 0 0 0 0 1
1 AP Adult G40 Epilepsy 12 G40 Epilepsy 16 G40 Epilepsy ... 0 0 0 0 0 0 0 0 0 2
2 AP Adult J12 Pneumonia 26 J12 Pneumonia 37 J12 Pneumonia ... 0 0 0 0 0 0 0 0 0 3
3 Mexico Adult J33 COPD 8 J33 COPD 10 J33 COPD ... 0 0 0 0 0 0 0 0 0 4
4 UP Adult I21 Acute Myocardial Infarction 17 I21 Acute Myocardial Infarction 23 I21 Acute Myocardial Infarction ... 0 0 0 0 0 0 0 0 0 5

5 rows × 946 columns


In [6]:
df.columns


Out[6]:
Index([u'site', u'module', u'gs_code34', u'gs_text34', u'va34', u'gs_code46', u'gs_text46', u'va46', u'gs_code55', u'gs_text55', u'va55', u'gs_comorbid1', u'gs_comorbid2', u'gs_level', u'g1_01d', u'g1_01m', u'g1_01y', u'g1_05', u'g1_06d', u'g1_06m', u'g1_06y', u'g1_07a', u'g1_07b', u'g1_07c', u'g1_08', u'g1_09', u'g1_10', u'g2_01', u'g2_02', u'g2_03ad', u'g2_03am', u'g2_03ay', u'g2_03bd', u'g2_03bm', u'g2_03by', u'g2_03cd', u'g2_03cm', u'g2_03cy', u'g2_03dd', u'g2_03dm', u'g2_03dy', u'g2_03ed', u'g2_03em', u'g2_03ey', u'g2_03fd', u'g2_03fm', u'g2_03fy', u'g3_01', u'g4_02', u'g4_03a', u'g4_03b', u'g4_04', u'g4_05', u'g4_06', u'g4_07', u'g4_08', u'g5_01d', u'g5_01m', u'g5_01y', u'g5_02', u'g5_03d', u'g5_03m', u'g5_03y', u'g5_04a', u'g5_04b', u'g5_04c', u'g5_05', u'g5_06a', u'g5_06b', u'g5_07', u'g5_08', u'a1_01_1', u'a1_01_2', u'a1_01_3', u'a1_01_4', u'a1_01_5', u'a1_01_6', u'a1_01_7', u'a1_01_8', u'a1_01_9', u'a1_01_10', u'a1_01_11', u'a1_01_12', u'a1_01_13', u'a1_01_14', u'a2_01', u'a2_02', u'a2_03', u'a2_04', u'a2_05', u'a2_06', u'a2_07', u'a2_08', u'a2_09_1a', u'a2_09_1b', u'a2_09_2a', u'a2_09_2b', u'a2_10', u'a2_11', u'a2_12', ...], dtype='object')

In [7]:
# also load codebook (excel doc)
cb = pd.read_excel('IHME_PHMRC_VA_DATA_CODEBOOK_Y2013M09D11_0.xlsx')
cb


Out[7]:
variable question module health_care_experience coding
0 site Site General 0 NaN
1 newid Study ID General 0 NaN
2 gs_diagnosis Gold Standard Diagnosis Code General 0 NaN
3 gs_comorbid1 Gold Standard Comorbid Conditions 1 General 0 NaN
4 gs_comorbid2 Gold Standard Comorbid Conditions 2 General 0 NaN
5 gs_level Gold Standard Diagnosis Level General 0 1 "GS Level 1" 2 "GS Level 2" 3 "GS Level 2B" ...
6 g1_01d Date of birth [day] General 0 99 "Don't Know"
7 g1_01m Date of birth [month] General 0 1 "January" 2 "February" 3 "March" 4 "April" 5...
8 g1_01y Date of birth [year] General 0 9999 "Don't Know"
9 g1_05 Sex of deceased General 0 1 "Male" 2 "Female" 8 "Refused to Answer" 9 "D...
10 g1_06d Date of death [day] General 0 99 "Don't Know"
11 g1_06m Date of death [month] General 0 1 "January" 2 "February" 3 "March" 4 "April" 5...
12 g1_06y Date of death [year] General 0 9999 "Don't Know"
13 g1_07a Last known age of the deceased [years] General 0 999 "Don't Know"
14 g1_07b Last known age of the deceased [months] General 0 99 "Don't Know"
15 g1_07c Last known age of the deceased [days] General 0 99 "Don't Know"
16 g1_08 Marital status of deceased General 0 1 "Never Married" 2 "Married" 3 "Separated" 4 ...
17 g1_09 Last known level of education completed General 0 1 "No Schooling" 2 "Primary School" 3 "High Sc...
18 g1_10 Number of years of education completed General 0 99 "Don't Know"
19 g2_01 Language of interview General 0 NaN
20 g2_02 Interviewer ID number General 0 NaN
21 g2_03ad Date of first interview attempt [day] General 0 99 "Don't Know"
22 g2_03am Date of first interview attempt [month] General 0 1 "January" 2 "February" 3 "March" 4 "April" 5...
23 g2_03ay Date of first interview attempt [year] General 0 9999 "Don't Know"
24 g2_03bd Date and time arranged for second interview at... General 0 99 "Don't Know"
25 g2_03bm Date and time arranged for second interview at... General 0 1 "January" 2 "February" 3 "March" 4 "April" 5...
26 g2_03by Date and time arranged for second interview at... General 0 9999 "Don't Know"
27 g2_03cd Date and time arranged for third interview att... General 0 99 "Don't Know"
28 g2_03cm Date and time arranged for third interview att... General 0 1 "January" 2 "February" 3 "March" 4 "April" 5...
29 g2_03cy Date and time arranged for third interview att... General 0 9999 "Don't Know"
... ... ... ... ... ...
1619 word_stomach The open narratives contained the word or stem... Neonate 1 NaN
1620 word_test The open narratives contained the word or stem... Neonate 1 NaN
1621 word_deliv The open narratives contained the word or stem... Neonate 1 NaN
1622 word_asphyxia The open narratives contained the word or stem... Neonate 1 NaN
1623 word_wife The open narratives contained the word or stem... Neonate 1 NaN
1624 word_renal The open narratives contained the word or stem... Neonate 1 NaN
1625 word_surviv The open narratives contained the word or stem... Neonate 1 NaN
1626 word_visit The open narratives contained the word or stem... Neonate 1 NaN
1627 word_continu The open narratives contained the word or stem... Neonate 1 NaN
1628 word_husband The open narratives contained the word or stem... Neonate 1 NaN
1629 word_reach The open narratives contained the word or stem... Neonate 1 NaN
1630 word_provinci The open narratives contained the word or stem... Neonate 1 NaN
1631 word_servic The open narratives contained the word or stem... Neonate 1 NaN
1632 word_deceas The open narratives contained the word or stem... Neonate 1 NaN
1633 word_due The open narratives contained the word or stem... Neonate 1 NaN
1634 word_vomit The open narratives contained the word or stem... Neonate 1 NaN
1635 word_bad The open narratives contained the word or stem... Neonate 1 NaN
1636 word_nilof The open narratives contained the word or stem... Neonate 1 NaN
1637 word_male The open narratives contained the word or stem... Neonate 1 NaN
1638 word_milk The open narratives contained the word or stem... Neonate 1 NaN
1639 word_incub The open narratives contained the word or stem... Neonate 1 NaN
1640 word_aliv The open narratives contained the word or stem... Neonate 1 NaN
1641 word_doctor The open narratives contained the word or stem... Neonate 1 NaN
1642 word_cord The open narratives contained the word or stem... Neonate 1 NaN
1643 word_born The open narratives contained the word or stem... Neonate 1 NaN
1644 word_brought The open narratives contained the word or stem... Neonate 1 NaN
1645 word_look The open narratives contained the word or stem... Neonate 1 NaN
1646 word_week The open narratives contained the word or stem... Neonate 1 NaN
1647 word_babi The open narratives contained the word or stem... Neonate 1 NaN
1648 word_difficulti The open narratives contained the word or stem... Neonate 1 NaN

1649 rows × 5 columns

Every column whose name starts with a pattern like a{NUMBER}_ is part of the signs and symptoms reported about an adult death:


In [8]:
df.filter(regex=r'a\d_')


Out[8]:
a1_01_1 a1_01_2 a1_01_3 a1_01_4 a1_01_5 a1_01_6 a1_01_7 a1_01_8 a1_01_9 a1_01_10 ... a6_06_2y a6_07d a6_07m a6_07y a6_09 a6_10 a7_11 a7_12 a7_13 a7_14
0 No No No No No No No No Don't Know Don't Know ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
1 Yes Yes No Yes No Yes No No Yes No ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
2 Yes Yes No No No No Yes No No Yes ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
3 Yes No No Yes No Yes Yes No No No ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
4 No No No No No No No No Yes No ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
5 No No No No Yes No No No No No ... Don't Know Don't Know Don't Know Don't Know No No NaN NaN NaN NaN
6 No No No No No No Yes No No No ... Don't Know Don't Know Don't Know Don't Know No No NaN NaN NaN NaN
7 No No No No No No No No No No ... Don't Know Don't Know Don't Know Don't Know No No NaN NaN NaN NaN
8 Yes Yes No No Yes Yes No No No No ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
9 No No No No No No No No No No ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
10 No No No No No No No No No Yes ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
11 No No No No No No No No No No ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
12 Yes Yes No Yes No No No No No No ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
13 No No No No No No No No Yes No ... Don't Know Don't Know Don't Know Don't Know No No NaN NaN NaN NaN
14 No No No No No No No No No No ... Don't Know Don't Know Don't Know Don't Know Yes Yes NaN NaN NaN NaN
15 No No No No No No No No No No ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
16 No Yes No No No No No No No No ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
17 No No No No No No No No No No ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
18 No No No No No No No No No No ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
19 No No No No No No No No No No ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
20 Yes Yes No No No No No No No No ... Don't Know Don't Know Don't Know Don't Know No No NaN NaN NaN NaN
21 Yes No Yes No Yes Yes No No No Yes ... Don't Know Don't Know Don't Know Don't Know Yes Yes NaN NaN NaN NaN
22 No Yes No No No No Yes No No No ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
23 No No No No No No No No No Yes ... Don't Know Don't Know Don't Know Don't Know No No NaN NaN NaN NaN
24 No No No No No No No No No No ... Don't Know Don't Know Don't Know Don't Know No No NaN NaN NaN NaN
25 No No No No No No Yes No No No ... Don't Know Don't Know Don't Know Don't Know No No NaN NaN NaN NaN
26 No No No Yes Yes Yes No No No No ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
27 No Yes No No No Yes No No Yes Yes ... Don't Know Don't Know Don't Know Don't Know Yes Yes NaN NaN NaN NaN
28 No No No Yes No Yes No No No No ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
29 No No No No No No No No No No ... Don't Know Don't Know Don't Know Don't Know No No NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7811 No No Yes No No No No No No Yes ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
7812 No No No No No No No No No No ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
7813 No No No No No No No No No No ... Don't Know Don't Know Don't Know Don't Know No No NaN NaN NaN NaN
7814 No No No No No No No No No No ... 2009 16 April 2009 Yes No NaN NaN NaN NaN
7815 No No No No No No No No Yes No ... Don't Know Don't Know Don't Know Don't Know Yes Yes NaN NaN NaN NaN
7816 No No No No No No No No No No ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
7817 No No No No No No No No No Yes ... Don't Know Don't Know Don't Know Don't Know Yes Yes NaN NaN NaN NaN
7818 No Yes No No Yes No No No No Yes ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
7819 No No No No No Yes No No No No ... Don't Know Don't Know Don't Know Don't Know No No NaN NaN NaN NaN
7820 No No No No No No No No No No ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
7821 No No No No No No Yes No No No ... Don't Know 3 September 2009 No No NaN NaN NaN NaN
7822 No No No No No No No No No No ... Don't Know Don't Know Don't Know Don't Know Don't Know No NaN NaN NaN NaN
7823 Yes No Yes Yes No Yes No No No No ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
7824 No No No No No No No No No No ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
7825 No No No No Yes No No Yes Yes Yes ... Don't Know Don't Know Don't Know Don't Know Yes Yes NaN NaN NaN NaN
7826 No No No No No No No No No No ... Don't Know Don't Know Don't Know Don't Know No No NaN NaN NaN NaN
7827 No No No No No No No No Don't Know No ... Don't Know Don't Know Don't Know Don't Know Yes Yes NaN NaN NaN NaN
7828 No No No No Yes No No No No No ... Don't Know Don't Know Don't Know Don't Know Yes Yes NaN NaN NaN NaN
7829 No No No No No No Yes No No No ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
7830 No No No No No No No No No Yes ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
7831 No Yes No No No No No Yes No Yes ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
7832 No No Yes No No No No No No No ... Don't Know Don't Know Don't Know Don't Know No No NaN NaN NaN NaN
7833 No No No No No No No No No Yes ... Don't Know Don't Know Don't Know Don't Know No No NaN NaN NaN NaN
7834 No No No No No No No No Yes Yes ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
7835 No No No No No No No No No No ... Don't Know Don't Know Don't Know Don't Know No No NaN NaN NaN NaN
7836 No No Yes No No Yes Yes No No Yes ... 2009 9 September 2009 Yes Yes NaN NaN NaN NaN
7837 No No No No No No Yes No No Yes ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
7838 No No No No No No No No No No ... Don't Know Don't Know Don't Know Don't Know No No NaN NaN NaN NaN
7839 No No No No No No No No No No ... Don't Know Don't Know Don't Know Don't Know Yes No NaN NaN NaN NaN
7840 No No Yes No No No No No No Yes ... Don't Know Don't Know Don't Know Don't Know No No NaN NaN NaN NaN

7841 rows × 195 columns

There is also a "bag of words" section, with word counts derived from processing the free-text responses:


In [9]:
df.filter(like='word_')


Out[9]:
word_abdomen word_abl word_accid word_accord word_ach word_acidosi word_acquir word_activ word_acut word_add ... word_wit word_woman word_womb word_worri word_wors word_worsen word_worst word_wound word_xray word_yellow
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
5 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
8 0 2 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
9 1 0 0 0 0 0 0 0 0 0 ... 0 0 2 0 0 0 0 0 0 0
10 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
11 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
12 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
13 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
14 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
15 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
16 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 3 1 0
17 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
18 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
19 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
20 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
21 0 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
22 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
25 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
26 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
27 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
28 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
29 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7811 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7812 6 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 1
7813 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
7814 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7815 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7816 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7817 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7818 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7819 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7820 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 0
7821 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7822 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7823 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7824 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7825 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7826 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7827 0 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7828 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 2 0 0 0 0 0 0
7829 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7830 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7831 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7832 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7833 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7834 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7835 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7836 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 0
7837 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7838 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7839 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7840 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

7841 rows × 679 columns

Make the feature vectors by putting together everything from the adult module and everything from the free-response section.


In [10]:
X = np.hstack((np.array(df.filter(regex=r'a\d_')), np.array(df.filter(like='word_'))))

Did I get that right?


In [11]:
np.array(df.filter(regex=r'a\d_')).shape, np.array(df.filter(like='word_')).shape


Out[11]:
((7841, 195), (7841, 679))

In [12]:
X.shape


Out[12]:
(7841, 874)

Actually, that won't work without more data cleaning, because sklearn expects the feature vectors to be numbers, not answers like "Yes" and "Don't Know".
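
For the record, one simple way to clean up the survey answers would be to binarize them, treating "Yes" as 1 and everything else ("No", "Don't Know", missing) as 0. A toy sketch with made-up columns:

```python
import pandas as pd

# hypothetical miniature of the a{NUMBER}_ survey columns
answers = pd.DataFrame({'a1_01_1': ['Yes', 'No', "Don't Know"],
                        'a2_02':   ['No', 'Yes', 'Yes']})

# 1 where the answer was "Yes", 0 otherwise
X_binary = (answers == 'Yes').astype(int)
print(X_binary)
```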

So let's just use the open-response text, which has already been processed.


In [13]:
X = np.array(df.filter(like='word_'))

The causes of death appear in the gs_text34 column (obviously...)


In [14]:
df.gs_text34


Out[14]:
0                         Cirrhosis
1                          Epilepsy
2                         Pneumonia
3                              COPD
4       Acute Myocardial Infarction
5                             Fires
6                     Renal Failure
7                              AIDS
8                       Lung Cancer
9                          Maternal
10                         Maternal
11                         Drowning
12                    Renal Failure
13    Other Cardiovascular Diseases
14                    Renal Failure
...
7826               Diarrhea/Dysentery
7827                        Cirrhosis
7828                        Cirrhosis
7829                         Diabetes
7830                         Maternal
7831                         Maternal
7832                    Breast Cancer
7833                             AIDS
7834                        Cirrhosis
7835                         Homicide
7836                  Cervical Cancer
7837    Other Cardiovascular Diseases
7838                       Poisonings
7839                            Fires
7840                Esophageal Cancer
Name: gs_text34, Length: 7841, dtype: object

For class purposes, we will simplify the prediction task: was the death due to stroke, diabetes, or something else?


In [15]:
df['Cause'] = df.gs_text34.map({'Stroke':'Stroke', 'Diabetes':'Diabetes'}).fillna('Other')
y = np.array(df.Cause)

Did that work?


In [16]:
np.unique(y)


Out[16]:
array(['Diabetes', 'Other', 'Stroke'], dtype=object)

Now, let's train one of our (by now) standard classifiers to predict the underlying cause of death from the restricted cause list:


In [17]:
import sklearn.tree

clf = sklearn.tree.DecisionTreeClassifier()
clf.fit(X,y)

y_pred = clf.predict(X)
np.mean(y == y_pred)


Out[17]:
0.98125239127662289

Does that look good? Too good? If it looks too good to be true, it probably is...


In [18]:
import sklearn.cross_validation

scores = sklearn.cross_validation.cross_val_score(clf, X, y)

print scores.mean(), scores.std()


0.833696270575 0.00883096018699

In [19]:
sns.distplot(scores, rug=True)


Out[19]:
<matplotlib.axes.AxesSubplot at 0x7ffcaf31f2d0>

What have we done here?


In [20]:
help(sklearn.cross_validation.cross_val_score)


Help on function cross_val_score in module sklearn.cross_validation:

cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, verbose=0, fit_params=None, score_func=None, pre_dispatch='2*n_jobs')
    Evaluate a score by cross-validation
    
    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.
    
    X : array-like
        The data to fit. Can be, for example a list, or an array at least 2d.
    
    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    
    scoring : string, callable or None, optional, default: None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    
    cv : cross-validation generator or int, optional, default: None
        A cross-validation generator to use. If int, determines
        the number of folds in StratifiedKFold if y is binary
        or multiclass and estimator is a classifier, or the number
        of folds in KFold otherwise. If None, it is equivalent to cv=3.
    
    n_jobs : integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.
    
    verbose : integer, optional
        The verbosity level.
    
    fit_params : dict, optional
        Parameters to pass to the fit method of the estimator.
    
    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:
    
            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs
    
            - An int, giving the exact number of total jobs that are
              spawned
    
            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'
    
    Returns
    -------
    scores : array of float, shape=(len(list(cv)),)
        Array of scores of the estimator for each run of the cross validation.

If you read through that, you will find out: we have done stratified K-Fold cross validation with $K = 3$. In other words, split the data into 3 equal parts (what if it's not divisible by three?) and then use two for training and the third for testing, in all three ways.
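
As for the divisibility question: the folds are simply made as equal as possible, so their sizes differ by at most one. A quick check with plain numpy:

```python
import numpy as np

# splitting 10 indices into 3 folds: sizes differ by at most one
folds = np.array_split(np.arange(10), 3)
print([len(f) for f in folds])  # [4, 3, 3]
```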

So how to do 10-fold CV as in the reading?


In [21]:
%%time

n = len(y)
cv = sklearn.cross_validation.KFold(n, n_folds=10, shuffle=True)
scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=cv)

print scores.mean(), scores.std()


0.833949369557 0.00660735684823
CPU times: user 19.4 s, sys: 67 ms, total: 19.5 s
Wall time: 19.5 s

In [22]:
sns.distplot(scores, rug=True)


Out[22]:
<matplotlib.axes.AxesSubplot at 0x7ffce65024d0>

I think it is important to stratify the sampling, although our book does not make a big deal about that:
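
To see what stratification buys, here is a toy check with a 10 percent minority class (written against the newer sklearn.model_selection API, which replaced sklearn.cross_validation in later releases): every stratified test fold gets the minority class at almost exactly its overall rate.

```python
import numpy as np
from sklearn.model_selection import StratifiedKFold  # newer home of StratifiedKFold

# 90/10 class imbalance, like a rare cause of death
y_toy = np.array(['Other'] * 90 + ['Stroke'] * 10)
X_toy = np.zeros((100, 1))

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
fractions = [np.mean(y_toy[test] == 'Stroke') for _, test in cv.split(X_toy, y_toy)]
print(fractions)  # every fold holds the minority class at its overall 10% rate
```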


In [24]:
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True)
scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=cv)

print scores.mean(), scores.std()


0.83688337277 0.0111495762856

In [25]:
sns.distplot(scores, rug=True)


Out[25]:
<matplotlib.axes.AxesSubplot at 0x7ffca9d86490>

And I don't want you to ever rely on just 10 samples. How can you do ten repetitions of 10-fold cross validation?


In [28]:
scores = []
for rep in range(10):
    print rep,
    cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True)
    scores += list(sklearn.cross_validation.cross_val_score(clf, X, y, cv=cv))


0 1 2 3 4 5 6 7 8 9

In [29]:
np.mean(scores), np.std(scores)


Out[29]:
(0.83225215827478494, 0.010111713947456653)

In [30]:
sns.distplot(scores, rug=True)


Out[30]:
<matplotlib.axes.AxesSubplot at 0x7ffca99eebd0>

Although our books make a big deal about K-Fold cross-validation, and we have gone through all of the trouble to understand what it is doing, I am partial to what is sometimes called leave-group-out cross-validation, Monte Carlo cross-validation, or just plain-old cross-validation: split the data into training and test sets randomly a bunch of times.

Just to be perverse, in sklearn this is called ShuffleSplit cross-validation:
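
Under the hood it is nothing more than: permute the indices, cut off a test chunk, repeat. A pure-numpy sketch of that idea:

```python
import numpy as np

rng = np.random.RandomState(0)
n, n_test = 8, 2

# three Monte Carlo splits: each is an independent random permutation
for rep in range(3):
    perm = rng.permutation(n)
    test, train = perm[:n_test], perm[n_test:]
    print(train, test)
```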


In [32]:
%%time

n = len(y)
cv = sklearn.cross_validation.ShuffleSplit(n, n_iter=10, test_size=.25)
scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=cv)

print scores.mean(), scores.std()


0.833197348292 0.00925643009122
CPU times: user 15.8 s, sys: 16 ms, total: 15.8 s
Wall time: 15.8 s

Of course, you can also do a stratified version of this:


In [34]:
cv = sklearn.cross_validation.StratifiedShuffleSplit(y, n_iter=10, test_size=.25)
scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=cv)

print scores.mean(), scores.std()


0.831345565749 0.00604550537699

One last way to do cross-validation, which (in my humble opinion) was not emphasized sufficiently in the reading: split based on what you are actually interested in. In the case of VA, I am very interested in how the approach will generalize to other parts of the world. So I could hold out based on site:


In [35]:
df.site.value_counts()


Out[35]:
Dar       1726
Mexico    1586
AP        1554
UP        1419
Bohol     1259
Pemba      297
dtype: int64

In [36]:
cv = sklearn.cross_validation.LeaveOneLabelOut(df.site)
scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=cv)

print scores.mean(), scores.std()


0.800788863972 0.0594346454159
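
What LeaveOneLabelOut is doing is simple enough to write by hand: each unique label becomes one test fold, with everything else as training data. A toy sketch:

```python
import numpy as np

# toy site labels, standing in for df.site
labels = np.array(['AP', 'AP', 'Mexico', 'UP', 'Mexico', 'UP'])

# one fold per unique label: hold out all rows from that site
folds = {}
for site in np.unique(labels):
    test = np.where(labels == site)[0]
    train = np.where(labels != site)[0]
    folds[site] = (train, test)
    print(site, train.tolist(), test.tolist())
```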

There is also a version that holds out $P$ labels, and lets you choose $P$:


In [37]:
cv = sklearn.cross_validation.LeavePLabelOut(df.site, p=2)
scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=cv)

print scores.mean(), scores.std()


0.786904119569 0.057943850751

What did I not mention that whole time? Leave-one-out cross-validation. That is because it takes forever.


In [38]:
%%time

n = len(y)
cv = sklearn.cross_validation.LeaveOneOut(n)
scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=cv)

print scores.mean(), scores.std()


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-38-9f6ce4f81f80> in <module>()
----> 1 get_ipython().run_cell_magic(u'time', u'', u'\nn = len(y)\ncv = sklearn.cross_validation.LeaveOneOut(n)\nscores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=cv)\n\nprint scores.mean(), scores.std()')

/snfs2/HOME/abie/ipython/IPython/core/interactiveshell.pyc in run_cell_magic(self, magic_name, line, cell)
   2231             magic_arg_s = self.var_expand(line, stack_depth)
   2232             with self.builtin_trap:
-> 2233                 result = fn(magic_arg_s, cell)
   2234             return result
   2235 

/snfs2/HOME/abie/ipython/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)

/snfs2/HOME/abie/ipython/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194 
    195         if callable(arg):

/snfs2/HOME/abie/ipython/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
   1145         else:
   1146             st = clock2()
-> 1147             exec(code, glob, local_ns)
   1148             end = clock2()
   1149             out = None

<timed exec> in <module>()

/homes/abie/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.pyc in cross_val_score(estimator, X, y, scoring, cv, n_jobs, verbose, fit_params, score_func, pre_dispatch)
   1149                                               train, test, verbose, None,
   1150                                               fit_params)
-> 1151                       for train, test in cv)
   1152     return np.array(scores)[:, 0]
   1153 

/homes/abie/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
    651             self._iterating = True
    652             for function, args, kwargs in iterable:
--> 653                 self.dispatch(function, args, kwargs)
    654 
    655             if pre_dispatch == "all" or n_jobs == 1:

/homes/abie/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch(self, func, args, kwargs)
    398         """
    399         if self._pool is None:
--> 400             job = ImmediateApply(func, args, kwargs)
    401             index = len(self._jobs)
    402             if not _verbosity_filter(index, self.verbose):

/homes/abie/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __init__(self, func, args, kwargs)
    136         # Don't delay the application, to avoid keeping the input
    137         # arguments in memory
--> 138         self.results = func(*args, **kwargs)
    139 
    140     def get(self):

/homes/abie/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters)
   1237         estimator.fit(X_train, **fit_params)
   1238     else:
-> 1239         estimator.fit(X_train, y_train, **fit_params)
   1240     test_score = _score(estimator, X_test, y_test, scorer)
   1241     if return_train_score:

/homes/abie/anaconda/lib/python2.7/site-packages/sklearn/tree/tree.pyc in fit(self, X, y, sample_mask, X_argsorted, check_input, sample_weight)
    265                                            max_leaf_nodes)
    266 
--> 267         builder.build(self.tree_, X, y, sample_weight)
    268 
    269         if self.n_outputs_ == 1:

KeyboardInterrupt: 
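
A back-of-the-envelope estimate of why it takes forever, using the %%time output from the 10-fold run above (about 19.5 seconds for 10 fits, so roughly 2 seconds per fit on this machine):

```python
# leave-one-out fits the model once per data point
n = 7841
seconds_per_fit = 19.5 / 10  # rough, from the 10-fold %%time output above

hours = n * seconds_per_fit / 3600
print(round(hours, 1))  # roughly 4 hours, and machine-dependent
```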

Let's return to the stratified, repeated, 10-fold cross-validation approach:


In [39]:
%%time

scores = []
for rep in range(10):
    print rep,
    cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True)
    scores += [sklearn.cross_validation.cross_val_score(clf, X, y, cv=cv)]
    
print


0 1 2 3 4 5 6 7 8 9
CPU times: user 3min 6s, sys: 626 ms, total: 3min 7s
Wall time: 3min 7s

In [40]:
np.mean(scores), np.std(scores)


Out[40]:
(0.83329775204490886, 0.010930491665794614)

Sure, we can have a couple of minutes of break-time running this thing, but how do you think this is actually doing? Last week, we used a random prediction method as a baseline, and that is reasonable, but this week let's use something even simpler. How well would we do by always predicting a single class?


In [41]:
y_single = 'Diabetes'
np.mean(y == y_single)


Out[41]:
0.052799387833184545

In [42]:
y_single = 'Stroke'
np.mean(y == y_single)


Out[42]:
0.08034689452875908

In [43]:
y_single = 'Other'
np.mean(y == y_single)


Out[43]:
0.86685371763805641

Uh, oh... we don't need any fancy classifier to get over 86% accuracy, and we don't need to wait around for five minutes to see how well that fancy classifier does. We could just say "Other" and call it a day. To make a really fair (yet still damning) comparison, let's run a stratified, repeated 10-fold cross-validation of the baseline classifier that always says "Other".

To do so, it will help to know a little more about how to use a sklearn.cross_validation object:


In [44]:
cv = sklearn.cross_validation.KFold(10, n_folds=3, shuffle=False)
for train, test in cv:
    print train, test


[4 5 6 7 8 9] [0 1 2 3]
[0 1 2 3 7 8 9] [4 5 6]
[0 1 2 3 4 5 6] [7 8 9]

We can use that to make a fair comparison of the decision tree to the baseline classifier that always predicts "Other":


In [45]:
scores = []
for rep in range(10):
    cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True)
    for train, test in cv:
        scores.append(np.mean(y[test] == 'Other'))
np.mean(scores)


Out[45]:
0.86685415362425799

To make this really, really fair, we should use the same random splits in each experiment...


In [46]:
%%time

scores = []
for rep in range(10):
    print rep,
    cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456+rep)
    scores += [sklearn.cross_validation.cross_val_score(clf, X, y, cv=cv)]
    
print
print np.mean(scores)


0 1 2 3 4 5 6 7 8 9
0.833157772844
CPU times: user 2min 51s, sys: 570 ms, total: 2min 51s
Wall time: 2min 52s

In [47]:
scores = []
for rep in range(10):
    cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456+rep)
    for train, test in cv:
        scores.append(np.mean(y[test] == 'Other'))
np.mean(scores)


Out[47]:
0.86685415362425799

But that is just practicing "defensive research", to prepare for the pickiest of critics...

What we really need to be thinking about now is what was called "cost-sensitive learning" in the Data Mining book.


In [48]:
df.Cause.value_counts()


Out[48]:
Other       6797
Stroke       630
Diabetes     414
dtype: int64

Make weights for examples proportional to inverse of counts:


In [49]:
weights = 1000. / df.Cause.value_counts()
weights


Out[49]:
Other       0.147124
Stroke      1.587302
Diabetes    2.415459
dtype: float64

In [50]:
sample_weight = np.array(weights[y])

What is in sample_weight?


In [51]:
import sklearn.metrics
y_pred = ['Other' for i in range(len(y))]
sklearn.metrics.accuracy_score(y, y_pred, sample_weight=sample_weight)


Out[51]:
0.33333333333334975

Is that what you expected?
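To see why, consider a toy example: with inverse-count weights, each class contributes equal total weight, so a constant predictor can only ever capture one class's share, i.e. exactly 1/3 here. A minimal numpy-only sketch (made-up labels, not the PHMRC data):

```python
import numpy as np

# toy labels with heavy class imbalance (made-up, not the PHMRC data)
y_toy = np.array(['Other'] * 90 + ['Stroke'] * 7 + ['Diabetes'] * 3)

# inverse-count weights: each class gets equal total weight
counts = {c: np.sum(y_toy == c) for c in np.unique(y_toy)}
w = np.array([1.0 / counts[c] for c in y_toy])

# weighted accuracy of the always-'Other' predictor
correct = (y_toy == 'Other')
weighted_acc = np.sum(w * correct) / np.sum(w)
print(weighted_acc)  # 1/3: 'Other' is one of three equally-weighted classes
```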

This is a better metric for our classification challenge, and we should use it when building the classifier and when measuring its performance:


In [52]:
%%time

scores = []
for rep in range(10):
    print rep,
    cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456+rep)
    for train, test in cv:
        clf = sklearn.tree.DecisionTreeClassifier()
        clf.fit(X[train], y[train], sample_weight=sample_weight[train])

        y_pred = clf.predict(X[test])
        scores.append(sklearn.metrics.accuracy_score(y[test], y_pred, sample_weight=sample_weight[test]))
                   
print
print np.mean(scores)


0 1 2 3 4 5 6 7 8 9
0.508593929492
CPU times: user 1min 54s, sys: 609 ms, total: 1min 55s
Wall time: 1min 55s

In [53]:
scores = []
for rep in range(10):
    cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456+rep)
    for train, test in cv:
        y_pred = ['Other' for i in range(len(test))]
        scores.append(sklearn.metrics.accuracy_score(y[test], y_pred, sample_weight=sample_weight[test]))
np.mean(scores)


Out[53]:
0.33333831953558202

The moral is: ML methods can do something, but you have to be careful about what you ask them to do!

This relates to a subtle point that I think may be somewhat unique to population health metrics: we are interested in the population-level mean of the predictions more than in the individual-level predictions themselves. I will see how far we make it through this exercise and decide if we have time to really dig into it.

In short, the percent of test-set examples with underlying cause Diabetes is equal to the percent of the training set with that underlying cause, so the test set is not really out-of-sample, even though the actual examples are completely disjoint.
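A quick way to convince yourself: in a toy stratified split, the class fractions in train and test match exactly, so even a predictor that just samples labels from the training set reproduces the test-set cause fractions without ever looking at a feature. (Toy labels below, not the PHMRC data.)

```python
import numpy as np

y_toy = np.array(['Other'] * 800 + ['Stroke'] * 150 + ['Diabetes'] * 50)

# a stratified 10% split: take 10% of each class for the test set
test_idx = np.concatenate([np.where(y_toy == c)[0][:int(0.1 * np.sum(y_toy == c))]
                           for c in np.unique(y_toy)])
train_idx = np.setdiff1d(np.arange(len(y_toy)), test_idx)

for c in np.unique(y_toy):
    print(c, np.mean(y_toy[train_idx] == c), np.mean(y_toy[test_idx] == c))
# each class's fraction is identical in train and test, so predictions
# drawn from the train labels match the test CSMFs
```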


In [54]:
def csmf_accuracy(y_true, y_pred):
    csmf_diff = 0
    csmf_true_min = 1.
    for j in ['Stroke', 'Diabetes', 'Other']:
        csmf_true_j = np.mean(np.array(y_true) == j)
        csmf_pred_j = np.mean(np.array(y_pred) == j)
        csmf_diff += np.abs(csmf_true_j - csmf_pred_j)
        
        if csmf_true_j < csmf_true_min:
            csmf_true_min = csmf_true_j
        #print csmf_true_j, csmf_pred_j, csmf_diff, csmf_true_min
    return 1 - csmf_diff / (2 * (1 - csmf_true_min))


# test this function
y_true = ['Stroke', 'Diabetes', 'Other']
y_pred = ['Diabetes', 'Other', 'Stroke']
assert np.allclose(csmf_accuracy(y_true, y_pred), 1)

y_true = ['Stroke', 'Stroke', 'Stroke']
y_pred = ['Diabetes', 'Other', 'Diabetes']
assert np.allclose(csmf_accuracy(y_true, y_pred), 0)

The baseline predictor is pretty good:


In [55]:
scores = []
for rep in range(10):
    cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456+rep)
    for train, test in cv:
        y_pred = ['Other' for i in range(len(test))]
        scores.append(csmf_accuracy(y[test], y_pred))
np.mean(scores)


Out[55]:
0.85943196700199098

But because the validation environment is leaking information, a different silly predictor is even better. I call this "Random-From-Train":


In [56]:
scores = []
for rep in range(10):
    cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456+rep)
    for train, test in cv:
        y_pred = np.random.choice(y[train], size=len(test), replace=True)
        scores.append(csmf_accuracy(y[test], y_pred))
np.mean(scores)


Out[56]:
0.98696647596797393

To fix it, you must resample the cause distribution of the test dataset. Super-hard extra bonus homework: figure out how to do this.


In [60]:
scores = []
for rep in range(10):
    print rep,
    
    cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456+rep)
    for train, test in cv:
        clf = sklearn.tree.DecisionTreeClassifier()
        clf.fit(X[train], y[train], sample_weight=sample_weight[train])

        # resample y[test] here to have a distribution chosen uniformly at random
        rows = []
        for j in np.unique(y[test]):
            test_rows_with_cause_j = np.where(y[test] == j)[0]  # np.where returns a tuple, just to be annoying 
            rows += list(np.random.choice(test_rows_with_cause_j, size=1000))
        X_test = X[rows]
        y_pred = clf.predict(X_test)
        scores.append(csmf_accuracy(y[rows], y_pred))
        
print
np.mean(scores)


0 1 2 3 4 5 6 7 8 9
Out[60]:
0.92179468391231179
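The resampling in In [60] gives every cause an equal share of the test set. A variant used in the VA validation literature instead draws the test cause distribution from an uncorrelated Dirichlet, so each replicate tests the classifier against a different, randomly chosen CSMF. A numpy-only sketch with toy labels (not the PHMRC data):

```python
import numpy as np

rng = np.random.RandomState(0)
y_test = np.array(['Other'] * 80 + ['Stroke'] * 15 + ['Diabetes'] * 5)
causes = np.unique(y_test)

# draw a cause distribution uniformly at random from the simplex
p = rng.dirichlet(np.ones(len(causes)))

# resample 1000 test rows (with replacement) to match that distribution
rows = np.concatenate([
    rng.choice(np.where(y_test == c)[0], size=n, replace=True)
    for c, n in zip(causes, rng.multinomial(1000, p))])
# y_test[rows] now has cause fractions close to p, independent of the training data
```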

Did I promise a little more for this week in the syllabus?

Probability prediction:


In [61]:
clf.predict_proba(X[test[:10]])


Out[61]:
array([[ 0.        ,  1.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        ],
       [ 0.35442687,  0.31950041,  0.32607272],
       [ 0.        ,  0.        ,  1.        ]])

Maybe more convincing with Naive Bayes:


In [62]:
import sklearn.naive_bayes

In [63]:
clf = sklearn.naive_bayes.MultinomialNB()
clf.fit(X[train], y[train])


Out[63]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [64]:
clf.predict_proba(X[test[:10]])


Out[64]:
array([[  1.81132527e-03,   9.97964902e-01,   2.23772715e-04],
       [  2.69863763e-02,   9.32236682e-01,   4.07769417e-02],
       [  5.79002201e-03,   2.09617280e-01,   7.84592698e-01],
       [  2.74315671e-03,   9.95858778e-01,   1.39806538e-03],
       [  1.22831768e-03,   9.98730068e-01,   4.16145637e-05],
       [  2.75318783e-03,   9.89736195e-01,   7.51061729e-03],
       [  3.35403449e-06,   9.99958398e-01,   3.82481162e-05],
       [  7.90692631e-06,   9.99983598e-01,   8.49518171e-06],
       [  5.92856078e-03,   9.86574873e-01,   7.49656648e-03],
       [  1.10081195e-03,   1.46315277e-01,   8.52583911e-01]])

Use np.round to make this look nicer:


In [65]:
np.round(clf.predict_proba(X[test[:10]]), 2)


Out[65]:
array([[ 0.  ,  1.  ,  0.  ],
       [ 0.03,  0.93,  0.04],
       [ 0.01,  0.21,  0.78],
       [ 0.  ,  1.  ,  0.  ],
       [ 0.  ,  1.  ,  0.  ],
       [ 0.  ,  0.99,  0.01],
       [ 0.  ,  1.  ,  0.  ],
       [ 0.  ,  1.  ,  0.  ],
       [ 0.01,  0.99,  0.01],
       [ 0.  ,  0.15,  0.85]])

It is a bit hard to understand those probabilities. What do they mean?


In [66]:
y[test[:10]]


Out[66]:
array(['Other', 'Other', 'Stroke', 'Other', 'Other', 'Other', 'Other',
       'Other', 'Other', 'Other'], dtype=object)

Let us hope that the middle column is the probability of 'Other'. The column names must be stored in the clf instance somewhere:


In [67]:
clf.classes_


Out[67]:
array(['Diabetes', 'Other', 'Stroke'], 
      dtype='|S8')

Make that into a pretty DataFrame:


In [69]:
t = pd.DataFrame(clf.predict_proba(X[test[:10]]), columns=clf.classes_)
np.round(t, 2)


Out[69]:
Diabetes Other Stroke
0 0.00 1.00 0.00
1 0.03 0.93 0.04
2 0.01 0.21 0.78
3 0.00 1.00 0.00
4 0.00 1.00 0.00
5 0.00 0.99 0.01
6 0.00 1.00 0.00
7 0.00 1.00 0.00
8 0.01 0.99 0.01
9 0.00 0.15 0.85

And what about numeric prediction? I'll give us a little something to show the mechanics, although it is a weird example. Can you figure out what g1_07a is? Hint: remember that cb is the codebook.


In [70]:
def float_or_100(x):
    # anything that doesn't parse as a number (e.g. missing) becomes 100.
    try:
        return float(x)
    except (ValueError, TypeError):
        return 100.
    
y = np.array(df.g1_07a.map(float_or_100))

Here is some MSE cross-validation:


In [78]:
scores = []
cv = sklearn.cross_validation.KFold(len(y), n_folds=10, shuffle=True, random_state=123456)

for train, test in cv:
    clf = sklearn.tree.DecisionTreeRegressor()
    clf.fit(X[train], y[train])

    y_pred = clf.predict(X[test])
    scores.append(sklearn.metrics.mean_squared_error(y[test], y_pred))
                   
print
print np.mean(scores)


592.67874886

I prefer root mean squared error myself; I find it more interpretable:


In [79]:
np.mean(np.sqrt(scores))


Out[79]:
24.33928481528179
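One caveat: np.mean(np.sqrt(scores)) averages the per-fold RMSEs, which by Jensen's inequality is never larger than the square root of the pooled mean MSE. The two aggregations usually differ a little. With made-up fold MSEs:

```python
import numpy as np

fold_mse = np.array([100.0, 400.0, 900.0])  # made-up per-fold MSEs

mean_of_rmse = np.mean(np.sqrt(fold_mse))  # average the per-fold RMSEs
rmse_of_mean = np.sqrt(np.mean(fold_mse))  # square root of the pooled MSE

print(mean_of_rmse, rmse_of_mean)  # 20.0 vs ~21.6: close, but not the same
```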

There is also a mean absolute error in sklearn:


In [80]:
sklearn.metrics.mean_absolute_error


Out[80]:
<function sklearn.metrics.metrics.mean_absolute_error>

Can you make a custom error function that doesn't charge for the missing values we clipped to 100?


In [85]:
def rmse_when_not_100(a, b):
    diff = a-b
    rows = np.where((a != 100) & (b != 100))[0]
    return np.sqrt(np.mean(diff[rows]**2))

In [86]:
scores = []  # don't forget to reset, or the MSEs from In [78] sneak into the mean
cv = sklearn.cross_validation.KFold(len(y), n_folds=10, shuffle=True, random_state=123456)

for train, test in cv:
    clf = sklearn.tree.DecisionTreeRegressor()
    clf.fit(X[train], y[train], sample_weight=sample_weight[train])

    y_pred = clf.predict(X[test])
    scores.append(rmse_when_not_100(y[test], y_pred))
                   
print np.mean(scores)


31.0586305975

Is this any good? How can you tell?

Answer: compare to a baseline method that makes a numeric prediction without looking at the feature vector


In [87]:
scores = []

for val in np.unique(y):
    if val != 100:
        scores.append(rmse_when_not_100(y, val))
max(scores)


Out[87]:
57.198502197364171

Yes: the tree's RMSE of about 31 is roughly half the 57 you would get from the worst constant guess.