Big Data Applications and Analytics: Term Project

Sean M. Shiverick Fall 2017

Exploratory Data Analysis: nsduh15-dataset

2015 National Survey on Drug Abuse and Health (NSDUH)

Obtain Summary Data and Crosstabulations

  • Import python modules
  • load data file and save as DataFrame object
  • Obtain Crosstabs on variables of interest

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
file = pd.read_csv('nsduh-2015.csv')
df = pd.DataFrame(file)

In [3]:
df.shape


Out[3]:
(57146, 89)

In [4]:
df.drop(df.columns[[0,1]], axis=1, inplace=True)

In [5]:
df.columns


Out[5]:
Index(['AGECAT', 'SEX', 'MARRIED', 'EDUCAT', 'EMPLOY18', 'CTYMETRO', 'HEALTH2',
       'STDPYR', 'HEPEVR', 'CANCEVR', 'HOSPYR', 'DEPMELT', 'DEPMEYR',
       'DEPMWKS', 'DEPWMOS', 'EMODSWKS', 'EMOPGOUT', 'EMOPSOC', 'EMOPWRK',
       'SUICTHT', 'SUICPLN', 'SUICATT', 'PRLUNDR', 'PRLUNDR30', 'PRLGRTYR',
       'PRLMISEVR', 'PRLMISYR', 'PRLMISMO', 'PRLOXYMSYR', 'PRLDEPYR',
       'PRLABSRY', 'PRLHIGH', 'HYDRCDYR', 'OXYCDPRYR', 'OXYCTNYR', 'TRMADLYR',
       'MORPHPRYR', 'FENTNYLYR', 'BUPRNRPHN', 'OXYMORPHN', 'DEMEROL',
       'HYDRMRPHN', 'HEROINEVR', 'HEROINYR', 'HEROINMO', 'HEROINAB',
       'METHADONE', 'HEROINFQY', 'TRQBENZODZ', 'TRQALPRZM', 'TRQLRZPM',
       'TRQCLNZPM', 'TRQDIAZPM', 'SDBENZDPN', 'SDTRZLM', 'SDTMZPM', 'SDBARBTS',
       'SDOTHYR', 'COCNEVR', 'COCNYR', 'COCNMO', 'CRACKEVR', 'CRACKYR',
       'AMPHTMNYR', 'METHEVR', 'METHYR', 'METHMO', 'HLCNEVR', 'LSDEVR',
       'MDMAEVR', 'DMTEVR', 'KETMNEVR', 'TRTRHBOVN', 'TRTRHBOUT', 'TRTMHCTR',
       'TRTERYR', 'TRTCURRCV', 'TRTCURPRL', 'TRTGAPYR', 'MHTRTOYR',
       'MHTRTCLYR', 'MHTRTTHPY', 'MHTRTDRYR', 'MHTRTMDOUT', 'MHTRTHPPGM',
       'MHTRTHSPON', 'MHTRTALT'],
      dtype='object')

In [6]:
df.shape


Out[6]:
(57146, 87)

Frequency Counts


In [7]:
df['HEROINEVR'].value_counts()


Out[7]:
0    56190
1      956
Name: HEROINEVR, dtype: int64

In [8]:
df['AGECAT'].value_counts()


Out[8]:
2    14553
1    13585
4    11169
3     9084
5     8755
Name: AGECAT, dtype: int64

In [9]:
df.describe()


Out[9]:
AGECAT SEX MARRIED EDUCAT EMPLOY18 CTYMETRO HEALTH2 STDPYR HEPEVR CANCEVR ... TRTCURPRL TRTGAPYR MHTRTOYR MHTRTCLYR MHTRTTHPY MHTRTDRYR MHTRTMDOUT MHTRTHPPGM MHTRTHSPON MHTRTALT
count 57146.000000 57146.000000 57146.000000 57146.000000 57146.000000 57146.000000 57146.000000 57146.0 57146.0 57146.0 ... 57146.0 57146.000000 57146.0 57146.000000 57146.000000 57146.000000 57146.000000 57146.000000 57146.0 57146.0
mean 2.771743 0.532146 1.976219 3.052270 0.898786 2.236325 2.196672 0.0 0.0 0.0 ... 0.0 0.035873 0.0 0.015854 0.033091 0.008610 0.003587 0.001225 0.0 0.0
std 1.400627 0.498970 1.487020 1.442398 0.931240 0.772090 0.931390 0.0 0.0 0.0 ... 0.0 0.185975 0.0 0.124912 0.178875 0.092388 0.059787 0.034978 0.0 0.0
min 1.000000 0.000000 0.000000 1.000000 0.000000 1.000000 0.000000 0.0 0.0 0.0 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.0
25% 2.000000 0.000000 1.000000 2.000000 0.000000 2.000000 1.000000 0.0 0.0 0.0 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.0
50% 3.000000 1.000000 1.000000 3.000000 1.000000 2.000000 2.000000 0.0 0.0 0.0 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.0
75% 4.000000 1.000000 4.000000 4.000000 2.000000 3.000000 3.000000 0.0 0.0 0.0 ... 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.0
max 5.000000 1.000000 4.000000 5.000000 2.000000 3.000000 4.000000 0.0 0.0 0.0 ... 0.0 1.000000 0.0 1.000000 1.000000 1.000000 1.000000 1.000000 0.0 0.0

8 rows × 87 columns


In [10]:
df.keys()


Out[10]:
Index(['AGECAT', 'SEX', 'MARRIED', 'EDUCAT', 'EMPLOY18', 'CTYMETRO', 'HEALTH2',
       'STDPYR', 'HEPEVR', 'CANCEVR', 'HOSPYR', 'DEPMELT', 'DEPMEYR',
       'DEPMWKS', 'DEPWMOS', 'EMODSWKS', 'EMOPGOUT', 'EMOPSOC', 'EMOPWRK',
       'SUICTHT', 'SUICPLN', 'SUICATT', 'PRLUNDR', 'PRLUNDR30', 'PRLGRTYR',
       'PRLMISEVR', 'PRLMISYR', 'PRLMISMO', 'PRLOXYMSYR', 'PRLDEPYR',
       'PRLABSRY', 'PRLHIGH', 'HYDRCDYR', 'OXYCDPRYR', 'OXYCTNYR', 'TRMADLYR',
       'MORPHPRYR', 'FENTNYLYR', 'BUPRNRPHN', 'OXYMORPHN', 'DEMEROL',
       'HYDRMRPHN', 'HEROINEVR', 'HEROINYR', 'HEROINMO', 'HEROINAB',
       'METHADONE', 'HEROINFQY', 'TRQBENZODZ', 'TRQALPRZM', 'TRQLRZPM',
       'TRQCLNZPM', 'TRQDIAZPM', 'SDBENZDPN', 'SDTRZLM', 'SDTMZPM', 'SDBARBTS',
       'SDOTHYR', 'COCNEVR', 'COCNYR', 'COCNMO', 'CRACKEVR', 'CRACKYR',
       'AMPHTMNYR', 'METHEVR', 'METHYR', 'METHMO', 'HLCNEVR', 'LSDEVR',
       'MDMAEVR', 'DMTEVR', 'KETMNEVR', 'TRTRHBOVN', 'TRTRHBOUT', 'TRTMHCTR',
       'TRTERYR', 'TRTCURRCV', 'TRTCURPRL', 'TRTGAPYR', 'MHTRTOYR',
       'MHTRTCLYR', 'MHTRTTHPY', 'MHTRTDRYR', 'MHTRTMDOUT', 'MHTRTHPPGM',
       'MHTRTHSPON', 'MHTRTALT'],
      dtype='object')

Descriptive Statistics


In [11]:
columns = ['PRLMISEVR','HYDRCDYR']

for col in columns:
    print(df[[col]])


       PRLMISEVR
0              0
1              0
2              0
3              0
4              0
5              0
6              0
7              0
8              0
9              0
10             0
11             0
12             0
13             0
14             0
15             0
16             0
17             0
18             0
19             0
20             0
21             1
22             0
23             0
24             0
25             0
26             0
27             0
28             0
29             0
...          ...
57116          0
57117          0
57118          0
57119          0
57120          0
57121          1
57122          0
57123          0
57124          0
57125          0
57126          0
57127          0
57128          0
57129          0
57130          0
57131          0
57132          0
57133          0
57134          0
57135          1
57136          0
57137          0
57138          0
57139          0
57140          0
57141          0
57142          0
57143          0
57144          0
57145          0

[57146 rows x 1 columns]
       HYDRCDYR
0             0
1             0
2             1
3             0
4             1
5             0
6             0
7             0
8             0
9             0
10            0
11            0
12            0
13            0
14            0
15            0
16            0
17            0
18            0
19            0
20            1
21            0
22            0
23            0
24            1
25            0
26            0
27            0
28            1
29            0
...         ...
57116         1
57117         0
57118         0
57119         0
57120         0
57121         0
57122         1
57123         0
57124         0
57125         0
57126         1
57127         1
57128         0
57129         1
57130         0
57131         1
57132         0
57133         0
57134         0
57135         1
57136         0
57137         0
57138         0
57139         0
57140         0
57141         1
57142         0
57143         0
57144         0
57145         0

[57146 rows x 1 columns]

Crosstabs for Opioids Use by Age

  • Look at Prescription Opioid Use by Age
  • Look at Heroin Use by Age

In [12]:
prl_evr = pd.crosstab(df['PRLMISEVR'], df['AGECAT'])
prl_evr


Out[12]:
AGECAT 1 2 3 4 5
PRLMISEVR
0 12787 12426 7609 9826 8155
1 798 2127 1475 1343 600

In [13]:
prl_evr = pd.crosstab(df['HYDRCDYR'], df['AGECAT'])
prl_evr


Out[13]:
AGECAT 1 2 3 4 5
HYDRCDYR
0 12754 11617 6851 8388 6652
1 831 2936 2233 2781 2103

In [14]:
oxy_evr = pd.crosstab(df['OXYCDPRYR'], df['AGECAT'])
oxy_evr


Out[14]:
AGECAT 1 2 3 4 5
OXYCDPRYR
0 13040 12921 7952 9824 7711
1 545 1632 1132 1345 1044

In [15]:
prl_evr = pd.crosstab(df['TRMADLYR'], df['AGECAT'])
prl_evr


Out[15]:
AGECAT 1 2 3 4 5
TRMADLYR
0 13344 13800 8430 10340 8021
1 241 753 654 829 734

In [16]:
prl_evr = pd.crosstab(df['MORPHPRYR'], df['AGECAT'])
prl_evr


Out[16]:
AGECAT 1 2 3 4 5
MORPHPRYR
0 13334 14122 8848 10856 8469
1 251 431 236 313 286

In [17]:
prl_evr = pd.crosstab(df['FENTNYLYR'], df['AGECAT'])
prl_evr


Out[17]:
AGECAT 1 2 3 4 5
FENTNYLYR
0 13557 14456 9003 11073 8669
1 28 97 81 96 86

In [18]:
prl_evr = pd.crosstab(df['BUPRNRPHN'], df['AGECAT'])
prl_evr


Out[18]:
AGECAT 1 2 3 4 5
BUPRNRPHN
0 13542 14356 8917 11045 8704
1 43 197 167 124 51

In [19]:
prl_evr = pd.crosstab(df['DEMEROL'], df['AGECAT'])
prl_evr


Out[19]:
AGECAT 1 2 3 4 5
DEMEROL
0 13559 14479 9035 11105 8684
1 26 74 49 64 71

In [20]:
prl_evr = pd.crosstab(df['OXYMORPHN'], df['AGECAT'])
prl_evr


Out[20]:
AGECAT 1 2 3 4 5
OXYMORPHN
0 13539 14465 9027 11122 8714
1 46 88 57 47 41

In [21]:
prl_evr = pd.crosstab(df['HYDRMRPHN'], df['AGECAT'])
prl_evr


Out[21]:
AGECAT 1 2 3 4 5
HYDRMRPHN
0 13561 14459 8977 11051 8674
1 24 94 107 118 81

In [22]:
prl_evr = pd.crosstab(df['METHADONE'], df['AGECAT'])
prl_evr


Out[22]:
AGECAT 1 2 3 4 5
METHADONE
0 13553 14470 8988 11098 8709
1 32 83 96 71 46

In [23]:
prl_evr = pd.crosstab(df['HEROINEVR'], df['AGECAT'])
prl_evr


Out[23]:
AGECAT 1 2 3 4 5
HEROINEVR
0 13563 14292 8825 10919 8591
1 22 261 259 250 164

In [24]:
prl_evr = pd.crosstab(df['COCNEVR'], df['AGECAT'])
prl_evr


Out[24]:
AGECAT 1 2 3 4 5
COCNEVR
0 13476 12908 7458 9215 7349
1 109 1645 1626 1954 1406

In [25]:
prl_evr = pd.crosstab(df['METHEVR'], df['AGECAT'])
prl_evr


Out[25]:
AGECAT 1 2 3 4 5
METHEVR
0 13543 14072 8384 10271 8263
1 42 481 700 898 492

In [26]:
prl_evr = pd.crosstab(df['TRQBENZODZ'], df['AGECAT'])
prl_evr


Out[26]:
AGECAT 1 2 3 4 5
TRQBENZODZ
0 13180 13084 8020 9764 7622
1 405 1469 1064 1405 1133

In [27]:
prl_evr = pd.crosstab(df['SDOTHYR'], df['AGECAT'])
prl_evr


Out[27]:
AGECAT 1 2 3 4 5
SDOTHYR
0 13381 14311 8927 10913 8529
1 204 242 157 256 226

In [28]:
prl_evr = pd.crosstab(df['AMPHTMNYR'], df['AGECAT'])
prl_evr


Out[28]:
AGECAT 1 2 3 4 5
AMPHTMNYR
0 12653 12717 8457 10786 8591
1 932 1836 627 383 164

In [29]:
hlc_evr = pd.crosstab(df['HLCNEVR'], df['AGECAT'])
hlc_evr


Out[29]:
AGECAT 1 2 3 4 5
HLCNEVR
0 13135 11893 7064 9042 7558
1 450 2660 2020 2127 1197

In [30]:
hlc_evr = pd.crosstab(df['LSDEVR'], df['AGECAT'])
hlc_evr


Out[30]:
AGECAT 1 2 3 4 5
LSDEVR
0 13395 13439 8210 9727 7848
1 190 1114 874 1442 907

In [31]:
hlc_evr = pd.crosstab(df['MDMAEVR'], df['AGECAT'])
hlc_evr


Out[31]:
AGECAT 1 2 3 4 5
MDMAEVR
0 13386 12686 7681 10222 8606
1 199 1867 1403 947 149

Depression, Mental Health, Treatment


In [32]:
mde_evr = pd.crosstab(df['HOSPYR'], df['AGECAT'])
mde_evr


Out[32]:
AGECAT 1 2 3 4 5
HOSPYR
0.0 12855 13404 8263 10279 7582
1.0 730 1149 821 890 1173

In [33]:
mde_evr = pd.crosstab(df['DEPMELT'], df['AGECAT'])
mde_evr


Out[33]:
AGECAT 1 2 3 4 5
DEPMELT
0.0 13585 12140 7689 9403 7788
1.0 0 2413 1395 1766 967

In [34]:
mde_evr = pd.crosstab(df['SUICTHT'], df['AGECAT'])
mde_evr


Out[34]:
AGECAT 1 2 3 4 5
SUICTHT
0.0 13585 14553 9084 11169 8755

In [35]:
trtm_evr = pd.crosstab(df['MHTRTOYR'], df['AGECAT'])
trtm_evr


Out[35]:
AGECAT 1 2 3 4 5
MHTRTOYR
0.0 13585 14553 9084 11169 8755

In [36]:
trtm_evr = pd.crosstab(df['MHTRTTHPY'], df['AGECAT'])
trtm_evr


Out[36]:
AGECAT 1 2 3 4 5
MHTRTTHPY
0.0 13585 13961 8650 10615 8444
1.0 0 592 434 554 311

In [37]:
trtm_evr = pd.crosstab(df['TRTGAPYR'], df['AGECAT'])
trtm_evr


Out[37]:
AGECAT 1 2 3 4 5
TRTGAPYR
0 13116 13622 8763 10930 8665
1 469 931 321 239 90

Crosstabulation


In [38]:
her_evr = pd.crosstab(df['HEROINEVR'], df['AGECAT'])
her_evr


Out[38]:
AGECAT 1 2 3 4 5
HEROINEVR
0 13563 14292 8825 10919 8591
1 22 261 259 250 164

In [39]:
df.drop('SUICATT', 1, inplace=True)
df.keys()


Out[39]:
Index(['AGECAT', 'SEX', 'MARRIED', 'EDUCAT', 'EMPLOY18', 'CTYMETRO', 'HEALTH2',
       'STDPYR', 'HEPEVR', 'CANCEVR', 'HOSPYR', 'DEPMELT', 'DEPMEYR',
       'DEPMWKS', 'DEPWMOS', 'EMODSWKS', 'EMOPGOUT', 'EMOPSOC', 'EMOPWRK',
       'SUICTHT', 'SUICPLN', 'PRLUNDR', 'PRLUNDR30', 'PRLGRTYR', 'PRLMISEVR',
       'PRLMISYR', 'PRLMISMO', 'PRLOXYMSYR', 'PRLDEPYR', 'PRLABSRY', 'PRLHIGH',
       'HYDRCDYR', 'OXYCDPRYR', 'OXYCTNYR', 'TRMADLYR', 'MORPHPRYR',
       'FENTNYLYR', 'BUPRNRPHN', 'OXYMORPHN', 'DEMEROL', 'HYDRMRPHN',
       'HEROINEVR', 'HEROINYR', 'HEROINMO', 'HEROINAB', 'METHADONE',
       'HEROINFQY', 'TRQBENZODZ', 'TRQALPRZM', 'TRQLRZPM', 'TRQCLNZPM',
       'TRQDIAZPM', 'SDBENZDPN', 'SDTRZLM', 'SDTMZPM', 'SDBARBTS', 'SDOTHYR',
       'COCNEVR', 'COCNYR', 'COCNMO', 'CRACKEVR', 'CRACKYR', 'AMPHTMNYR',
       'METHEVR', 'METHYR', 'METHMO', 'HLCNEVR', 'LSDEVR', 'MDMAEVR', 'DMTEVR',
       'KETMNEVR', 'TRTRHBOVN', 'TRTRHBOUT', 'TRTMHCTR', 'TRTERYR',
       'TRTCURRCV', 'TRTCURPRL', 'TRTGAPYR', 'MHTRTOYR', 'MHTRTCLYR',
       'MHTRTTHPY', 'MHTRTDRYR', 'MHTRTMDOUT', 'MHTRTHPPGM', 'MHTRTHSPON',
       'MHTRTALT'],
      dtype='object')

In [40]:
heroin_sex = pd.crosstab(df['HEROINEVR'], df['SEX'])
heroin_sex


Out[40]:
SEX 0 1
HEROINEVR
0 26166 30024
1 570 386

In [41]:
opioid_evr = pd.crosstab(df['HEROINEVR'], df['PRLMISEVR'])
opioid_evr


Out[41]:
PRLMISEVR 0 1
HEROINEVR
0 50489 5701
1 314 642

In [42]:
opioid_pct = opioid_evr.div(opioid_evr.sum(1), axis=0)
opioid_pct


Out[42]:
PRLMISEVR 0 1
HEROINEVR
0 0.898541 0.101459
1 0.328452 0.671548

In [43]:
opioid_pct.plot.bar()
plt.savefig('opioids.png', bbox_inches='tight')


Plot Prescription Opioid use by AgeCat

  • Create crosstabulation table
  • Normalize so that each row sums to one and make a plot

In [47]:
prl_any = pd.crosstab(df['AGECAT'],df['PRLMISEVR'])
prl_any


Out[47]:
PRLMISEVR 0 1
AGECAT
1 12787 798
2 12426 2127
3 7609 1475
4 9826 1343
5 8155 600

In [48]:
prl_any_pct = prl_any.div(prl_any.sum(1), axis=0)
prl_any_pct


Out[48]:
PRLMISEVR 0 1
AGECAT
1 0.941259 0.058741
2 0.853845 0.146155
3 0.837627 0.162373
4 0.879756 0.120244
5 0.931468 0.068532

In [49]:
prl_any_pct.plot.bar()


Out[49]:
<matplotlib.axes._subplots.AxesSubplot at 0x11c59cf60>

In [50]:
df.keys()


Out[50]:
Index(['AGECAT', 'SEX', 'MARRIED', 'EDUCAT', 'EMPLOY18', 'CTYMETRO', 'HEALTH2',
       'STDPYR', 'HEPEVR', 'CANCEVR', 'HOSPYR', 'DEPMELT', 'DEPMEYR',
       'DEPMWKS', 'DEPWMOS', 'EMODSWKS', 'EMOPGOUT', 'EMOPSOC', 'EMOPWRK',
       'SUICTHT', 'SUICPLN', 'PRLUNDR', 'PRLUNDR30', 'PRLGRTYR', 'PRLMISEVR',
       'PRLMISYR', 'PRLMISMO', 'PRLOXYMSYR', 'PRLDEPYR', 'PRLABSRY', 'PRLHIGH',
       'HYDRCDYR', 'OXYCDPRYR', 'OXYCTNYR', 'TRMADLYR', 'MORPHPRYR',
       'FENTNYLYR', 'BUPRNRPHN', 'OXYMORPHN', 'DEMEROL', 'HYDRMRPHN',
       'HEROINEVR', 'HEROINYR', 'HEROINMO', 'HEROINAB', 'METHADONE',
       'HEROINFQY', 'TRQBENZODZ', 'TRQALPRZM', 'TRQLRZPM', 'TRQCLNZPM',
       'TRQDIAZPM', 'SDBENZDPN', 'SDTRZLM', 'SDTMZPM', 'SDBARBTS', 'SDOTHYR',
       'COCNEVR', 'COCNYR', 'COCNMO', 'CRACKEVR', 'CRACKYR', 'AMPHTMNYR',
       'METHEVR', 'METHYR', 'METHMO', 'HLCNEVR', 'LSDEVR', 'MDMAEVR', 'DMTEVR',
       'KETMNEVR', 'TRTRHBOVN', 'TRTRHBOUT', 'TRTMHCTR', 'TRTERYR',
       'TRTCURRCV', 'TRTCURPRL', 'TRTGAPYR', 'MHTRTOYR', 'MHTRTCLYR',
       'MHTRTTHPY', 'MHTRTDRYR', 'MHTRTMDOUT', 'MHTRTHPPGM', 'MHTRTHSPON',
       'MHTRTALT'],
      dtype='object')