Chapter 7 in 'Python for Data Analysis' by Wes McKinney (2017, O'Reilly)
Chapter 3 in 'Python Data Science Handbook' by Jake VanderPlas (2016, O'Reilly)
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
file = pd.read_table('NSDUH-2015.tsv', low_memory=False)
data = pd.DataFrame(file)
In [3]:
data.shape
Out[3]:
In [4]:
df = pd.DataFrame(data, columns=['QUESTID2', 'CATAG6', 'IRSEX','IRMARITSTAT',
'EDUHIGHCAT', 'IRWRKSTAT18', 'COUTYP2', 'HEALTH2','STDANYYR1',
'HEPBCEVER1','HIVAIDSEV1','CANCEREVR1','INHOSPYR','AMDELT',
'AMDEYR','ADDPR2WK1','ADWRDST1','DSTWORST1','IMPGOUTM1',
'IMPSOCM1','IMPRESPM1','SUICTHNK1','SUICPLAN1','SUICTRY1',
'PNRNMLIF','PNRNM30D','PNRWYGAMT','PNRNMFLAG','PNRNMYR',
'PNRNMMON','OXYCNNMYR','DEPNDPYPNR','ABUSEPYPNR','PNRRSHIGH',
'HYDCPDAPYU','OXYCPDAPYU','OXCNANYYR2','TRAMPDAPYU','MORPPDAPYU',
'FENTPDAPYU','BUPRPDAPYU','OXYMPDAPYU','DEMEPDAPYU','HYDMPDAPYU',
'HERFLAG','HERYR','HERMON','ABODHER', 'MTDNPDAPYU',
'IRHERFY','TRBENZAPYU','ALPRPDAPYU','LORAPDAPYU','CLONPDAPYU',
'DIAZPDAPYU','SVBENZAPYU','TRIAPDAPYU','TEMAPDAPYU','BARBITAPYU',
'SEDOTANYR2','COCFLAG','COCYR','COCMON','CRKFLAG',
'CRKYR','AMMEPDAPYU','METHAMFLAG','METHAMYR','METHAMMON',
'HALLUCFLAG','LSDFLAG','ECSTMOFLAG','DAMTFXFLAG','KETMINFLAG',
'TXYRRESOV1','TXYROUTPT1','TXYRMHCOP1','TXYREMRGN1','TXCURRENT1',
'TXLTYPNRL1','TXYRNOSPIL','AUOPTYR1','MHLMNT3','MHLTHER3',
'MHLDOC3','MHLCLNC3','MHLDTMT3','AUINPYR1','AUALTYR1'])
df.shape
Out[4]:
In [5]:
df.head()
Out[5]:
In [6]:
df.tail()
Out[6]:
In [7]:
df.replace([83, 85, 91, 93, 94, 97, 98, 99, 991, 993], np.nan, inplace=True)
df.fillna(0, inplace=True)
df.head()
Out[7]:
Order matters here, because some features were saved as new variables
2=0
:
['STDANYYR1','HEPBCEVER1', 'HIVAIDSEV1', 'CANCEREVR1', 'INHOSPYR ',
'AMDELT','AMDEYR','ADDPR2WK1','DSTWORST1', 'IMPGOUTM1',
'IMPSOCM1','IMPRESPM1','SUICTHNK1','SUICPLAN1','SUICTRY1',
'PNRNMLIF','PNRNM30D','PNRWYGAMT','PNRWYGAMT','PNRRSHIGH'
'TXYRRESOV1','TXYROUTPT1','TXYRMHCOP1','TXYREMRGN1', 'TXCURRENT1',
'TXLTYPNRL1','AUOPTYR1','AUINPYR1','AUALTYR1']
3=1
: ['PNRRSHIGH', 'TXLTYPNRL1','TXYREMRGN1', 'AUOPTYR1','AUALTYR1']
5=1
: ['TXYRRESOV1', 'TXYROUTPT1','TXYRMHCOP1']
6=0
: TXLTYPNRL
male=0
, female=1
: IRSEX
1=4
, 2=3
, 3=2
, 4=1
: IRMARITSTAT
5=0
: EDUHIGHCAT
1=2
, 2=1
, 3=0
, 4=0
: IRWRKSTAT18
1=3
, 3=1
: COUTYP2
: 1=0
, 2=1
, 3=2
, 4=3
: ADWRDST1
In [8]:
columns = ['STDANYYR1','HEPBCEVER1', 'HIVAIDSEV1', 'CANCEREVR1', 'INHOSPYR ',
'AMDELT','AMDEYR','ADDPR2WK1','DSTWORST1', 'IMPGOUTM1',
'IMPSOCM1','IMPRESPM1','SUICTHNK1','SUICPLAN1','SUICTRY1',
'PNRNMLIF','PNRNM30D','PNRWYGAMT','PNRWYGAMT','PNRRSHIGH'
'TXYRRESOV1','TXYROUTPT1','TXYRMHCOP1','TXYREMRGN1', 'TXCURRENT1',
'TXLTYPNRL1','AUOPTYR1','AUINPYR1','AUALTYR1']
for col in df:
df[col].replace(2,0,inplace=True)
df.head()
Out[8]:
In [9]:
col = ['PNRRSHIGH', 'TXLTYPNRL1', 'TXYREMRGN1', 'AUOPTYR1','AUALTYR1']
for col in df:
df[col].replace(3,1,inplace=True)
df.head()
Out[9]:
In [10]:
df['SEX'] = df['IRSEX'].replace([1,2], [0,1])
df['MARRIED'] = df['IRMARITSTAT'].replace([1,2,3,4], [4,3,2,1])
df['EDUCAT'] = df['EDUHIGHCAT'].replace([1,2,3,4,5], [2,3,4,5,1])
df['EMPLOY18'] = df['IRWRKSTAT18'].replace([1,2,3,4], [2,1,0,0])
df['CTYMETRO'] = df['COUTYP2'].replace([1,2,3],[3,2,1])
df['EMODSWKS'] = df['ADWRDST1'].replace([1,2,3,4], [0,1,2,3])
df['TXLTPNRL'] = df['TXLTYPNRL1'].replace(6,0)
df['TXYRRESOV'] = df['TXYRRESOV1'].replace(5,1)
df['TXYROUTPT'] = df['TXYROUTPT1'].replace(5,1)
df['TXYRMHCOP'] = df['TXYRMHCOP1'].replace(5,1)
df.head()
Out[10]:
In [11]:
df.shape
Out[11]:
In [12]:
df.columns
Out[12]:
In [13]:
df = df.rename(columns={'QUESTID2':'QID','CATAG6':'AGECAT',
'STDANYYR1':'STDPYR','HEPBCEVER1':'HEPEVR','CANCEREVR1':'CANCEVR','INHOSPYR':'HOSPYR',
'AMDELT':'DEPMELT','AMDEYR':'DEPMEYR','ADDPR2WK1':'DEPMWKS','DSTWORST1':'DEPWMOS',
'IMPGOUTM1':'EMOPGOUT','IMPSOCM1':'EMOPSOC','IMPRESPM1':'EMOPWRK',
'SUICTHNK1':'SUICTHT','SUICPLAN1':'SUICPLN','SUICTRY1':'SUICATT',
'PNRNMLIF':'PRLUNDR','PNRNM30D':'PRLUNDR30','PNRWYGAMT':'PRLGRTYR',
'PNRNMFLAG':'PRLMISEVR','PNRNMYR':'PRLMISYR','PNRNMMON':'PRLMISMO',
'OXYCNNMYR':'PRLOXYMSYR','DEPNDPYPNR':'PRLDEPYR','ABUSEPYPNR':'PRLABSRY',
'PNRRSHIGH':'PRLHIGH','HYDCPDAPYU':'HYDRCDYR','OXYCPDAPYU':'OXYCDPRYR',
'OXCNANYYR2':'OXYCTNYR','TRAMPDAPYU':'TRMADLYR','MORPPDAPYU':'MORPHPRYR',
'FENTPDAPYU':'FENTNYLYR','BUPRPDAPYU':'BUPRNRPHN','OXYMPDAPYU':'OXYMORPHN',
'DEMEPDAPYU':'DEMEROL','HYDMPDAPYU':'HYDRMRPHN','HERFLAG':'HEROINEVR',
'HERYR':'HEROINYR', 'HERMON':'HEROINMO','ABODHER':'HEROINAB',
'MTDNPDAPYU':'METHADONE','IRHERFY':'HEROINFQY',
'TRBENZAPYU':'TRQBENZODZ','ALPRPDAPYU':'TRQALPRZM','LORAPDAPYU':'TRQLRZPM',
'CLONPDAPYU':'TRQCLNZPM','DIAZPDAPYU':'TRQDIAZPM','SVBENZAPYU':'SDBENZDPN',
'TRIAPDAPYU':'SDTRZLM','TEMAPDAPYU':'SDTMZPM','BARBITAPYU':'SDBARBTS',
'SEDOTANYR2':'SDOTHYR','COCFLAG':'COCNEVR','COCYR':'COCNYR','COCMON':'COCNMO',
'CRKFLAG':'CRACKEVR','CRKYR':'CRACKYR','AMMEPDAPYU':'AMPHTMNYR',
'METHAMFLAG':'METHEVR','METHAMYR':'METHYR','METHAMMON':'METHMO',
'HALLUCFLAG':'HLCNEVR','LSDFLAG':'LSDEVR','ECSTMOFLAG':'MDMAEVR',
'DAMTFXFLAG':'DMTEVR','KETMINFLAG':'KETMNEVR',
'TXYRRESOV':'TRTRHBOVN','TXYROUTPT':'TRTRHBOUT','TXYRMHCOP':'TRTMHCTR',
'TXYREMRGN1':'TRTERYR','TXCURRENT1':'TRTCURRCV','TXLTPNRL':'TRTCURPRL',
'TXYRNOSPIL':'TRTGAPYR','AUOPTYR1':'MHTRTOYR','MHLMNT3':'MHTRTCLYR',
'MHLTHER3':'MHTRTTHPY','MHLDOC3':'MHTRTDRYR', 'MHLCLNC3':'MHTRTMDOUT',
'MHLDTMT3':'MHTRTHPPGM','AUINPYR1':'MHTRTHSPON','AUALTYR1':'MHTRTALT'})
df.shape
Out[13]:
In [14]:
df1 = df[['QID','AGECAT','SEX', 'MARRIED', 'EDUCAT',
'EMPLOY18','CTYMETRO','HEALTH2','STDPYR','HEPEVR','CANCEVR','HOSPYR',
'DEPMELT','DEPMEYR','DEPMWKS','DEPWMOS','EMODSWKS','EMOPGOUT',
'EMOPSOC','EMOPWRK','SUICTHT','SUICPLN','SUICATT',
'PRLUNDR','PRLUNDR30','PRLGRTYR','PRLMISEVR','PRLMISYR',
'PRLMISMO','PRLOXYMSYR','PRLDEPYR','PRLABSRY','PRLHIGH',
'HYDRCDYR','OXYCDPRYR','OXYCTNYR','TRMADLYR','MORPHPRYR',
'FENTNYLYR','BUPRNRPHN','OXYMORPHN','DEMEROL','HYDRMRPHN',
'HEROINEVR','HEROINYR','HEROINMO','HEROINAB','METHADONE','HEROINFQY',
'TRQBENZODZ','TRQALPRZM','TRQLRZPM','TRQCLNZPM','TRQDIAZPM',
'SDBENZDPN','SDTRZLM','SDTMZPM','SDBARBTS','SDOTHYR',
'COCNEVR','COCNYR','COCNMO','CRACKEVR','CRACKYR',
'AMPHTMNYR','METHEVR','METHYR','METHMO',
'HLCNEVR','LSDEVR','MDMAEVR','DMTEVR','KETMNEVR',
'TRTRHBOVN','TRTRHBOUT','TRTMHCTR','TRTERYR','TRTCURRCV',
'TRTCURPRL','TRTGAPYR','MHTRTOYR','MHTRTCLYR','MHTRTTHPY',
'MHTRTDRYR','MHTRTMDOUT','MHTRTHPPGM','MHTRTHSPON','MHTRTALT']]
df1.shape
Out[14]:
In [15]:
df1.head()
Out[15]:
In [16]:
df1.to_csv('nsduh-dataset.csv', sep=',', encoding='utf-8')