In [1]:
    
import requests, zipfile, io
import pandas as pd
URL = 'http://samhda.s3-us-gov-west-1.amazonaws.com/s3fs-public/field-uploads-protected/studies/NSDUH-2015/NSDUH-2015-datasets/NSDUH-2015-DS0001/NSDUH-2015-DS0001-bundles-with-study-info/NSDUH-2015-DS0001-bndl-data-tsv.zip'
def get_data():
    r = requests.get('http://samhda.s3-us-gov-west-1.amazonaws.com/s3fs-public/field-uploads-protected/studies/NSDUH-2015/NSDUH-2015-datasets/NSDUH-2015-DS0001/NSDUH-2015-DS0001-bundles-with-study-info/NSDUH-2015-DS0001-bndl-data-tsv.zip')
    z = zipfile.ZipFile(io.BytesIO(r.content))
    z.extractall()
    file = pd.read_table('~/NSDUH-2015-DS0001-bndl-data-tsv/NSDUH-2015-DS0001-data/NSDUH-2015-DS0001-data-excel.tsv', low_memory=False)
    data = pd.DataFrame(file)
    print(data.shape)
    
    data.to_csv('nsduh15-dataset.csv', sep=',', encoding='utf-8')
    
get_data()
    
    
In [2]:
    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
    
In [3]:
    
file = pd.read_csv('nsduh15-dataset.csv', low_memory=False)
data = pd.DataFrame(file)
    
In [4]:
    
data.shape
    
    Out[4]:
In [5]:
    
df = pd.DataFrame(data, columns=['QUESTID2', 'CATAG6', 'IRSEX','IRMARITSTAT',
        'EDUHIGHCAT', 'IRWRKSTAT18', 'COUTYP2', 'HEALTH2','STDANYYR1',
        'HEPBCEVER1','HIVAIDSEV1','CANCEREVR1','INHOSPYR','AMDELT',
        'AMDEYR','ADDPR2WK1','ADWRDST1','DSTWORST1','IMPGOUTM1',
        'IMPSOCM1','IMPRESPM1','SUICTHNK1','SUICPLAN1','SUICTRY1',
        'PNRNMLIF','PNRNM30D','PNRWYGAMT','PNRNMFLAG','PNRNMYR',
        'PNRNMMON','OXYCNNMYR','DEPNDPYPNR','ABUSEPYPNR','PNRRSHIGH',
        'HYDCPDAPYU','OXYCPDAPYU','OXCNANYYR2','TRAMPDAPYU','MORPPDAPYU',
        'FENTPDAPYU','BUPRPDAPYU','OXYMPDAPYU','DEMEPDAPYU','HYDMPDAPYU',
        'HERFLAG','HERYR','HERMON','ABODHER', 'MTDNPDAPYU',
        'IRHERFY','TRBENZAPYU','ALPRPDAPYU','LORAPDAPYU','CLONPDAPYU',
        'DIAZPDAPYU','SVBENZAPYU','TRIAPDAPYU','TEMAPDAPYU','BARBITAPYU',
        'SEDOTANYR2','COCFLAG','COCYR','COCMON','CRKFLAG',
        'CRKYR','AMMEPDAPYU','METHAMFLAG','METHAMYR','METHAMMON',
        'HALLUCFLAG','LSDFLAG','ECSTMOFLAG','DAMTFXFLAG','KETMINFLAG',
        'TXYRRESOV1','TXYROUTPT1','TXYRMHCOP1','TXYREMRGN1','TXCURRENT1',
        'TXLTYPNRL1','TXYRNOSPIL','AUOPTYR1','MHLMNT3','MHLTHER3',
        'MHLDOC3','MHLCLNC3','MHLDTMT3','AUINPYR1','AUALTYR1'])
df.shape
    
    Out[5]:
In [6]:
    
df.head()
    
    Out[6]:
In [7]:
    
df.tail()
    
    Out[7]:
In [8]:
    
df.replace([83, 85, 91, 93, 94, 97, 98, 99, 991, 993], np.nan, inplace=True)
df.fillna(0, inplace=True)
df.head()
    
    Out[8]:
Order matters here, because some variables were recoded into new variables
2=0: 
['STDANYYR1','HEPBCEVER1', 'HIVAIDSEV1', 'CANCEREVR1', 'INHOSPYR ',
'AMDELT','AMDEYR','ADDPR2WK1','DSTWORST1', 'IMPGOUTM1',
'IMPSOCM1','IMPRESPM1','SUICTHNK1','SUICPLAN1','SUICTRY1',
'PNRNMLIF','PNRNM30D','PNRWYGAMT','PNRRSHIGH'
'TXYRRESOV1','TXYROUTPT1','TXYRMHCOP1','TXYREMRGN1', 'TXCURRENT1', 
'TXLTYPNRL1','AUOPTYR1','AUINPYR1','AUALTYR1'] ['PNRRSHIGH', 'TXLTYPNRL1','TXYREMRGN1', 'AUOPTYR1','AUALTYR1']: 3=1['TXYRRESOV1', 'TXYROUTPT1','TXYRMHCOP1']: 5=1TXLTYPNRL: 6=0IRSEX to male=0, female=1IRMARITSTAT: 1=4, 2=3, 3=2, 4=1EDUHIGHCAT: 5=0IRWRKSTAT18: 1=2, 2=1, 3=0, 4=0COUTYP2: 1=3, 3=1ADWRDST1: 1=0, 2=1, 3=2, 4=3
In [9]:
    
df['STDANYYR1'].replace(2,0,inplace=True)
df['HEPBCEVER1'].replace(2,0,inplace=True)
df['HIVAIDSEV1'].replace(2,0,inplace=True)
df['CANCEREVR1'].replace(2,0,inplace=True)
df['INHOSPYR'].replace(2,0,inplace=True)
df['AMDELT'].replace(2,0,inplace=True)
df['AMDEYR'].replace(2,0,inplace=True)
df['ADDPR2WK1'].replace(2,0,inplace=True)
df['DSTWORST1'].replace(2,0,inplace=True)
df['IMPGOUTM1'].replace(2,0,inplace=True)
df['IMPSOCM1'].replace(2,0,inplace=True)
df['IMPRESPM1'].replace(2,0,inplace=True)
df['SUICTHNK1'].replace(2,0,inplace=True)
df['SUICPLAN1'].replace(2,0,inplace=True)
df['SUICTRY1'].replace(2,0,inplace=True)
df['PNRNMLIF'].replace(2,0,inplace=True)
df['PNRNM30D'].replace(2,0,inplace=True)
df['PNRWYGAMT'].replace(2,0,inplace=True)
df['PNRRSHIGH'].replace(2,0,inplace=True)
df['TXYRRESOV1'].replace(2,0,inplace=True)
df['TXYROUTPT1'].replace(2,0,inplace=True)
df['TXYRMHCOP1'].replace(2,0,inplace=True)
df['TXYREMRGN1'].replace(2,0,inplace=True)
df['TXCURRENT1'].replace(2,0,inplace=True)
df['TXLTYPNRL1'].replace(2,0,inplace=True)
df['AUOPTYR1'].replace(2,0,inplace=True)
df['AUINPYR1'].replace(2,0,inplace=True)
df['AUALTYR1'].replace(2,0,inplace=True)
df.head()
    
    Out[9]:
In [10]:
    
df['PNRRSHIGH'].replace(3,1,inplace=True)
df['TXLTYPNRL1'].replace(3,1,inplace=True)
df['TXYREMRGN1'].replace(3,1,inplace=True)
df['AUOPTYR1'].replace(3,1,inplace=True)
df['AUALTYR1'].replace(3,1,inplace=True)
df.head()
    
    Out[10]:
In [11]:
    
df['SEX'] = df['IRSEX'].replace([1,2], [0,1])
df['MARRIED'] = df['IRMARITSTAT'].replace([1,2,3,4], [4,3,2,1])
df['EDUCAT'] = df['EDUHIGHCAT'].replace([1,2,3,4,5], [2,3,4,5,1])
df['EMPLOY18'] = df['IRWRKSTAT18'].replace([1,2,3,4], [2,1,0,0])
df['CTYMETRO'] = df['COUTYP2'].replace([1,2,3],[3,2,1])
df['EMODSWKS'] = df['ADWRDST1'].replace([1,2,3,4], [0,1,2,3])
df['TXLTPNRL'] = df['TXLTYPNRL1'].replace(6,0)
df['TXYRRESOV'] = df['TXYRRESOV1'].replace(5,1)
df['TXYROUTPT'] = df['TXYROUTPT1'].replace(5,1)
df['TXYRMHCOP'] = df['TXYRMHCOP1'].replace(5,1)
df.head()
    
    Out[11]:
In [12]:
    
df.shape
    
    Out[12]:
In [13]:
    
df.columns
    
    Out[13]:
In [14]:
    
df =  df.rename(columns={'QUESTID2':'QID','CATAG6':'AGECAT',
     'STDANYYR1':'STDPYR','HEPBCEVER1':'HEPEVR','CANCEREVR1':'CANCEVR','INHOSPYR':'HOSPYR', 
     'AMDELT':'DEPMELT','AMDEYR':'DEPMEYR','ADDPR2WK1':'DEPMWKS','DSTWORST1':'DEPWMOS',
     'IMPGOUTM1':'EMOPGOUT','IMPSOCM1':'EMOPSOC','IMPRESPM1':'EMOPWRK',
     'SUICTHNK1':'SUICTHT','SUICPLAN1':'SUICPLN','SUICTRY1':'SUICATT',
     'PNRNMLIF':'PRLUNDR','PNRNM30D':'PRLUNDR30','PNRWYGAMT':'PRLGRTYR',
     'PNRNMFLAG':'PRLMISEVR','PNRNMYR':'PRLMISYR','PNRNMMON':'PRLMISMO',
     'OXYCNNMYR':'PRLOXYMSYR','DEPNDPYPNR':'PRLDEPYR','ABUSEPYPNR':'PRLABSRY',     
     'PNRRSHIGH':'PRLHIGH','HYDCPDAPYU':'HYDRCDYR','OXYCPDAPYU':'OXYCDPRYR', 
     'OXCNANYYR2':'OXYCTNYR','TRAMPDAPYU':'TRMADLYR','MORPPDAPYU':'MORPHPRYR',
     'FENTPDAPYU':'FENTNYLYR','BUPRPDAPYU':'BUPRNRPHN','OXYMPDAPYU':'OXYMORPHN',
     'DEMEPDAPYU':'DEMEROL','HYDMPDAPYU':'HYDRMRPHN','HERFLAG':'HEROINEVR',
     'HERYR':'HEROINYR', 'HERMON':'HEROINMO','ABODHER':'HEROINAB',
     'MTDNPDAPYU':'METHADONE','IRHERFY':'HEROINFQY',
     'TRBENZAPYU':'TRQBENZODZ','ALPRPDAPYU':'TRQALPRZM','LORAPDAPYU':'TRQLRZPM',
     'CLONPDAPYU':'TRQCLNZPM','DIAZPDAPYU':'TRQDIAZPM','SVBENZAPYU':'SDBENZDPN',
     'TRIAPDAPYU':'SDTRZLM','TEMAPDAPYU':'SDTMZPM','BARBITAPYU':'SDBARBTS', 
     'SEDOTANYR2':'SDOTHYR','COCFLAG':'COCNEVR','COCYR':'COCNYR','COCMON':'COCNMO',
     'CRKFLAG':'CRACKEVR','CRKYR':'CRACKYR','AMMEPDAPYU':'AMPHTMNYR', 
     'METHAMFLAG':'METHEVR','METHAMYR':'METHYR','METHAMMON':'METHMO',
     'HALLUCFLAG':'HLCNEVR','LSDFLAG':'LSDEVR','ECSTMOFLAG':'MDMAEVR',
     'DAMTFXFLAG':'DMTEVR','KETMINFLAG':'KETMNEVR', 
     'TXYRRESOV':'TRTRHBOVN','TXYROUTPT':'TRTRHBOUT','TXYRMHCOP':'TRTMHCTR',
     'TXYREMRGN1':'TRTERYR','TXCURRENT1':'TRTCURRCV','TXLTPNRL':'TRTCURPRL',
     'TXYRNOSPIL':'TRTGAPYR','AUOPTYR1':'MHTRTOYR','MHLMNT3':'MHTRTCLYR',
     'MHLTHER3':'MHTRTTHPY','MHLDOC3':'MHTRTDRYR', 'MHLCLNC3':'MHTRTMDOUT',
     'MHLDTMT3':'MHTRTHPPGM','AUINPYR1':'MHTRTHSPON','AUALTYR1':'MHTRTALT'})
     
df.shape
    
    Out[14]:
In [15]:
    
df1 = df[['QID','AGECAT','SEX', 'MARRIED', 'EDUCAT', 
     'EMPLOY18','CTYMETRO','HEALTH2','STDPYR','HEPEVR','CANCEVR','HOSPYR', 
     'DEPMELT','DEPMEYR','DEPMWKS','DEPWMOS','EMODSWKS','EMOPGOUT',
     'EMOPSOC','EMOPWRK','SUICTHT','SUICPLN','SUICATT',
     'PRLUNDR','PRLUNDR30','PRLGRTYR','PRLMISEVR','PRLMISYR',
     'PRLMISMO','PRLOXYMSYR','PRLDEPYR','PRLABSRY','PRLHIGH',
     'HYDRCDYR','OXYCDPRYR','OXYCTNYR','TRMADLYR','MORPHPRYR',
     'FENTNYLYR','BUPRNRPHN','OXYMORPHN','DEMEROL','HYDRMRPHN',
     'HEROINEVR','HEROINYR','HEROINMO','HEROINAB','METHADONE','HEROINFQY',
     'TRQBENZODZ','TRQALPRZM','TRQLRZPM','TRQCLNZPM','TRQDIAZPM',
     'SDBENZDPN','SDTRZLM','SDTMZPM','SDBARBTS','SDOTHYR',
     'COCNEVR','COCNYR','COCNMO','CRACKEVR','CRACKYR',
     'AMPHTMNYR','METHEVR','METHYR','METHMO',
     'HLCNEVR','LSDEVR','MDMAEVR','DMTEVR','KETMNEVR', 
     'TRTRHBOVN','TRTRHBOUT','TRTMHCTR','TRTERYR','TRTCURRCV',
     'TRTCURPRL','TRTGAPYR','MHTRTOYR','MHTRTCLYR','MHTRTTHPY',
     'MHTRTDRYR','MHTRTMDOUT','MHTRTHPPGM','MHTRTHSPON','MHTRTALT']]
df1.shape
    
    Out[15]:
In [16]:
    
df1.head()
    
    Out[16]:
In [17]:
    
df1.to_csv('nsduh-2015.csv', sep=',', encoding='utf-8')
    
Several ways to create new variables based on sum of related columns:
df['C'] = df['A'] + df['B']df['C'] =  df[['A', 'B']].sum(axis=1)df['C'] = df.apply(lambda row: row['A']+row['B'], axis=1)
In [18]:
    
df1['HEALTH'] = df1['HEALTH2']+df1['STDPYR']+df1['HEPEVR']+df1['CANCEVR']+df1['HOSPYR']
    
    
In [20]:
    
df1['MENTHLTH'] =  df1[['DEPMELT', 'DEPMEYR', 'DEPMWKS', 'DEPWMOS', 'EMODSWKS', 
                        'EMOPGOUT','EMOPSOC', 'EMOPWRK','SUICTHT', 'SUICPLN']].sum(axis=1)
df1['PRLMISAB'] =  df1[['PRLUNDR', 'PRLUNDR30', 'PRLGRTYR', 'PRLMISEVR', 'PRLMISYR', 
                          'PRLMISMO', 'PRLOXYMSYR','PRLDEPYR', 'PRLABSRY','PRLHIGH']].sum(axis=1)
df1['PRLANY'] = df1[['HYDRCDYR', 'OXYCDPRYR', 'OXYCTNYR', 'TRMADLYR', 'MORPHPRYR', 
                        'FENTNYLYR','BUPRNRPHN', 'OXYMORPHN','DEMEROL', 'HYDRMRPHN']].sum(axis=1)
df1['HEROINUSE'] =  df1[['HEROINEVR', 'HEROINYR', 'HEROINMO', 'HEROINAB', 'METHADONE']].sum(axis=1)
df1['TRQLZRS'] =  df1[['TRQBENZODZ', 'TRQALPRZM', 'TRQLRZPM', 'TRQCLNZPM', 'TRQDIAZPM']].sum(axis=1)
df1['SEDATVS'] =  df1[['SDBENZDPN','SDTRZLM', 'SDTMZPM','SDBARBTS', 'SDOTHYR', 'SDOTHYR']].sum(axis=1)
df1['COCAINE'] =  df1[['COCNEVR', 'COCNYR', 'COCNMO', 'CRACKEVR', 'CRACKYR']].sum(axis=1)
df1['AMPHETMN'] =  df1[['AMPHTMNYR','METHEVR', 'METHYR','METHMO']].sum(axis=1)
df1['HALUCNG'] =  df1[['HLCNEVR', 'LSDEVR','MDMAEVR', 'DMTEVR', 'KETMNEVR']].sum(axis=1)
df1['TRTMENT'] =  df1[['TRTRHBOVN', 'TRTRHBOUT', 'TRTMHCTR','TRTERYR',
                         'TRTCURRCV', 'TRTCURPRL', 'TRTGAPYR']].sum(axis=1)
        
df1['MHTRTMT'] =  df1[['MHTRTOYR','MHTRTCLYR', 'MHTRTTHPY', 'MHTRTDRYR', 
                          'MHTRTMDOUT', 'MHTRTHPPGM','MHTRTHSPON', 'MHTRTALT']].sum(axis=1)
    
In [21]:
    
df1.shape
    
    Out[21]:
In [22]:
    
df1.keys()
    
    Out[22]:
In [23]:
    
df1['HEALTH'].replace([0,1,2,3,4,5], [5,4,3,2,1,0])
    
    Out[23]:
In [24]:
    
df2 = pd.DataFrame(df1, columns=['QID', 'AGECAT', 'SEX', 'MARRIED', 
    'EDUCAT', 'EMPLOY18', 'CTYMETRO','HEALTH', 'MENTHLTH','SUICATT',
    'PRLMISEVR','PRLMISAB','PRLANY','HEROINEVR','HEROINUSE','HEROINFQY',    
    'TRQLZRS', 'SEDATVS', 'COCAINE', 'AMPHETMN','TRTMENT','MHTRTMT'  
    ])
df2.shape
    
    Out[24]:
In [25]:
    
df2.keys()
    
    Out[25]:
In [26]:
    
df2.to_csv('project-data.csv', sep=',', encoding='utf-8')