In [1]:
import numpy as np
import pandas as pd
import re
import datetime
In [2]:
%pylab inline
In [3]:
# Load the call records (allyears) and the opioid search-code table (searchcodes) from remote CSV files.
# searchcodes is read entirely as strings (dtype=str); presumably so that code numbers keep
# any leading zeros -- TODO confirm against the CSV.
allyears = pd.read_csv('https://dl.dropboxusercontent.com/u/39570370/allyears.csv')
searchcodes = pd.read_csv('https://raw.githubusercontent.com/rinina/PCC-Opioid-Project/master/data/searchcodes.csv', dtype=str)
In [4]:
# Drop rows with no Substance recorded.
allyears = allyears.dropna(subset=['Substance'])
# Drop rows whose Medical Outcome is 'Confirmed nonexposure'. The explicit .copy()
# makes the filtered frame independent, so the column assignment below writes to it
# directly rather than to a slice (avoids SettingWithCopyWarning).
allyears = allyears[allyears['Medical Outcome'] != 'Confirmed nonexposure'].copy()
# Parse Start Date so the .dt accessor can be used in later cells.
allyears['Start Date'] = pd.to_datetime(allyears['Start Date'])
In [5]:
# Keep the rows whose Substance text matches any search code: the code numbers are
# joined with '|' into a single regular-expression alternation.
matched = allyears[allyears['Substance'].str.contains('|'.join(searchcodes['Code number'].values))]
In [6]:
# Number of matched rows whose Substance contains each opioid's code, largest first.
# Series.order() was removed from pandas; sort_values() is its replacement.
opioid_counts = (searchcodes.set_index('Opioid name')['Code number']
                 .apply(lambda code: matched['Substance'].str.contains(code).sum())
                 .sort_values(ascending=False)
                 .reset_index()
                 .rename(columns={'Code number': 'Count'}))
opioid_counts
Out[6]:
In [7]:
# Rows whose Medical Outcome mentions 'Death'. Computed once instead of once per code;
# na=False makes rows with a missing outcome count as non-deaths.
is_death = matched['Medical Outcome'].str.contains('Death', na=False)
# Death count per opioid code; opioids with zero deaths are dropped.
opioid_deaths = (searchcodes.set_index('Opioid name')['Code number']
                 .apply(lambda code: (is_death & matched['Substance'].str.contains(code)).sum())
                 .replace(0, np.nan).dropna()
                 .reset_index()
                 .rename(columns={'Code number': 'Deaths'}))
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
opioid_deaths.sort_values('Deaths', ascending=False)
Out[7]:
In [8]:
# Build one column of per-opioid counts for every Medical Outcome value, then join
# the columns side by side. Columns are collected in a list (not a dict) so their
# order is the order in which outcomes first appear, and so the name
# opioid_outcomes is only ever bound to the final DataFrame.
outcome_columns = []
for outcome_name in matched['Medical Outcome'].dropna().unique():
    has_outcome = matched['Medical Outcome'] == outcome_name
    counts = (searchcodes.set_index('Opioid name')['Code number']
              .apply(lambda code: (has_outcome & matched['Substance'].str.contains(code)).sum())
              .replace(0, np.nan).dropna())  # zero counts become missing cells
    counts.name = outcome_name
    outcome_columns.append(counts)
opioid_outcomes = pd.concat(outcome_columns, axis=1)
opioid_outcomes
Out[8]:
In [9]:
# Stacked bar chart of the outcome counts in opioid_outcomes.
# title/ylabel are bare pyplot functions made available by `%pylab inline`.
opioid_outcomes.plot(kind='bar', stacked=True, figsize=(20,20))
title('Outcomes by Opioid')
ylabel('Count')
Out[9]:
In [10]:
# Build one column of per-opioid counts for every Reason value, then join the
# columns side by side. Columns are collected in a list (not a dict) so their order
# is the order in which reasons first appear, and so the name opioid_reasons is
# only ever bound to the final DataFrame.
reason_columns = []
for reason_name in matched['Reason'].dropna().unique():
    has_reason = matched['Reason'] == reason_name
    counts = (searchcodes.set_index('Opioid name')['Code number']
              .apply(lambda code: (has_reason & matched['Substance'].str.contains(code)).sum())
              .replace(0, np.nan).dropna())  # zero counts become missing cells
    counts.name = reason_name
    reason_columns.append(counts)
opioid_reasons = pd.concat(reason_columns, axis=1)
opioid_reasons
Out[10]:
In [11]:
# Stacked bar chart of the reason counts in opioid_reasons.
# title/ylabel are bare pyplot functions made available by `%pylab inline`.
opioid_reasons.plot(kind='bar', stacked=True, figsize=(20,20))
title('Reasons for call by Opioid')
ylabel('Count')
Out[11]:
In [12]:
# Row count per Reason, largest first.
# Series.order() was removed from pandas; sort_values() is its replacement.
matched_reasons = (matched.groupby('Reason').size()
                   .sort_values(ascending=False)
                   .reset_index()
                   .rename(columns={0: 'Matched Count'}))
matched_reasons
Out[12]:
In [13]:
# Row count per Medical Outcome, largest first.
# Series.order() was removed from pandas; sort_values() is its replacement.
matched_outcomes = (matched.groupby('Medical Outcome').size()
                    .sort_values(ascending=False)
                    .reset_index()
                    .rename(columns={0: 'Count'}))
matched_outcomes
Out[13]:
In [14]:
# Recode Gender 'Pregnant' as 'Female'. Rebinding `matched` to a new frame (rather than
# calling replace(..., inplace=True) on a column pulled from a filtered frame) guarantees
# the change is actually stored and avoids chained-assignment warnings.
matched = matched.assign(Gender=matched['Gender'].replace('Pregnant', 'Female'))
In [15]:
# Row count per Gender; the unnamed size column (0) is renamed to 'Count'.
gender_totals = matched.groupby('Gender').size().reset_index().rename(columns={0:'Count'})
gender_totals
Out[15]:
In [16]:
# Cross-tabulate counts by Gender and Reason; after unstack + transpose the rows are
# Reason values and the columns are Gender values.
reasons_by_gender = (matched.groupby(['Gender', 'Reason']).size()
.unstack().T)
reasons_by_gender
Out[16]:
In [17]:
# Grouped bar chart of reasons_by_gender; the x-axis label is blanked out.
reasons_by_gender.plot(kind='bar', figsize=(10,7))
xlabel('')
Out[17]:
In [18]:
# Count, per Gender, the rows whose Reason is exactly intentional abuse or intentional misuse.
matched[matched['Reason'].isin(['Intentional - Abuse', 'Intentional - Misuse'])].groupby('Gender').size()
Out[18]:
In [19]:
# Count, per Gender, the rows whose Reason mentions 'Unintentional' or 'Adverse'.
# na=False treats a missing Reason as "no match" so the mask stays purely boolean
# and can be used for indexing even when Reason contains NaN.
matched[matched['Reason'].str.contains('Unintentional|Adverse', na=False)].groupby('Gender').size()
Out[19]:
In [20]:
# Rows whose age is expressed in years. .copy() makes this an independent frame, so adding
# the Age_group column to it later does not write to a slice of `matched`.
matched_age = matched[matched['Age Unit'] == 'Years'].copy()
def age_group(age):
    """Return the age-bracket label for an age given in years.

    Brackets are half-open ([lower, upper)), so fractional ages fall into the
    bracket of their completed years (4.5 -> '1-4', 84.5 -> '75-84') instead of
    dropping through the gaps between integer bounds.

    Parameters
    ----------
    age : number
        Age in years. NaN and ages below 1 fit no bracket.

    Returns
    -------
    str
        One of '1-4', '5-14', '15-24', '25-34', '35-44', '45-54', '55-64',
        '65-74', '75-84', '85+', or 'Unknown' when no bracket applies.
    """
    # (inclusive lower bound, exclusive upper bound, label)
    brackets = (
        (1, 5, '1-4'),
        (5, 15, '5-14'),
        (15, 25, '15-24'),
        (25, 35, '25-34'),
        (35, 45, '35-44'),
        (45, 55, '45-54'),
        (55, 65, '55-64'),
        (65, 75, '65-74'),
        (75, 85, '75-84'),
    )
    for lower, upper, label in brackets:
        if lower <= age < upper:
            return label
    if age >= 85:
        return '85+'
    # NaN compares False against every bound and ends up here, as do ages < 1.
    return 'Unknown'
# Label every row of matched_age with the bracket that age_group returns for its Age.
matched_age['Age_group'] = matched_age['Age'].apply(age_group)
In [21]:
# Row count per Age_group; the unnamed size column (0) is renamed to 'Count'.
age_counts = matched_age.groupby('Age_group').size().reset_index().rename(columns={0 : 'Count'})
age_counts
Out[21]:
In [22]:
# Per Age_group count of rows whose Reason is exactly 'Intentional - Suspected suicide'.
suicides_by_age = (matched_age[matched_age['Reason'] == 'Intentional - Suspected suicide']
.groupby('Age_group').size().reset_index().rename(columns={0 : 'Suicides'}))
suicides_by_age
Out[22]:
In [23]:
# Per Age_group count of rows whose Reason is exactly 'Intentional - Abuse'.
abuse_by_age = (matched_age[matched_age['Reason'] == 'Intentional - Abuse']
.groupby('Age_group').size().reset_index().rename(columns={0 : 'Intent. Abuse'}))
abuse_by_age
Out[23]:
In [24]:
# Per Age_group count of rows whose Reason is intentional abuse or intentional misuse.
# The mask is built entirely from matched_age (the frame being filtered); building half
# of it from `matched` yields a mask whose index does not line up with matched_age.
intent_by_age = (matched_age[matched_age['Reason'].isin(['Intentional - Abuse', 'Intentional - Misuse'])]
                 .groupby('Age_group').size()
                 .reset_index()
                 .rename(columns={0: 'Intent. Abuse or Misuse'}))
intent_by_age
Out[24]:
In [25]:
# Per Age_group count of rows whose Reason mentions 'Unintentional' or 'Adverse'.
# na=False treats a missing Reason as "no match" so the mask is safe to index with.
unintent_by_age = (matched_age[matched_age['Reason'].str.contains('Unintentional|Adverse', na=False)]
                   .groupby('Age_group').size()
                   .reset_index()
                   .rename(columns={0: 'Unintent or Adverse'}))
unintent_by_age
Out[25]:
In [26]:
# Join the three per-age summaries side by side, keyed on Age_group.
per_age_frames = (suicides_by_age, intent_by_age, unintent_by_age)
age_reasons = pd.concat([frame.set_index('Age_group') for frame in per_age_frames], axis=1)
# Displayed with missing cells shown as 0; age_reasons itself is left unchanged.
age_reasons.fillna(0)
Out[26]:
In [27]:
# Put the age brackets in numeric order by label. Positional reordering breaks silently
# when a bracket is absent from the data; reindexing by label leaves an empty row instead.
age_order = ['1-4', '5-14', '15-24', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84', '85+']
age_reasons.reindex(age_order).plot(kind='bar', stacked=True, figsize=(15,10))
title('Brief reasons by age')
xlabel('Age group')
Out[27]:
In [28]:
# Counts for every (Reason, Age_group) pair: rows are Reason, columns are Age_group.
age_full_reasons = matched_age.groupby(['Age_group', 'Reason']).size().unstack().T
# Order the age-bracket columns numerically by label (so '5-14' follows '1-4'). Unlike
# positional selection this stays correct when a bracket is absent from the data.
age_order = ['1-4', '5-14', '15-24', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84', '85+']
age_full_reasons.reindex(columns=age_order).replace(np.nan, '')
Out[28]:
In [29]:
# Order the age-bracket columns numerically by label before plotting; positional
# selection would misorder the bars whenever a bracket is absent from the data.
age_order = ['1-4', '5-14', '15-24', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84', '85+']
age_full_reasons.reindex(columns=age_order).T.plot(kind='bar', stacked=True, figsize=(15,10))
title('All reasons by age')
xlabel('Age group')
Out[29]:
In [30]:
# Percentage of each age group's rows attributed to each Reason
# (after the transpose: rows are Reason, columns are Age_group).
# The divisor is passed as a bare array (.values), so it is matched to the
# age-group rows by position rather than by label.
(100*matched_age.groupby(['Age_group', 'Reason']).size().unstack().astype(float)
.div(matched_age.groupby('Age_group').size().values, axis=0).T)
Out[30]:
In [31]:
# Rows whose Substance mentions heroin in any letter case. The flag must be passed by
# keyword: the second positional parameter of str.contains is `case`, so a positional
# re.IGNORECASE is read as case=True and the match stays case-sensitive.
heroin = allyears[allyears['Substance'].str.contains('Heroin', flags=re.IGNORECASE)]
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(len(heroin))
print(len(matched))
In [32]:
# Rows per (year, month) of Start Date.
matched_ts = matched.groupby([matched['Start Date'].dt.year, matched['Start Date'].dt.month]).size()
# Replace the (year, month) index with the first day of each month. Iterating the index
# tuples directly avoids depending on the auto-generated 'level_0'/'level_1' column
# names that reset_index() only produces when the index levels are unnamed.
matched_ts.index = [datetime.datetime(int(year), int(month), 1) for year, month in matched_ts.index]
matched_ts.plot()
title('Time series of calls by month (2010-2014)')
xlabel('Time')
ylabel('Calls')
Out[32]:
In [33]:
# Rows per calendar month of Start Date, pooled over all years, drawn as a line plot.
by_mo = matched.groupby(matched['Start Date'].dt.month).size()
by_mo.plot()
title('Total calls by month')
# Clamp the x-axis to the month numbers 1..12.
xlim(1,12)
xlabel('Month')
ylabel('Calls')
Out[33]:
In [34]:
# Suspected-suicide rows, in the opioid-matched subset and in all rows.
suicide_reason = 'Intentional - Suspected suicide'
matched_suicides = matched[matched['Reason'] == suicide_reason]
allyears_suicides = allyears[allyears['Reason'] == suicide_reason]
# Monthly counts for each set, shown side by side.
opioid_by_month = matched_suicides.groupby(matched_suicides['Start Date'].dt.month).size()
all_by_month = allyears_suicides.groupby(allyears_suicides['Start Date'].dt.month).size()
pd.concat([opioid_by_month, all_by_month], axis=1).rename(columns={0: 'Opioid', 1: 'All Calls'})
Out[34]:
In [35]:
# Rows of matched that have a Therapy value recorded.
matched_therapy = matched.dropna(subset=['Therapy'])
In [36]:
# Per-Reason count of rows whose Therapy text contains 'Naloxone'.
matched_therapy[matched_therapy['Therapy'].str.contains('Naloxone')].groupby('Reason').size()
Out[36]:
In [37]:
# Rows whose Medical Outcome is exactly 'Death'.
# NOTE(review): matched_death is rebound in a later cell using a substring match on 'Death' instead.
matched_death = matched[matched['Medical Outcome']=='Death']
In [38]:
# Row count per Reason among the rows in matched_death.
matched_death.groupby('Reason').size()
Out[38]:
In [39]:
# Distinct non-missing Therapy values.
matched['Therapy'].dropna().unique()
Out[39]:
In [40]:
# Distinct Clinical Effect values (missing values are not dropped here).
matched['Clinical Effect'].unique()
Out[40]:
In [41]:
# Distinct Exposure Site values (missing values are not dropped here).
matched['Exposure Site'].unique()
Out[41]:
In [42]:
# Row count per Exposure Site.
matched.groupby('Exposure Site').size()
Out[42]:
In [43]:
# Per-Reason count of rows whose Substance contains 'Ethanol' or 'ethanol'.
matched[matched['Substance'].str.contains('[Ee]thanol')].groupby('Reason').size()
Out[43]:
In [44]:
# Frequency of every individual substance named in the matched rows, excluding the
# opioids themselves.
# Steps: split each Substance string on ' ; ' and flatten to one entry per substance;
# drop everything from the first '[' on; split the remainder on ' - ' into columns and
# strip whitespace; count occurrences of column 0; sort descending; drop the index
# entries listed in searchcodes['Opioid name'].
# Series.order() was removed from pandas; sort_values() is its replacement.
secondary_substances = (pd.Series(np.concatenate(matched['Substance'].str.split(' ; ').apply(np.array).values))
                        .str.split('[').str[0]
                        .str.split(' - ').apply(pd.Series)
                        .apply(lambda x: x.str.strip())
                        .groupby(0).size()
                        .sort_values(ascending=False)
                        .drop(searchcodes['Opioid name']))
In [45]:
# Medical Outcome counts over all matched rows, used as the baseline column.
secondary_compare = matched.groupby('Medical Outcome').size()
secondary_compare.name = 'All calls'
# Substance names with any parenthesised suffix removed (raw string keeps '\(' a valid escape).
substance_names = pd.Series(secondary_substances.index).str.split(r'\(').str[0].str.strip().values
# One outcome-count column per secondary substance. Columns are gathered in a list and
# concatenated once instead of growing the frame inside the loop.
outcome_columns = [secondary_compare]
for name in substance_names:
    # regex=False: the name is plain text, so regex metacharacters in it must match literally.
    sub = matched[matched['Substance'].str.contains(name, regex=False)].groupby('Medical Outcome').size()
    sub.name = name
    outcome_columns.append(sub)
secondary_compare = pd.concat(outcome_columns, axis=1)
secondary_compare.T.replace(np.nan, 0)
Out[45]:
In [46]:
# For each column of secondary_compare: its total (shown as 'Total') next to the
# percentage of that total falling in each Medical Outcome; missing cells shown as 0.
pd.concat([secondary_compare.sum().T,
100*(secondary_compare.T).div(secondary_compare.sum().T.astype(float), axis=0).replace(np.nan, 0)], axis=1).rename(columns={0:'Total'})
Out[46]:
In [47]:
# Reason counts over all matched rows, used as the baseline column.
secondary_compare2 = matched.groupby('Reason').size()
secondary_compare2.name = 'All calls'
# Substance names with any parenthesised suffix removed (raw string keeps '\(' a valid escape).
substance_names = pd.Series(secondary_substances.index).str.split(r'\(').str[0].str.strip().values
# One reason-count column per secondary substance. Columns are gathered in a list and
# concatenated once instead of growing the frame inside the loop.
reason_columns = [secondary_compare2]
for name in substance_names:
    # regex=False: the name is plain text, so regex metacharacters in it must match literally.
    sub = matched[matched['Substance'].str.contains(name, regex=False)].groupby('Reason').size()
    sub.name = name
    reason_columns.append(sub)
secondary_compare2 = pd.concat(reason_columns, axis=1)
secondary_compare2.T.replace(np.nan, 0)
Out[47]:
In [48]:
# Total rows per column of secondary_compare2 (one entry per substance, plus 'All calls').
reason_totals = secondary_compare2.sum()
# Percentage of each total falling under each Reason; missing cells are shown as ''.
reason_percent = (100 * secondary_compare2.T.div(reason_totals.astype(float), axis=0)).replace(np.nan, '')
# Totals beside percentages, largest total first. The unnamed totals Series becomes
# column 0, which is sorted on and then renamed 'Total'.
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
(pd.concat([reason_totals, reason_percent], axis=1)
 .sort_values(0, ascending=False)
 .rename(columns={0: 'Total'}))
Out[48]:
In [49]:
# Rows whose Medical Outcome is present and contains 'Death'
# (na=False makes rows with a missing outcome evaluate to "no match").
matched_death = matched[matched['Medical Outcome'].str.contains('Death', na=False)]
In [50]:
# For each opioid code, select the rows of matched whose Medical Outcome contains 'Death'
# and whose Substance contains the code; the per-opioid result ends up under 'Deaths'.
# NOTE(review): the lambda returns a DataFrame per code; how Series.apply stores those
# results depends on the pandas version -- confirm before relying on this cell.
opioid_deaths_expanded = (searchcodes.set_index('Opioid name')['Code number']
.apply(lambda x: matched[matched['Medical Outcome']
.str.contains('Death') & (matched['Substance'].str.contains(x))])
.replace(0, np.nan).dropna().reset_index().rename(columns={'Code number':'Deaths'}))
In [51]:
# Keep the entries whose 'Deaths' value has non-zero length, index by opioid name, and
# reduce each entry to the array of its 'Substance' values.
# NOTE(review): assumes each 'Deaths' cell holds a DataFrame with a 'Substance' column -- confirm.
opioid_deaths_expanded = (opioid_deaths_expanded[opioid_deaths_expanded['Deaths'].str.len() != 0]
.set_index('Opioid name')['Deaths'].apply(lambda x: x['Substance'].values))
In [52]:
# Flatten to one Substance string per row; the opioid name is repeated in the index
# once for every string that came from that opioid's array.
opioid_deaths_expanded = pd.Series(np.concatenate(opioid_deaths_expanded.values),
index=np.repeat(opioid_deaths_expanded.index.values,
opioid_deaths_expanded.str.len()))
In [53]:
# Frequency of each individual substance named in the death rows.
# Steps: split every Substance string on ' ; ' and flatten; drop everything from the
# first '[' on; split the remainder on ' - ' into columns and strip whitespace; count
# occurrences of column 0; sort descending.
# Series.order() was removed from pandas; sort_values() is its replacement.
secondary_counts = (pd.Series(np.concatenate(opioid_deaths_expanded.str.split(' ; ').apply(np.array).values))
                    .str.split('[').str[0]
                    .str.split(' - ').apply(pd.Series)
                    .apply(lambda x: x.str.strip())
                    .groupby(0).size()
                    .sort_values(ascending=False))
In [54]:
# Split every Substance string on ' ; ' into individual substances, repeating the
# opioid name in the index once per piece; then drop everything from the first '[' on
# and split the remainder on ' - ' into separate columns.
opioid_deaths_expanded = (pd.Series(np.concatenate(opioid_deaths_expanded.str.split(' ; ').apply(np.array).values),
index=np.repeat(opioid_deaths_expanded.index.values, opioid_deaths_expanded.str.split(' ; ')
.str.len())).str.split('[').str[0].str.split(' - ').apply(pd.Series))
In [55]:
# Trim surrounding whitespace from column 0 (the first ' - '-separated piece).
opioid_deaths_expanded[0] = opioid_deaths_expanded[0].str.strip()
In [56]:
# Move the opioid name out of the index as 'primary', rename column 0 to 'secondary',
# and count how often each (primary, secondary) pair occurs.
opioid_deaths_expanded = opioid_deaths_expanded.reset_index().rename(columns={'index' : 'primary',
0 : 'secondary'}).groupby(['primary', 'secondary']).size().reset_index()
In [57]:
# Number of matched rows whose Substance contains 'Benzodiazepines'.
len(matched[matched['Substance'].str.contains('Benzodiazepines')])
Out[57]:
In [58]:
# Number of matched rows that mention 'Benzodiazepines' in Substance and 'suicide'/'Suicide'
# in Reason. na=False treats a missing Reason as "no match" so the combined mask is purely boolean.
len(matched[(matched['Substance'].str.contains('Benzodiazepines'))
            & (matched['Reason'].str.contains('[Ss]uicide', na=False))])
Out[58]:
In [59]:
# Subsets of matched whose Substance mentions 'Benzodiazepines' / 'Ethanol'.
# NOTE(review): an earlier cell matches '[Ee]thanol'; here only capitalised 'Ethanol' is matched -- confirm intent.
benzo = matched[matched['Substance'].str.contains('Benzodiazepines')]
ethanol = matched[matched['Substance'].str.contains('Ethanol')]
# Medical Outcome counts for the three groups side by side; the unnamed columns 0/1/2
# follow the order of the list passed to concat.
benzo_compare = (pd.concat([benzo.groupby('Medical Outcome').size(),
matched.groupby('Medical Outcome').size(),
ethanol.groupby('Medical Outcome').size()], axis=1).rename(columns={0:'Benzodiazepine', 1:'All Opioids', 2:'Ethanol'}))
benzo_compare
Out[59]:
In [60]:
# Reason counts for rows whose Substance contains the code '0007000', shown as percentages of their total.
# NOTE(review): '0007000' is presumably the benzodiazepine code -- confirm against the code list.
benz_reason = matched[matched['Substance'].str.contains('0007000')].groupby('Reason').size()
benz_reason/benz_reason.sum()*100
Out[60]:
In [60]: