In [63]:
import requests
from StringIO import StringIO
from numpy import nan as NA
import pandas as pd
from pandas import DataFrame
import zipfile
import re
%matplotlib inline
import matplotlib.pyplot as plt
In [64]:
def fetch_prescribing_data():
    url = 'http://datagov.ic.nhs.uk/presentation/2015_01_January/T201501PDPI+BNFT.CSV' #GP prescribing data (Jan 2015)
    r = requests.get(url)
    data = r.content
    df = pd.read_csv(StringIO(data))
    df.to_csv('datas/T201501PDPI+BNFT.CSV')
    return df

def fetch_dispensary_data():
    url = 'http://systems.hscic.gov.uk/data/ods/datadownloads/data-files/edispensary.zip'
    r = requests.get(url)
    z = zipfile.ZipFile(StringIO(r.content))
    df = pd.read_csv(z.open('edispensary.csv'))
    df.to_csv('datas/edispensary.csv')
    return df

def fetch_gp_details():
    url = 'http://systems.hscic.gov.uk/data/ods/datadownloads/data-files/epraccur.zip'
    r = requests.get(url)
    z = zipfile.ZipFile(StringIO(r.content))
    df = pd.read_csv(z.open('epraccur.csv'))
    df.to_csv('datas/epraccur.csv')
    return df
def clean_prescribing_data(df):
    df.columns = [x.strip() for x in df.columns] #get rid of variable whitespace in the column names
    df = df[df['BNF NAME'].str.contains('statin')] #keep rows with 'statin' in the drug name
    df = df[~df['BNF NAME'].str.contains('Nystatin|Sandostatin|Ecostatin')] #throw away non-statin drugs that happen to match 'statin'
    df.to_csv('datas/StatinsJan2015.csv') #save the result
    return df
def clean_dispensing_practice_addresses(dpad):
    dpad = dpad['Dispensing Practices Address Details'].dropna()
    dpad = dpad.reset_index()
    del dpad['index']
    dpad['Dispensing Practices Address Details'] = dpad['Dispensing Practices Address Details'].str.strip()
    dpad['Dispensing Practices Address Details'] = dpad['Dispensing Practices Address Details'].str.replace('\n', ' ')
    dpad['NAME'] = dpad['Dispensing Practices Address Details'].str.split(',').str[0].str.upper()
    dpad['POSTCODE'] = dpad['Dispensing Practices Address Details'].str.split(',').str[-1].astype(str).str.strip()
    dpad.loc[254, 'POSTCODE'] = 'BN25 1HH' #one practice lacked a postcode, so we fix it manually
    dpad.loc[254, 'NAME'] = 'Old School Surgery'
    return dpad
def validate_dispensing_postcodes(dpad):
    #the set of postcodes that fail the UK postcode regex should be empty
    assert(len(dpad[~dpad.POSTCODE.str.contains(r'[A-Z]{1,2}[0-9R][0-9A-Z]? [0-9][A-Z]{2}')]) == 0)
#nb df = pd.read_csv('http://datagov.ic.nhs.uk/presentation/2015_01_January/T201501PDPI+BNFT.CSV') should also work but seems slower
In [65]:
#df = fetch_prescribing_data() #commented because requires internet and is slow
#df = clean_prescribing_data(df) #commented because need only run once and is slow
#dispdata = fetch_dispensary_data() #doesn't actually contain dispensing practices
#fetch_gp_details()
df = pd.read_csv('datas/StatinsJan2015.csv') #load cleaned prescribing data cut
gpdetail = pd.read_csv('datas/epraccur.csv') #from http://systems.hscic.gov.uk/data/ods/datadownloads/data-files/epraccur.zip
dpad_formatting_junk = ['Dispensing Practices Address Details', 'Primary Care Trust:', 'Report For:', 'Practice Name and Address', 'January 2015']
dpad = pd.read_excel('datas/Disp Pracs Name and Address 2015-01-31.xls', usecols=[0], na_values=dpad_formatting_junk) #load dispensing practice list
dpad = clean_dispensing_practice_addresses(dpad)
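#validate_dispensing_postcodes(dpad) #optional: the checker defined above (commented here; assumption, not run in the original) verifies every cleaned postcode matches the UK postcode pattern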
gpdetail = gpdetail.iloc[:, [1, 2, 10]] #throw away columns we don't care about
gpdetail.columns = ['PRACTICE', 'NAME', 'POSTCODE']
dpad = pd.merge(gpdetail, dpad, on=['NAME','POSTCODE']) #merge to add practice codes
df['DP'] = df.PRACTICE.isin(dpad.PRACTICE) #add column DP to identify dispensing practices
df['IR'] = df['BNF NAME'].str.contains('Rosuvastatin') #add column for is rosuvastatin
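In [ ]:
#Hedged check (assumption: not in the original notebook) that the NAME/POSTCODE merge
#matched a plausible number of dispensing practices, and a count of how many statin
#prescribing rows they contribute.
dpad.PRACTICE.nunique(), df.DP.sum()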
In [66]:
df = df[['PRACTICE','BNF NAME','ITEMS','DP', 'IR']]
In [39]:
#df['DP'] = df['DP'].astype('category', categories=['dispensing practice', 'not a dispensing practice']) try with newer version
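In [ ]:
#A minimal sketch of the categorical idea in the cell above, assuming a pandas version
#that provides pd.Categorical: map the boolean DP flag to readable labels rather than
#relying on astype('category', categories=...). Kept in a separate variable so the
#rest of the notebook is unaffected.
dp_label = pd.Categorical(df['DP'].map({True: 'dispensing practice', False: 'not a dispensing practice'}),
                          categories=['dispensing practice', 'not a dispensing practice'])
dp_label[:5]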
In [40]:
def boolstostrings(): #relabel the boolean DP/IR flags on the module-level df as short strings
    df['DP'] = df['DP'].astype(str).str.replace('False', 'NDP')
    df['DP'] = df['DP'].astype(str).str.replace('True', 'DP')
    df['IR'] = df['IR'].astype(str).str.replace('False', 'NIR')
    df['IR'] = df['IR'].astype(str).str.replace('True', 'IR')
In [41]:
#boolstostrings()
In [71]:
df.groupby(['DP','PRACTICE', 'IR']).ITEMS.sum()
Out[71]:
In [42]:
df1 = df.groupby(['DP','PRACTICE', 'IR']).ITEMS.sum()
#where a category has no items we get NaNs instead of the 0s we would prefer
df1 = df1.fillna(0)
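In [ ]:
#Toy illustration (assumption: made-up numbers, not the prescribing data) of where the
#NaNs come from: a practice with no rosuvastatin rows has no (PRACTICE, True) group at
#all, so the gap only appears as NaN once the IR level is unstacked into columns.
toy = DataFrame({'PRACTICE': ['A', 'A', 'B'], 'IR': [False, True, False], 'ITEMS': [10, 2, 5]})
toy.groupby(['PRACTICE', 'IR']).ITEMS.sum().unstack() #practice B gets NaN under True; fillna(0) turns that into 0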
In [47]:
df2 = df1.unstack()
In [53]:
df2.columns = ['off patent statin', 'rosuvastatin']
In [160]:
df2['Percentage'] = (df2['rosuvastatin'] / (df2['rosuvastatin'] + df2['off patent statin']) * 100)
In [153]:
df2.head()
Out[153]:
In [125]:
def practice_number_sanity_check():
    #the practices that prescribe rosuvastatin and those that don't should partition all practices
    all_practices = df['PRACTICE'].unique()
    rosuva_practices = df[df['BNF NAME'].str.contains('Rosuvastatin')]['PRACTICE'].unique()
    not_rosuva_practices = set(all_practices) - set(rosuva_practices)
    assert(len(all_practices) == len(rosuva_practices) + len(not_rosuva_practices))
In [154]:
df2[df2.rosuvastatin.isnull()]
Out[154]:
In [161]:
df2[df2['off patent statin'].isnull()]
Out[161]:
In [158]:
df2 = df2.fillna(0)
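In [ ]:
#Hedged sketch (assumption: not part of the original analysis) using the matplotlib
#import from the first cell: compare the rosuvastatin percentage for dispensing (True)
#and non-dispensing (False) practices.
df2.reset_index().boxplot(column='Percentage', by='DP')
plt.show()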