In [63]:
import requests
from StringIO import StringIO
from numpy import nan as NA
import pandas as pd
from pandas import DataFrame
import zipfile
import re
%matplotlib inline
import matplotlib.pyplot as plt

In [64]:
def fetch_prescribing_data():
    url = 'http://datagov.ic.nhs.uk/presentation/2015_01_January/T201501PDPI+BNFT.CSV' #gp prescribing data (Jan 2015)
    r = requests.get(url)
    data = r.content
    df = (pd.read_csv(StringIO(data)))
    df.to_csv('datas/T201501PDPI+BNFT.CSV')

def fetch_dispensary_data():
    url = 'http://systems.hscic.gov.uk/data/ods/datadownloads/data-files/edispensary.zip'
    r = requests.get(url)
    z = zipfile.ZipFile(StringIO(r.content))
    df = pd.read_csv(z.open('edispensary.csv'))
    df.to_csv('datas/edispensary.csv')

def fetch_gp_details():
    url = 'http://systems.hscic.gov.uk/data/ods/datadownloads/data-files/epraccur.zip'
    r = requests.get(url)
    z = zipfile.ZipFile(StringIO(r.content))
    df = pd.read_csv(z.open('epraccur.csv'))
    df.to_csv('datas/epraccur.csv')

def clean_prescribing_data(df):
    df.columns = [x.strip() for x in df.columns] #gets rid of variable whitespace
    df = df[df['BNF NAME'].str.contains('statin')] #cut for rows with statin in them
    df = df[~df['BNF NAME'].str.contains('Nystatin|Sandostatin|Ecostatin')] #throw away unwanted statins
    df.to_csv('datas/StatinsJan2015.csv') #save the result
    return(df)

def clean_dispensing_practice_addresses(dpad):
    dpad = dpad['Dispensing Practices Address Details'].dropna()
    dpad = dpad.reset_index()
    del dpad['index']
    dpad['Dispensing Practices Address Details'] = dpad['Dispensing Practices Address Details'].str.strip()
    dpad['Dispensing Practices Address Details'] = dpad['Dispensing Practices Address Details'].str.replace('\n', ' ')
    dpad['NAME'] = dpad['Dispensing Practices Address Details'].str.split(',').str[0].str.upper()
    dpad['POSTCODE'] = dpad['Dispensing Practices Address Details'].str.split(',').str[-1].astype(str).str.strip()
    dpad.ix[254,2] = 'BN25 1HH' #one practice lacked a postcode.... we fix this manually
    dpad.ix[254,1] = 'Old School Surgery'
    return(dpad)

def validate_dispensing_postcodes():
    assert(len(dpad[~dpad.Postcode.str.contains(r'[A-Z]{1,2}[0-9R][0-9A-Z]? [0-9][A-Z]{2}')]) == 0) #length of dataframe of postcodes that don't pass regex should be 0 

#nb df = pd.read_csv('http://datagov.ic.nhs.uk/presentation/2015_01_January/T201501PDPI+BNFT.CSV') should also work but seems slower

In [65]:
#df = fetch_prescribing_data() #commented because requires internet and is slow 
#df = clean_prescribing_data(df) #commented because need only run once and is slow
#dispdata = fetch_dispensary_data() #doesn't actually contain dispensing practices
#fetch_gp_details()

df = pd.read_csv('datas/StatinsJan2015.csv') #load cleaned prescribing data cut
gpdetail = pd.read_csv('datas/epraccur.csv') #from http://systems.hscic.gov.uk/data/ods/datadownloads/data-files/epraccur.zip

dpad_formatting_junk = ['Dispensing Practices Address Details', 'Primary Care Trust:', 'Report For:', 'Practice Name and Address', 'January 2015']
dpad = pd.read_excel('datas/Disp Pracs Name and Address 2015-01-31.xls', usecols=[0], na_values=dpad_formatting_junk) #load dispensing practice list

dpad = clean_dispensing_practice_addresses(dpad)

gpdetail = gpdetail.icol([1,2,10]) #throw away columns we don't care about
gpdetail.columns = ['PRACTICE', 'NAME', 'POSTCODE']

dpad = pd.merge(gpdetail, dpad, on=['NAME','POSTCODE']) #merge to add practice codes

df['DP'] = df.PRACTICE.isin(dpad.PRACTICE) #add column DP to identify dispensing practices

df['IR'] = df['BNF NAME'].str.contains('Rosuvastatin') #add column for is rosuvastatin

In [66]:
df = df[['PRACTICE','BNF NAME','ITEMS','DP', 'IR']]

In [39]:
#df['DP'] = df['DP'].astype('category',  categories=['dispensing practice', 'not a dispensing practice']) try with newer version

In [40]:
def boolstostrings():
    df['DP'] = df['DP'].astype(str).str.replace('False', 'NDP')
    df['DP'] = df['DP'].astype(str).str.replace('True', 'DP')
    df['IR'] = df['IR'].astype(str).str.replace('False', 'NIR')
    df['IR'] = df['IR'].astype(str).str.replace('True', 'IR')

In [41]:
#boolstostrings()

In [71]:
df.groupby(['DP','PRACTICE', 'IR']).ITEMS.sum()


Out[71]:
DP     PRACTICE  IR   
False  A81001    False     175
                 True        8
       A81002    False    1041
                 True       63
       A81003    False     256
                 True        2
       A81004    False     386
                 True       14
       A81005    False     389
                 True       17
       A81006    False     771
                 True       31
       A81007    False     285
                 True        6
       A81008    False     452
...
True  P81620    False     37
                True       3
      P81732    False    108
                True       8
      Y00293    False    114
                True       4
      Y01163    False    160
                True       7
      Y01166    False      3
      Y01652    False    600
                True      37
      Y02633    False    234
                True      16
      Y03602    False    222
                True      13
Name: ITEMS, Length: 15610, dtype: int64

In [42]:
df1 = df.groupby(['DP','PRACTICE', 'IR']).ITEMS.sum()
#applying groupby when there are no items in a categories generate NaNs where we would instead prefer 0's
df1 = df1.fillna(0)

In [47]:
df2 = df1.unstack()

In [53]:
df2.columns = ['off patent statin', 'rosuvastatin']

In [160]:
df2['Percentage'] = (df2['rosuvastatin'] / (df2['rosuvastatin'] + df2['off patent statin']) * 100)

In [153]:
df2.head()


Out[153]:
off patent statin rosuvastatin Percentage
DP PRACTICE
False A81001 175 8 4.371585
A81002 1041 63 5.706522
A81003 256 2 0.775194
A81004 386 14 3.500000
A81005 389 17 4.187192

In [125]:
def practice_number_sanity_check():
    all_practices = df['PRACTICE'].unique()
    rosuva_practices = df[df['BNF NAME'].str.contains('Rosuvastatin')]['PRACTICE'].unique()
    not_rosuva_practices = set(all_practices) - set(rosuva_practices)
    assert(len(all_practices) == len(rosuva_practices) + len(not_rosuva_practices))

In [154]:
df2[df2.rosuvastatin.isnull()]


Out[154]:
off patent statin rosuvastatin Percentage
DP PRACTICE
False A81630 3 NaN NaN
A81632 9 NaN NaN
A81633 4 NaN NaN
A81634 4 NaN NaN
A84614 51 NaN NaN
A85015 1 NaN NaN
A86027 12 NaN NaN
A86028 194 NaN NaN
A88014 163 NaN NaN
B81016 232 NaN NaN
B81091 124 NaN NaN
B81098 2 NaN NaN
B81671 4 NaN NaN
B81690 47 NaN NaN
B82048 3 NaN NaN
B83032 488 NaN NaN
B83051 2 NaN NaN
B83613 102 NaN NaN
B83617 142 NaN NaN
B83627 174 NaN NaN
B83641 454 NaN NaN
B83642 97 NaN NaN
B83661 147 NaN NaN
B84610 294 NaN NaN
B85062 6 NaN NaN
B86103 80 NaN NaN
B86110 6 NaN NaN
B86638 36 NaN NaN
B86642 89 NaN NaN
B86651 10 NaN NaN
... ... ... ...
Y04585 2 NaN NaN
Y04586 1 NaN NaN
Y04647 1 NaN NaN
Y04654 3 NaN NaN
Y04675 9 NaN NaN
Y04679 13 NaN NaN
Y04681 1 NaN NaN
Y04691 6 NaN NaN
Y04723 1 NaN NaN
Y04724 1 NaN NaN
Y04764 1 NaN NaN
Y04769 1 NaN NaN
Y04831 2 NaN NaN
Y04912 2 NaN NaN
True A82613 18 NaN NaN
A84047 277 NaN NaN
A84609 114 NaN NaN
B82046 181 NaN NaN
C81611 55 NaN NaN
D81081 63 NaN NaN
E82620 4 NaN NaN
E82667 3 NaN NaN
F81717 66 NaN NaN
K82061 244 NaN NaN
L83616 143 NaN NaN
M82020 176 NaN NaN
M82601 249 NaN NaN
M82604 145 NaN NaN
M83691 84 NaN NaN
Y01166 3 NaN NaN

629 rows × 3 columns


In [161]:
df2[df2['off patent statin'].isnull()]


Out[161]:
off patent statin rosuvastatin Percentage
DP PRACTICE

In [158]:
df2 = df2.fillna(0)

In [ ]: