Are statin prescribing ratios different for dispensing practices?

1. Obtain GP Prescribing data

2. Cut GP Prescribing data for things of interest (statins)

3. Obtain list of dispensing practices

3a. Add codes to list of dispensing practices (name and address only is provided)

4. Combine statin cut and dispensing practice list

5. Compare dispensing vs non-dispensing statin prescribing ratios


In [1]:
import requests
from StringIO import StringIO
from numpy import nan as NA
import pandas as pd
import zipfile
import re
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
def fetch_prescribing_data():
    url = 'http://datagov.ic.nhs.uk/presentation/2015_01_January/T201501PDPI+BNFT.CSV' #gp prescribing data (Jan 2015)
    r = requests.get(url)
    data = r.content
    df = (pd.read_csv(StringIO(data)))
    df.to_csv('datas/T201501PDPI+BNFT.CSV')

def fetch_dispensary_data():
    url = 'http://systems.hscic.gov.uk/data/ods/datadownloads/data-files/edispensary.zip'
    r = requests.get(url)
    z = zipfile.ZipFile(StringIO(r.content))
    df = pd.read_csv(z.open('edispensary.csv'))
    df.to_csv('datas/edispensary.csv')

def fetch_gp_details():
    url = 'http://systems.hscic.gov.uk/data/ods/datadownloads/data-files/epraccur.zip'
    r = requests.get(url)
    z = zipfile.ZipFile(StringIO(r.content))
    df = pd.read_csv(z.open('epraccur.csv'))
    df.to_csv('datas/epraccur.csv')

def clean_prescribing_data(df):
    df.columns = [x.strip() for x in df.columns] #gets rid of variable whitespace
    df = df[df['BNF NAME'].str.contains('statin')] #cut for rows with statin in them
    df = df[~df['BNF NAME'].str.contains('Nystatin|Sandostatin|Ecostatin')] #throw away unwanted statins
    df.to_csv('datas/StatinsJan2015.csv') #save the result
    return(df)

def clean_dispensing_practice_addresses(dpad):
    dpad = dpad['Dispensing Practices Address Details'].dropna()
    dpad = dpad.reset_index()
    del dpad['index']
    dpad['Dispensing Practices Address Details'] = dpad['Dispensing Practices Address Details'].str.strip()
    dpad['Dispensing Practices Address Details'] = dpad['Dispensing Practices Address Details'].str.replace('\n', ' ')
    dpad['NAME'] = dpad['Dispensing Practices Address Details'].str.split(',').str[0].str.upper()
    dpad['POSTCODE'] = dpad['Dispensing Practices Address Details'].str.split(',').str[-1].astype(str).str.strip()
    dpad.ix[254,2] = 'BN25 1HH' #one practice lacked a postcode.... we fix this manually
    dpad.ix[254,1] = 'Old School Surgery'
    return(dpad)

def validate_dispensing_postcodes():
    assert(len(dpad[~dpad.Postcode.str.contains(r'[A-Z]{1,2}[0-9R][0-9A-Z]? [0-9][A-Z]{2}')]) == 0) #length of dataframe of postcodes that don't pass regex should be 0 

#nb df = pd.read_csv('http://datagov.ic.nhs.uk/presentation/2015_01_January/T201501PDPI+BNFT.CSV') should also work but seems slower

In [3]:
#df = fetch_prescribing_data() #commented because requires internet and is slow 
#df = clean_prescribing_data(df) #commented because need only run once and is slow
#dispdata = fetch_dispensary_data() #doesn't actually contain dispensing practices
fetch_gp_details()

df = pd.read_csv('datas/StatinsJan2015.csv') #load cleaned prescribing data cut
gpdetail = pd.read_csv('datas/epraccur.csv') #from http://systems.hscic.gov.uk/data/ods/datadownloads/data-files/epraccur.zip

dpad_formatting_junk = ['Dispensing Practices Address Details', 'Primary Care Trust:', 'Report For:', 'Practice Name and Address', 'January 2015']
dpad = pd.read_excel('datas/Disp Pracs Name and Address 2015-01-31.xls', usecols=[0], na_values=dpad_formatting_junk) #load dispensing practice list

dpad = clean_dispensing_practice_addresses(dpad)

gpdetail = gpdetail.icol([1,2,10]) #throw away columns we don't care about
gpdetail.columns = ['PRACTICE', 'NAME', 'POSTCODE']

dpad = pd.merge(gpdetail, dpad, on=['NAME','POSTCODE']) #merge to add practice codes

df['DP'] = df.PRACTICE.isin(dpad.PRACTICE) #add column DP to identify dispensing practices

df['IR'] = df['BNF NAME'].str.contains('Rosuvastatin') #add column for is rosuvastatin

In [4]:
df = df[['PRACTICE','BNF NAME','ITEMS','DP', 'IR']]

In [5]:
df1 = df.groupby(['DP','PRACTICE', 'IR']).ITEMS.sum()

In [6]:
df1 = df1.unstack()
df1 = df1.reset_index()

In [13]:
df1.columns = ['DP', 'PRACTICE', 'False_IR', 'True_IR']

In [14]:
df1['Percentage'] = (df1['True_IR'] / (df1['True_IR'] + df1['False_IR']) * 100)

In [38]:
df1[['DP', 'Percentage']].boxplot(by='DP', figsize=(8,8))
plt.title("Statin prescribing in non-dispensing vs dispensing practices")
plt.suptitle("")
plt.ylim((0,100))
plt.ylabel('Percent of statin items that are Rosuvastatin')
plt.xlabel('Dispensing practice')


Out[38]:
<matplotlib.text.Text at 0x7f9d96ca4210>

In [56]:
df1.Percentage.describe()


Out[56]:
count    7485.000000
mean        6.431462
std         5.377218
min         0.155521
25%         2.873563
50%         5.022831
75%         8.421053
max        70.707071
dtype: float64

In [62]:
df1[df1['Percentage'] > 50].sort(columns='Percentage', as)


Out[62]:
DP PRACTICE False_IR True_IR Percentage
7672 True G82170 74 80 51.948052
2128 False E87694 2 3 60.000000
6627 False Y00860 1 2 66.666667
8083 True M84031 142 323 69.462366
7390 True C85623 58 140 70.707071

In [54]:
gpdetail[gpdetail.PRACTICE.isin(df1[df1['Percentage'] > 50].PRACTICE)]


Out[54]:
PRACTICE NAME POSTCODE
1538 C85623 KINGSWELL SURGERY PMS PRACTICE S36 6DY
2372 E87694 THE ROYAL MEWS SURGERY SW1W 0QH
2711 G82170 LAMBERHURST TN3 8EX
5274 M84031 REVEL SURGERY CV23 0LU
5962 Y00860 SOUTH ESSEX OOH TN24 0GP

In [ ]: