Are statin prescribing ratios different for dispensing practices?

1. Obtain GP Prescribing data

2. Cut GP Prescribing data for things of interest (statins)

3. Obtain list of dispensing practices

3a. Add codes to list of dispensing practices (name and address only is provided)

4. Combine statin cut and dispensing practice list

5. Compare dispensing vs non-dispensing statin prescribing ratios



In [1]:

    
import requests
from StringIO import StringIO
from numpy import nan as NA
import pandas as pd
import zipfile
import re
%matplotlib inline
import matplotlib.pyplot as plt



In [2]:

    
def fetch_prescribing_data():
    url = 'http://datagov.ic.nhs.uk/presentation/2015_01_January/T201501PDPI+BNFT.CSV' #gp prescribing data (Jan 2015)
    r = requests.get(url)
    data = r.content
    df = (pd.read_csv(StringIO(data)))
    df.to_csv('datas/T201501PDPI+BNFT.CSV')

def fetch_dispensary_data():
    url = 'http://systems.hscic.gov.uk/data/ods/datadownloads/data-files/edispensary.zip'
    r = requests.get(url)
    z = zipfile.ZipFile(StringIO(r.content))
    df = pd.read_csv(z.open('edispensary.csv'))
    df.to_csv('datas/edispensary.csv')

def fetch_gp_details():
    url = 'http://systems.hscic.gov.uk/data/ods/datadownloads/data-files/epraccur.zip'
    r = requests.get(url)
    z = zipfile.ZipFile(StringIO(r.content))
    df = pd.read_csv(z.open('epraccur.csv'))
    df.to_csv('datas/epraccur.csv')

def clean_prescribing_data(df):
    df.columns = [x.strip() for x in df.columns] #gets rid of variable whitespace
    df = df[df['BNF NAME'].str.contains('statin')] #cut for rows with statin in them
    df = df[~df['BNF NAME'].str.contains('Nystatin|Sandostatin|Ecostatin')] #throw away unwanted statins
    df.to_csv('datas/StatinsJan2015.csv') #save the result
    return(df)

def clean_dispensing_practice_addresses(dpad):
    dpad = dpad['Dispensing Practices Address Details'].dropna()
    dpad = dpad.reset_index()
    del dpad['index']
    dpad['Dispensing Practices Address Details'] = dpad['Dispensing Practices Address Details'].str.strip()
    dpad['Dispensing Practices Address Details'] = dpad['Dispensing Practices Address Details'].str.replace('\n', ' ')
    dpad['NAME'] = dpad['Dispensing Practices Address Details'].str.split(',').str[0].str.upper()
    dpad['POSTCODE'] = dpad['Dispensing Practices Address Details'].str.split(',').str[-1].astype(str).str.strip()
    dpad.ix[254,2] = 'BN25 1HH' #one practice lacked a postcode.... we fix this manually
    dpad.ix[254,1] = 'Old School Surgery'
    return(dpad)

def validate_dispensing_postcodes():
    assert(len(dpad[~dpad.Postcode.str.contains(r'[A-Z]{1,2}[0-9R][0-9A-Z]? [0-9][A-Z]{2}')]) == 0) #length of dataframe of postcodes that don't pass regex should be 0 

#nb df = pd.read_csv('http://datagov.ic.nhs.uk/presentation/2015_01_January/T201501PDPI+BNFT.CSV') should also work but seems slower



In [3]:

    
#df = fetch_prescribing_data() #commented because requires internet and is slow 
#df = clean_prescribing_data(df) #commented because need only run once and is slow
#dispdata = fetch_dispensary_data() #doesn't actually contain dispensing practices
fetch_gp_details()

df = pd.read_csv('datas/StatinsJan2015.csv') #load cleaned prescribing data cut
gpdetail = pd.read_csv('datas/epraccur.csv') #from http://systems.hscic.gov.uk/data/ods/datadownloads/data-files/epraccur.zip

dpad_formatting_junk = ['Dispensing Practices Address Details', 'Primary Care Trust:', 'Report For:', 'Practice Name and Address', 'January 2015']
dpad = pd.read_excel('datas/Disp Pracs Name and Address 2015-01-31.xls', usecols=[0], na_values=dpad_formatting_junk) #load dispensing practice list

dpad = clean_dispensing_practice_addresses(dpad)

gpdetail = gpdetail.icol([1,2,10]) #throw away columns we don't care about
gpdetail.columns = ['PRACTICE', 'NAME', 'POSTCODE']

dpad = pd.merge(gpdetail, dpad, on=['NAME','POSTCODE']) #merge to add practice codes

df['DP'] = df.PRACTICE.isin(dpad.PRACTICE) #add column DP to identify dispensing practices

df['IR'] = df['BNF NAME'].str.contains('Rosuvastatin') #add column for is rosuvastatin



In [4]:

    
df = df[['PRACTICE','BNF NAME','ITEMS','DP', 'IR']]



In [5]:

    
df1 = df.groupby(['DP','PRACTICE', 'IR']).ITEMS.sum()



In [6]:

    
df1 = df1.unstack()
df1 = df1.reset_index()



In [13]:

    
df1.columns = ['DP', 'PRACTICE', 'False_IR', 'True_IR']



In [14]:

    
df1['Percentage'] = (df1['True_IR'] / (df1['True_IR'] + df1['False_IR']) * 100)



In [38]:

    
df1[['DP', 'Percentage']].boxplot(by='DP', figsize=(8,8))
plt.title("Statin prescribing in non-dispensing vs dispensing practices")
plt.suptitle("")
plt.ylim((0,100))
plt.ylabel('Percent of statin items that are Rosuvastatin')
plt.xlabel('Dispensing practice')









    Out[38]:





<matplotlib.text.Text at 0x7f9d96ca4210>



In [56]:

    
df1.Percentage.describe()









    Out[56]:





count    7485.000000
mean        6.431462
std         5.377218
min         0.155521
25%         2.873563
50%         5.022831
75%         8.421053
max        70.707071
dtype: float64



In [62]:

    
df1[df1['Percentage'] > 50].sort(columns='Percentage', as)



In [54]:

    
gpdetail[gpdetail.PRACTICE.isin(df1[df1['Percentage'] > 50].PRACTICE)]









    Out[54]:






  
    
      
      PRACTICE
      NAME
      POSTCODE
    
  
  
    
      1538
       C85623
       KINGSWELL SURGERY  PMS PRACTICE
        S36 6DY
    
    
      2372
       E87694
                THE ROYAL MEWS SURGERY
       SW1W 0QH
    
    
      2711
       G82170
                           LAMBERHURST
        TN3 8EX
    
    
      5274
       M84031
                         REVEL SURGERY
       CV23 0LU
    
    
      5962
       Y00860
                       SOUTH ESSEX OOH
       TN24 0GP



In [ ]:

	DP	PRACTICE	False_IR	True_IR	Percentage
7672	True	G82170	74	80	51.948052
2128	False	E87694	2	3	60.000000
6627	False	Y00860	1	2	66.666667
8083	True	M84031	142	323	69.462366
7390	True	C85623	58	140	70.707071

	PRACTICE	NAME	POSTCODE
1538	C85623	KINGSWELL SURGERY PMS PRACTICE	S36 6DY
2372	E87694	THE ROYAL MEWS SURGERY	SW1W 0QH
2711	G82170	LAMBERHURST	TN3 8EX
5274	M84031	REVEL SURGERY	CV23 0LU
5962	Y00860	SOUTH ESSEX OOH	TN24 0GP