In [63]:
import requests
from StringIO import StringIO
from numpy import nan as NA
import pandas as pd
from pandas import DataFrame
import zipfile
import re
%matplotlib inline
import matplotlib.pyplot as plt
In [64]:
def fetch_prescribing_data():
    url = 'http://datagov.ic.nhs.uk/presentation/2015_01_January/T201501PDPI+BNFT.CSV' #GP prescribing data (Jan 2015)
    r = requests.get(url)
    data = r.content
    df = pd.read_csv(StringIO(data))
    df.to_csv('datas/T201501PDPI+BNFT.CSV')
    return df

def fetch_dispensary_data():
    url = 'http://systems.hscic.gov.uk/data/ods/datadownloads/data-files/edispensary.zip'
    r = requests.get(url)
    z = zipfile.ZipFile(StringIO(r.content))
    df = pd.read_csv(z.open('edispensary.csv'))
    df.to_csv('datas/edispensary.csv')
    return df

def fetch_gp_details():
    url = 'http://systems.hscic.gov.uk/data/ods/datadownloads/data-files/epraccur.zip'
    r = requests.get(url)
    z = zipfile.ZipFile(StringIO(r.content))
    df = pd.read_csv(z.open('epraccur.csv'))
    df.to_csv('datas/epraccur.csv')
    return df
def clean_prescribing_data(df):
    df.columns = [x.strip() for x in df.columns] #get rid of variable whitespace in the column names
    df = df[df['BNF NAME'].str.contains('statin')] #keep rows with 'statin' in the drug name
    df = df[~df['BNF NAME'].str.contains('Nystatin|Sandostatin|Ecostatin')] #throw away non-statin drugs that happen to match 'statin'
    df.to_csv('datas/StatinsJan2015.csv') #save the result
    return df
def clean_dispensing_practice_addresses(dpad):
    dpad = dpad['Dispensing Practices Address Details'].dropna()
    dpad = dpad.reset_index()
    del dpad['index']
    dpad['Dispensing Practices Address Details'] = dpad['Dispensing Practices Address Details'].str.strip()
    dpad['Dispensing Practices Address Details'] = dpad['Dispensing Practices Address Details'].str.replace('\n', ' ')
    dpad['NAME'] = dpad['Dispensing Practices Address Details'].str.split(',').str[0].str.upper()
    dpad['POSTCODE'] = dpad['Dispensing Practices Address Details'].str.split(',').str[-1].astype(str).str.strip()
    dpad.loc[254, 'POSTCODE'] = 'BN25 1HH' #one practice lacked a postcode, so we fix it manually
    dpad.loc[254, 'NAME'] = 'Old School Surgery'
    return dpad
def validate_dispensing_postcodes(dpad):
    #the set of postcodes that fail the UK postcode regex should be empty
    assert(len(dpad[~dpad.POSTCODE.str.contains(r'[A-Z]{1,2}[0-9R][0-9A-Z]? [0-9][A-Z]{2}')]) == 0)
#nb df = pd.read_csv('http://datagov.ic.nhs.uk/presentation/2015_01_January/T201501PDPI+BNFT.CSV') should also work but seems slower
In [65]:
#df = fetch_prescribing_data() #commented because requires internet and is slow
#df = clean_prescribing_data(df) #commented because need only run once and is slow
#dispdata = fetch_dispensary_data() #doesn't actually contain dispensing practices
#fetch_gp_details()
df = pd.read_csv('datas/StatinsJan2015.csv') #load cleaned prescribing data cut
gpdetail = pd.read_csv('datas/epraccur.csv') #from http://systems.hscic.gov.uk/data/ods/datadownloads/data-files/epraccur.zip
dpad_formatting_junk = ['Dispensing Practices Address Details', 'Primary Care Trust:', 'Report For:', 'Practice Name and Address', 'January 2015']
dpad = pd.read_excel('datas/Disp Pracs Name and Address 2015-01-31.xls', usecols=[0], na_values=dpad_formatting_junk) #load dispensing practice list
dpad = clean_dispensing_practice_addresses(dpad)
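#validate_dispensing_postcodes(dpad) #optional: the checker defined above (commented here; assumption, not run in the original) verifies every cleaned postcode matches the UK postcode pattern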
gpdetail = gpdetail.iloc[:, [1, 2, 10]] #throw away columns we don't care about
gpdetail.columns = ['PRACTICE', 'NAME', 'POSTCODE']
dpad = pd.merge(gpdetail, dpad, on=['NAME','POSTCODE']) #merge to add practice codes
df['DP'] = df.PRACTICE.isin(dpad.PRACTICE) #add column DP to identify dispensing practices
df['IR'] = df['BNF NAME'].str.contains('Rosuvastatin') #add column for is rosuvastatin
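In [ ]:
#Hedged check (assumption: not in the original notebook) that the NAME/POSTCODE merge
#matched a plausible number of dispensing practices, and a count of how many statin
#prescribing rows they contribute.
dpad.PRACTICE.nunique(), df.DP.sum()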
In [66]:
df = df[['PRACTICE','BNF NAME','ITEMS','DP', 'IR']]
In [39]:
#df['DP'] = df['DP'].astype('category', categories=['dispensing practice', 'not a dispensing practice']) try with newer version
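In [ ]:
#A minimal sketch of the categorical idea in the cell above, assuming a pandas version
#that provides pd.Categorical: map the boolean DP flag to readable labels rather than
#relying on astype('category', categories=...). Kept in a separate variable so the
#rest of the notebook is unaffected.
dp_label = pd.Categorical(df['DP'].map({True: 'dispensing practice', False: 'not a dispensing practice'}),
                          categories=['dispensing practice', 'not a dispensing practice'])
dp_label[:5]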
In [40]:
def boolstostrings(): #relabel the boolean DP/IR flags on the module-level df as short strings
    df['DP'] = df['DP'].astype(str).str.replace('False', 'NDP')
    df['DP'] = df['DP'].astype(str).str.replace('True', 'DP')
    df['IR'] = df['IR'].astype(str).str.replace('False', 'NIR')
    df['IR'] = df['IR'].astype(str).str.replace('True', 'IR')
In [41]:
#boolstostrings()
In [71]:
df.groupby(['DP','PRACTICE', 'IR']).ITEMS.sum()
Out[71]:
In [42]:
df1 = df.groupby(['DP','PRACTICE', 'IR']).ITEMS.sum()
#where a category has no items we get NaNs instead of the 0s we would prefer
df1 = df1.fillna(0)
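In [ ]:
#Toy illustration (assumption: made-up numbers, not the prescribing data) of where the
#NaNs come from: a practice with no rosuvastatin rows has no (PRACTICE, True) group at
#all, so the gap only appears as NaN once the IR level is unstacked into columns.
toy = DataFrame({'PRACTICE': ['A', 'A', 'B'], 'IR': [False, True, False], 'ITEMS': [10, 2, 5]})
toy.groupby(['PRACTICE', 'IR']).ITEMS.sum().unstack() #practice B gets NaN under True; fillna(0) turns that into 0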
In [47]:
df2 = df1.unstack()
In [53]:
df2.columns = ['off patent statin', 'rosuvastatin']
In [160]:
df2['Percentage'] = (df2['rosuvastatin'] / (df2['rosuvastatin'] + df2['off patent statin']) * 100)
In [153]:
df2.head()
Out[153]:
In [125]:
def practice_number_sanity_check():
    #the practices that prescribe rosuvastatin and those that don't should partition all practices
    all_practices = df['PRACTICE'].unique()
    rosuva_practices = df[df['BNF NAME'].str.contains('Rosuvastatin')]['PRACTICE'].unique()
    not_rosuva_practices = set(all_practices) - set(rosuva_practices)
    assert(len(all_practices) == len(rosuva_practices) + len(not_rosuva_practices))
In [154]:
df2[df2.rosuvastatin.isnull()]
Out[154]:
In [161]:
df2[df2['off patent statin'].isnull()]
Out[161]:
In [158]:
df2 = df2.fillna(0)
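In [ ]:
#Hedged sketch (assumption: not part of the original analysis) using the matplotlib
#import from the first cell: compare the rosuvastatin percentage for dispensing (True)
#and non-dispensing (False) practices.
df2.reset_index().boxplot(column='Percentage', by='DP')
plt.show()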