In [12]:

    
import pandas as pd
import networkx as nx
from collections import Counter

Read data in, do some preliminary cleanup. This is a 1.7GB file being read into memory, so it may break on 32-bit systems.



In [2]:

    
df = pd.read_csv('data_2012.txt',sep='\t',encoding='utf8',skiprows=[1],header=0)
df['npi'] = df['npi'].astype('str')
df['line_srvc_cnt'] = df['line_srvc_cnt'].astype('int')
df.head(2).T









    



/Users/brianckeegan/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1070: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)






    Out[2]:






  
    
      
      0
      1
    
  
  
    
      npi
                  1003000126
                  1003000126
    
    
      nppes_provider_last_org_name
                   ENKESHAFI
                   ENKESHAFI
    
    
      nppes_provider_first_name
                     ARDALAN
                     ARDALAN
    
    
      nppes_provider_mi
                         NaN
                         NaN
    
    
      nppes_credentials
                        M.D.
                        M.D.
    
    
      nppes_provider_gender
                           M
                           M
    
    
      nppes_entity_code
                           I
                           I
    
    
      nppes_provider_street1
                900 SETON DR
                900 SETON DR
    
    
      nppes_provider_street2
                         NaN
                         NaN
    
    
      nppes_provider_city
                  CUMBERLAND
                  CUMBERLAND
    
    
      nppes_provider_zip
                   215021854
                   215021854
    
    
      nppes_provider_state
                          MD
                          MD
    
    
      nppes_provider_country
                          US
                          US
    
    
      provider_type
           Internal Medicine
           Internal Medicine
    
    
      medicare_participation_indicator
                           Y
                           Y
    
    
      place_of_service
                           F
                           F
    
    
      hcpcs_code
                       99222
                       99223
    
    
      hcpcs_description
       Initial hospital care
       Initial hospital care
    
    
      line_srvc_cnt
                         115
                          93
    
    
      bene_unique_cnt
                         112
                          88
    
    
      bene_day_srvc_cnt
                         115
                          93
    
    
      average_Medicare_allowed_amt
                      135.25
                      198.59
    
    
      stdev_Medicare_allowed_amt
                           0
                           0
    
    
      average_submitted_chrg_amt
                         199
                         291
    
    
      stdev_submitted_chrg_amt
                           0
                    9.591663
    
    
      average_Medicare_payment_amt
                    108.1157
                      158.87
    
    
      stdev_Medicare_payment_amt
                   0.9005883
                           0
    
  

27 rows × 2 columns



In [4]:

    
len(df)









    Out[4]:





9153272

Make network

Iterate over the DataFrame and create a NetworkX DiGraph object containing doctors (npi), services they provide (hcpcs_code), and other assorted metadata. This is a bipartite network of doctors connected to services. This is extremely expensive to calculate and takes over 10 minutes to load.



In [58]:

    
g = nx.DiGraph()

for i in df.index:
    npi = df.ix[i,'npi']
    hcpcs_code = df.ix[i,'hcpcs_code']
    
    provider_type = df.ix[i,'provider_type']
    gender = df.ix[i,'nppes_provider_gender']
    state = df.ix[i,'nppes_provider_state']
    description = df.ix[i,'hcpcs_description']
    
    services_count = int(df.ix[i,'line_srvc_cnt'])
    beneficiaries_count = int(df.ix[i,'bene_unique_cnt'])
    total_charges = int(df.ix[i,'average_submitted_chrg_amt'] * services_count)
    total_payments = int(df.ix[i,'average_Medicare_payment_amt'] * services_count)

    g.add_edge(npi,hcpcs_code,s=services_count,b=beneficiaries_count,tc=total_charges,tp=total_payments)
    g.add_node(npi,nt='Person',pt=provider_type,g=gender,s=state,d='Person')
    g.add_node(hcpcs_code,nt='Procedure',pt='Procedure',g='Procedure',s='Procedure',d=description)

Print out the number of edges and nodes in the network. This is a big network.



In [59]:

    
g.number_of_edges(), g.number_of_nodes()









    Out[59]:





(8885068, 886593)

Write this graph object out to a graphml file so that we can visualize it in other programs if we wanted. Writing this to a gigantic file.



In [66]:

    
nx.write_graphml(g,'all_records.graphml')

Compute centrality statistics



In [45]:

    
idc = Counter(nx.in_degree_centrality(g).values())
odc = Counter(nx.out_degree_centrality(g).values())

Plot the distribution of services. Most services provide one service, there are a few doctors who claim to provide more than 10,000 services.



In [67]:

    
plt.scatter([i*(len(g)-1)+1 for i in idc.keys()],[i + 1 for i in idc.values()])
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Doctors providing service',fontsize=18)
plt.ylabel('Number of services',fontsize=18)









    Out[67]:





<matplotlib.text.Text at 0x1657dc410>

Plot the distribution of doctors and the number of services they provide. Vast majority of doctors provide between 1 and 10 services, some doctors in the tail provide more than 1000 services.



In [68]:

    
plt.scatter([i*(len(g)-1)+1 for i in odc.keys()],[i + 1 for i in odc.values()])
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Number of services provided',fontsize=18)
plt.ylabel('Number of doctors',fontsize=18)









    Out[68]:





<matplotlib.text.Text at 0x1664fd5d0>

Compute edge statistics

Plot the distribution of the number of beneficiaries in a doctor-service relationship. Most doctor-service relationships have 10 beneficiaries (owing to aggregation performed presumably for privacy reasons), but there are some doctors who provide services to more than 100,000 beneficiaries.



In [69]:

    
b_edges = Counter([i[2]['b'] for i in g.edges_iter(data=True)])

plt.scatter(b_edges.keys(),b_edges.values())
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Number of beneficiaries',fontsize=18)
plt.ylabel('Number of doctors',fontsize=18)









    Out[69]:





<matplotlib.text.Text at 0x1657a6790>

Plot the distribution of the number of services provided. Some doctors can provide a service multiple times to a patient. Again we see there is a long-tailed distribution with most doctors providing services 10 times but a handful of doctors providing more than a million services in a single year.



In [72]:

    
s_edges = Counter([i[2]['s'] for i in g.edges_iter(data=True)])

plt.scatter(s_edges.keys(),s_edges.values())
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Number of services performed',fontsize=18)
plt.ylabel('Number of doctors',fontsize=18)









    Out[72]:





<matplotlib.text.Text at 0x16763ed50>

Plot the distribution of the average value of charges that were paid out for a given service. In this long-tailed distribution, we see that there are thousands of doctors-service interactions where their total compensation was in the hundreds of dollars, but there are a handful of doctors that received over $10 million in total compensation.



In [70]:

    
tp_edges = Counter([round(i[2]['tp']+1,0) for i in g.edges_iter(data=True)])

plt.scatter(tp_edges.keys(),tp_edges.values())
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Total charges paid ($)',fontsize=18)
plt.ylabel('Number of doctors',fontsize=18)









    Out[70]:





<matplotlib.text.Text at 0x467ccc790>



In [71]:

    
tc_edges = Counter([round(i[2]['tc']+1,0) for i in g.edges_iter(data=True)])

plt.scatter(tc_edges.keys(),tc_edges.values())
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Total charges billed ($)',fontsize=18)
plt.ylabel('Number of doctors',fontsize=18)









    Out[71]:





<matplotlib.text.Text at 0x3cfbf44d0>



In [ ]:

	0	1
npi	1003000126	1003000126
nppes_provider_last_org_name	ENKESHAFI	ENKESHAFI
nppes_provider_first_name	ARDALAN	ARDALAN
nppes_provider_mi	NaN	NaN
nppes_credentials	M.D.	M.D.
nppes_provider_gender	M	M
nppes_entity_code	I	I
nppes_provider_street1	900 SETON DR	900 SETON DR
nppes_provider_street2	NaN	NaN
nppes_provider_city	CUMBERLAND	CUMBERLAND
nppes_provider_zip	215021854	215021854
nppes_provider_state	MD	MD
nppes_provider_country	US	US
provider_type	Internal Medicine	Internal Medicine
medicare_participation_indicator	Y	Y
place_of_service	F	F
hcpcs_code	99222	99223
hcpcs_description	Initial hospital care	Initial hospital care
line_srvc_cnt	115	93
bene_unique_cnt	112	88
bene_day_srvc_cnt	115	93
average_Medicare_allowed_amt	135.25	198.59
stdev_Medicare_allowed_amt	0	0
average_submitted_chrg_amt	199	291
stdev_submitted_chrg_amt	0	9.591663
average_Medicare_payment_amt	108.1157	158.87
stdev_Medicare_payment_amt	0.9005883	0