In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('files/all_claims_files.csv') # read consolidated claims file

In [3]:
# Add column for the file year since each row is a unique patient/year combination
df['year'] = df.file_name.map(lambda x: int(x[6:10]))

In [4]:
all_cols = df.columns # all columns of the Data Frame

In [5]:
payment_cols = ['MEDREIMB_IP', 'BENRES_IP', 'PPPYMT_IP', 'MEDREIMB_OP', 'BENRES_OP',
       'PPPYMT_OP', 'MEDREIMB_CAR', 'BENRES_CAR', 'PPPYMT_CAR'] # Only select the payment-related columns

In [6]:
id_vars = ['DESYNPUF_ID', 'year'] # Use the DESYNPUF_ID/year as a unique key for the row
# Alternatively, just use the DESYNPUF_ID as a unique key for the patient
# Use list comprehension to select all non-payment related columns
# Alternative: id_vars = [x for x in all_cols if x not in payment_cols]

In [8]:
df.shape


Out[8]:
(6000, 34)

In [7]:
df[id_vars].drop_duplicates().shape # Verify uniqueness


Out[7]:
(6000, 2)

In [9]:
df.head()


Out[9]:
DESYNPUF_ID BENE_BIRTH_DT BENE_DEATH_DT BENE_SEX_IDENT_CD BENE_RACE_CD BENE_ESRD_IND SP_STATE_CODE BENE_COUNTY_CD BENE_HI_CVRAGE_TOT_MONS BENE_SMI_CVRAGE_TOT_MONS ... BENRES_IP PPPYMT_IP MEDREIMB_OP BENRES_OP PPPYMT_OP MEDREIMB_CAR BENRES_CAR PPPYMT_CAR file_name year
0 00013D2EFD8E45D1 19230501 NaN 1 1 0 26 950 12 12 ... 0 0 50 10 0 0 0 0 DE1_0_2008_Beneficiary_Summary_File_Sample_1 2008
1 00016F745862898F 19430101 NaN 1 1 0 39 230 12 12 ... 0 0 0 0 0 700 240 0 DE1_0_2008_Beneficiary_Summary_File_Sample_1 2008
2 0001FDD721E223DC 19360901 NaN 2 1 0 39 280 12 12 ... 0 0 0 0 0 0 0 0 DE1_0_2008_Beneficiary_Summary_File_Sample_1 2008
3 00021CA6FF03E670 19410601 NaN 1 5 0 6 290 0 0 ... 0 0 0 0 0 0 0 0 DE1_0_2008_Beneficiary_Summary_File_Sample_1 2008
4 00024B3D2352D2D0 19360801 NaN 1 1 0 52 590 12 12 ... 0 0 30 40 0 220 80 0 DE1_0_2008_Beneficiary_Summary_File_Sample_1 2008

5 rows × 34 columns


In [10]:
df_mlt = pd.melt(df, id_vars=id_vars, value_vars=payment_cols) # Melt the data!

In [12]:
df_mlt.shape


Out[12]:
(54000, 4)

In [13]:
# Split variable into two parts
df_mlt['payer'] = df_mlt['variable'].map(lambda x: x.split('_')[0])
df_mlt['service'] = df_mlt['variable'].map(lambda x: x.split('_')[1])

In [14]:
# Now can easily pivot in different ways

In [17]:
# format to show percentages
# Find all available options here: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.set_option.html
pd.set_option('display.float_format', '{:.2f}'.format)

In [18]:
pd.crosstab(df_mlt.year, df_mlt.service, values=df_mlt.value, aggfunc='sum', normalize=True)


Out[18]:
service CAR IP OP
year
2008 0.11 0.20 0.07
2009 0.13 0.20 0.08
2010 0.08 0.09 0.04

In [19]:
pd.set_option('display.float_format', '${:,.2f}'.format)

In [20]:
df_mlt.pivot_table(index='year', columns='service', values='value', aggfunc='sum', margins=True).applymap(float)


Out[20]:
service CAR IP OP All
year
2008 $3,231,130.00 $5,799,978.00 $1,956,540.00 $10,987,648.00
2009 $3,627,740.00 $5,633,954.00 $2,318,570.00 $11,580,264.00
2010 $2,238,090.00 $2,631,430.00 $1,223,250.00 $6,092,770.00
All $9,096,960.00 $14,065,362.00 $5,498,360.00 $28,660,682.00

In [ ]: