In [1]:
import pandas as pd
In [2]:
df = pd.read_csv('files/all_claims_files.csv') # read consolidated claims file
In [3]:
# Add column for the file year since each row is a unique patient/year combination
df['year'] = df.file_name.map(lambda x: int(x[6:10]))
In [4]:
all_cols = df.columns # all columns of the Data Frame
In [5]:
payment_cols = ['MEDREIMB_IP', 'BENRES_IP', 'PPPYMT_IP', 'MEDREIMB_OP', 'BENRES_OP',
'PPPYMT_OP', 'MEDREIMB_CAR', 'BENRES_CAR', 'PPPYMT_CAR'] # Only select the payment-related columns
In [6]:
id_vars = ['DESYNPUF_ID', 'year'] # Use the DESYNPUF_ID/year as a unique key for the row
# Alternatively, just use the DESYNPUF_ID as a unique key for the patient
# Use list comprehension to select all non-payment related columns
# Alternative: id_vars = [x for x in all_cols if x not in payment_cols]
In [8]:
df.shape
Out[8]:
In [7]:
df[id_vars].drop_duplicates().shape # Verify uniqueness
Out[7]:
In [9]:
df.head()
Out[9]:
In [10]:
df_mlt = pd.melt(df, id_vars=id_vars, value_vars=payment_cols) # Melt the data!
In [12]:
df_mlt.shape
Out[12]:
In [13]:
# Split variable into two parts
df_mlt['payer'] = df_mlt['variable'].map(lambda x: x.split('_')[0])
df_mlt['service'] = df_mlt['variable'].map(lambda x: x.split('_')[1])
In [14]:
# Now can easily pivot in different ways
In [17]:
# format to show percentages
# Find all available options here: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.set_option.html
pd.set_option('display.float_format', '{:.2f}'.format)
In [18]:
pd.crosstab(df_mlt.year, df_mlt.service, values=df_mlt.value, aggfunc='sum', normalize=True)
Out[18]:
In [19]:
pd.set_option('display.float_format', '${:,.2f}'.format)
In [20]:
df_mlt.pivot_table(index='year', columns='service', values='value', aggfunc='sum', margins=True).applymap(float)
Out[20]:
In [ ]: