In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('files/all_claims_files.csv')

In [3]:
df['year'] = df['file_name'].map(lambda x: x[6:10])

In [4]:
df_sum = df.groupby('year')['MEDREIMB_IP'].sum()

In [5]:
df_sum


Out[5]:
year
2008    5065290
2009    4932150
2010    2225930
Name: MEDREIMB_IP, dtype: int64

In [6]:
# Use the pd.to_datetime function to convert to time series
df_sum.index = pd.to_datetime(df_sum.index)

In [7]:
df_sum.shift() # Shift inpatient costs one year forward


Out[7]:
year
2008-01-01          NaN
2009-01-01    5065290.0
2010-01-01    4932150.0
Name: MEDREIMB_IP, dtype: float64

In [8]:
df_sum.shift(-1) # Lag inpatient costs by a year


Out[8]:
year
2008-01-01    4932150.0
2009-01-01    2225930.0
2010-01-01          NaN
Name: MEDREIMB_IP, dtype: float64

In [9]:
df_sum.cumsum() # Calculate cumulative sum


Out[9]:
year
2008-01-01     5065290
2009-01-01     9997440
2010-01-01    12223370
Name: MEDREIMB_IP, dtype: int64

In [10]:
# Rolling is the equivalent of groupby for time series
# the window parameter states how many previous values are needed to calculate the recursion
# the rolling two-year sum is calculated below
df_sum.rolling(window=2).sum()


Out[10]:
year
2008-01-01          NaN
2009-01-01    9997440.0
2010-01-01    7158080.0
Name: MEDREIMB_IP, dtype: float64

In [11]:
df_sum.rolling(window=2).mean() # calculate rolling average


Out[11]:
year
2008-01-01          NaN
2009-01-01    4998720.0
2010-01-01    3579040.0
Name: MEDREIMB_IP, dtype: float64

In [12]:
def yoy_ratio(x):
    return x[1] / x[0]

In [13]:
df_sum.rolling(2).apply(yoy_ratio) # calculate the year-over-year increase in inpatient costs


Out[13]:
year
2008-01-01         NaN
2009-01-01    0.973715
2010-01-01    0.451310
Name: MEDREIMB_IP, dtype: float64

In [14]:
def yoy_trend(x):
    return (x[1] / x[0]) - 1

In [15]:
df_sum.rolling(2).apply(yoy_trend) # calculate the period-to-period trend


Out[15]:
year
2008-01-01         NaN
2009-01-01   -0.026285
2010-01-01   -0.548690
Name: MEDREIMB_IP, dtype: float64

In [17]:
def rolling_GM(x):
    return (x[0] * x[1]) ** .5

In [18]:
df_sum.rolling(2).apply(rolling_GM) # calculate a custom window function i.e. rolling geometric mean


Out[18]:
year
2008-01-01             NaN
2009-01-01    4.998277e+06
2010-01-01    3.313400e+06
Name: MEDREIMB_IP, dtype: float64

In [20]:
df_sum.plot();



In [ ]: