In [1]:
# Switch the kernel's working directory (quietly) so that the relative CSV path
# read in a later cell ('notebooks/...') resolves; presumably the `pf` package
# import also relies on this location — TODO confirm.
# NOTE(review): this is a machine-specific checkout path; adjust per environment.
%cd -q ~/GitHub/pfcompute/
In [2]:
%matplotlib inline
import sys
import time
import warnings
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import scipy.stats as st
from multiprocessing import Pool
# Notebook-wide display defaults: large figures, ggplot styling and long
# DataFrame printouts (up to 500 rows before truncation).
plt.rcParams['figure.figsize'] = (16.0, 12.0)
plt.style.use('ggplot')
pd.options.display.max_rows = 500
import pf.util
In [3]:
# Load the average personal finance table: the first CSV column becomes the
# (date-parsed) index, and every column containing a missing value is dropped.
avg_pf = pd.read_csv(
    'notebooks/average_personal_income_and_expense.csv',
    index_col=0,
    parse_dates=True,
).dropna(axis=1)
In [4]:
# Put the data on a daily grid: average within each day, then fill the empty
# days in both directions with a 5th-order polynomial interpolation.
daily_pf = (
    avg_pf
    .resample('D')
    .mean()
    .interpolate(method='polynomial', order=5, limit_direction='both')
)
In [5]:
# Day-over-day fractional change of every column. The first row has no
# predecessor and is dropped; infinite ratios (either sign) are replaced by 1.
daily_pf_change = (
    daily_pf
    .pct_change()
    .iloc[1:]
    .replace([np.inf, -np.inf], 1)
)
In [6]:
def par_dist_fit(x):
    """Fit a distribution to one column of data; used as the ``Pool.map`` worker.

    Parameters
    ----------
    x : tuple
        A ``(col, data)`` pair: the column label and the values to fit.

    Returns
    -------
    tuple
        ``(col, model, params)``, where ``model`` and ``params`` are the two
        values returned by ``pf.util.best_fit_distribution(data)``.
    """
    column_label, column_values = x
    best_model, best_params = pf.util.best_fit_distribution(column_values)
    return (column_label, best_model, best_params)
models = {}
# One (column label, column values) work item per series of daily changes.
# `DataFrame.items()` replaces `iteritems()`, which was removed in pandas 2.0.
col_data = [(col, data) for col, data in daily_pf_change.items()]
# Fit every column in parallel. The context manager tears the worker processes
# down even when a fit raises, so a failed run does not leak processes.
# `map` blocks until all results are collected, so nothing is lost on exit.
with Pool() as pool:
    fit_models = pool.map(par_dist_fit, col_data)
In [7]:
# Create random % changes for future
number_of_days_to_sim = 10*365
date_range = pd.date_range(start='2015-01-01', freq='D', periods=number_of_days_to_sim)
rand_pf = pd.DataFrame(None, columns=avg_pf.columns, index=date_range)
# Fixed seed so that a fresh "Restart & Run All" reproduces the same simulated
# paths; change the value to explore a different random scenario.
sim_rng = np.random.RandomState(42)
for col, model_name, params in fit_models:
    # Get the model (a distribution looked up by name in scipy.stats)
    model = getattr(st, model_name)
    # Separate parts of parameters: the last two entries are loc and scale,
    # anything before them is passed positionally as shape arguments
    arg = params[:-2]
    loc = params[-2]
    scale = params[-1]
    # Draw one simulated % change per day from the frozen distribution
    rvs = model(*arg, loc=loc, scale=scale).rvs(size=number_of_days_to_sim, random_state=sim_rng)
    rand_pf[col] = rvs
In [8]:
# Line chart of every retained column of the loaded data, without a legend
avg_pf.plot.line(legend=False)
Out[8]:
In [9]:
# Display years
size = 10*365
i = daily_pf.index.get_loc('2015-01-01')
# Calculate projected values from the anchor date. The simulated series are
# daily % changes, so they must be compounded (cumulative product of 1 + r),
# which is the inverse of `pct_change`; a cumulative sum would only add them.
projected_pf = (1 + rand_pf).cumprod().multiply(daily_pf.iloc[i]).iloc[:size]
# Show up to 2*size days of history before the anchor. Clamp at 0: a negative
# start would be interpreted by `iloc` as counting from the end of the frame.
history_start = max(i - 2*size, 0)
ax = daily_pf.iloc[history_start:i].plot(legend=False, color='r', alpha=0.25, logy=True, label='Historical')
projected_pf.plot(legend=False, color='b', alpha=0.25, logy=True, label='Projected', ax=ax)
Out[9]:
In [11]:
# Determine max plot size: one stacked subplot per fitted column
plotMax = len(fit_models)
plt.figure(figsize=(12, 6*plotMax))
for i, (col, model_name, params) in enumerate(fit_models):
    # Get the model and PDF
    model = getattr(st, model_name)
    pdf = pf.util.make_pdf(model, params)
    # Plot the PDF, Historical and Projected Histogram
    plt.subplot(plotMax, 1, i+1)
    ax = pdf.plot(label='Model PDF', lw=2, legend=True)
    # `density=True` normalises each histogram so it is comparable to the PDF;
    # it replaces the `normed` keyword, which matplotlib removed in 3.1.
    daily_pf_change[col].dropna().plot(kind='hist', bins=64, alpha=0.7, density=True, label='Historical', legend=True, ax=ax)
    rand_pf[col].dropna().plot(kind='hist', bins=64, color='k', alpha=0.3, density=True, label='Projected', legend=True, ax=ax)
    # Title: column name, then the distribution name with its fitted parameters
    ax.set_title('{}\n{}({})'.format(col, model_name, ', '.join(['{:0.4f}'.format(x) for x in params])))