In [4]:
import pymc3 as pm
import numpy as np
import pandas as pd
from theano import shared
import scipy.stats as stats
import matplotlib.pyplot as plt
import arviz as az
plt.style.use('ggplot')
In [5]:
df_movies = pd.read_excel("rundown2.xlsx", sheet_name="list")
df_movies_2018 = df_movies[df_movies.year == 2018]
df_movies_2018.head()
Out[5]:
In [48]:
x = df_movies_2018.opening / 1e6
y = df_movies_2018.gross / 1e6
_, ax = plt.subplots(1, 2, figsize=(12, 6))
ax[0].plot(x, y, 'C0.')
ax[0].set_xlabel('opening 2018')
ax[0].set_ylabel('total 2018', rotation=90)
# ax[0].plot(x, y_real, 'k')
az.plot_kde(y, ax=ax[1])
ax[1].set_xlabel('total 2018')
plt.tight_layout()
In [7]:
with pm.Model() as model_g:
α = pm.Normal('α', mu=0, sd=10)
β = pm.Normal('β', mu=0, sd=1)
ϵ = pm.HalfCauchy('ϵ', 5)
μ = pm.Deterministic('μ', α + β * x)
y_pred = pm.Normal('y_pred', mu=μ, sd=ϵ, observed=y)
trace_g = pm.sample(2000, tune=1000)
In [8]:
varnames=['α', 'β', 'ϵ']
az.plot_trace(trace_g, varnames)
Out[8]:
In [9]:
pm.autocorrplot(trace_g, varnames)
Out[9]:
In [10]:
pm.summary(trace_g, varnames)
Out[10]:
In [54]:
plt.figure(figsize=(8,8))
plt.plot(x, y, 'b.');
alpha_m = trace_g['α'].mean()
beta_m = trace_g['β'].mean()
plt.plot(x, alpha_m + beta_m * x, c='k', label='y = {:.2f} + {:.2f} * x'.format(alpha_m, beta_m))
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$y$', fontsize=16, rotation=0)
plt.legend(loc=2, fontsize=14)
Out[54]:
In [12]:
ppc = pm.sample_posterior_predictive(trace_g, samples=1000, model=model_g)
In [13]:
idx = np.argsort(x)
x_ord = x[idx]
In [53]:
plt.figure(figsize=(8,8))
plt.plot(x, y, 'b.')
plt.plot(x, alpha_m + beta_m * x, c='k', label='y = {:.2f} + {:.2f} * x'.format(alpha_m, beta_m))
sig0 = pm.hpd(ppc['y_pred'], alpha=0.5)[idx]
sig1 = pm.hpd(ppc['y_pred'], alpha=0.05)[idx]
plt.fill_between(x_ord, sig0[:,0], sig0[:,1], color='gray', alpha=1)
plt.fill_between(x_ord, sig1[:,0], sig1[:,1], color='gray', alpha=0.5)
plt.xlabel('opening 2018', fontsize=16)
plt.ylabel('total 2018', fontsize=16, rotation=90)
Out[53]:
In [52]:
x_st = ( x - x.mean() ) / x.std()
y_st = ( y - y.mean() ) / y.std()
_, ax = plt.subplots(1, 2, figsize=(12, 6))
ax[0].plot(x_st, y_st, 'C0.')
ax[0].set_xlabel('opening 2018 (st)')
ax[0].set_ylabel('total 2018 (st)', rotation=90)
# ax[0].plot(x, y_real, 'k')
az.plot_kde(y_st, ax=ax[1])
ax[1].set_xlabel('total 2018')
plt.tight_layout()
In [26]:
with pm.Model() as model_g_st:
α = pm.Normal('α', mu=0, sd=10)
β = pm.Normal('β', mu=0, sd=1)
ϵ = pm.HalfCauchy('ϵ', 5)
μ = pm.Deterministic('μ', α + β * x_st)
y_pred = pm.Normal('y_pred', mu=μ, sd=ϵ, observed=y_st)
trace_g_st = pm.sample(2000, tune=1000)
In [27]:
varnames=['α', 'β', 'ϵ']
az.plot_trace(trace_g_st, varnames)
Out[27]:
In [28]:
pm.autocorrplot(trace_g_st, varnames)
Out[28]:
In [29]:
pm.summary(trace_g_st, varnames)
Out[29]:
In [50]:
plt.figure(figsize=(8,8))
plt.plot(x_st, y_st, 'b.');
alpha_m_st = trace_g_st['α'].mean()
beta_m_st = trace_g_st['β'].mean()
plt.plot(x_st, alpha_m_st + beta_m_st * x_st, c='k', label='y_st = {:.2f} + {:.2f} * x_st'.format(alpha_m_st, beta_m_st))
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$y$', fontsize=16, rotation=0)
plt.legend(loc=2, fontsize=14)
Out[50]:
In [40]:
np.mean((y_hat, y_hat_dst), axis=0)
Out[40]:
In [42]:
y_hat = alpha_m + beta_m * x
y_hat_dst = ((alpha_m_st + beta_m_st * x_st) * y.std()) + y.mean()
# mean absolute percentage diff bt standardized, non-standardized models
# not interested in identifying superior accuracy given ease of use of non-standardized model
np.mean(np.abs(y_hat - y_hat_dst) / np.mean((y_hat, y_hat_dst), axis=0))
Out[42]:
In [51]:
plt.figure(figsize=(8,8))
plt.plot(x, y, 'b.');
alpha_m = trace_g['α'].mean()
beta_m = trace_g['β'].mean()
plt.plot(x, alpha_m + beta_m * x, c='k', label='y = {:.2f} + {:.2f} * x'.format(alpha_m, beta_m))
plt.plot(x, y_hat_dst, label='y de-standardized')
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$y$', fontsize=16, rotation=0)
plt.legend(loc=2, fontsize=14)
Out[51]:
In [ ]:
In [13]:
with pm.Model() as model_t:
alpha = pm.Normal('alpha', mu=0, sd=10)
beta = pm.Normal('beta', mu=0, sd=1)
epsilon = pm.HalfCauchy('epsilon', 5)
nu = pm.Deterministic('nu', pm.Exponential('nu_', 1/29) + 1)
y_pred = pm.StudentT('y_pred', mu=alpha + beta * x, sd=epsilon, nu=nu, observed=y)
# start = pm.find_MAP()
# step = pm.NUTS(scaling=start)
trace_t = pm.sample(2000, tune=1000)
In [15]:
varnames_t = ['alpha', 'beta', 'epsilon', 'nu']
pm.traceplot(trace_t, varnames_t)
Out[15]:
In [17]:
pm.summary(trace_t, varnames_t)
Out[17]:
In [18]:
df_movies_2017 = df_movies[df_movies.year == 2017]
df_movies_2017.head()
Out[18]:
In [24]:
x_17 = df_movies_2017.opening / 1e6
y_17 = df_movies_2017.gross / 1e6
_, ax = plt.subplots(1, 2, figsize=(8, 4))
ax[0].plot(x_17, y_17, 'C0.')
ax[0].set_xlabel('opening 2017')
ax[0].set_ylabel('total 2017', rotation=90)
# ax[0].plot(x, y_real, 'k')
az.plot_kde(y, ax=ax[1])
ax[1].set_xlabel('total 2017')
plt.tight_layout()
In [39]:
alpha_mt = trace_t['alpha'].mean()
beta_mt = trace_t['beta'].mean()
plt.figure(figsize=(8,8))
plt.plot(x_17, y_17, '.')
plt.plot(x_17, alpha_m + beta_m * x_17, label="Gaussian")
plt.plot(x_17, alpha_mt + beta_mt * x_17, label='robust')
plt.legend(loc=2, fontsize=12)
plt.tight_layout()
In [54]:
def rmse(y_s, y_hat):
return np.sqrt(np.mean(np.square(y_s - y_hat)))
rmse(y_17, alpha_m + beta_m * x_17), rmse(y_17, alpha_mt + beta_mt * x_17)
Out[54]:
In [71]:
df_movies.pivot_table('year_total', 'year').sort_index(ascending=False).head(12)
Out[71]:
In [80]:
not_training = df_movies.year != 2018
ten_bill_plus = df_movies.year >= 2009
df_movies_test = df_movies[not_training & ten_bill_plus]
df_movies_test.describe()
Out[80]:
In [81]:
x_test = df_movies_test.opening / 1e6
y_test = df_movies_test.gross / 1e6
_, ax = plt.subplots(1, 2, figsize=(12, 6))
ax[0].plot(x_test, y_test, 'C0.')
ax[0].set_xlabel('opening 2009 - 17')
ax[0].set_ylabel('total 2009 - 17', rotation=90)
# ax[0].plot(x, y_real, 'k')
az.plot_kde(y, ax=ax[1])
ax[1].set_xlabel('total 2009 - 17')
plt.tight_layout()
In [82]:
plt.plot(x_test, y_test, '.')
plt.plot(x_test, alpha_m + beta_m * x_test, label="Gaussian")
plt.plot(x_test, alpha_mt + beta_mt * x_test, label='robust')
plt.legend(loc=2, fontsize=12)
plt.tight_layout()
In [83]:
rmse(y_test, alpha_m + beta_m * x_test), rmse(y_test, alpha_mt + beta_mt * x_test)
Out[83]:
In [85]:
df_movies_2019 = pd.read_excel("../../20190520_wta_movies/rundown2.xlsx", sheet_name="2019_YTD (Oct 16)")
df_movies_2019 = df_movies_2019[df_movies_2019.year == 2019]
print(df_movies_2019.shape)
df_movies_2019.head()
Out[85]:
In [90]:
df_movies_2019_closed = df_movies_2019[df_movies_2019.close != "-"]
print(df_movies_2019_closed.shape)
df_movies_2019_closed.head(10)
Out[90]:
In [91]:
x_19 = df_movies_2019_closed.opening / 1e6
y_19 = df_movies_2019_closed.gross / 1e6
_, ax = plt.subplots(1, 2, figsize=(12, 6))
ax[0].plot(x_19, y_19, 'C0.')
ax[0].set_xlabel('opening 2019')
ax[0].set_ylabel('total 2019', rotation=90)
# ax[0].plot(x, y_real, 'k')
az.plot_kde(y, ax=ax[1])
ax[1].set_xlabel('total 2019')
plt.tight_layout()
In [92]:
plt.figure(figsize=(8,8))
plt.plot(x_19, y_19, '.')
plt.plot(x_19, alpha_m + beta_m * x_19, label="Gaussian")
plt.legend(loc=2, fontsize=12)
plt.tight_layout()
In [99]:
jw3_opening = df_movies_2019_closed.loc[10, 'opening']
alpha_m + beta_m * jw3_opening
Out[99]:
In [100]:
jw3_total = df_movies_2019_closed.loc[10, 'gross']
jw3_total
Out[100]:
In [ ]: