In [1]:
import itertools

import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Load the per-setting summary statistics from the Excel workbook.
df_param_stats = pd.read_excel("param_stats.xlsx")
# Column dtypes followed by the (rows, columns) shape, one per line.
print(df_param_stats.dtypes, df_param_stats.shape, sep="\n")
# Preview the first five rows as the cell output.
df_param_stats.head()
Out[2]:
In [3]:
import seaborn as sns
sns.__version__
# TODO: dot plot with confidence-interval lines for each parameter setting,
# or a violin plot: https://seaborn.pydata.org/generated/seaborn.violinplot.html
# Also, CLT: run the above N times and take the mean.
Out[3]:
In [4]:
# Distribution of mean_cuml for each d_threshold, split by m_nights.
ax = sns.violinplot(
    data=df_param_stats,
    x="d_threshold",
    y="mean_cuml",
    hue="m_nights",
)
In [5]:
# mean_cuml vs. d_threshold (coloured by m_nights), one panel per
# binary_eval (rows) x change_rate (columns), using only the rows where
# incl_contra is False.
ax = sns.catplot(
    data=df_param_stats[df_param_stats["incl_contra"] == False],
    x="d_threshold",
    y="mean_cuml",
    hue="m_nights",
    row="binary_eval",
    col="change_rate",
)
# plt.axhline(y=192, linewidth=5, color='red')
In [6]:
# Same faceted plot as for the non-contrarian runs, restricted to the rows
# where incl_contra is True.
ax = sns.catplot(
    data=df_param_stats[df_param_stats["incl_contra"] == True],
    x="d_threshold",
    y="mean_cuml",
    hue="m_nights",
    row="binary_eval",
    col="change_rate",
)
In [7]:
# Use a decision tree to identify the parameters/thresholds that give a high score (25+),
# e.g., change_rate == 0.2, m_nights == 10, etc.
In [8]:
import statsmodels.api as sm
# code template: http://blog.yhat.com/posts/logistic-regression-and-python.html
In [9]:
# Summary statistics of the outcome column, then a preview of the frame.
print(df_param_stats["mean_cuml"].describe())
df_param_stats.head()
Out[9]:
In [10]:
# List the available column names before building the classification frame.
df_param_stats.columns
Out[10]:
In [11]:
# Indicator columns for binary_eval, named with the "binary" prefix
# (the binary_True column is the one used downstream).
dummy_binary = pd.get_dummies(df_param_stats["binary_eval"], prefix="binary")
dummy_binary.head()
Out[11]:
In [12]:
# Indicator columns for incl_contra, named with the "contrarians" prefix
# (the contrarians_True column is the one used downstream).
dummy_contrary = pd.get_dummies(df_param_stats["incl_contra"], prefix="contrarians")
dummy_contrary.head()
Out[12]:
In [13]:
# Binary target: does the setting's mean_cuml exceed 192?
# NOTE(review): 192 is presumably the top-quartile cutoff of mean_cuml
# (hence the "top_q" prefix) -- confirm against the describe() output.
dummy_top_q = pd.get_dummies(df_param_stats["mean_cuml"].gt(192.), prefix="top_q")
dummy_top_q.head()
Out[13]:
In [14]:
# Numeric parameters carried over unchanged.
cols_to_keep = ['d_threshold', 'm_nights', 'change_rate']
# Append the three "True" indicator columns, aligned on the row index.
df_param_classify = (
    df_param_stats[cols_to_keep]
    .join(dummy_binary["binary_True"])
    .join(dummy_contrary["contrarians_True"])
    .join(dummy_top_q["top_q_True"])
)
df_param_classify.head()
Out[14]:
In [15]:
# Add a constant column so the logistic regression gets an intercept term;
# statsmodels' Logit does not add one on its own.
df_param_classify['intercept'] = 1.0
df_param_classify.columns
Out[15]:
In [16]:
# Put the target first so the predictors can be taken later as columns[1:].
df_param_classify = df_param_classify.loc[:, [
    'top_q_True',
    'd_threshold',
    'm_nights',
    'change_rate',
    'binary_True',
    'contrarians_True',
    'intercept',
]]
df_param_classify.head()
Out[16]:
In [17]:
# Every column after the target (first column) is a predictor, including
# the explicit intercept.
train_cols = df_param_classify.columns[1:]
# Cast to float explicitly: on pandas >= 2.0 get_dummies returns bool columns,
# and a frame mixing bool and float columns converts to an object array, which
# statsmodels refuses. On older pandas (uint8 dummies) the cast does not change
# the fitted values.
logit = sm.Logit(
    df_param_classify['top_q_True'].astype(float),
    df_param_classify[train_cols].astype(float),
)
# fit the model
result = logit.fit()
In [18]:
# Print the fitted model's summary table.
print(result.summary())
In [19]:
# odds ratios only (exponentiated coefficients: above 1 raises the odds of
# top_q_True, below 1 lowers them)
print(np.exp(result.params))
# Increases your shot at a good time:
#   - the more nights taken into consideration
#   - updating preferences as a binary decision: go / don't go
#   - BIG ONE: having contrarians in the population
# Decreases your shot at good times:
#   - making the threshold too tight / close to zero
#   - changing preferences too aggressively
In [20]:
from sklearn import tree
# code template: http://scikit-learn.org/stable/modules/tree.html
In [21]:
# df_param_classify.columns[1:-1]
# Feature matrix as a plain 2-D float ndarray. np.matrix is discouraged by
# NumPy and rejected by current scikit-learn input validation, and the float
# cast also covers bool dummy columns produced by newer pandas.
# Column order: d_threshold, m_nights, change_rate, binary_True, contrarians_True.
X = np.asarray(
    df_param_classify[['d_threshold', 'm_nights', 'change_rate', 'binary_True',
                       'contrarians_True']],
    dtype=float,
)
# Target vector: the top_q_True indicator.
y = np.array(df_param_classify.top_q_True)
In [22]:
# Sanity check: first row of the feature matrix.
X[0]
Out[22]:
In [23]:
# Fix the seed so the fitted tree is reproducible: scikit-learn permutes the
# features at every split, so ties between equally good splits can otherwise
# be broken differently from run to run.
clf = tree.DecisionTreeClassifier(random_state=0)
# Fit on the full data set (there is no train/test split in this notebook).
clf = clf.fit(X, y)
In [24]:
# Predicted class for one hand-written setting. Feature order matches X:
# d_threshold, m_nights, change_rate, binary_True, contrarians_True.
clf.predict([[-3. , 10. , 0.9, 1. , 1. ]])
Out[24]:
In [25]:
# Class probabilities for the same setting (one row, one column per class).
clf.predict_proba([[-3. , 10. , 0.9, 1. , 1. ]])
Out[25]:
In [26]:
# First (only) row, second class column -- presumably the probability of
# top_q_True being true; verify against clf.classes_.
clf.predict_proba([[-3. , 10. , 0.9, 1. , 1. ]])[0][1]
Out[26]:
In [27]:
# Candidate values for each model input, in the same order as the columns of
# the feature matrix the tree was fitted on.
ds = [-3, -2, -1]     # d_threshold
ms = [10., 5., 3.]    # m_nights
cr = [.9, .5, .2]     # change_rate
bn = [1., 0.]         # binary_eval indicator
ct = [1., 0.]         # incl_contra indicator

# Full Cartesian grid of settings. itertools.product varies the last input
# fastest, i.e. the same row order as five nested loops with ds outermost and
# ct innermost.
df_fun_time_probs = pd.DataFrame(
    list(itertools.product(ds, ms, cr, bn, ct)),
    columns=['d_threshold', 'm_nights', 'change_rate', 'binary_eval', 'incl_contra'],
)

# Score the whole grid in one call instead of one predict_proba call per row.
# Column 1 is the second class of the classifier -- presumably the
# "top_q_True is true" class; verify against clf.classes_.
df_fun_time_probs['fun_time'] = clf.predict_proba(
    np.asarray(df_fun_time_probs, dtype=float)
)[:, 1]

# Keep the list-of-dicts form available under its original name.
fun_time_probs = df_fun_time_probs.to_dict('records')
df_fun_time_probs.head()
Out[27]:
In [39]:
# df_fun_time_probs.to_excel("fun_time_probs.xlsx")
In [73]:
# Load the "Sheet3" worksheet of the high-performing-settings workbook.
df_param_contr = pd.read_excel("param_stats_high_perf.xlsx", sheet_name="Sheet3")
# Column dtypes followed by the (rows, columns) shape, one per line.
print(df_param_contr.dtypes, df_param_contr.shape, sep="\n")
df_param_contr.head()
Out[73]:
In [75]:
# mean_cuml per group (coloured by m_nights), one panel per
# binary_eval (rows) x change_rate (columns).
ax = sns.catplot(
    data=df_param_contr,
    x="group",
    y="mean_cuml",
    hue="m_nights",
    row="binary_eval",
    col="change_rate",
)
In [76]:
# Load the "Sheet4" worksheet of the same workbook. This rebinds
# df_param_contr, so later cells see Sheet4 rather than Sheet3.
df_param_contr = pd.read_excel("param_stats_high_perf.xlsx", sheet_name="Sheet4")
# Column dtypes followed by the (rows, columns) shape, one per line.
print(df_param_contr.dtypes, df_param_contr.shape, sep="\n")
df_param_contr.head()
Out[76]:
In [79]:
# mean_cuml per group, coloured by m_nights, one panel per
# binary_eval (rows) x change_rate (columns).
ax = sns.catplot(
    data=df_param_contr,
    x="group",
    y="mean_cuml",
    hue="m_nights",
    row="binary_eval",
    col="change_rate",
)
In [78]:
# Same faceting, but coloured by d_threshold instead of m_nights.
ax = sns.catplot(
    data=df_param_contr,
    x="group",
    y="mean_cuml",
    hue="d_threshold",
    row="binary_eval",
    col="change_rate",
)
In [50]:
# Number of rows with hist_cnt above 50.
print(np.sum(df_param_contr.hist_cnt > 50))
# Of those rows, how many have mean_cuml_hist greater than mean_cuml_contr.
# Both conditions go into a single mask: chaining df[mask1][mask2] applies a
# full-length mask to an already filtered frame, which pandas has to reindex
# (and warns about).
df_param_contr[
    (df_param_contr.hist_cnt > 50)
    & (df_param_contr.mean_cuml_hist > df_param_contr.mean_cuml_contr)
].shape[0]
Out[50]:
In [49]:
# Number of rows with hist_cnt below 50 (rows with exactly 50 fall in neither
# this cell nor the "> 50" one).
print(np.sum(df_param_contr.hist_cnt < 50))
# Of those rows, how many have mean_cuml_hist smaller than mean_cuml_contr.
# Both conditions go into a single mask: chaining df[mask1][mask2] applies a
# full-length mask to an already filtered frame, which pandas has to reindex
# (and warns about).
df_param_contr[
    (df_param_contr.hist_cnt < 50)
    & (df_param_contr.mean_cuml_hist < df_param_contr.mean_cuml_contr)
].shape[0]
Out[49]:
In [31]:
# List the column names of the currently loaded sheet.
df_param_contr.columns
Out[31]:
In [64]:
# Long-format half #1: the mean_cuml_hist values, tagged with hist == 1.
# rename() keeps a flat column Index (assigning a nested list to .columns
# builds a MultiIndex instead), and assign() returns a new frame, so the flag
# is never written onto a slice of df_param_contr.
df_sub_hist = (
    df_param_contr[['d_threshold', 'm_nights', 'change_rate', 'binary_eval', 'incl_contra',
                    'setting_run', 'hist_cnt', 'mean_cuml_hist']]
    .rename(columns={'mean_cuml_hist': 'mean_cuml'})
    .assign(hist=1)
)
print(df_sub_hist.shape)
df_sub_hist.head()
Out[64]:
In [65]:
# Long-format half #2: the mean_cuml_contr values, tagged with hist == 0.
# rename() keeps a flat column Index (assigning a nested list to .columns
# builds a MultiIndex instead), and assign() returns a new frame, so the flag
# is never written onto a slice of df_param_contr.
df_sub_contr = (
    df_param_contr[['d_threshold', 'm_nights', 'change_rate', 'binary_eval', 'incl_contra',
                    'setting_run', 'hist_cnt', 'mean_cuml_contr']]
    .rename(columns={'mean_cuml_contr': 'mean_cuml'})
    .assign(hist=0)
)
print(df_sub_contr.shape)
df_sub_contr.head()
Out[65]:
In [66]:
# Stack the hist (hist == 1) and contr (hist == 0) halves into one long frame;
# ignore_index=True renumbers the rows from 0 instead of repeating the
# original index twice.
frames = [df_sub_hist, df_sub_contr]
df_param_contr_longer = pd.concat(frames, ignore_index=True)
print(df_param_contr_longer.shape)
df_param_contr_longer.head()
Out[66]:
In [67]:
# The last rows come from df_sub_contr, the second frame in the concat.
df_param_contr_longer.tail()
Out[67]:
In [68]:
# Check the column dtypes of the stacked frame.
df_param_contr_longer.dtypes
Out[68]:
In [71]:
# (rows, columns) of the stacked frame.
df_param_contr_longer.shape
Out[71]:
In [59]:
# mean_cuml_contr vs. d_threshold (coloured by m_nights), one panel per
# binary_eval (rows) x change_rate (columns), from the wide-format frame.
ax = sns.catplot(
    data=df_param_contr,
    x="d_threshold",
    y="mean_cuml_contr",
    hue="m_nights",
    row="binary_eval",
    col="change_rate",
)
In [ ]: