In [113]:
##Some code to run at the beginning of the file, to be able to show images in the notebook
##Don't worry about this cell
#Print the plots in this screen
%matplotlib inline
#Be able to plot images saved in the hard drive
from IPython.display import Image
#Make the notebook wider
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
import seaborn as sns
import pylab as plt
import pandas as pd
import numpy as np
import scipy.stats
import statsmodels.formula.api as smf
In [114]:
def qq_plot(x):
    """Draw a normal Q-Q plot of ``x``.

    Plots the sample's ordered values against the theoretical normal
    quantiles, together with the least-squares line fitted by scipy.
    Points close to the line indicate approximate normality.
    """
    (theo_q, obs_q), (slope, intercept, _r) = scipy.stats.probplot(x, dist='norm', plot=None)
    # dots: data quantiles; line: the fitted reference line
    plt.plot(theo_q, obs_q, '.', theo_q, slope * theo_q + intercept)
    plt.xlabel('Quantiles', fontsize=14)
    plt.ylabel('Quantiles Obs', fontsize=14)
F) No outliers
While normality of the independent variables is not itself a regression assumption, it makes the other assumptions easier to satisfy.
Source: https://www.analyticsvidhya.com/blog/2016/07/deeper-regression-analysis-assumptions-plots-solutions/
In [58]:
# Load the big3 positions dataset (tab-separated) and drop rows with any missing values
# NOTE(review): path is relative to the notebook -- assumes ../class5/data exists; confirm
df = pd.read_csv("../class5/data/big3_position.csv",sep="\t").dropna()
df.head()
Out[58]:
In [59]:
#How to run a regression (be careful, this is wrong)
# Wrong on purpose: Revenue and Employees are highly correlated (multicollinearity,
# shown in the next cell) and MarketCap is far from normal (Q-Q plot below),
# so the raw-scale additive model is misspecified.
mod = smf.ols(formula='MarketCap ~ Revenue + Employees', data=df)
res = mod.fit()
print(res.summary())
In [60]:
#Our independent variables are highly correlated
# pearsonr returns (correlation coefficient, two-sided p-value)
scipy.stats.pearsonr(df["Revenue"],df["Employees"])
Out[60]:
In [61]:
#Residuals vs fitted
# Any visible pattern (funnel, curve) signals heteroscedasticity or misspecification
plt.scatter(res.predict(), res.resid)
Out[61]:
In [62]:
#And it's caused by this -> Very very far from normal, it's not additive
# Q-Q plot of raw MarketCap: strong departure from the reference line,
# so an additive model on the raw scale is inappropriate
qq_plot(df["MarketCap"])
In [63]:
# Log-transform the skewed variables so the additive/normality assumptions hold better.
# log10 of zero or negative values produces -inf/nan, so keep only finite rows.
df["log_MarketCap"] = np.log10(df["MarketCap"])
df["log_Revenue"] = np.log10(df["Revenue"])
df["log_Employees"] = np.log10(df["Employees"])
df = df.loc[np.isfinite(df["log_MarketCap"])]
df = df.loc[np.isfinite(df["log_Revenue"])]
# Bug fix: log_Employees is used in every model below but was never filtered,
# so -inf/nan rows could leak into the regressions.
df = df.loc[np.isfinite(df["log_Employees"])]
In [64]:
# After the log10 transform both variables are much closer to normal
# NOTE(review): both Q-Q plots are drawn on the same axes in this cell --
# the two point clouds overlap in one figure
qq_plot(df["log_MarketCap"])
qq_plot(df["log_Employees"])
In [65]:
#This now works: a single log-scale predictor avoids the collinearity
# and normality problems of the raw-scale model above
mod = smf.ols(formula='log_MarketCap ~ log_Employees', data=df)
res = mod.fit()
print(res.summary())
In [12]:
# Residuals vs fitted values with a regression line: no trend expected if linearity holds.
# seaborn >= 0.12 requires x/y as keyword arguments (positional use was removed).
sns.regplot(x=res.predict(), y=res.resid)
plt.ylabel('Residual')
plt.xlabel('Fitted values')
Out[12]:
In [13]:
# NOTE(review): this cell duplicates the previous one -- candidate for deletion.
# seaborn >= 0.12 requires x/y as keyword arguments (positional use was removed).
sns.regplot(x=res.predict(), y=res.resid)
plt.ylabel('Residual')
plt.xlabel('Fitted values')
Out[13]:
In [14]:
#Normality
# Histogram of the residuals: roughly bell-shaped and centered at 0 if normal.
# sns.distplot was deprecated in seaborn 0.11 and later removed;
# histplot (count stat, no KDE by default) is the direct replacement.
sns.histplot(res.resid)
plt.ylabel('Count')
plt.xlabel('Normalized residuals')
plt.xlim((-3,3))
Out[14]:
In [15]:
#Normality 2
# Q-Q plot of the residuals: points on the line => approximately normal residuals
qq_plot(res.resid)
In [16]:
# Scatter matrix of the two logged predictors -- visual check for collinearity
sns.pairplot(df.loc[:,["log_Revenue","log_Employees"]])
Out[16]:
In [17]:
#Larger cook distance, larger influence
# Cook's distance per observation: how much the fitted model would change if
# that observation were dropped. Tall spikes flag influential points.
influence = res.get_influence()
cooks_d, _pvalues = influence.cooks_distance
plt.vlines(np.arange(len(cooks_d)), ymin=0, ymax=cooks_d, color="gray")
plt.ylabel("Distance")
plt.xlabel("Observation")
Out[17]:
In [18]:
#Larger leverage, larger influence. If high residual and high leverage -> Probably affecting the model
# Import only the function we use instead of a wildcard import
from statsmodels.graphics.regressionplots import influence_plot
influence_plot(res)
plt.show()

# Reference: common model specifications with the formula API (not fitted here).
# The original cell rebound `mod` repeatedly so only the last assignment survived,
# and the logit line referenced a nonexistent column 'xx', which raises at model
# construction. Kept as comments for syntax reference only.
# mod = smf.ols(formula='log_MarketCap ~ log_Employees', data=df)                # one predictor
# mod = smf.ols(formula='log_MarketCap ~ log_Employees + log_Revenue', data=df)  # two predictors
# mod = smf.ols(formula='log_MarketCap ~ log_Employees * log_Revenue', data=df)  # interaction + main effects
# mod = smf.ols(formula='log_MarketCap ~ log_Employees + C(Position)', data=df)  # categorical covariate
# mod = smf.logit(formula='binary_outcome ~ log_Employees + C(Position)', data=df)  # binary outcome
# mod = smf.mixedlm(formula='log_MarketCap ~ log_Employees', data=df, groups="Position")  # random intercept
# mod = smf.mixedlm(formula='log_MarketCap ~ log_Employees', data=df, groups="Position", re_formula="Year")  # random slope
In [115]:
# Toy dataset for the mixed-effects example
# NOTE(review): presumably has columns wealth, happiness, country (used below) -- confirm
df = pd.read_csv("data/test_mixed.csv")
df.head()
Out[115]:
In [93]:
# Pooled regression that ignores the country grouping (contrast with the mixedlm cell below).
# seaborn >= 0.12 requires x/y as keyword arguments (positional use was removed).
sns.lmplot(x="wealth", y="happiness", data=df)
plt.show()
mod = smf.ols(formula='happiness ~ wealth', data=df)  # ols has no groups argument
res = mod.fit()
res.summary()
Out[93]:
In [91]:
# Same data, one regression line per country -- the slopes/intercepts differ by group.
# seaborn >= 0.12 requires x/y as keyword arguments (positional use was removed).
sns.lmplot(x="wealth", y="happiness", data=df, hue="country")
plt.show()
# Mixed-effects model: random intercept per country
mod = smf.mixedlm(formula='happiness ~ wealth', data=df,groups="country")
res = mod.fit()
res.summary()
Out[91]: