imports
In [1]:
%matplotlib inline
import statsmodels.api as sm
from statsmodels.formula.api import ols
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
sns.set(style="white")
In [2]:
immer_ds = sm.datasets.get_rdataset("immer", "MASS")
In [3]:
print(immer_ds.__doc__)
In [4]:
immer_data = immer_ds.data
immer_ds.data
Out[4]:
In [5]:
immer_data.describe()
Out[5]:
Is Manchuria yielding more crop than Trebi?
In [6]:
manchuria_yield = immer_data.Y1[immer_data.Var == "M"]
trebi_yield = immer_data.Y1[immer_data.Var == "T"]
In [7]:
sns.distplot(manchuria_yield, rug=True, hist=False, label="manchuria")
sns.distplot(trebi_yield, rug=True, hist=False, label="trebi")
Out[7]:
In [8]:
t, p = stats.ttest_ind(manchuria_yield, trebi_yield)
print(t, p)
How does it work? For two samples of equal size $n$ the statistic is
$$ t = \frac{\bar{X}_1 - \bar{X}_2}{s_{X_1X_2}\sqrt{\frac{2}{n}}} $$
where $s_{X_1X_2} = \sqrt{\frac{s_{X_1}^2 + s_{X_2}^2}{2}}$ is the pooled standard deviation.
What does it really mean? The further apart the means are, the higher the statistic gets.
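To see there is no magic here, we can reproduce the statistic from the formula by hand. This is a minimal sketch assuming equal sample sizes and equal variances (both crops were measured at the same set of locations, so the two samples are the same size), compared against the scipy result:
In [ ]:
# a minimal sketch of the formula above: pooled standard deviation and the t statistic,
# assuming equal sample sizes and equal variances
n = len(manchuria_yield)
pooled_sd = np.sqrt((manchuria_yield.var(ddof=1) + trebi_yield.var(ddof=1)) / 2.0)
t_manual = (manchuria_yield.mean() - trebi_yield.mean()) / (pooled_sd * np.sqrt(2.0 / n))
print(t_manual, stats.ttest_ind(manchuria_yield, trebi_yield)[0])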
In [9]:
for rv2_mean in range(0, 5):
    rvs1 = stats.norm.rvs(loc=0, scale=1, size=500)
    rvs2 = stats.norm.rvs(loc=rv2_mean, scale=1, size=500)
    t, p = stats.ttest_ind(rvs1, rvs2)
    plt.figure()
    sns.distplot(rvs1, hist=False, label="variable 1")
    sns.distplot(rvs2, hist=False, label="variable 2", axlabel="t = %g, p = %g" % (t, p))
    plt.xlim([-9, 9])
    plt.ylim([0, 0.45])
In general, the less overlap there is between the two distributions, the higher the statistic.
In [10]:
for rvs_spread in range(1, 6):
    rvs1 = stats.norm.rvs(loc=0, scale=rvs_spread, size=500)
    rvs2 = stats.norm.rvs(loc=1, scale=rvs_spread, size=500)
    t, p = stats.ttest_ind(rvs1, rvs2)
    plt.figure()
    sns.distplot(rvs1, hist=False, label="variable 1")
    sns.distplot(rvs2, hist=False, label="variable 2", axlabel="t = %g, p = %g" % (t, p))
    plt.xlim([-12, 12])
    plt.ylim([0, 0.45])
The larger the sample size, the higher the statistic.
In [11]:
for sample_size in range(100, 500, 100):
    rvs1 = stats.norm.rvs(loc=0, scale=1, size=sample_size)
    rvs2 = stats.norm.rvs(loc=1, scale=1, size=sample_size)
    t, p = stats.ttest_ind(rvs1, rvs2)
    plt.figure()
    sns.distplot(rvs1, hist=False, label="variable 1")
    sns.distplot(rvs2, hist=False, label="variable 2", axlabel="t = %g, p = %g" % (t, p))
    plt.xlim([-12, 12])
    plt.ylim([0, 0.45])
A larger sample size not only increases the statistic, it also increases the degrees of freedom, which are used for calculating the p-value.
In [12]:
dfs = range(10, 300, 10)
ps = [2 * stats.t.sf(2, df) for df in dfs]  # two-sided p-value for a fixed t of 2
plt.plot(ps, dfs)
plt.xlabel("p-value (for t = 2)")
plt.ylabel("degrees of freedom")
Out[12]:
Even the tiniest effect size can be statistically significant if you collect enough samples.
In [13]:
sample_sizes = range(10, 1000, 10)
ps = [stats.ttest_ind(stats.norm.rvs(loc=0,scale=1,size=sample_size),
stats.norm.rvs(loc=0.3,scale=1,size=sample_size))[1] for sample_size in sample_sizes]
plt.semilogx(ps, sample_sizes)
plt.xlabel("p-value")
plt.ylabel("sample size")
Out[13]:
Exercise: plot the distributions of the Svansota and Velvet crops and perform a two-sample t-test. Now include every sample from each crop type twice in the test. What will happen to the p-value?
In [14]:
svansota_yield = immer_data.Y1[immer_data.Var == "S"]
velvet_yield = immer_data.Y1[immer_data.Var == "V"]
In [15]:
sns.distplot(svansota_yield, hist=False, rug=True)
sns.distplot(velvet_yield, hist=False, rug=True)
Out[15]:
In [16]:
stats.ttest_ind(svansota_yield, velvet_yield)
Out[16]:
In [17]:
svansota_yield_doubled = list(svansota_yield) + list(svansota_yield)
velvet_yield_doubled = list(velvet_yield) + list(velvet_yield)
stats.ttest_ind(svansota_yield_doubled, velvet_yield_doubled)
Out[17]:
In [18]:
year1_yield = immer_data.Y1
year2_yield = immer_data.Y2
In [19]:
sns.distplot(year1_yield, rug=True, hist=False, label="1931")
sns.distplot(year2_yield, rug=True, hist=False, label="1932")
plt.xlabel("crop")
Out[19]:
In [20]:
sns.violinplot(inner=None, data=immer_data)
for y1, y2 in zip(year1_yield, year2_yield):
    plt.plot([0, 1], [y1, y2], color="b")
In [21]:
y_diff = year1_yield-year2_yield
sns.distplot(y_diff, rug=True, hist=False)
Out[21]:
Let's calculate a one-sample t-test checking whether the mean of the differences is significantly different from zero.
In [22]:
stats.ttest_1samp(y_diff, 0)
Out[22]:
This is equivalent to the so-called paired two-sample t-test.
In [23]:
stats.ttest_rel(year1_yield,year2_yield)
Out[23]:
What would happen if we ignored the pairing and treated the samples as independent?
In [24]:
stats.ttest_ind(year1_yield,year2_yield)
Out[24]:
The result is similar, but would that always be the case? Let's generate a lot of data and compare the independent two-sample and paired tests.
In [25]:
t_diffs = []
rvs = []
for i in range(1000):
    rvs1 = stats.norm.rvs(loc=0, scale=1, size=15)
    rvs2 = stats.norm.rvs(loc=0.2, scale=1, size=15)
    t_rel, _ = stats.ttest_rel(rvs1, rvs2)
    t_ind, _ = stats.ttest_ind(rvs1, rvs2)
    t_diff = t_rel - t_ind
    t_diffs.append(t_diff)
    rvs.append((rvs1, rvs2))
In [26]:
sns.distplot(t_diffs)
Out[26]:
Note that the distribution is left skewed. This means that the independent-samples t-test more often has higher values than the paired version.
Let's pick an extreme case:
In [27]:
rv1, rv2 = rvs[np.array(t_diffs).argmax()]
In [28]:
sns.violinplot(data=pd.DataFrame(np.array([rv1, rv2]).T))
for y1, y2 in zip(rv1, rv2):
    plt.plot([0, 1], [y1, y2], color="b")
In [29]:
stats.ttest_rel(rv1, rv2)
Out[29]:
In [30]:
stats.ttest_ind(rv1, rv2)
Out[30]:
Using the wrong test can make the result more or less significant than it should be. However, the sign of the statistic can never change.
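We can verify the claim about the sign on the pairs we just simulated: both statistics have the same numerator (the difference of the means) and only differ in the denominator, so their signs always agree. A quick check:
In [ ]:
# the paired and independent statistics differ only in the denominator,
# so their signs should agree for every simulated pair
signs_agree = [np.sign(stats.ttest_rel(r1, r2)[0]) == np.sign(stats.ttest_ind(r1, r2)[0])
               for r1, r2 in rvs]
print(all(signs_agree))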
Are the yields of the barley crops in 1931 correlated with the yields in 1932?
In [31]:
plt.scatter(x=year1_yield, y=year2_yield)
Out[31]:
We can try to fit a line to this plot. A line is defined by the following equation: $$ y = ax + b $$ where $a$ is the slope and $b$ is the intercept of the line.
In [32]:
slope, intercept, r_value, p_value, std_err = stats.linregress(year1_yield, year2_yield)
In [33]:
plt.scatter(x=year1_yield, y=year2_yield)
plt.plot(np.arange(60,200),intercept+slope*np.arange(60,200), color="r")
plt.xlim([60, 200])
plt.xlabel("1931 crop yield")
plt.ylabel("1932 crop yield")
Out[33]:
From the equation we can see that the intercept is the predicted crop yield in 1932 ($y$) when the yield in 1931 ($x$) is zero: $$ y = a \cdot 0 + b = b $$
In [34]:
plt.scatter(x=year1_yield, y=year2_yield)
plt.plot(np.arange(0,200),intercept+slope*np.arange(0,200), color="r")
plt.xlim([0, 200])
plt.xlabel("1931 crop yield")
plt.ylabel("1932 crop yield")
print(intercept)
How is this line fitted? The line is a prediction: it tells you what the crop yield in 1932 should be, based on the yield in 1931. But it's not perfect. Let's draw how far the predictions are from reality.
In [35]:
plt.scatter(x=year1_yield, y=year2_yield)
plt.plot(np.arange(60,200),intercept+slope*np.arange(60,200), color="r")
plt.xlim([60, 200])
plt.xlabel("1931 crop yield")
plt.ylabel("1932 crop yield")
plt.vlines(year1_yield, year2_yield,intercept+slope*year1_yield)
Out[35]:
We can add up the errors:
In [36]:
print(np.abs(year2_yield - (intercept + slope * year1_yield)).sum())
Different lines will have different errors:
In [37]:
slope2 = slope + 0.15
intercept2 = intercept - 10
plt.scatter(x=year1_yield, y=year2_yield)
plt.plot(np.arange(60,200),intercept2+slope2*np.arange(60,200), color="r")
plt.xlim([60, 200])
plt.xlabel("1931 crop yield")
plt.ylabel("1932 crop yield")
plt.vlines(year1_yield, year2_yield,intercept2+slope2*year1_yield)
print(np.abs(year2_yield - (intercept2 + slope2 * year1_yield)).sum())
The fitting procedure minimizes those errors, trying to find the line that best describes the data.
In practice it is not the sum of absolute errors that is minimized, but the sum of squared errors:
In [38]:
print(((year2_yield - (intercept2 + slope2 * year1_yield)) ** 2).sum())
In [39]:
((year2_yield - (intercept2 + slope2 * year1_yield)) ** 2).sum()
Out[39]:
In practice this means that big errors have a much bigger impact on the fit than small errors.
Why a square and not an absolute value? This comes from the fact that the linear regression model assumes Gaussian random noise: $$ y = ax + b + \epsilon $$ where $\epsilon \sim N(0,1)$ (a Gaussian with mean zero and variance 1).
Note that fitting a linear model has a closed-form solution. This means that we can calculate the optimal slope and intercept parameters without having to try multiple different lines.
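As a sketch of that closed-form solution for a single predictor: the optimal slope is the covariance of $x$ and $y$ divided by the variance of $x$, and the intercept follows from the means. It should reproduce what stats.linregress gave us:
In [ ]:
# closed-form least-squares estimates for a single predictor:
# slope = cov(x, y) / var(x), intercept = mean(y) - slope * mean(x)
slope_cf = np.cov(year1_yield, year2_yield)[0, 1] / year1_yield.var()
intercept_cf = year2_yield.mean() - slope_cf * year1_yield.mean()
print(slope_cf, intercept_cf)
print(slope, intercept)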
To assess how well we can predict the values of one variable from the values of another, we can use the correlation coefficient.
In [40]:
r_value
Out[40]:
Correlation is nothing more than the covariance of the two variables normalized by their standard deviations. It varies from -1 (anticorrelated), through 0 (uncorrelated), to 1 (fully correlated).
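We can compute it by hand from that definition and compare it with the r value reported by linregress (a quick sketch):
In [ ]:
# correlation "by hand": covariance divided by the product of the standard deviations
cov_xy = np.cov(year1_yield, year2_yield)[0, 1]
r_manual = cov_xy / (year1_yield.std() * year2_yield.std())
print(r_manual, r_value)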
In [41]:
np.corrcoef(x=year1_yield, y=year2_yield)
Out[41]:
The correlation coefficient has another interesting property: when squared, it describes the percentage of variance in variable Y that is explained by variable X (and vice versa).
In [42]:
r_value**2*100
Out[42]:
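This percentage can be checked against the fitted line from before: $r^2$ should equal one minus the ratio of the residual variance to the total variance of the 1932 yield (a quick sketch):
In [ ]:
# r squared as the fraction of variance explained by the regression line
predicted_y2 = intercept + slope * year1_yield
residuals = year2_yield - predicted_y2
print(1 - residuals.var() / year2_yield.var())
print(r_value ** 2)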
There is also a p-value. It is directly related to the r_value and the number of samples.
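Specifically, under the null hypothesis $r\sqrt{\frac{n-2}{1-r^2}}$ follows a t distribution with $n-2$ degrees of freedom, so the p-value can be recovered from the r value and the sample size alone (a sketch; it should match the p_value returned by linregress):
In [ ]:
# p-value of the correlation computed from r and the number of samples
n_samples = len(year1_yield)
t_from_r = r_value * np.sqrt((n_samples - 2) / (1.0 - r_value ** 2))
p_from_r = 2 * stats.t.sf(abs(t_from_r), n_samples - 2)
print(p_from_r, p_value)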
Exercise: There is clearly an outlier in the distribution (the data point in the upper right corner of the scatterplot). What will happen to the correlation if you remove it?
In [103]:
year1_yield_no_outlier = year1_yield[year1_yield!=year1_yield.max()]
year2_yield_no_outlier = year2_yield[year1_yield!=year1_yield.max()]
slope, intercept, r_value, p_value, std_err = stats.linregress(year1_yield_no_outlier, year2_yield_no_outlier)
plt.scatter(x=year1_yield_no_outlier, y=year2_yield_no_outlier)
plt.plot(np.arange(60,200),intercept+slope*np.arange(60,200), color="b", label="without outlier")
plt.xlim([60, 200])
plt.xlabel("1931 crop yield")
plt.ylabel("1932 crop yield")
plt.vlines(year1_yield_no_outlier, year2_yield_no_outlier,intercept+slope*year1_yield_no_outlier)
slope, intercept, r_value, p_value, std_err = stats.linregress(year1_yield, year2_yield)
plt.plot(np.arange(60,200),intercept+slope*np.arange(60,200), color="r", label="with outlier")
plt.legend()
Out[103]:
In [43]:
prestige = sm.datasets.get_rdataset("Duncan", "car", cache=True).data
In [44]:
ds = sm.datasets.get_rdataset("Duncan", "car", cache=True)
print(ds.__doc__)
In [45]:
prestige.head()
Out[45]:
How well can we predict prestige using education and income?
In [46]:
prestige_model = ols("prestige ~ income + education", data=prestige).fit()
prestige_model.summary()
Out[46]:
Multiple linear regression (or the General Linear Model, GLM) is based on a matrix of variables describing the data: $$ Y = X\beta + \epsilon $$ where $\beta$ are the coefficients and $X$ is the design matrix. In our case the design matrix looks like this:
In [47]:
plt.matshow(prestige_model.model.exog)
plt.xticks(range(3), prestige_model.model.exog_names, rotation=-25)
plt.colorbar()
Out[47]:
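The coefficients reported above can also be obtained directly from this design matrix with plain least squares; statsmodels does essentially this, plus the inference. A minimal sketch using numpy:
In [ ]:
# solve Y = X * beta in the least-squares sense straight from the design matrix
X = prestige_model.model.exog
Y = prestige_model.model.endog
beta = np.linalg.lstsq(X, Y)[0]
print(beta)
print(prestige_model.params)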
Note that on top of our two regressors of interest ("income" and "education") we also have an intercept, just like in regular linear regression with one variable.
In fact we can recreate the same results from our previous example using GLM.
In [48]:
crop_model = ols("Y2 ~ Y1", data=immer_ds.data).fit()
crop_model.summary()
Out[48]:
In [49]:
plt.matshow(crop_model.model.exog)
plt.xticks(range(2), crop_model.model.exog_names, rotation=-25)
plt.colorbar()
Out[49]:
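Indeed, the coefficients of this one-regressor model reproduce the slope and intercept we obtained earlier with stats.linregress:
In [ ]:
# compare the GLM coefficients with the earlier linregress fit of Y2 on Y1
print(crop_model.params["Intercept"], intercept)
print(crop_model.params["Y1"], slope)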
You can also express a two-sample t-test as a GLM.
In [50]:
immer_subset = immer_data[np.logical_or(immer_data.Var == "M", immer_data.Var == "T")]
In [51]:
subset_model = ols("Y1 ~ Var", data=immer_subset).fit()
subset_model.summary()
Out[51]:
In [52]:
plt.matshow(subset_model.model.exog)
plt.xticks(range(2), subset_model.model.exog_names, rotation=-25)
plt.colorbar()
Out[52]:
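As a sanity check, the t-value of the group coefficient in this two-group model should match, up to sign, the two-sample t-test we ran on Manchuria and Trebi at the beginning (a sketch, assuming the coefficient is named Var[T.T] as shown in the design matrix above):
In [ ]:
# the dummy-coded group effect reproduces the two-sample t-test (up to sign)
print(subset_model.tvalues["Var[T.T]"])
print(stats.ttest_ind(manchuria_yield, trebi_yield))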
This model can also be used for more than two "groups".
In [53]:
full_model = ols("Y1 ~ Var", data=immer_ds.data).fit()
full_model.summary()
Out[53]:
In [54]:
plt.matshow(full_model.model.exog)
plt.xticks(range(len(full_model.model.exog_names)), full_model.model.exog_names, rotation=-25)
plt.colorbar()
Out[54]:
We can now explicitly test hypotheses, such as whether there is a difference between Peatland and Trebi (Manchuria, the reference level, is represented by the intercept).
In [55]:
full_model.f_test("Var[T.P] = Var[T.T]")
Out[55]:
This is equivalent to specifying the contrast vector explicitly, with the entries ordered as the columns of the design matrix:
In [56]:
full_model.f_test([0,1,0,-1,0])
Out[56]:
There are also other ways of coding categorical variables in design matrices
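For example, patsy (which builds the design matrices behind the formula interface) also offers sum-to-zero coding, where the intercept becomes the mean of the group means and each coefficient is a deviation from it, instead of the default treatment coding used above. A minimal sketch:
In [ ]:
# sum-to-zero (deviation) coding instead of the default treatment coding
sum_coded_model = ols("Y1 ~ C(Var, Sum)", data=immer_ds.data).fit()
print(sum_coded_model.model.exog_names)
plt.matshow(sum_coded_model.model.exog)
plt.colorbar()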
In [57]:
data = sm.datasets.longley.load_pandas().data
data.head()
Out[57]:
In [58]:
print(sm.datasets.longley.NOTE)
First we fit a model with one variable
In [59]:
model_1 = ols("TOTEMP ~ POP", data=data).fit()
model_1.summary()
Out[59]:
Lots of variance explained and a high t-value!
Let's try another variable
In [60]:
model_2 = ols("TOTEMP ~ GNP", data=data).fit()
model_2.summary()
Out[60]:
Even more variance explained and an even higher t-value!
Let's put both variables in the model
In [61]:
model_1_plus_2 = ols("TOTEMP ~ POP + GNP", data=data).fit()
model_1_plus_2.summary()
Out[61]:
The t-values decreased drastically! The effect of population (POP) changed direction!
In [62]:
sns.jointplot("GNP", "POP", data=data)
Out[62]:
This is because the two variables are highly correlated. The t-values and coefficients reported by a GLM correspond only to the unique variance of each regressor; variance shared by the independent variables is not assigned to either of them. Note, however, that the overall model performance ($R^2$) increased.
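To put a number on "highly correlated":
In [ ]:
# how strongly are the two regressors correlated with each other?
print(np.corrcoef(data["GNP"], data["POP"])[0, 1])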
In [63]:
sns.jointplot("TOTEMP", "POP", data=data)
Out[63]:
In [64]:
slope, intercept, r_value, p_value, std_err = stats.linregress(data["GNP"], data["POP"])
In [65]:
predicted = intercept+slope*data["GNP"]
data["POPwoGNP"] = data["POP"] - predicted
In [66]:
sns.jointplot("TOTEMP", "POPwoGNP", data=data)
Out[66]:
Exercise: There are many different models one could fit. Write a loop that iterates over a set of models and try to find the one that shows the strongest relationship between unemployment and the size of the armed forces. What can you learn from this experiment in the context of what we said about p-hacking?
In [81]:
regressors = set(data.columns) - set(["ARMED", "UNEMP"])
regressors
Out[81]:
In [86]:
import itertools
list(itertools.combinations(regressors, 4))
Out[86]:
In [71]:
import itertools
models = []
t_values = []
p_values = []
for i in range(len(regressors)):
    for regressor_subset in itertools.combinations(regressors, i):
        ind_variables = ["ARMED", ] + list(regressor_subset)
        models.append(ind_variables)
        model = ols("UNEMP ~ " + " + ".join(ind_variables), data=data).fit()
        t_values.append(model.tvalues["ARMED"])
        p_values.append(model.pvalues["ARMED"])
In [88]:
t_values
Out[88]:
In [89]:
df = pd.DataFrame({"covariates": models,
"p_values": p_values,
"t_values": t_values})
In [90]:
df.head()
Out[90]:
In [92]:
df.sort_values("t_values")
Out[92]:
In [ ]: