# Working with data 2017. Class 5
## Contact
Javier Garcia-Bernardo
garcia@uva.nl
## 0. Structure
## 1. Assumptions of the t-test and regression
- Normality
- Independent and identically distributed (i.i.d.)
- Equal variance
- (for linear regression) Uncorrelated residuals
## 2. Comparing groups
- Compare one group vs one value
- Compare two groups
- Compare two paired groups
## 3. Multiple groups
- ANOVA
- Multiple comparisons (Tukey correction)
## 4. Regressions
- Linear regression
- Logistic regression
- Machine learning
In [1]:
#Install something we'll need
!pip install --user scikits.bootstrap
#Requires restarting the kernel
In [91]:
##Setup code to run at the beginning of the notebook
##Don't worry about this cell
#Show the plots inside the notebook
%matplotlib inline
#Display images from disk or from a URL
from IPython.display import Image
#Make the notebook wider
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
import seaborn as sns
import pylab as plt
import pandas as pd
import numpy as np
import scipy
import scipy.stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import scikits.bootstrap as bootstrap
In [4]:
Image(url="https://kanbanize.com/blog/wp-content/uploads/2014/07/Standard_deviation_diagram.png", width=500)
Out[4]:
How to test it
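One way is a Q-Q plot (defined in the next cell); a formal complement is the Shapiro-Wilk test in scipy. A minimal sketch:
In [ ]:
#Shapiro-Wilk normality test: a small p-value suggests the sample is not normal
x = np.random.randn(100)
print(scipy.stats.shapiro(x))        #normal sample: expect a large p-value
y = np.random.exponential(2, 100)
print(scipy.stats.shapiro(y))        #exponential sample: expect a small p-value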
In [14]:
def qq_plot(x):
    import scipy.stats
    (osm, osr), (slope, intercept, r) = scipy.stats.probplot(x, dist='norm', plot=None)
    plt.plot(osm, osr, '.', osm, slope*osm + intercept)
    plt.xlabel('Theoretical quantiles', fontsize=14)
    plt.ylabel('Observed quantiles', fontsize=14)
In [55]:
#Normal data: the points fall on the diagonal
x = np.random.randn(100)
plt.subplot(2,2,1)
qq_plot(x)
#Exponential data: the points bend away from the diagonal
x = np.random.exponential(2,100)
plt.subplot(2,2,2)
qq_plot(x)
plt.tight_layout()
plt.show()
In [33]:
Image(url="http://goodsciencebadscience.nl/wp-content/uploads/2012/09/variances1.gif",width=500)
Out[33]:
How to test it
In [12]:
#random normal data (mean 0 std 1)
x = np.random.randn(100)
#random normal data (mean 0 std 2)
y = np.random.randn(100)*2
#Test
print(scipy.stats.levene(x,y))
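Levene's test is robust to non-normality. If the data really are normal, Bartlett's test is a more powerful alternative; a minimal sketch:
In [ ]:
#Bartlett's test also checks equal variances, but assumes normal data
print(scipy.stats.bartlett(x,y))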
In [34]:
print("These are the residuals")
Image(url="https://upload.wikimedia.org/wikipedia/commons/e/ed/Residuals_for_Linear_Regression_Fit.png",width=500)
Out[34]:
In [8]:
Image(url="https://i.stack.imgur.com/RU17l.png")
Out[8]:
In [15]:
from scipy.stats import norm
#Our sample, normally distributed with mean 2 and std 1
x = np.random.randn(100)+2
#histogram
plt.subplot(2,2,1)
sns.distplot(x,kde=False,fit=norm)
plt.subplot(2,2,2)
qq_plot(x)
plt.show()
print("-"*20)
print("Test for mean of group different than zero")
scipy.stats.ttest_1samp(x,popmean=0)
Out[15]:
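For intuition, the one-sample t statistic can be reproduced by hand: t = (mean - popmean) / (s / sqrt(n)), with s the sample standard deviation. A minimal sketch:
In [ ]:
#What ttest_1samp computes: t = (mean - popmean) / (std / sqrt(n))
n = len(x)
t = (np.mean(x) - 0) / (np.std(x, ddof=1) / np.sqrt(n))
#Two-sided p-value from the t distribution with n-1 degrees of freedom
p = 2 * scipy.stats.t.sf(abs(t), df=n-1)
print(t, p)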
In [16]:
from scipy.stats import norm
#Our sample, normally distributed with mean 0.1 and std 1
x = np.random.randn(100)+0.1
#Our other sample, normally distributed with mean 0.5 and std 1
y = np.random.randn(100)+0.5
#Equal variance?
#We check this before running the t-test:
#if Levene's test is significant, the variances are different
print("Test for equal variance")
print(scipy.stats.levene(x,y))
In [17]:
#histogram
plt.subplot(2,2,1)
sns.distplot(x,kde=True)
sns.distplot(y,kde=True)
plt.subplot(2,2,2)
qq_plot(x)
qq_plot(y)
plt.tight_layout()
plt.show()
print("-"*20)
print("Test for different means")
scipy.stats.ttest_ind(x,y)
Out[17]:
In [18]:
from scipy.stats import norm
#Our sample, normally distributed with mean 0.1 and std 1
x = np.random.randn(100)+0.1
#Our other sample, similar to x but adding 0.05 and some random noise
y = x+np.random.randn(100)/10 + 0.05
#Equal variance?
print("Test for equal variance")
print(scipy.stats.levene(x,y))
In [19]:
#histogram
plt.subplot(2,2,1)
sns.distplot(x,kde=True)
sns.distplot(y,kde=True)
plt.subplot(2,2,2)
qq_plot(x)
qq_plot(y)
plt.tight_layout()
plt.show()
In [20]:
print("WRONG: What would happen if we don't use paired t-test")
print(scipy.stats.ttest_ind(x,y))
print("-"*20)
print("Test for different means")
scipy.stats.ttest_rel(x,y)
Out[20]:
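A paired t-test is equivalent to a one-sample t-test on the pairwise differences, which is a quick way to check the result:
In [ ]:
#Equivalent formulation: one-sample t-test on the differences
print(scipy.stats.ttest_rel(x, y))
print(scipy.stats.ttest_1samp(x - y, popmean=0))   #same statistic and p-value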
In [94]:
scipy.stats.ttest_rel?
In [23]:
#Our sample, normally distributed with mean 0.1 and std 1
x = np.random.randn(100)+0.1
#Our other sample, normally distributed with mean 0.2 and std 2 (only 50 points)
y = np.random.randn(50)*2+0.2
#Equal variance?
print("Test for equal variance")
print(scipy.stats.levene(x,y))
In [31]:
#histogram
plt.subplot(2,2,1)
sns.distplot(x,kde=True)
sns.distplot(y,kde=True)
plt.subplot(2,2,2)
qq_plot(x)
qq_plot(y)
plt.tight_layout()
plt.show()
print("WRONG: What happens if we try the test assumming equal variance")
print(scipy.stats.ttest_ind(x,y))
print("-"*20)
print("Test for different means with different variance")
scipy.stats.ttest_ind(x,y,equal_var=False)
Out[31]:
In [39]:
#Our sample, exponentially distributed
x = np.random.exponential(2,100)
#Our other sample, exponentially distributed + 0.1
y = np.random.exponential(2,100)+0.1
#Equal variance?
print("Test for equal variance")
print(scipy.stats.levene(x,y))
In [40]:
#histogram
plt.subplot(2,2,1)
sns.distplot(x,kde=True)
sns.distplot(y,kde=True)
plt.subplot(2,2,2)
qq_plot(x)
qq_plot(y)
plt.show()
In [42]:
print("WRONG: What happens if we try t-test")
print(scipy.stats.ttest_ind(x,y))
print("-"*20)
print("Test for different means with no normal distributions, equal variance")
scipy.stats.mannwhitneyu(x,y)
Out[42]:
In [43]:
#Our sample, uniformly distributed on [0, 1)
x = np.random.random(100)
#Our other sample, x plus 0.05 and some random noise (paired with x)
y = x+np.random.randn(100)/10 + 0.05
#Equal variance?
print("Test for equal variance")
print(scipy.stats.levene(x,y))
In [44]:
#histogram
plt.subplot(2,2,1)
sns.distplot(x,kde=True)
sns.distplot(y,kde=True)
plt.subplot(2,2,2)
qq_plot(x)
qq_plot(y)
plt.show()
In [45]:
print("WRONG: What happens if we try t-test")
print(scipy.stats.ttest_rel(x,y))
print("-"*20)
print("Test for different means with no normal distributions and equal variance")
print(scipy.stats.wilcoxon(x,y))
Analysis of variance (ANOVA) is a collection of statistical models used to analyze differences among group means, based on the variation within and between groups.
The Kruskal–Wallis one-way analysis of variance by ranks (named after William Kruskal and W. Allen Wallis) is a non-parametric method for testing whether samples originate from the same distribution.
In [46]:
#Our sample, normally distributed with mean 0.1 and std 1
x = np.random.randn(100)+0.1
#Our other sample, normally distributed with mean 0.2 and std 1
y = np.random.randn(100)+0.2
#Our other sample, normally distributed with mean 0.3 and std 1
z = np.random.randn(100)+0.3
#Equal variance?
print("Test for equal variance")
print(scipy.stats.levene(x,y,z))
In [47]:
#histogram
plt.subplot(2,2,1)
sns.distplot(x,kde=True)
sns.distplot(y,kde=True)
sns.distplot(z,kde=True)
plt.subplot(2,2,2)
qq_plot(x)
qq_plot(y)
qq_plot(z)
plt.show()
print("-"*20)
print("ONE-WAY ANOVA")
print(scipy.stats.f_oneway(x, y, z) )
print("-"*20)
print("ONE-WAY KRUSKAL WALLIS") #use if not-normal and equal variance
print(scipy.stats.mstats.kruskalwallis(x,y,z))
In [64]:
#Let's compare two identical samples and see how often we get false positives
for i in range(100):
    x = np.random.randn(100)
    y = np.random.randn(100)
    stat, pvalue = scipy.stats.ttest_ind(x,y)
    if pvalue < 0.05:
        print(i, pvalue)
In [65]:
#Our sample, normally distributed with mean 0.1 and std 1
x = np.random.randn(100)+0.1
#Our other sample, normally distributed with mean 0.2 and std 1
y = np.random.randn(100)+0.2
#Our other sample, normally distributed with mean 0.3 and std 1
z = np.random.randn(100)+0.3
#Our other sample, normally distributed with mean 0.5 and std 1
w = np.random.randn(100)+0.5
print("New threshold (instead of 0.05) = ",0.05/6)
#Compare x and y
print("x and y: ", scipy.stats.ttest_ind(x,y))
#Compare x and z
print("x and z: ", scipy.stats.ttest_ind(x,z))
#Compare x and w
print("x and w: ", scipy.stats.ttest_ind(x,w))
#Compare y and z
print("y and z: ", scipy.stats.ttest_ind(y,z))
#Compare y and w
print("y and w: ", scipy.stats.ttest_ind(y,w))
#Compare z and w
print("z and w: ", scipy.stats.ttest_ind(z,w))
In [144]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd
In [68]:
df = pd.read_csv("data/tukey_example.csv")
df.sample(10)
Out[68]:
In [69]:
res2 = pairwise_tukeyhsd(df["productivity"],df["group"])
print(res2)
In [70]:
res2.plot_simultaneous(comparison_name=None,xlabel='diffs',ylabel='Group')
plt.show()
They tell us a lot about the mechanism generating our values:
- Normal distribution: sums of many small independent effects (e.g. measurement error)
- Power-law distribution: rich-get-richer mechanisms, where a few very large values dominate
- Binomial distribution: number of successes out of a number of trials, each trial independent
In [73]:
from scipy.stats import lognorm,norm,expon,powerlaw
#A log-normal sample; below we fit four candidate distributions to it
x = np.random.lognormal(1,0.8,3000)
plt.subplot(2,2,1)
sns.distplot(x,fit=norm,kde=False)
plt.title("norm")
plt.subplot(2,2,2)
sns.distplot(x,fit=lognorm,kde=False)
plt.title("lognorm")
plt.subplot(2,2,3)
sns.distplot(x,fit=expon,kde=False)
plt.title("expon")
plt.subplot(2,2,4)
sns.distplot(x,fit=powerlaw,kde=False)
plt.title("powerlaw")
plt.tight_layout()
plt.show()
In [71]:
import scikits.bootstrap as bootstrap
In [73]:
#Bootstrap of mean and std
x = np.random.randn(100)*3 #this could be one of our columns
CIs = bootstrap.ci(x, statfunction=np.mean,n_samples=100000)
print('CI for mean with bootstrapping = ', CIs)
CIs = bootstrap.ci(x, statfunction=np.std,n_samples=100000)
print('CI for std with bootstrapping = ', CIs)
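Under the hood the idea is simple: resample the data with replacement many times, compute the statistic each time, and take percentiles of the results. A hand-rolled percentile version (scikits.bootstrap defaults to the more refined BCa method, so the numbers will differ slightly):
In [ ]:
#Hand-rolled percentile bootstrap for the mean
means = [np.mean(np.random.choice(x, size=len(x), replace=True)) for _ in range(10000)]
print(np.percentile(means, [2.5, 97.5]))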
In [75]:
x = np.random.randn(100)*3
np.mean(x),np.std(x)
Out[75]:
In [76]:
CIs = bootstrap.ci(x, statfunction=np.mean,n_samples=100000)
print('CI for mean with bootstrapping = ', CIs)
In [78]:
scipy.stats.ttest_1samp(x,popmean=0)
Out[78]:
In [79]:
#Bootstrap of p-value
def return_pvalue(x):
    stat, pvalue = scipy.stats.ttest_1samp(x, 0)
    return pvalue
CIs = bootstrap.ci(x, statfunction=return_pvalue, n_samples=10000)
print('CI for p-value with bootstrapping = ', CIs)
In [109]:
df = pd.read_csv("data/big3_position.csv",sep="\t")
df.head()
Out[109]:
In [81]:
industrial = df.loc[df["TypeEnt"]=="Industrial company"]
financial = df.loc[df["TypeEnt"]=="Financial company"]
In [84]:
#Employees (like most size variables) is lognormally distributed -> taking the log converts it to a normal distribution, so we can run our tests happily
i = np.log(industrial["Employees"].dropna())
f = np.log(financial["Employees"].dropna())
from scipy.stats import lognorm,norm
plt.figure(figsize=(10,3))
plt.subplot(1,2,1)
sns.distplot(df["Employees"].dropna(),fit=lognorm,kde=False)
plt.subplot(1,2,2)
sns.distplot(np.log(df["Employees"].dropna()),fit=norm,kde=False)
Out[84]:
In [89]:
i.head()
Out[89]:
In [90]:
f.head()
Out[90]:
In [92]:
#Check if our distributions have the same variance
scipy.stats.levene(i,f)
Out[92]:
In [97]:
sns.distplot(i)
sns.distplot(f)
Out[97]:
In [95]:
#Q-Q plots for i and f
qq_plot(i)
qq_plot(f)
In [98]:
#Run the correct statistical test to know if the
#number of employees of financial and industrial companies is different
scipy.stats.mannwhitneyu(i,f)
Out[98]:
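Since the log-transformed employee counts look roughly normal, a t-test is also defensible here; Welch's version (equal_var=False) is the safer choice if Levene's test rejected equal variances:
In [ ]:
#Alternative on the log scale: Welch's t-test
print(scipy.stats.ttest_ind(i, f, equal_var=False))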
In [123]:
df = pd.read_csv("data/big3_position.csv",sep="\t")
df.head()
Out[123]:
In [124]:
#Create a new column that is the logarithm of "MarketCap", call it "logMarketCap"
df["logMarketCap"] = np.log(df["MarketCap"])
df.head()
Out[124]:
In [112]:
df.describe()
Out[112]:
In [125]:
#Replace infinite values (log(0) = -inf) with NaN
df = df.replace([np.inf, -np.inf], np.nan)
#Drop rows where "logMarketCap" is NaN
df = df.dropna(subset=["logMarketCap"])
In [114]:
df.describe()
Out[114]:
In [115]:
#Find the distribution of "MarketCap", fitting the right distribution (lognormal)
from scipy.stats import norm,lognorm,expon
sns.distplot(df["MarketCap"],fit=lognorm,kde=False)
Out[115]:
In [116]:
#Find the distribution of "logMarketCap", fitting the right distribution (normal)
from scipy.stats import norm,lognorm,expon
sns.distplot(df["logMarketCap"],fit=norm,kde=False)
Out[116]:
In [126]:
#Keep only three types of entities
df = df.loc[df["TypeEnt"].isin(["Financial company","Industrial company","Bank"])]
In [118]:
#Plot distributions for each type
plt.figure(figsize=(6,4))
for typeent,group in df.groupby("TypeEnt"):
    sns.distplot(group["logMarketCap"],kde=False,norm_hist=True,label=typeent)
plt.legend()
In [127]:
#Run ANOVA
bank = df.loc[df["TypeEnt"] == "Bank"]
bank_values = bank["logMarketCap"]
ind = df.loc[df["TypeEnt"] == "Industrial company"]
ind_values = ind["logMarketCap"]
fin = df.loc[df["TypeEnt"] == "Financial company"]
fin_values = fin["logMarketCap"]
scipy.stats.f_oneway(bank_values,ind_values,fin_values)
Out[127]:
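The three filters can be written more compactly with groupby; a sketch that gives the same ANOVA:
In [ ]:
#Equivalent, more compact: one list of values per entity type
groups = [group["logMarketCap"].values for _, group in df.groupby("TypeEnt")]
print(scipy.stats.f_oneway(*groups))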
In [128]:
#Run Tukey test
res2 = pairwise_tukeyhsd(df["logMarketCap"],df["TypeEnt"])
print(res2)
In [129]:
#Plot tukey test
res2.plot_simultaneous(comparison_name=None,xlabel='diffs',ylabel='Group')
plt.show()