Hypothesis testing



In [1]:

    
# importing required modules
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

# runing the functions script
%run stats_func.py



In [2]:

    
# loading the iris dataset
df = pd.read_csv('iris.csv')
df.head()









    Out[2]:






  
    
      
      Id
      SepalLengthCm
      SepalWidthCm
      PetalLengthCm
      PetalWidthCm
      Species
    
  
  
    
      0
      1
      5.1
      3.5
      1.4
      0.2
      Iris-setosa
    
    
      1
      2
      4.9
      3.0
      1.4
      0.2
      Iris-setosa
    
    
      2
      3
      4.7
      3.2
      1.3
      0.2
      Iris-setosa
    
    
      3
      4
      4.6
      3.1
      1.5
      0.2
      Iris-setosa
    
    
      4
      5
      5.0
      3.6
      1.4
      0.2
      Iris-setosa



In [3]:

    
# for further analysis sepal length and sepal width will be used
sepalLength = df['SepalLengthCm'].values
sepalWidth = df['SepalWidthCm'].values

# the plot above is for all iris spieces, which can be listed via:
df['Species'].unique()









    Out[3]:





array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)



In [4]:

    
# the remaining analysis will focus only on Iris-setosa
sepalLengthSetosa = np.array(df['SepalLengthCm'][df['Species'] == 'Iris-setosa'])
sepalWidthSetosa = np.array(df['SepalWidthCm'][df['Species'] == 'Iris-setosa'])
sepalLengthVirginica = np.array(df['SepalLengthCm'][df['Species'] == 'Iris-virginica'])

Samples permutation

Special function is defined to mix the data (that should be very similar):

def permutation_sample(data1, data2):
    """Generate a permutation sample from two data sets."""

    # Concatenate the data sets: data
    data = np.concatenate((data1, data2))

    # Permute the concatenated array: permuted_data
    permuted_data = np.random.permutation(data)

    # Split the permuted array into two: perm_sample_1, perm_sample_2
    perm_sample_1 = permuted_data[:len(data1)]
    perm_sample_2 = permuted_data[len(data1):]

    return perm_sample_1, perm_sample_2



In [5]:

    
(np.mean(sepalLengthSetosa), np.mean(sepalLengthVirginica))









    Out[5]:





(5.0060000000000002, 6.5879999999999983)



In [6]:

    
for i in range(50):
    # Generate permutation samples
    perm_sample_1, perm_sample_2 = permutation_sample(sepalLengthSetosa, sepalLengthVirginica)


    # Compute ECDFs
    x_1, y_1 = ecdf(perm_sample_1)
    x_2, y_2 = ecdf(perm_sample_2)

    # Plot ECDFs of permutation sample
    _ = plt.plot(x_1, y_1, marker='.', linestyle='none',
                 color='red', alpha=0.02)
    _ = plt.plot(x_2, y_2, marker='.', linestyle='none',
                 color='blue', alpha=0.02)

# Create and plot ECDFs from original data
x_1, y_1 = ecdf(sepalLengthSetosa)
x_2, y_2 = ecdf(sepalLengthVirginica)
_ = plt.plot(x_1, y_1, marker='.', linestyle='none', color='red', label='Sepal Length - setosa')
_ = plt.plot(x_2, y_2, marker='.', linestyle='none', color='blue', label='Sepal Length - virginica')

# Label axes, set margin, and show plot
plt.margins(0.02)
_ = plt.xlabel('sepal lengths and widths [cm]')
_ = plt.ylabel('ECDF')
plt.legend();

Comparison of means

for this purpose a new function is defined - draw permutation replicates, that uses permutation samples function inside:

def draw_perm_reps(data_1, data_2, func, size=1):
    """Generate multiple permutation replicates."""

    # Initialize array of replicates: perm_replicates
    perm_replicates = np.empty(size)

    for i in range(size):
        # Generate permutation sample
        perm_sample_1, perm_sample_2 = permutation_sample(data_1, data_2)

        # Compute the test statistic
        perm_replicates[i] = func(perm_sample_1, perm_sample_2)

    return perm_replicates

For this example difference of means is looked at



In [7]:

    
# definition of the difference of means
def diff_of_means(data_1, data_2):
    """Difference in means of two arrays."""

    # The difference of means of data_1, data_2: diff
    diff = np.mean(data_1) - np.mean(data_2)

    return diff



In [8]:

    
# empirical difference of means
empirical_diff_means = diff_of_means(sepalLengthSetosa, sepalLengthVirginica)
empirical_diff_means









    Out[8]:





-1.5819999999999981



In [9]:

    
# permutation replicates
perm_replicates = draw_perm_reps(sepalLengthSetosa, sepalLengthVirginica, diff_of_means, size=10000)



In [10]:

    
# computing p-value
p = np.sum(perm_replicates <= empirical_diff_means) / len(perm_replicates)
p









    Out[10]:





0.0



In [ ]:

	Id	SepalLengthCm	SepalWidthCm	PetalLengthCm	PetalWidthCm	Species
0	1	5.1	3.5	1.4	0.2	Iris-setosa
1	2	4.9	3.0	1.4	0.2	Iris-setosa
2	3	4.7	3.2	1.3	0.2	Iris-setosa
3	4	4.6	3.1	1.5	0.2	Iris-setosa
4	5	5.0	3.6	1.4	0.2	Iris-setosa