In [1]:
# importing required modules
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
# runing the functions script
%run stats_func.py
In [2]:
# loading the iris dataset
df = pd.read_csv('iris.csv')
df.head()
Out[2]:
In [3]:
# for further analysis sepal length and sepal width will be used
sepalLength = df['SepalLengthCm'].values
sepalWidth = df['SepalWidthCm'].values
# the plot above is for all iris spieces, which can be listed via:
df['Species'].unique()
Out[3]:
In [4]:
# the remaining analysis will focus only on Iris-setosa
sepalLengthSetosa = np.array(df['SepalLengthCm'][df['Species'] == 'Iris-setosa'])
sepalWidthSetosa = np.array(df['SepalWidthCm'][df['Species'] == 'Iris-setosa'])
sepalLengthVirginica = np.array(df['SepalLengthCm'][df['Species'] == 'Iris-virginica'])
Special function is defined to mix the data (that should be very similar):
def permutation_sample(data1, data2):
"""Generate a permutation sample from two data sets."""
# Concatenate the data sets: data
data = np.concatenate((data1, data2))
# Permute the concatenated array: permuted_data
permuted_data = np.random.permutation(data)
# Split the permuted array into two: perm_sample_1, perm_sample_2
perm_sample_1 = permuted_data[:len(data1)]
perm_sample_2 = permuted_data[len(data1):]
return perm_sample_1, perm_sample_2
In [5]:
(np.mean(sepalLengthSetosa), np.mean(sepalLengthVirginica))
Out[5]:
In [6]:
for i in range(50):
# Generate permutation samples
perm_sample_1, perm_sample_2 = permutation_sample(sepalLengthSetosa, sepalLengthVirginica)
# Compute ECDFs
x_1, y_1 = ecdf(perm_sample_1)
x_2, y_2 = ecdf(perm_sample_2)
# Plot ECDFs of permutation sample
_ = plt.plot(x_1, y_1, marker='.', linestyle='none',
color='red', alpha=0.02)
_ = plt.plot(x_2, y_2, marker='.', linestyle='none',
color='blue', alpha=0.02)
# Create and plot ECDFs from original data
x_1, y_1 = ecdf(sepalLengthSetosa)
x_2, y_2 = ecdf(sepalLengthVirginica)
_ = plt.plot(x_1, y_1, marker='.', linestyle='none', color='red', label='Sepal Length - setosa')
_ = plt.plot(x_2, y_2, marker='.', linestyle='none', color='blue', label='Sepal Length - virginica')
# Label axes, set margin, and show plot
plt.margins(0.02)
_ = plt.xlabel('sepal lengths and widths [cm]')
_ = plt.ylabel('ECDF')
plt.legend();
for this purpose a new function is defined - draw permutation replicates, that uses permutation samples function inside:
def draw_perm_reps(data_1, data_2, func, size=1):
"""Generate multiple permutation replicates."""
# Initialize array of replicates: perm_replicates
perm_replicates = np.empty(size)
for i in range(size):
# Generate permutation sample
perm_sample_1, perm_sample_2 = permutation_sample(data_1, data_2)
# Compute the test statistic
perm_replicates[i] = func(perm_sample_1, perm_sample_2)
return perm_replicates
For this example difference of means is looked at
In [7]:
# definition of the difference of means
def diff_of_means(data_1, data_2):
"""Difference in means of two arrays."""
# The difference of means of data_1, data_2: diff
diff = np.mean(data_1) - np.mean(data_2)
return diff
In [8]:
# empirical difference of means
empirical_diff_means = diff_of_means(sepalLengthSetosa, sepalLengthVirginica)
empirical_diff_means
Out[8]:
In [9]:
# permutation replicates
perm_replicates = draw_perm_reps(sepalLengthSetosa, sepalLengthVirginica, diff_of_means, size=10000)
In [10]:
# computing p-value
p = np.sum(perm_replicates <= empirical_diff_means) / len(perm_replicates)
p
Out[10]:
In [ ]: