In [1]:
# importing required modules
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
# running the functions script
%run stats_func.py
In [2]:
# read the iris dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='iris.csv')
# preview the first five rows
df.head(n=5)
Out[2]:
In [3]:
# the Sepal Length variable is used for the rest of the analysis
# box plot of the selected variable; the column is passed as keyword `x`
# because seaborn >= 0.12 accepts only `data` as a positional argument, so
# passing the column name positionally next to `data=` raises a TypeError
sns.boxplot(x='SepalLengthCm', data=df);
In [4]:
# swarm plot of the same variable; `x` is passed by keyword because
# seaborn >= 0.12 accepts only `data` as a positional argument
sns.swarmplot(x='SepalLengthCm', data=df);
In [5]:
# pull the Sepal Length column out of the DataFrame (first step of the
# conversion to a NumPy array)
sepalLength = df.loc[:, 'SepalLengthCm']
# show the container type at this stage
type(sepalLength)
Out[5]:
In [6]:
# turn the selected column into a NumPy array (an independent copy of the values)
sepalLength = np.array(sepalLength, copy=True)
# confirm the new container type
type(sepalLength)
Out[6]:
In [7]:
# Empirical Cumulative Distribution Function (ECDF) of the measurements
# (ecdf is not defined in this notebook - presumably it comes from the
# stats_func.py script run in the first cell)
x, y = ecdf(sepalLength)
# draw the ECDF as unconnected points
plt.plot(x, y, linestyle='none', marker='.')
plt.ylabel('ECDF')
plt.xlabel('Sepal Length [cm]');
In [8]:
# generating normal distribution points with mean and std_dev derived from the dataset
# (np.std uses its default ddof=0 here, i.e. the population standard deviation)
mu = np.mean(sepalLength)
sigma = np.std(sepalLength)
# draw from a locally seeded generator so the simulated sample - and therefore
# the figure - is identical on every Restart & Run All
rng_norm = np.random.default_rng(42)
sepalNormDistr = rng_norm.normal(mu, sigma, 10000)
# ECDF of the simulated normal sample
x_n, y_n = ecdf(sepalNormDistr)
# overlay the empirical ECDF (points) and the simulated normal CDF (line)
plt.plot(x, y, marker='.', linestyle='none', label='empirical')
plt.plot(x_n, y_n, color='red', label='norm distr')
plt.legend()
plt.xlabel('Sepal Length [cm]')
plt.ylabel('CDF');
In [9]:
# comparison with an exponential distribution whose scale parameter is the
# sample mean of Sepal Length
# draw from a locally seeded generator so the simulated sample - and therefore
# the figure - is identical on every Restart & Run All
rng_exp = np.random.default_rng(42)
sepalExpDistr = rng_exp.exponential(mu, 10000)
# ECDF of the simulated exponential sample
x_e, y_e = ecdf(sepalExpDistr)
# overlay the empirical ECDF (points) and the simulated exponential CDF (line)
plt.plot(x, y, marker='.', linestyle='none', label='empirical')
plt.plot(x_e, y_e, color='orange', label='exp distr')
plt.legend()
plt.xlabel('Sepal Length [cm]')
plt.ylabel('CDF');
In [10]:
# histogram of the Sepal Length values: 30 bins, bars outlined in black
plt.hist(sepalLength, edgecolor='black', bins=30)
plt.ylabel('Frequency / count')
plt.xlabel('Sepal Length [cm]');
In [ ]: