Use of the Empirical Cumulative Distribution Function (ECDF) with the iris dataset



In [1]:

    
# importing required modules
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

# running the functions script (expected to provide the ecdf() helper used below)
%run stats_func.py



In [2]:

    
# Read the iris measurements from the CSV file in the working directory
# into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer='iris.csv')
df.head(n=5)









    Out[2]:






  
    
      
      Id
      SepalLengthCm
      SepalWidthCm
      PetalLengthCm
      PetalWidthCm
      Species
    
  
  
    
      0
      1
      5.1
      3.5
      1.4
      0.2
      Iris-setosa
    
    
      1
      2
      4.9
      3.0
      1.4
      0.2
      Iris-setosa
    
    
      2
      3
      4.7
      3.2
      1.3
      0.2
      Iris-setosa
    
    
      3
      4
      4.6
      3.1
      1.5
      0.2
      Iris-setosa
    
    
      4
      5
      5.0
      3.6
      1.4
      0.2
      Iris-setosa



In [3]:

    
# For the rest of the analysis only the sepal-length column is used.

# Box plot of the selected variable.
# The column is passed through the `x` keyword: positional data variables were
# deprecated in seaborn 0.11 and are rejected by later releases, where `data`
# is the only argument that may be given positionally.
sns.boxplot(x='SepalLengthCm', data=df);



In [4]:

    
# Swarm plot of the sepal lengths (one dot per observation).
# The column is passed through the `x` keyword: positional data variables were
# deprecated in seaborn 0.11 and are rejected by later releases, where `data`
# is the only argument that may be given positionally.
sns.swarmplot(x='SepalLengthCm', data=df);



In [5]:

    
# Pull the sepal-length column out of the frame and check its type;
# at this point it is still a pandas Series, not yet a NumPy array.
sepalLength = df.loc[:, 'SepalLengthCm']
type(sepalLength)









    Out[5]:





pandas.core.series.Series



In [6]:

    
# Turn the sepal lengths into a plain NumPy array (an explicit copy, which is
# np.array's default) and confirm the resulting type.
sepalLength = np.array(sepalLength, copy=True)
type(sepalLength)









    Out[6]:





numpy.ndarray



In [7]:

    
# Empirical Cumulative Distribution Function (ECDF) of the sepal lengths.
# ecdf() is not defined in this notebook; presumably it comes from the
# stats_func.py script run in the first cell -- confirm.
x, y = ecdf(sepalLength)

# Show the ECDF as individual dots (no connecting line) and label both axes.
plt.plot(x, y, linestyle='none', marker='.')
plt.gca().set(xlabel='Sepal Length [cm]', ylabel='ECDF');



In [8]:

    
# Generate points from a normal distribution whose mean and standard deviation
# are derived from the dataset (np.std uses its default, ddof=0).
mu = np.mean(sepalLength)
sigma = np.std(sepalLength)

# Draw from a seeded generator so the simulated sample -- and therefore the
# figure below -- is the same on every Restart & Run All.
rng = np.random.default_rng(42)
sepalNormDistr = rng.normal(mu, sigma, 10000)

# ECDF of the simulated normal sample
x_n, y_n = ecdf(sepalNormDistr)

# Plot the empirical ECDF (dots) together with the simulated normal CDF (red line)
plt.plot(x, y, marker='.', linestyle='none', label='empirical')
plt.plot(x_n, y_n, color='red', label='norm distr')
plt.legend()
plt.xlabel('Sepal Length [cm]')
plt.ylabel('CDF');



In [9]:

    
# Comparison with an exponential distribution whose scale (mean) is the
# sample mean `mu`.
# Draw from a seeded generator so the simulated sample -- and therefore the
# figure below -- is the same on every Restart & Run All.
rng = np.random.default_rng(42)
sepalExpDistr = rng.exponential(mu, 10000)

# ECDF of the simulated exponential sample
x_e, y_e = ecdf(sepalExpDistr)

# Plot the empirical ECDF (dots) together with the simulated exponential CDF (orange line)
plt.plot(x, y, marker='.', linestyle='none', label='empirical')
plt.plot(x_e, y_e, color='orange', label='exp distr')
plt.legend()
plt.xlabel('Sepal Length [cm]')
plt.ylabel('CDF');



In [10]:

    
# Histogram of the sepal lengths: 30 bins, bars outlined in black,
# with both axes labelled.
plt.hist(x=sepalLength, edgecolor='black', bins=30)
plt.gca().set(xlabel='Sepal Length [cm]', ylabel='Frequency / count');



In [ ]:

	Id	SepalLengthCm	SepalWidthCm	PetalLengthCm	PetalWidthCm	Species
0	1	5.1	3.5	1.4	0.2	Iris-setosa
1	2	4.9	3.0	1.4	0.2	Iris-setosa
2	3	4.7	3.2	1.3	0.2	Iris-setosa
3	4	4.6	3.1	1.5	0.2	Iris-setosa
4	5	5.0	3.6	1.4	0.2	Iris-setosa