In [8]:
# import
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
%matplotlib inline
In [9]:
# Open the EAVS workbook; the handle stays in `fh` so a later cell can parse a sheet from it.
fh = pd.ExcelFile("dataset/EAVS.xlsx")
# List the available sheets before choosing one.
print(fh.sheet_names)
In [10]:
# Parse the "SectionC" sheet into a DataFrame and preview its first rows.
# NOTE(review): `data` is rebound to the iris array in a later cell, so this
# frame is only reachable until then.
data = fh.parse(sheet_name="SectionC")
data.head()
Out[10]:
In [11]:
# Load the iris dataset bundled with scikit-learn.
irisds = load_iris()
# Keep the measurement matrix and the matching feature (column) names.
data, col = irisds['data'], irisds['feature_names']
# Number of samples (rows) in the measurement matrix.
print(data.shape[0])
In [12]:
# Show the feature names, then preview the first five rows of measurements.
print(col)
data[0:5]
Out[12]:
In [13]:
# Histogram of sepal length (first column of `data`) drawn with seaborn's theme.
sns.set()
sepal = data[:, 0]
fig, ax = plt.subplots()
ax.hist(sepal)
ax.set_xlabel("sepal length")
ax.set_ylabel("Count")
Out[13]:
In [14]:
# matplotlib uses 10 bins by default; there are two ways to customise that.
# Way 1: pass the number of bins.
sns.set()
sepal = data[:, 0]
fig, ax = plt.subplots()
ax.hist(sepal, bins=6)
ax.set_xlabel("sepal length")
ax.set_ylabel("Count")
Out[14]:
In [15]:
# Way 2: pass the bin edges themselves (0, 1, ..., 9).
sns.set()
sepal = data[:, 0]
bin_edges = list(range(10))
fig, ax = plt.subplots()
ax.hist(sepal, bins=bin_edges)
ax.set_xlabel("sepal length")
ax.set_ylabel("Count")
Out[15]:
In [16]:
# Build a labelled DataFrame: one column per feature plus a readable species name.
# Codes 0 and 1 are looked up; any other code falls back to 'Virginica'.
species_by_code = {0: 'Setosa', 1: 'Versicolour'}
iris = pd.DataFrame(data, columns=col).assign(
    species=[species_by_code.get(code, 'Virginica') for code in irisds['target']]
)
iris.head()
Out[16]:
Bee swarm plot
In [17]:
# Bee swarm plot: every flower is drawn as one point, grouped by species.
sns.set()
ax = sns.swarmplot(data=iris, x="species", y='sepal length (cm)')
ax.set_xlabel("species")
ax.set_ylabel("sepal length (cm)")
Out[17]:
Empirical cumulative distribution function (ECDF)
In [18]:
def ecdf(data):
    """Compute the empirical cumulative distribution function of a 1-D sample.

    Parameters
    ----------
    data : array-like
        One-dimensional collection of observations.

    Returns
    -------
    x : numpy.ndarray
        The observations sorted in ascending order.
    y : numpy.ndarray
        Cumulative fractions 1/n, 2/n, ..., 1 paired with the sorted
        observations (the i-th smallest value gets i/n).
    """
    n = len(data)
    x = np.sort(data)
    # Ranks 1..n divided by n, so the largest observation maps to exactly 1.0.
    y = np.arange(1, n + 1) / n
    return x, y
In [19]:
# ECDF of sepal length for the Setosa flowers only.
x, y = ecdf(iris[iris['species']=='Setosa']['sepal length (cm)'])
plt.plot(x, y, marker='.', linestyle='none')
plt.xlabel("sepal length (cm)")
# The cumulative fraction belongs on the y-axis; calling xlabel a second time
# would overwrite the x label and leave the y-axis unlabelled.
plt.ylabel("ecdf")
plt.margins(0.02)
# The trailing comma makes this a one-element tuple; ('Setosa') is just a
# string, and matplotlib would then use its first character as the label.
plt.legend(('Setosa',), loc='upper left')
Out[19]:
In [20]:
# ECDF of sepal length for every species, drawn on shared axes.
species_names = ('Setosa', 'Versicolour', 'Virginica')
for name in species_names:
    x, y = ecdf(iris.loc[iris['species'] == name, 'sepal length (cm)'])
    plt.plot(x, y, marker='.', linestyle='none')
plt.xlabel("sepal length (cm)")
# The cumulative fraction belongs on the y-axis; a second xlabel call would
# overwrite the x label instead.
plt.ylabel("ecdf")
plt.margins(0.02)
# Legend entries follow the plotting order of the loop above.
plt.legend(species_names, loc='upper left')
Out[20]:
mean - the average value; it is sensitive to outliers
$$ mean = \frac{1}{n}\sum_{i=1}^{n} x_i $$
median - the middle value of the sorted data; it is not affected by extreme values
In [21]:
# Average vs. median sepal length (cm) of the Virginica flowers.
virginica_sepal_len = iris.loc[iris['species'] == 'Virginica', 'sepal length (cm)']
print("Mean : ", np.mean(virginica_sepal_len))
print("Median : ", np.median(virginica_sepal_len))
In [22]:
# One DataFrame per species, selected with a boolean mask on the species column.
species_col = iris['species']
virgin = iris.loc[species_col == 'Virginica']
setosa = iris.loc[species_col == 'Setosa']
versi = iris.loc[species_col == 'Versicolour']
In [23]:
# 25th, 50th and 75th percentiles of Virginica sepal length.
# The printed label must name the same percentiles that are computed.
quartile_points = [25, 50, 75]
print("25, 50, 75 percentile", np.percentile(virgin['sepal length (cm)'], quartile_points))
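To represent the percentiles and check for outliers, we use a box plot.
In a box plot, the box starts at the 25th percentile, the line inside marks the median (50th percentile), and the box ends at the 75th percentile.
The whiskers usually extend up to 1.5x the interquartile range (the distance between the 25th and 75th percentiles) beyond the box.
Any points beyond the whiskers are called outliers.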
In [24]:
# One box per species, drawn from the full frame so all species appear on the same axes.
ax = sns.boxplot(data=iris, x='species', y='sepal length (cm)')
ax.margins(0.2)
Variance - the mean of the squared distances of the data points from their mean
where $\bar{x}$ is the mean
$$ variance = \frac{1}{n}{\sum_{i=1}^{n}(x_i - \bar{x})^2} $$
Standard deviation - the square root of the variance
$$ std =\sqrt{\frac{1}{n}{\sum_{i=1}^{n}(x_i - \bar{x})^2}} $$
A higher std value means the data is more spread out
In [25]:
# Variance of Versicolour sepal length (np.var divides by n unless ddof is given).
versi_sepal_len = versi['sepal length (cm)']
var = np.var(versi_sepal_len)
var
Out[25]:
In [26]:
# Standard deviation two ways: square root of the variance vs. np.std directly.
versi_sepal_len = versi['sepal length (cm)']
std1, std2 = np.sqrt(var), np.std(versi_sepal_len)
print(std1, std2)
covariance - a measure of how two quantities vary together
$$ cov = \frac{1}{n}\sum_{i=1}^{n}(x_i-\bar{x})(y_i-\bar{y}) $$
Pearson correlation coefficient - the covariance divided by the product of the two standard deviations; it is dimensionless and lies between -1 and 1
$$ \rho = \frac{\frac{1}{n}\sum_{i=1}^{n}(x_i-\bar{x})(y_i-\bar{y})}{std(x)\, std(y)} $$
In [27]:
# Scatter of Versicolour sepal length vs. width, drawn as unconnected markers with plot().
sepal_len = versi['sepal length (cm)']
sepal_wid = versi['sepal width (cm)']
fig, ax = plt.subplots()
ax.plot(sepal_len, sepal_wid, marker='.', linestyle='none')
ax.set_xlabel('sepal length (cm)')
ax.set_ylabel("sepal width (cm)")
ax.margins(0.2)
In [28]:
# The same relationship drawn with scatter() instead of plot().
sepal_len = versi['sepal length (cm)']
sepal_wid = versi['sepal width (cm)']
fig, ax = plt.subplots()
ax.scatter(sepal_len, sepal_wid)
ax.set_xlabel('sepal length (cm)')
ax.set_ylabel("sepal width (cm)")
ax.margins(0.2)
In [29]:
# 2x2 covariance matrix of Versicolour sepal length and width:
# variances on the diagonal, the covariance off the diagonal.
# NOTE(review): np.cov normalises by (n - 1) by default, not by n.
sepal_len = versi['sepal length (cm)']
sepal_wid = versi['sepal width (cm)']
cov_mat = np.cov(sepal_len, sepal_wid)
print(cov_mat)
In [30]:
# Pearson correlation matrix for Versicolour sepal length vs. width.
sepal_len = versi['sepal length (cm)']
sepal_wid = versi['sepal width (cm)']
coef = np.corrcoef(sepal_len, sepal_wid)
print(coef)
# The off-diagonal entry is the correlation between the two measurements.
print(coef[0, 1])
In [31]:
# Recompute the Pearson correlation by hand from the covariance matrix.
# np.cov divides by (n - 1) by default while np.std divides by n, so the
# standard deviations need ddof=1 to use the same normalisation as cov_mat.
std_len = np.std(versi['sepal length (cm)'], ddof=1)
std_wid = np.std(versi['sepal width (cm)'], ddof=1)
print(std_len, std_wid)
# Divide each covariance entry by the product of the two matching standard
# deviations: this gives 1 on the diagonal and rho off the diagonal, which
# should agree with np.corrcoef.
stds = np.array([std_len, std_wid])
coeff = cov_mat / (stds[:, None] * stds[None, :])
coeff
Out[31]:
In [ ]:
In [ ]:
In [ ]: