Regression


In [1]:
# importing required modules
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

# running the functions script
%run stats_func.py

In [2]:
# read the iris measurements from the CSV file in the working directory,
# then preview the first five rows
df = pd.read_csv('iris.csv')
df.head(n=5)


Out[2]:
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
0 1 5.1 3.5 1.4 0.2 Iris-setosa
1 2 4.9 3.0 1.4 0.2 Iris-setosa
2 3 4.7 3.2 1.3 0.2 Iris-setosa
3 4 4.6 3.1 1.5 0.2 Iris-setosa
4 5 5.0 3.6 1.4 0.2 Iris-setosa

In [3]:
# sepal length and sepal width are the two features analysed below;
# each column is pulled out of the frame as an independent NumPy array
sepalLength = df['SepalLengthCm'].to_numpy(copy=True)
sepalWidth = df['SepalWidthCm'].to_numpy(copy=True)

In [4]:
# switch matplotlib over to seaborn's default plot styling
sns.set()

# sepal width against sepal length for every sample, drawn as markers only
fig, ax = plt.subplots()
ax.plot(sepalLength, sepalWidth, linestyle='none', marker='.')
ax.set_xlabel('Sepal length [cm]')
ax.set_ylabel('Sepal width [cm]');



In [5]:
# the plot above mixes all iris species; the distinct ones can be listed via:
df.loc[:, 'Species'].unique()


Out[5]:
array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [6]:
# the remaining analysis will focus only on Iris-setosa:
# build the boolean row mask once and reuse it for both columns, selecting
# rows and column in a single .loc step instead of chained indexing
isSetosa = df['Species'] == 'Iris-setosa'
sepalLengthSetosa = df.loc[isSetosa, 'SepalLengthCm'].to_numpy()
sepalWidthSetosa = df.loc[isSetosa, 'SepalWidthCm'].to_numpy()

# scatter plot of the Iris-setosa samples only, drawn as markers
fig, ax = plt.subplots()
ax.plot(sepalLengthSetosa, sepalWidthSetosa, marker='.', linestyle='none')
ax.set_xlabel('Sepal length [cm] Iris-setosa')
ax.set_ylabel('Sepal width [cm] Iris-setosa');



In [7]:
# first-order least-squares fit: width = a * length + b
a, b = np.polyfit(sepalLengthSetosa, sepalWidthSetosa, deg=1)

# evaluate the fitted line at 10 evenly spaced lengths between 4 and 6
x = np.linspace(4, 6, num=10)
y = b + a * x

# data points with the fitted straight line drawn on top
fig, ax = plt.subplots()
ax.plot(sepalLengthSetosa, sepalWidthSetosa, linestyle='none', marker='.', label='data points')
ax.plot(x, y, label='lin reg line', color='green')
ax.legend(loc='lower right')
ax.set_xlabel('Sepal length [cm] Iris-setosa')
ax.set_ylabel('Sepal width [cm] Iris-setosa');



In [8]:
# second order regression: width = aa * length**2 + bb * length + cc
aa, bb, cc = np.polyfit(sepalLengthSetosa, sepalWidthSetosa, 2)

# regression curve calculation on this cell's own dense grid: a parabola
# drawn through only a handful of points shows up as a visibly segmented
# polyline, and using a local grid keeps the cell independent of whatever
# grid the linear-fit cell happened to define
xx = np.linspace(4, 6, 100)
yy = aa * xx**2 + bb * xx + cc

# scatter plot with the fitted quadratic curve drawn on top
fig, ax = plt.subplots()
ax.plot(sepalLengthSetosa, sepalWidthSetosa, marker='.', linestyle='none', label='data points')
ax.plot(xx, yy, color='red', label='quad reg line')
ax.legend(loc='lower right')
ax.set_xlabel('Sepal length [cm] Iris-setosa')
ax.set_ylabel('Sepal width [cm] Iris-setosa');



In [ ]: