In [5]:
import pandas as pd
#%matplotlib inline
import matplotlib.pyplot as plt

In [6]:
# Observations used throughout the notebook: one row per sample, with the
# predictor in `cells` and the response in `photo`.
vis = pd.DataFrame(dict(
    cells=[116, 117, 120, 1, 52, 79, 109, 27, 85, 51, 78, 55, 26, 39, 107],
    photo=[60, 67, 64, 8, 13, 63, 63, 2, 46, 27, 43, 24, 10, 28, 56],
))

In [7]:
# Registry of zero-argument drawing callables; later cells append to it.
graph = []


def graph_all(graph):
    """Call every callable in ``graph`` in order, discarding the results.

    The parameter intentionally has the same name as the module-level list;
    inside the function only the argument that was passed in is used.
    """
    for draw_layer in graph:
        draw_layer()

In [8]:
def scatter():
    """Draw the observations as green points: ``cells`` on x, ``photo`` on y."""
    return plt.scatter(x=vis['cells'], y=vis['photo'], color='g')


# Start the layer registry over with the scatter plot as its only entry,
# then render it.
graph = [scatter]
graph_all(graph)

In [9]:
# End points of a horizontal line at the mean of `photo`, spanning the
# observed range of `cells`.
x_range = (vis['cells'].min(), vis['cells'].max())
y_mean = (vis['photo'].mean(),) * 2


def mean():
    """Plot the horizontal mean-of-``photo`` line in brown."""
    return plt.plot(x_range, y_mean, color="brown")


graph.append(mean)
graph_all(graph)

The equation for the slope of the regression line can be constructed as:

$$ \hat\beta = \frac{ \operatorname{Cov}[x,y] }{ \operatorname{Var}[x] } $$

In [10]:
# Least-squares slope: covariance of cells with photo divided by the
# variance of cells.
slope = vis['cells'].cov(vis['photo']) / vis['cells'].var()
slope


Out[10]:
0.57225117604575371

The equation for the intercept of the regression line can be constructed as:

$$ \hat\alpha = \bar{y} - \hat\beta\,\bar{x} $$

In [11]:
# Intercept: mean of photo minus the slope times the mean of cells.
intercept = vis['photo'].mean() - slope * vis['cells'].mean()
intercept


Out[11]:
-2.2487165973726917

So we can construct the regression line by solving for our expected y:

$$ \hat{y} = (\text{slope} \times x) + \text{intercept} $$

In [12]:
# Fitted value for every observation, stored as a real DataFrame column.
# Bracket assignment is required here: `vis.expected = ...` would only set a
# plain Python attribute on the DataFrame object (pandas emits a UserWarning
# for this), so the values would not appear in `vis.columns` and would not
# survive operations that return a new DataFrame.
vis['expected'] = (vis['cells'] * slope) + intercept
vis['expected']


Out[12]:
0     64.132420
1     64.704671
2     66.421425
3     -1.676465
4     27.508345
5     42.959126
6     60.126662
7     13.202065
8     46.392633
9     26.936093
10    42.386875
11    29.225098
12    12.629814
13    20.069079
14    58.982159
Name: cells, dtype: float64

In [13]:
def regression():
    """Plot the fitted line: ``vis.expected`` against ``vis.cells``."""
    fitted_line = plt.plot(vis.cells, vis.expected)
    return fitted_line


graph.append(regression)
graph_all(graph)

SStotal:


In [14]:
def SStotal():
    """Draw one black vertical segment per row, from the mean of ``photo``
    to the observed ``photo`` value, at that row's ``cells`` position.

    Returns the list of ``plt.plot`` results, one entry per row of ``vis``.
    """
    segments = []
    for i in range(len(vis)):
        x = vis.cells[i]
        segments.append(
            plt.plot((x, x), (vis.photo.mean(), vis.photo[i]), color='black')
        )
    return segments


graph.append(SStotal)

In [15]:
# Redraw every layer currently registered in `graph`.
graph_all(graph)

SSregression:


In [16]:
def SSreg():
    """Draw one red vertical segment per row, from the fitted value
    (``vis.expected``) to the observed ``photo`` value, at that row's
    ``cells`` position.

    Returns the list of ``plt.plot`` results, one entry per row of ``vis``.
    """
    segments = []
    for i in range(len(vis)):
        x = vis.cells[i]
        segments.append(
            plt.plot((x, x), (vis.expected[i], vis.photo[i]), color='red')
        )
    return segments


graph.append(SSreg)
graph_all(graph)

recall that to calculate SStotal we use the formula:

$$ SS_{total} = \sum\limits_{i=1}^n(y_{i} - \bar{y})^{2} $$

In [17]:
# Total sum of squares: squared deviations of photo from its own mean.
sum_of_squares = dict(total=vis.photo.sub(vis.photo.mean()).pow(2).sum())
sum_of_squares['total']


Out[17]:
7684.9333333333334

In [18]:
# Sum of squared differences between the observed photo values and the fitted
# values. NOTE: although stored under the key 'regression', this quantity is
# what is usually called the residual (error) sum of squares.
sum_of_squares['regression'] = vis.photo.sub(vis.expected).pow(2).sum()
sum_of_squares['regression']


Out[18]:
974.25824207999085

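Now $R^2$ is just:

$$ R^{2} = 1 - \frac{SS_{regression}}{SS_{total}} $$

Here $SS_{regression}$ is the sum of squared differences between the observed and the fitted values (the red segments above), which is more commonly called the residual sum of squares, $SS_{res}$.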

In [19]:
# Coefficient of determination: one minus the ratio of the 'regression'
# entry to the 'total' entry of sum_of_squares.
r_squared = 1 - (
    sum_of_squares['regression'] / sum_of_squares['total']
)

In [20]:
# Display the R^2 value computed in the previous cell.
r_squared


Out[20]:
0.8732248934607989

In [1]:
# NOTE(review): this cell carries execution count In [1] although it sits at
# the end of the notebook, i.e. it was run out of order. None of the visible
# cells reference the extension — presumably an editor aid; confirm it can be
# dropped before sharing the notebook.
%load_ext vimception



In [5]:



0
1


In [ ]: