In [1]:
import pandas as pd
import numpy as np
from random import randint
from pandas import Series,DataFrame
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 10)
plt.rcParams['font.size'] = 14
In [8]:
%%html
<style>
table {float:left}
</style>
In [2]:
from sklearn.datasets import load_boston
boston = load_boston()
type(boston)
Out[2]:
Load and return the Boston house-prices dataset (regression).

| Characteristic | Value          |
|----------------|----------------|
| Samples total  | 506            |
| Dimensionality | 13             |
| Features       | real, positive |
| Targets        | real 5. - 50.  |
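For orientation, the object returned by load_boston is a Bunch exposing the data, target, feature_names and DESCR attributes. A quick way to inspect it (a minimal sketch, not an executed cell from the original notebook):

# inspect what the Bunch contains (sketch)
print(boston.feature_names)   # names of the 13 predictors (CRIM, ZN, INDUS, ...)
print(boston.data.shape)      # (506, 13) design matrix
print(boston.target.shape)    # (506,) median house values in $1000s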
In [13]:
type(boston.data)
Out[13]:
In [57]:
print(boston.target.shape)
plt.figure()
sns.distplot(boston.target)
Out[57]:
Using the least squares matrix method from: https://onlinecourses.science.psu.edu/stat501/node/382
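In matrix notation, setting the derivative of the residual sum of squares $\|Y - X\beta\|^2$ with respect to $\beta$ to zero gives the normal equations, whose solution is:

$$\hat{\beta} = (X^{\mathsf{T}} X)^{-1} X^{\mathsf{T}} Y$$

This is exactly what the cell below computes, with X_tr playing the role of $X^{\mathsf{T}}$.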
In [22]:
# least squares estimation (obtained by setting the first derivative of the squared error to zero)
# beta = (X^T X)^-1 X^T Y  (normal equations)
Y = boston.target
X = boston.data
X_tr = X.transpose()
inv = np.linalg.inv(X_tr.dot(X))
b = (inv.dot(X_tr)).dot(Y)
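As a sanity check (a minimal sketch, not part of the original notebook), the same coefficients can be obtained with np.linalg.lstsq, which avoids forming the inverse explicitly and is numerically more stable. Note that since X has no column of ones, this fit has no intercept term.

# cross-check the explicit normal-equations solution (sketch)
b_lstsq, _, _, _ = np.linalg.lstsq(X, Y, rcond=None)
print(np.allclose(b, b_lstsq))  # expected to print True, up to numerical precision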
In [25]:
y_pred = X.dot(b)
In [35]:
# Compute R squared
ssreg = np.sum((y_pred-Y.mean())**2)
sstot = np.sum((Y-Y.mean())**2)
R_sq = ssreg/sstot
print('R Squared = {:.2f}'.format(R_sq))
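Note that SSreg/SStot and the more common definition 1 - SSres/SStot coincide only when the model contains an intercept; since the fit above has none, the two can differ slightly. A quick comparison (a sketch, not part of the original run, reusing Y, y_pred and sstot from above):

# alternative definition of R squared (sketch)
ssres = np.sum((Y - y_pred)**2)
R_sq_alt = 1.0 - ssres/sstot
print('R Squared (1 - SSres/SStot) = {:.2f}'.format(R_sq_alt))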
In [55]:
# standardized residual plot
res = Y - y_pred                                    # raw residuals
res_d = ((1.0/(len(res)-1))*np.sum(res**2))**0.5    # residual standard deviation (ddof=1, assuming zero-mean residuals)
res_stu = res/res_d                                 # residuals scaled by their standard deviation
plt.figure()
grid = sns.JointGrid(x=Y, y=res_stu, space=0, size=6, ratio=50)
grid.plot_joint(plt.scatter, color="g")
grid.plot_marginals(sns.rugplot, height=1, color="g")
grid.set_axis_labels(xlabel="Boston house price", ylabel="standardized residuals")
Out[55]:
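The quantity plotted above is a standardized residual (each residual divided by one overall residual standard deviation). Internally studentized residuals additionally account for leverage through the hat matrix; a minimal sketch, reusing X, Y, res, inv and X_tr from the cells above:

# internally studentized residuals via the hat matrix (sketch)
n, p = X.shape
H = X.dot(inv).dot(X_tr)                 # hat matrix H = X (X^T X)^-1 X^T
h = np.diag(H)                           # leverages h_ii
s = np.sqrt(np.sum(res**2) / (n - p))    # residual standard error
res_int_stu = res / (s * np.sqrt(1.0 - h))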
In [47]:
res_d
Out[47]:
In [48]:
res_stu[0:10]
Out[48]: