In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn import linear_model, datasets, metrics, model_selection, feature_selection, preprocessing
from scipy import stats
In [2]:
# Load the Boston housing data through scikit-learn's dataset loader.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the pinned scikit-learn version before re-running.
boston = datasets.load_boston()

# Target is kept exactly as the loader returns it; the feature matrix is
# wrapped in a DataFrame so every column carries its feature name.
y = boston.target
X = pd.DataFrame(data=boston.data, columns=boston.feature_names)
In [3]:
# Report the (rows, columns) shape of the feature DataFrame.
print('shape:', X.shape)
In [4]:
# Summary statistics of the feature DataFrame, shown as the cell output.
X.describe()
Out[4]:
In [5]:
# Distribution plot of the target values.
# NOTE(review): seaborn has deprecated distplot (since 0.11) in favour of
# histplot/displot -- confirm the seaborn version this notebook targets.
sns.distplot(y)
Out[5]:
In [6]:
# Pairwise plots across the columns of X; the trailing ';' suppresses the
# repr of the returned object in the cell output.
sns.pairplot(X);
In [7]:
# Two stacked panels for the target: a horizontal box plot on top (ax1) and
# its distribution plot underneath (ax2).
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1)
sns.distplot(y, ax=ax2)
# Kept as the last expression so the cell output stays the box plot's axes.
sns.boxplot(data=y, orient='h', ax=ax1)
Out[7]:
In [8]:
# Hold out 30% of the samples for testing.
# A fixed random_state makes the shuffle deterministic, so a fresh
# Restart & Run All reproduces the same split -- and therefore the same
# fitted coefficients and train/test scores further down the notebook.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, train_size=0.7, random_state=42
)
print('train samples:', len(X_train))
print('test samples', len(X_test))
In [9]:
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")
In [10]:
# Ordinary least-squares linear regression fitted on the training split.
lr = linear_model.LinearRegression()
# fit() is left as the final expression so the estimator is still displayed.
lr.fit(X=X_train, y=y_train)
Out[10]:
In [11]:
# Show how many coefficients the fitted model has, followed by their values.
coefficients = lr.coef_
print('No coef:', len(coefficients))
print('Coefficients: \n', coefficients)
In [12]:
# Predictions of the fitted model for the held-out test features.
predicted = lr.predict(X_test)
In [13]:
# Measured vs. predicted target on the test split. The dashed red diagonal
# spans the full target range and marks where prediction equals measurement.
fig, ax = plt.subplots()
y_lo, y_hi = y.min(), y.max()
ax.scatter(x=y_test, y=predicted)
ax.plot([y_lo, y_hi], [y_lo, y_hi], color='red', ls='--')
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
Out[13]:
In [14]:
# Residual = measured target minus model prediction, on the test split.
residual = y_test - predicted
In [15]:
# Test residuals plotted against the measured target, with a dashed red
# reference line at zero.
fig, ax = plt.subplots()
ax.scatter(x=y_test, y=residual)
ax.axhline(y=0, ls='--', color='red')
ax.set_xlabel('y')
ax.set_ylabel('residual')
Out[15]:
In [16]:
# Distribution plot of the test residuals; the trailing ';' suppresses the
# repr of the returned object in the cell output.
sns.distplot(residual);
The training scores
In [17]:
# R^2 of the fitted model on the training split.
metrics.r2_score(y_true=y_train, y_pred=lr.predict(X_train))
Out[17]:
In [18]:
# Mean squared error of the fitted model on the training split.
metrics.mean_squared_error(y_true=y_train, y_pred=lr.predict(X_train))
Out[18]:
In [19]:
# R^2 on the held-out test split, using the predictions computed earlier.
metrics.r2_score(y_true=y_test, y_pred=predicted)
Out[19]:
In [20]:
# Mean squared error on the held-out test split.
metrics.mean_squared_error(y_true=y_test, y_pred=predicted)
Out[20]:
In [21]:
# Print the fitted intercept and then the coefficient vector, one per line.
print(lr.intercept_, lr.coef_, sep='\n')