In [85]:
# data analytics in python
import numpy as np
import pandas as pd
# data
#from sklearn import datasets
# plot
import matplotlib.pyplot as plt
%matplotlib inline
# regression
import sklearn
from sklearn import datasets, linear_model
from sklearn import cross_validation
from sklearn.metrics import accuracy_score
# correlation coef.
from scipy.stats import pearsonr
In [75]:
boston = datasets.load_boston()
boston_df = pd.DataFrame(boston.data, columns = boston.feature_names)
boston_df["MEDV"] = boston.target
boston_df.head()
Out[75]:
In [76]:
# Let's keep it simple - we select a few features and the target:
# - MEDV Median value of owner-occupied homes in $1000's
# and a couple of numerical independent variables:
# - RM average number of rooms per dwelling
# - AGE proportion of owner-occupied units built prior to 1940
boston_df = boston_df[["MEDV", "RM", "AGE"]]
In [110]:
# quick inspection of the (flattened) training features defined in the next cell
print(boston_X_train.flatten())
In [159]:
# If we just split our sample into training and testing sets, with a ratio of e.g. 75/25,
# and use only one feature.
# Each newaxis object in the selection tuple expands the dimensions of the resulting selection by one unit-length dimension:
# boston_df["RM"].values.shape is (506,)
# boston_df["RM"].values[:, np.newaxis].shape is (506, 1) <= this is the desired shape for the independent variables
# boston_df["MEDV"].shape is (506,) <= this is the desired shape for the target variable
boston_X = boston_df["RM"].values[:, np.newaxis]
boston_y = boston_df["MEDV"]
# Split the data into training/testing sets
boston_X_train = boston_X[:-120]
boston_X_test = boston_X[-120:]
# Split the targets into training/testing sets
boston_y_train = boston_y[:-120]
boston_y_test = boston_y[-120:]
# Create linear regression object
regr = linear_model.LinearRegression()
# Train the model using the training sets
regr.fit(boston_X_train, boston_y_train)
# regr.coef_ is the slope and regr.intercept_ is the intercept of the fitted line y = a*x + b
print('slope a: ', regr.coef_)
print('intercept b: ', regr.intercept_)
a = regr.coef_
b = regr.intercept_
# and then we can plot the fitted line explicitly, e.g.
# plt.plot([4, 9], [a*4+b, a*9+b], 'r--', linewidth=3)
# The mean squared error (note: we report the mean, not the sum, of the squared residuals)
print("Training MSE: %.2f" % np.mean((regr.predict(boston_X_train) - boston_y_train) ** 2))
print("Test MSE: %.2f" % np.mean((regr.predict(boston_X_test) - boston_y_test) ** 2))
# Explained variance score: 1 is perfect prediction
# Returns the coefficient of determination R^2 of the prediction.
print('Training Variance score: %.2f' % regr.score(boston_X_train, boston_y_train))
print('Test Variance score: %.2f' % regr.score(boston_X_test, boston_y_test))
print('Training pearson:', pearsonr(boston_X_train.flatten(), boston_y_train))
print('Test pearson:', pearsonr(boston_X_test.flatten(), boston_y_test))
# (accuracy_score is a classification metric, so it does not apply to this regression)
#print('Test set accuracy: %.2f' % accuracy_score(boston_X_test, boston_y_test))
# !!! Attention, a few words on the score - here the score is the R^2 score !!!
# There is no reason R^2 cannot be negative (despite the ^2 in its name).
# This is also stated in the docs. You can see R^2 as a comparison of your model fit
# (in the context of linear regression, a model of order 1, i.e. affine) against a model of order 0
# (just fitting a constant), both obtained by minimizing a squared loss.
# The constant minimizing the squared error is the mean.
# Since we validate on left-out data, it can happen that the mean of the test set
# is wildly different from the mean of the training set.
# This alone can make the squared error of the prediction much higher than that of simply
# predicting the mean of the test data, which results in a negative R^2 score.
# In the worst case, if your data do not explain your target at all, these scores can become very strongly negative.
# (A tiny numeric sketch of this follows in the next cell, after the plots below.)
# TODO cross check next
# R^2 is bounded above by 1.0, but it is not bounded below.
# Correlation, in contrast, is always bounded between -1 and 1.
# Plot outputs
fig, ax = plt.subplots(1, 2)
ax[0].scatter(boston_X_train, boston_y_train, color='black')
ax[0].plot(boston_X_train, regr.predict(boston_X_train), color='red', linewidth=3)
ax[0].set_title('Training')
ax[0].set_xlabel('RM (our feature)')
ax[0].set_ylabel('MEDV (our target response)')
ax[1].scatter(boston_X_test, boston_y_test, color='black')
ax[1].plot(boston_X_test, regr.predict(boston_X_test), color='blue', linewidth=3)
ax[1].set_title('Test')
ax[1].set_xlabel('RM (our feature)')
ax[1].set_ylabel('MEDV (our target response)')
plt.show()
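In [ ]:
# A minimal numeric sketch of the negative-R^2 note above (toy data, not the Boston sample):
# the training target has mean 2.5 while the test target has mean 11.5, so the fitted line's
# squared error on the test set is far larger than that of simply predicting the test mean,
# and the R^2 score comes out negative.
from sklearn.metrics import r2_score
x_toy = np.array([1., 2., 3., 4.])[:, np.newaxis]
y_toy_train = np.array([1., 2., 3., 4.])     # mean 2.5
y_toy_test = np.array([11., 10., 13., 12.])  # mean 11.5, wildly different from the training mean
toy = linear_model.LinearRegression().fit(x_toy, y_toy_train)
print('toy test R^2:', r2_score(y_toy_test, toy.predict(x_toy)))  # a (strongly) negative value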
In [8]:
# Comments: it is obvious even from the plot that simply splitting the sample e.g. 75/25 was not good enough,
# especially since our sample is relatively small (~500 rows).
# As we can see, the data are not distributed and spread well enough to partition them into separate
# training and test sets with the conventional validation method.
# The training Variance score: 0.56
# The test Variance score: -1.78 - considerably worse
# The training MSE is also much better (lower) than the test MSE.
# It is worth seeing what we can get by performing cross-validation...
In [114]:
# In this example it is almost obvious that the data are not evenly distributed, so when we split
# simply with slices like [:-120] / [-120:] we stand a good chance of introducing bias.
# A better way is to shuffle (randomly sample) the data at the same time as we split it,
# so that both sets are well distributed.
# We can get this quickly with scikit-learn's train_test_split function:
# use scikit-learn to easily get a shuffled split
boston_X_train, boston_X_test, boston_y_train, boston_y_test = cross_validation.train_test_split(
    boston_df["RM"].values[:, np.newaxis], boston_df["MEDV"], test_size=0.25, random_state=0)
regr = linear_model.LinearRegression()
regr.fit(boston_X_train, boston_y_train)
print("Training Residual sum of squares: %.2f" % np.mean((regr.predict(boston_X_train) - boston_y_train) ** 2))
print("Test Residual sum of squares: %.2f" % np.mean((regr.predict(boston_X_test) - boston_y_test) ** 2))
print('Training Variance score: %.2f' % regr.score(boston_X_train, boston_y_train))
print('Test Variance score: %.2f' % regr.score(boston_X_test, boston_y_test))
print('Training pearson:', pearsonr(boston_X_train.flatten(), boston_y_train))
print('Test pearson:', pearsonr(boston_X_test.flatten(), boston_y_test))
# Plot outputs
fig, ax = plt.subplots(1, 2)
ax[0].scatter(boston_X_train, boston_y_train, color='black')
ax[0].plot(boston_X_train, regr.predict(boston_X_train), color='red', linewidth=3)
ax[0].set_title('Training')
ax[0].set_xlabel('RM (our feature)')
ax[0].set_ylabel('MEDV (our target response)')
ax[1].scatter(boston_X_test, boston_y_test, color='black')
ax[1].plot(boston_X_test, regr.predict(boston_X_test), color='blue', linewidth=3)
ax[1].set_title('Test')
ax[1].set_xlabel('RM (our feature)')
ax[1].set_ylabel('MEDV (our target response)')
plt.show()
# !!! The results are greatly improved !!!
# Nevertheless we will continue with cross-validation and see what more we can get out of this technique.
# !!! A note on shuffling, found on the scikit-learn page:
# If the data ordering is not arbitrary (e.g. samples with the same label are contiguous),
# shuffling it first may be essential to get a meaningful cross-validation result.
# However, the opposite may be true if the samples are not independently and identically distributed.
# For example, if samples correspond to news articles and are ordered by their time of publication,
# then shuffling the data will likely lead to a model that is overfit and an inflated validation score:
# it will be tested on samples that are artificially similar (close in time) to training samples.
# (A small shuffled-vs-unshuffled comparison on our data follows in the next cell.)
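In [ ]:
# A small sketch of the shuffling note above, using the same data and the (old) KFold API:
# compare per-fold R^2 for an unshuffled KFold (contiguous blocks of rows) versus a shuffled one.
# Since the row ordering of this dataset is not arbitrary, the unshuffled folds tend to give
# much more erratic (sometimes negative) scores.
from sklearn.cross_validation import KFold
X_all = boston_df["RM"].values[:, np.newaxis]
y_all = boston_df["MEDV"]
for do_shuffle in (False, True):
    kf_demo = KFold(len(y_all), n_folds=5, shuffle=do_shuffle,
                    random_state=0 if do_shuffle else None)
    fold_r2 = []
    for tr, te in kf_demo:
        fold_model = linear_model.LinearRegression().fit(X_all[tr], y_all.iloc[tr])
        fold_r2.append(fold_model.score(X_all[te], y_all.iloc[te]))
    print('shuffle=%s, per-fold R^2: %s' % (do_shuffle, np.round(fold_r2, 2)))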
In [ ]:
# THEORY (from wikipedia)
# Cross-validation is a model validation technique for assessing how the results of a statistical analysis
# will generalize to an independent data set.
# It is mainly used in settings where the goal is prediction and we want to estimate how accurately
# a predictive model will perform in practice.
# How it works:
# Cross-validation involves many rounds.
# One round of cross-validation partitions a sample of data into complementary subsets,
# performs the analysis on one subset (training) and validates the analysis on the other subset (testing).
# To reduce variability, multiple rounds of cross-validation are performed using different partitions,
# and the validation results are averaged over the rounds.
# With a conventional single split, the RMS error on the test dataset often does not properly represent
# model performance; this can happen because:
# - not enough data is available
# - the distribution and spread of the data are not good enough to partition it into separate training and test sets
# So a fair way to properly estimate model prediction performance is to use cross-validation.
# In summary, cross-validation combines (averages) measures of fit to correct for the training error
# and derive a more accurate estimate of model prediction performance.
# (A bare-bones manual version of this procedure is sketched in the next cell;
# the cells after it use scikit-learn's cross_val_score.)
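In [ ]:
# A bare-bones sketch of the procedure described above (manual_cv_r2 is a hypothetical helper,
# not a scikit-learn function): shuffle the indices once, cut them into k complementary folds,
# train on k-1 folds, validate on the held-out fold, and average the k validation scores.
def manual_cv_r2(X, y, k=5, seed=0):
    rng = np.random.RandomState(seed)
    idx = rng.permutation(len(y))          # shuffle once, then partition
    folds = np.array_split(idx, k)         # k complementary subsets
    fold_scores = []
    for i in range(k):
        test_idx = folds[i]
        train_idx = np.hstack([folds[j] for j in range(k) if j != i])
        model = linear_model.LinearRegression().fit(X[train_idx], y.iloc[train_idx])
        fold_scores.append(model.score(X[test_idx], y.iloc[test_idx]))  # validate on the held-out fold
    return np.mean(fold_scores), fold_scores

mean_r2, per_fold = manual_cv_r2(boston_df["RM"].values[:, np.newaxis], boston_df["MEDV"])
print('R^2 averaged over the rounds: %.2f' % mean_r2)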
In [163]:
boston_X = boston_df["RM"].values[:, np.newaxis]
boston_y = boston_df["MEDV"]
nfolds = 10
estimator = linear_model.LinearRegression()
features = boston_X
target = boston_y
scores = cross_validation.cross_val_score(estimator, features, target, cv=nfolds, scoring='r2')
print(scores)
print(scores.mean())
print(np.median(scores))
# The mean score and an approximate 95% confidence interval of the score estimate are given by:
print("R^2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
# Generate cross-validated estimates for each input data point.
# cross_val_predict returns an array of the same size as `y`, where each entry
# is a prediction made by a model that did not see that point during training:
predicted = cross_validation.cross_val_predict(estimator, features, target, cv=nfolds)
fig,ax = plt.subplots()
ax.scatter(target, predicted)
#ax.plot([target.min(), target.max()], [target.min(), target.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()
# pearson correlation between the measured and the cross-validated predicted values:
#print('pearson (measured vs predicted):', pearsonr(target, predicted))
# Plot outputs
# note: `predicted` comes from 10 different per-fold models, so the red curve is not a single straight line
fig, ax = plt.subplots(1, 1)
ax.scatter(features, target, color='black')
ax.plot(features, predicted, color='red', linewidth=1)
ax.set_xlabel('RM (our feature)')
ax.set_ylabel('MEDV (our target response)')
plt.show()
In [161]:
from sklearn.cross_validation import KFold
In [179]:
kf = KFold(len(features), n_folds=10)
In [184]:
xtr = []
xte = []
# collect the train/test index arrays produced by each of the 10 folds
for train, test in kf:
    xtr.append(train)
    xte.append(test)
# manually reproduce the first fold: train on its training indices, score on its test indices
Xtrain = features[xtr[0]]
ytrain = target[xtr[0]]
Xtest = features[xte[0]]
ytest = target[xte[0]]
regr = linear_model.LinearRegression()
# Train the model using the training sets
regr.fit(Xtrain, ytrain)
# Explained variance score: 1 is perfect prediction
# Returns the coefficient of determination R^2 of the prediction.
print('Training Variance score:', regr.score(Xtrain, ytrain))
print('Test Variance score:', regr.score(Xtest, ytest))
In [183]:
# And indeed, this test variance score is the first entry of the scores array returned by cross_val_score above
# (the next cell checks all ten folds).
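In [ ]:
# A quick check of the note above (reusing kf, features, target and scores from the cells above):
# fit and score a fresh model per fold; this should reproduce, fold by fold, the values returned
# by cross_val_score, which here defaults to the same unshuffled 10-fold split.
manual_scores = []
for train, test in kf:
    fold_model = linear_model.LinearRegression().fit(features[train], target.iloc[train])
    manual_scores.append(fold_model.score(features[test], target.iloc[test]))
print(np.round(manual_scores, 4))
print(np.round(scores, 4))   # should match the manual per-fold scores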
In [ ]: