In [87]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import datasets
import matplotlib.pyplot as plt
%matplotlib inline
boston = datasets.load_boston()
In [55]:
# Let's explore the Boston data a bit:
print("boston keys: ", boston.keys())
print("boston feature names: ", boston.feature_names)
# DESCR says "Median Value (attribute 14) is usually the target"
# The MEDV column is exposed separately as boston.target
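A quick sanity check on the raw arrays (a small sketch; `DESCR` is the plain-text description shipped with the dataset bunch):
In [ ]:
# Print the start of the dataset description and the raw shapes,
# to confirm 506 rows, 13 features, and a separate target vector
print(boston.DESCR[:800])
print("data shape: ", boston.data.shape)      # expected (506, 13)
print("target shape: ", boston.target.shape)  # expected (506,)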
In [57]:
# build a DataFrame from the Boston data; note that column_stack appends the
# target as a 14th column, so the column names must be extended with "MEDV"
boston_df = pd.DataFrame(np.column_stack([boston.data, boston.target]), columns=list(boston.feature_names) + ["MEDV"])
boston_df.head()
Out[57]:
In [61]:
# Let's keep it simple.
# We select the target:
# - MEDV: Median value of owner-occupied homes in $1000's
# and a couple of categorical independent variables:
# - CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
# - RAD: index of accessibility to radial highways
boston_df = boston_df[["MEDV", "CHAS", "RAD"]]
In [63]:
boston_df.head()
Out[63]:
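Since CHAS and RAD are treated as categorical here, it is worth checking their distinct levels first (a small sketch; `value_counts` is standard pandas):
In [ ]:
# CHAS should have only the levels 0/1; RAD takes a handful of index values
print(boston_df["CHAS"].value_counts())
print(boston_df["RAD"].value_counts())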
In [ ]:
# Perform the regression
from sklearn import datasets, linear_model
# create the model
regr = linear_model.LinearRegression()
In [89]:
from sklearn import datasets, linear_model
# First attempt: a single split into training and testing sets (here roughly 75/25),
# using only one feature.
# Each newaxis object in the selection tuple serves to expand the dimensions of the
# resulting selection by one unit-length dimension:
# boston_df["RAD"].values.shape is (506,)
# boston_df["RAD"].values[:, np.newaxis].shape is (506, 1) <= the desired shape for the independent variables
# boston_df["MEDV"].shape is (506,) <= the desired shape for the target variable
boston_X = boston_df["RAD"].values[:, np.newaxis]
boston_y = boston_df["MEDV"]
# Split the data into training/testing sets
boston_X_train = boston_X[:-120]
boston_X_test = boston_X[-120:]
# Split the targets into training/testing sets
boston_y_train = boston_y[:-120]
boston_y_test = boston_y[-120:]
# Create linear regression object
regr = linear_model.LinearRegression()
# Train the model using the training sets
regr.fit(boston_X_train, boston_y_train)
# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error on the test set
print("Mean squared error: %.2f" % np.mean((regr.predict(boston_X_test) - boston_y_test) ** 2))
# R^2 score (coefficient of determination): 1 is perfect prediction
print('R^2 score: %.2f' % regr.score(boston_X_test, boston_y_test))
# Plot outputs
plt.scatter(boston_X_test, boston_y_test, color='black')
plt.plot(boston_X_test, regr.predict(boston_X_test), color='blue', linewidth=3)
plt.show()
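For completeness, the fitted line can be read off from the coefficient and intercept (a sketch; `intercept_` and `coef_` are set by `fit`, and RAD = 5 is just an arbitrary example value):
In [ ]:
# The fitted line is MEDV ~ intercept_ + coef_[0] * RAD
print("Intercept: %.2f" % regr.intercept_)
print("Prediction for RAD = 5: %.2f" % regr.predict(np.array([[5]]))[0])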
In [94]:
print(boston_df["MEDV"].shape)
print(boston_df["RAD"].values[:, np.newaxis].shape)
In [ ]:
# Prepare the data: training and test parts.
# !!! Attention !!!
# Split-sample validation without resampling (cross-validation, or better: bootstrapping)
# is unreliable unless you have a very big sample (e.g., N > 20000).
# If the sample is big, a simple 75/25 split would probably work:
# run it twice (2 different splits) and see how much the results vary.
# They would probably vary so little that a single split is enough;
# think of the width of a confidence interval for a proportion at such a sample size.
# In our case the sample size is 506 - not so big.
# We had better do cross-validation (and later bootstrapping).
from sklearn.model_selection import train_test_split  # sklearn.cross_validation in old sklearn versions
X_train, X_test, y_train, y_test = train_test_split(
    boston_X, boston_y, test_size=0.4, random_state=0)
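Cross-validation itself can then be done directly with `cross_val_score` (a minimal sketch, assuming the `boston_X`/`boston_y` arrays from above; the choice of 5 folds is arbitrary):
In [ ]:
from sklearn.model_selection import cross_val_score
# Score the one-feature linear model on 5 folds; each fold's score is an R^2
scores = cross_val_score(linear_model.LinearRegression(), boston_X, boston_y, cv=5)
print("R^2 per fold: ", scores)
print("mean R^2: %.2f (+/- %.2f)" % (scores.mean(), scores.std()))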
In [64]:
len(boston_df)
Out[64]:
506
In [66]:
# For comparison: the diabetes dataset is even smaller
diabetes = datasets.load_diabetes()
len(diabetes.data)
Out[66]:
442
In [ ]:
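As a preview of the bootstrapping mentioned above, resampling the rows with replacement gives a rough spread for the slope (a sketch using `sklearn.utils.resample`; the 200 replicates are an arbitrary choice):
In [ ]:
from sklearn.utils import resample
# Refit the one-feature model on bootstrap resamples of the 506 rows
# and collect the slope estimates to gauge their variability
coefs = []
for i in range(200):
    Xb, yb = resample(boston_X, boston_y, random_state=i)
    coefs.append(linear_model.LinearRegression().fit(Xb, yb).coef_[0])
print("bootstrap mean slope: %.3f, std: %.3f" % (np.mean(coefs), np.std(coefs)))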