In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [5]:
## numpy array
a = np.array([1, 4, 6])
print a.shape
print
print np.ones((3, 4))
print
print np.zeros((2, 5))
print
print np.arange(6).reshape(2, 3)
print
print a.T
print
print np.hstack([a, a])
print
print np.vstack([a, a])
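In [ ]:
# side note: basic indexing and slicing on the array `a` from above.
# a minimal sketch (not part of the original exercise):
print a[0]      # first element: 1
print a[1:]     # everything after the first element: [4 6]
print a[a > 2]  # boolean masking: [4 6]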
In [6]:
## element-wise vs. matrix multiplication
print np.dot(a, a) # or a.dot(a)
print
print a*a
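In [ ]:
# side note: for 2-d arrays the distinction matters more. np.dot is a true
# matrix product, while * stays element-wise. a minimal sketch:
m2 = np.array([[1, 2], [3, 4]])
print np.dot(m2, m2)  # matrix product: [[ 7 10], [15 22]]
print
print m2*m2           # element-wise: [[ 1  4], [ 9 16]]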
In [7]:
# you can convert a 1-d array to a 2-d array with np.newaxis
print 'a:'
print a
print 'a.shape:', a.shape
print
print 'a[np.newaxis] is a 2-d row vector:'
print a[np.newaxis]
print 'a[np.newaxis].shape:', a[np.newaxis].shape
print
print 'a[np.newaxis].T is a 2-d column vector:'
print a[np.newaxis].T
print 'a[np.newaxis].T.shape:', a[np.newaxis].T.shape
print
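In [ ]:
# side note: a[:, np.newaxis] and a.reshape(-1, 1) produce the same 2-d
# column vector as a[np.newaxis].T. a minimal sketch:
print a[:, np.newaxis].shape
print a.reshape(-1, 1).shape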
In [8]:
# numpy provides a ton of other functions for working with matrices
m = np.array([[1, 2],[3, 4]])
m_inverse = np.linalg.inv(m)
print 'inverse of [[1, 2],[3, 4]]:'
print m_inverse
print
print 'm.dot(m_inverse):'
print m.dot(m_inverse)
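In [ ]:
# side note: to solve a linear system m x = b, np.linalg.solve is usually
# preferred over forming the inverse explicitly (faster and more numerically
# stable). b_vec here is just a made-up example vector:
b_vec = np.array([5, 11])
print np.linalg.solve(m, b_vec)  # should match the result below
print m_inverse.dot(b_vec)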
In [9]:
# and for doing all kinds of scientific computing, like generating random numbers:
np.random.seed(5678)
n = np.random.randn(3, 4)
print 'a matrix with random entries drawn from a Normal(0, 1) distribution:'
print n
In [10]:
np.random.seed(3333)
n_data = 10 # number of data points. i.e. N
n_dim = 5 # number of dimensions of each datapoint. i.e. D
betas = np.random.randn(n_dim + 1)
X_no_constant = np.random.randn(n_data, n_dim)
print 'X_no_constant:'
print X_no_constant
print
# INSERT YOUR CODE HERE!
# solution: prepend a column of ones so the first beta acts as an intercept
X = np.hstack([np.ones(n_data)[np.newaxis].T, X_no_constant])
y = np.dot(X, betas)
# Tests:
y_expected = np.array([-0.41518357, -9.34696153, 5.08980544,
-0.26983873, -1.47667864, 1.96580794,
6.87009791, -2.07784135, -0.7726816,
-2.74954984])
np.testing.assert_allclose(y, y_expected)
print '****** Tests passed! ******'
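In [ ]:
# side note: an equivalent way to prepend the constant column is to build
# the ones as a 2-d array directly, skipping np.newaxis:
X_alt = np.hstack([np.ones((n_data, 1)), X_no_constant])
np.testing.assert_allclose(X, X_alt)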
In [11]:
b = np.array([[6, 7], [3, 1], [4, 0]])
df = pd.DataFrame(data=b, columns=['Weight', 'Height'])
print 'b:'
print b
print
print 'DataFrame version of b:'
print df
print
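In [ ]:
# side note: single columns come back as pandas Series, which support
# vectorized operations much like numpy arrays. a minimal sketch:
print df['Weight']
print
print df['Weight'].mean()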
In [12]:
baseball = pd.read_csv('data/baseball.dat.txt')
In [16]:
# baseball.head()
# baseball.describe()
# baseball.keys()
# baseball.info()
In [18]:
millionaire_indices = baseball['Salary'] > 1000  # salaries appear to be in thousands of dollars
# you can use this boolean mask to look at a subset of your original dataframe
print 'baseball.shape:', baseball.shape
print "baseball[millionaire_indices].shape:", baseball[millionaire_indices].shape
baseball[millionaire_indices][['Salary', 'AVG', 'Runs', 'Name']].head()
Out[18]:
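In [ ]:
# side note: boolean conditions can be combined with & (and) and | (or);
# each condition needs its own parentheses. a minimal sketch:
rich_hitters = (baseball['Salary'] > 1000) & (baseball['AVG'] > 0.300)
print baseball[rich_hitters][['Salary', 'AVG', 'Name']].head()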
In [19]:
shoe_size_df = pd.read_csv('data/baseball2.dat.txt')
shoe_size_df.shape
Out[19]:
In [20]:
merged = pd.merge(baseball, shoe_size_df, on=['Name'])
merged
Out[20]:
In [23]:
merged_outer = pd.merge(baseball, shoe_size_df, on=['Name'], how='outer')
merged_outer.head()
Out[23]:
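In [ ]:
# side note: other join types work the same way. how='left' keeps every row
# of the left frame (baseball) and fills missing shoe sizes with NaN:
merged_left = pd.merge(baseball, shoe_size_df, on=['Name'], how='left')
print merged_left.shape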
In [25]:
baseball = pd.read_csv('data/baseball.dat.txt')
In [26]:
f = plt.figure()
plt.hist(baseball['Hits'], bins=15) # see also plt.plot and plt.scatter for other plot types
plt.xlabel('Number of Hits')
plt.ylabel('Frequency')
plt.title('Histogram of Number of Hits')
f.set_size_inches(10, 5)
plt.show()
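In [ ]:
# side note: a scatter plot of two columns from the same dataframe, using
# the same matplotlib pattern as above:
f = plt.figure()
plt.scatter(baseball['AVG'], baseball['Salary'])
plt.xlabel('Batting Average')
plt.ylabel('Salary (thousands of dollars)')  # units inferred from the millionaire filter above
plt.title('Salary vs. Batting Average')
f.set_size_inches(10, 5)
plt.show()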
In [31]:
from sklearn import linear_model
In [32]:
## linear regression models
model_lr = linear_model.LinearRegression()
model_ridge = linear_model.Ridge(alpha=1)
model_lasso = linear_model.Lasso(alpha=1)
model_en = linear_model.ElasticNet(alpha=0.5, l1_ratio=0.1)
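# note: alpha scales the overall regularization strength. for ElasticNet,
# l1_ratio mixes the two penalties:
#     alpha * (l1_ratio * ||w||_1 + 0.5 * (1 - l1_ratio) * ||w||_2^2)
# so l1_ratio=1 recovers the Lasso and l1_ratio=0 behaves like Ridge.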
In [27]:
def mean_squared_error(y_true, y_pred):
"""
calculate the mean_squared_error given a vector of true ys and a vector of predicted ys
"""
diff = y_true - y_pred
return np.dot(diff, diff) / len(diff)
def predict_test_values(model, X_train, y_train, X_test):
model.fit(X_train, y_train)
return model.predict(X_test)
def calc_train_and_test_error(model, X_train, y_train, X_test, y_test):
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
return mean_squared_error(y_train, y_pred_train), mean_squared_error(y_test, y_pred_test)
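In [ ]:
# side note: a quick sanity check of the helper above. diff = [0, -1], so
# the MSE should be (0**2 + 1**2) / 2 = 0.5
print mean_squared_error(np.array([1.0, 2.0]), np.array([1.0, 3.0]))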
In [33]:
# load overfitting data
with np.load('data/overfitting_data.npz') as data:
x_train = data['x_train']
y_train = data['y_train']
x_test = data['x_test']
y_test = data['y_test']
In [34]:
## Model performance
print "Linear Regression Training and Test Errors:"
print calc_train_and_test_error(model_lr, x_train, y_train, x_test, y_test)
print
print "Ridge Regression Training and Test Errors:"
print calc_train_and_test_error(model_ridge, x_train, y_train, x_test, y_test)
print
print "Lasso Regression Training and Test Errors:"
print calc_train_and_test_error(model_lasso, x_train, y_train, x_test, y_test)
print
print 'ElasticNet Training and Test Errors:'
print calc_train_and_test_error(model_en, x_train, y_train, x_test, y_test)
print
In [35]:
n_disp_coefs = 10
print 'Linear Regression Coefficients:'
print model_lr.coef_[:n_disp_coefs]
print
print 'Ridge Regression Coefficients:'
print model_ridge.coef_[:n_disp_coefs]
print
print 'LASSO Coefficients:'
print model_lasso.coef_[:n_disp_coefs]
print
print 'ElasticNet Coefficients:'
print model_en.coef_[:n_disp_coefs]
print
In [36]:
print "Sum of Linear Regression Coefficients:"
print np.sum(np.abs(model_lr.coef_))
print
print "Sum of Ridge Regression Coefficients:"
print np.sum(np.abs(model_ridge.coef_))
print
print "Sum of Lasso Regression Coefficients:"
print np.sum(np.abs(model_lasso.coef_))
print
print 'Sum of ElasticNet Coefficients'
print np.sum(np.abs(model_en.coef_))
print
In [37]:
# a helper function for performing validation set cross validation
from sklearn.cross_validation import train_test_split
validation_portion = 0.1
seed = 1234
x_train_small, x_valid, y_train_small, y_valid = \
train_test_split(x_train, y_train, test_size=validation_portion, random_state=seed)
print 'Original Training Set Size:'
print x_train.shape, y_train.shape
print
print 'Reduced Training Set Size:'
print x_train_small.shape, y_train_small.shape
print
print 'Validation Set Size:'
print x_valid.shape, y_valid.shape
print
In [38]:
def validation_set_error(model, x_train, y_train, validation_portion=0.1, seed=1234):
# FILL IN YOUR CODE HERE
x_train_small, x_valid, y_train_small, y_valid = \
train_test_split(x_train, y_train, test_size=validation_portion, random_state=seed)
model.fit(x_train_small, y_train_small)
y_pred_valid = model.predict(x_valid)
return mean_squared_error(y_valid, y_pred_valid)
# set up models
model_lr_valid = linear_model.LinearRegression()
model_ridge_valid = linear_model.Ridge(alpha=10)
# calculate errors
valid_portion = .1
n_seeds = 5
print "Linear Regression Training and Test Errors:"
# FILL IN YOUR CODE HERE
print calc_train_and_test_error(model_lr_valid, x_train_small, y_train_small, x_test, y_test)
print
print "Linear Regression Validation Errors:"
# FILL IN YOUR CODE HERE
print validation_set_error(model_lr_valid, x_train, y_train, validation_portion=0.1, seed=1234)
print
for seed in range(n_seeds):
print validation_set_error(model_lr_valid, x_train, y_train, validation_portion=valid_portion, seed=seed)
print
print "Ridge Regression Training and Test Errors:"
# FILL IN YOUR CODE HERE
print calc_train_and_test_error(model_ridge_valid, x_train_small, y_train_small, x_test, y_test)
print
print "Ridge Regression Validation Errors:"
# FILL IN YOUR CODE HERE
print validation_set_error(model_ridge_valid, x_train, y_train, validation_portion=0.1, seed=1234)
print
for seed in range(n_seeds):
print validation_set_error(model_ridge_valid, x_train, y_train, validation_portion=valid_portion, seed=seed)
print
In [42]:
# scikit-learn provides a useful object to help you perform k-fold cross validation
from sklearn.cross_validation import KFold
n_data = len(y_train)
fold_count = 0
for train_reduced_row_ids, valid_row_ids in KFold(n_data, n_folds=4):
print
print
print "FOLD %d:" % fold_count
print "-------"
print("train_ids:\n%s\n\nvalid_ids\n%s" % (train_reduced_row_ids, valid_row_ids))
x_train_reduced = x_train[train_reduced_row_ids]
y_train_reduced = y_train[train_reduced_row_ids]
x_valid = x_train[valid_row_ids]
y_valid = y_train[valid_row_ids]
fold_count += 1
In [43]:
# NOTE: KFold isn't random at all; it splits the data in order, so it's important to shuffle your data first before using it.
from sklearn.utils import shuffle
x_train_shuffled, y_train_shuffled = shuffle(x_train, y_train)
In [44]:
def kfold_error(model, x_train, y_train, k=4, seed=1234):
# FILL IN YOUR CODE HERE
# shuffle training data
x_train_shuffled, y_train_shuffled = shuffle(x_train, y_train, random_state=seed)
n_data = len(y_train)
error_sum = 0
for train_reduced_row_ids, valid_row_ids in KFold(n_data, n_folds=k):
x_train_reduced = x_train_shuffled[train_reduced_row_ids]
y_train_reduced = y_train_shuffled[train_reduced_row_ids]
x_valid = x_train_shuffled[valid_row_ids]
y_valid = y_train_shuffled[valid_row_ids]
model.fit(x_train_reduced, y_train_reduced)
y_valid_pred = model.predict(x_valid)
error_sum += mean_squared_error(y_valid, y_valid_pred)
return error_sum*1.0 / k
# set up models
model_lr_valid = linear_model.LinearRegression()
model_ridge_valid = linear_model.Ridge(alpha=10)
# calculate errors
n_seeds = 3
k = 5
print "Linear Regression Training and Test Errors:"
# FILL IN YOUR CODE HERE
print calc_train_and_test_error(model_lr_valid, x_train, y_train, x_test, y_test)
print
print "Linear Regression K-Fold Errors:"
# FILL IN YOUR CODE HERE
print
for seed in range(n_seeds):
print kfold_error(model_lr_valid, x_train, y_train, k=k, seed=seed)
print
print
print "Ridge Regression Training and Test Errors:"
# FILL IN YOUR CODE HERE
print calc_train_and_test_error(model_ridge_valid, x_train, y_train, x_test, y_test)
print
print "Ridge Regression K-Fold Errors:"
# FILL IN YOUR CODE HERE
print
for seed in range(n_seeds):
print kfold_error(model_ridge_valid, x_train, y_train, k=k, seed=seed)
print
In [45]:
def model_name(model):
    s = str(model).lower()
if "linearregression" in s:
return 'LinearRegression'
elif "lasso" in s:
return 'Lasso(a=%g)' % model.alpha
elif "ridge" in s:
return 'Ridge(a=%g)' % model.alpha
elif "elastic" in s:
return 'ElasticNet(a=%g, r=%g)' % (model.alpha, model.l1_ratio)
else:
raise ValueError("Unknown Model Type")
def create_models(alphas=(.01, .03, .1, .3, 1, 3), l1_ratios=(.7, .5, .3)):
models = [linear_model.LinearRegression()]
models.extend([linear_model.Ridge(a) for a in alphas])
models.extend([linear_model.Lasso(a) for a in alphas])
models.extend([linear_model.ElasticNet(a, l1_ratio=l) for a in alphas for l in l1_ratios])
return models
def results_df(models, betas_true, x_train, y_train, x_test, y_test, k=4):
n_data, n_dim = x_train.shape
n_zeros = n_dim - len(betas_true)
betas_true = np.concatenate([betas_true, np.zeros(n_zeros)])
# fit models to training data
    for m in models:
        m.fit(x_train, y_train)
betas = np.vstack([betas_true] + [m.coef_ for m in models])
beta_names = ['Beta ' + str(i) for i in range(n_dim)]
# set up model names
model_names = ["True Coefs"] + [model_name(m) for m in models]
df = pd.DataFrame(data=betas, columns=beta_names, index=model_names)
# calculate training errors
y_preds = [m.predict(x_train) for m in models]
errors = [np.nan] + [mean_squared_error(y_train, y_pred) for y_pred in y_preds]
df['Train Error'] = errors
# calculate validation errors
errors = [np.nan] + [kfold_error(m, x_train, y_train, k=k) for m in models]
df['Cross Validation Error'] = errors
# calculate test errors
y_preds = [m.predict(x_test) for m in models]
errors = [np.nan] + [mean_squared_error(y_test, y_pred) for y_pred in y_preds]
df['Test Error'] = errors
return df
# these are some of the magic parameters that I used to actually
# generate the overfitting dataset
n_dim = 598
n_dim_meaningful = 3
n_dim_disp_extra = 2
# the actual betas used to generate the y values; the rest were 0
betas_true = np.arange(n_dim_meaningful) + 1
# create a whole bunch of untrained models
models = create_models(alphas=(.01, .03, .1, .3, 1), l1_ratios=(.9, .7, .5))
# fit all the models and tabulate their coefficients and errors
all_results = results_df(models, betas_true, x_train, y_train, x_test, y_test, k=4)
# decide which columns we want to display
disp_cols = ["Beta " + str(i) for i in range(n_dim_meaningful + n_dim_disp_extra)]
disp_cols += ['Train Error', 'Cross Validation Error', 'Test Error']
# display the results
all_results[disp_cols]
Out[45]:
In [46]:
# scikit-learn includes some functions for making cross validation easier
# and computationally faster for some models
from sklearn import linear_model
model_ridge_cv = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0])
model_lasso_cv = linear_model.LassoCV(alphas=[0.1, 1.0, 10.0])
model_en_cv = linear_model.ElasticNetCV(l1_ratio=[.9], n_alphas=100)
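In [ ]:
# side note: these CV estimators select the best hyperparameter during fit;
# the chosen value is exposed as alpha_ (and l1_ratio_ for ElasticNetCV).
# a minimal sketch on the training data:
model_ridge_cv.fit(x_train, y_train)
print 'RidgeCV chose alpha =', model_ridge_cv.alpha_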