In [1]:
import numpy as np, pandas as pd
from matplotlib import pyplot as plt
import seaborn
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# plotting options
seaborn.set()
plt.rcParams['figure.figsize'] = (15, 8)
%matplotlib inline

Import training data


In [3]:
datadir = '/home/kristy/SDCard/ExtraDocuments/CountyExercisePredictions/'

data_train = pd.read_csv(datadir+'data_nonan_train.csv', index_col=0)
labels_train = pd.read_csv(datadir+'labels_train.csv', index_col=0, header=None)

# Normalize data
xScaler = StandardScaler()
yScaler = StandardScaler()
X_train = xScaler.fit_transform(data_train)
y_train = yScaler.fit_transform(labels_train)

print(X_train.shape)
print(y_train.shape)


(2552, 32)
(2552, 1)

Reduce dimensionality


In [4]:
# Show all scaled values
plt.figure(figsize=[30,10])
plt.boxplot(X_train);



In [5]:
# Initialize PCA model that doesn't reduce dimensions at all in order to figure out how many components
# to put in final PCA model.
pca_full = PCA()

In [6]:
X_reduced = pca_full.fit_transform(X_train)

In [7]:
total_variance_explained = np.cumsum(np.sort(pca_full.explained_variance_ratio_)[::-1])
plt.plot(total_variance_explained, 'o-');
plt.ylim([0.15,1.05]);
plt.xlim([-1, len(total_variance_explained)+1]);



In [8]:
# Find the number of dimensions needed to explain 90% of variance
sum(total_variance_explained <= .9)


Out[8]:
20

In [9]:
# Create a new PCA model that actually reduces the dimensions.
pca = PCA(n_components=21)
X_reduced = pca.fit_transform(X_train)
print sum(pca.explained_variance_ratio_)


0.906739508169

Set up k-fold cross-validation


In [63]:
from sklearn.cross_validation import KFold, cross_val_score
k_fold = KFold(n=len(y_train), n_folds=5)

Linear regression

Other possible models:

  • ridge regression
  • SGD regression
  • Elastic Net
  • SVR
  • Ensemble regressors
  • CART
  • regression decision trees
  • neural network
  • LASSO
  • ridge regression
  • random forest?
  • regression with gradient boosting

In [64]:
# As a first attempt, try linear regression because it's easy.
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()

In [65]:
# Do k-fold cross-validation to see CV scores.
print("Linear regression (CV): {}".format(cross_val_score(lin_reg, X_reduced, y_train, cv=k_fold)))

# Use all training data for final model
lin_reg.fit(X_reduced, y_train)

# Check performance on training set.
print("Linear regression (training): {:0.4f}".format(lin_reg.score(X_reduced, y_train)))


Linear regression (CV): [ 0.74069762  0.73762674  0.71854045  0.73175824  0.72352609]
Linear regression (training): 0.7356

In [66]:
yhat_train = lin_reg.predict(X_reduced)

In [67]:
plt.scatter(y_train, yhat_train);



In [68]:
residuals = yhat_train - y_train
plt.scatter(y_train, residuals)


Out[68]:
<matplotlib.collections.PathCollection at 0x7f2a89a4ebd0>

In [69]:
from sklearn import metrics

In [70]:
print(metrics.r2_score(y_train, yhat_train))
print(metrics.explained_variance_score(y_train, yhat_train))
print(metrics.mean_squared_error(y_train, yhat_train))


0.735600059581
0.735600059581
0.264399940419

Look at individual covariates to see if any transformations jump out


In [71]:
X_reduced.shape


Out[71]:
(2552, 21)

In [82]:
# PCA components
plt.figure(figsize=[30,20]);
for i in range(21):
    plt.subplot(7,3,i+1);
    plt.scatter(X_reduced[:,i], y_train);



In [56]:
X_train.shape


Out[56]:
(2552, 32)

In [83]:
# Original covariates
plt.figure(figsize=[30,30]);
for i in range(32):
    plt.subplot(8,4,i+1);
    plt.scatter(X_train[:,i], y_train);


None of these look particularly nonlinear, so we won't worry about transforming the data here.

Do linear regression on original covariates for comparison


In [72]:
full_lin_reg = LinearRegression()
# Do k-fold cross-validation to see CV scores.
print cross_val_score(full_lin_reg, X_reduced, y_train, cv=k_fold)

# Use all training data for final model
full_lin_reg.fit(X_train, y_train)

# Check performance on training set.
print(full_lin_reg.score(X_train, y_train))


[ 0.74069762  0.73762674  0.71854045  0.73175824  0.72352609]
0.748469483139

This isn't a whole lot better than the initial model, so we'll stick with the reduced dataset

Ridge regression


In [73]:
from sklearn.linear_model import Ridge

In [75]:
rdg_reg = Ridge()
# Do k-fold cross-validation to see CV scores.
print("Ridge regression (CV): {}".format(cross_val_score(rdg_reg, X_reduced, y_train, cv=k_fold)))

# Use all training data for final model
rdg_reg.fit(X_reduced, y_train)
print("Ridge regression (training): {:0.4f}".format(rdg_reg.score(X_reduced, y_train)))


Ridge regression (CV): [ 0.7407035   0.73763654  0.71855399  0.73175393  0.72352627]
Ridge regression (training): 0.7356

It turns out ridge regression isn't an improvement.

Ridge regression with CV to find regularization param


In [76]:
from sklearn.linear_model import RidgeCV

In [77]:
rcv_reg = RidgeCV(alphas=[1, 10])
# Do k-fold cross-validation to see CV scores (not clear whether the k-fold CV is still needed here).
print("Ridge regression with CV (CV): {}".format(cross_val_score(rcv_reg, X_reduced, y_train, cv=k_fold)))

# Use all training data for final model
rcv_reg.fit(X_reduced, y_train)
print("Ridge regression with CV (training): {:0.4f}".format(rcv_reg.score(X_reduced, y_train)))


Ridge regression with CV (CV): [ 0.74075407  0.73772108  0.71867303  0.73171337  0.72352615]
Ridge regression with CV (training): 0.7356

Lasso


In [78]:
from sklearn.linear_model import Lasso, LassoCV

In [79]:
las_reg = Lasso()
# Do k-fold cross-validation to see CV scores.
print("Lass (CV): {}".format(cross_val_score(las_reg, X_reduced, y_train, cv=k_fold)))

# Use all training data for final model
las_reg.fit(X_reduced, y_train)
print("Lasso (training): {:0.4f}".format(las_reg.score(X_reduced, y_train)))


Lass (CV): [ 0.36509358  0.37894036  0.37076763  0.35965758  0.36321629]
Lasso (training): 0.3679

This one is really bad with the default alpha value. Let's do it again with a better value.


In [80]:
lcv_reg = LassoCV()
# Do k-fold cross-validation to see CV scores (again, not clear whether this is needed for CV version).
# print("Lasso CV (CV): {}".format(cross_val_score(lcv_reg, X_reduced, y_train, cv=k_fold)))
# k-fold CV crashes. Check this later.

# Use all training data for final model
lcv_reg.fit(X_reduced, np.ravel(y_train))
print("Lasso CV (training): {:0.4f}".format(lcv_reg.score(X_reduced, y_train)))


Lasso CV (training): 0.7355

In [81]:
lcv_reg.alpha_


Out[81]:
0.0023502615954269709

Run models on test data


In [82]:
# import data
data_test = pd.read_csv(datadir+'data_nonan_test.csv', index_col=0)
labels_test = pd.read_csv(datadir+'labels_test.csv', index_col=0, header=None)

# Normalize data
X_test = xScaler.fit_transform(data_test)
y_test = yScaler.fit_transform(labels_test)

# Reduce dimensions
X_test_reduced = pca.transform(X_test)

# Run tests
print("Linear regression (testing): {:0.4f}".format(lin_reg.score(X_test_reduced, y_test)))
print("Ridge regression (testing): {:0.4f}".format(rdg_reg.score(X_test_reduced, y_test)))
print("Ridge regression with CV (testing): {:0.4f}".format(rcv_reg.score(X_test_reduced, y_test)))
print("Lasso (testing): {:0.4f}".format(las_reg.score(X_test_reduced, y_test)))
print("Lasso with CV (testing): {:0.4f}".format(lcv_reg.score(X_test_reduced, y_test)))


Linear regression (testing): 0.7373
Ridge regression (testing): 0.7373
Ridge regression with CV (testing): 0.7373
Lasso (testing): 0.3745
Lasso with CV (testing): 0.7373

Conclusions

It doesn't really seem like the different regression methods perform differently on this dataset. They have decent performance that's slightly better on the test set than on the training set. That seems odd. It may indicate an overall problem with the model, but I'm not sure what.