In [83]:
import numpy as np, pandas as pd
from matplotlib import pyplot as plt
import seaborn
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# plotting options
seaborn.set()
plt.rcParams['figure.figsize'] = (15, 8)
%matplotlib inline

Import training data


In [84]:
datadir = '/home/kristy/SDCard/ExtraDocuments/CountyExercisePredictions/'

data_train = pd.read_csv(datadir+'data_nonan_train.csv', index_col=0)
labels_train = pd.read_csv(datadir+'labels_train.csv', index_col=0, header=None)

# Normalize data
xScaler = StandardScaler()
yScaler = StandardScaler()
X_train = xScaler.fit_transform(data_train)
y_train = yScaler.fit_transform(labels_train)

print(X_train.shape)
print(y_train.shape)


(1914, 32)
(1914, 1)

Reduce dimensionality


In [85]:
# Show all scaled values
plt.figure(figsize=[30,10])
plt.boxplot(X_train);



In [86]:
# Initialize PCA model that doesn't reduce dimensions at all in order to figure out how many components
# to put in final PCA model.
pca_full = PCA()

In [87]:
X_reduced = pca_full.fit_transform(X_train)

In [88]:
total_variance_explained = np.cumsum(np.sort(pca_full.explained_variance_ratio_)[::-1])
plt.plot(total_variance_explained, 'o-');
plt.ylim([0.15,1.05]);
plt.xlim([-1, len(total_variance_explained)+1]);



In [89]:
# Find the number of dimensions needed to explain 90% of variance
sum(total_variance_explained <= .9)


Out[89]:
20

In [90]:
# Create a new PCA model that actually reduces the dimensions.
pca = PCA(n_components=21)
X_reduced = pca.fit_transform(X_train)
print sum(pca.explained_variance_ratio_)


0.908845895055

Prepare cross-validation data


In [101]:
# import data
data_valid = pd.read_csv(datadir+'data_nonan_valid.csv', index_col=0)
labels_valid = pd.read_csv(datadir+'labels_valid.csv', index_col=0, header=None)

# Normalize data
X_valid = xScaler.fit_transform(data_valid)
y_valid = yScaler.fit_transform(labels_valid)

# Reduce dimensions
X_valid_reduced = pca.transform(X_valid)

Linear regression

Other possible models:

  • ridge regression
  • SGD regression
  • Elastic Net
  • SVR
  • Ensemble regressors
  • CART
  • regression decision trees
  • neural network
  • LASSO
  • ridge regression
  • random forest?
  • regression with gradient boosting

In [102]:
# As a first attempt, try linear regression because it's easy.
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()

In [103]:
lin_reg.fit(X_reduced, y_train)

# Check performance on training and validation sets.
print("Linear regression (training): {:0.4f}".format(lin_reg.score(X_reduced, y_train)))
print("Linear regression (validation): {:0.4f}".format(lin_reg.score(X_valid_reduced, y_valid)))


Linear regression (training): 0.7372
Linear regression (validation): 0.7604

In [93]:
yhat_train = lin_reg.predict(X_reduced)

In [94]:
plt.scatter(y_train, yhat_train);



In [95]:
residuals = yhat_train - y_train
plt.scatter(y_train, residuals)


Out[95]:
<matplotlib.collections.PathCollection at 0x7f2a8a0f1e90>

In [96]:
from sklearn import metrics

In [97]:
print(metrics.r2_score(y_train, yhat_train))
print(metrics.explained_variance_score(y_train, yhat_train))
print(metrics.mean_squared_error(y_train, yhat_train))


0.737249826669
0.737249826669
0.262750173331

Look at individual covariates to see if any transformations jump out


In [98]:
X_reduced.shape


Out[98]:
(1914, 21)

In [82]:
# PCA components
plt.figure(figsize=[30,20]);
for i in range(21):
    plt.subplot(7,3,i+1);
    plt.scatter(X_reduced[:,i], y_train);



In [99]:
X_train.shape


Out[99]:
(1914, 32)

In [83]:
# Original covariates
plt.figure(figsize=[30,30]);
for i in range(32):
    plt.subplot(8,4,i+1);
    plt.scatter(X_train[:,i], y_train);


None of these look particularly nonlinear, so we won't worry about transforming the data here.

Do linear regression on original covariates for comparison


In [104]:
full_lin_reg = LinearRegression()
full_lin_reg.fit(X_train, y_train)

# Check performance on training set.
print("Linear regression with all covariates (training): {:0.4f}".format(full_lin_reg.score(X_train, y_train)))
print("Linear regression with all covariates (validation): {:0.4f}".format(full_lin_reg.score(X_valid, y_valid)))


Linear regression with all covariates (training): 0.7512
Linear regression with all covariates (validation): 0.7635

This isn't a whole lot better than the initial model (especially for the validation data), so we'll stick with the reduced dataset for now.

Ridge regression


In [73]:
from sklearn.linear_model import Ridge

In [105]:
rdg_reg = Ridge()
rdg_reg.fit(X_reduced, y_train)
print("Ridge regression (training): {:0.4f}".format(rdg_reg.score(X_reduced, y_train)))


Ridge regression (training): 0.7372

It turns out ridge regression isn't an improvement.

Ridge regression with CV to find regularization param


In [76]:
from sklearn.linear_model import RidgeCV

In [106]:
rcv_reg = RidgeCV(alphas=[1, 10])
rcv_reg.fit(X_reduced, y_train)
print("Ridge regression with CV (training): {:0.4f}".format(rcv_reg.score(X_reduced, y_train)))


Ridge regression with CV (training): 0.7372

Lasso


In [78]:
from sklearn.linear_model import Lasso, LassoCV

In [107]:
las_reg = Lasso()
las_reg.fit(X_reduced, y_train)
print("Lasso (training): {:0.4f}".format(las_reg.score(X_reduced, y_train)))


Lasso (training): 0.3939

This one is really bad with the default alpha value. Let's do it again with a better value.


In [108]:
lcv_reg = LassoCV()
lcv_reg.fit(X_reduced, np.ravel(y_train))
print("Lasso CV (training): {:0.4f}".format(lcv_reg.score(X_reduced, y_train)))


Lasso CV (training): 0.7368

In [109]:
lcv_reg.alpha_


Out[109]:
0.0042311585359146638

Run all models on validation data


In [110]:
# Run tests
print("Linear regression (validation): {:0.4f}".format(lin_reg.score(X_valid_reduced, y_valid)))
print("Ridge regression (validation): {:0.4f}".format(rdg_reg.score(X_valid_reduced, y_valid)))
print("Ridge regression with CV (validation): {:0.4f}".format(rcv_reg.score(X_valid_reduced, y_valid)))
print("Lasso (validation): {:0.4f}".format(las_reg.score(X_valid_reduced, y_valid)))
print("Lasso with CV (validation): {:0.4f}".format(lcv_reg.score(X_valid_reduced, y_valid)))


Linear regression (validation): 0.7604
Ridge regression (validation): 0.7604
Ridge regression with CV (validation): 0.7605
Lasso (validation): 0.3880
Lasso with CV (validation): 0.7599

Conclusions

It doesn't really seem like the different regression methods perform differently on this dataset. They have decent performance that's slightly better on the cross-validation set than on the training set. That seems odd. It may indicate an overall problem with the model, but I'm not sure what.

Test final model on test data not previously used.


In [ ]:
## import data
data_test = pd.read_csv(datadir+'data_nonan_test.csv', index_col=0)
labels_test = pd.read_csv(datadir+'labels_test.csv', index_col=0, header=None)

# Normalize data
X_test = xScaler.fit_transform(data_test)
y_test = yScaler.fit_transform(labels_test)

# Reduce dimensions
X_test_reduced = pca.transform(X_test)

# Run tests
print("Final model (testing): {:0.4f}".format(the_model.score(X_test_reduced, y_test)))