In [83]:
import numpy as np, pandas as pd
from matplotlib import pyplot as plt
import seaborn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# plotting options
seaborn.set()
plt.rcParams['figure.figsize'] = (15, 8)
%matplotlib inline
In [84]:
datadir = '/home/kristy/SDCard/ExtraDocuments/CountyExercisePredictions/'
data_train = pd.read_csv(datadir+'data_nonan_train.csv', index_col=0)
labels_train = pd.read_csv(datadir+'labels_train.csv', index_col=0, header=None)
# Normalize data
xScaler = StandardScaler()
yScaler = StandardScaler()
X_train = xScaler.fit_transform(data_train)
y_train = yScaler.fit_transform(labels_train)
print(X_train.shape)
print(y_train.shape)
In [85]:
# Boxplot of every scaled feature to sanity-check the standardization
plt.figure(figsize=[30,10])
plt.boxplot(X_train);
In [86]:
# Fit a PCA model that keeps every component, in order to decide how many
# components to keep in the final, reduced PCA model.
pca_full = PCA()
In [87]:
X_reduced = pca_full.fit_transform(X_train)
In [88]:
# explained_variance_ratio_ is already sorted in decreasing order
total_variance_explained = np.cumsum(pca_full.explained_variance_ratio_)
plt.plot(total_variance_explained, 'o-');
plt.ylim([0.15,1.05]);
plt.xlim([-1, len(total_variance_explained)+1]);
In [89]:
# Count the components whose cumulative explained variance is at or below 90%
# (one more than this count is needed to pass the 90% mark)
sum(total_variance_explained <= .9)
Out[89]:
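As a cross-check, scikit-learn can pick the component count for a variance target directly: passing a float between 0 and 1 as n_components keeps the smallest number of components reaching that fraction of explained variance. A minimal sketch (pca_auto is a hypothetical name; assumes scikit-learn >= 0.18):
In [ ]:
# Sketch: let PCA choose the component count for a 90% variance target.
pca_auto = PCA(n_components=0.9)
pca_auto.fit(X_train)
print(pca_auto.n_components_)  # should roughly agree with the count above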
In [90]:
# Create a new PCA model that actually reduces the dimensions.
pca = PCA(n_components=21)
X_reduced = pca.fit_transform(X_train)
print(sum(pca.explained_variance_ratio_))
In [101]:
# import data
data_valid = pd.read_csv(datadir+'data_nonan_valid.csv', index_col=0)
labels_valid = pd.read_csv(datadir+'labels_valid.csv', index_col=0, header=None)
# Normalize with the scalers already fit on the training set (transform only;
# refitting on held-out data would leak validation statistics into the model)
X_valid = xScaler.transform(data_valid)
y_valid = yScaler.transform(labels_valid)
# Reduce dimensions
X_valid_reduced = pca.transform(X_valid)
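An alternative that makes this scale-then-project sequence harder to get wrong is a Pipeline, which fits its steps on the training data and only ever applies transform to held-out data. A minimal sketch, assuming the same 21-component PCA (pipe is a hypothetical name; labels are left in original units, which doesn't change R^2 for a linear model with an intercept):
In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

# Sketch: scaling and PCA are fit on the training data only;
# score() just transforms the validation data through the fitted steps.
pipe = make_pipeline(StandardScaler(), PCA(n_components=21), LinearRegression())
pipe.fit(data_train, labels_train)
print(pipe.score(data_valid, labels_valid))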
In [102]:
# As a first attempt, try linear regression because it's easy.
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
In [103]:
lin_reg.fit(X_reduced, y_train)
# Check performance on training and validation sets.
print("Linear regression (training): {:0.4f}".format(lin_reg.score(X_reduced, y_train)))
print("Linear regression (validation): {:0.4f}".format(lin_reg.score(X_valid_reduced, y_valid)))
In [93]:
yhat_train = lin_reg.predict(X_reduced)
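Because the labels were standardized, these predictions are in z-score units; yScaler.inverse_transform maps them back to the original scale. A minimal sketch (yhat_train_orig is a hypothetical name):
In [ ]:
# Sketch: undo the label standardization so predictions are in the
# original units of the target; yhat_train has shape (n, 1) here.
yhat_train_orig = yScaler.inverse_transform(yhat_train)
print(yhat_train_orig[:5])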
In [94]:
plt.scatter(y_train, yhat_train);
In [95]:
# Plot residuals (here predicted minus actual) against the true values;
# an adequate fit should scatter evenly around zero.
residuals = yhat_train - y_train
plt.scatter(y_train, residuals)
Out[95]:
In [96]:
from sklearn import metrics
In [97]:
print("R^2: {:0.4f}".format(metrics.r2_score(y_train, yhat_train)))
print("Explained variance: {:0.4f}".format(metrics.explained_variance_score(y_train, yhat_train)))
print("MSE: {:0.4f}".format(metrics.mean_squared_error(y_train, yhat_train)))
In [98]:
X_reduced.shape
Out[98]:
In [82]:
# PCA components
plt.figure(figsize=[30,20]);
for i in range(21):
plt.subplot(7,3,i+1);
plt.scatter(X_reduced[:,i], y_train);
In [99]:
X_train.shape
Out[99]:
In [83]:
# Original covariates
plt.figure(figsize=[30,30]);
for i in range(32):
plt.subplot(8,4,i+1);
plt.scatter(X_train[:,i], y_train);
In [104]:
full_lin_reg = LinearRegression()
full_lin_reg.fit(X_train, y_train)
# Check performance on training set.
print("Linear regression with all covariates (training): {:0.4f}".format(full_lin_reg.score(X_train, y_train)))
print("Linear regression with all covariates (validation): {:0.4f}".format(full_lin_reg.score(X_valid, y_valid)))
In [73]:
from sklearn.linear_model import Ridge
In [105]:
rdg_reg = Ridge()
rdg_reg.fit(X_reduced, y_train)
print("Ridge regression (training): {:0.4f}".format(rdg_reg.score(X_reduced, y_train)))
In [76]:
from sklearn.linear_model import RidgeCV
In [106]:
rcv_reg = RidgeCV(alphas=[1, 10])
rcv_reg.fit(X_reduced, y_train)
print("Ridge regression with CV (training): {:0.4f}".format(rcv_reg.score(X_reduced, y_train)))
In [78]:
from sklearn.linear_model import Lasso, LassoCV
In [107]:
las_reg = Lasso()
las_reg.fit(X_reduced, y_train)
print("Lasso (training): {:0.4f}".format(las_reg.score(X_reduced, y_train)))
In [108]:
lcv_reg = LassoCV()
lcv_reg.fit(X_reduced, np.ravel(y_train))
print("Lasso CV (training): {:0.4f}".format(lcv_reg.score(X_reduced, y_train)))
In [109]:
lcv_reg.alpha_
Out[109]:
In [110]:
# Compare all fitted models on the validation set
print("Linear regression (validation): {:0.4f}".format(lin_reg.score(X_valid_reduced, y_valid)))
print("Ridge regression (validation): {:0.4f}".format(rdg_reg.score(X_valid_reduced, y_valid)))
print("Ridge regression with CV (validation): {:0.4f}".format(rcv_reg.score(X_valid_reduced, y_valid)))
print("Lasso (validation): {:0.4f}".format(las_reg.score(X_valid_reduced, y_valid)))
print("Lasso with CV (validation): {:0.4f}".format(lcv_reg.score(X_valid_reduced, y_valid)))
The different regression methods perform nearly identically on this dataset. All have decent performance that is slightly better on the validation set than on the training set, which is odd: models usually score best on the data they were fit to. It may indicate a problem with the model, but it could also be sampling noise; one quick check is sketched below.
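A minimal sketch of that check, assuming 5-fold cross-validation on the training set is representative (cv_scores is a hypothetical name):
In [ ]:
from sklearn.model_selection import cross_val_score

# Sketch: if the fold-to-fold spread of R^2 on the training data is as
# large as the train/validation gap above, the gap is plausibly just
# sampling noise rather than a problem with the model.
cv_scores = cross_val_score(LinearRegression(), X_reduced, np.ravel(y_train), cv=5)
print(cv_scores.mean(), cv_scores.std())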
In [ ]:
# Import data
data_test = pd.read_csv(datadir+'data_nonan_test.csv', index_col=0)
labels_test = pd.read_csv(datadir+'labels_test.csv', index_col=0, header=None)
# Normalize with the scalers already fit on the training set (transform only)
X_test = xScaler.transform(data_test)
y_test = yScaler.transform(labels_test)
# Reduce dimensions
X_test_reduced = pca.transform(X_test)
# Evaluate the chosen final model (assigned to the_model) on the held-out test set
print("Final model (testing): {:0.4f}".format(the_model.score(X_test_reduced, y_test)))