In [1]:
    
import numpy as np, pandas as pd
from matplotlib import pyplot as plt
import seaborn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# plotting options
seaborn.set()
plt.rcParams['figure.figsize'] = (15, 8)
%matplotlib inline
    
In [3]:
    
datadir = '/home/kristy/SDCard/ExtraDocuments/CountyExercisePredictions/'
data_train = pd.read_csv(datadir+'data_nonan_train.csv', index_col=0)
labels_train = pd.read_csv(datadir+'labels_train.csv', index_col=0, header=None)
# Normalize data
xScaler = StandardScaler()
yScaler = StandardScaler()
X_train = xScaler.fit_transform(data_train)
y_train = yScaler.fit_transform(labels_train)
print(X_train.shape)
print(y_train.shape)
    
    
In [4]:
    
# Show all scaled values
plt.figure(figsize=[30,10])
plt.boxplot(X_train);
    
    
In [5]:
    
# Initialize PCA model that doesn't reduce dimensions at all in order to figure out how many components
# to put in final PCA model.
pca_full = PCA()
    
In [6]:
    
X_reduced = pca_full.fit_transform(X_train)
    
In [7]:
    
# explained_variance_ratio_ is already sorted in descending order, so take the cumulative sum directly.
total_variance_explained = np.cumsum(pca_full.explained_variance_ratio_)
plt.plot(total_variance_explained, 'o-');
plt.ylim([0.15,1.05]);
plt.xlim([-1, len(total_variance_explained)+1]);
    
    
In [8]:
    
# Count the components whose cumulative explained variance is still at or below 90%;
# one more component than this pushes the total past 90%.
sum(total_variance_explained <= .9)
    
    Out[8]:
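
An equivalent, arguably clearer way to get the component count needed to reach 90% is np.argmax on the cumulative curve; a small sketch (the variable name is illustrative):

n_needed = np.argmax(total_variance_explained >= 0.90) + 1
print(n_needed)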
In [9]:
    
# Create a new PCA model that actually reduces the dimensions.
pca = PCA(n_components=21)
X_reduced = pca.fit_transform(X_train)
print(sum(pca.explained_variance_ratio_))
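
As an aside (assuming scikit-learn 0.18 or later), PCA can pick the component count automatically when n_components is given as a fraction of variance to retain; a minimal sketch:

pca_auto = PCA(n_components=0.90, svd_solver='full')
X_auto = pca_auto.fit_transform(X_train)
print(pca_auto.n_components_)  # number of components actually kept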
    
    
In [63]:
    
from sklearn.model_selection import KFold, cross_val_score
k_fold = KFold(n_splits=5)
    
In [64]:
    
# As a first attempt, try linear regression because it's easy.
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
    
In [65]:
    
# Do k-fold cross-validation to see CV scores.
print("Linear regression (CV): {}".format(cross_val_score(lin_reg, X_reduced, y_train, cv=k_fold)))
# Use all training data for final model
lin_reg.fit(X_reduced, y_train)
# Check performance on training set.
print("Linear regression (training): {:0.4f}".format(lin_reg.score(X_reduced, y_train)))
    
    
In [66]:
    
yhat_train = lin_reg.predict(X_reduced)
    
In [67]:
    
plt.scatter(y_train, yhat_train);
    
    
In [68]:
    
residuals = yhat_train - y_train
plt.scatter(y_train, residuals)
    
    Out[68]:
    
In [69]:
    
from sklearn import metrics
    
In [70]:
    
print(metrics.r2_score(y_train, yhat_train))
print(metrics.explained_variance_score(y_train, yhat_train))
print(metrics.mean_squared_error(y_train, yhat_train))
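
The MSE above is in squared (scaled) units; taking the square root gives an error on the same scale as the standardized target, a small sketch:

rmse = np.sqrt(metrics.mean_squared_error(y_train, yhat_train))
print(rmse)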
    
    
In [71]:
    
X_reduced.shape
    
    Out[71]:
In [82]:
    
# Target vs. each of the 21 PCA components
plt.figure(figsize=[30,20]);
for i in range(21):
    plt.subplot(7,3,i+1);
    plt.scatter(X_reduced[:,i], y_train);
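
To see which original covariates drive a given component, the loadings in pca.components_ (shape: n_components x n_original_features) can be inspected; a sketch, assuming a reasonably recent pandas:

loadings = pd.DataFrame(pca.components_, columns=data_train.columns)
# Ten largest absolute loadings on the first principal component
print(loadings.iloc[0].abs().sort_values(ascending=False).head(10))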
    
    
In [56]:
    
X_train.shape
    
    Out[56]:
In [83]:
    
# Target vs. each of the 32 original (scaled) covariates
plt.figure(figsize=[30,30]);
for i in range(32):
    plt.subplot(8,4,i+1);
    plt.scatter(X_train[:,i], y_train);
    
    
In [72]:
    
full_lin_reg = LinearRegression()
# Do k-fold cross-validation to see CV scores (on the original covariates, not the PCA-reduced ones).
print("Linear regression, all covariates (CV): {}".format(cross_val_score(full_lin_reg, X_train, y_train, cv=k_fold)))
# Use all training data for final model
full_lin_reg.fit(X_train, y_train)
# Check performance on training set.
print("Linear regression, all covariates (training): {:0.4f}".format(full_lin_reg.score(X_train, y_train)))
    
    
In [73]:
    
from sklearn.linear_model import Ridge
    
In [75]:
    
rdg_reg = Ridge()
# Do k-fold cross-validation to see CV scores.
print("Ridge regression (CV): {}".format(cross_val_score(rdg_reg, X_reduced, y_train, cv=k_fold)))
# Use all training data for final model
rdg_reg.fit(X_reduced, y_train)
print("Ridge regression (training): {:0.4f}".format(rdg_reg.score(X_reduced, y_train)))
    
    
In [76]:
    
from sklearn.linear_model import RidgeCV
    
In [77]:
    
rcv_reg = RidgeCV(alphas=[1, 10])
# Do k-fold cross-validation to see CV scores (not clear whether the k-fold CV is still needed here).
print("Ridge regression with CV (CV): {}".format(cross_val_score(rcv_reg, X_reduced, y_train, cv=k_fold)))
# Use all training data for final model
rcv_reg.fit(X_reduced, y_train)
print("Ridge regression with CV (training): {:0.4f}".format(rcv_reg.score(X_reduced, y_train)))
    
    
In [78]:
    
from sklearn.linear_model import Lasso, LassoCV
    
In [79]:
    
las_reg = Lasso()
# Do k-fold cross-validation to see CV scores.
print("Lass (CV): {}".format(cross_val_score(las_reg, X_reduced, y_train, cv=k_fold)))
# Use all training data for final model
las_reg.fit(X_reduced, y_train)
print("Lasso (training): {:0.4f}".format(las_reg.score(X_reduced, y_train)))
    
    
In [80]:
    
lcv_reg = LassoCV()
# Do k-fold cross-validation to see CV scores (again, not clear whether this is needed for CV version).
# print("Lasso CV (CV): {}".format(cross_val_score(lcv_reg, X_reduced, y_train, cv=k_fold)))
# k-fold CV crashes. Check this later.
# Use all training data for final model
lcv_reg.fit(X_reduced, np.ravel(y_train))
print("Lasso CV (training): {:0.4f}".format(lcv_reg.score(X_reduced, y_train)))
    
    
In [81]:
    
lcv_reg.alpha_
    
    Out[81]:
In [82]:
    
# Import test data
data_test = pd.read_csv(datadir+'data_nonan_test.csv', index_col=0)
labels_test = pd.read_csv(datadir+'labels_test.csv', index_col=0, header=None)
# Normalize test data with the scalers fitted on the training set (transform only, no refitting).
X_test = xScaler.transform(data_test)
y_test = yScaler.transform(labels_test)
# Reduce dimensions
X_test_reduced = pca.transform(X_test)
# Run tests
print("Linear regression (testing): {:0.4f}".format(lin_reg.score(X_test_reduced, y_test)))
print("Ridge regression (testing): {:0.4f}".format(rdg_reg.score(X_test_reduced, y_test)))
print("Ridge regression with CV (testing): {:0.4f}".format(rcv_reg.score(X_test_reduced, y_test)))
print("Lasso (testing): {:0.4f}".format(las_reg.score(X_test_reduced, y_test)))
print("Lasso with CV (testing): {:0.4f}".format(lcv_reg.score(X_test_reduced, y_test)))