In [1]:
import numpy as np, pandas as pd
from matplotlib import pyplot as plt
import seaborn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# plotting options
seaborn.set()
plt.rcParams['figure.figsize'] = (15, 8)
%matplotlib inline
In [3]:
datadir = '/home/kristy/SDCard/ExtraDocuments/CountyExercisePredictions/'
data_train = pd.read_csv(datadir+'data_nonan_train.csv', index_col=0)
labels_train = pd.read_csv(datadir+'labels_train.csv', index_col=0, header=None)
# Normalize data
xScaler = StandardScaler()
yScaler = StandardScaler()
X_train = xScaler.fit_transform(data_train)
y_train = yScaler.fit_transform(labels_train)
print(X_train.shape)
print(y_train.shape)
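As a quick sanity check on the scaling (a minimal sketch using the arrays created above), each column of X_train should come out with mean roughly 0 and standard deviation roughly 1:
In [ ]:
# Sanity check: StandardScaler output should be approximately zero-mean, unit-variance per column
print(X_train.mean(axis=0).round(3))
print(X_train.std(axis=0).round(3))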
In [4]:
# Show all scaled values
plt.figure(figsize=[30,10])
plt.boxplot(X_train);
In [5]:
# Initialize a PCA model that keeps every component, in order to decide how many components
# to put in the final reduced model.
pca_full = PCA()
In [6]:
X_reduced = pca_full.fit_transform(X_train)
In [7]:
# explained_variance_ratio_ is already sorted in descending order, so cumsum gives the running total directly
total_variance_explained = np.cumsum(pca_full.explained_variance_ratio_)
plt.plot(total_variance_explained, 'o-');
plt.ylim([0.15,1.05]);
plt.xlim([-1, len(total_variance_explained)+1]);
In [8]:
# Count the components whose cumulative explained variance is still at or below 90%;
# one more component is needed to actually cross the threshold.
sum(total_variance_explained <= .9)
Out[8]:
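As an alternative to counting components by hand, recent versions of scikit-learn let PCA pick the component count itself when n_components is a float between 0 and 1 (a sketch; the count it reports may differ by one from the comparison above depending on where the threshold falls):
In [ ]:
# Let PCA keep just enough components to explain 90% of the variance
pca_90 = PCA(n_components=0.9)
pca_90.fit(X_train)
print(pca_90.n_components_)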
In [9]:
# Create a new PCA model that actually reduces the dimensions.
pca = PCA(n_components=21)
X_reduced = pca.fit_transform(X_train)
print(sum(pca.explained_variance_ratio_))
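To see which original covariates drive each retained component, the loadings in pca.components_ can be mapped back onto the column names of data_train (a minimal sketch; the loadings DataFrame name is just illustrative):
In [ ]:
# Loadings of the first few principal components on the original covariates
loadings = pd.DataFrame(pca.components_, columns=data_train.columns)
print(loadings.iloc[:3].T)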
In [63]:
from sklearn.model_selection import KFold, cross_val_score
k_fold = KFold(n_splits=5)
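If the rows of the CSV happen to be ordered (for example by state), unshuffled folds can be unrepresentative; a shuffled variant is a cheap safeguard (a sketch only; the random_state is arbitrary and this object is not used in the cells below):
In [ ]:
# Optional shuffled alternative to the k_fold object above
k_fold_shuffled = KFold(n_splits=5, shuffle=True, random_state=0)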
In [64]:
# As a first attempt, try linear regression because it's easy.
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
In [65]:
# Do k-fold cross-validation to see CV scores.
print("Linear regression (CV): {}".format(cross_val_score(lin_reg, X_reduced, y_train, cv=k_fold)))
# Use all training data for final model
lin_reg.fit(X_reduced, y_train)
# Check performance on training set.
print("Linear regression (training): {:0.4f}".format(lin_reg.score(X_reduced, y_train)))
In [66]:
yhat_train = lin_reg.predict(X_reduced)
In [67]:
plt.scatter(y_train, yhat_train);
In [68]:
residuals = yhat_train - y_train
plt.scatter(y_train, residuals)
Out[68]:
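A histogram of the residuals is a quick complement to the scatter plot above, showing whether they are roughly symmetric around zero (a minimal sketch):
In [ ]:
# Distribution of training residuals
plt.hist(np.ravel(residuals), bins=50);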
In [69]:
from sklearn import metrics
In [70]:
print(metrics.r2_score(y_train, yhat_train))
print(metrics.explained_variance_score(y_train, yhat_train))
print(metrics.mean_squared_error(y_train, yhat_train))
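Because the labels were standardized, the mean squared error above is in standardized units. Rescaling the RMSE with the standard deviation stored in yScaler puts it back in the original label units (a sketch; yScaler.scale_ holds the per-column scale learned during fitting):
In [ ]:
# RMSE in standardized units, then rescaled to the original label units
rmse_std = np.sqrt(metrics.mean_squared_error(y_train, yhat_train))
print(rmse_std * yScaler.scale_[0])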
In [71]:
X_reduced.shape
Out[71]:
In [82]:
# PCA components
plt.figure(figsize=[30,20]);
for i in range(21):
    plt.subplot(7,3,i+1);
    plt.scatter(X_reduced[:,i], y_train);
In [56]:
X_train.shape
Out[56]:
In [83]:
# Original covariates
plt.figure(figsize=[30,30]);
for i in range(32):
    plt.subplot(8,4,i+1);
    plt.scatter(X_train[:,i], y_train);
In [72]:
full_lin_reg = LinearRegression()
# Do k-fold cross-validation on the original (unreduced) covariates to see CV scores.
print("Full linear regression (CV): {}".format(cross_val_score(full_lin_reg, X_train, y_train, cv=k_fold)))
# Use all training data for final model
full_lin_reg.fit(X_train, y_train)
# Check performance on training set.
print("Full linear regression (training): {:0.4f}".format(full_lin_reg.score(X_train, y_train)))
In [73]:
from sklearn.linear_model import Ridge
In [75]:
rdg_reg = Ridge()
# Do k-fold cross-validation to see CV scores.
print("Ridge regression (CV): {}".format(cross_val_score(rdg_reg, X_reduced, y_train, cv=k_fold)))
# Use all training data for final model
rdg_reg.fit(X_reduced, y_train)
print("Ridge regression (training): {:0.4f}".format(rdg_reg.score(X_reduced, y_train)))
In [76]:
from sklearn.linear_model import RidgeCV
In [77]:
rcv_reg = RidgeCV(alphas=[1, 10])
# Do k-fold cross-validation to see CV scores (not clear whether the k-fold CV is still needed here).
print("Ridge regression with CV (CV): {}".format(cross_val_score(rcv_reg, X_reduced, y_train, cv=k_fold)))
# Use all training data for final model
rcv_reg.fit(X_reduced, y_train)
print("Ridge regression with CV (training): {:0.4f}".format(rcv_reg.score(X_reduced, y_train)))
In [78]:
from sklearn.linear_model import Lasso, LassoCV
In [79]:
las_reg = Lasso()
# Do k-fold cross-validation to see CV scores.
print("Lass (CV): {}".format(cross_val_score(las_reg, X_reduced, y_train, cv=k_fold)))
# Use all training data for final model
las_reg.fit(X_reduced, y_train)
print("Lasso (training): {:0.4f}".format(las_reg.score(X_reduced, y_train)))
In [80]:
lcv_reg = LassoCV()
# Do k-fold cross-validation to see CV scores (again, not clear whether this is needed for the CV version).
# print("Lasso CV (CV): {}".format(cross_val_score(lcv_reg, X_reduced, y_train, cv=k_fold)))
# The k-fold CV call crashes as written, most likely because LassoCV expects a 1-D label array; see the retry below.
# Use all training data for final model
lcv_reg.fit(X_reduced, np.ravel(y_train))
print("Lasso CV (training): {:0.4f}".format(lcv_reg.score(X_reduced, y_train)))
In [81]:
lcv_reg.alpha_
Out[81]:
In [82]:
# import data
data_test = pd.read_csv(datadir+'data_nonan_test.csv', index_col=0)
labels_test = pd.read_csv(datadir+'labels_test.csv', index_col=0, header=None)
# Normalize data with the scalers fit on the training set (transform only; refitting on test data would leak information)
X_test = xScaler.transform(data_test)
y_test = yScaler.transform(labels_test)
# Reduce dimensions
X_test_reduced = pca.transform(X_test)
# Run tests
print("Linear regression (testing): {:0.4f}".format(lin_reg.score(X_test_reduced, y_test)))
print("Ridge regression (testing): {:0.4f}".format(rdg_reg.score(X_test_reduced, y_test)))
print("Ridge regression with CV (testing): {:0.4f}".format(rcv_reg.score(X_test_reduced, y_test)))
print("Lasso (testing): {:0.4f}".format(las_reg.score(X_test_reduced, y_test)))
print("Lasso with CV (testing): {:0.4f}".format(lcv_reg.score(X_test_reduced, y_test)))