In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Load the HR dataset and one-hot encode every categorical (object) column.
# NOTE(review): absolute, machine-specific path; consider a configurable data dir.
data = pd.get_dummies(
    pd.read_csv("/Users/jacquelynzuker/Desktop/DataScienceBootcamp/HR_comma_sep.csv")
)
# Split off the target: pop() returns the column and removes it from the frame,
# so everything left in `data` is a predictor.
y = data.pop("Work_accident")
X = data
In [3]:
from sklearn import linear_model
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the same helper
# now lives in `sklearn.model_selection`.
from sklearn.model_selection import cross_val_score

# Baseline: logistic regression whose regularization strength is tuned by
# its own internal cross-validation.
log_regr = linear_model.LogisticRegressionCV()
log_regr.fit(X, y)

# Inspect the results.
print('\nCoefficients: \n', log_regr.coef_)
print('\nIntercept: \n', log_regr.intercept_)
# For classifiers, `.score` returns mean accuracy (not R-squared); this is
# measured on the same data the model was fit on, so it is optimistic.
print('\nAccuracy:')
print(log_regr.score(X, y))

# 10-fold cross-validated accuracy; reported as mean +/- two standard deviations.
log_regr_score = cross_val_score(log_regr, X, y, cv=10)
print("\nCross-Validated Accuracy of Vanilla Regression Model: %0.2f (+/- %0.2f)"
      % (log_regr_score.mean(), log_regr_score.std() * 2))
In [4]:
# LASSO-style logistic regression: L1 penalty at the default strength (C=1.0).
from sklearn.linear_model import LogisticRegression

lasso = LogisticRegression(penalty="l1", solver="liblinear")
# cross_val_score fits clones of the estimator, so this fit does not affect
# the score below; it only leaves `lasso` fitted for later inspection.
lasso.fit(X, y)

# The default scorer for a classifier is mean accuracy, not R-squared.
lasso_score = cross_val_score(lasso, X, y, cv=10)
print("Cross-Validated Accuracy of Lasso Regression Model: %0.2f (+/- %0.2f)"
      % (lasso_score.mean(), lasso_score.std() * 2))
In [5]:
from sklearn.linear_model import RidgeClassifier

# Use the name imported above rather than reaching through `linear_model`,
# so this cell does not silently depend on another cell's import.
# NOTE(review): alpha=0 switches the L2 penalty off, so this is effectively an
# unregularized least-squares classifier rather than ridge; confirm intent.
ridgelm = RidgeClassifier(alpha=0)

# The default scorer for a classifier is mean accuracy, not R-squared.
ridgelm_score = cross_val_score(ridgelm, X, y, cv=10)
print("Cross-Validated Accuracy of Ridge Regression Model: %0.2f (+/- %0.2f)"
      % (ridgelm_score.mean(), ridgelm_score.std() * 2))
In [7]:
interval = 0.1
# Grid of regularization strengths to compare. alpha = 0 is dropped: it means
# "no penalty" for ridge and has no finite equivalent C for logistic regression.
alpha_range = np.arange(0, 1, interval)
alpha_range = alpha_range[alpha_range > 0]

# Search for the regularization strength (alpha) that gives the best
# cross-validated score for each penalized linear classifier.
lasso_scores = []
ridge_scores = []
for alphaVal in alpha_range:
    # LogisticRegression is parameterized by C, the inverse regularization
    # strength; scikit-learn documents alpha as corresponding to 1 / (2C).
    # Without passing C the model would be identical on every iteration.
    lasso = LogisticRegression(penalty="l1", solver="liblinear",
                               C=1.0 / (2 * alphaVal))
    lasso_score = cross_val_score(lasso, X, y, cv=10)
    lasso_scores.append(lasso_score.mean())

    ridgeBig = linear_model.RidgeClassifier(alpha=alphaVal, fit_intercept=False)
    ridge_score = cross_val_score(ridgeBig, X, y, cv=10)
    ridge_scores.append(ridge_score.mean())

# Plot the performance of Lasso and Ridge Regression against the value of alpha.
plt.plot(alpha_range, lasso_scores, label="Lasso Regression")
plt.plot(alpha_range, ridge_scores, label="Ridge Regression")
plt.xlabel("Value of Alpha")
plt.ylabel("Cross-Validated Regression Score")
plt.legend()
plt.title("Optimal Alpha Values for Lasso and Ridge Regression")
plt.show()