In [1]:
import pandas as pd
from sklearn.datasets import load_boston
boston = load_boston()
dataset = pd.DataFrame(boston.data, columns=boston.feature_names)
dataset['target'] = boston.target
observations = len(dataset)
variables = dataset.columns[:-1]
X = dataset.ix[:,:-1]
y = dataset['target'].values
In [2]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)
print ("Train dataset sample size: %i" % len(X_train))
print ("Test dataset sample size: %i" % len(X_test))
In [3]:
X_train, X_out_sample, y_train, y_out_sample = train_test_split(X, y, test_size=0.40, random_state=101)
X_validation, X_test, y_validation, y_test = train_test_split(X_out_sample, y_out_sample, test_size=0.50, random_state=101)
print ("Train dataset sample size: %i" % len(X_train))
print ("Validation dataset sample size: %i" % len(X_validation))
print ("Test dataset sample size: %i" % len(X_test))
In [4]:
from sklearn.cross_validation import cross_val_score, KFold, StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import numpy as np
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred)**2))
lm = LinearRegression()
cv_iterator = KFold(n=len(X), n_folds=10, shuffle=True, random_state=101)
edges = np.histogram(y, bins=5)[1]
binning = np.digitize(y, edges)
stratified_cv_iterator = StratifiedKFold(binning, n_folds=10, shuffle=True, random_state=101)
second_order=PolynomialFeatures(degree=2, interaction_only=False)
third_order=PolynomialFeatures(degree=3, interaction_only=True)
over_param_X = second_order.fit_transform(X)
extra_over_param_X = third_order.fit_transform(X)
cv_score = cross_val_score(lm, over_param_X, y, cv=cv_iterator, scoring='mean_squared_error', n_jobs=1)
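# Optional sketch: the custom RMSE function above could also be wrapped with make_scorer
# (imported above) and passed as the scoring argument; greater_is_better=False makes
# scikit-learn negate the values, since a lower RMSE is better.
rmse_scorer = make_scorer(RMSE, greater_is_better=False)
rmse_cv_score = cross_val_score(lm, over_param_X, y, cv=cv_iterator, scoring=rmse_scorer, n_jobs=1)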
In [5]:
print (cv_score)
In [6]:
print ('Cv score: mean %0.3f std %0.3f' % (np.mean(np.abs(cv_score)), np.std(cv_score)))
In [7]:
cv_score = cross_val_score(lm, over_param_X, y, cv=stratified_cv_iterator, scoring='mean_squared_error', n_jobs=1)
print ('Cv score: mean %0.3f std %0.3f' % (np.mean(np.abs(cv_score)), np.std(cv_score)))
Note: valid options for the scoring parameter are ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'log_loss', 'mean_absolute_error', 'mean_squared_error', 'median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc'].
In [8]:
import random
def Bootstrap(n, n_iter=3, random_state=None):
    """
    Random sampling with replacement cross-validation generator.
    For each iteration a bootstrap sample of the indexes [0, n) is
    generated, and the function yields that sample together with
    the list of all the excluded (out-of-bag) indexes.
    """
    if random_state:
        random.seed(random_state)
    for j in range(n_iter):
        bs = [random.randint(0, n-1) for i in range(n)]
        out_bs = list({i for i in range(n)} - set(bs))
        yield bs, out_bs
boot = Bootstrap(n=10, n_iter=5, random_state=101)
for train_idx, validation_idx in boot:
    print (train_idx, validation_idx)
In [9]:
import numpy as np
boot = Bootstrap(n=len(X), n_iter=10, random_state=101)
lm = LinearRegression()
bootstrapped_coef = np.zeros((10,13))
for k, (train_idx, validation_idx) in enumerate(boot):
    lm.fit(X.ix[train_idx,:], y[train_idx])
    bootstrapped_coef[k,:] = lm.coef_
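# Optional sketch: the bootstrap replications can be summarized to gauge how stable each
# coefficient is, for instance through its mean and standard deviation across resamples.
print ('Bootstrap mean of the coefficients:\n%s' % str(np.mean(bootstrapped_coef, axis=0)))
print ('Bootstrap std of the coefficients:\n%s' % str(np.std(bootstrapped_coef, axis=0)))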
In [10]:
print(bootstrapped_coef[:,10])
In [11]:
print(bootstrapped_coef[:,6])
In [12]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=3)
lm = LinearRegression()
lm.fit(X_train,y_train)
print ('Train (cases, features) = %s' % str(X_train.shape))
print ('Test (cases, features) = %s' % str(X_test.shape))
print ('In-sample mean squared error %0.3f' % mean_squared_error(y_train,lm.predict(X_train)))
print ('Out-sample mean squared error %0.3f' % mean_squared_error(y_test,lm.predict(X_test)))
In [13]:
from sklearn.preprocessing import PolynomialFeatures
second_order=PolynomialFeatures(degree=2, interaction_only=False)
third_order=PolynomialFeatures(degree=3, interaction_only=True)
In [14]:
lm.fit(second_order.fit_transform(X_train),y_train)
print ('(cases, features) = %s' % str(second_order.fit_transform(X_train).shape))
print ('In-sample mean squared error %0.3f' % mean_squared_error(y_train,lm.predict(second_order.fit_transform(X_train))))
print ('Out-sample mean squared error %0.3f' % mean_squared_error(y_test,lm.predict(second_order.fit_transform(X_test))))
In [15]:
lm.fit(third_order.fit_transform(X_train),y_train)
print ('(cases, features) = %s' % str(third_order.fit_transform(X_train).shape))
print ('In-sample mean squared error %0.3f' % mean_squared_error(y_train,lm.predict(third_order.fit_transform(X_train))))
print ('Out-sample mean squared error %0.3f' % mean_squared_error(y_test,lm.predict(third_order.fit_transform(X_test))))
In [16]:
try:
    import urllib.request as urllib2
except:
    import urllib2
import numpy as np
train_data = 'https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.data'
validation_data = 'https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_valid.data'
train_response = 'https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.labels'
validation_response = 'https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/madelon_valid.labels'
try:
    Xt = np.loadtxt(urllib2.urlopen(train_data))
    yt = np.loadtxt(urllib2.urlopen(train_response))
    Xv = np.loadtxt(urllib2.urlopen(validation_data))
    yv = np.loadtxt(urllib2.urlopen(validation_response))
except:
    # In case downloading the data doesn't work,
    # just manually download the files into the working directory
    Xt = np.loadtxt('madelon_train.data')
    yt = np.loadtxt('madelon_train.labels')
    Xv = np.loadtxt('madelon_valid.data')
    yv = np.loadtxt('madelon_valid.labels')
In [17]:
print ('Training set: %i observations %i features' % (Xt.shape))
print ('Validation set: %i observations %i features' % (Xv.shape))
In [18]:
from scipy.stats import describe
print (describe(Xt))
In [19]:
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
def visualize_correlation_matrix(data, hurdle = 0.0):
    R = np.corrcoef(data, rowvar=0)
    R[np.where(np.abs(R)<hurdle)] = 0.0
    heatmap = plt.pcolor(R, cmap=mpl.cm.coolwarm, alpha=0.8)
    heatmap.axes.set_frame_on(False)
    plt.xticks(rotation=90)
    plt.tick_params(axis='both', which='both', bottom='off', top='off', left='off', right='off')
    plt.colorbar()
    plt.show()
visualize_correlation_matrix(Xt[:,100:150], hurdle=0.0)
In [20]:
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
In [21]:
logit = LogisticRegression()
In [22]:
logit.fit(Xt,yt)
Out[22]:
In [23]:
from sklearn.metrics import roc_auc_score
print ('Training area under the curve: %0.3f' % roc_auc_score(yt,logit.predict_proba(Xt)[:,1]))
print ('Validation area under the curve: %0.3f' % roc_auc_score(yv,logit.predict_proba(Xv)[:,1]))
In [24]:
from sklearn.feature_selection import SelectPercentile, f_classif
selector = SelectPercentile(f_classif, percentile=50)
selector.fit(Xt,yt)
variable_filter = selector.get_support()
In [25]:
plt.hist(selector.scores_, bins=50, histtype='bar')
plt.grid()
plt.show()
In [26]:
variable_filter = selector.scores_ > 10
print ("Number of filtered variables: %i" % np.sum(variable_filter))
from sklearn.preprocessing import PolynomialFeatures
interactions = PolynomialFeatures(degree=2, interaction_only=True)
Xs = interactions.fit_transform(Xt[:,variable_filter])
print ("Number of variables and interactions: %i" % Xs.shape[1])
In [27]:
logit.fit(Xs,yt)
Xvs = interactions.fit_transform(Xv[:,variable_filter])
print ('Validation area under the curve before recursive selection: %0.3f' % roc_auc_score(yv,logit.predict_proba(Xvs)[:,1]))
In [28]:
# Execution time: 3.15 s
from sklearn.feature_selection import RFECV
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)
lm = LinearRegression()
cv_iterator = KFold(n=len(X_train), n_folds=10, shuffle=True, random_state=101)
recursive_selector = RFECV(estimator=lm, step=1, cv=cv_iterator, scoring='mean_squared_error')
recursive_selector.fit(second_order.fit_transform(X_train),y_train)
print ('Initial number of features : %i' % second_order.fit_transform(X_train).shape[1])
print ('Optimal number of features : %i' % recursive_selector.n_features_)
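# Optional sketch: which of the expanded features survived the elimination can be inspected
# through the fitted selector's support_ (boolean mask) and ranking_ (1 = kept) attributes.
print ('Selected feature mask: %s' % str(recursive_selector.support_))
print ('Feature ranking (1 = selected): %s' % str(recursive_selector.ranking_))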
In [29]:
a = second_order.fit_transform(X_train)
print (a)
In [30]:
essential_X_train = recursive_selector.transform(second_order.fit_transform(X_train))
essential_X_test = recursive_selector.transform(second_order.fit_transform(X_test))
lm.fit(essential_X_train, y_train)
print ('cases = %i features = %i' % essential_X_test.shape)
print ('In-sample mean squared error %0.3f' % mean_squared_error(y_train,lm.predict(essential_X_train)))
print ('Out-sample mean squared error %0.3f' % mean_squared_error(y_test,lm.predict(essential_X_test)))
In [31]:
edges = np.histogram(y, bins=5)[1]
binning = np.digitize(y, edges)
stratified_cv_iterator = StratifiedKFold(binning, n_folds=10, shuffle=True, random_state=101)
essential_X = recursive_selector.transform(second_order.fit_transform(X))
cv_score = cross_val_score(lm, essential_X, y, cv=stratified_cv_iterator, scoring='mean_squared_error', n_jobs=1)
print ('Cv score: mean %0.3f std %0.3f' % (np.mean(np.abs(cv_score)), np.std(cv_score)))
In [32]:
from sklearn.linear_model import Ridge
ridge = Ridge(normalize=True)
# The following commented line is to show a logistic regression with L2 regularization
# lr_l2 = LogisticRegression(C=1.0, penalty='l2', tol=0.01)
ridge.fit(second_order.fit_transform(X), y)
Out[32]:
In [33]:
lm.fit(second_order.fit_transform(X), y)
Out[33]:
In [34]:
print ('Average coefficient: Non regularized = %0.3f Ridge = %0.3f' % (np.mean(lm.coef_), np.mean(ridge.coef_)))
print ('Min coefficient: Non regularized = %0.3f Ridge = %0.3f' % (np.min(lm.coef_), np.min(ridge.coef_)))
print ('Max coefficient: Non regularized = %0.3f Ridge = %0.3f' % (np.max(lm.coef_), np.max(ridge.coef_)))
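# Optional sketch: comparing the L2 norms of the two coefficient vectors makes the
# shrinkage induced by the Ridge penalty explicit.
print ('Coefficient L2 norm: Non regularized = %0.3f Ridge = %0.3f' % (np.sqrt(np.sum(lm.coef_**2)), np.sqrt(np.sum(ridge.coef_**2))))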
In [35]:
from sklearn.grid_search import GridSearchCV
edges = np.histogram(y, bins=5)[1]
binning = np.digitize(y, edges)
stratified_cv_iterator = StratifiedKFold(binning, n_folds=10, shuffle=True, random_state=101)
search = GridSearchCV(estimator=ridge, param_grid={'alpha':np.logspace(-4,2,7)}, scoring = 'mean_squared_error',
n_jobs=1, refit=True, cv=stratified_cv_iterator)
search.fit(second_order.fit_transform(X), y)
print ('Best alpha: %0.5f' % search.best_params_['alpha'])
print ('Best CV mean squared error: %0.3f' % np.abs(search.best_score_))
In [36]:
search.grid_scores_
Out[36]:
In [37]:
# Alternative: sklearn.linear_model.RidgeCV
from sklearn.linear_model import RidgeCV
auto_ridge = RidgeCV(alphas=np.logspace(-4,2,7), normalize=True, scoring = 'mean_squared_error', cv=None)
auto_ridge.fit(second_order.fit_transform(X), y)
print ('Best alpha: %0.5f' % auto_ridge.alpha_)
In [38]:
from sklearn.grid_search import RandomizedSearchCV
from scipy.stats import expon
np.random.seed(101)
search_func=RandomizedSearchCV(estimator=ridge, param_distributions={'alpha':np.logspace(-4,2,100)}, n_iter=10,
scoring='mean_squared_error', n_jobs=1, iid=False, refit=True, cv=stratified_cv_iterator)
search_func.fit(second_order.fit_transform(X), y)
print ('Best alpha: %0.5f' % search_func.best_params_['alpha'])
print ('Best CV mean squared error: %0.3f' % np.abs(search_func.best_score_))
In [39]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=1.0, normalize=True, max_iter=2*10**5)
#The following comment shows an example of L1 logistic regression
#lr_l1 = LogisticRegression(C=1.0, penalty='l1', tol=0.01)
In [40]:
from sklearn.grid_search import RandomizedSearchCV
from scipy.stats import expon
np.random.seed(101)
stratified_cv_iterator = StratifiedKFold(binning, n_folds=10, shuffle=True, random_state=101)
search_func=RandomizedSearchCV(estimator=lasso, param_distributions={'alpha':np.logspace(-5,2,100)}, n_iter=10,
scoring='mean_squared_error', n_jobs=1, iid=False, refit=True, cv=stratified_cv_iterator)
search_func.fit(second_order.fit_transform(X), y)
print ('Best alpha: %0.5f' % search_func.best_params_['alpha'])
print ('Best CV mean squared error: %0.3f' % np.abs(search_func.best_score_))
In [41]:
print ('Zero value coefficients: %i out of %i' % (np.sum(search_func.best_estimator_.coef_ == 0.0),
len(search_func.best_estimator_.coef_)))
In [42]:
# Alternative: sklearn.linear_model.LassoCV
# Execution time: 54.9 s
from sklearn.linear_model import LassoCV
auto_lasso = LassoCV(alphas=np.logspace(-5,2,100), normalize=True, n_jobs=1, cv=None, max_iter=10**6)
auto_lasso.fit(second_order.fit_transform(X), y)
print ('Best alpha: %0.5f' % auto_lasso.alpha_)
In [43]:
# Execution time: 1min 3s
from sklearn.linear_model import ElasticNet
import numpy as np
elasticnet = ElasticNet(alpha=1.0, l1_ratio=0.15, normalize=True, max_iter=10**6, random_state=101)
from sklearn.grid_search import RandomizedSearchCV
from scipy.stats import expon
np.random.seed(101)
search_func=RandomizedSearchCV(estimator=elasticnet, param_distributions={'alpha':np.logspace(-5,2,100),
'l1_ratio':np.arange(0.0, 1.01, 0.05)}, n_iter=10,
scoring='mean_squared_error', n_jobs=1, iid=False, refit=True, cv=stratified_cv_iterator)
search_func.fit(second_order.fit_transform(X), y)
print ('Best alpha: %0.5f' % search_func.best_params_['alpha'])
print ('Best l1_ratio: %0.5f' % search_func.best_params_['l1_ratio'])
print ('Best CV mean squared error: %0.3f' % np.abs(search_func.best_score_))
In [44]:
print ('Zero value coefficients: %i out of %i' % (np.sum(search_func.best_estimator_.coef_ == 0.0),
len(search_func.best_estimator_.coef_)))
In [45]:
# Alternative: sklearn.linear_model.ElasticNetCV
from sklearn.linear_model import ElasticNetCV
auto_elastic = ElasticNetCV(alphas=np.logspace(-5,2,100), normalize=True, n_jobs=1, cv=None, max_iter=10**6)
auto_elastic.fit(second_order.fit_transform(X), y)
print ('Best alpha: %0.5f' % auto_elastic.alpha_)
print ('Best l1_ratio: %0.5f' % auto_elastic.l1_ratio_)
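# Optional sketch: ElasticNetCV can also cross-validate the mixing parameter by passing a
# list of candidate l1_ratio values instead of relying on the single default value.
auto_elastic_l1 = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1.0], alphas=np.logspace(-5,2,100),
normalize=True, n_jobs=1, cv=None, max_iter=10**6)
# fitting it on second_order.fit_transform(X), y would then select both alpha_ and l1_ratio_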
In [46]:
print(second_order.fit_transform(X).shape)
print(len(y))
print(second_order.fit_transform(X)[0])
print(y[0])
In [47]:
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import RandomizedLogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
In [48]:
threshold = 0.03
stability_selection = RandomizedLogisticRegression(n_resampling=300, n_jobs=1, random_state=101, scaling=0.15,
sample_fraction=0.50, selection_threshold=threshold)
interactions = PolynomialFeatures(degree=4, interaction_only=True)
model = make_pipeline(stability_selection, interactions, logit)
model.fit(Xt,yt)
Out[48]:
In [49]:
print(Xt.shape)
print(yt.shape)
#print(Xt)
#print(yt)
#print(model.steps[0][1].all_scores_)
In [50]:
print ('Number of features picked by stability selection: %i' % np.sum(model.steps[0][1].all_scores_ >= threshold))
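# Optional sketch: the indices of the variables kept by the stability-selection step can be
# recovered from the fitted first stage of the pipeline through its get_support method.
selected_variables = model.steps[0][1].get_support(indices=True)
print ('Indices of the selected variables: %s' % str(selected_variables))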
In [51]:
from sklearn.metrics import roc_auc_score
print ('Area Under the Curve: %0.3f' % roc_auc_score(yv,model.predict_proba(Xv)[:,1]))
In [ ]: