In [1]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sequential_backward_selection import SBS
from pprint import pprint
from time import time
import pandas as pd
import numpy as np
import pickle
from warnings import filterwarnings
filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
df = pd.read_csv('data/wheat-2013-supervised-edited.csv')
df.drop(df.columns[0], axis=1, inplace=True)
df.head()
Out[2]:
In [3]:
models = {}
models['Linear Regression'] = LinearRegression()
models['Gradient Boost'] = GradientBoostingRegressor(random_state=42)
models['Random Forest'] = RandomForestRegressor(random_state=42)
In [4]:
def show_feat_importances(name, trained_model, feat_labels):
    # Plot coefficients (linear model) or feature importances (tree ensembles),
    # sorted from most to least influential.
    plt.figure(figsize=(10, 5))
    if name == 'Linear Regression':
        importances = trained_model.coef_
        plt.ylabel('Coefficients')
    else:
        importances = trained_model.feature_importances_
        plt.ylabel('Importances')
    indices = np.argsort(importances)[::-1]
    plt.bar(range(len(feat_labels)), importances[indices], color='lightblue', align='center')
    plt.xticks(range(len(feat_labels)), feat_labels[indices], rotation=90)
    plt.xlim([-1, len(feat_labels)])
    plt.tight_layout()
    plt.title('FEATURE IMPORTANCE | MODEL:{}'.format(name))
    plt.grid()
    plt.show()
The model scores alone tipped me off to investigate the longitude and latitude features further. On review, I concluded that longitude and latitude were a source of leakage (features that encode information about the target which would not be available for a genuine prediction). As a result, I removed longitude and latitude as features in Round #2 of model/feature evaluation.
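A quick way to spot this kind of leakage is to score a model on the suspect columns alone: if latitude and longitude by themselves predict the target well, they are doing the model's work for it. The sketch below assumes the columns are literally named 'Latitude' and 'Longitude'; adjust to the actual column names in the dataframe.

# Rough leakage check (sketch): how well do latitude/longitude alone predict the target?
# Column names 'Latitude'/'Longitude' are assumptions, not taken from the dataset above.
geo_cols = ['Latitude', 'Longitude']
X_geo = df[geo_cols].values
y_all = df.iloc[:, -1].values
geo_score = np.mean(cross_val_score(RandomForestRegressor(random_state=42), X_geo, y_all, cv=8))
print('CV score using only {}: {:.2f}'.format(geo_cols, geo_score))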
In [5]:
start_time = time()
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
X_std = StandardScaler().fit_transform(X)
y_std = StandardScaler().fit_transform(y.reshape(-1, 1)).ravel()
X_train, X_test, y_train, y_test = train_test_split(X_std, y_std, test_size=0.25, random_state=42)
for name, model in models.items():
    results = model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    train_score = np.mean(cross_val_score(model, X_train, y_train, cv=8))
    print('################################### {} ##########################################'.format(name))
    print('RUN_TIME:{}sec \t TEST_SCORE:{} \t TRAIN_SCORE:{}'.format(time() - start_time, round(test_score, 2), round(train_score, 2)))
    show_feat_importances(name, results, df.columns[:-1])
In [6]:
start_time = time()
X = df.iloc[:, 2:-1].values
y = df.iloc[:, -1].values
X_std = StandardScaler().fit_transform(X)
y_std = StandardScaler().fit_transform(y.reshape(-1, 1)).ravel()
X_train, X_test, y_train, y_test = train_test_split(X_std, y_std, test_size=0.25, random_state=42)
for name, model in models.items():
    results = model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    train_score = np.mean(cross_val_score(model, X_train, y_train, cv=8))
    print('################################### {} ##########################################'.format(name))
    print('RUN_TIME:{}sec \t TEST_SCORE:{} \t TRAIN_SCORE:{}'.format(time() - start_time, round(test_score, 2), round(train_score, 2)))
    show_feat_importances(name, results, df.columns[2:-1])
Sequential backward selection (SBS) is a feature selection technique that takes a greedy approach: starting from the full feature set, it repeatedly drops the single feature whose removal hurts the model's score the least, until only k features remain.
In the graph shown below, the (test) scores plateau at the...
See below the graph for a printout of the specific features removed for each model.
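The SBS class used in the next cell is imported from sequential_backward_selection, which isn't shown in this notebook. For reference, a minimal sketch of the idea (illustrative only; the imported class may differ in its details) looks roughly like this, scoring each candidate subset with R^2 on a held-out split:

# Minimal SBS sketch (assumption: this mirrors the imported SBS class, which is not shown here).
from itertools import combinations
from sklearn.metrics import r2_score

def _subset_score(model, X_tr, y_tr, X_te, y_te, cols):
    # Fit and score the model using only the given column indices.
    model.fit(X_tr[:, list(cols)], y_tr)
    return r2_score(y_te, model.predict(X_te[:, list(cols)]))

def sbs_sketch(model, X, y, k_features=1, test_size=0.25, random_state=42):
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=test_size, random_state=random_state)
    dims = tuple(range(X.shape[1]))  # start with every feature index
    subsets = [dims]
    scores = [_subset_score(model, X_tr, y_tr, X_te, y_te, dims)]
    while len(dims) > k_features:
        # Evaluate every subset with exactly one feature removed, keep the best one.
        candidates = [(_subset_score(model, X_tr, y_tr, X_te, y_te, c), c)
                      for c in combinations(dims, len(dims) - 1)]
        best_score, dims = max(candidates)
        subsets.append(dims)
        scores.append(best_score)
    return subsets, scores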
In [7]:
sbs_dict = {}
plt.figure(figsize=(10, 5))
names = []
for name, model in models.items():
    names.append(name)
    sbs = SBS(model, k_features=1)
    sbs.fit(X, y)
    k_feat = [len(k) - 1 for k in sbs.subsets_]
    sbs_dict[name] = sbs.subsets_
    sbs_feat_subset = sbs.subsets_
    plt.plot(k_feat, sbs.scores_, marker='o')
plt.xlabel('# of Features Removed')
plt.ylabel('Test Score')
plt.grid(True)
plt.title('Sequential Backward Selection')
plt.legend(names, loc=0)
plt.show()
In [8]:
for name, model in models.items():
    print('################################### {} ##########################################'.format(name))
    if name == 'Random Forest':
        k = 4
    else:
        k = 8
    remove_feats = list(set(df.columns[2:-1]) - set(df.columns[2:-1][list(sbs_dict[name][k])]))
    pprint(remove_feats)
    print('')
In [9]:
with open('data/SBS_feat_set.plk', 'wb') as f:
    pickle.dump(sbs_dict, f)