In [1]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, RANSACRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from pprint import pprint
In [2]:
with open('pickles/model_optimization_train_scores.plk','rb') as f:
    train_pickle = pickle.load(f)
with open('pickles/model_optimization_test_scores.plk','rb') as f:
    test_pickle = pickle.load(f)
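Each pickle is assumed to map a model name to a (param_list, scores) pair, since that is how the plotting cells below unpack it. A quick structural sanity check (a sketch, not part of the original run):
In [ ]:
# Assumed layout: {model_name: (param_list, scores_array)}; verify shapes before plotting.
pprint({name: (len(vals[0]), np.shape(vals[1])) for name, vals in train_pickle.items()})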
In [3]:
name = 'Linear'
param_list = train_pickle[name][0]
train_scores = train_pickle[name][1]
test_scores = test_pickle[name][1]
plt.figure(figsize=(10,7))
plt.plot(param_list,train_scores,marker='^')
plt.plot(param_list,test_scores,marker='o')
plt.legend(['Train','Test'],loc=0)
plt.title('Linear Regression')
plt.xlabel('RANSAC Sample Size')
plt.ylabel('Score')
plt.grid(True)
plt.show()
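The 'RANSAC Sample Size' x-axis suggests these curves came from sweeping RANSACRegressor's min_samples around a LinearRegression base estimator (RANSACRegressor is imported above but never refit in this section). A minimal sketch of how such a sweep could be regenerated, assuming X_train/X_test/y_train/y_test from a split like the one in the final cell; the sample sizes are illustrative, not the original grid:
In [ ]:
# Illustrative sweep only; the original min_samples values live in param_list above.
sample_sizes = [50, 100, 200, 400, 800]
ransac_train, ransac_test = [], []
for m in sample_sizes:
    ransac = RANSACRegressor(LinearRegression(), min_samples=m, random_state=42)
    ransac.fit(X_train, y_train)
    ransac_train.append(ransac.score(X_train, y_train))
    ransac_test.append(ransac.score(X_test, y_test))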
In [4]:
name = 'Random Forest'
param_list = train_pickle[name][0]
train_scores = train_pickle[name][1]
test_scores = test_pickle[name][1]
fig,axs = plt.subplots(nrows=1,ncols=2)
fig.set_figheight(7)
fig.set_figwidth(15)
ax = axs[0]
ax.plot(param_list,train_scores,marker='^')
ax.legend(['max_depth={}'.format(n) for n in range(1,6)]+['default'],loc=5)
ax.set_title('Random Forest (Train)')
ax.set_xlabel('n Trees')
ax.set_ylabel('Score')
ax.grid(True)
ax = axs[1]
ax.plot(param_list,test_scores,marker='o')
ax.legend(['max_depth={}'.format(n) for n in range(1,6)]+['default'],loc=5)
ax.set_title('Random Forest (Test)')
ax.set_xlabel('n Trees')
ax.set_ylabel('Score')
ax.grid(True)
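The legend implies each curve is one max_depth setting (1 through 5, plus sklearn's default of no depth limit) swept over the number of trees. A sketch of how such a grid of scores could be produced, again assuming a prior train/test split and illustrative tree counts:
In [ ]:
# Illustrative grid; the original n_estimators values live in param_list above.
depths = list(range(1, 6)) + [None]   # None = sklearn default (unlimited depth)
tree_counts = [10, 50, 100, 200]
test_grid = []
for d in depths:
    rf_scores = []
    for n in tree_counts:
        rf = RandomForestRegressor(n_estimators=n, max_depth=d, random_state=42)
        rf.fit(X_train, y_train)
        rf_scores.append(rf.score(X_test, y_test))
    test_grid.append(rf_scores)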
In [5]:
name = 'Gradient Boost'
param_list = train_pickle[name][0]
train_scores = train_pickle[name][1]
test_scores = test_pickle[name][1]
fig,axs = plt.subplots(nrows=1,ncols=2)
fig.set_figheight(7)
fig.set_figwidth(15)
ax = axs[0]
ax.plot(param_list,train_scores,marker='^')
ax.legend(['max_depth={} (default)'.format(n) if n==3 else 'max_depth={}'.format(n) for n in range(1,6)]+['default'],loc=2)
ax.set_title('Gradient Boost (Train)')
ax.set_xlabel('n Trees')
ax.set_ylabel('Score')
ax.grid(True)
ax = axs[1]
ax.plot(param_list,test_scores,marker='o')
ax.legend(['max_depth={} (default)'.format(n) if n==3 else 'max_depth={}'.format(n) for n in range(1,6)]+['default'],loc=2)
ax.set_title('Gradient Boost (Test)')
ax.set_xlabel('n Trees')
ax.set_ylabel('Score')
ax.grid(True)
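GridSearchCV is imported above but never used; the n_estimators=500, max_depth=5 settings hard-coded below were presumably read off these curves. A minimal sketch of automating that choice instead (the grid values are illustrative):
In [ ]:
# Hypothetical grid; adjust to match the sweep actually plotted above.
param_grid = {'n_estimators': [100, 300, 500], 'max_depth': [3, 4, 5]}
gs = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid, cv=3)
gs.fit(X_train, y_train)
print(gs.best_params_, gs.best_score_)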
In [6]:
df = pd.read_csv('data/wheat-2013-supervised-edited.csv')
drop_cols = ['Latitude','Longitude'] + [df.columns[0]]  # first column is presumably the CSV's saved row index
df.drop(drop_cols,axis=1,inplace=True)
In [7]:
with open('pickles/SBS_feat_set.plk','rb') as f:
    sbs_dict = pickle.load(f)
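The SBS pickle is assumed to map each model name to a dict of {k: feature-index set} from sequential backward selection, since the final cell indexes it as sbs_dict[name][k]. A quick check of that assumed layout:
In [ ]:
# Assumed layout: {model_name: {k: feature indices}}; list the available k values.
pprint({name: sorted(feats.keys()) for name, feats in sbs_dict.items()})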
In [8]:
models = {}
models['Linear'] = LinearRegression()
models['Random Forest'] = RandomForestRegressor(random_state=42)
models['Gradient Boost'] = GradientBoostingRegressor(n_estimators=500,max_depth=5)  # presumably the settings read off the curves above
In [9]:
for name, model in models.items():
    if name == 'Random Forest':
        k = 4
        X = np.asarray(df.iloc[:,:-1])[:,list(sbs_dict[name][k])]
        y = np.array(df.iloc[:,-1])
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    elif name == 'Linear':
        k = 8
        X = np.asarray(df.iloc[:,:-1])[:,list(sbs_dict[name][k])]
        y = np.array(df.iloc[:,-1])
        # Note: scaling before the split leaks test-set statistics into training;
        # fit the scalers on X_train/y_train only if that matters here.
        X_std = StandardScaler().fit_transform(X)
        y_std = StandardScaler().fit_transform(y.reshape(-1,1)).ravel()
        X_train, X_test, y_train, y_test = train_test_split(X_std, y_std, test_size=0.25, random_state=42)
    elif name == 'Gradient Boost':
        # NOTE: k is not set in this branch; it carries over from whichever
        # branch ran last. Set it explicitly if a specific SBS subset is intended.
        X = np.asarray(df.iloc[:,:-1])[:,list(sbs_dict[name][k])]
        y = np.array(df.iloc[:,-1])
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    trained_model = model.fit(X_train, y_train)
    with open('pickles/trained_model2_{}.plk'.format(name),'wb') as f:
        pickle.dump(trained_model, f)
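A round-trip check that the pickled models load and score; note the file names inherit the dict keys verbatim, including the space in 'Gradient Boost'. A sketch, reusing whatever X/y the loop left behind (dict-order dependent, so match the model name to that split); cross_val_score, imported above, gives a less split-dependent estimate:
In [ ]:
# Sketch: reload one trained model and re-score it. X_test/y_test here belong to
# the loop's last iteration, so pick the matching model name.
with open('pickles/trained_model2_Gradient Boost.plk','rb') as f:
    gb_loaded = pickle.load(f)
print(gb_loaded.score(X_test, y_test))
print(cross_val_score(gb_loaded, X, y, cv=3).mean())  # refits clones on the full data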