In [20]:
from time import time
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
% matplotlib inline
In [19]:
df = pd.read_csv('data/wheat-2014-supervised.csv')
drop_cols = ['CountyName','State','Date', 'precipTypeIsOther'] + ['Latitude','Longitude']
df.drop(drop_cols,axis=1,inplace=True)
df.dropna(inplace=True)
In [39]:
with open('models/trained_model_Linear.plk','rb') as f:
LR_model = pickle.load(f)
with open('models/trained_model_Random Forest.plk','rb') as f:
RF_model = pickle.load(f)
with open('models/trained_model_Gradient Boost.plk','rb') as f:
GB_model = pickle.load(f)
models = {}
models['Linear'] = LR_model
models['Random Forest'] = RF_model
models['Gradient Boost'] = GB_model
In [44]:
with open('pickles/SBS_feat_set.plk','rb') as f:
sbs_dict = pickle.load(f)
# y_pred = {}
# for name,model in models.items():
# if name == 'Random Forest':
# k = 4
# else:
# k = 8
# X_test = np.matrix(df.ix[:,:-1])[:,list(sbs_dict[name][k])]
# y_true = np.array(df.ix[:,-1])
# y_pred[name] = [model.predict(X_test[i]) for i in range(len(X_test))]
# with open('pickles/predictions.plk','wb') as f:
# pickle.dump(y_pred,f)
In [45]:
with open('pickles/predictions.plk','rb') as f:
y_preds = pickle.load(f)
In [63]:
def regression_analysis(name,y_true,y_pred):
figs,axs = plt.subplots(ncols=2,nrows=1)
figs.set_figwidth(15)
figs.set_figheight(10)
ax = axs[0]
ax.scatter(y_pred,abs(y_true)-abs(y_pred))
ax.plot(np.linspace(-100,100),np.linspace(-100,100)*0, '--r')
if name == 'Linear':
ax.set_xlim([-100,50])
ax.set_ylim([-80,80])
else:
ax.set_xlim([0,100])
ax.set_ylim([-60,60])
ax.set_title('Residual VS. Predicted (MODEL:{})'.format(name))
ax.set_xlabel('Predicted Yield')
ax.set_ylabel('Residual')
ax.grid(True)
ax = axs[1]
ax.scatter(y_true,y_pred)
ax.plot([0,78],[0,78],'--r')
ax.set_xlim([0,80])
ax.set_ylim([0,80])
ax.set_title('Predicted VS. True (MODEL:{})'.format(name))
ax.set_xlabel('True Yield')
ax.set_ylabel('Predicted Yield')
ax.grid(True)
In [64]:
names = ['Linear','Random Forest','Gradient Boost']
for name in names:
y_true = np.array(df.ix[:,-1]).reshape(len(df),1)#[:10000]
y_pred = np.array(y_preds[name])#[:10000]
regression_analysis(name,y_true,y_pred)