In [76]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, median_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import preprocessing
In [77]:
y_data = pd.read_csv('../data/yield_w_srad.csv')
y_data.columns
Out[77]:
In [78]:
y_data.head()
Out[78]:
In [79]:
# Drop columns that are not required
drop_col = ['year', 'county', 'site', 'id']
y_data.drop(drop_col, axis=1, inplace=True)
# Parse the two date columns; the format is explicit, so no inference is needed
y_data['planted'] = pd.to_datetime(y_data['planted'], format="%Y-%m-%d")
y_data['head_date'] = pd.to_datetime(y_data['head_date'], format="%Y-%m-%d")
# Day-of-month features derived from the dates
y_data['planted_day'] = y_data['planted'].dt.day
y_data['head_day'] = y_data['head_date'].dt.day
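Note that `.dt.day` extracts only the day of the month, which discards the month entirely. If a seasonal signal is wanted, day-of-year would preserve it; a minimal sketch (illustrative only, the cells below keep the original features):
In [ ]:
# Day-of-year alternative to the day-of-month features above (not used below)
planted_doy = y_data['planted'].dt.dayofyear
head_doy = y_data['head_date'].dt.dayofyear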
In [136]:
# Prepare the data
y_data = y_data.dropna()
# Encode the two categorical columns as integer codes
y_data['trial_type'] = pd.factorize(y_data['trial_type'])[0]
y_data['grain_type'] = pd.factorize(y_data['grain_type'])[0]
y_split = y_data.copy().drop(['planted', 'head_date', 'planted_day', 'head_day'], axis=1)
y = y_split['yield_lb']
x = y_split.drop(['yield_lb'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=5)
# Drop the 'plot' feature
xtrain_new = x_train.drop('plot', axis=1)
xtest_new = x_test.drop('plot', axis=1)
# Scale the data first
xtrainnew = preprocessing.scale(xtrain_new)
xtestnew = preprocessing.scale(xtest_new)
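`preprocessing.scale` standardizes the train and test sets independently, so they end up on slightly different scales. A more consistent pattern (a sketch, not what was run above) fits a `StandardScaler` on the training data only and reuses it for the test data:
In [ ]:
from sklearn.preprocessing import StandardScaler

# Learn mean/std from the training split only, then apply the same transform to both
scaler = StandardScaler().fit(xtrain_new)
xtrainnew = scaler.transform(xtrain_new)
xtestnew = scaler.transform(xtest_new)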
In [73]:
rng = np.random.RandomState(1)
In [109]:
# Function to generate a boosted-tree regression model
def get_model(model_depth=5, no_estimators=100, random_state=rng):
    model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=model_depth),
                              n_estimators=no_estimators, random_state=random_state)
    return model
In [121]:
depths = range(1, 16)
validations_r2 = []
trainings_r2 = []
for depth in depths:
    # Get the model
    reg_model = get_model(model_depth=depth)
    # Fit on the training data
    reg_model.fit(xtrainnew, y_train)
    # r2 scores on the training and held-out data
    train_r2 = reg_model.score(xtrainnew, y_train)
    valid_r2 = reg_model.score(xtestnew, y_test)
    trainings_r2.append(train_r2)
    validations_r2.append(valid_r2)
In [122]:
plt.plot(depths, trainings_r2, label='train r2')
plt.plot(depths, validations_r2, label='validation r2')
plt.legend(loc='upper left')
plt.xlabel('max depth of model')
plt.ylabel('r2 score')
plt.show()
It looks like depth = 12 is the best choice, so fix max_depth at 12 and tune the number of estimators next.
In [123]:
estimators = [1, 10, 20, 30, 40, 50, 60, 70, 80, 100]
validations_r2_e = []
trainings_r2_e = []
for e in estimators:
    # Get the model with the chosen depth
    reg_model = get_model(model_depth=12, no_estimators=e)
    # Fit on the training data
    reg_model.fit(xtrainnew, y_train)
    # r2 scores on the training and held-out data
    train_r2 = reg_model.score(xtrainnew, y_train)
    valid_r2 = reg_model.score(xtestnew, y_test)
    trainings_r2_e.append(train_r2)
    validations_r2_e.append(valid_r2)
In [124]:
plt.plot(estimators, trainings_r2_e, label='train r2')
plt.plot(estimators, validations_r2_e, label='validation r2')
plt.legend(loc='lower right')
plt.xlabel('Number of estimators in the model')
plt.ylabel('r2 score')
plt.show()
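Depth and the number of estimators were tuned one at a time above; a joint cross-validated search would be more thorough. A sketch using scikit-learn's `GridSearchCV` (the grid values are illustrative, and in scikit-learn 1.2+ the nested parameter is `estimator__max_depth` rather than `base_estimator__max_depth`):
In [ ]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'base_estimator__max_depth': [8, 10, 12, 14],
    'n_estimators': [50, 100],
}
search = GridSearchCV(AdaBoostRegressor(DecisionTreeRegressor(), random_state=rng),
                      param_grid, cv=5, scoring='r2')
search.fit(xtrainnew, y_train)
print(search.best_params_, search.best_score_)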
In [129]:
final_model = get_model(model_depth=12, no_estimators=100)
In [130]:
final_model.fit(xtrainnew, y_train)
Out[130]:
In [131]:
training_r2 = final_model.score(xtrainnew, y_train)
print('training r2 = {}'.format(training_r2))
In [132]:
valid_r2 = final_model.score(xtestnew, y_test)
print('validation r2 = {}'.format(valid_r2))
In [133]:
test_preds = final_model.predict(xtestnew)
r2 = r2_score(y_test, test_preds)
mse = mean_squared_error(y_test, test_preds)
print('MSE = {}, r2 = {}'.format(mse, r2))
plt.scatter(y_test, test_preds, label="r^2 = {:.2f}".format(r2))
plt.legend(loc="lower right")
plt.title("AdaBoost regression with scikit-learn", size=10)
plt.show()
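An identity line makes the predicted-vs-actual scatter easier to read; a minimal sketch reusing `y_test` and `test_preds` from the cell above:
In [ ]:
lims = [y_test.min(), y_test.max()]
plt.scatter(y_test, test_preds, alpha=0.5)
plt.plot(lims, lims, 'k--', label='perfect prediction')  # y = x reference
plt.xlabel('actual yield_lb')
plt.ylabel('predicted yield_lb')
plt.legend(loc='lower right')
plt.show()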
In [137]:
importance = final_model.feature_importances_
importance = pd.DataFrame(importance, index=xtrain_new.columns,
                          columns=["Importance"])
importance["Std"] = np.std([tree.feature_importances_
                            for tree in final_model.estimators_], axis=0)
x = range(importance.shape[0])
importance = importance.sort_values(by='Importance', ascending=True)
y = importance.iloc[:, 0]
err = importance.iloc[:, 1]
plt.figure(figsize=(10, 6))
plt.barh(x, y, xerr=err, align="center")  # horizontal bars, so error bars go along x
plt.yticks(x, importance.index, rotation='horizontal', size=10)
plt.title('Feature importance for the AdaBoost regression model', size=12)
plt.show()
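Impurity-based importances can favor high-cardinality numeric features. Permutation importance on the held-out set is a common cross-check; a sketch assuming scikit-learn >= 0.22, where `sklearn.inspection.permutation_importance` is available:
In [ ]:
from sklearn.inspection import permutation_importance

# Shuffle each feature on the test set and measure the resulting drop in r2
perm = permutation_importance(final_model, xtestnew, y_test,
                              n_repeats=10, random_state=5)
perm_imp = pd.Series(perm.importances_mean, index=xtrain_new.columns)
print(perm_imp.sort_values(ascending=False))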
In [138]:
plt.hist(y_test, bins=50, label='actual', alpha=0.8)
plt.hist(test_preds, bins=50, label='prediction', alpha=0.8)
plt.legend()
plt.show()
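`mean_absolute_error` and `median_absolute_error` were imported above but never used; both are less sensitive to outliers than MSE and make a quick sanity check on the same predictions:
In [ ]:
mae = mean_absolute_error(y_test, test_preds)
medae = median_absolute_error(y_test, test_preds)
print('MAE = {:.1f}, median AE = {:.1f}'.format(mae, medae))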
In [8]:
for leaf_size in [1, 5, 10, 50]:
    random_forest = RandomForestRegressor(n_estimators=100, oob_score=False,
                                          random_state=5, min_samples_leaf=leaf_size)
    random_forest.fit(x_train, y_train)
    test_preds = random_forest.predict(x_test)
    r2 = r2_score(y_test, test_preds)
    mse = mean_squared_error(y_test, test_preds)
    print('r2 = {:.2f}, MSE = {}'.format(r2, mse))
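With `oob_score=True`, each tree's out-of-bag samples provide an r2 estimate without touching the test split; a sketch of the same leaf-size sweep using OOB scoring (all other settings as above):
In [ ]:
for leaf_size in [1, 5, 10, 50]:
    forest = RandomForestRegressor(n_estimators=100, oob_score=True,
                                   random_state=5, min_samples_leaf=leaf_size)
    forest.fit(x_train, y_train)
    print('min_samples_leaf = {}: OOB r2 = {:.2f}'.format(leaf_size, forest.oob_score_))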
In [9]:
random_forest = RandomForestRegressor(n_estimators=100, oob_score=False, random_state=5, min_samples_leaf=1)
random_forest.fit(preprocessing.scale(x_train), y_train)
Out[9]:
In [18]:
importance = random_forest.feature_importances_
importance = pd.DataFrame(importance, index=x_train.columns,
                          columns=["Importance"])
importance["Std"] = np.std([tree.feature_importances_
                            for tree in random_forest.estimators_], axis=0)
x = range(importance.shape[0])
importance = importance.sort_values(by='Importance', ascending=True)
y = importance.iloc[:, 0]
err = importance.iloc[:, 1]
plt.figure(figsize=(10, 6))
plt.barh(x, y, xerr=err, align="center")  # horizontal bars, so error bars go along x
plt.yticks(x, importance.index, rotation='horizontal', size=10)
plt.title('Feature importance for the random forest regression model', size=12);
In [19]:
# Note: the test set is scaled independently here; see the StandardScaler note above
test_preds = random_forest.predict(preprocessing.scale(x_test))
r2 = r2_score(y_test, test_preds)
mse = mean_squared_error(y_test, test_preds)
print('MSE = {}, r2 = {}'.format(mse, r2))
plt.scatter(y_test, test_preds, label="r^2 = {:.2f}".format(r2))
plt.legend(loc="lower right")
plt.title("Random forest regression with scikit-learn", size=10);
In [30]:
plt.hist(y_test, bins=50);
print('Actual yield_lb test data')
print('#########################')
print(y_test.describe())
In [32]:
plt.hist(test_preds, bins=50);
print('Predicted yield_lb test data')
print('#########################')
print(pd.Series(test_preds).describe())
Random forest model without the "plot" feature
In [37]:
xtrainnew = x_train.drop('plot', axis=1)
xtestnew = x_test.drop('plot', axis=1)
In [38]:
for leaf_size in [1, 5, 10, 50]:
    random_forest = RandomForestRegressor(n_estimators=100, oob_score=False,
                                          random_state=5, min_samples_leaf=leaf_size)
    random_forest.fit(xtrainnew, y_train)
    test_preds = random_forest.predict(xtestnew)
    r2 = r2_score(y_test, test_preds)
    mse = mean_squared_error(y_test, test_preds)
    print('r2 = {:.2f}, MSE = {}'.format(r2, mse))
In [39]:
random_forest = RandomForestRegressor(n_estimators=100, oob_score=False, random_state=5, min_samples_leaf=1)
random_forest.fit(preprocessing.scale(xtrainnew), y_train)
importance = random_forest.feature_importances_
importance = pd.DataFrame(importance, index=xtrainnew.columns,
                          columns=["Importance"])
importance["Std"] = np.std([tree.feature_importances_
                            for tree in random_forest.estimators_], axis=0)
x = range(importance.shape[0])
importance = importance.sort_values(by='Importance', ascending=True)
y = importance.iloc[:, 0]
err = importance.iloc[:, 1]
plt.figure(figsize=(10, 6))
plt.barh(x, y, xerr=err, align="center")  # horizontal bars, so error bars go along x
plt.yticks(x, importance.index, rotation='horizontal', size=10)
plt.title('Feature importance for the random forest regression model', size=12);
In [40]:
test_preds = random_forest.predict(preprocessing.scale(xtestnew))
r2 = r2_score(y_test, test_preds)
mse = mean_squared_error(y_test, test_preds)
print('MSE = {}, r2 = {}'.format(mse, r2))
plt.scatter(y_test, test_preds, label="r^2 = {:.2f}".format(r2))
plt.legend(loc="lower right")
plt.title("Random forest regression with scikit-learn", size=10);
In [41]:
plt.hist(y_test, bins=50);
print('Actual yield_lb test data')
print('#########################')
print(y_test.describe())
In [42]:
plt.hist(test_preds, bins=50);
print('Predicted yield_lb test data')
print('#########################')
print(pd.Series(test_preds).describe())