Random Forest regression model
In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.ensemble import RandomForestRegressor as rf
from datetime import date
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, median_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
In [2]:
# Read the yield CSV into `y_data`, then display its column labels so the
# available fields are visible before any cleaning is applied.
yield_csv_path = '../../yield_w_srad.csv'
y_data = pd.read_csv(yield_csv_path)
y_data.columns
Out[2]:
In [3]:
# Preview the first five rows of the raw frame.
y_data.head(n=5)
Out[3]:
In [4]:
# Drop columns that are not required as model inputs.
# Reassign instead of using inplace=True so the cell has one explicit output.
drop_col = ['year', 'county', 'site', 'id']
y_data = y_data.drop(drop_col, axis=1)

# Parse both date columns with the explicit ISO format. `infer_datetime_format`
# is not passed: it is redundant when `format` is given and is deprecated in
# recent pandas releases.
for date_col in ['planted', 'head_date']:
    y_data[date_col] = pd.to_datetime(y_data[date_col], format="%Y-%m-%d")

# Day-of-month of each date, via the vectorised .dt accessor rather than a
# per-row Python lambda.
y_data['planted_day'] = y_data['planted'].dt.day
y_data['head_day'] = y_data['head_date'].dt.day
In [35]:
# Yield against the calendar year of planting (large markers) and of heading
# (small markers). Both series are labelled and a legend is drawn; without
# plt.legend() the 'head' label was never shown and the series could not be
# told apart.
plt.scatter(y_data['planted'].dt.year, y_data['yield_lb'],
            s=200, alpha=.3, label='planted')
plt.scatter(y_data['head_date'].dt.year, y_data['yield_lb'],
            c='b', alpha=.1, s=40, label='head')
plt.xlabel('year')
plt.ylabel('yield')
plt.legend()
plt.title('yield_lb change over planted and head time period');
In [6]:
# Remove rows with any missing value. The explicit .copy() makes the result an
# independent frame, so the column assignments below cannot trigger pandas'
# SettingWithCopyWarning (older pandas flags the dropna() result as a
# possible copy of the original).
y_data = y_data.dropna().copy()

# Replace each categorical column with integer codes (order of first appearance).
for cat_col in ['trial_type', 'grain_type']:
    y_data[cat_col] = pd.factorize(y_data[cat_col])[0]

# Date-derived columns are excluded from the model inputs. drop() already
# returns a new frame, so no extra copy is needed.
y_split = y_data.drop(['planted', 'head_date', 'planted_day', 'head_day'], axis=1)

# Target is yield_lb; every remaining column is a feature.
y = y_split['yield_lb']
x = y_split.drop(['yield_lb'], axis=1)

# 70/30 train/test split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=5)
x_train.describe()
Out[6]:
Random Forest training
In [7]:
# Disabled experiment, kept for reference only -- nothing in this cell executes.
# It sketches a cross-validated grid search over a StandardScaler +
# RandomForestRegressor pipeline, scored on the held-out test split.
#
# # 5. Declare data preprocessing steps
# pipeline = make_pipeline(preprocessing.StandardScaler(),
#                          RandomForestRegressor(n_estimators=100))
# # 6. Declare hyperparameters to tune
# hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
#                     'randomforestregressor__max_depth': [None, 5, 3, 1]}
# # 7. Tune model using cross-validation pipeline
# clf = GridSearchCV(pipeline, hyperparameters, cv=10)
# clf.fit(x_train, y_train)
# # 8. Refit on the entire training set
# # No additional code needed if clf.refit == True (default is True)
# # 9. Evaluate model pipeline on test data
# pred = clf.predict(x_test)
# print(r2_score(y_test, pred))
# print(mean_squared_error(y_test, pred))
In [8]:
# Fit one forest per candidate min_samples_leaf value and report its R^2 and
# MSE on the held-out split. Output lines appear in the same order as the
# candidate list below.
# NOTE(review): the candidates are compared on the test split itself, so the
# chosen setting is tuned to that split -- consider cross-validation on the
# training data instead.
for leaf_size in [1, 5, 10, 50]:
    random_forest = rf(n_estimators=100, oob_score=False, random_state=5,
                       min_samples_leaf=leaf_size)
    random_forest.fit(x_train, y_train)
    test_preds = random_forest.predict(x_test)
    r2 = r2_score(y_test, test_preds)
    mse = mean_squared_error(y_test, test_preds)
    # print() with a single argument works on both Python 2 and Python 3.
    print('r2 = {:.2f}, MSE = {}'.format(r2, mse))
In [9]:
# Final model: 100 trees, min_samples_leaf=1, fixed seed, trained on the
# standardised training features.
# NOTE(review): anything later passed to predict() has to be standardised with
# these same training-set statistics to be consistent with this fit.
x_train_scaled = preprocessing.scale(x_train)
random_forest = rf(n_estimators=100, min_samples_leaf=1,
                   random_state=5, oob_score=False)
random_forest.fit(x_train_scaled, y_train)
Out[9]:
In [18]:
# Table of per-feature importance from the fitted forest, with the spread
# (standard deviation) of that importance across the individual trees.
importance = pd.DataFrame(random_forest.feature_importances_,
                          index=x_train.columns,
                          columns=["Importance"])
importance["Std"] = np.std([tree.feature_importances_
                            for tree in random_forest.estimators_], axis=0)

# Sort ascending so the most important feature ends up at the top of the chart.
importance = importance.sort_values(by='Importance', ascending=True)

# Plot-local names are used on purpose: the previous `x` / `y` assignments
# silently overwrote the feature matrix and target defined by the split cell.
bar_positions = range(importance.shape[0])

plt.figure(figsize=(10, 6))
# The bars are horizontal, so the uncertainty belongs on the x axis (xerr);
# yerr drew the error bars perpendicular to the bars.
plt.barh(bar_positions, importance["Importance"],
         xerr=importance["Std"], align="center")
plt.yticks(bar_positions, importance.index, rotation='horizontal', size=10)
plt.title('Feature Importance for random forest regression model', size=12);
In [19]:
# Standardise the test features with the TRAINING set's mean and standard
# deviation. The model was fit on preprocessing.scale(x_train), which applies
# the same transform as a StandardScaler fitted on x_train. Calling
# preprocessing.scale(x_test) instead would rescale the test set with its own
# statistics, so the trees' split thresholds would no longer line up.
train_scaler = preprocessing.StandardScaler().fit(x_train)
test_preds = random_forest.predict(train_scaler.transform(x_test))

r2 = r2_score(y_test, test_preds)
mse = mean_squared_error(y_test, test_preds)
# print() with a single argument works on both Python 2 and Python 3.
print('MSE = {}, r2 = {}'.format(mse, r2))

# Actual vs. predicted yield on the held-out split.
plt.scatter(y_test, test_preds, label="r^2= {:.2f}".format(r2))
plt.xlabel('actual yield_lb')
plt.ylabel('predicted yield_lb')
plt.legend(loc="lower right")
plt.title("RandomForest Regression with scikit-learn", size=10);
In [30]:
# Distribution of the actual target values in the test split, followed by its
# summary statistics. print() is called as a function so the cell also runs on
# Python 3 (the print statement form is a SyntaxError there).
plt.hist(y_test, bins=50);
print('Actual yield_lb test data')
print('#########################')
print(y_test.describe())
In [32]:
# Distribution of the model's predictions for the test split, followed by its
# summary statistics. print() is called as a function so the cell also runs on
# Python 3 (the print statement form is a SyntaxError there).
plt.hist(test_preds, bins=50);
print('Predicted yield_lb test data')
print('#########################')
# test_preds is wrapped in a Series only to get describe().
print(pd.Series(test_preds).describe())
Random Forest model without the "plot" feature
In [37]:
# Feature matrices with the 'plot' column removed; the original train/test
# frames are left untouched.
excluded_features = ['plot']
xtrainnew = x_train.drop(excluded_features, axis='columns')
xtestnew = x_test.drop(excluded_features, axis='columns')
In [38]:
# Same min_samples_leaf sweep as for the full feature set, but on the feature
# matrices without 'plot'. Output lines appear in the same order as the
# candidate list below.
# NOTE(review): the candidates are compared on the test split itself, so the
# chosen setting is tuned to that split -- consider cross-validation on the
# training data instead.
for leaf_size in [1, 5, 10, 50]:
    random_forest = rf(n_estimators=100, oob_score=False, random_state=5,
                       min_samples_leaf=leaf_size)
    random_forest.fit(xtrainnew, y_train)
    test_preds = random_forest.predict(xtestnew)
    r2 = r2_score(y_test, test_preds)
    mse = mean_squared_error(y_test, test_preds)
    # print() with a single argument works on both Python 2 and Python 3.
    print('r2 = {:.2f}, MSE = {}'.format(r2, mse))
In [39]:
# Refit the final configuration (100 trees, min_samples_leaf=1, fixed seed) on
# the standardised training features without 'plot'.
random_forest = rf(n_estimators=100, oob_score=False, random_state=5, min_samples_leaf=1)
random_forest.fit(preprocessing.scale(xtrainnew), y_train)

# Table of per-feature importance from the fitted forest, with the spread
# (standard deviation) of that importance across the individual trees.
importance = pd.DataFrame(random_forest.feature_importances_,
                          index=xtrainnew.columns,
                          columns=["Importance"])
importance["Std"] = np.std([tree.feature_importances_
                            for tree in random_forest.estimators_], axis=0)

# Sort ascending so the most important feature ends up at the top of the chart.
importance = importance.sort_values(by='Importance', ascending=True)

# Plot-local names are used on purpose: the previous `x` / `y` assignments
# silently overwrote the feature matrix and target defined by the split cell.
bar_positions = range(importance.shape[0])

plt.figure(figsize=(10, 6))
# The bars are horizontal, so the uncertainty belongs on the x axis (xerr);
# yerr drew the error bars perpendicular to the bars.
plt.barh(bar_positions, importance["Importance"],
         xerr=importance["Std"], align="center")
plt.yticks(bar_positions, importance.index, rotation='horizontal', size=10)
plt.title('Feature Importance for random forest regression model', size=12);
In [40]:
# Standardise the test features with the TRAINING set's mean and standard
# deviation. The model was fit on preprocessing.scale(xtrainnew), which applies
# the same transform as a StandardScaler fitted on xtrainnew. Calling
# preprocessing.scale(xtestnew) instead would rescale the test set with its own
# statistics, so the trees' split thresholds would no longer line up.
train_scaler = preprocessing.StandardScaler().fit(xtrainnew)
test_preds = random_forest.predict(train_scaler.transform(xtestnew))

r2 = r2_score(y_test, test_preds)
mse = mean_squared_error(y_test, test_preds)
# print() with a single argument works on both Python 2 and Python 3.
print('MSE = {}, r2 = {}'.format(mse, r2))

# Actual vs. predicted yield on the held-out split.
plt.scatter(y_test, test_preds, label="r^2= {:.2f}".format(r2))
plt.xlabel('actual yield_lb')
plt.ylabel('predicted yield_lb')
plt.legend(loc="lower right")
plt.title("RandomForest Regression with scikit-learn", size=10);
In [41]:
# Distribution of the actual target values in the test split, followed by its
# summary statistics. print() is called as a function so the cell also runs on
# Python 3 (the print statement form is a SyntaxError there).
plt.hist(y_test, bins=50);
print('Actual yield_lb test data')
print('#########################')
print(y_test.describe())
In [42]:
# Distribution of the model's predictions for the test split, followed by its
# summary statistics. print() is called as a function so the cell also runs on
# Python 3 (the print statement form is a SyntaxError there).
plt.hist(test_preds, bins=50);
print('Predicted yield_lb test data')
print('#########################')
# test_preds is wrapped in a Series only to get describe().
print(pd.Series(test_preds).describe())
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: