Random Forest regression model


In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.ensemble import RandomForestRegressor as rf
from datetime import date
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, median_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

In [2]:
y_data = pd.read_csv('../../yield_w_srad.csv')
y_data.columns


Out[2]:
Index([u'planted', u'site', u'plot', u'rep', u'id', u'dth', u'lodging',
       u'height', u'grain_type', u'moisture', u'year', u'county',
       u'trial_type', u'yield_lb', u'head_date', u'pi', u'vg_avg_tmin',
       u'vg_avg_tmax', u'fl_avg_tmin', u'fl_avg_tmax', u'gf_avg_tmin',
       u'gf_avg_tmax', u'se_avg_tmin', u'se_avg_tmax', u'veg_srad',
       u'rep_srad', u'gf_srad'],
      dtype='object')

In [3]:
y_data.head()


Out[3]:
planted site plot rep id dth lodging height grain_type moisture ... vg_avg_tmax fl_avg_tmin fl_avg_tmax gf_avg_tmin gf_avg_tmax se_avg_tmin se_avg_tmax veg_srad rep_srad gf_srad
0 1995-05-12 RES 349.0 1 CM101 78 15.0 85.0 S 16.0 ... 28.114548 15.854166 32.492108 14.893976 33.887337 14.025759 30.655951 78968 36409 49842
1 1995-05-12 RES 374.0 2 CM101 78 15.0 80.0 S 15.9 ... 28.114548 15.854166 32.492108 14.893976 33.887337 14.025759 30.655951 78968 36409 49842
2 1995-05-12 RES 8.0 1 M203 87 60.0 105.0 M 23.0 ... 28.114548 16.097288 33.369639 13.492801 32.751324 13.859148 30.760579 78968 49218 47679
3 1995-05-12 RES 1096.0 1 94Y589 87 10.0 95.0 M 18.7 ... 28.114548 16.097288 33.369639 13.492801 32.751324 13.859148 30.760579 78968 49218 47679
4 1995-05-12 RES 350.0 1 94Y383 86 1.0 85.0 M 19.9 ... 28.114548 16.101021 33.400215 13.581463 32.705987 13.873189 30.725791 78968 47809 47944

5 rows × 27 columns


In [4]:
#drop columns that are not required
drop_col = ['year', 'county', 'site', 'id']
y_data.drop(drop_col, axis = 1, inplace = True)
y_data['planted'] = pd.to_datetime(y_data['planted'], 
                                   format = "%Y-%m-%d", infer_datetime_format = True)
y_data['head_date'] = pd.to_datetime(y_data['head_date'], 
                                     format = "%Y-%m-%d", infer_datetime_format = True)
y_data['planted_day'] = y_data['planted'].apply(lambda t: t.day)
y_data['head_day'] = y_data['head_date'].apply(lambda t: t.day)

In [35]:
plt.scatter(y_data['planted'].dt.year, y_data['yield_lb'], s = 200, alpha = .3)
plt.scatter(y_data['head_date'].dt.year, y_data['yield_lb'], c = 'b', alpha = .1, label = 'head', s = 40)
plt.xlabel('year')
plt.ylabel('yield')
plt.title('yield_lb change over planted and head time period');



In [6]:
y_data = y_data.dropna()
y_data['trial_type'] = pd.factorize(y_data['trial_type'])[0]
y_data['grain_type'] = pd.factorize(y_data['grain_type'])[0]

y_split = y_data.copy().drop(['planted', 'head_date', 'planted_day', 'head_day'], axis = 1)
y = y_split['yield_lb']
x = y_split.drop(['yield_lb'], axis = 1)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=5)
x_train.describe()


Out[6]:
plot rep dth lodging height grain_type moisture trial_type pi vg_avg_tmin vg_avg_tmax fl_avg_tmin fl_avg_tmax gf_avg_tmin gf_avg_tmax se_avg_tmin se_avg_tmax veg_srad rep_srad gf_srad
count 4818.000000 4818.000000 4818.000000 4818.000000 4818.000000 4818.000000 4818.000000 4818.000000 4818.000000 4818.000000 4818.000000 4818.000000 4818.000000 4818.000000 4818.000000 4818.000000 4818.000000 4818.000000 4818.000000 4818.000000
mean 384.747198 2.248028 84.462433 23.527293 94.534747 1.082814 16.837813 0.934205 49.563097 13.657877 30.563047 15.400812 33.635437 14.142866 32.810042 14.217699 31.930401 73207.497302 51698.921129 46269.671856
std 621.639330 1.094706 6.629958 32.028831 7.405346 0.714039 3.206468 0.799385 2.799334 0.904879 1.527051 1.170603 1.417476 1.032498 1.092075 0.674924 0.842727 3981.297685 8783.983977 2785.485142
min 1.000000 1.000000 71.000000 0.000000 70.000000 0.000000 6.190000 0.000000 46.000000 12.123631 27.607502 12.787868 29.037816 8.464561 26.182818 12.806541 29.889453 67738.000000 31346.000000 30415.000000
25% 44.000000 1.000000 80.000000 1.000000 89.916000 1.000000 14.900000 0.000000 47.000000 12.806615 29.458961 14.314851 32.582901 13.363344 32.254054 13.633277 31.521658 70102.000000 45634.000000 44660.000000
50% 89.000000 2.000000 84.000000 5.000000 95.000000 1.000000 16.600000 1.000000 49.000000 13.766445 30.525245 15.222422 33.559038 14.048884 32.945452 14.408095 32.060290 72664.000000 51075.000000 46484.000000
75% 370.750000 3.000000 88.000000 40.000000 100.000000 2.000000 18.325000 2.000000 51.000000 14.430488 31.633240 16.502931 34.453982 15.205106 33.616870 14.792181 32.482795 75758.000000 56510.250000 48143.000000
max 2664.000000 4.000000 124.000000 100.000000 125.999999 2.000000 35.300000 2.000000 57.000000 15.187658 33.389713 18.403577 37.108562 15.895720 34.768822 15.243826 33.562510 83871.000000 106563.000000 52828.000000

Random Forest training


In [7]:
# # 5. Declare data preprocessing steps
# pipeline = make_pipeline(preprocessing.StandardScaler(), 
#                          RandomForestRegressor(n_estimators=100))
 
# # 6. Declare hyperparameters to tune
# hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
#                   'randomforestregressor__max_depth': [None, 5, 3, 1]}
 
# # 7. Tune model using cross-validation pipeline
# clf = GridSearchCV(pipeline, hyperparameters, cv=10)
# #  \
# clf.fit(x_train, y_train)
 
# # 8. Refit on the entire training set
# # No additional code needed if clf.refit == True (default is True)
 
# # 9. Evaluate model pipeline on test data
# pred = clf.predict(x_test)
# print r2_score(y_test, pred)
# print mean_squared_error(y_test, pred)

In [8]:
for leaf_size in [1,5,10,50]:
    random_forest = rf(n_estimators=100, oob_score= False, random_state = 5, min_samples_leaf=leaf_size)
    random_forest.fit(x_train, y_train)
    test_preds = random_forest.predict(x_test)
    r2 = r2_score(y_test, test_preds)
    mse = mean_squared_error(y_test, test_preds)
    print 'r2 = {:.2f}, MSE = {}'.format(r2, mse)


r2 = 0.57, MSE = 776332.152478
r2 = 0.56, MSE = 802581.838596
r2 = 0.53, MSE = 856870.915549
r2 = 0.43, MSE = 1030202.34458

In [9]:
random_forest = rf(n_estimators=100, oob_score= False, random_state = 5, min_samples_leaf=1)
random_forest.fit(preprocessing.scale(x_train), y_train)


Out[9]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=5,
           verbose=0, warm_start=False)

In [18]:
importance = random_forest.feature_importances_
importance = pd.DataFrame(importance, index=x_train.columns, 
                          columns=["Importance"])

importance["Std"] = np.std([tree.feature_importances_
                            for tree in random_forest.estimators_], axis=0)

x = range(importance.shape[0])
importance = importance.sort_values(by = 'Importance', ascending=True)
y = importance.iloc[:, 0]
yerr = importance.iloc[:, 1]

plt.figure(figsize=(10, 6))
plt.barh(x, y, yerr=yerr, align="center", )
plt.yticks(x, importance.index, rotation = 'horizontal', size = 10)
plt.title('Feature Importance for random forest regression model', size = 12);


Random Forest testing


In [19]:
test_preds = random_forest.predict(preprocessing.scale(x_test))
r2 = r2_score(y_test, test_preds)
mse = mean_squared_error(y_test, test_preds)
print 'MSE = {}, r2 = {}'.format(mse, r2)

plt.scatter(y_test, test_preds, label="r^2= {:.2f}".format(r2),)
plt.legend(loc="lower right")
plt.title("RandomForest Regression with scikit-learn", size = 10);


MSE = 838277.339147, r2 = 0.539338343059

In [30]:
plt.hist(y_test, bins = 50, );
print 'Actual yield_lb test data'
print '#########################'
print y_test.describe()


Actual yield_lb test data
#########################
count     2066.000000
mean      9333.310761
std       1349.298262
min       4659.299163
25%       8449.491404
50%       9394.321302
75%      10262.714512
max      13744.665767
Name: yield_lb, dtype: float64

In [32]:
plt.hist(test_preds, bins = 50, );
print 'Predicted yield_lb test data'
print '#########################'
print pd.Series(test_preds).describe()


Predicted yield_lb test data
#########################
count     2066.000000
mean      9345.768032
std        918.657304
min       5513.259791
25%       8739.708278
50%       9319.066509
75%       9997.916390
max      11575.220000
dtype: float64

Random forest model without "PLOT"


In [37]:
xtrainnew = x_train.drop('plot', axis = 1)
xtestnew  = x_test.drop('plot', axis = 1)

In [38]:
for leaf_size in [1,5,10,50]:
    random_forest = rf(n_estimators=100, oob_score= False, random_state = 5, min_samples_leaf=leaf_size)
    random_forest.fit(xtrainnew, y_train)
    test_preds = random_forest.predict(xtestnew)
    r2 = r2_score(y_test, test_preds)
    mse = mean_squared_error(y_test, test_preds)
    print 'r2 = {:.2f}, MSE = {}'.format(r2, mse)


r2 = 0.52, MSE = 864631.302237
r2 = 0.52, MSE = 881103.321669
r2 = 0.49, MSE = 925887.966746
r2 = 0.40, MSE = 1089317.31498

In [39]:
random_forest = rf(n_estimators=100, oob_score= False, random_state = 5, min_samples_leaf=1)
random_forest.fit(preprocessing.scale(xtrainnew), y_train)

importance = random_forest.feature_importances_
importance = pd.DataFrame(importance, index=xtrainnew.columns, 
                          columns=["Importance"])

importance["Std"] = np.std([tree.feature_importances_
                            for tree in random_forest.estimators_], axis=0)

x = range(importance.shape[0])
importance = importance.sort_values(by = 'Importance', ascending=True)
y = importance.iloc[:, 0]
yerr = importance.iloc[:, 1]

plt.figure(figsize=(10, 6))
plt.barh(x, y, yerr=yerr, align="center", )
plt.yticks(x, importance.index, rotation = 'horizontal', size = 10)
plt.title('Feature Importance for random forest regression model', size = 12);



In [40]:
test_preds = random_forest.predict(preprocessing.scale(xtestnew))
r2 = r2_score(y_test, test_preds)
mse = mean_squared_error(y_test, test_preds)
print 'MSE = {}, r2 = {}'.format(mse, r2)

plt.scatter(y_test, test_preds, label="r^2= {:.2f}".format(r2),)
plt.legend(loc="lower right")
plt.title("RandomForest Regression with scikit-learn", size = 10);


MSE = 903927.734153, r2 = 0.50326123787

In [41]:
plt.hist(y_test, bins = 50, );
print 'Actual yield_lb test data'
print '#########################'
print y_test.describe()


Actual yield_lb test data
#########################
count     2066.000000
mean      9333.310761
std       1349.298262
min       4659.299163
25%       8449.491404
50%       9394.321302
75%      10262.714512
max      13744.665767
Name: yield_lb, dtype: float64

In [42]:
plt.hist(test_preds, bins = 50, );
print 'Predicted yield_lb test data'
print '#########################'
print pd.Series(test_preds).describe()


Predicted yield_lb test data
#########################
count     2066.000000
mean      9361.145035
std        903.512342
min       6020.139615
25%       8764.183319
50%       9328.280977
75%       9979.842252
max      11838.047777
dtype: float64

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: