Random Forest regression model



In [23]:

    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.ensemble import RandomForestRegressor as rf
from datetime import date
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, median_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor



In [2]:

    
y_data = pd.read_csv('../../yield_w_srad.csv')
y_data.columns









    Out[2]:





Index([u'planted', u'site', u'plot', u'rep', u'id', u'dth', u'lodging',
       u'height', u'grain_type', u'moisture', u'year', u'county',
       u'trial_type', u'yield_lb', u'head_date', u'pi', u'vg_avg_tmin',
       u'vg_avg_tmax', u'fl_avg_tmin', u'fl_avg_tmax', u'gf_avg_tmin',
       u'gf_avg_tmax', u'se_avg_tmin', u'se_avg_tmax', u'veg_srad',
       u'rep_srad', u'gf_srad'],
      dtype='object')



In [3]:

    
y_data.head()









    Out[3]:







  
    
      
      planted
      site
      plot
      rep
      id
      dth
      lodging
      height
      grain_type
      moisture
      ...
      vg_avg_tmax
      fl_avg_tmin
      fl_avg_tmax
      gf_avg_tmin
      gf_avg_tmax
      se_avg_tmin
      se_avg_tmax
      veg_srad
      rep_srad
      gf_srad
    
  
  
    
      0
      1995-05-12
      RES
      349.0
      1
      CM101
      78
      15.0
      85.0
      S
      16.0
      ...
      28.114548
      15.854166
      32.492108
      14.893976
      33.887337
      14.025759
      30.655951
      78968
      36409
      49842
    
    
      1
      1995-05-12
      RES
      374.0
      2
      CM101
      78
      15.0
      80.0
      S
      15.9
      ...
      28.114548
      15.854166
      32.492108
      14.893976
      33.887337
      14.025759
      30.655951
      78968
      36409
      49842
    
    
      2
      1995-05-12
      RES
      8.0
      1
      M203
      87
      60.0
      105.0
      M
      23.0
      ...
      28.114548
      16.097288
      33.369639
      13.492801
      32.751324
      13.859148
      30.760579
      78968
      49218
      47679
    
    
      3
      1995-05-12
      RES
      1096.0
      1
      94Y589
      87
      10.0
      95.0
      M
      18.7
      ...
      28.114548
      16.097288
      33.369639
      13.492801
      32.751324
      13.859148
      30.760579
      78968
      49218
      47679
    
    
      4
      1995-05-12
      RES
      350.0
      1
      94Y383
      86
      1.0
      85.0
      M
      19.9
      ...
      28.114548
      16.101021
      33.400215
      13.581463
      32.705987
      13.873189
      30.725791
      78968
      47809
      47944
    
  

5 rows × 27 columns



In [4]:

    
#drop columns that are not required
drop_col = ['year', 'county', 'site', 'id']
y_data.drop(drop_col, axis = 1, inplace = True)
y_data['planted'] = pd.to_datetime(y_data['planted'], 
                                   format = "%Y-%m-%d", infer_datetime_format = True)
y_data['head_date'] = pd.to_datetime(y_data['head_date'], 
                                     format = "%Y-%m-%d", infer_datetime_format = True)
y_data['planted_day'] = y_data['planted'].apply(lambda t: t.day)
y_data['head_day'] = y_data['head_date'].apply(lambda t: t.day)



In [35]:

    
plt.scatter(y_data['planted'].dt.year, y_data['yield_lb'], s = 200, alpha = .3)
plt.scatter(y_data['head_date'].dt.year, y_data['yield_lb'], c = 'b', alpha = .1, label = 'head', s = 40)
plt.xlabel('year')
plt.ylabel('yield')
plt.title('yield_lb change over planted and head time period');



In [6]:

    
y_data = y_data.dropna()
y_data['trial_type'] = pd.factorize(y_data['trial_type'])[0]
y_data['grain_type'] = pd.factorize(y_data['grain_type'])[0]

y_split = y_data.copy().drop(['planted', 'head_date', 'planted_day', 'head_day'], axis = 1)
y = y_split['yield_lb']
x = y_split.drop(['yield_lb'], axis = 1)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=5)
x_train.describe()









    Out[6]:







  
    
      
      plot
      rep
      dth
      lodging
      height
      grain_type
      moisture
      trial_type
      pi
      vg_avg_tmin
      vg_avg_tmax
      fl_avg_tmin
      fl_avg_tmax
      gf_avg_tmin
      gf_avg_tmax
      se_avg_tmin
      se_avg_tmax
      veg_srad
      rep_srad
      gf_srad
    
  
  
    
      count
      4818.000000
      4818.000000
      4818.000000
      4818.000000
      4818.000000
      4818.000000
      4818.000000
      4818.000000
      4818.000000
      4818.000000
      4818.000000
      4818.000000
      4818.000000
      4818.000000
      4818.000000
      4818.000000
      4818.000000
      4818.000000
      4818.000000
      4818.000000
    
    
      mean
      384.747198
      2.248028
      84.462433
      23.527293
      94.534747
      1.082814
      16.837813
      0.934205
      49.563097
      13.657877
      30.563047
      15.400812
      33.635437
      14.142866
      32.810042
      14.217699
      31.930401
      73207.497302
      51698.921129
      46269.671856
    
    
      std
      621.639330
      1.094706
      6.629958
      32.028831
      7.405346
      0.714039
      3.206468
      0.799385
      2.799334
      0.904879
      1.527051
      1.170603
      1.417476
      1.032498
      1.092075
      0.674924
      0.842727
      3981.297685
      8783.983977
      2785.485142
    
    
      min
      1.000000
      1.000000
      71.000000
      0.000000
      70.000000
      0.000000
      6.190000
      0.000000
      46.000000
      12.123631
      27.607502
      12.787868
      29.037816
      8.464561
      26.182818
      12.806541
      29.889453
      67738.000000
      31346.000000
      30415.000000
    
    
      25%
      44.000000
      1.000000
      80.000000
      1.000000
      89.916000
      1.000000
      14.900000
      0.000000
      47.000000
      12.806615
      29.458961
      14.314851
      32.582901
      13.363344
      32.254054
      13.633277
      31.521658
      70102.000000
      45634.000000
      44660.000000
    
    
      50%
      89.000000
      2.000000
      84.000000
      5.000000
      95.000000
      1.000000
      16.600000
      1.000000
      49.000000
      13.766445
      30.525245
      15.222422
      33.559038
      14.048884
      32.945452
      14.408095
      32.060290
      72664.000000
      51075.000000
      46484.000000
    
    
      75%
      370.750000
      3.000000
      88.000000
      40.000000
      100.000000
      2.000000
      18.325000
      2.000000
      51.000000
      14.430488
      31.633240
      16.502931
      34.453982
      15.205106
      33.616870
      14.792181
      32.482795
      75758.000000
      56510.250000
      48143.000000
    
    
      max
      2664.000000
      4.000000
      124.000000
      100.000000
      125.999999
      2.000000
      35.300000
      2.000000
      57.000000
      15.187658
      33.389713
      18.403577
      37.108562
      15.895720
      34.768822
      15.243826
      33.562510
      83871.000000
      106563.000000
      52828.000000

Random Forest training



In [7]:

    
# # 5. Declare data preprocessing steps
# pipeline = make_pipeline(preprocessing.StandardScaler(), 
#                          RandomForestRegressor(n_estimators=100))
 
# # 6. Declare hyperparameters to tune
# hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
#                   'randomforestregressor__max_depth': [None, 5, 3, 1]}
 
# # 7. Tune model using cross-validation pipeline
# clf = GridSearchCV(pipeline, hyperparameters, cv=10)
# #  \
# clf.fit(x_train, y_train)
 
# # 8. Refit on the entire training set
# # No additional code needed if clf.refit == True (default is True)
 
# # 9. Evaluate model pipeline on test data
# pred = clf.predict(x_test)
# print r2_score(y_test, pred)
# print mean_squared_error(y_test, pred)



In [8]:

    
for leaf_size in [1,5,10,50]:
    random_forest = rf(n_estimators=100, oob_score= False, random_state = 5, min_samples_leaf=leaf_size)
    random_forest.fit(x_train, y_train)
    test_preds = random_forest.predict(x_test)
    r2 = r2_score(y_test, test_preds)
    mse = mean_squared_error(y_test, test_preds)
    print 'r2 = {:.2f}, MSE = {}'.format(r2, mse)









    



r2 = 0.57, MSE = 776332.152478
r2 = 0.56, MSE = 802581.838596
r2 = 0.53, MSE = 856870.915549
r2 = 0.43, MSE = 1030202.34458



In [9]:

    
random_forest = rf(n_estimators=100, oob_score= False, random_state = 5, min_samples_leaf=1)
random_forest.fit(preprocessing.scale(x_train), y_train)









    Out[9]:





RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=5,
           verbose=0, warm_start=False)



In [18]:

    
importance = random_forest.feature_importances_
importance = pd.DataFrame(importance, index=x_train.columns, 
                          columns=["Importance"])

importance["Std"] = np.std([tree.feature_importances_
                            for tree in random_forest.estimators_], axis=0)

x = range(importance.shape[0])
importance = importance.sort_values(by = 'Importance', ascending=True)
y = importance.iloc[:, 0]
yerr = importance.iloc[:, 1]

plt.figure(figsize=(10, 6))
plt.barh(x, y, yerr=yerr, align="center", )
plt.yticks(x, importance.index, rotation = 'horizontal', size = 10)
plt.title('Feature Importance for random forest regression model', size = 12);

Random Forest testing



In [19]:

    
test_preds = random_forest.predict(preprocessing.scale(x_test))
r2 = r2_score(y_test, test_preds)
mse = mean_squared_error(y_test, test_preds)
print 'MSE = {}, r2 = {}'.format(mse, r2)

plt.scatter(y_test, test_preds, label="r^2= {:.2f}".format(r2),)
plt.legend(loc="lower right")
plt.title("RandomForest Regression with scikit-learn", size = 10);









    



MSE = 838277.339147, r2 = 0.539338343059



In [30]:

    
plt.hist(y_test, bins = 50, );
print 'Actual yield_lb test data'
print '#########################'
print y_test.describe()









    



Actual yield_lb test data
#########################
count     2066.000000
mean      9333.310761
std       1349.298262
min       4659.299163
25%       8449.491404
50%       9394.321302
75%      10262.714512
max      13744.665767
Name: yield_lb, dtype: float64



In [32]:

    
plt.hist(test_preds, bins = 50, );
print 'Predicted yield_lb test data'
print '#########################'
print pd.Series(test_preds).describe()









    



Predicted yield_lb test data
#########################
count     2066.000000
mean      9345.768032
std        918.657304
min       5513.259791
25%       8739.708278
50%       9319.066509
75%       9997.916390
max      11575.220000
dtype: float64

Random forest model without "PLOT"



In [37]:

    
xtrainnew = x_train.drop('plot', axis = 1)
xtestnew  = x_test.drop('plot', axis = 1)



In [38]:

    
for leaf_size in [1,5,10,50]:
    random_forest = rf(n_estimators=100, oob_score= False, random_state = 5, min_samples_leaf=leaf_size)
    random_forest.fit(xtrainnew, y_train)
    test_preds = random_forest.predict(xtestnew)
    r2 = r2_score(y_test, test_preds)
    mse = mean_squared_error(y_test, test_preds)
    print 'r2 = {:.2f}, MSE = {}'.format(r2, mse)









    



r2 = 0.52, MSE = 864631.302237
r2 = 0.52, MSE = 881103.321669
r2 = 0.49, MSE = 925887.966746
r2 = 0.40, MSE = 1089317.31498



In [39]:

    
random_forest = rf(n_estimators=100, oob_score= False, random_state = 5, min_samples_leaf=1)
random_forest.fit(preprocessing.scale(xtrainnew), y_train)

importance = random_forest.feature_importances_
importance = pd.DataFrame(importance, index=xtrainnew.columns, 
                          columns=["Importance"])

importance["Std"] = np.std([tree.feature_importances_
                            for tree in random_forest.estimators_], axis=0)

x = range(importance.shape[0])
importance = importance.sort_values(by = 'Importance', ascending=True)
y = importance.iloc[:, 0]
yerr = importance.iloc[:, 1]

plt.figure(figsize=(10, 6))
plt.barh(x, y, yerr=yerr, align="center", )
plt.yticks(x, importance.index, rotation = 'horizontal', size = 10)
plt.title('Feature Importance for random forest regression model', size = 12);



In [40]:

    
test_preds = random_forest.predict(preprocessing.scale(xtestnew))
r2 = r2_score(y_test, test_preds)
mse = mean_squared_error(y_test, test_preds)
print 'MSE = {}, r2 = {}'.format(mse, r2)

plt.scatter(y_test, test_preds, label="r^2= {:.2f}".format(r2),)
plt.legend(loc="lower right")
plt.title("RandomForest Regression with scikit-learn", size = 10);









    



MSE = 903927.734153, r2 = 0.50326123787



In [41]:

    
plt.hist(y_test, bins = 50, );
print 'Actual yield_lb test data'
print '#########################'
print y_test.describe()









    



Actual yield_lb test data
#########################
count     2066.000000
mean      9333.310761
std       1349.298262
min       4659.299163
25%       8449.491404
50%       9394.321302
75%      10262.714512
max      13744.665767
Name: yield_lb, dtype: float64



In [42]:

    
plt.hist(test_preds, bins = 50, );
print 'Predicted yield_lb test data'
print '#########################'
print pd.Series(test_preds).describe()









    



Predicted yield_lb test data
#########################
count     2066.000000
mean      9361.145035
std        903.512342
min       6020.139615
25%       8764.183319
50%       9328.280977
75%       9979.842252
max      11838.047777
dtype: float64



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:

	planted	site	plot	rep	id	dth	lodging	height	grain_type	moisture	...	vg_avg_tmax	fl_avg_tmin	fl_avg_tmax	gf_avg_tmin	gf_avg_tmax	se_avg_tmin	se_avg_tmax	veg_srad	rep_srad	gf_srad
0	1995-05-12	RES	349.0	1	CM101	78	15.0	85.0	S	16.0	...	28.114548	15.854166	32.492108	14.893976	33.887337	14.025759	30.655951	78968	36409	49842
1	1995-05-12	RES	374.0	2	CM101	78	15.0	80.0	S	15.9	...	28.114548	15.854166	32.492108	14.893976	33.887337	14.025759	30.655951	78968	36409	49842
2	1995-05-12	RES	8.0	1	M203	87	60.0	105.0	M	23.0	...	28.114548	16.097288	33.369639	13.492801	32.751324	13.859148	30.760579	78968	49218	47679
3	1995-05-12	RES	1096.0	1	94Y589	87	10.0	95.0	M	18.7	...	28.114548	16.097288	33.369639	13.492801	32.751324	13.859148	30.760579	78968	49218	47679
4	1995-05-12	RES	350.0	1	94Y383	86	1.0	85.0	M	19.9	...	28.114548	16.101021	33.400215	13.581463	32.705987	13.873189	30.725791	78968	47809	47944

	plot	rep	dth	lodging	height	grain_type	moisture	trial_type	pi	vg_avg_tmin	vg_avg_tmax	fl_avg_tmin	fl_avg_tmax	gf_avg_tmin	gf_avg_tmax	se_avg_tmin	se_avg_tmax	veg_srad	rep_srad	gf_srad
count	4818.000000	4818.000000	4818.000000	4818.000000	4818.000000	4818.000000	4818.000000	4818.000000	4818.000000	4818.000000	4818.000000	4818.000000	4818.000000	4818.000000	4818.000000	4818.000000	4818.000000	4818.000000	4818.000000	4818.000000
mean	384.747198	2.248028	84.462433	23.527293	94.534747	1.082814	16.837813	0.934205	49.563097	13.657877	30.563047	15.400812	33.635437	14.142866	32.810042	14.217699	31.930401	73207.497302	51698.921129	46269.671856
std	621.639330	1.094706	6.629958	32.028831	7.405346	0.714039	3.206468	0.799385	2.799334	0.904879	1.527051	1.170603	1.417476	1.032498	1.092075	0.674924	0.842727	3981.297685	8783.983977	2785.485142
min	1.000000	1.000000	71.000000	0.000000	70.000000	0.000000	6.190000	0.000000	46.000000	12.123631	27.607502	12.787868	29.037816	8.464561	26.182818	12.806541	29.889453	67738.000000	31346.000000	30415.000000
25%	44.000000	1.000000	80.000000	1.000000	89.916000	1.000000	14.900000	0.000000	47.000000	12.806615	29.458961	14.314851	32.582901	13.363344	32.254054	13.633277	31.521658	70102.000000	45634.000000	44660.000000
50%	89.000000	2.000000	84.000000	5.000000	95.000000	1.000000	16.600000	1.000000	49.000000	13.766445	30.525245	15.222422	33.559038	14.048884	32.945452	14.408095	32.060290	72664.000000	51075.000000	46484.000000
75%	370.750000	3.000000	88.000000	40.000000	100.000000	2.000000	18.325000	2.000000	51.000000	14.430488	31.633240	16.502931	34.453982	15.205106	33.616870	14.792181	32.482795	75758.000000	56510.250000	48143.000000
max	2664.000000	4.000000	124.000000	100.000000	125.999999	2.000000	35.300000	2.000000	57.000000	15.187658	33.389713	18.403577	37.108562	15.895720	34.768822	15.243826	33.562510	83871.000000	106563.000000	52828.000000