In [76]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, median_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor as rf  # aliased: later cells call rf(...)
from sklearn import preprocessing
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

Data Preparation


In [77]:
y_data = pd.read_csv('../data/yield_w_srad.csv')
y_data.columns


Out[77]:
Index([u'planted', u'site', u'plot', u'rep', u'id', u'dth', u'lodging',
       u'height', u'grain_type', u'moisture', u'year', u'county',
       u'trial_type', u'yield_lb', u'head_date', u'pi', u'vg_avg_tmin',
       u'vg_avg_tmax', u'fl_avg_tmin', u'fl_avg_tmax', u'gf_avg_tmin',
       u'gf_avg_tmax', u'se_avg_tmin', u'se_avg_tmax', u'veg_srad',
       u'rep_srad', u'gf_srad'],
      dtype='object')

In [78]:
y_data.head()


Out[78]:
planted site plot rep id dth lodging height grain_type moisture ... vg_avg_tmax fl_avg_tmin fl_avg_tmax gf_avg_tmin gf_avg_tmax se_avg_tmin se_avg_tmax veg_srad rep_srad gf_srad
0 1995-05-12 RES 349.0 1 CM101 78 15.0 85.0 S 16.0 ... 28.114548 15.854166 32.492108 14.893976 33.887337 14.025759 30.655951 78968 36409 49842
1 1995-05-12 RES 374.0 2 CM101 78 15.0 80.0 S 15.9 ... 28.114548 15.854166 32.492108 14.893976 33.887337 14.025759 30.655951 78968 36409 49842
2 1995-05-12 RES 8.0 1 M203 87 60.0 105.0 M 23.0 ... 28.114548 16.097288 33.369639 13.492801 32.751324 13.859148 30.760579 78968 49218 47679
3 1995-05-12 RES 1096.0 1 94Y589 87 10.0 95.0 M 18.7 ... 28.114548 16.097288 33.369639 13.492801 32.751324 13.859148 30.760579 78968 49218 47679
4 1995-05-12 RES 350.0 1 94Y383 86 1.0 85.0 M 19.9 ... 28.114548 16.101021 33.400215 13.581463 32.705987 13.873189 30.725791 78968 47809 47944

5 rows × 27 columns


In [79]:
# Drop columns that are not required
drop_col = ['year', 'county', 'site', 'id']
y_data.drop(drop_col, axis=1, inplace=True)

# Parse dates and extract day-of-month features
y_data['planted'] = pd.to_datetime(y_data['planted'], format="%Y-%m-%d")
y_data['head_date'] = pd.to_datetime(y_data['head_date'], format="%Y-%m-%d")
y_data['planted_day'] = y_data['planted'].dt.day
y_data['head_day'] = y_data['head_date'].dt.day

In [136]:
# Prepare the data
y_data = y_data.dropna()
y_data['trial_type'] = pd.factorize(y_data['trial_type'])[0]
y_data['grain_type'] = pd.factorize(y_data['grain_type'])[0]

y_split = y_data.copy().drop(['planted', 'head_date', 'planted_day', 'head_day'], axis = 1)
y = y_split['yield_lb']
x = y_split.drop(['yield_lb'], axis = 1)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=5)


# Drop the 'plot' feature
xtrain_new = x_train.drop('plot', axis = 1)
xtest_new  = x_test.drop('plot', axis = 1)


# Scale the data (keep the unscaled frames for their column names)
xtrainnew = preprocessing.scale(xtrain_new)
xtestnew = preprocessing.scale(xtest_new)
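
Note that preprocessing.scale standardizes the training and test splits independently, so the two splits end up centred and scaled with different statistics. A minimal alternative sketch (not what was run here) that fits a StandardScaler on the training split only and reuses it for the test split:

from sklearn.preprocessing import StandardScaler

# Sketch: fit scaling statistics on the training split only,
# then apply the same transform to the test split.
scaler = StandardScaler().fit(xtrain_new)
xtrainnew = scaler.transform(xtrain_new)
xtestnew = scaler.transform(xtest_new)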

In [73]:
rng = np.random.RandomState(1)

In [109]:
# function to generate boosting model
def get_model(model_depth=5, no_estimators=100, random_state=rng):
    
    model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=model_depth),
                               n_estimators=no_estimators, random_state=random_state)
    
    return model

Parameter selection

  • It is not clear what values to use for the tree depth and the number of estimators
  • Experiment to find values that avoid both under- and overfitting (an automated alternative is sketched below)
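
As an alternative to the two manual loops below, both parameters could be searched jointly with GridSearchCV; the grid values here are illustrative assumptions, not what was actually run in this notebook.

from sklearn.model_selection import GridSearchCV

# Sketch: joint search over tree depth and ensemble size (assumed grid).
param_grid = {'base_estimator__max_depth': [4, 8, 12],
              'n_estimators': [50, 100]}
search = GridSearchCV(AdaBoostRegressor(DecisionTreeRegressor(), random_state=rng),
                      param_grid, scoring='r2', cv=3)
search.fit(xtrainnew, y_train)
print search.best_params_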

In [121]:
depths = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
validations_r2 = []
trainings_r2 = []

for depth in depths:
    
    # get the model
    reg_model = get_model(model_depth = depth)
    
    # fit training data
    reg_model.fit(xtrainnew, y_train)
    
    # calculate r2 score
    train_r2 = reg_model.score(xtrainnew, y_train)
    valid_r2 = reg_model.score(xtestnew, y_test)
    
    trainings_r2.append(train_r2)
    validations_r2.append(valid_r2)

In [122]:
plt.plot(depths, trainings_r2, label ='train r2')
plt.plot(depths, validations_r2, label ='validation r2')
plt.legend(loc='upper left')
plt.xlabel('max depth of model')
plt.ylabel('r2 score')
plt.show()


It looks like a depth of 12 is the best choice. Next, test the number of estimators.
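
For reference, the same choice can be read off the stored scores programmatically rather than from the plot:

# Sketch: depth with the highest validation r2 from the lists above.
best_depth = depths[int(np.argmax(validations_r2))]
print 'best depth = {}'.format(best_depth)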


In [123]:
estimators = [1,10,20,30,40,50,60,70,80,100]

validations_r2_e = []
trainings_r2_e = []

for e in estimators:
    
    # get the model
    reg_model = get_model(model_depth = 12, no_estimators = e)
    
    # fit training data
    reg_model.fit(xtrainnew, y_train)
    
    # calculate r2 score
    train_r2 = reg_model.score(xtrainnew, y_train)
    valid_r2 = reg_model.score(xtestnew, y_test)
    
    trainings_r2_e.append(train_r2)
    validations_r2_e.append(valid_r2)

In [124]:
plt.plot(estimators, trainings_r2_e, label ='train r2')
plt.plot(estimators, validations_r2_e, label ='validation r2')
plt.legend(loc='lower right')
plt.xlabel('Number of estimators in the model')
plt.ylabel('r2 score')
plt.show()


Summary of parameter choice

  • Depth = 12
  • r2 is not very sensitive to the number of estimators, so just use 100 (see the cross-validation sketch below)
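
Before committing to these values, a quick cross-validation on the training split gives a less split-dependent estimate than the single hold-out score; a minimal sketch (not run in the original notebook):

from sklearn.model_selection import cross_val_score

# Sketch: 5-fold CV of the chosen model on the training split.
scores = cross_val_score(get_model(model_depth=12, no_estimators=100),
                         xtrainnew, y_train, scoring='r2', cv=5)
print 'cv r2 = {:.3f} +/- {:.3f}'.format(scores.mean(), scores.std())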

In [129]:
final_model = get_model(model_depth = 12, no_estimators = 100)

In [130]:
final_model.fit(xtrainnew, y_train)


Out[130]:
AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=12, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'),
         learning_rate=1.0, loss='linear', n_estimators=100,
         random_state=<mtrand.RandomState object at 0x7f04d95602d0>)

In [131]:
training_r2 = final_model.score(xtrainnew, y_train)

print 'training r2 = {}'.format(training_r2)


training r2 = 0.929730276537

In [132]:
valid_r2 = final_model.score(xtestnew, y_test)

print 'validation r2 = {}'.format(valid_r2)


validation r2 = 0.519518002214

In [133]:
test_preds = final_model.predict(xtestnew)
r2 = r2_score(y_test, test_preds)
mse = mean_squared_error(y_test, test_preds)
print 'MSE = {}, r2 = {}'.format(mse, r2)

plt.scatter(y_test, test_preds, label="r^2= {:.2f}".format(r2),)
plt.legend(loc="lower right")
plt.title("RandomForest Regression with scikit-learn", size = 10)
plt.show()


MSE = 874344.900522, r2 = 0.519518002214

In [137]:
importance = final_model.feature_importances_
importance = pd.DataFrame(importance, index=xtrain_new.columns, 
                          columns=["Importance"])

importance["Std"] = np.std([tree.feature_importances_
                            for tree in final_model.estimators_], axis=0)

x = range(importance.shape[0])
importance = importance.sort_values(by = 'Importance', ascending=True)
y = importance.iloc[:, 0]
yerr = importance.iloc[:, 1]

plt.figure(figsize=(10, 6))
plt.barh(x, y, xerr=yerr, align="center")
plt.yticks(x, importance.index, rotation = 'horizontal', size = 10)
plt.title('Feature importance for the AdaBoost regression model', size = 12)
plt.show()



In [138]:
plt.hist(y_test, bins = 50, label = 'actual', alpha = 0.8)
plt.hist(test_preds, bins = 50, label = 'prediction', alpha = 0.8)
plt.legend()
plt.show()


Random Forest


  • Results by Zamir

In [8]:
for leaf_size in [1,5,10,50]:
    random_forest = rf(n_estimators=100, oob_score= False, random_state = 5, min_samples_leaf=leaf_size)
    random_forest.fit(x_train, y_train)
    test_preds = random_forest.predict(x_test)
    r2 = r2_score(y_test, test_preds)
    mse = mean_squared_error(y_test, test_preds)
    print 'r2 = {:.2f}, MSE = {}'.format(r2, mse)


r2 = 0.57, MSE = 776332.152478
r2 = 0.56, MSE = 802581.838596
r2 = 0.53, MSE = 856870.915549
r2 = 0.43, MSE = 1030202.34458

In [9]:
random_forest = rf(n_estimators=100, oob_score= False, random_state = 5, min_samples_leaf=1)
random_forest.fit(preprocessing.scale(x_train), y_train)


Out[9]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=5,
           verbose=0, warm_start=False)

In [18]:
importance = random_forest.feature_importances_
importance = pd.DataFrame(importance, index=x_train.columns, 
                          columns=["Importance"])

importance["Std"] = np.std([tree.feature_importances_
                            for tree in random_forest.estimators_], axis=0)

x = range(importance.shape[0])
importance = importance.sort_values(by = 'Importance', ascending=True)
y = importance.iloc[:, 0]
yerr = importance.iloc[:, 1]

plt.figure(figsize=(10, 6))
plt.barh(x, y, xerr=yerr, align="center")
plt.yticks(x, importance.index, rotation = 'horizontal', size = 10)
plt.title('Feature Importance for random forest regression model', size = 12);


Random Forest testing


In [19]:
test_preds = random_forest.predict(preprocessing.scale(x_test))
r2 = r2_score(y_test, test_preds)
mse = mean_squared_error(y_test, test_preds)
print 'MSE = {}, r2 = {}'.format(mse, r2)

plt.scatter(y_test, test_preds, label="r^2= {:.2f}".format(r2),)
plt.legend(loc="lower right")
plt.title("RandomForest Regression with scikit-learn", size = 10);


MSE = 838277.339147, r2 = 0.539338343059

In [30]:
plt.hist(y_test, bins = 50, );
print 'Actual yield_lb test data'
print '#########################'
print y_test.describe()


Actual yield_lb test data
#########################
count     2066.000000
mean      9333.310761
std       1349.298262
min       4659.299163
25%       8449.491404
50%       9394.321302
75%      10262.714512
max      13744.665767
Name: yield_lb, dtype: float64

In [32]:
plt.hist(test_preds, bins = 50, );
print 'Predicted yield_lb test data'
print '#########################'
print pd.Series(test_preds).describe()


Predicted yield_lb test data
#########################
count     2066.000000
mean      9345.768032
std        918.657304
min       5513.259791
25%       8739.708278
50%       9319.066509
75%       9997.916390
max      11575.220000
dtype: float64

Random forest model without "plot"


In [37]:
xtrainnew = x_train.drop('plot', axis = 1)
xtestnew  = x_test.drop('plot', axis = 1)

In [38]:
for leaf_size in [1,5,10,50]:
    random_forest = rf(n_estimators=100, oob_score= False, random_state = 5, min_samples_leaf=leaf_size)
    random_forest.fit(xtrainnew, y_train)
    test_preds = random_forest.predict(xtestnew)
    r2 = r2_score(y_test, test_preds)
    mse = mean_squared_error(y_test, test_preds)
    print 'r2 = {:.2f}, MSE = {}'.format(r2, mse)


r2 = 0.52, MSE = 864631.302237
r2 = 0.52, MSE = 881103.321669
r2 = 0.49, MSE = 925887.966746
r2 = 0.40, MSE = 1089317.31498

In [39]:
random_forest = rf(n_estimators=100, oob_score= False, random_state = 5, min_samples_leaf=1)
random_forest.fit(preprocessing.scale(xtrainnew), y_train)

importance = random_forest.feature_importances_
importance = pd.DataFrame(importance, index=xtrainnew.columns, 
                          columns=["Importance"])

importance["Std"] = np.std([tree.feature_importances_
                            for tree in random_forest.estimators_], axis=0)

x = range(importance.shape[0])
importance = importance.sort_values(by = 'Importance', ascending=True)
y = importance.iloc[:, 0]
yerr = importance.iloc[:, 1]

plt.figure(figsize=(10, 6))
plt.barh(x, y, xerr=yerr, align="center")
plt.yticks(x, importance.index, rotation = 'horizontal', size = 10)
plt.title('Feature Importance for random forest regression model', size = 12);



In [40]:
test_preds = random_forest.predict(preprocessing.scale(xtestnew))
r2 = r2_score(y_test, test_preds)
mse = mean_squared_error(y_test, test_preds)
print 'MSE = {}, r2 = {}'.format(mse, r2)

plt.scatter(y_test, test_preds, label="r^2= {:.2f}".format(r2),)
plt.legend(loc="lower right")
plt.title("RandomForest Regression with scikit-learn", size = 10);


MSE = 903927.734153, r2 = 0.50326123787

In [41]:
plt.hist(y_test, bins = 50, );
print 'Actual yield_lb test data'
print '#########################'
print y_test.describe()


Actual yield_lb test data
#########################
count     2066.000000
mean      9333.310761
std       1349.298262
min       4659.299163
25%       8449.491404
50%       9394.321302
75%      10262.714512
max      13744.665767
Name: yield_lb, dtype: float64

In [42]:
plt.hist(test_preds, bins = 50, );
print 'Predicted yield_lb test data'
print '#########################'
print pd.Series(test_preds).describe()


Predicted yield_lb test data
#########################
count     2066.000000
mean      9361.145035
std        903.512342
min       6020.139615
25%       8764.183319
50%       9328.280977
75%       9979.842252
max      11838.047777
dtype: float64
