In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from numpy.linalg import matrix_rank
from statsmodels.stats.outliers_influence import variance_inflation_factor
%matplotlib inline



Random forest regressor

Load training and label data


In [2]:
train = pd.read_csv('./../data/training.csv')
label = pd.read_csv('./../data/labels.csv', header=None)
train.drop('Unnamed: 0', axis=1, inplace=True)
# label encode the categorical 'type' column
le = LabelEncoder()
train['type_enc'] = le.fit_transform(train['type'])
# keep only the target column and flatten it to a 1-D array
label.columns = ['0', 'p_label2']
label.drop('0', axis=1, inplace=True)
y_label = np.ravel(label)

In [3]:
train.columns


Out[3]:
Index([u'gasLimit_t', u'gasUsed_t', u'newContract', u'blockTime',
       u'difficulty', u'gasLimit_b', u'gasUsed_b', u'reward', u'size', u'type',
       u'totalFee', u'amount_gwei', u'gasShare', u'gweiPaid', u'gweiPaid_b',
       u'gweiShare', u'free_t', u'day', u'hour', u'dayofweek', u'txcnt_second',
       u'avg_blocktime_6', u'avg_gasUsed_b_6', u'avg_tx_count_6',
       u'avg_uncle_count_6', u'avg_difficulty_6', u'avg_txcnt_second_6',
       u'avg_gasUsed_t_6', u'avg_price_6', u'avg_blocktime_60',
       u'avg_gasUsed_b_60', u'avg_tx_count_60', u'avg_uncle_count_60',
       u'avg_difficulty_60', u'avg_txcnt_second_60', u'avg_gasUsed_t_60',
       u'avg_price_60', u'mv', u'type_enc'],
      dtype='object')

Select features


In [4]:
sub_cols = ['gasLimit_t',
            'gasUsed_t',
            'newContract',
            'avg_blocktime_6',
            'avg_uncle_count_6',
            'avg_txcnt_second_6',
            'avg_gasUsed_t_6',
            'avg_price_6',
            'avg_uncle_count_60',
            'avg_price_60',
            'mv',
            'type_enc']

Split the data


In [5]:
sub_train = train[sub_cols]
X = sub_train.values
y = y_label
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [6]:
matrix_rank(X), len(sub_cols)


Out[6]:
(12, 12)

Variance inflation factors

The feature matrix is full rank (rank 12 with 12 columns), so no feature is an exact linear combination of the others. Variance inflation factors quantify any softer collinearity: a VIF of 1 means a predictor is uncorrelated with the other features, values above 1 indicate moderate correlation, values between 5 and 10 indicate high correlation that may be problematic, and above 10 the regression coefficients are likely to be poorly estimated because of multicollinearity. In the output below every feature is under 5; the largest values are avg_uncle_count_60 (about 4.95) and avg_blocktime_6 (about 4.44), so multicollinearity is not a serious concern for this feature set.


In [7]:
for i, col in enumerate(sub_train.columns):
    print('VIF col {}: {}'.format(col,variance_inflation_factor(X,i)))


VIF col gasLimit_t: 1.59171325219
VIF col gasUsed_t: 1.41965478479
VIF col newContract: 1.03855950223
VIF col avg_blocktime_6: 4.44493959221
VIF col avg_uncle_count_6: 1.62189618616
VIF col avg_txcnt_second_6: 2.24676473359
VIF col avg_gasUsed_t_6: 2.08424912098
VIF col avg_price_6: 2.14220672369
VIF col avg_uncle_count_60: 4.95359233193
VIF col avg_price_60: 3.6317550142
VIF col mv: 1.11936078272
VIF col type_enc: 3.34148223087
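
Under the hood, the VIF of feature j is 1 / (1 - R_j^2), where R_j^2 is the R^2 from regressing column j on the remaining columns. A minimal hand computation for one column (a sketch, assuming X and sub_train from above; it should closely reproduce the statsmodels value):

In [ ]:
from sklearn.linear_model import LinearRegression

# VIF for one column by hand: regress the column on the remaining features
# (no intercept, mirroring statsmodels) and take 1 / (1 - R^2)
j = sub_train.columns.get_loc('avg_blocktime_6')
x_j = X[:, j]
X_others = np.delete(X, j, axis=1)
fitted = LinearRegression(fit_intercept=False).fit(X_others, x_j)
resid = x_j - fitted.predict(X_others)
r2 = 1.0 - (resid ** 2).sum() / (x_j ** 2).sum()   # uncentered R^2, as in statsmodels
print('hand-computed VIF for avg_blocktime_6: {}'.format(1.0 / (1.0 - r2)))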

Cross validation to find optimal model


In [8]:
rf1 = RandomForestRegressor()

In [9]:
rf1.get_params().keys()


Out[9]:
['warm_start',
 'oob_score',
 'n_jobs',
 'min_impurity_decrease',
 'verbose',
 'max_leaf_nodes',
 'bootstrap',
 'min_samples_leaf',
 'n_estimators',
 'min_samples_split',
 'min_weight_fraction_leaf',
 'criterion',
 'random_state',
 'min_impurity_split',
 'max_features',
 'max_depth']

Standardize data


In [10]:
# Tree-based models are invariant to monotonic feature scaling,
# so the StandardScaler step is left commented out here.
#sc = StandardScaler()

# Fit the scaler to the feature data and transform
#X_std = sc.fit_transform(X)

Nested cross validation


In [11]:
# Candidate values for max_depth and n_estimators (a 6 x 4 grid)
param_candidates = dict(max_depth=np.arange(1, 7, 1),
                        n_estimators=np.arange(5, 25, 5))

In [12]:
param_candidates


Out[12]:
{'max_depth': array([1, 2, 3, 4, 5, 6]),
 'n_estimators': array([ 5, 10, 15, 20])}

In [13]:
# Create a grid search object over the random forest regressor and the parameter grid
rf = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_candidates)

In [14]:
# Fit the cross validated grid search on the data 
rf.fit(X_train, y_train)


Out[14]:
GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': array([ 5, 10, 15, 20]), 'max_depth': array([1, 2, 3, 4, 5, 6])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [17]:
# Show the best param values
rf.best_estimator_


Out[17]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
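
The winning parameter combination and its mean cross-validated score can also be read directly from the fitted grid search object (a quick check, using rf from above):

In [ ]:
# Best hyperparameters found by the grid search and the corresponding mean CV score
print(rf.best_params_)
print('best mean CV score: {}'.format(rf.best_score_))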

In [56]:
# Nested CV on the held-out split: the grid search is re-fit inside each of the 5 folds
print('Mean CV r2_score: {}'.format(np.mean(cross_val_score(
        rf, X_test, y_test, scoring='r2', cv=5))))


Mean CV r2_score: 0.825487086998
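
For comparison, a fully nested cross-validation is usually run on the training data: an outer cross_val_score wraps the grid search, so hyperparameter selection is repeated inside every outer fold. A minimal sketch, assuming the same param_candidates, X_train and y_train as above:

In [ ]:
# Outer 5-fold CV; the inner grid search is re-fit within each outer fold,
# so the score reflects both hyperparameter selection and model fitting
nested_scores = cross_val_score(
    GridSearchCV(RandomForestRegressor(), param_grid=param_candidates),
    X_train, y_train, scoring='r2', cv=5)
print('Nested CV r2: {} +/- {}'.format(nested_scores.mean(), nested_scores.std()))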

In [43]:
results1 = pd.DataFrame(X_train)
results1.columns = sub_cols
results1['label']  = y_train
results1['pred']  = rf.predict(X_train)
results1['stage'] = 'Train'

In [44]:
results1.head()


Out[44]:
gasLimit_t gasUsed_t newContract avg_blocktime_6 avg_uncle_count_6 avg_txcnt_second_6 avg_gasUsed_t_6 avg_price_6 avg_uncle_count_60 avg_price_60 mv type_enc label pred stage
0 50000.0 21000.0 0.0 30.333333 0.000000 10.333333 48851.226918 30.109970 0.100000 29.814134 0.269151 2.0 10.861544 18.237229 Train
1 50000.0 21000.0 0.0 24.833333 0.000000 11.500000 31427.203082 22.623876 0.133333 25.257682 1.182141 2.0 12.959649 15.160349 Train
2 650000.0 518513.0 0.0 39.166667 0.000000 3.666667 34272.846260 27.709378 0.016667 24.240462 0.092250 2.0 7.762371 18.237229 Train
3 74318.0 37158.0 0.0 19.833333 0.000000 4.166667 70370.262842 29.253390 0.066667 26.272013 0.688410 2.0 22.216060 18.237229 Train
4 250000.0 45992.0 0.0 22.000000 0.166667 6.166667 138817.151832 19.691374 0.066667 23.750735 0.332509 2.0 8.783028 9.816403 Train

In [45]:
results2 = pd.DataFrame(X_test)
results2.columns = sub_cols
results2['label']  = y_test
results2['pred']  = rf.predict(X_test)
results2['stage'] = 'Test'

In [46]:
results2.head()


Out[46]:
gasLimit_t gasUsed_t newContract avg_blocktime_6 avg_uncle_count_6 avg_txcnt_second_6 avg_gasUsed_t_6 avg_price_6 avg_uncle_count_60 avg_price_60 mv type_enc label pred stage
0 1730520.0 41847.0 0.0 33.166667 0.000000 2.166667 44060.899025 25.589140 0.050000 26.094059 3.996689 0.0 4.013700 7.970963 Test
1 21000.0 21000.0 0.0 36.000000 0.166667 4.000000 48306.599942 30.763945 0.100000 24.730089 0.634915 2.0 24.167385 18.237229 Test
2 90000.0 21000.0 0.0 56.833333 0.000000 4.333333 52169.843132 36.520256 0.083333 32.719292 1.116790 2.0 51.044565 18.237229 Test
3 940000.0 29912.0 0.0 39.000000 0.000000 3.166667 29540.946698 26.630111 0.033333 24.008287 1.516613 2.0 10.110389 15.160349 Test
4 524747.0 437289.0 1.0 28.666667 0.000000 4.166667 55129.327656 18.702611 0.033333 23.785293 1.724892 1.0 8.752006 15.160349 Test

In [51]:
results = pd.concat([results1,results2], axis=0)

In [52]:
results.head()


Out[52]:
gasLimit_t gasUsed_t newContract avg_blocktime_6 avg_uncle_count_6 avg_txcnt_second_6 avg_gasUsed_t_6 avg_price_6 avg_uncle_count_60 avg_price_60 mv type_enc label pred stage
0 50000.0 21000.0 0.0 30.333333 0.000000 10.333333 48851.226918 30.109970 0.100000 29.814134 0.269151 2.0 10.861544 18.237229 Train
1 50000.0 21000.0 0.0 24.833333 0.000000 11.500000 31427.203082 22.623876 0.133333 25.257682 1.182141 2.0 12.959649 15.160349 Train
2 650000.0 518513.0 0.0 39.166667 0.000000 3.666667 34272.846260 27.709378 0.016667 24.240462 0.092250 2.0 7.762371 18.237229 Train
3 74318.0 37158.0 0.0 19.833333 0.000000 4.166667 70370.262842 29.253390 0.066667 26.272013 0.688410 2.0 22.216060 18.237229 Train
4 250000.0 45992.0 0.0 22.000000 0.166667 6.166667 138817.151832 19.691374 0.066667 23.750735 0.332509 2.0 8.783028 9.816403 Train

In [54]:
results['logPred']=results.pred.apply(np.log10)
results['logLabel']=results.label.apply(np.log10)

In [72]:
sns.lmplot(x="label", y="pred", hue="stage",truncate=True, size=10, data=results)


Out[72]:
<seaborn.axisgrid.FacetGrid at 0x1a59d1ad90>

In [76]:
sns.jointplot(x="label", y="pred", size=10, data=results)


Out[76]:
<seaborn.axisgrid.JointGrid at 0x1a32d14dd0>

In [55]:
sns.lmplot(x="logLabel", y="logPred", hue="stage",truncate=True, size=10, data=results)


Out[55]:
<seaborn.axisgrid.FacetGrid at 0x1a77b18d90>

In [77]:
sns.jointplot(x="logLabel", y="logPred", size=10, data=results)


Out[77]:
<seaborn.axisgrid.JointGrid at 0x1a341e0990>

In [ ]:
sns.jointplot(x="logLabel", y="logPred",kind="kde", size=10, data=results)

In [57]:
# signed error (prediction minus label) and error relative to the label
results['absoluteError'] = results.pred - results.label
results['relativeError'] = results.absoluteError / results.label

In [67]:
results.relativeError.hist(bins=np.arange(-1,4,.1))


Out[67]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a331e26d0>

In [71]:
results.absoluteError.hist(bins=np.arange(-50,50,1))


Out[71]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a59bf1990>

Plot results


In [73]:
sub_train.columns


Out[73]:
Index([u'gasLimit_t', u'gasUsed_t', u'newContract', u'avg_blocktime_6',
       u'avg_uncle_count_6', u'avg_txcnt_second_6', u'avg_gasUsed_t_6',
       u'avg_price_6', u'avg_uncle_count_60', u'avg_price_60', u'mv',
       u'type_enc'],
      dtype='object')

In [65]:
y_pred = rf.predict(X_test)

In [68]:
y_pred.shape, y_test.shape


Out[68]:
((349157,), (349157,))

In [80]:
results = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

In [ ]:
results['y_test_log'] = results['y_test'].apply(np.log10)

In [86]:
plt.scatter(results['y_test'].apply(np.log10), results['y_pred'].apply(np.log10))
#plt.xlim(0,100)
#plt.ylim(0,100)


Out[86]:
<matplotlib.collections.PathCollection at 0x116706450>

In [ ]:
sns.set(style="darkgrid", color_codes=True)
sns.jointplot(x="y_test", y="y_pred", data=results, marginal_kws=dict(bins=25)
plt.xlim(0, 100)
plt.tight_layout()
plt.savefig('./../images/jointplot.png', dpi=300)

Feature importances


In [91]:
def rf_regressor(X_train, X_test, y_train, y_test):
    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    scores = cross_val_score(rf, X_train, y_train, scoring='r2', cv=5)
    print('MSE: {}'.format(mean_squared_error(y_test, y_pred)))
    print('R2_score: {}'.format(r2_score(y_test, y_pred)))
    print('avg_CV_score: {}'.format(np.mean(scores)))
    # write predicted values to csv
    p = pd.DataFrame({'y_pred': y_pred})
    p.to_csv('./../data/label_pred.csv')
    return rf

In [92]:
model = rf_regressor(X_train, X_test, y_train, y_test)


MSE: 13.1884823058
R2_score: 0.987232856113
avg_CV_score: 0.983622505175

In [ ]:
def plot_feature_importance(rf, feature_df):
    # Impurity-based importances from the fitted random forest (mean decrease in impurity)
    feat_scores = pd.DataFrame({'Mean Decrease in Impurity': rf.feature_importances_},
                               index=feature_df.columns)
    feat_scores = feat_scores.sort_values(by='Mean Decrease in Impurity')
    feat_scores.plot(kind='barh', color='r', figsize=(6,6))
    plt.title('Feature Importance', fontsize=18)
    plt.tight_layout()
    plt.savefig('./../images/feat_import_pruned.png', dpi=300)

In [93]:
plot_feature_importance(model, sub_train)



In [ ]: