In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from numpy.linalg import matrix_rank
from statsmodels.stats.outliers_influence import variance_inflation_factor
%matplotlib inline



Random forest regressor

Load training and label data


In [2]:
train = pd.read_csv('./../data/training.csv')
label = pd.read_csv('./../data/labels.csv', header=None)
train.drop('Unnamed: 0', axis=1, inplace=True)
# label encode the categorical 'type' column
le = LabelEncoder()
train['type_enc'] = le.fit_transform(train['type'])
# keep only the target column and flatten it to a 1-D array
label.columns = ['0', 'p_label2']
label.drop('0', axis=1, inplace=True)
y_label = np.ravel(label)

In [3]:
train.columns


Out[3]:
Index([u'gasLimit_t', u'gasUsed_t', u'newContract', u'blockTime',
       u'difficulty', u'gasLimit_b', u'gasUsed_b', u'reward', u'size', u'type',
       u'totalFee', u'amount_gwei', u'gasShare', u'gweiPaid', u'gweiPaid_b',
       u'gweiShare', u'free_t', u'day', u'hour', u'dayofweek', u'txcnt_second',
       u'avg_blocktime_6', u'avg_gasUsed_b_6', u'avg_tx_count_6',
       u'avg_uncle_count_6', u'avg_difficulty_6', u'avg_txcnt_second_6',
       u'avg_gasUsed_t_6', u'avg_price_6', u'avg_blocktime_60',
       u'avg_gasUsed_b_60', u'avg_tx_count_60', u'avg_uncle_count_60',
       u'avg_difficulty_60', u'avg_txcnt_second_60', u'avg_gasUsed_t_60',
       u'avg_price_60', u'mv', u'type_enc'],
      dtype='object')

Select features


In [4]:
sub_cols = ['gasLimit_t',
            'gasUsed_t',
            'newContract',
            'avg_blocktime_6',
            'avg_uncle_count_6',
            'avg_txcnt_second_6',
            'avg_gasUsed_t_6',
            'avg_price_6',
            'avg_uncle_count_60',
            'avg_price_60',
            'mv',
            'type_enc']

Split the data


In [5]:
sub_train = train[sub_cols]
X = sub_train.values
y = y_label
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [6]:
matrix_rank(X), len(sub_cols)


Out[6]:
(12, 12)

Variance inflation factors

The feature matrix is full rank (rank 12 with 12 columns), so no feature is an exact linear combination of the others. Variance inflation factors quantify any softer collinearity: a VIF of 1 means a predictor is uncorrelated with the other features, values above 1 indicate moderate correlation, values between 5 and 10 indicate high correlation that may be problematic, and above 10 the regression coefficients are likely to be poorly estimated because of multicollinearity. In the output below every feature is under 5; the largest values are avg_uncle_count_60 (about 4.95) and avg_blocktime_6 (about 4.44), so multicollinearity is not a serious concern for this feature set.


In [7]:
for i, col in enumerate(sub_train.columns):
    print('VIF col {}: {}'.format(col,variance_inflation_factor(X,i)))


VIF col gasLimit_t: 1.59171325219
VIF col gasUsed_t: 1.41965478479
VIF col newContract: 1.03855950223
VIF col avg_blocktime_6: 4.44493959221
VIF col avg_uncle_count_6: 1.62189618616
VIF col avg_txcnt_second_6: 2.24676473359
VIF col avg_gasUsed_t_6: 2.08424912098
VIF col avg_price_6: 2.14220672369
VIF col avg_uncle_count_60: 4.95359233193
VIF col avg_price_60: 3.6317550142
VIF col mv: 1.11936078272
VIF col type_enc: 3.34148223087
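
Under the hood, the VIF of feature j is 1 / (1 - R_j^2), where R_j^2 is the R^2 from regressing column j on the remaining columns. A minimal hand computation for one column (a sketch, assuming X and sub_train from above; it should closely reproduce the statsmodels value):

In [ ]:
from sklearn.linear_model import LinearRegression

# VIF for one column by hand: regress the column on the remaining features
# (no intercept, mirroring statsmodels) and take 1 / (1 - R^2)
j = sub_train.columns.get_loc('avg_blocktime_6')
x_j = X[:, j]
X_others = np.delete(X, j, axis=1)
fitted = LinearRegression(fit_intercept=False).fit(X_others, x_j)
resid = x_j - fitted.predict(X_others)
r2 = 1.0 - (resid ** 2).sum() / (x_j ** 2).sum()   # uncentered R^2, as in statsmodels
print('hand-computed VIF for avg_blocktime_6: {}'.format(1.0 / (1.0 - r2)))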

Cross validation to find optimal model


In [8]:
rf1 = RandomForestRegressor()

In [9]:
rf1.get_params().keys()


Out[9]:
['warm_start',
 'oob_score',
 'n_jobs',
 'min_impurity_decrease',
 'verbose',
 'max_leaf_nodes',
 'bootstrap',
 'min_samples_leaf',
 'n_estimators',
 'min_samples_split',
 'min_weight_fraction_leaf',
 'criterion',
 'random_state',
 'min_impurity_split',
 'max_features',
 'max_depth']

Standardize data


In [10]:
# Tree-based models are invariant to monotonic feature scaling,
# so the StandardScaler step is left commented out here.
#sc = StandardScaler()

# Fit the scaler to the feature data and transform
#X_std = sc.fit_transform(X)

Nested cross validation


In [11]:
# Candidate values for max_depth and n_estimators (a 6 x 4 grid)
param_candidates = dict(max_depth=np.arange(1, 7, 1),
                        n_estimators=np.arange(5, 25, 5))

In [12]:
param_candidates


Out[12]:
{'max_depth': array([1, 2, 3, 4, 5, 6]),
 'n_estimators': array([ 5, 10, 15, 20])}

In [13]:
# Create a grid search object over the random forest regressor and the parameter grid
rf = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_candidates)

In [14]:
# Fit the cross validated grid search on the data 
rf.fit(X_train, y_train)


Out[14]:
GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': array([ 5, 10, 15, 20]), 'max_depth': array([1, 2, 3, 4, 5, 6])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [17]:
# Show the best param values
rf.best_estimator_


Out[17]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
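
The winning parameter combination and its mean cross-validated score can also be read directly from the fitted grid search object (a quick check, using rf from above):

In [ ]:
# Best hyperparameters found by the grid search and the corresponding mean CV score
print(rf.best_params_)
print('best mean CV score: {}'.format(rf.best_score_))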

In [56]:
# Nested CV on the held-out split: the grid search is re-fit inside each of the 5 folds
print('Mean CV r2_score: {}'.format(np.mean(cross_val_score(
        rf, X_test, y_test, scoring='r2', cv=5))))


Mean CV r2_score: 0.825487086998
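
For comparison, a fully nested cross-validation is usually run on the training data: an outer cross_val_score wraps the grid search, so hyperparameter selection is repeated inside every outer fold. A minimal sketch, assuming the same param_candidates, X_train and y_train as above:

In [ ]:
# Outer 5-fold CV; the inner grid search is re-fit within each outer fold,
# so the score reflects both hyperparameter selection and model fitting
nested_scores = cross_val_score(
    GridSearchCV(RandomForestRegressor(), param_grid=param_candidates),
    X_train, y_train, scoring='r2', cv=5)
print('Nested CV r2: {} +/- {}'.format(nested_scores.mean(), nested_scores.std()))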

In [43]:
results1 = pd.DataFrame(X_train)
results1.columns = sub_cols
results1['label']  = y_train
results1['pred']  = rf.predict(X_train)
results1['stage'] = 'Train'

In [44]:
results1.head()


Out[44]:
gasLimit_t gasUsed_t newContract avg_blocktime_6 avg_uncle_count_6 avg_txcnt_second_6 avg_gasUsed_t_6 avg_price_6 avg_uncle_count_60 avg_price_60 mv type_enc label pred stage
0 50000.0 21000.0 0.0 30.333333 0.000000 10.333333 48851.226918 30.109970 0.100000 29.814134 0.269151 2.0 10.861544 18.237229 Train
1 50000.0 21000.0 0.0 24.833333 0.000000 11.500000 31427.203082 22.623876 0.133333 25.257682 1.182141 2.0 12.959649 15.160349 Train
2 650000.0 518513.0 0.0 39.166667 0.000000 3.666667 34272.846260 27.709378 0.016667 24.240462 0.092250 2.0 7.762371 18.237229 Train
3 74318.0 37158.0 0.0 19.833333 0.000000 4.166667 70370.262842 29.253390 0.066667 26.272013 0.688410 2.0 22.216060 18.237229 Train
4 250000.0 45992.0 0.0 22.000000 0.166667 6.166667 138817.151832 19.691374 0.066667 23.750735 0.332509 2.0 8.783028 9.816403 Train

In [45]:
results2 = pd.DataFrame(X_test)
results2.columns = sub_cols
results2['label']  = y_test
results2['pred']  = rf.predict(X_test)
results2['stage'] = 'Test'

In [46]:
results2.head()


Out[46]:
gasLimit_t gasUsed_t newContract avg_blocktime_6 avg_uncle_count_6 avg_txcnt_second_6 avg_gasUsed_t_6 avg_price_6 avg_uncle_count_60 avg_price_60 mv type_enc label pred stage
0 1730520.0 41847.0 0.0 33.166667 0.000000 2.166667 44060.899025 25.589140 0.050000 26.094059 3.996689 0.0 4.013700 7.970963 Test
1 21000.0 21000.0 0.0 36.000000 0.166667 4.000000 48306.599942 30.763945 0.100000 24.730089 0.634915 2.0 24.167385 18.237229 Test
2 90000.0 21000.0 0.0 56.833333 0.000000 4.333333 52169.843132 36.520256 0.083333 32.719292 1.116790 2.0 51.044565 18.237229 Test
3 940000.0 29912.0 0.0 39.000000 0.000000 3.166667 29540.946698 26.630111 0.033333 24.008287 1.516613 2.0 10.110389 15.160349 Test
4 524747.0 437289.0 1.0 28.666667 0.000000 4.166667 55129.327656 18.702611 0.033333 23.785293 1.724892 1.0 8.752006 15.160349 Test

In [51]:
results = pd.concat([results1,results2], axis=0)

In [52]:
results.head()


Out[52]:
gasLimit_t gasUsed_t newContract avg_blocktime_6 avg_uncle_count_6 avg_txcnt_second_6 avg_gasUsed_t_6 avg_price_6 avg_uncle_count_60 avg_price_60 mv type_enc label pred stage
0 50000.0 21000.0 0.0 30.333333 0.000000 10.333333 48851.226918 30.109970 0.100000 29.814134 0.269151 2.0 10.861544 18.237229 Train
1 50000.0 21000.0 0.0 24.833333 0.000000 11.500000 31427.203082 22.623876 0.133333 25.257682 1.182141 2.0 12.959649 15.160349 Train
2 650000.0 518513.0 0.0 39.166667 0.000000 3.666667 34272.846260 27.709378 0.016667 24.240462 0.092250 2.0 7.762371 18.237229 Train
3 74318.0 37158.0 0.0 19.833333 0.000000 4.166667 70370.262842 29.253390 0.066667 26.272013 0.688410 2.0 22.216060 18.237229 Train
4 250000.0 45992.0 0.0 22.000000 0.166667 6.166667 138817.151832 19.691374 0.066667 23.750735 0.332509 2.0 8.783028 9.816403 Train

In [54]:
results['logPred']=results.pred.apply(np.log10)
results['logLabel']=results.label.apply(np.log10)

In [72]:
sns.lmplot(x="label", y="pred", hue="stage",truncate=True, size=10, data=results)


Out[72]:
<seaborn.axisgrid.FacetGrid at 0x1a59d1ad90>

In [76]:
sns.jointplot(x="label", y="pred", size=10, data=results)


Out[76]:
<seaborn.axisgrid.JointGrid at 0x1a32d14dd0>

In [55]:
sns.lmplot(x="logLabel", y="logPred", hue="stage",truncate=True, size=10, data=results)


Out[55]:
<seaborn.axisgrid.FacetGrid at 0x1a77b18d90>

In [77]:
sns.jointplot(x="logLabel", y="logPred", size=10, data=results)


Out[77]:
<seaborn.axisgrid.JointGrid at 0x1a341e0990>

In [ ]:
sns.jointplot(x="logLabel", y="logPred",kind="kde", size=10, data=results)

In [57]:
# signed error (prediction minus label) and error relative to the label
results['absoluteError'] = results.pred - results.label
results['relativeError'] = results.absoluteError / results.label

In [67]:
results.relativeError.hist(bins=np.arange(-1,4,.1))


Out[67]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a331e26d0>

In [71]:
results.absoluteError.hist(bins=np.arange(-50,50,1))


Out[71]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a59bf1990>

Plot results


In [73]:
sub_train.columns


Out[73]:
Index([u'gasLimit_t', u'gasUsed_t', u'newContract', u'avg_blocktime_6',
       u'avg_uncle_count_6', u'avg_txcnt_second_6', u'avg_gasUsed_t_6',
       u'avg_price_6', u'avg_uncle_count_60', u'avg_price_60', u'mv',
       u'type_enc'],
      dtype='object')

In [65]:
y_pred = rf.predict(X_test)

In [68]:
y_pred.shape, y_test.shape


Out[68]:
((349157,), (349157,))

In [80]:
results = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

In [ ]:
results['y_test_log'] = results['y_test'].apply(np.log10)

In [86]:
plt.scatter(results['y_test'].apply(np.log10), results['y_pred'].apply(np.log10))
#plt.xlim(0,100)
#plt.ylim(0,100)


Out[86]:
<matplotlib.collections.PathCollection at 0x116706450>

In [ ]:
sns.set(style="darkgrid", color_codes=True)
sns.jointplot(x="y_test", y="y_pred", data=results, marginal_kws=dict(bins=25)
plt.xlim(0, 100)
plt.tight_layout()
plt.savefig('./../images/jointplot.png', dpi=300)

Feature importances


In [91]:
def rf_regressor(X_train, X_test, y_train, y_test):
    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    scores = cross_val_score(rf, X_train, y_train, scoring='r2', cv=5)
    print('MSE: {}'.format(mean_squared_error(y_test, y_pred)))
    print('R2_score: {}'.format(r2_score(y_test, y_pred)))
    print('avg_CV_score: {}'.format(np.mean(scores)))
    # write predicted values to csv
    p = pd.DataFrame({'y_pred': y_pred})
    p.to_csv('./../data/label_pred.csv')
    return rf

In [92]:
model = rf_regressor(X_train, X_test, y_train, y_test)


MSE: 13.1884823058
R2_score: 0.987232856113
avg_CV_score: 0.983622505175

In [ ]:
def plot_feature_importance(rf, feature_df):
    # Impurity-based importances from the fitted random forest (mean decrease in impurity)
    feat_scores = pd.DataFrame({'Mean Decrease in Impurity': rf.feature_importances_},
                               index=feature_df.columns)
    feat_scores = feat_scores.sort_values(by='Mean Decrease in Impurity')
    feat_scores.plot(kind='barh', color='r', figsize=(6,6))
    plt.title('Feature Importance', fontsize=18)
    plt.tight_layout()
    plt.savefig('./../images/feat_import_pruned.png', dpi=300)

In [93]:
plot_feature_importance(model, sub_train)



In [ ]: