In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn import tree
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import QuantileTransformer, Normalizer, LabelEncoder
import statsmodels.api as sm
%matplotlib inline



In [2]:
train = pd.read_csv('./../data/training.csv')

In [3]:
label = pd.read_csv('./../data/labels.csv', header=None)

In [4]:
train.head()


Out[4]:
Unnamed: 0 gasLimit_t gasUsed_t newContract blockTime difficulty gasLimit_b gasUsed_b reward size ... avg_price_6 avg_blocktime_60 avg_gasUsed_b_60 avg_tx_count_60 avg_uncle_count_60 avg_difficulty_60 avg_txcnt_second_60 avg_gasUsed_t_60 avg_price_60 mv
0 0 50000 21000 0 10 2338428842518947 6732067 383005 5008232105000000000 2336 ... 24.523604 24.45 5.768627e+06 94.616667 0.15 2.332719e+15 7.4 59403.370662 23.908728 0.932255
1 1 20967 9416 0 10 2338428842518947 6732067 383005 5008232105000000000 2336 ... 24.523604 24.45 5.768627e+06 94.616667 0.15 2.332719e+15 7.4 59403.370662 23.908728 0.932255
2 2 50000 21000 0 10 2338428842518947 6732067 383005 5008232105000000000 2336 ... 24.523604 24.45 5.768627e+06 94.616667 0.15 2.332719e+15 7.4 59403.370662 23.908728 0.932255
3 3 50000 21000 0 10 2338428842518947 6732067 383005 5008232105000000000 2336 ... 24.523604 24.45 5.768627e+06 94.616667 0.15 2.332719e+15 7.4 59403.370662 23.908728 0.932255
4 4 50000 21000 0 10 2338428842518947 6732067 383005 5008232105000000000 2336 ... 24.523604 24.45 5.768627e+06 94.616667 0.15 2.332719e+15 7.4 59403.370662 23.908728 0.932255

5 rows × 39 columns


In [5]:
train.drop('Unnamed: 0', axis=1, inplace=True)

In [6]:
train.columns


Out[6]:
Index([u'gasLimit_t', u'gasUsed_t', u'newContract', u'blockTime',
       u'difficulty', u'gasLimit_b', u'gasUsed_b', u'reward', u'size', u'type',
       u'totalFee', u'amount_gwei', u'gasShare', u'gweiPaid', u'gweiPaid_b',
       u'gweiShare', u'free_t', u'day', u'hour', u'dayofweek', u'txcnt_second',
       u'avg_blocktime_6', u'avg_gasUsed_b_6', u'avg_tx_count_6',
       u'avg_uncle_count_6', u'avg_difficulty_6', u'avg_txcnt_second_6',
       u'avg_gasUsed_t_6', u'avg_price_6', u'avg_blocktime_60',
       u'avg_gasUsed_b_60', u'avg_tx_count_60', u'avg_uncle_count_60',
       u'avg_difficulty_60', u'avg_txcnt_second_60', u'avg_gasUsed_t_60',
       u'avg_price_60', u'mv'],
      dtype='object')

In [7]:
train['type'].head()


Out[7]:
0      tx
1    call
2      tx
3      tx
4      tx
Name: type, dtype: object

In [8]:
# label encode type 
le = LabelEncoder()
train['type_enc'] = le.fit_transform(train['type'])

In [9]:
train['type_enc'].value_counts()


Out[9]:
2    967103
0    401896
1     27627
Name: type_enc, dtype: int64
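
A quick sketch (using the fitted le from above) to see which original type each encoded integer corresponds to:

In [ ]:
dict(zip(le.classes_, le.transform(le.classes_)))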

In [10]:
label.columns = ['0', 'p_label2']

In [11]:
label.drop('0', axis=1, inplace=True)

In [12]:
train.dropna(inplace=True)

In [13]:
train.shape, label.shape


Out[13]:
((1396626, 39), (1396626, 1))

In [14]:
y_label = np.ravel(label)

Drop some features:

  • features that might cause data leakage
  • features we won't have access to as inputs at prediction time

In [15]:
train.columns


Out[15]:
Index([u'gasLimit_t', u'gasUsed_t', u'newContract', u'blockTime',
       u'difficulty', u'gasLimit_b', u'gasUsed_b', u'reward', u'size', u'type',
       u'totalFee', u'amount_gwei', u'gasShare', u'gweiPaid', u'gweiPaid_b',
       u'gweiShare', u'free_t', u'day', u'hour', u'dayofweek', u'txcnt_second',
       u'avg_blocktime_6', u'avg_gasUsed_b_6', u'avg_tx_count_6',
       u'avg_uncle_count_6', u'avg_difficulty_6', u'avg_txcnt_second_6',
       u'avg_gasUsed_t_6', u'avg_price_6', u'avg_blocktime_60',
       u'avg_gasUsed_b_60', u'avg_tx_count_60', u'avg_uncle_count_60',
       u'avg_difficulty_60', u'avg_txcnt_second_60', u'avg_gasUsed_t_60',
       u'avg_price_60', u'mv', u'type_enc'],
      dtype='object')

Note: there is an API that estimates the amount of gas a transaction will use, so we can treat gasUsed_t as an available input.
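
For context, a node can be asked for this estimate through the eth_estimateGas JSON-RPC call. A minimal sketch, assuming a hypothetical node endpoint and placeholder addresses (not part of this project's code):

In [ ]:
import json
import requests

NODE_URL = 'http://localhost:8545'  # hypothetical node endpoint

payload = {
    'jsonrpc': '2.0',
    'method': 'eth_estimateGas',
    'params': [{
        'from': '0x0000000000000000000000000000000000000001',  # placeholder sender
        'to': '0x0000000000000000000000000000000000000002',    # placeholder recipient
        'value': '0xde0b6b3a7640000'                           # 1 ether in wei
    }],
    'id': 1
}

resp = requests.post(NODE_URL, data=json.dumps(payload),
                     headers={'Content-Type': 'application/json'})
print(int(resp.json()['result'], 16))  # estimated gas for this transaction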


In [16]:
train.drop(['type', 
            'mv', 
            'blockTime', 
            'difficulty', 
            'gasLimit_b', 
            'gasUsed_b',
            'reward',
            'size',
            'totalFee',
            'gasShare',
            'gweiPaid',
            'gweiPaid_b',
            'gweiShare',
            'txcnt_second'], axis=1, inplace=True)

In [17]:
train.columns


Out[17]:
Index([u'gasLimit_t', u'gasUsed_t', u'newContract', u'amount_gwei', u'free_t',
       u'day', u'hour', u'dayofweek', u'avg_blocktime_6', u'avg_gasUsed_b_6',
       u'avg_tx_count_6', u'avg_uncle_count_6', u'avg_difficulty_6',
       u'avg_txcnt_second_6', u'avg_gasUsed_t_6', u'avg_price_6',
       u'avg_blocktime_60', u'avg_gasUsed_b_60', u'avg_tx_count_60',
       u'avg_uncle_count_60', u'avg_difficulty_60', u'avg_txcnt_second_60',
       u'avg_gasUsed_t_60', u'avg_price_60', u'type_enc'],
      dtype='object')

Prune out some more features


In [18]:
train.drop(['free_t',
           'newContract',
           'amount_gwei',
           'type_enc',
           'dayofweek',
           'day'], axis=1, inplace=True)

Split data into training and test sets


In [19]:
X = train.values
y = y_label
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [20]:
X_train.shape, y_train.shape


Out[20]:
((1047469, 19), (1047469,))

Random forest regressor


In [21]:
def rf_regressor(X_train, X_test, y_train, y_test):
    # fit a random forest with default hyperparameters
    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    # 5-fold cross-validated R^2 on the training set
    scores = cross_val_score(rf, X_train, y_train, scoring='r2', cv=5)
    print('MSE: {}'.format(mean_squared_error(y_test, y_pred)))
    print('R2_score: {}'.format(r2_score(y_test, y_pred)))
    print('avg_CV_score: {}'.format(np.mean(scores)))
    # write predicted values to csv so they can be reloaded later
    p = pd.DataFrame({'y_pred': y_pred})
    p.to_csv('./../data/label_pred.csv')
    return rf

In [ ]:
model = rf_regressor(X_train, X_test, y_train, y_test)

In [ ]:
def plot_feature_importance(rf, feature_df):
    # align importances with the column names of the feature DataFrame
    cols = list(feature_df.columns)
    feat_scores = pd.DataFrame({'Fraction of Samples Affected': rf.feature_importances_},
                               index=cols)
    feat_scores = feat_scores.sort_values(by='Fraction of Samples Affected')
    feat_scores.plot(kind='barh', color='r', figsize=(6, 6))
    plt.title('Feature Importance', fontsize=18)
    plt.tight_layout()
    plt.savefig('./../images/feat_import_pruned.png', dpi=300)

In [ ]:
plot_feature_importance(model, train)

Plot predicted values against labels


In [ ]:
y_pred = pd.read_csv('./../data/label_pred.csv')

In [ ]:
y_pred.drop('Unnamed: 0', axis=1, inplace=True)

In [ ]:
y_pred.head()

In [ ]:
y_test.shape

In [ ]:
y_pred = y_pred.values.ravel()

In [ ]:
y_test.shape, y_pred.shape

In [ ]:
result = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

In [ ]:
result.head()

In [ ]:
plt.scatter(result['y_test'], result['y_pred'])
plt.xlim(0,100)
plt.ylim(0,100)

In [ ]:
sns.set(style="darkgrid", color_codes=True)
sns.jointplot(x="y_test", y="y_pred", data=result)
plt.xlim(0, 100)
plt.tight_layout()
plt.savefig('./../images/jointplot.png', dpi=300)

It looks like the model is overfitting.
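
One way to confirm this is to compare in-sample and out-of-sample R^2; a large gap indicates overfitting. A quick check, assuming model is the fitted forest from above:

In [ ]:
# R^2 on the data the forest was trained on vs. on held-out data
print('train R2: {}'.format(model.score(X_train, y_train)))
print('test R2: {}'.format(model.score(X_test, y_test)))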

Plot model residuals


In [ ]:
sns.residplot(result.y_test, result.y_pred, lowess=True, color="g")

In [66]:
def linear_regression(X_train, X_test, y_train, y_test):
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    scores = cross_val_score(lr, X_train, y_train, scoring='r2', cv=5)
    print('MSE: {}'.format(mean_squared_error(y_test, y_pred)))
    print('R2_score: {}'.format(r2_score(y_test, y_pred)))
    print('avg_CV_score: {}'.format(np.mean(scores)))
    return lr

In [67]:
linear_regression(X_train, X_test, y_train, y_test)


MSE: 524.675830395
R2_score: 0.386641873865
avg_CV_score: 0.493296457359
Out[67]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [68]:
# get summary statistics from statsmodels
# note: sm.OLS uses the design matrix as-is (no intercept is added automatically)
model = sm.OLS(y_train, X_train)
result = model.fit()
result.summary()


Out[68]:
OLS Regression Results
Dep. Variable: y R-squared: 0.498
Model: OLS Adj. R-squared: 0.498
Method: Least Squares F-statistic: 1.155e+05
Date: Tue, 03 Oct 2017 Prob (F-statistic): 0.00
Time: 18:33:33 Log-Likelihood: -4.7605e+06
No. Observations: 1047469 AIC: 9.521e+06
Df Residuals: 1047459 BIC: 9.521e+06
Df Model: 9
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
x1 1.331e-07 6.06e-08 2.196 0.028 1.43e-08 2.52e-07
x2 -1.928e-06 1.63e-07 -11.845 0.000 -2.25e-06 -1.61e-06
x3 -0.0362 0.003 -10.705 0.000 -0.043 -0.030
x4 0.0708 0.002 28.366 0.000 0.066 0.076
x5 6.064e-07 3.15e-08 19.270 0.000 5.45e-07 6.68e-07
x6 -0.0270 0.001 -24.730 0.000 -0.029 -0.025
x7 -4.95e-05 1.72e-06 -28.735 0.000 -5.29e-05 -4.61e-05
x8 4.691e-16 3.53e-17 13.302 0.000 4e-16 5.38e-16
x9 -0.0077 0.004 -2.025 0.043 -0.015 -0.000
x10 -1.852e-06 5.1e-07 -3.636 0.000 -2.85e-06 -8.54e-07
x11 0.7153 0.001 862.578 0.000 0.714 0.717
x12 0.0160 0.000 35.579 0.000 0.015 0.017
x13 -1.938e-07 5.13e-08 -3.775 0.000 -2.94e-07 -9.32e-08
x14 0.0163 0.002 9.823 0.000 0.013 0.020
x15 -2.174e-05 4.68e-07 -46.486 0.000 -2.27e-05 -2.08e-05
x16 -3.034e-15 5.98e-17 -50.774 0.000 -3.15e-15 -2.92e-15
x17 0.0017 0.000 3.771 0.000 0.001 0.003
x18 2.406e-05 1.38e-06 17.486 0.000 2.14e-05 2.68e-05
x19 -0.0594 0.001 -41.299 0.000 -0.062 -0.057
Omnibus: 2151518.126 Durbin-Watson: 2.001
Prob(Omnibus): 0.000 Jarque-Bera (JB): 68067830551.167
Skew: 16.411 Prob(JB): 0.00
Kurtosis: 1251.406 Cond. No. 1.10e+17

All coefficients appear to be statistically significant (every p-value is below 0.05)
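
Note that sm.OLS fits the design matrix exactly as given, with no intercept term. If an intercept is wanted, the matrix can be augmented first; a minimal sketch:

In [ ]:
# variant with an explicit constant column prepended to the design matrix
X_train_const = sm.add_constant(X_train)
sm.OLS(y_train, X_train_const).fit().summary()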

Check the rank of the feature matrix


In [70]:
from numpy.linalg import matrix_rank

In [84]:
X_train.shape


Out[84]:
(1047469, 19)

In [71]:
matrix_rank(X_train)


Out[71]:
3

It appears that our feature matrix is numerically rank deficient and has a lot of collinearity. (The very different column scales also depress the rank reported by matrix_rank, whose default tolerance is relative to the largest singular value, but the enormous condition number in the OLS summary points to strong multicollinearity either way.)
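
To see where the collinearity comes from, one option is to inspect the pairwise correlations of the pruned feature set. A sketch using the train DataFrame from above (the 6- and 60-block rolling averages are likely candidates):

In [ ]:
# pairwise correlations; strongly correlated feature pairs show up as hot cells
corr = train.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr, cmap='coolwarm', center=0)
plt.tight_layout()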


In [ ]: