In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import QuantileTransformer, Normalizer
import statsmodels.api as sm
%matplotlib inline



Load data


In [2]:
num_prev_blocks = 50

In [3]:
df = pd.read_csv('./../data/data_filtered.csv')

In [4]:
df_avg = pd.read_csv('./../data/block_avg_{}.csv'.format(num_prev_blocks))

In [5]:
np.unique(df['block_id'].values).shape


Out[5]:
(14470,)

Merge data


In [6]:
df.columns


Out[6]:
Index([u'Unnamed: 0', u'hash_t', u'accountNonce', u'amount', u'block_id',
       u'gasLimit_t', u'gasUsed_t', u'newContract', u'price', u'time_t',
       u'txIndex', u'type', u'blockTime', u'difficulty', u'gasLimit_b',
       u'gasUsed_b', u'reward', u'size', u'time_b', u'totalFee', u'tx_count',
       u'uncle_count', u'amount_binary', u'price_gwei', u'day', u'hour',
       u'minute', u'second', u'dayofweek', u'amount_eth', u'type_enc',
       u'txcnt_second', u'avg_gasUsed_t_perblock', u'avg_price_perblock'],
      dtype='object')

In [7]:
df_avg.columns


Out[7]:
Index([u'Unnamed: 0', u'avg_blocktime', u'avg_gasUsed_b', u'avg_tx_count',
       u'avg_uncle_count', u'avg_difficulty', u'avg_txcnt_second',
       u'avg_gasUsed_t', u'avg_price', u'blockids'],
      dtype='object')

In [8]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [9]:
df_avg.drop('Unnamed: 0', axis=1, inplace=True)

In [10]:
df_avg.shape


Out[10]:
(14470, 9)

In [11]:
df.shape[1] + df_avg.shape[1]


Out[11]:
42

In [12]:
df_avg.head()


Out[12]:
avg_blocktime avg_gasUsed_b avg_tx_count avg_uncle_count avg_difficulty avg_txcnt_second avg_gasUsed_t avg_price blockids
0 NaN NaN NaN NaN NaN NaN NaN NaN 4286251
1 NaN NaN NaN NaN NaN NaN NaN NaN 4286252
2 NaN NaN NaN NaN NaN NaN NaN NaN 4286253
3 NaN NaN NaN NaN NaN NaN NaN NaN 4286254
4 NaN NaN NaN NaN NaN NaN NaN NaN 4286255

In [13]:
merged = pd.merge(df, df_avg, left_on='block_id', right_on='blockids')

In [14]:
merged.columns


Out[14]:
Index([u'hash_t', u'accountNonce', u'amount', u'block_id', u'gasLimit_t',
       u'gasUsed_t', u'newContract', u'price', u'time_t', u'txIndex', u'type',
       u'blockTime', u'difficulty', u'gasLimit_b', u'gasUsed_b', u'reward',
       u'size', u'time_b', u'totalFee', u'tx_count', u'uncle_count',
       u'amount_binary', u'price_gwei', u'day', u'hour', u'minute', u'second',
       u'dayofweek', u'amount_eth', u'type_enc', u'txcnt_second',
       u'avg_gasUsed_t_perblock', u'avg_price_perblock', u'avg_blocktime',
       u'avg_gasUsed_b', u'avg_tx_count', u'avg_uncle_count',
       u'avg_difficulty', u'avg_txcnt_second', u'avg_gasUsed_t', u'avg_price',
       u'blockids'],
      dtype='object')

In [15]:
merged.shape


Out[15]:
(629224, 42)

In [16]:
#find null values
for col in merged.columns:
    print(col, merged[col].isnull().sum())


('hash_t', 0)
('accountNonce', 0)
('amount', 0)
('block_id', 0)
('gasLimit_t', 0)
('gasUsed_t', 0)
('newContract', 0)
('price', 0)
('time_t', 0)
('txIndex', 534929)
('type', 0)
('blockTime', 0)
('difficulty', 0)
('gasLimit_b', 0)
('gasUsed_b', 0)
('reward', 0)
('size', 0)
('time_b', 0)
('totalFee', 0)
('tx_count', 0)
('uncle_count', 0)
('amount_binary', 0)
('price_gwei', 0)
('day', 0)
('hour', 0)
('minute', 0)
('second', 0)
('dayofweek', 0)
('amount_eth', 0)
('type_enc', 0)
('txcnt_second', 0)
('avg_gasUsed_t_perblock', 0)
('avg_price_perblock', 0)
('avg_blocktime', 4587)
('avg_gasUsed_b', 4587)
('avg_tx_count', 4587)
('avg_uncle_count', 4587)
('avg_difficulty', 4587)
('avg_txcnt_second', 4587)
('avg_gasUsed_t', 4587)
('avg_price', 4587)
('blockids', 0)

In [17]:
merged.drop('txIndex', axis=1, inplace=True)

In [18]:
merged.dropna(inplace=True)

In [19]:
#find null values
for col in merged.columns:
    print(col, merged[col].isnull().sum())


('hash_t', 0)
('accountNonce', 0)
('amount', 0)
('block_id', 0)
('gasLimit_t', 0)
('gasUsed_t', 0)
('newContract', 0)
('price', 0)
('time_t', 0)
('type', 0)
('blockTime', 0)
('difficulty', 0)
('gasLimit_b', 0)
('gasUsed_b', 0)
('reward', 0)
('size', 0)
('time_b', 0)
('totalFee', 0)
('tx_count', 0)
('uncle_count', 0)
('amount_binary', 0)
('price_gwei', 0)
('day', 0)
('hour', 0)
('minute', 0)
('second', 0)
('dayofweek', 0)
('amount_eth', 0)
('type_enc', 0)
('txcnt_second', 0)
('avg_gasUsed_t_perblock', 0)
('avg_price_perblock', 0)
('avg_blocktime', 0)
('avg_gasUsed_b', 0)
('avg_tx_count', 0)
('avg_uncle_count', 0)
('avg_difficulty', 0)
('avg_txcnt_second', 0)
('avg_gasUsed_t', 0)
('avg_price', 0)
('blockids', 0)

In [20]:
merged['price_gwei'].hist(bins=2000)
plt.xlim(0,100)


Out[20]:
(0, 100)

In [21]:
np.log(merged['price_gwei'].values)


Out[21]:
array([ 3.04452244,  3.04452244,  3.04452244, ...,  3.04452244,
        3.04452244,  3.40119738])

In [22]:
plt.scatter(np.log(merged['amount_eth'].values), np.log(merged['price_gwei'].values))


Out[22]:
<matplotlib.collections.PathCollection at 0x10a9a62d0>

In [23]:
merged['avg_blocktime'].hist()


Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a97f090>

In [24]:
plt.scatter(merged['avg_blocktime'], merged['price_gwei'])


Out[24]:
<matplotlib.collections.PathCollection at 0x10aa72790>

Select features for modeling


In [25]:
features = [
        'newContract',
        'day',
        'hour',
        'dayofweek',
        'amount_eth',
        'type_enc',
        'avg_price',
        'avg_blocktime',
        'avg_gasUsed_b',
        'avg_tx_count',
        'avg_uncle_count',
        'avg_difficulty',
        'avg_txcnt_second',
        'avg_gasUsed_t'
        ]
X = merged[features].values
y = merged['price_gwei'].values
X_train, X_test, y_train, y_test = train_test_split(X, y)

Scale features


In [26]:
#sklearn quantile transformer
quantile_transformer = QuantileTransformer(output_distribution='normal', random_state=0)
X_train_trans = quantile_transformer.fit_transform(X_train)
X_test_trans = quantile_transformer.transform(X_test)

In [27]:
# normalize each sample to unit norm; Normalizer is stateless, so a single
# instance can be reused instead of re-fitting on the test set
normalizer = Normalizer().fit(X_train)
X_train_norm = normalizer.transform(X_train)
X_test_norm = normalizer.transform(X_test)
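
The scaler and estimator can also be chained so the transformer is only ever fit on training data. This is a minimal sketch using sklearn's Pipeline; it is not a step that was run in this notebook.

In [ ]:
from sklearn.pipeline import Pipeline

# chain the quantile transform with a linear model; fit() fits the transformer
# on X_train only, and the transform is applied automatically at predict time
pipe = Pipeline([
    ('quantile', QuantileTransformer(output_distribution='normal', random_state=0)),
    ('lr', LinearRegression())
])
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)  # R^2 on the held-out split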

Linear regression


In [28]:
def linear_regression(X_train, X_test, y_train, y_test):
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    scores = cross_val_score(lr, X_train, y_train, scoring='r2', cv=5)
    print('MSE: {}'.format(mean_squared_error(y_test, y_pred)))
    print('R2_score: {}'.format(r2_score(y_test, y_pred)))
    print('avg_CV_score: {}'.format(np.mean(scores)))
    return lr

In [29]:
linear_regression(X_train, X_test, y_train, y_test)


MSE: 7588.98821218
R2_score: 0.0101372193301
avg_CV_score: 0.0103448109223
Out[29]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [30]:
# get summary statistics from statsmodels
model = sm.OLS(y_train, X_train)
result = model.fit()
result.summary()


Out[30]:
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.008
Model:                            OLS   Adj. R-squared:                  0.008
Method:                 Least Squares   F-statistic:                     963.5
Date:                Sat, 30 Sep 2017   Prob (F-statistic):               0.00
Time:                        11:48:11   Log-Likelihood:            -2.7416e+06
No. Observations:              468477   AIC:                         5.483e+06
Df Residuals:                  468472   BIC:                         5.483e+06
Df Model:                           4
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1         -1.916e-07   2.81e-08     -6.820      0.000   -2.47e-07   -1.37e-07
x2             0.0437      0.006      7.579      0.000       0.032       0.055
x3             0.1148      0.019      6.118      0.000       0.078       0.152
x4            -0.0187      0.003     -7.245      0.000      -0.024      -0.014
x5             0.0422      0.001     43.343      0.000       0.040       0.044
x6             0.0127      0.001     16.646      0.000       0.011       0.014
x7             0.8724      0.015     59.623      0.000       0.844       0.901
x8             0.2656      0.029      9.280      0.000       0.209       0.322
x9          7.307e-07   2.11e-07      3.471      0.001    3.18e-07    1.14e-06
x10            0.0043      0.006      0.674      0.500      -0.008       0.017
x11        -8.441e-05    2.7e-05     -3.126      0.002      -0.000   -3.15e-05
x12        -2.924e-15   2.96e-16     -9.870      0.000     -3.5e-15   -2.34e-15
x13           -0.1318      0.009    -14.199      0.000      -0.150      -0.114
x14        -5.309e-05   7.38e-06     -7.191      0.000   -6.76e-05   -3.86e-05
==============================================================================
Omnibus:                  1799321.142   Durbin-Watson:                   2.002
Prob(Omnibus):                  0.000   Jarque-Bera (JB):    1798231402406.238
Skew:                          90.467   Prob(JB):                         0.00
Kurtosis:                    9599.372   Cond. No.                     1.05e+18
==============================================================================
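
Note that sm.OLS does not add an intercept automatically, so the summary above is for a model fit without a constant term. If an intercept is wanted, a constant column has to be added explicitly; a small sketch (not run here):

In [ ]:
# statsmodels does not add an intercept by default; add_constant prepends a
# column of ones so the OLS fit includes a constant term
X_train_const = sm.add_constant(X_train)
result_const = sm.OLS(y_train, X_train_const).fit()
result_const.summary()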

In [31]:
for num, col in enumerate(merged[features].columns):
    print(num+1, col)


(1, 'newContract')
(2, 'day')
(3, 'hour')
(4, 'dayofweek')
(5, 'amount_eth')
(6, 'type_enc')
(7, 'avg_price')
(8, 'avg_blocktime')
(9, 'avg_gasUsed_b')
(10, 'avg_tx_count')
(11, 'avg_uncle_count')
(12, 'avg_difficulty')
(13, 'avg_txcnt_second')
(14, 'avg_gasUsed_t')

Plot the average MSE with respect to the number of previous blocks used

  • We need to determine how many previous blocks to average over in order to get predictive features

In [32]:
mse=[9906, 5689.59, 3902.42, 4946.9, 6474.36, 9032.47, 16197.49]
num_prev = [10, 25, 50, 100, 2000, 3000, 4000]
results = pd.DataFrame({'num_prev_blocks': num_prev, 'mse': mse})
sns.pointplot(x="num_prev_blocks", y="mse", data=results, color='r')
plt.title('Error with respect to number of previous blocks used')
plt.savefig('./../images/mse_prev_blocks.png')


The MSE appears to be lowest when averaging over the previous 50 blocks.
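
The MSE values plotted above appear to have been collected by hand from separate runs with different num_prev_blocks settings. A sweep like this could also be automated; the sketch below assumes a block_avg_{n}.csv file exists for each candidate window size (built the same way as block_avg_50.csv) and uses a LinearRegression stand-in, since the model behind the quoted values is not recorded here. It is not part of the notebook as run.

In [ ]:
# hypothetical sweep over candidate window sizes; assumes one block_avg_{n}.csv
# file per candidate n, produced the same way as block_avg_50.csv
sweep_mse = {}
for n in [10, 25, 50, 100, 2000, 3000, 4000]:
    df_avg_n = pd.read_csv('./../data/block_avg_{}.csv'.format(n)).drop('Unnamed: 0', axis=1)
    merged_n = pd.merge(df, df_avg_n, left_on='block_id', right_on='blockids')
    merged_n = merged_n.dropna(subset=features)  # the first n blocks have no averages
    X_n = merged_n[features].values
    y_n = merged_n['price_gwei'].values
    X_tr, X_te, y_tr, y_te = train_test_split(X_n, y_n, random_state=0)
    lr_n = LinearRegression().fit(X_tr, y_tr)
    sweep_mse[n] = mean_squared_error(y_te, lr_n.predict(X_te))
sweep_mse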

KNN regression


In [33]:
def knn_regressor(X_train, X_test, y_train, y_test):
    model = KNeighborsRegressor(n_neighbors=5, metric='cosine', weights='uniform')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('CV score: {} | MSE: {} | R^2: {}'.format(np.mean(cross_val_score(model, X_train, y_train)), 
                                                    mean_squared_error(y_test, y_pred), 
                                                    r2_score(y_test, y_pred)))
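
knn_regressor is defined but not invoked in this notebook; a possible call on the quantile-transformed features might look like the following.

In [ ]:
# not executed here; hypothetical invocation on the transformed features
knn_regressor(X_train_trans, X_test_trans, y_train, y_test)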

Random forest regressor


In [34]:
def rf_regressor(X_train, X_test, y_train, y_test):
    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    scores = cross_val_score(rf, X_train, y_train, scoring='r2', cv=5)
    print('MSE: {}'.format(mean_squared_error(y_test, y_pred)))
    print('R2_score: {}'.format(r2_score(y_test, y_pred)))
    print('avg_CV_score: {}'.format(np.mean(scores)))
    return rf

In [35]:
model = rf_regressor(X_train, X_test, y_train, y_test)


MSE: 7175.33647945
R2_score: 0.064091507167
avg_CV_score: 0.0494707596852
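
The random forest above is fit with default hyperparameters. A small grid search could be layered on top; this is a hedged sketch assuming sklearn's GridSearchCV, with an arbitrary illustrative grid rather than values tuned for this dataset, and it was not run here.

In [ ]:
from sklearn.model_selection import GridSearchCV

# small illustrative grid; parameter values are arbitrary, not tuned
param_grid = {'n_estimators': [10, 50, 100], 'max_depth': [None, 5, 10]}
grid = GridSearchCV(RandomForestRegressor(random_state=0), param_grid,
                    scoring='neg_mean_squared_error', cv=3)
grid.fit(X_train, y_train)
print(grid.best_params_, -grid.best_score_)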

In [36]:
def plot_feature_importance(rf, feature_df):
    cols = list(feature_df.columns)

    feat_scores = pd.DataFrame({'Fraction of Samples Affected' : rf.feature_importances_},
                           index=cols)
    feat_scores = feat_scores.sort_values(by='Fraction of Samples Affected')
    feat_scores.plot(kind='barh', color='r', figsize=(6,6))
    plt.xlabel('Importance', fontsize=18)
    plt.title('Feature Importance', fontsize=18)
    plt.savefig('./../images/feat_import_50.png')

In [37]:
plot_feature_importance(model, merged[features])


