In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as sp
from scipy.stats import norm
from scipy.stats import gamma
%matplotlib inline
In [2]:
df = pd.read_csv('./../data/clean_data.csv')
In [3]:
df.drop('Unnamed: 0', axis=1, inplace=True)
In [4]:
df['type'].value_counts()
Out[4]:
In [5]:
df['newContract'].value_counts()
Out[5]:
In [6]:
for col in df.columns:
    print(col, df[col].isnull().sum())
In [7]:
df.drop('mixDigest', axis=1, inplace=True)
In [8]:
df.dropna(inplace=True)
In [9]:
df.shape
Out[9]:
In [10]:
# drop cases where gasUsed_t is zero since no gas was used
print('exclude {} rows with zero gas used'.format(df[df['gasUsed_t'] == 0].values.shape[0]))
In [11]:
df = df[df['gasUsed_t'] != 0]
In [12]:
# transactions per second within each block
df['txcnt_second'] = df['tx_count'].values / df['blockTime'].values
# per-block mean gas used per transaction and mean gas price (gwei)
df['avg_gasUsed_t_perblock'] = df.groupby('block_id')['gasUsed_t'].transform('mean')
df['avg_price_perblock'] = df.groupby('block_id')['price_gwei'].transform('mean')
In [13]:
def rolling_avg(window_size):
    """Return block-level rolling averages over the given window size."""
    # one row per block for each block-level column
    price = df[['block_id', 'avg_price_perblock']].drop_duplicates().sort_values(
        'block_id', ascending=True)
    gasUsed_t = df[['block_id', 'avg_gasUsed_t_perblock']].drop_duplicates().sort_values(
        'block_id', ascending=True)
    txcnt_second = df[['block_id', 'txcnt_second']].drop_duplicates().sort_values(
        'block_id', ascending=True)
    tx_count = df[['block_id', 'tx_count']].drop_duplicates().sort_values(
        'block_id', ascending=True)
    gasUsed_b = df[['block_id', 'gasUsed_b']].drop_duplicates().sort_values(
        'block_id', ascending=True)
    uncle_count = df[['block_id', 'uncle_count']].drop_duplicates().sort_values(
        'block_id', ascending=True)
    difficulty = df[['block_id', 'difficulty']].drop_duplicates().sort_values(
        'block_id', ascending=True)
    blocktime = df[['block_id', 'blockTime']].drop_duplicates().sort_values(
        'block_id', ascending=True)
    # new dataframe holding the rolling averages
    # (named rolling_df so it does not shadow this function's name)
    rolling_df = pd.DataFrame()
    rolling_df['avg_blocktime'] = blocktime['blockTime'].rolling(window=window_size).mean()
    rolling_df['avg_gasUsed_b'] = gasUsed_b['gasUsed_b'].rolling(window=window_size).mean()
    rolling_df['avg_tx_count'] = tx_count['tx_count'].rolling(window=window_size).mean()
    rolling_df['avg_uncle_count'] = uncle_count['uncle_count'].rolling(window=window_size).mean()
    rolling_df['avg_difficulty'] = difficulty['difficulty'].rolling(window=window_size).mean()
    rolling_df['avg_txcnt_second'] = txcnt_second['txcnt_second'].rolling(window=window_size).mean()
    rolling_df['avg_gasUsed_t'] = gasUsed_t['avg_gasUsed_t_perblock'].rolling(window=window_size).mean()
    rolling_df['avg_price'] = price['avg_price_perblock'].rolling(window=window_size).mean()
    # insert block ids to merge on (aligned on the shared index)
    rolling_df['blockids'] = df['block_id'].drop_duplicates().sort_values(ascending=True)
    return rolling_df
In [14]:
num_blocks = [6, 60]
for num in num_blocks:
    df_rolling_avg = rolling_avg(num)
    df_rolling_avg.to_csv('./../data/block_avg_{}.csv'.format(num))
In [15]:
df_rolling_avg_6 = rolling_avg(6)
In [16]:
df_rolling_avg_60 = rolling_avg(60)
In [17]:
merged1 = pd.merge(df, df_rolling_avg_6, left_on='block_id', right_on='blockids')
In [18]:
merged2 = pd.merge(merged1, df_rolling_avg_60, left_on='block_id', right_on='blockids', suffixes=('_6', '_60'))
In [19]:
merged2.columns
Out[19]:
In [21]:
# miner value (mv): a transaction's share of gwei paid relative to its share of gas used
merged2['mv'] = merged2.gweiShare / merged2.gasShare
In [22]:
merged2['mv'].isnull().sum()
Out[22]:
In [23]:
merged2['mv'].describe()
Out[23]:
There are no zero miner values, but many values are close to zero.
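As a quick check (a minimal sketch; the 0.01 cutoff is an arbitrary choice), we can count how many miner values sit near zero:
In [ ]:
# count miner values below an arbitrary near-zero threshold
near_zero = (merged2['mv'] < 0.01).sum()
print('{} of {} miner values are below 0.01'.format(near_zero, len(merged2)))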
In [24]:
merged2.groupby('block_id')['mv'].count().head(6)
Out[24]:
In [25]:
merged2.groupby('block_id')['mv'].count().mean()
Out[25]:
There are only about 96 samples (transactions) per block on average.
In [26]:
print('max tx in block: {}, min tx in block: {}'.format(
    merged2.groupby('block_id')['mv'].count().max(),
    merged2.groupby('block_id')['mv'].count().min()))
So we create groupings of 6 blocks to increase the sample size.
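One way to see the effect (a minimal sketch; the integer-division bucketing below is only an illustration, not the rolling-average approach used earlier) is to pool miner values over consecutive 6-block groups:
In [ ]:
# pool transactions into consecutive 6-block buckets and compare the
# average number of miner-value samples per group with the per-block figure
block_bucket = merged2['block_id'] // 6
merged2.groupby(block_bucket)['mv'].count().mean()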
In [249]:
merged2['mv'].hist(bins=10000, label='Miner Values', histtype='stepfilled')
plt.xlim(-2, 10)
plt.xlabel('Miner Value')
plt.legend()
Out[249]:
In [250]:
# compute mean, variance, standard deviation
mu_hat = np.mean(merged2['mv'])
sigma_sq_hat = np.var(merged2['mv'])
sigma_hat = np.std(merged2['mv'])
print("Sample Mean: {0:1.3f}".format(mu_hat))
print("Sample Variance: {0:1.3f}".format(sigma_sq_hat))
print("Sample Standard Dev: {0:1.3f}".format(sigma_hat))
In [321]:
x = np.linspace(-10, 15, num=1000)
fig, ax = plt.subplots(1, 1, figsize=(8, 5))
ax.hist(merged2['mv'], normed=True, bins=10000, histtype='stepfilled', label='Samples')
ax.plot(x, norm.pdf(x, mu_hat,sigma_hat), 'r-', lw=3, label='PDF')
ax.axvline(x=np.percentile(norm.pdf(x, mu_hat,sigma_hat), 25), linestyle='--', label='25th percentile')
ax.set_xlim(-5,10)
ax.set_xlabel('Miner Values')
ax.legend()
Out[321]:
In [322]:
# compute 25th percentile
np.percentile(norm.pdf(x, mu_hat,sigma_hat), 25)
Out[322]:
Set mu to this value.
In [323]:
mu_normal = np.percentile(norm.pdf(x, mu_hat,sigma_hat), 25)
In [324]:
# method-of-moments estimates of the gamma shape (alpha) and rate (beta)
alpha = float(mu_hat ** 2) / sigma_sq_hat
beta = float(mu_hat) / sigma_sq_hat
x = np.linspace(-5, 10, num=1000)
fig, ax = plt.subplots(1, 1, figsize=(8, 5))
ax.hist(merged2['mv'], normed=True, bins=10000, histtype='stepfilled', label='Samples')
ax.plot(x, gamma.pdf(x, alpha), 'g-', lw=3, label='Gamma PDF')
ax.axvline(x=np.percentile(gamma.pdf(x, alpha), 25), linestyle='--', label='25th percentile')
ax.set_xlim(-5,10)
ax.set_xlabel('Miner Values')
ax.legend()
Out[324]:
In [325]:
# compute 25th percentile
np.percentile(gamma.pdf(x, alpha), 25)
Out[325]:
The gamma distribution appears to fit the empirical data better, but we get zero for the 25th percentile.
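Note that the percentile above is taken over the PDF values evaluated on the grid, not over the distribution itself. A minimal sketch of an alternative, fitting the gamma by maximum likelihood and reading off its quantile directly (the floc=0 constraint is an assumption):
In [ ]:
# fit shape and scale with the location fixed at zero, then take the
# distribution's own 25th percentile via the quantile (ppf) function
a_fit, loc_fit, scale_fit = gamma.fit(merged2['mv'], floc=0)
gamma.ppf(0.25, a_fit, loc=loc_fit, scale=scale_fit)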
In [336]:
mu_normal
Out[336]:
In [337]:
# label: mu times the block-level average price paid per unit of gas
merged2['p_label'] = mu_normal * (merged2.gweiPaid_b / merged2.gasUsed_b)
In [338]:
merged2['p_label'].hist(bins=3000)
plt.xlim(-0.1,1)
Out[338]:
If mu is higher, around 0.01, the label distribution looks approximately normal.
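To check that, a minimal sketch that simply substitutes 0.01 for mu_normal:
In [ ]:
# recompute the label with mu fixed at 0.01 and inspect its shape
p_label_alt = 0.01 * (merged2.gweiPaid_b / merged2.gasUsed_b)
p_label_alt.hist(bins=3000)
plt.xlim(-0.1, 1)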
In [333]:
merged2.columns
Out[333]:
In [334]:
# select candidate features for modeling
sel_cols = ['gasLimit_t',
'gasUsed_t',
'newContract',
'blockTime',
'difficulty',
'gasLimit_b',
'gasUsed_b',
'reward',
'size',
'totalFee',
'amount_gwei',
'gasShare',
'gweiPaid',
'gweiPaid_b',
'gweiShare',
'free_t',
'day',
'hour',
'dayofweek',
'txcnt_second',
'avg_blocktime_6',
'avg_gasUsed_b_6',
'avg_tx_count_6',
'avg_uncle_count_6',
'avg_difficulty_6',
'avg_txcnt_second_6',
'avg_gasUsed_t_6',
'avg_price_6',
'avg_blocktime_60',
'avg_gasUsed_b_60',
'avg_tx_count_60',
'avg_uncle_count_60',
'avg_difficulty_60',
'avg_txcnt_second_60',
'avg_gasUsed_t_60',
'avg_price_60',
'mv']
In [287]:
features = merged2[sel_cols]
In [288]:
features.to_csv('./../data/training.csv')
In [289]:
labels = merged2['p_label']
In [290]:
labels.to_csv('./../data/labels.csv')
In [291]:
# compute mean, variance, standard deviation
# NOTE: `samples` is not defined earlier in this notebook; assuming it refers to the miner values
samples = merged2['mv']
mu_hat = np.mean(samples)
sigma_sq_hat = np.var(samples)
sigma_hat = np.std(samples)
print("Sample Mean: {0:1.3f}".format(mu_hat))
print("Sample Variance: {0:1.3f}".format(sigma_sq_hat))
print("Sample Standard Dev: {0:1.3f}".format(sigma_hat))
In [292]:
x = np.linspace(-5, 8, num=250)
fig, ax = plt.subplots(1, 1, figsize=(8, 5))
ax.hist(samples, normed=True, bins=25, histtype='stepfilled', label='Samples')
ax.plot(x, norm.pdf(x, mu_hat,sigma_hat), 'r-', lw=3, label='PDF')
ax.axvline(x=np.percentile(norm.pdf(x, mu_hat,sigma_hat), 25), linestyle='--', label='25th percentile')
ax.set_xlim(-2,6)
ax.set_xlabel('Miner Values')
ax.legend()
Out[292]:
In [293]:
# compute 25th percentile
np.percentile(norm.pdf(x, mu_hat,sigma_hat), 25)
Out[293]:
In [ ]: