In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as sp
from scipy.stats import norm
from scipy.stats import gamma
%matplotlib inline
In [2]:
df = pd.read_csv('./../data/clean_data.csv')
In [3]:
df.drop('Unnamed: 0', axis=1, inplace=True)
In [4]:
df['type'].value_counts()
Out[4]:
In [5]:
df['newContract'].value_counts()
Out[5]:
In [6]:
for col in df.columns:
    print(col, df[col].isnull().sum())
In [7]:
df.drop('mixDigest', axis=1, inplace=True)
In [8]:
df.dropna(inplace=True)
In [9]:
df.shape
Out[9]:
In [10]:
# drop cases where gasUsed_t is zero since no gas was used
print('exclude {} rows with zero gas used'.format(df[df['gasUsed_t'] == 0].values.shape[0]))
In [11]:
df = df[df['gasUsed_t'] != 0]
In [12]:
# transactions per second within each block
df['txcnt_second'] = df['tx_count'].values / df['blockTime'].values
# per-block mean gas used per transaction and mean gas price (gwei)
df['avg_gasUsed_t_perblock'] = df.groupby('block_id')['gasUsed_t'].transform('mean')
df['avg_price_perblock'] = df.groupby('block_id')['price_gwei'].transform('mean')
In [13]:
def rolling_avg(window_size):
    """Return block-level rolling averages over the given window size."""
    # one row per block for each block-level column
    price = df[['block_id', 'avg_price_perblock']].drop_duplicates().sort_values(
        'block_id', ascending=True)
    gasUsed_t = df[['block_id', 'avg_gasUsed_t_perblock']].drop_duplicates().sort_values(
        'block_id', ascending=True)
    txcnt_second = df[['block_id', 'txcnt_second']].drop_duplicates().sort_values(
        'block_id', ascending=True)
    tx_count = df[['block_id', 'tx_count']].drop_duplicates().sort_values(
        'block_id', ascending=True)
    gasUsed_b = df[['block_id', 'gasUsed_b']].drop_duplicates().sort_values(
        'block_id', ascending=True)
    uncle_count = df[['block_id', 'uncle_count']].drop_duplicates().sort_values(
        'block_id', ascending=True)
    difficulty = df[['block_id', 'difficulty']].drop_duplicates().sort_values(
        'block_id', ascending=True)
    blocktime = df[['block_id', 'blockTime']].drop_duplicates().sort_values(
        'block_id', ascending=True)
    # new dataframe holding the rolling averages
    # (named rolling_df so it does not shadow this function's name)
    rolling_df = pd.DataFrame()
    rolling_df['avg_blocktime'] = blocktime['blockTime'].rolling(window=window_size).mean()
    rolling_df['avg_gasUsed_b'] = gasUsed_b['gasUsed_b'].rolling(window=window_size).mean()
    rolling_df['avg_tx_count'] = tx_count['tx_count'].rolling(window=window_size).mean()
    rolling_df['avg_uncle_count'] = uncle_count['uncle_count'].rolling(window=window_size).mean()
    rolling_df['avg_difficulty'] = difficulty['difficulty'].rolling(window=window_size).mean()
    rolling_df['avg_txcnt_second'] = txcnt_second['txcnt_second'].rolling(window=window_size).mean()
    rolling_df['avg_gasUsed_t'] = gasUsed_t['avg_gasUsed_t_perblock'].rolling(window=window_size).mean()
    rolling_df['avg_price'] = price['avg_price_perblock'].rolling(window=window_size).mean()
    # insert block ids to merge on (aligned on the shared index)
    rolling_df['blockids'] = df['block_id'].drop_duplicates().sort_values(ascending=True)
    return rolling_df
In [14]:
num_blocks = [6, 60]
for num in num_blocks:
    df_rolling_avg = rolling_avg(num)
    df_rolling_avg.to_csv('./../data/block_avg_{}.csv'.format(num))
In [15]:
df_rolling_avg_6 = rolling_avg(6)
In [16]:
df_rolling_avg_60 = rolling_avg(60)
In [17]:
merged1 = pd.merge(df, df_rolling_avg_6, left_on='block_id', right_on='blockids')
In [18]:
merged2 = pd.merge(merged1, df_rolling_avg_60, left_on='block_id', right_on='blockids', suffixes=('_6', '_60'))
In [19]:
merged2.columns
Out[19]:
In [21]:
# miner value (mv): a transaction's share of gwei paid relative to its share of gas used
merged2['mv'] = merged2.gweiShare / merged2.gasShare
In [22]:
merged2['mv'].isnull().sum()
Out[22]:
In [23]:
merged2['mv'].describe()
Out[23]:
There are no zero miner values, but many values are close to zero.
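As a quick check (a minimal sketch; the 0.01 cutoff is an arbitrary choice), we can count how many miner values sit near zero:
In [ ]:
# count miner values below an arbitrary near-zero threshold
near_zero = (merged2['mv'] < 0.01).sum()
print('{} of {} miner values are below 0.01'.format(near_zero, len(merged2)))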
In [24]:
merged2.groupby('block_id')['mv'].count().head(6)
Out[24]:
In [25]:
merged2.groupby('block_id')['mv'].count().mean()
Out[25]:
There are only about 96 samples (transactions) per block on average.
In [26]:
print('max tx in block: {}, min tx in block: {}'.format(
    merged2.groupby('block_id')['mv'].count().max(),
    merged2.groupby('block_id')['mv'].count().min()))
So we create groupings of 6 blocks to increase the sample size.
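One way to see the effect (a minimal sketch; the integer-division bucketing below is only an illustration, not the rolling-average approach used earlier) is to pool miner values over consecutive 6-block groups:
In [ ]:
# pool transactions into consecutive 6-block buckets and compare the
# average number of miner-value samples per group with the per-block figure
block_bucket = merged2['block_id'] // 6
merged2.groupby(block_bucket)['mv'].count().mean()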
In [249]:
merged2['mv'].hist(bins=10000, label='Miner Values', histtype='stepfilled')
plt.xlim(-2, 10)
plt.xlabel('Miner Value')
plt.legend()
Out[249]:
In [250]:
# compute mean, variance, standard deviation
mu_hat = np.mean(merged2['mv'])
sigma_sq_hat = np.var(merged2['mv'])
sigma_hat = np.std(merged2['mv'])
print("Sample Mean: {0:1.3f}".format(mu_hat))
print("Sample Variance: {0:1.3f}".format(sigma_sq_hat))
print("Sample Standard Dev: {0:1.3f}".format(sigma_hat))
In [321]:
x = np.linspace(-10, 15, num=1000)
fig, ax = plt.subplots(1, 1, figsize=(8, 5))
ax.hist(merged2['mv'], normed=True, bins=10000, histtype='stepfilled', label='Samples')
ax.plot(x, norm.pdf(x, mu_hat,sigma_hat), 'r-', lw=3, label='PDF')
ax.axvline(x=np.percentile(norm.pdf(x, mu_hat,sigma_hat), 25), linestyle='--', label='25th percentile')
ax.set_xlim(-5,10)
ax.set_xlabel('Miner Values')
ax.legend()
Out[321]:
In [322]:
# compute 25th percentile
np.percentile(norm.pdf(x, mu_hat,sigma_hat), 25)
Out[322]:
Set mu to this value.
In [323]:
mu_normal = np.percentile(norm.pdf(x, mu_hat,sigma_hat), 25)
In [324]:
# method-of-moments estimates of the gamma shape (alpha) and rate (beta)
alpha = float(mu_hat ** 2) / sigma_sq_hat
beta = float(mu_hat) / sigma_sq_hat
x = np.linspace(-5, 10, num=1000)
fig, ax = plt.subplots(1, 1, figsize=(8, 5))
ax.hist(merged2['mv'], normed=True, bins=10000, histtype='stepfilled', label='Samples')
ax.plot(x, gamma.pdf(x, alpha), 'g-', lw=3, label='Gamma PDF')
ax.axvline(x=np.percentile(gamma.pdf(x, alpha), 25), linestyle='--', label='25th percentile')
ax.set_xlim(-5,10)
ax.set_xlabel('Miner Values')
ax.legend()
Out[324]:
In [325]:
# compute 25th percentile
np.percentile(gamma.pdf(x, alpha), 25)
Out[325]:
The gamma distribution appears to fit the empirical data better, but we get zero for the 25th percentile.
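Note that the percentile above is taken over the PDF values evaluated on the grid, not over the distribution itself. A minimal sketch of an alternative, fitting the gamma by maximum likelihood and reading off its quantile directly (the floc=0 constraint is an assumption):
In [ ]:
# fit shape and scale with the location fixed at zero, then take the
# distribution's own 25th percentile via the quantile (ppf) function
a_fit, loc_fit, scale_fit = gamma.fit(merged2['mv'], floc=0)
gamma.ppf(0.25, a_fit, loc=loc_fit, scale=scale_fit)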
In [336]:
mu_normal
Out[336]:
In [337]:
# label: mu times the block-level average price paid per unit of gas
merged2['p_label'] = mu_normal * (merged2.gweiPaid_b / merged2.gasUsed_b)
In [338]:
merged2['p_label'].hist(bins=3000)
plt.xlim(-0.1,1)
Out[338]:
If mu is higher, around 0.01, the label distribution looks approximately normal.
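To check that, a minimal sketch that simply substitutes 0.01 for mu_normal:
In [ ]:
# recompute the label with mu fixed at 0.01 and inspect its shape
p_label_alt = 0.01 * (merged2.gweiPaid_b / merged2.gasUsed_b)
p_label_alt.hist(bins=3000)
plt.xlim(-0.1, 1)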
In [333]:
merged2.columns
Out[333]:
In [334]:
# select candidate features for modeling
sel_cols = ['gasLimit_t',
'gasUsed_t',
'newContract',
'blockTime',
'difficulty',
'gasLimit_b',
'gasUsed_b',
'reward',
'size',
'totalFee',
'amount_gwei',
'gasShare',
'gweiPaid',
'gweiPaid_b',
'gweiShare',
'free_t',
'day',
'hour',
'dayofweek',
'txcnt_second',
'avg_blocktime_6',
'avg_gasUsed_b_6',
'avg_tx_count_6',
'avg_uncle_count_6',
'avg_difficulty_6',
'avg_txcnt_second_6',
'avg_gasUsed_t_6',
'avg_price_6',
'avg_blocktime_60',
'avg_gasUsed_b_60',
'avg_tx_count_60',
'avg_uncle_count_60',
'avg_difficulty_60',
'avg_txcnt_second_60',
'avg_gasUsed_t_60',
'avg_price_60',
'mv']
In [287]:
features = merged2[sel_cols]
In [288]:
features.to_csv('./../data/training.csv')
In [289]:
labels = merged2['p_label']
In [290]:
labels.to_csv('./../data/labels.csv')
In [291]:
# compute mean, variance, standard deviation
# NOTE: `samples` is not defined earlier in this notebook; assuming it refers to the miner values
samples = merged2['mv']
mu_hat = np.mean(samples)
sigma_sq_hat = np.var(samples)
sigma_hat = np.std(samples)
print("Sample Mean: {0:1.3f}".format(mu_hat))
print("Sample Variance: {0:1.3f}".format(sigma_sq_hat))
print("Sample Standard Dev: {0:1.3f}".format(sigma_hat))
In [292]:
x = np.linspace(-5, 8, num=250)
fig, ax = plt.subplots(1, 1, figsize=(8, 5))
ax.hist(samples, normed=True, bins=25, histtype='stepfilled', label='Samples')
ax.plot(x, norm.pdf(x, mu_hat,sigma_hat), 'r-', lw=3, label='PDF')
ax.axvline(x=np.percentile(norm.pdf(x, mu_hat,sigma_hat), 25), linestyle='--', label='25th percentile')
ax.set_xlim(-2,6)
ax.set_xlabel('Miner Values')
ax.legend()
Out[292]:
In [293]:
# compute 25th percentile
np.percentile(norm.pdf(x, mu_hat,sigma_hat), 25)
Out[293]:
In [ ]: