In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as sp
from scipy.stats import norm
from scipy.stats import gamma
%matplotlib inline
In [2]:
df = pd.read_csv('./../data/clean_data.csv')
In [3]:
df.drop('Unnamed: 0', axis=1, inplace=True)
In [4]:
df['type'].value_counts()
Out[4]:
In [5]:
df['newContract'].value_counts()
Out[5]:
In [6]:
for col in df.columns:
print(col, df[col].isnull().sum())
In [7]:
df.drop('mixDigest', axis=1, inplace=True)
In [8]:
df.dropna(inplace=True)
In [9]:
df.shape
Out[9]:
In [10]:
df.head()
Out[10]:
In [11]:
# drop cases where gasUsed_t is zero since no gas was used
print('exclude {} rows with zero gas used'.format(df[df['gasUsed_t'] == 0].values.shape[0]))
In [12]:
df = df[df['gasUsed_t'] != 0]
In [13]:
df['txcnt_second'] = df['tx_count'].values / df['blockTime'].values
df['avg_gasUsed_t_perblock'] = df.groupby('block_id')['gasUsed_t'].transform('mean')
df['avg_price_perblock'] = df.groupby('block_id')['price_gwei'].transform('mean')
In [14]:
def rolling_avg(window_size):
price = df[['block_id', 'avg_price_perblock']].drop_duplicates().sort_values(
'block_id', ascending=True)
gasUsed_t = df[['block_id', 'avg_gasUsed_t_perblock']].drop_duplicates().sort_values(
'block_id', ascending=True)
txcnt_second = df[['block_id', 'txcnt_second']].drop_duplicates().sort_values(
'block_id', ascending=True)
tx_count = df[['block_id', 'tx_count']].drop_duplicates().sort_values(
'block_id', ascending=True)
gasUsed_b = df[['block_id', 'gasUsed_b']].drop_duplicates().sort_values(
'block_id', ascending=True)
uncle_count = df[['block_id', 'uncle_count']].drop_duplicates().sort_values(
'block_id', ascending=True)
difficulty = df[['block_id', 'difficulty']].drop_duplicates().sort_values(
'block_id', ascending=True)
blocktime = df[['block_id', 'blockTime']].drop_duplicates().sort_values(
'block_id', ascending=True)
# create new pandas dataframe with average values
rolling_avg = pd.DataFrame()
# calculate rolling averages
rolling_avg['avg_blocktime'] = blocktime['blockTime'].rolling(window=window_size).mean()
rolling_avg['avg_gasUsed_b'] = gasUsed_b['gasUsed_b'].rolling(window=window_size).mean()
rolling_avg['avg_tx_count'] = tx_count['tx_count'].rolling(window=window_size).mean()
rolling_avg['avg_uncle_count'] = uncle_count['uncle_count'].rolling(window=window_size).mean()
rolling_avg['avg_difficulty'] = difficulty['difficulty'].rolling(window=window_size).mean()
rolling_avg['avg_txcnt_second'] = txcnt_second['txcnt_second'].rolling(window=window_size).mean()
rolling_avg['avg_gasUsed_t'] = gasUsed_t['avg_gasUsed_t_perblock'].rolling(window=window_size).mean()
rolling_avg['avg_price'] = price['avg_price_perblock'].rolling(window=window_size).mean()
# insert blockids to merge on
rolling_avg['blockids'] = df['block_id'].drop_duplicates().sort_values(ascending=True)
return rolling_avg
In [15]:
num_blocks = [6, 60]
for num in num_blocks:
df_rolling_avg = rolling_avg(num)
df_rolling_avg.to_csv('./../data/block_avg_{}.csv'.format(num))
In [16]:
df_rolling_avg_6 = rolling_avg(6)
In [17]:
df_rolling_avg_60 = rolling_avg(60)
In [18]:
merged1 = pd.merge(df, df_rolling_avg_6, left_on='block_id', right_on='blockids')
In [19]:
merged2 = pd.merge(merged1, df_rolling_avg_60, left_on='block_id', right_on='blockids', suffixes=('_6', '_60'))
In [20]:
merged2.columns
Out[20]:
In [21]:
for col in merged2.columns:
print(col, merged2[col].isnull().sum())
In [22]:
merged2.dropna(inplace=True)
In [23]:
merged2['mv'] = merged2.gweiShare / merged2.gasShare
In [24]:
merged2['mv'].isnull().sum()
Out[24]:
In [25]:
merged2['mv'].describe()
Out[25]:
In [26]:
alpha = .25
mu= merged2.mv.quantile(alpha)
In [27]:
merged2.mv.apply(np.log10).hist(bins=100)
plt.xlim([-2,2])
ylims=plt.gca().get_ylim()
plt.vlines(np.log10(mu), ylims[0], ylims[1], 'r' )
Out[27]:
In [28]:
merged2.mv.hist(bins=np.arange(0,10,.20))
ylims=plt.gca().get_ylim()
plt.vlines(mu, ylims[0], ylims[1], 'r' )
Out[28]:
In [29]:
merged2.mv.hist(bins=np.arange(0,10,.20), color = 'k', alpha=0.5, histtype='stepfilled',
label='Miner Values')
ylims=plt.gca().get_ylim()
plt.vlines(mu, ylims[0], ylims[1], 'r', linestyle='--')
plt.title('Distribution of miner values', fontsize=18)
plt.legend()
plt.tight_layout()
plt.savefig('./../images/mv_dist.png', dpi=300)
In [30]:
mu
Out[30]:
In [31]:
merged2['p_label'] = mu* (merged2.gweiPaid_b / merged2.gasUsed_b)
In [32]:
merged2['p_label'].hist(bins=np.arange(0,50,2), color = 'b', alpha=0.7, histtype='stepfilled',
label='New Label')
merged2['price_gwei'].hist(bins=np.arange(0,50,.5), color = 'r', alpha=0.7,
histtype='stepfilled', label='Price')
plt.title('Constructed Label', fontsize=18)
plt.legend()
plt.tight_layout()
In [33]:
merged2['p_label2'] = mu*merged2.gweiPaid_b/(merged2.gasUsed_b+merged2.gasUsed_t*(1-mu))
In [34]:
merged2.p_label2.describe()
Out[34]:
In [35]:
merged2['p_label2'].hist(bins=np.arange(0,50,2), color = 'b', alpha=0.7, histtype='stepfilled',
label='New Label')
merged2['price_gwei'].hist(bins=np.arange(0,50,.5), color = 'r', alpha=0.7, histtype='stepfilled',
label='Price')
plt.title('Constructed Label', fontsize=16)
plt.legend()
plt.tight_layout()
plt.savefig('./../images/label.png', dpi=300)
Look our method smoothed the prices out!
In [36]:
merged2.columns
Out[36]:
In [37]:
# select candidate features for modeling
sel_cols = ['gasLimit_t',
'gasUsed_t',
'newContract',
'blockTime',
'difficulty',
'gasLimit_b',
'gasUsed_b',
'reward',
'size',
'type',
'totalFee',
'amount_gwei',
'gasShare',
'gweiPaid',
'gweiPaid_b',
'gweiShare',
'free_t',
'day',
'hour',
'dayofweek',
'txcnt_second',
'avg_blocktime_6',
'avg_gasUsed_b_6',
'avg_tx_count_6',
'avg_uncle_count_6',
'avg_difficulty_6',
'avg_txcnt_second_6',
'avg_gasUsed_t_6',
'avg_price_6',
'avg_blocktime_60',
'avg_gasUsed_b_60',
'avg_tx_count_60',
'avg_uncle_count_60',
'avg_difficulty_60',
'avg_txcnt_second_60',
'avg_gasUsed_t_60',
'avg_price_60',
'mv']
In [38]:
features = merged2[sel_cols]
In [39]:
features.to_csv('./../data/training.csv')
In [40]:
labels = merged2['p_label2']
In [41]:
labels.to_csv('./../data/labels.csv')
In [ ]: