In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
More information on gas costs: http://ethdocs.org/en/latest/contracts-and-transactions/account-types-gas-and-transactions.html
In [2]:
df = pd.read_csv('./../data/data.csv')
In [3]:
df.shape
Out[3]:
In [4]:
df.info()
In [5]:
# Report the number of rows and the number of distinct block ids.
n_txs = df.shape[0]
n_blocks = np.unique(df['block_id'].values).shape[0]
print('no txs: {}, no blocks: {}'.format(n_txs, n_blocks))
In [6]:
# Parse the `time_t` column into datetimes. Plain column assignment replaces
# the column together with its dtype; `df.loc[:, col] = ...` writes into the
# existing column on pandas >= 2.0, which would leave it with object dtype.
df['time_t'] = pd.to_datetime(df['time_t'], yearfirst=True)
In [7]:
# Parse the `time_b` column into datetimes. Plain column assignment replaces
# the column together with its dtype; `df.loc[:, col] = ...` writes into the
# existing column on pandas >= 2.0, which would leave it with object dtype.
df['time_b'] = pd.to_datetime(df['time_b'], yearfirst=True)
In [8]:
df['time_t'].head()
Out[8]:
In [9]:
# Remove the 'Unnamed: 0' column (presumably an index written out by an earlier
# to_csv -- verify). errors='ignore' keeps this cell re-runnable once the
# column is already gone instead of raising KeyError.
df.drop('Unnamed: 0', axis=1, inplace=True, errors='ignore')
Filter each string extracting the digits as integers
In [10]:
# Try the digit extraction on the first 'difficulty' value.
# filter() returns an iterator on Python 3, so int(filter(...)) raises
# TypeError there; joining the kept characters works on Python 2 and 3.
int(''.join(ch for ch in df['difficulty'][0] if ch.isdigit()))
Out[10]:
In [11]:
# Keep only the digit characters of each 'difficulty' string and parse them as
# one integer. The characters are joined explicitly because filter() returns
# an iterator on Python 3, which int() cannot consume.
df['difficulty'] = df['difficulty'].apply(
    lambda text: int(''.join(ch for ch in text if ch.isdigit())))
In [12]:
df['difficulty'].head()
Out[12]:
In [13]:
# Keep only the digit characters of each 'reward' string and parse them as
# one integer. The characters are joined explicitly because filter() returns
# an iterator on Python 3, which int() cannot consume.
df['reward'] = df['reward'].apply(
    lambda text: int(''.join(ch for ch in text if ch.isdigit())))
In [14]:
df['reward'].head()
Out[14]:
In [15]:
# Keep only the digit characters of each 'totalFee' string and parse them as
# one integer. The characters are joined explicitly because filter() returns
# an iterator on Python 3, which int() cannot consume.
df['totalFee'] = df['totalFee'].apply(
    lambda text: int(''.join(ch for ch in text if ch.isdigit())))
In [16]:
df['totalFee'].head()
Out[16]:
In [17]:
def float_to_int(col_list, frame=None):
    """Round float columns to the nearest integer and store them as ints.

    Parameters
    ----------
    col_list : iterable of str
        Names of the columns to convert.
    frame : pandas.DataFrame, optional
        Frame whose columns are overwritten. When omitted, the notebook-level
        ``df`` is used, so existing one-argument calls behave as before.

    Returns
    -------
    None
        The listed columns are replaced in place on the frame.

    Notes
    -----
    ``np.rint`` rounds exact halves to the nearest even integer. NaN entries
    get no special treatment; clean them before calling this function.
    """
    target = df if frame is None else frame
    for col in col_list:
        # Round the whole column in one NumPy call rather than element by element.
        target[col] = np.rint(target[col].values).astype(int)
In [18]:
float_to_int(['amount', 'price', 'gasLimit_b', 'gasUsed_b'])
In [19]:
df['isContractTx'].isnull().sum()
Out[19]:
The column "isContractTx" is empty so drop it
In [20]:
# Drop the all-null 'isContractTx' column. errors='ignore' keeps this cell
# re-runnable once the column is already gone instead of raising KeyError.
df.drop('isContractTx', axis=1, inplace=True, errors='ignore')
Binarize the amount column (1 if > 0 and 0 if 0)
In [21]:
# Flag rows with a strictly positive amount: 1 if amount > 0, otherwise 0.
df['amount_binary'] = (df['amount'] > 0).astype(int)
In [22]:
# List the column names, one per line.
for column_name in df.columns:
    print(column_name)
In [23]:
# Rescale price and amount by 1e9 into the *_gwei columns
# (assumes the raw values are denominated in wei -- TODO confirm).
GWEI_SCALE = 1e9
for raw_col in ('price', 'amount'):
    df[raw_col + '_gwei'] = df[raw_col] / GWEI_SCALE
In [24]:
df[['price_gwei', 'gasUsed_t','gasUsed_b','difficulty' ]].describe()
Out[24]:
In [25]:
# gasShare: gasUsed_t as a fraction of gasUsed_b.
# gweiPaid: gasUsed_t multiplied by the gwei price.
df['gasShare'] = df['gasUsed_t'].div(df['gasUsed_b'])
df['gweiPaid'] = df['gasUsed_t'].mul(df['price_gwei'])
In [26]:
# Sum gweiPaid per block and expose it as {block_id: {'gweiPaid': total}}.
block_totals = df.groupby('block_id')[['gweiPaid']].sum()
gweiDict = block_totals.T.to_dict()
In [27]:
# Total gweiPaid of the block each row belongs to. transform('sum') broadcasts
# the per-block sum back onto the rows in one vectorised step, avoiding a
# Python-level dictionary lookup for every row.
df['gweiPaid_b'] = df.groupby('block_id')['gweiPaid'].transform('sum')
In [28]:
# Each row's gweiPaid as a fraction of its block total (gweiPaid_b).
df['gweiShare'] = df['gweiPaid'].div(df['gweiPaid_b'])
In [29]:
# 1 where the row used no gas at all, otherwise 0.
df['free_t'] = (df['gasUsed_t'] == 0).astype(int)
In [30]:
df[['block_id', 'free_t']].groupby('block_id').mean().hist(bins=50)
Out[30]:
In [31]:
# Share of zero-gas rows per event type, saved as a bar chart.
free_by_type = df.groupby('type')[['free_t']].mean()
free_by_type.plot(kind='bar', color='g')
plt.title('Free Transactions')
plt.xlabel('')
plt.tight_layout()
plt.savefig('./../images/free_transactions.png', dpi=300)
Suicide events appear to always be free, while create and tx events always have a cost. About 20% of call events are free, but none of the create events are. More details below.
In [32]:
# Number of rows per event type (non-null hash_t count), as a pie chart.
type_counts = df.groupby('type')['hash_t'].count()
type_counts.plot(kind='pie')
Out[32]:
In [33]:
#fees paid by type
# Only gweiPaid is aggregated: pushing hash_t through sum() as well is wasted
# work (on recent pandas it would concatenate the values if they are strings).
df.groupby('type')['gweiPaid'].sum().plot(kind='pie')
plt.title('Total Fee Paid')
plt.ylabel('')
plt.tight_layout()
plt.savefig('./../images/pie_gweipaid.png', dpi=300)
In [34]:
#gas used by type
# Only gasUsed_t is aggregated: pushing hash_t through sum() as well is wasted
# work (on recent pandas it would concatenate the values if they are strings).
df.groupby('type')['gasUsed_t'].sum().plot(kind='pie')
plt.title('Gas Used')
plt.ylabel('')
plt.tight_layout()
plt.savefig('./../images/pie_gasused.png', dpi=300)
In [35]:
# Rows of type 'tx' with a positive amount, plus log10 versions of the
# amount, price and gas-used columns.
is_valued_tx = (df['type'] == 'tx') & (df['amount_gwei'] > 0)
txdf = df[is_valued_tx].copy()
log_columns = [
    ('logGweiAmount', 'amount_gwei'),
    ('logGweiPrice', 'price_gwei'),
    ('logGasUsed', 'gasUsed_t'),
]
for log_name, source_name in log_columns:
    txdf[log_name] = np.log10(txdf[source_name])
In [36]:
txdf[['amount_gwei','price_gwei','gasUsed_t']].describe()
Out[36]:
In [37]:
txdf[['logGweiAmount','logGweiPrice','logGasUsed']].describe()
Out[37]:
In [38]:
#sns.pairplot(txdf[['logGweiAmount','logGweiPrice','logGasUsed']])
The tx type is the only one for which the amount is non-zero. Moving forward we set the amount aside and focus on the amount of gas used and the price paid for that gas.
In [39]:
# Histogram of log10(gasUsed_t) over the rows that used some gas.
positive_gas = df.loc[df['gasUsed_t'] > 0, 'gasUsed_t']
np.log10(positive_gas).hist(bins=40)
Out[39]:
In [40]:
# Overlaid histograms of log10(gasUsed_t), one per event type, restricted to
# rows that used some gas.
event_types = ['tx', 'call', 'create']
rows_with_gas = df[df['gasUsed_t'] > 0]
for event_type in event_types:
    of_type = rows_with_gas['type'] == event_type
    np.log10(rows_with_gas.loc[of_type, 'gasUsed_t']).hist(bins=10, alpha=.5)
plt.title('Gas Used')
plt.legend(event_types)
Out[40]:
In [41]:
# Overlaid histograms of log10(gweiPaid), one per event type.
# Rows are selected on gweiPaid > 0 rather than gasUsed_t > 0: gweiPaid is
# gasUsed_t * price_gwei, so a row with gas but a zero price would give
# log10(0) = -inf, which hist() cannot bin. For non-negative gas and prices,
# gweiPaid > 0 already implies gasUsed_t > 0.
event_types = ['tx', 'call', 'create']
paying_rows = df[df['gweiPaid'] > 0]
for event_type in event_types:
    of_type = paying_rows['type'] == event_type
    np.log10(paying_rows.loc[of_type, 'gweiPaid']).hist(bins=10, alpha=.5)
plt.title('Gwei Paid')
plt.legend(event_types)
Out[41]:
In [42]:
# Per-block fraction of rows with zero gasUsed, drawn on a log10 scale.
free_by_block = df[['block_id', 'free_t']].groupby('block_id').mean()
free_by_block.apply(np.log10).plot()
plt.title('log of fraction of transactions with 0 gasUsed')
ax = plt.gca()
# Relabel the log10 tick positions with the fractions they stand for.
tick_labels = [round(10**y, 3) for y in ax.get_yticks()]
ax.set_yticklabels(tick_labels)
Out[42]:
In [43]:
# Mean of free_t for every (type, block_id) pair, flattened back to columns.
free_by_type_block = df.groupby(['type', 'block_id'])[['free_t']].mean()
signals = free_by_type_block.reset_index()
signals.head()
Out[43]:
In [44]:
signals.groupby('type').describe()
Out[44]:
In [45]:
# Histogram of log10(gweiPaid) over the rows that paid a positive fee.
positive_fees = df.loc[df['gweiPaid'] > 0, 'gweiPaid']
np.log10(positive_fees).hist()
Out[45]:
In [46]:
df.gasShare.describe()
Out[46]:
In [47]:
# Histogram of log10(gasShare) over the rows with a positive share.
positive_share = df.loc[df['gasShare'] > 0, 'gasShare']
np.log10(positive_share).hist(bins=25)
Out[47]:
In [48]:
#reduced dataframe
rdf = df[['gasUsed_t','gasShare','gweiPaid','gweiShare', 'price_gwei', 'type']].copy()
In [49]:
rdf.head()
Out[49]:
In [50]:
#sns.pairplot(rdf[df.gweiPaid>0], hue="type")
In [51]:
# log10 of the numeric columns of the reduced frame, keeping only the rows
# that paid a positive fee; the 'type' column is left as is.
log_cols = ['gasUsed_t', 'gasShare', 'gweiPaid', 'gweiShare', 'price_gwei']
ldf = rdf[df.gweiPaid > 0].copy()
for log_col in log_cols:
    ldf[log_col] = np.log10(ldf[log_col])
In [52]:
ldf.describe()
Out[52]:
In [53]:
#sns.pairplot(ldf, hue="type")
In [97]:
# Per-type linear fits of price_gwei against gasUsed_t on the log10 frame.
sns.set(style='whitegrid', rc={"grid.linewidth": 0.1})
sns.set_context("notebook", font_scale=4.0)
point_style = {"s": 15, "alpha": .15}
# NOTE(review): seaborn 0.9 renamed lmplot's `size` argument to `height`;
# confirm the installed seaborn still accepts `size`.
sns.lmplot(x="gasUsed_t", y="price_gwei", hue="type", data=ldf,
           truncate=True, size=18, scatter_kws=point_style)
axis_label_size = 40
plt.ylim(-1, 3)
plt.xlabel('Gas Used', fontsize=axis_label_size)
plt.ylabel('Gas Price', fontsize=axis_label_size)
plt.tight_layout()
plt.savefig('./../images/regmodel.png', dpi=300)
In [57]:
# Pair grid for the 'create' rows of the log10 frame: scatter plots above the
# diagonal, KDE contours below it and 1-D KDEs on the diagonal.
create_rows = ldf[ldf['type'] == 'create']
g = sns.PairGrid(create_rows, diag_sharey=False)
g.map_upper(plt.scatter)
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.map_diag(sns.kdeplot, lw=3)
plt.savefig('./../images/pair_kde.png')
In [68]:
from scipy.stats import norm
In [96]:
# Standard normal density on [-3, 3] with a dashed red marker at x = -1.
sns.set(style='whitegrid', rc={"grid.linewidth": 0.1})
sns.set_context("notebook", font_scale=2.0)
x = np.linspace(-3, 3, num=250)
density = norm.pdf(x)  # evaluated once, reused for the line and the fill
fig, ax = plt.subplots(1, 1, figsize=(10, 6))
ax.plot(x, density, linewidth=4)
ax.fill_between(x, density, alpha=0.6)
ax.axvline(x=-1, color='r', linestyle='--')
ax.set_xlim(-3, 3)
ax.set_ylim(-0.1, 0.5)
ax.set_title("")
plt.tight_layout()
plt.savefig('./../images/hyp_dist.png', dpi=300)
In [ ]: