In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
more info in the topic of gas costs here: http://ethdocs.org/en/latest/contracts-and-transactions/account-types-gas-and-transactions.html
In [2]:
df = pd.read_csv('./../data/data.csv')
In [3]:
df.shape
Out[3]:
In [4]:
df.info()
In [5]:
print('no txs: {}, no blocks: {}'.format(df.shape[0], np.unique(df['block_id'].values).shape[0]))
In [6]:
df.loc[:,'time_t'] = pd.to_datetime(df.time_t, yearfirst=True)
In [7]:
df.loc[:,'time_b'] = pd.to_datetime(df.time_b, yearfirst=True)
In [8]:
df['time_t'].head()
Out[8]:
In [9]:
df.drop('Unnamed: 0', axis=1, inplace=True)
Filter each string extracting the digits as integers
In [10]:
int(filter(str.isdigit, df['difficulty'][0]))
Out[10]:
In [11]:
df['difficulty'] = df['difficulty'].apply(lambda x: int(filter(str.isdigit, x)))
In [12]:
df['difficulty'].head()
Out[12]:
In [13]:
df['reward'] = df['reward'].apply(lambda x: int(filter(str.isdigit, x)))
In [14]:
df['reward'].head()
Out[14]:
In [15]:
df['totalFee'] = df['totalFee'].apply(lambda x: int(filter(str.isdigit, x)))
In [16]:
df['totalFee'].head()
Out[16]:
In [17]:
def float_to_int(col_list):
for col in col_list:
df[col] = df[col].apply(lambda x: np.rint(x))
df[col] = df[col].values.astype(int)
In [18]:
float_to_int(['amount', 'price', 'gasLimit_b', 'gasUsed_b'])
In [19]:
df['isContractTx'].isnull().sum()
Out[19]:
The column "isContractTx" is empty so drop it
In [20]:
df.drop('isContractTx', axis=1, inplace=True)
Binarize the amount column (1 if > 0 and 0 if 0)
In [21]:
df['amount_binary'] = df['amount'].map(lambda x: 1 if x > 0 else 0)
In [22]:
for c in df.columns:
print(c)
In [87]:
df['price_gwei'] = df['price'] / 1000000000.0
df['amount_gwei'] = df['amount'] / 1000000000.0
In [24]:
df[['price_gwei', 'gasUsed_t','gasUsed_b','difficulty' ]].describe()
Out[24]:
In [25]:
df['gasShare'] = df.gasUsed_t/df.gasUsed_b
df['gweiPaid'] = df.gasUsed_t*df.price_gwei
In [47]:
gweiDict = df[['gweiPaid','block_id']].groupby('block_id').sum().T.to_dict()
In [49]:
df['gweiPaid_b'] = df.block_id.apply(lambda b: gweiDict[b]['gweiPaid'])
In [51]:
df['gweiShare'] = df.gweiPaid/df.gweiPaid_b
In [26]:
df['free_t'] = (df.gasUsed_t ==0).apply(int)
In [27]:
df[['block_id', 'free_t']].groupby('block_id').mean().hist(bins=50)
Out[27]:
In [28]:
df[['type', 'free_t']].groupby('type').mean().plot(kind='bar')
Out[28]:
Suicide it seems is always free. Create and tx always have a cost. About 20% of call events are free but non of the create events are. more details below.
In [77]:
#shares by type of events
df[['hash_t', 'type']].groupby('type').count().hash_t.plot(kind='pie')
Out[77]:
In [78]:
#fees paid by type
df[['hash_t', 'type','gweiPaid']].groupby('type').sum().gweiPaid.plot(kind='pie')
Out[78]:
In [79]:
#gas used by type
df[['hash_t', 'type','gasUsed_t']].groupby('type').sum().gasUsed_t.plot(kind='pie')
Out[79]:
In [93]:
#quick look at transactions
txdf = df[(df['type']=='tx')&(df.amount_gwei>0)].copy()
txdf['logGweiAmount'] = txdf.amount_gwei.apply(np.log10)
txdf['logGweiPrice'] = txdf.price_gwei.apply(np.log10)
txdf['logGasUsed'] = txdf.gasUsed_t.apply(np.log10)
In [94]:
txdf[['amount_gwei','price_gwei','gasUsed_t']].describe()
Out[94]:
In [97]:
txdf[['logGweiAmount','logGweiPrice','logGasUsed']].describe()
Out[97]:
In [101]:
sns.pairplot(txdf[['logGweiAmount','logGweiPrice','logGasUsed']])
Out[101]:
tx type is the only ones for which amount is non-zero. Moving forward we aren't looking at the ammount focusing on the gas amount used and the price paid for that gas.
In [29]:
df[df.gasUsed_t>0].gasUsed_t.apply(np.log10).hist(bins=40)
Out[29]:
In [30]:
df[(df.gasUsed_t>0)&(df['type']=='tx')].gasUsed_t.apply(np.log10).hist(bins=10,alpha=.5)
df[(df.gasUsed_t>0)&(df['type']=='call')].gasUsed_t.apply(np.log10).hist(bins=10,alpha=.5)
df[(df.gasUsed_t>0)&(df['type']=='create')].gasUsed_t.apply(np.log10).hist(bins=10,alpha=.5)
plt.title('Gas Used')
plt.legend(['tx','call','create'])
Out[30]:
In [31]:
df[(df.gasUsed_t>0)&(df['type']=='tx')].gweiPaid.apply(np.log10).hist(bins=10,alpha=.5)
df[(df.gasUsed_t>0)&(df['type']=='call')].gweiPaid.apply(np.log10).hist(bins=10,alpha=.5)
df[(df.gasUsed_t>0)&(df['type']=='create')].gweiPaid.apply(np.log10).hist(bins=10,alpha=.5)
plt.title('Gwei Paid')
plt.legend(['tx','call','create'])
Out[31]:
In [32]:
#compute the fraction of transactions with zero gasUsed by block
df[['block_id', 'free_t']].groupby('block_id').mean().apply(np.log10).plot()
plt.title('log of fraction of transactions with 0 gasUsed')
ax = plt.gca()
ax.set_yticklabels([round(10**y,3) for y in ax.get_yticks()])
Out[32]:
In [33]:
signals = df[['block_id', 'free_t','type']].groupby(['type','block_id']).mean().reset_index()
signals.head()
Out[33]:
In [34]:
signals.groupby('type').describe()
Out[34]:
In [35]:
df[df.gweiPaid>0].gweiPaid.apply(np.log10).hist()
Out[35]:
In [36]:
df.gasShare.describe()
Out[36]:
In [37]:
df[df.gasShare>0].gasShare.apply(np.log10).hist(bins=25)
Out[37]:
In [55]:
#reduced dataframe
rdf = df[['gasUsed_t','gasShare','gweiPaid','gweiShare', 'price_gwei', 'type']].copy()
In [56]:
rdf.head()
Out[56]:
In [57]:
sns.pairplot(rdf[df.gweiPaid>0], hue="type")
Out[57]:
In [62]:
#log10 values of reduced dataframe
ldf = rdf[df.gweiPaid>0].copy()
for c in ['gasUsed_t','gasShare','gweiPaid','gweiShare', 'price_gwei']:
ldf[c] = ldf[c].apply(np.log10)
In [63]:
ldf.describe()
Out[63]:
In [64]:
sns.pairplot(ldf, hue="type")
Out[64]:
In [68]:
sns.lmplot(x="gasUsed_t", y="price_gwei", hue="type",truncate=True, size=18, data=ldf, scatter_kws={"s": 15,"alpha": .15})
Out[68]:
In [75]:
g = sns.PairGrid(ldf[ldf['type']=='create'], diag_sharey=False)
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.map_upper(plt.scatter)
g.map_diag(sns.kdeplot, lw=3)
Out[75]:
In [ ]: