In [875]:
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [876]:
df = pd.read_csv('./../data/data.csv')
In [877]:
df.shape
Out[877]:
In [878]:
df.info()
In [879]:
print('no txs: {}, no blocks: {}'.format(df.shape[0], np.unique(df['block_id'].values).shape[0]))
In [880]:
df.loc[:,'time_t'] = pd.to_datetime(df.time_t, yearfirst=True)
In [881]:
df.loc[:,'time_b'] = pd.to_datetime(df.time_b, yearfirst=True)
In [882]:
df['time_t'].head()
Out[882]:
In [883]:
df.drop('Unnamed: 0', axis=1, inplace=True)
Filter each string extracting the digits as integers
In [884]:
int(filter(str.isdigit, df['difficulty'][0]))
Out[884]:
In [885]:
df['difficulty'] = df['difficulty'].apply(lambda x: int(filter(str.isdigit, x)))
In [886]:
df['difficulty'].head()
Out[886]:
In [887]:
df['reward'] = df['reward'].apply(lambda x: int(filter(str.isdigit, x)))
In [888]:
df['reward'].head()
Out[888]:
In [889]:
df['totalFee'] = df['totalFee'].apply(lambda x: int(filter(str.isdigit, x)))
In [890]:
df['totalFee'].head()
Out[890]:
In [891]:
def float_to_int(col_list):
for col in col_list:
df[col] = df[col].apply(lambda x: np.rint(x))
df[col] = df[col].values.astype(int)
In [892]:
float_to_int(['amount', 'price', 'gasLimit_b', 'gasUsed_b'])
In [893]:
df['isContractTx'].isnull().sum()
Out[893]:
The column "isContractTx" is empty so drop it
In [894]:
df.drop('isContractTx', axis=1, inplace=True)
In [895]:
df['txIndex'].isnull().sum()
Out[895]:
In [896]:
df['txIndex'].value_counts()
Out[896]:
In [897]:
df['type'].value_counts()
Out[897]:
In [898]:
df['newContract'].value_counts()
Out[898]:
Binarize the amount column (1 if > 0 and 0 if 0)
In [899]:
df['amount_binary'] = df['amount'].map(lambda x: 1 if x > 0 else 0)
In [900]:
for col in df.columns:
print(col, df[col].isnull().sum())
In [800]:
df.info()
In [901]:
df['price_gwei'] = df['price'].values / 1000000000
In [903]:
plt.hist(df['price_gwei'], bins=3000, color='b', alpha=0.7)
plt.xlim(0,100)
plt.xlabel('Price (GWei)')
plt.show()
In [904]:
np.mean(df['price_gwei'].values)
Out[904]:
In [905]:
np.max(df['price_gwei'].values)
Out[905]:
In [906]:
np.min(df['price_gwei'].values)
Out[906]:
In [907]:
df['amount_binary'].value_counts()
Out[907]:
In [908]:
df['type'].value_counts()
Out[908]:
In [909]:
grouped_type = df.groupby('type')
In [910]:
df.groupby('type')['price_gwei'].mean()
Out[910]:
In [911]:
df.groupby('type')['price_gwei'].describe().T
Out[911]:
Call has the highest mean price, while suicide has the lowest
In [912]:
tx = df[df['type'] == 'tx']
call = df[df['type'] == 'call']
create = df[df['type'] == 'create']
suicide = df[df['type'] == 'suicide']
In [913]:
plt.hist(tx['price_gwei'], bins=4000, color='b', alpha=0.7)
plt.xlim(-10,100)
plt.title('tx', fontsize=18)
plt.xlabel('price', fontsize=18)
plt.show()
In [914]:
plt.hist(call['price_gwei'], bins=4000, color='b', alpha=0.7)
plt.xlim(-10,100)
plt.title('call', fontsize=18)
plt.xlabel('price', fontsize=18)
plt.show()
In [915]:
plt.hist(create['price_gwei'], bins=50, color='b', alpha=0.7)
plt.xlim(-10,100)
plt.title('create', fontsize=18)
plt.xlabel('price', fontsize=18)
plt.show()
In [916]:
plt.hist(suicide['price_gwei'], bins=10, color='b', alpha=0.7)
plt.xlim(-10,100)
plt.title('suicide', fontsize=18)
plt.xlabel('price', fontsize=18)
plt.show()
There seems to be some signal among the different transaction types
In [917]:
df['day'] = pd.DatetimeIndex(df['time_t']).day
In [919]:
# How many transactions each day?
df['day'].value_counts()
Out[919]:
In [920]:
df['hour'] = pd.DatetimeIndex(df['time_t']).hour
In [921]:
df['minute'] = pd.DatetimeIndex(df['time_t']).minute
In [922]:
df['second'] = pd.DatetimeIndex(df['time_t']).second
In [923]:
df['dayofweek'] = pd.DatetimeIndex(df['time_t']).dayofweek
In [925]:
# How many transaction occur on a given day of the week?
df['dayofweek'].value_counts()
Out[925]:
In [926]:
df.columns
Out[926]:
In [927]:
# Mean price per day
df['price_gwei'].groupby(df.day).mean()
Out[927]:
In [933]:
# How many blocks are in each day (24 hrs)?
df['block_id'].groupby(df.day).nunique()
Out[933]:
In [934]:
df[df['amount'] > 0].hist('amount', bins=20)
Out[934]:
In [935]:
df['amount_eth'] = df['amount'].map(lambda x: float(x) / 1000000000000000000)
In [936]:
df['amount_eth'].head()
Out[936]:
In [937]:
df[['amount_eth', 'type', 'newContract', 'price_gwei', 'block_id']].head()
Out[937]:
In [938]:
df['price_gwei'].describe()
Out[938]:
In [939]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df['type_enc'] = le.fit_transform(df['type'])
In [940]:
df['type_enc'].value_counts()
Out[940]:
In [941]:
df_filtered = df[(df['amount_eth'] > 10**(-6)) & (df['price_gwei'] > 0)]
In [942]:
df_filtered = df[(df['amount_eth'] > 10**(-6)) & (df['price_gwei'] > 0)]
In [943]:
print('deleted {} rows having 0 amounts'.format(df[df['amount_eth'] == 0].values.shape[0]))
In [944]:
print('deleted {} rows having 0 gas price'.format(df[df['price_gwei'] == 0].values.shape[0]))
In [945]:
df_filtered['amount_eth'].apply(np.log10).hist(bins=100)
plt.xlabel('Amount')
Out[945]:
In [946]:
df_filtered['amount_eth'].hist()
plt.xlabel('Amount')
Out[946]:
In [947]:
df_filtered['price_gwei'].hist(bins=2000)
plt.xlim(0,100)
plt.xlabel('Gas Price')
Out[947]:
In [948]:
gas_log = np.log(df_filtered['price_gwei'].values)
In [949]:
plt.scatter(df_filtered['amount_eth'], df_filtered['price_gwei'])
plt.xlabel('Amount')
plt.ylabel('Gas Price')
Out[949]:
In [950]:
df['dayofweek'].describe()
Out[950]:
In [951]:
df_filtered['dayofweek'].hist(bins=7)
plt.xlabel("Day of Week (Mon-Sun)")
Out[951]:
In [952]:
df_filtered['hour'].describe()
Out[952]:
In [953]:
df_filtered['hour'].hist(bins=24)
plt.xlim(0,23)
plt.xlabel('Hour')
Out[953]:
In [954]:
df_filtered.groupby('type')['gasUsed_t'].mean()
Out[954]:
In [955]:
df_filtered['txcnt_second'] = df_filtered['tx_count'].values / df_filtered['blockTime'].values
In [956]:
df_filtered['avg_gasUsed_t_perblock'] = df_filtered.groupby('block_id')['gasUsed_t'].transform('mean')
In [957]:
df_filtered['avg_price_perblock'] = df_filtered.groupby('block_id')['price_gwei'].transform('mean')
In [958]:
df_filtered.to_csv('./../data/data_filtered.csv')
In [959]:
def rolling_avg(window_size):
price = df_filtered[['block_id', 'avg_price_perblock']].drop_duplicates().sort_values(
'block_id', ascending=True)
gasUsed_t = df_filtered[['block_id', 'avg_gasUsed_t_perblock']].drop_duplicates().sort_values(
'block_id', ascending=True)
txcnt_second = df_filtered[['block_id', 'txcnt_second']].drop_duplicates().sort_values(
'block_id', ascending=True)
tx_count = df_filtered[['block_id', 'tx_count']].drop_duplicates().sort_values(
'block_id', ascending=True)
gasUsed_b = df_filtered[['block_id', 'gasUsed_b']].drop_duplicates().sort_values(
'block_id', ascending=True)
uncle_count = df_filtered[['block_id', 'uncle_count']].drop_duplicates().sort_values(
'block_id', ascending=True)
difficulty = df_filtered[['block_id', 'difficulty']].drop_duplicates().sort_values(
'block_id', ascending=True)
blocktime = df_filtered[['block_id', 'blockTime']].drop_duplicates().sort_values(
'block_id', ascending=True)
# create new pandas dataframe with average values
rolling_avg = pd.DataFrame()
# calculate rolling averages
rolling_avg['avg_blocktime'] = blocktime['blockTime'].rolling(window=window_size).mean()
rolling_avg['avg_gasUsed_b'] = gasUsed_b['gasUsed_b'].rolling(window=window_size).mean()
rolling_avg['avg_tx_count'] = tx_count['tx_count'].rolling(window=window_size).mean()
rolling_avg['avg_uncle_count'] = uncle_count['uncle_count'].rolling(window=window_size).mean()
rolling_avg['avg_difficulty'] = difficulty['difficulty'].rolling(window=window_size).mean()
rolling_avg['avg_txcnt_second'] = txcnt_second['txcnt_second'].rolling(window=window_size).mean()
rolling_avg['avg_gasUsed_t'] = gasUsed_t['avg_gasUsed_t_perblock'].rolling(window=window_size).mean()
rolling_avg['avg_price'] = price['avg_price_perblock'].rolling(window=window_size).mean()
# insert blockids to merge on
rolling_avg['blockids'] = df_filtered['block_id'].drop_duplicates().sort_values(ascending=True)
return rolling_avg
There are between 600 and 3500 blocks in a given day, so try 5 through 4000 previous blocks
In [960]:
num_blocks = [5, 10, 25, 50, 100, 1000, 2000, 3000, 4000]
for num in num_blocks:
df_rolling_avg = rolling_avg(num)
df_rolling_avg.to_csv('./../data/block_avg_{}.csv'.format(num))
In [ ]: