Exploratory Analysis


In [875]:
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Load data


In [876]:
df = pd.read_csv('./../data/data.csv')

In [877]:
df.shape


Out[877]:
(1494279, 23)

In [878]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1494279 entries, 0 to 1494278
Data columns (total 23 columns):
Unnamed: 0      1494279 non-null int64
hash_t          1494279 non-null object
accountNonce    1494279 non-null int64
amount          1494279 non-null float64
block_id        1494279 non-null int64
gasLimit_t      1494279 non-null int64
gasUsed_t       1494279 non-null int64
isContractTx    0 non-null float64
newContract     1494279 non-null int64
price           1494279 non-null float64
time_t          1494279 non-null object
txIndex         515711 non-null float64
type            1494279 non-null object
blockTime       1494279 non-null int64
difficulty      1494279 non-null object
gasLimit_b      1494279 non-null float64
gasUsed_b       1494279 non-null float64
reward          1494279 non-null object
size            1494279 non-null int64
time_b          1494279 non-null object
totalFee        1494279 non-null object
tx_count        1494279 non-null int64
uncle_count     1494279 non-null int64
dtypes: float64(6), int64(10), object(7)
memory usage: 262.2+ MB

In [879]:
print('no txs: {}, no blocks: {}'.format(df.shape[0], np.unique(df['block_id'].values).shape[0]))


no txs: 1494279, no blocks: 14583

Convert dates to datetime


In [880]:
df.loc[:,'time_t'] = pd.to_datetime(df.time_t, yearfirst=True)

In [881]:
df.loc[:,'time_b'] = pd.to_datetime(df.time_b, yearfirst=True)

In [882]:
df['time_t'].head()


Out[882]:
0   2017-09-20 20:24:15
1   2017-09-20 20:24:15
2   2017-09-20 20:24:15
3   2017-09-20 20:24:15
4   2017-09-20 20:24:15
Name: time_t, dtype: datetime64[ns]

In [883]:
df.drop('Unnamed: 0', axis=1, inplace=True)

Convert long integer columns

  • difficulty
  • reward
  • totalFee

Filter each string extracting the digits as integers


In [884]:
int(filter(str.isdigit, df['difficulty'][0]))


Out[884]:
2338428842518947

In [885]:
df['difficulty'] = df['difficulty'].apply(lambda x: int(filter(str.isdigit, x)))

In [886]:
df['difficulty'].head()


Out[886]:
0    2338428842518947
1    2338428842518947
2    2338428842518947
3    2338428842518947
4    2338428842518947
Name: difficulty, dtype: int64

In [887]:
df['reward'] = df['reward'].apply(lambda x: int(filter(str.isdigit, x)))

In [888]:
df['reward'].head()


Out[888]:
0    5008232105000000000
1    5008232105000000000
2    5008232105000000000
3    5008232105000000000
4    5008232105000000000
Name: reward, dtype: int64

In [889]:
df['totalFee'] = df['totalFee'].apply(lambda x: int(filter(str.isdigit, x)))

In [890]:
df['totalFee'].head()


Out[890]:
0    8232105000000000
1    8232105000000000
2    8232105000000000
3    8232105000000000
4    8232105000000000
Name: totalFee, dtype: int64

Convert floats back to ints

  • when collecting data from the api, some of the ints were longer than 8 bytes causing mongo to throw up errors
  • to avoid this we temporarily converted those to floats

In [891]:
def float_to_int(col_list):
    for col in col_list:
        df[col] = df[col].apply(lambda x: np.rint(x))
        df[col] = df[col].values.astype(int)

In [892]:
float_to_int(['amount', 'price', 'gasLimit_b', 'gasUsed_b'])

In [893]:
df['isContractTx'].isnull().sum()


Out[893]:
1494279

The column "isContractTx" is empty so drop it


In [894]:
df.drop('isContractTx', axis=1, inplace=True)

In [895]:
df['txIndex'].isnull().sum()


Out[895]:
978568

In [896]:
df['txIndex'].value_counts()


Out[896]:
1.0     176351
2.0      92812
3.0      53512
4.0      42831
5.0      39828
6.0      34256
7.0      32021
8.0      11919
9.0       5642
10.0      5096
11.0      4882
12.0      4867
13.0      4317
14.0      3138
15.0      1954
16.0      1823
17.0       289
18.0       154
19.0        12
20.0         7
Name: txIndex, dtype: int64

In [897]:
df['type'].value_counts()


Out[897]:
tx         973640
call       492595
create      27726
suicide       318
Name: type, dtype: int64

In [898]:
df['newContract'].value_counts()


Out[898]:
0    1489351
1       4928
Name: newContract, dtype: int64

Binarize the amount column (1 if > 0 and 0 if 0)


In [899]:
df['amount_binary'] = df['amount'].map(lambda x: 1 if x > 0 else 0)

Check for NaN values


In [900]:
for col in df.columns:
    print(col, df[col].isnull().sum())


hash_t 0
accountNonce 0
amount 0
block_id 0
gasLimit_t 0
gasUsed_t 0
newContract 0
price 0
time_t 0
txIndex 978568
type 0
blockTime 0
difficulty 0
gasLimit_b 0
gasUsed_b 0
reward 0
size 0
time_b 0
totalFee 0
tx_count 0
uncle_count 0
amount_binary 0

Columns decriptions


In [800]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1494279 entries, 0 to 1494278
Data columns (total 22 columns):
hash_t           1494279 non-null object
accountNonce     1494279 non-null int64
amount           1494279 non-null int64
block_id         1494279 non-null int64
gasLimit_t       1494279 non-null int64
gasUsed_t        1494279 non-null int64
newContract      1494279 non-null int64
price            1494279 non-null int64
time_t           1494279 non-null datetime64[ns]
txIndex          515711 non-null float64
type             1494279 non-null object
blockTime        1494279 non-null int64
difficulty       1494279 non-null int64
gasLimit_b       1494279 non-null int64
gasUsed_b        1494279 non-null int64
reward           1494279 non-null int64
size             1494279 non-null int64
time_b           1494279 non-null datetime64[ns]
totalFee         1494279 non-null int64
tx_count         1494279 non-null int64
uncle_count      1494279 non-null int64
amount_binary    1494279 non-null int64
dtypes: datetime64[ns](2), float64(1), int64(17), object(2)
memory usage: 250.8+ MB

Decription:

Transaction level

  • hash_t : a unique identifier for each transaction
  • accountNonce : the number of transactions the sender account has made on the network (mechanism to ensure the same tranaction isn't submitted twice)
  • amount : for monetary transactions, the amount in Ether that is exchanged
  • block_id : a unique identifier for each block
  • gasLimit_t : the maximum amount of gas that is allowed to be spent on gas for a given transaction
  • gasUsed_t : the actual amount of gas that was consumed on the given transaction
  • newContract : whether it is a new contract or not
  • price : the price per unit gas (Wei)
  • txIndex : an integer from 1 to 20 (or NaN)
  • amount_binary : binarized amount column (1 if amount is non-zero)
  • type : the kind of transaction
    • tx
    • call
    • create
    • suicide

Block level

  • blockTime : the total amount of time spent on the block
  • difficulty :
  • gasLimit_b :
  • gasUsed_b :
  • reward :
  • size :
  • time_b :
  • totalFee :
  • tx_count : the total number of txs in a given block
  • uncle_count :

Categorical variables:

  • txIndex
  • type
  • amount_binary

Explore features

New column with converted price from Wei to Gwei


In [901]:
df['price_gwei'] = df['price'].values / 1000000000

In [903]:
plt.hist(df['price_gwei'], bins=3000, color='b', alpha=0.7)
plt.xlim(0,100)
plt.xlabel('Price (GWei)')
plt.show()



In [904]:
np.mean(df['price_gwei'].values)


Out[904]:
29.105697635574437

In [905]:
np.max(df['price_gwei'].values)


Out[905]:
37304.531999999999

In [906]:
np.min(df['price_gwei'].values)


Out[906]:
1.0000000000000001e-09

In [907]:
df['amount_binary'].value_counts()


Out[907]:
0    862496
1    631783
Name: amount_binary, dtype: int64

In [908]:
df['type'].value_counts()


Out[908]:
tx         973640
call       492595
create      27726
suicide       318
Name: type, dtype: int64

In [909]:
grouped_type = df.groupby('type')

In [910]:
df.groupby('type')['price_gwei'].mean()


Out[910]:
type
call       34.134513
create     11.723243
suicide     8.152529
tx         27.063300
Name: price_gwei, dtype: float64

In [911]:
df.groupby('type')['price_gwei'].describe().T


Out[911]:
type call create suicide tx
count 4.925950e+05 27726.000000 318.000000 9.736400e+05
mean 3.413451e+01 11.723243 8.152529 2.706330e+01
std 1.597499e+02 15.771398 10.768688 1.211050e+02
min 4.000000e-09 0.030000 0.500000 1.000000e-09
25% 2.000000e+01 1.000000 0.500000 2.000000e+01
50% 2.200000e+01 5.500000 0.500000 2.100000e+01
75% 5.100000e+01 21.000000 21.000000 2.844094e+01
max 3.730453e+04 512.000000 55.945000 3.678191e+04

Call has the highest mean price, while suicide has the lowest

Plot price for different types


In [912]:
tx = df[df['type'] == 'tx']
call = df[df['type'] == 'call']
create = df[df['type'] == 'create']
suicide = df[df['type'] == 'suicide']

In [913]:
plt.hist(tx['price_gwei'], bins=4000, color='b', alpha=0.7)
plt.xlim(-10,100)
plt.title('tx', fontsize=18)
plt.xlabel('price', fontsize=18)
plt.show()



In [914]:
plt.hist(call['price_gwei'], bins=4000, color='b', alpha=0.7)
plt.xlim(-10,100)
plt.title('call', fontsize=18)
plt.xlabel('price', fontsize=18)
plt.show()



In [915]:
plt.hist(create['price_gwei'], bins=50, color='b', alpha=0.7)
plt.xlim(-10,100)
plt.title('create', fontsize=18)
plt.xlabel('price', fontsize=18)
plt.show()



In [916]:
plt.hist(suicide['price_gwei'], bins=10, color='b', alpha=0.7)
plt.xlim(-10,100)
plt.title('suicide', fontsize=18)
plt.xlabel('price', fontsize=18)
plt.show()


There seems to be some signal among the different transaction types

Clean up time data

New column for separate day, time


In [917]:
df['day'] = pd.DatetimeIndex(df['time_t']).day

In [919]:
# How many transactions each day?
df['day'].value_counts()


Out[919]:
19    414215
18    298074
20    295637
21    171594
22    134105
23    124697
24     55957
Name: day, dtype: int64

In [920]:
df['hour'] = pd.DatetimeIndex(df['time_t']).hour

In [921]:
df['minute'] = pd.DatetimeIndex(df['time_t']).minute

In [922]:
df['second'] = pd.DatetimeIndex(df['time_t']).second

In [923]:
df['dayofweek'] = pd.DatetimeIndex(df['time_t']).dayofweek

In [925]:
# How many transaction occur on a given day of the week? 
df['dayofweek'].value_counts()


Out[925]:
1    414215
0    298074
2    295637
3    171594
4    134105
5    124697
6     55957
Name: dayofweek, dtype: int64

In [926]:
df.columns


Out[926]:
Index([u'hash_t', u'accountNonce', u'amount', u'block_id', u'gasLimit_t',
       u'gasUsed_t', u'newContract', u'price', u'time_t', u'txIndex', u'type',
       u'blockTime', u'difficulty', u'gasLimit_b', u'gasUsed_b', u'reward',
       u'size', u'time_b', u'totalFee', u'tx_count', u'uncle_count',
       u'amount_binary', u'price_gwei', u'day', u'hour', u'minute', u'second',
       u'dayofweek'],
      dtype='object')

In [927]:
# Mean price per day
df['price_gwei'].groupby(df.day).mean()


Out[927]:
day
18    27.951377
19    28.947561
20    36.006918
21    25.445163
22    25.895773
23    25.213314
24    27.555949
Name: price_gwei, dtype: float64

In [933]:
# How many blocks are in each day (24 hrs)?
df['block_id'].groupby(df.day).nunique()


Out[933]:
day
18    2818
19    3559
20    3030
21    1738
22    1403
23    1429
24     606
Name: block_id, dtype: int64

In [934]:
df[df['amount'] > 0].hist('amount', bins=20)


Out[934]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x12412ebd0>]], dtype=object)

Conversions

Conversion from Gwei to ETH

1 Gwei = 0.000000001 ETH

Amount

  • amount is ~ to 10^-18 ETH

Convert amount to ETH


In [935]:
df['amount_eth'] = df['amount'].map(lambda x: float(x) / 1000000000000000000)

In [936]:
df['amount_eth'].head()


Out[936]:
0    0.499093
1    0.149222
2    0.099608
3    0.199005
4    1.000289
Name: amount_eth, dtype: float64

In [937]:
df[['amount_eth', 'type', 'newContract', 'price_gwei', 'block_id']].head()


Out[937]:
amount_eth type newContract price_gwei block_id
0 0.499093 tx 0 21.0 4295887
1 0.149222 call 0 21.0 4295887
2 0.099608 tx 0 21.0 4295887
3 0.199005 tx 0 21.0 4295887
4 1.000289 tx 0 21.0 4295887

In [938]:
df['price_gwei'].describe()


Out[938]:
count    1.494279e+06
mean     2.910570e+01
std      1.341286e+02
min      1.000000e-09
25%      2.000000e+01
50%      2.100000e+01
75%      4.200000e+01
max      3.730453e+04
Name: price_gwei, dtype: float64

Encode the type column


In [939]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df['type_enc'] = le.fit_transform(df['type'])

In [940]:
df['type_enc'].value_counts()


Out[940]:
3    973640
0    492595
1     27726
2       318
Name: type_enc, dtype: int64

Filter data

  • only monetary transactions (amount > 0)
  • only gas price > 0

In [941]:
df_filtered = df[(df['amount_eth'] > 10**(-6)) & (df['price_gwei'] > 0)]

In [942]:
df_filtered = df[(df['amount_eth'] > 10**(-6)) & (df['price_gwei'] > 0)]

In [943]:
print('deleted {} rows having 0 amounts'.format(df[df['amount_eth'] == 0].values.shape[0]))


deleted 747566 rows having 0 amounts

In [944]:
print('deleted {} rows having 0 gas price'.format(df[df['price_gwei'] == 0].values.shape[0]))


deleted 0 rows having 0 gas price

In [945]:
df_filtered['amount_eth'].apply(np.log10).hist(bins=100)
plt.xlabel('Amount')


Out[945]:
<matplotlib.text.Text at 0x18ffac610>

In [946]:
df_filtered['amount_eth'].hist()
plt.xlabel('Amount')


Out[946]:
<matplotlib.text.Text at 0x18ed51d50>

In [947]:
df_filtered['price_gwei'].hist(bins=2000)
plt.xlim(0,100)
plt.xlabel('Gas Price')


Out[947]:
<matplotlib.text.Text at 0x24fa0d810>

In [948]:
gas_log = np.log(df_filtered['price_gwei'].values)

In [949]:
plt.scatter(df_filtered['amount_eth'], df_filtered['price_gwei'])
plt.xlabel('Amount')
plt.ylabel('Gas Price')


Out[949]:
<matplotlib.text.Text at 0x1a207cb50>

In [950]:
df['dayofweek'].describe()


Out[950]:
count    1.494279e+06
mean     2.018310e+00
std      1.702585e+00
min      0.000000e+00
25%      1.000000e+00
50%      2.000000e+00
75%      3.000000e+00
max      6.000000e+00
Name: dayofweek, dtype: float64

In [951]:
df_filtered['dayofweek'].hist(bins=7)
plt.xlabel("Day of Week (Mon-Sun)")


Out[951]:
<matplotlib.text.Text at 0x1a207c410>

In [952]:
df_filtered['hour'].describe()


Out[952]:
count    629224.000000
mean         10.461402
std           6.589997
min           0.000000
25%           4.000000
50%          10.000000
75%          16.000000
max          23.000000
Name: hour, dtype: float64

In [953]:
df_filtered['hour'].hist(bins=24)
plt.xlim(0,23)
plt.xlabel('Hour')


Out[953]:
<matplotlib.text.Text at 0x1a2faadd0>

In [954]:
df_filtered.groupby('type')['gasUsed_t'].mean()


Out[954]:
type
call         3051.038417
create     340235.204340
suicide         0.000000
tx          29462.653340
Name: gasUsed_t, dtype: float64

Obtain averages over previous blocks for each block id to generate new features

  • difficulty
  • blocktime
  • transactions/second
  • uncle rate
  • gasused by transaction
  • gasused by block
  • gasprice

Add column with:

  • txs per second for each block
  • avg gasUsed for each block
  • avg price per transaction for each block

In [955]:
df_filtered['txcnt_second'] = df_filtered['tx_count'].values / df_filtered['blockTime'].values


/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [956]:
df_filtered['avg_gasUsed_t_perblock'] = df_filtered.groupby('block_id')['gasUsed_t'].transform('mean')


/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [957]:
df_filtered['avg_price_perblock'] = df_filtered.groupby('block_id')['price_gwei'].transform('mean')


/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [958]:
df_filtered.to_csv('./../data/data_filtered.csv')

Calculate the rolling average over given number of windows

  • This is very messy! Will clean up later

In [959]:
def rolling_avg(window_size):
    
    price = df_filtered[['block_id', 'avg_price_perblock']].drop_duplicates().sort_values(
                        'block_id', ascending=True)
    
    gasUsed_t = df_filtered[['block_id', 'avg_gasUsed_t_perblock']].drop_duplicates().sort_values(
                            'block_id', ascending=True)
    
    txcnt_second = df_filtered[['block_id', 'txcnt_second']].drop_duplicates().sort_values(
                                'block_id', ascending=True)
    
    tx_count = df_filtered[['block_id', 'tx_count']].drop_duplicates().sort_values(
                            'block_id', ascending=True)
    
    gasUsed_b = df_filtered[['block_id', 'gasUsed_b']].drop_duplicates().sort_values(
                            'block_id', ascending=True)
    
    uncle_count = df_filtered[['block_id', 'uncle_count']].drop_duplicates().sort_values(
                                'block_id', ascending=True)
    
    difficulty = df_filtered[['block_id', 'difficulty']].drop_duplicates().sort_values(
                                'block_id', ascending=True)
    
    blocktime = df_filtered[['block_id', 'blockTime']].drop_duplicates().sort_values(
                             'block_id', ascending=True)
    
    # create new pandas dataframe with average values
    rolling_avg = pd.DataFrame()
    
    # calculate rolling averages 
    rolling_avg['avg_blocktime'] = blocktime['blockTime'].rolling(window=window_size).mean()
    rolling_avg['avg_gasUsed_b'] = gasUsed_b['gasUsed_b'].rolling(window=window_size).mean()
    rolling_avg['avg_tx_count'] = tx_count['tx_count'].rolling(window=window_size).mean()
    rolling_avg['avg_uncle_count'] = uncle_count['uncle_count'].rolling(window=window_size).mean()
    rolling_avg['avg_difficulty'] = difficulty['difficulty'].rolling(window=window_size).mean()
    rolling_avg['avg_txcnt_second'] = txcnt_second['txcnt_second'].rolling(window=window_size).mean()
    rolling_avg['avg_gasUsed_t'] = gasUsed_t['avg_gasUsed_t_perblock'].rolling(window=window_size).mean()
    rolling_avg['avg_price'] = price['avg_price_perblock'].rolling(window=window_size).mean()
    
    # insert blockids to merge on
    rolling_avg['blockids'] = df_filtered['block_id'].drop_duplicates().sort_values(ascending=True)
    
    return rolling_avg

There are between 600 and 3500 blocks in a given day, so try 5 through 4000 previous blocks

Create csv files for each regime


In [960]:
num_blocks = [5, 10, 25, 50, 100, 1000, 2000, 3000, 4000]
for num in num_blocks:
    df_rolling_avg = rolling_avg(num)
    df_rolling_avg.to_csv('./../data/block_avg_{}.csv'.format(num))

In [ ]: