Exploratory Analysis



In [875]:

    
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Load data



In [876]:

    
df = pd.read_csv('./../data/data.csv')



In [877]:

    
df.shape









    Out[877]:





(1494279, 23)



In [878]:

    
df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1494279 entries, 0 to 1494278
Data columns (total 23 columns):
Unnamed: 0      1494279 non-null int64
hash_t          1494279 non-null object
accountNonce    1494279 non-null int64
amount          1494279 non-null float64
block_id        1494279 non-null int64
gasLimit_t      1494279 non-null int64
gasUsed_t       1494279 non-null int64
isContractTx    0 non-null float64
newContract     1494279 non-null int64
price           1494279 non-null float64
time_t          1494279 non-null object
txIndex         515711 non-null float64
type            1494279 non-null object
blockTime       1494279 non-null int64
difficulty      1494279 non-null object
gasLimit_b      1494279 non-null float64
gasUsed_b       1494279 non-null float64
reward          1494279 non-null object
size            1494279 non-null int64
time_b          1494279 non-null object
totalFee        1494279 non-null object
tx_count        1494279 non-null int64
uncle_count     1494279 non-null int64
dtypes: float64(6), int64(10), object(7)
memory usage: 262.2+ MB



In [879]:

    
print('no txs: {}, no blocks: {}'.format(df.shape[0], np.unique(df['block_id'].values).shape[0]))









    



no txs: 1494279, no blocks: 14583

Convert dates to datetime



In [880]:

    
df.loc[:,'time_t'] = pd.to_datetime(df.time_t, yearfirst=True)



In [881]:

    
df.loc[:,'time_b'] = pd.to_datetime(df.time_b, yearfirst=True)



In [882]:

    
df['time_t'].head()









    Out[882]:





0   2017-09-20 20:24:15
1   2017-09-20 20:24:15
2   2017-09-20 20:24:15
3   2017-09-20 20:24:15
4   2017-09-20 20:24:15
Name: time_t, dtype: datetime64[ns]



In [883]:

    
df.drop('Unnamed: 0', axis=1, inplace=True)

Convert long integer columns

difficulty
reward
totalFee

Filter each string extracting the digits as integers



In [884]:

    
int(filter(str.isdigit, df['difficulty'][0]))









    Out[884]:





2338428842518947



In [885]:

    
df['difficulty'] = df['difficulty'].apply(lambda x: int(filter(str.isdigit, x)))



In [886]:

    
df['difficulty'].head()









    Out[886]:





0    2338428842518947
1    2338428842518947
2    2338428842518947
3    2338428842518947
4    2338428842518947
Name: difficulty, dtype: int64



In [887]:

    
df['reward'] = df['reward'].apply(lambda x: int(filter(str.isdigit, x)))



In [888]:

    
df['reward'].head()









    Out[888]:





0    5008232105000000000
1    5008232105000000000
2    5008232105000000000
3    5008232105000000000
4    5008232105000000000
Name: reward, dtype: int64



In [889]:

    
df['totalFee'] = df['totalFee'].apply(lambda x: int(filter(str.isdigit, x)))



In [890]:

    
df['totalFee'].head()









    Out[890]:





0    8232105000000000
1    8232105000000000
2    8232105000000000
3    8232105000000000
4    8232105000000000
Name: totalFee, dtype: int64

Convert floats back to ints

when collecting data from the api, some of the ints were longer than 8 bytes causing mongo to throw up errors
to avoid this we temporarily converted those to floats



In [891]:

    
def float_to_int(col_list):
    for col in col_list:
        df[col] = df[col].apply(lambda x: np.rint(x))
        df[col] = df[col].values.astype(int)



In [892]:

    
float_to_int(['amount', 'price', 'gasLimit_b', 'gasUsed_b'])



In [893]:

    
df['isContractTx'].isnull().sum()









    Out[893]:





1494279

The column "isContractTx" is empty so drop it



In [894]:

    
df.drop('isContractTx', axis=1, inplace=True)



In [895]:

    
df['txIndex'].isnull().sum()









    Out[895]:





978568



In [896]:

    
df['txIndex'].value_counts()









    Out[896]:





1.0     176351
2.0      92812
3.0      53512
4.0      42831
5.0      39828
6.0      34256
7.0      32021
8.0      11919
9.0       5642
10.0      5096
11.0      4882
12.0      4867
13.0      4317
14.0      3138
15.0      1954
16.0      1823
17.0       289
18.0       154
19.0        12
20.0         7
Name: txIndex, dtype: int64



In [897]:

    
df['type'].value_counts()









    Out[897]:





tx         973640
call       492595
create      27726
suicide       318
Name: type, dtype: int64



In [898]:

    
df['newContract'].value_counts()









    Out[898]:





0    1489351
1       4928
Name: newContract, dtype: int64

Binarize the amount column (1 if > 0 and 0 if 0)



In [899]:

    
df['amount_binary'] = df['amount'].map(lambda x: 1 if x > 0 else 0)

Check for NaN values



In [900]:

    
for col in df.columns:
    print(col, df[col].isnull().sum())









    



hash_t 0
accountNonce 0
amount 0
block_id 0
gasLimit_t 0
gasUsed_t 0
newContract 0
price 0
time_t 0
txIndex 978568
type 0
blockTime 0
difficulty 0
gasLimit_b 0
gasUsed_b 0
reward 0
size 0
time_b 0
totalFee 0
tx_count 0
uncle_count 0
amount_binary 0

Columns decriptions



In [800]:

    
df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1494279 entries, 0 to 1494278
Data columns (total 22 columns):
hash_t           1494279 non-null object
accountNonce     1494279 non-null int64
amount           1494279 non-null int64
block_id         1494279 non-null int64
gasLimit_t       1494279 non-null int64
gasUsed_t        1494279 non-null int64
newContract      1494279 non-null int64
price            1494279 non-null int64
time_t           1494279 non-null datetime64[ns]
txIndex          515711 non-null float64
type             1494279 non-null object
blockTime        1494279 non-null int64
difficulty       1494279 non-null int64
gasLimit_b       1494279 non-null int64
gasUsed_b        1494279 non-null int64
reward           1494279 non-null int64
size             1494279 non-null int64
time_b           1494279 non-null datetime64[ns]
totalFee         1494279 non-null int64
tx_count         1494279 non-null int64
uncle_count      1494279 non-null int64
amount_binary    1494279 non-null int64
dtypes: datetime64[ns](2), float64(1), int64(17), object(2)
memory usage: 250.8+ MB

Decription:

Transaction level

hash_t : a unique identifier for each transaction
accountNonce : the number of transactions the sender account has made on the network (mechanism to ensure the same tranaction isn't submitted twice)
amount : for monetary transactions, the amount in Ether that is exchanged
block_id : a unique identifier for each block
gasLimit_t : the maximum amount of gas that is allowed to be spent on gas for a given transaction
gasUsed_t : the actual amount of gas that was consumed on the given transaction
newContract : whether it is a new contract or not
price : the price per unit gas (Wei)
txIndex : an integer from 1 to 20 (or NaN)
amount_binary : binarized amount column (1 if amount is non-zero)
type : the kind of transaction
- tx
- call
- create
- suicide

Block level

blockTime : the total amount of time spent on the block
difficulty :
gasLimit_b :
gasUsed_b :
reward :
size :
time_b :
totalFee :
tx_count : the total number of txs in a given block
uncle_count :

Categorical variables:

txIndex
type
amount_binary

Explore features

New column with converted price from Wei to Gwei



In [901]:

    
df['price_gwei'] = df['price'].values / 1000000000



In [903]:

    
plt.hist(df['price_gwei'], bins=3000, color='b', alpha=0.7)
plt.xlim(0,100)
plt.xlabel('Price (GWei)')
plt.show()



In [904]:

    
np.mean(df['price_gwei'].values)









    Out[904]:





29.105697635574437



In [905]:

    
np.max(df['price_gwei'].values)









    Out[905]:





37304.531999999999



In [906]:

    
np.min(df['price_gwei'].values)









    Out[906]:





1.0000000000000001e-09



In [907]:

    
df['amount_binary'].value_counts()









    Out[907]:





0    862496
1    631783
Name: amount_binary, dtype: int64



In [908]:

    
df['type'].value_counts()









    Out[908]:





tx         973640
call       492595
create      27726
suicide       318
Name: type, dtype: int64



In [909]:

    
grouped_type = df.groupby('type')



In [910]:

    
df.groupby('type')['price_gwei'].mean()









    Out[910]:





type
call       34.134513
create     11.723243
suicide     8.152529
tx         27.063300
Name: price_gwei, dtype: float64



In [911]:

    
df.groupby('type')['price_gwei'].describe().T









    Out[911]:







  
    
      type
      call
      create
      suicide
      tx
    
  
  
    
      count
      4.925950e+05
      27726.000000
      318.000000
      9.736400e+05
    
    
      mean
      3.413451e+01
      11.723243
      8.152529
      2.706330e+01
    
    
      std
      1.597499e+02
      15.771398
      10.768688
      1.211050e+02
    
    
      min
      4.000000e-09
      0.030000
      0.500000
      1.000000e-09
    
    
      25%
      2.000000e+01
      1.000000
      0.500000
      2.000000e+01
    
    
      50%
      2.200000e+01
      5.500000
      0.500000
      2.100000e+01
    
    
      75%
      5.100000e+01
      21.000000
      21.000000
      2.844094e+01
    
    
      max
      3.730453e+04
      512.000000
      55.945000
      3.678191e+04

Call has the highest mean price, while suicide has the lowest

Plot price for different types



In [912]:

    
tx = df[df['type'] == 'tx']
call = df[df['type'] == 'call']
create = df[df['type'] == 'create']
suicide = df[df['type'] == 'suicide']



In [913]:

    
plt.hist(tx['price_gwei'], bins=4000, color='b', alpha=0.7)
plt.xlim(-10,100)
plt.title('tx', fontsize=18)
plt.xlabel('price', fontsize=18)
plt.show()



In [914]:

    
plt.hist(call['price_gwei'], bins=4000, color='b', alpha=0.7)
plt.xlim(-10,100)
plt.title('call', fontsize=18)
plt.xlabel('price', fontsize=18)
plt.show()



In [915]:

    
plt.hist(create['price_gwei'], bins=50, color='b', alpha=0.7)
plt.xlim(-10,100)
plt.title('create', fontsize=18)
plt.xlabel('price', fontsize=18)
plt.show()



In [916]:

    
plt.hist(suicide['price_gwei'], bins=10, color='b', alpha=0.7)
plt.xlim(-10,100)
plt.title('suicide', fontsize=18)
plt.xlabel('price', fontsize=18)
plt.show()

There seems to be some signal among the different transaction types

Clean up time data

New column for separate day, time



In [917]:

    
df['day'] = pd.DatetimeIndex(df['time_t']).day



In [919]:

    
# How many transactions each day?
df['day'].value_counts()









    Out[919]:





19    414215
18    298074
20    295637
21    171594
22    134105
23    124697
24     55957
Name: day, dtype: int64



In [920]:

    
df['hour'] = pd.DatetimeIndex(df['time_t']).hour



In [921]:

    
df['minute'] = pd.DatetimeIndex(df['time_t']).minute



In [922]:

    
df['second'] = pd.DatetimeIndex(df['time_t']).second



In [923]:

    
df['dayofweek'] = pd.DatetimeIndex(df['time_t']).dayofweek



In [925]:

    
# How many transaction occur on a given day of the week? 
df['dayofweek'].value_counts()









    Out[925]:





1    414215
0    298074
2    295637
3    171594
4    134105
5    124697
6     55957
Name: dayofweek, dtype: int64



In [926]:

    
df.columns









    Out[926]:





Index([u'hash_t', u'accountNonce', u'amount', u'block_id', u'gasLimit_t',
       u'gasUsed_t', u'newContract', u'price', u'time_t', u'txIndex', u'type',
       u'blockTime', u'difficulty', u'gasLimit_b', u'gasUsed_b', u'reward',
       u'size', u'time_b', u'totalFee', u'tx_count', u'uncle_count',
       u'amount_binary', u'price_gwei', u'day', u'hour', u'minute', u'second',
       u'dayofweek'],
      dtype='object')



In [927]:

    
# Mean price per day
df['price_gwei'].groupby(df.day).mean()









    Out[927]:





day
18    27.951377
19    28.947561
20    36.006918
21    25.445163
22    25.895773
23    25.213314
24    27.555949
Name: price_gwei, dtype: float64



In [933]:

    
# How many blocks are in each day (24 hrs)?
df['block_id'].groupby(df.day).nunique()









    Out[933]:





day
18    2818
19    3559
20    3030
21    1738
22    1403
23    1429
24     606
Name: block_id, dtype: int64



In [934]:

    
df[df['amount'] > 0].hist('amount', bins=20)









    Out[934]:





array([[<matplotlib.axes._subplots.AxesSubplot object at 0x12412ebd0>]], dtype=object)

Conversions

Conversion from Gwei to ETH

1 Gwei = 0.000000001 ETH

Amount

amount is ~ to 10^-18 ETH

Convert amount to ETH



In [935]:

    
df['amount_eth'] = df['amount'].map(lambda x: float(x) / 1000000000000000000)



In [936]:

    
df['amount_eth'].head()









    Out[936]:





0    0.499093
1    0.149222
2    0.099608
3    0.199005
4    1.000289
Name: amount_eth, dtype: float64



In [937]:

    
df[['amount_eth', 'type', 'newContract', 'price_gwei', 'block_id']].head()









    Out[937]:







  
    
      
      amount_eth
      type
      newContract
      price_gwei
      block_id
    
  
  
    
      0
      0.499093
      tx
      0
      21.0
      4295887
    
    
      1
      0.149222
      call
      0
      21.0
      4295887
    
    
      2
      0.099608
      tx
      0
      21.0
      4295887
    
    
      3
      0.199005
      tx
      0
      21.0
      4295887
    
    
      4
      1.000289
      tx
      0
      21.0
      4295887



In [938]:

    
df['price_gwei'].describe()









    Out[938]:





count    1.494279e+06
mean     2.910570e+01
std      1.341286e+02
min      1.000000e-09
25%      2.000000e+01
50%      2.100000e+01
75%      4.200000e+01
max      3.730453e+04
Name: price_gwei, dtype: float64

Encode the type column



In [939]:

    
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df['type_enc'] = le.fit_transform(df['type'])



In [940]:

    
df['type_enc'].value_counts()









    Out[940]:





3    973640
0    492595
1     27726
2       318
Name: type_enc, dtype: int64

Filter data

only monetary transactions (amount > 0)
only gas price > 0



In [941]:

    
df_filtered = df[(df['amount_eth'] > 10**(-6)) & (df['price_gwei'] > 0)]



In [942]:

    
df_filtered = df[(df['amount_eth'] > 10**(-6)) & (df['price_gwei'] > 0)]



In [943]:

    
print('deleted {} rows having 0 amounts'.format(df[df['amount_eth'] == 0].values.shape[0]))









    



deleted 747566 rows having 0 amounts



In [944]:

    
print('deleted {} rows having 0 gas price'.format(df[df['price_gwei'] == 0].values.shape[0]))









    



deleted 0 rows having 0 gas price



In [945]:

    
df_filtered['amount_eth'].apply(np.log10).hist(bins=100)
plt.xlabel('Amount')









    Out[945]:





<matplotlib.text.Text at 0x18ffac610>



In [946]:

    
df_filtered['amount_eth'].hist()
plt.xlabel('Amount')









    Out[946]:





<matplotlib.text.Text at 0x18ed51d50>



In [947]:

    
df_filtered['price_gwei'].hist(bins=2000)
plt.xlim(0,100)
plt.xlabel('Gas Price')









    Out[947]:





<matplotlib.text.Text at 0x24fa0d810>



In [948]:

    
gas_log = np.log(df_filtered['price_gwei'].values)



In [949]:

    
plt.scatter(df_filtered['amount_eth'], df_filtered['price_gwei'])
plt.xlabel('Amount')
plt.ylabel('Gas Price')









    Out[949]:





<matplotlib.text.Text at 0x1a207cb50>



In [950]:

    
df['dayofweek'].describe()









    Out[950]:





count    1.494279e+06
mean     2.018310e+00
std      1.702585e+00
min      0.000000e+00
25%      1.000000e+00
50%      2.000000e+00
75%      3.000000e+00
max      6.000000e+00
Name: dayofweek, dtype: float64



In [951]:

    
df_filtered['dayofweek'].hist(bins=7)
plt.xlabel("Day of Week (Mon-Sun)")









    Out[951]:





<matplotlib.text.Text at 0x1a207c410>



In [952]:

    
df_filtered['hour'].describe()









    Out[952]:





count    629224.000000
mean         10.461402
std           6.589997
min           0.000000
25%           4.000000
50%          10.000000
75%          16.000000
max          23.000000
Name: hour, dtype: float64



In [953]:

    
df_filtered['hour'].hist(bins=24)
plt.xlim(0,23)
plt.xlabel('Hour')









    Out[953]:





<matplotlib.text.Text at 0x1a2faadd0>



In [954]:

    
df_filtered.groupby('type')['gasUsed_t'].mean()









    Out[954]:





type
call         3051.038417
create     340235.204340
suicide         0.000000
tx          29462.653340
Name: gasUsed_t, dtype: float64

Obtain averages over previous blocks for each block id to generate new features

difficulty
blocktime
transactions/second
uncle rate
gasused by transaction
gasused by block
gasprice

Add column with:

txs per second for each block
avg gasUsed for each block
avg price per transaction for each block



In [955]:

    
df_filtered['txcnt_second'] = df_filtered['tx_count'].values / df_filtered['blockTime'].values









    



/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':



In [956]:

    
df_filtered['avg_gasUsed_t_perblock'] = df_filtered.groupby('block_id')['gasUsed_t'].transform('mean')









    



/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':



In [957]:

    
df_filtered['avg_price_perblock'] = df_filtered.groupby('block_id')['price_gwei'].transform('mean')









    



/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':



In [958]:

    
df_filtered.to_csv('./../data/data_filtered.csv')

Calculate the rolling average over given number of windows

This is very messy! Will clean up later



In [959]:

    
def rolling_avg(window_size):
    
    price = df_filtered[['block_id', 'avg_price_perblock']].drop_duplicates().sort_values(
                        'block_id', ascending=True)
    
    gasUsed_t = df_filtered[['block_id', 'avg_gasUsed_t_perblock']].drop_duplicates().sort_values(
                            'block_id', ascending=True)
    
    txcnt_second = df_filtered[['block_id', 'txcnt_second']].drop_duplicates().sort_values(
                                'block_id', ascending=True)
    
    tx_count = df_filtered[['block_id', 'tx_count']].drop_duplicates().sort_values(
                            'block_id', ascending=True)
    
    gasUsed_b = df_filtered[['block_id', 'gasUsed_b']].drop_duplicates().sort_values(
                            'block_id', ascending=True)
    
    uncle_count = df_filtered[['block_id', 'uncle_count']].drop_duplicates().sort_values(
                                'block_id', ascending=True)
    
    difficulty = df_filtered[['block_id', 'difficulty']].drop_duplicates().sort_values(
                                'block_id', ascending=True)
    
    blocktime = df_filtered[['block_id', 'blockTime']].drop_duplicates().sort_values(
                             'block_id', ascending=True)
    
    # create new pandas dataframe with average values
    rolling_avg = pd.DataFrame()
    
    # calculate rolling averages 
    rolling_avg['avg_blocktime'] = blocktime['blockTime'].rolling(window=window_size).mean()
    rolling_avg['avg_gasUsed_b'] = gasUsed_b['gasUsed_b'].rolling(window=window_size).mean()
    rolling_avg['avg_tx_count'] = tx_count['tx_count'].rolling(window=window_size).mean()
    rolling_avg['avg_uncle_count'] = uncle_count['uncle_count'].rolling(window=window_size).mean()
    rolling_avg['avg_difficulty'] = difficulty['difficulty'].rolling(window=window_size).mean()
    rolling_avg['avg_txcnt_second'] = txcnt_second['txcnt_second'].rolling(window=window_size).mean()
    rolling_avg['avg_gasUsed_t'] = gasUsed_t['avg_gasUsed_t_perblock'].rolling(window=window_size).mean()
    rolling_avg['avg_price'] = price['avg_price_perblock'].rolling(window=window_size).mean()
    
    # insert blockids to merge on
    rolling_avg['blockids'] = df_filtered['block_id'].drop_duplicates().sort_values(ascending=True)
    
    return rolling_avg

There are between 600 and 3500 blocks in a given day, so try 5 through 4000 previous blocks

Create csv files for each regime



In [960]:

    
num_blocks = [5, 10, 25, 50, 100, 1000, 2000, 3000, 4000]
for num in num_blocks:
    df_rolling_avg = rolling_avg(num)
    df_rolling_avg.to_csv('./../data/block_avg_{}.csv'.format(num))



In [ ]:

type	call	create	suicide	tx
count	4.925950e+05	27726.000000	318.000000	9.736400e+05
mean	3.413451e+01	11.723243	8.152529	2.706330e+01
std	1.597499e+02	15.771398	10.768688	1.211050e+02
min	4.000000e-09	0.030000	0.500000	1.000000e-09
25%	2.000000e+01	1.000000	0.500000	2.000000e+01
50%	2.200000e+01	5.500000	0.500000	2.100000e+01
75%	5.100000e+01	21.000000	21.000000	2.844094e+01
max	3.730453e+04	512.000000	55.945000	3.678191e+04

	amount_eth	type	price_gwei	block_id
0	0.499093	tx	21.0	4295887
1	0.149222	call	21.0	4295887
2	0.099608	tx	21.0	4295887
3	0.199005	tx	21.0	4295887
4	1.000289	tx	21.0	4295887