In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


//anaconda/lib/python2.7/site-packages/IPython/html.py:14: ShimWarning: The `IPython.html` package has been deprecated. You should import from `notebook` instead. `IPython.html.widgets` has moved to `ipywidgets`.
  "`IPython.html.widgets` has moved to `ipywidgets`.", ShimWarning)

In [4]:
def merge_data():
    """
    Merged transaction and block data from json files
    """
    
    # load transaction data into dataframes and combine
    transactions1 = pd.read_json('./../data/transactions.json', lines=True)
    transactions2 = pd.read_json('./../data/new_transactions.json', lines = True)
    transactions = transactions1.append(transactions2)
    
    # load block data into dataframes and combine
    blocks1 = pd.read_json('./../data/blocks.json', lines=True)
    blocks2 = pd.read_json('./../data/blocks_more.json', lines=True)
    blocks3 = pd.read_json('./../data/blocks_2.json', lines=True)
    blocks = blocks1.append([blocks2, blocks3])
    
    # new block_id column converting floats back into integers
    blocks['block_id'] = blocks['number'].apply(lambda x: int(round(x)))
    
    # drop duplicate blocks
    blocks.drop_duplicates(subset='block_id', inplace=True)
    
    # merge transaction and block data 
    merged_df = transactions.merge(blocks, left_on='block_id', right_on='block_id', 
                                   suffixes=('_t', '_b'))
    
    # write to csv file
    merged_df.to_csv('./../data/data.csv')
    
    return merged_df

In [2]:
def clean_data(df):
    
    """
    Takes the merged data from 'merge_data()' and cleans it for modeling.
    """
    # drop columns
    
    # convert dates to datetime
    df.loc[:,'time_t'] = pd.to_datetime(df.time_t, yearfirst=True)
    df.loc[:,'time_b'] = pd.to_datetime(df.time_b, yearfirst=True)
    
    # drop unnamed column
    for col in df.columns:
        if col == 'Unnamed: 0':
            df.drop('Unnamed: 0', axis=1, inplace=True)
        else:
            continue
            
    # drop columns containing predominantly null values
    df.drop(['isContractTx', 'txIndex'], axis=1, inplace=True)
        
    # convert datatypes if necessary
    
    # convert long integers to regular integers
    df['difficulty'] = df['difficulty'].apply(lambda x: int(filter(str.isdigit, x)))
    df['reward'] = df['reward'].apply(lambda x: int(filter(str.isdigit, x)))
    df['totalFee'] = df['totalFee'].apply(lambda x: int(filter(str.isdigit, x)))
    
    # convert floats back to integers
    def float_to_int(col_list):
        for col in col_list:
            df[col] = df[col].apply(lambda x: np.rint(x))
            df[col] = df[col].values.astype(int)
            
    float_to_int(['amount', 'price', 'gasLimit_b', 'gasUsed_b'])
    
    # add new columns
    
    # price set for price per unit gas in gwei
    df['price_gwei'] = df['price'] / 1000000000.0
    
    # amount of money transfered in gwei
    df['amount_gwei'] = df['amount'] / 1000000000.0
    
    # fraction of gas used gasUsed in a single traction w.r.t. entire gasUsed in the block
    df['gasShare']  = df.gasUsed_t/df.gasUsed_b
    
    # actual price paid at the end of the transaction 
    df['gweiPaid'] = df.gasUsed_t*df.price_gwei
    
    # dict of price paid and blockid
    gweiDict = df[['gweiPaid','block_id']].groupby('block_id').sum().T.to_dict()
    
    # actual price paid per block
    df['gweiPaid_b'] = df.block_id.apply(lambda b: gweiDict[b]['gweiPaid'])
    
    # fraction of gwei paid w.r.t. the entire block
    df['gweiShare']  = df.gweiPaid/df.gweiPaid_b
    
    # transactions where no gas was used, thus it costs nothing (1 if it is free)
    df['free_t'] = (df.gasUsed_t ==0).apply(int)
    
    # separate columns for day, hour, dayofweek
    df['day'] = pd.DatetimeIndex(df['time_t']).day
    df['hour'] = pd.DatetimeIndex(df['time_t']).hour
    df['dayofweek'] = pd.DatetimeIndex(df['time_t']).dayofweek
    
    # write to csv file
    df.to_csv('./../data/clean_data.csv')
    
    return df

In [6]:
merged_df = merge_data()

In [3]:
df_toclean = pd.read_csv('./../data/data.csv')

In [4]:
clean_data(df_toclean)


Out[4]:
hash_t accountNonce amount block_id gasLimit_t gasUsed_t newContract price time_t type ... price_gwei amount_gwei gasShare gweiPaid gweiPaid_b gweiShare free_t day hour dayofweek
0 0xfc505bf23efe13154192262c3f7b99e2517b133fe5f6... 3324654 499093201921000000 4295887 50000 21000 0 21000000000 2017-09-20 20:24:15 tx ... 21.000000 4.990932e+08 0.054830 4.410000e+05 8.627577e+06 0.051115 0 20 20 2
1 0xfa0124a64e4cedc7d75dff3ee0fca13d32b2a368e0cb... 3324658 149222093810000000 4295887 20967 9416 0 21000000000 2017-09-20 20:24:15 call ... 21.000000 1.492221e+08 0.024585 1.977360e+05 8.627577e+06 0.022919 0 20 20 2
2 0xe2a23b916903133728ab3dbfac55a56651508a971e89... 3324660 99608460653500000 4295887 50000 21000 0 21000000000 2017-09-20 20:24:15 tx ... 21.000000 9.960846e+07 0.054830 4.410000e+05 8.627577e+06 0.051115 0 20 20 2
3 0xc431fcd82ad55dc1aee932fa9965b55f8515981147a0... 3324657 199004777431000000 4295887 50000 21000 0 21000000000 2017-09-20 20:24:15 tx ... 21.000000 1.990048e+08 0.054830 4.410000e+05 8.627577e+06 0.051115 0 20 20 2
4 0xb759c34cb5b5af6f94f7002ef812b8482880d1f73d10... 3324662 1000288571660000000 4295887 50000 21000 0 21000000000 2017-09-20 20:24:15 tx ... 21.000000 1.000289e+09 0.054830 4.410000e+05 8.627577e+06 0.051115 0 20 20 2
5 0xad177c193597850a4c7c95cb990233a1190bd6118d33... 3324658 149222093810000000 4295887 50000 39512 0 21000000000 2017-09-20 20:24:15 tx ... 21.000000 1.492221e+08 0.103163 8.297520e+05 8.627577e+06 0.096174 0 20 20 2
6 0xa0e7eaa972996066971a21a0c4b51c413ed77dc1dd03... 557 399349399000000000 4295887 35000 30981 0 21000000000 2017-09-20 20:24:15 tx ... 21.000000 3.993494e+08 0.080889 6.506010e+05 8.627577e+06 0.075409 0 20 20 2
7 0x6ae39472fc63e30f55d38fa09403c7fb6d49da2776fa... 3324655 304861003483000000 4295887 50000 21000 0 21000000000 2017-09-20 20:24:15 tx ... 21.000000 3.048610e+08 0.054830 4.410000e+05 8.627577e+06 0.051115 0 20 20 2
8 0x68f70813df3be71d7792dfd7551b4ef72773c4b74e8a... 3324658 149222093810000000 4295887 2300 0 0 21000000000 2017-09-20 20:24:15 call ... 21.000000 1.492221e+08 0.000000 0.000000e+00 8.627577e+06 0.000000 1 20 20 2
9 0x63425f7ad03a1c88a87e75e4b86b785e3244128c974e... 3324661 1001551304340000000 4295887 50000 39512 0 21000000000 2017-09-20 20:24:15 tx ... 21.000000 1.001551e+09 0.103163 8.297520e+05 8.627577e+06 0.096174 0 20 20 2
10 0x62fc87b620a1f1ccde895b1158d1207d8538b2691d09... 3324656 300354465551000000 4295887 50000 21000 0 21000000000 2017-09-20 20:24:15 tx ... 21.000000 3.003545e+08 0.054830 4.410000e+05 8.627577e+06 0.051115 0 20 20 2
11 0x475dde01e6d899c2be5c8bc7ca3eb131b46eb04c7b72... 1 173683479227000000 4295887 21000 21000 0 21000000000 2017-09-20 20:24:15 tx ... 21.000000 1.736835e+08 0.054830 4.410000e+05 8.627577e+06 0.051115 0 20 20 2
12 0x45c1a1284724e702fe701eed3bf0a1dd61323f5c49d3... 557 399349399000000000 4295887 2300 0 0 21000000000 2017-09-20 20:24:15 call ... 21.000000 3.993494e+08 0.000000 0.000000e+00 8.627577e+06 0.000000 1 20 20 2
13 0x4397eb4742151430d8912b737d25d03af3066be46395... 3324663 199106729755000000 4295887 50000 21000 0 21000000000 2017-09-20 20:24:15 tx ... 21.000000 1.991067e+08 0.054830 4.410000e+05 8.627577e+06 0.051115 0 20 20 2
14 0x37f164b5905ae06d6fb329e52c68d65c92560082b389... 11 309517000000000000 4295887 30000 21000 0 21000000000 2017-09-20 20:24:15 tx ... 21.000000 3.095170e+08 0.054830 4.410000e+05 8.627577e+06 0.051115 0 20 20 2
15 0x2b723aa9b043f82b71f3fe8aff5a2512547f22dfc0d4... 3324661 1001551304340000000 4295887 20967 9416 0 21000000000 2017-09-20 20:24:15 call ... 21.000000 1.001551e+09 0.024585 1.977360e+05 8.627577e+06 0.022919 0 20 20 2
16 0x2a8ed876da33d04b429a703ef584aafb5fde5ca7abcb... 3324659 49037782485800000 4295887 50000 21000 0 21000000000 2017-09-20 20:24:15 tx ... 21.000000 4.903778e+07 0.054830 4.410000e+05 8.627577e+06 0.051115 0 20 20 2
17 0x226d102f0552152fd09f59a554d7d90184b0c1f510d4... 0 4929028760000000000 4295887 21000 21000 0 21000000000 2017-09-20 20:24:15 tx ... 21.000000 4.929029e+09 0.054830 4.410000e+05 8.627577e+06 0.051115 0 20 20 2
18 0x142c3304507018369d65b4dec6e3100876ba324ee9d4... 17 423689742503000000 4295887 21000 21000 0 21000000000 2017-09-20 20:24:15 tx ... 21.000000 4.236897e+08 0.054830 4.410000e+05 8.627577e+06 0.051115 0 20 20 2
19 0x13e3824853ad342d3e38f8daf22fb79573ad11a1ec39... 112783 850533160000000000 4295887 21000 21000 0 30000000000 2017-09-20 20:24:15 tx ... 30.000000 8.505332e+08 0.054830 6.300000e+05 8.627577e+06 0.073022 0 20 20 2
20 0x0c981d56e7dbd75fd5137fcee58b00487d617329a9b0... 3324661 1001551304340000000 4295887 2300 0 0 21000000000 2017-09-20 20:24:15 call ... 21.000000 1.001551e+09 0.000000 0.000000e+00 8.627577e+06 0.000000 1 20 20 2
21 0xff287e7fdb22915154a691890da53ae3de827d875e51... 23 334373990920000000 4295886 35000 30981 0 21000000000 2017-09-20 20:24:05 tx ... 21.000000 3.343740e+08 0.013147 6.506010e+05 7.194889e+07 0.009043 0 20 20 2
22 0xfe9b34c5012de3a8c8e9d3ab36dffe93861eded3797a... 0 437361620000000000 4295886 90000 21051 0 21000000000 2017-09-20 20:24:05 tx ... 21.000000 4.373616e+08 0.008933 4.420710e+05 7.194889e+07 0.006144 0 20 20 2
23 0xfd5e020bbe174a2c15b62b5f943a22fa8dc1ef867f03... 162824 -9223372036854775808 4295886 100000 21000 0 21000000000 2017-09-20 20:24:05 tx ... 21.000000 -9.223372e+09 0.008911 4.410000e+05 7.194889e+07 0.006129 0 20 20 2
24 0xfd4ef54cf9f5f45049724589db66167c41eee75f94c0... 1689923 0 4295886 166396 802 0 51000000000 2017-09-20 20:24:05 call ... 51.000000 0.000000e+00 0.000340 4.090200e+04 7.194889e+07 0.000568 0 20 20 2
25 0xfc58718b463ed384e45ee14f16dd08a14e94ccd0de46... 50495 0 4295886 200000 89244 0 25000000000 2017-09-20 20:24:05 tx ... 25.000000 0.000000e+00 0.037871 2.231100e+06 7.194889e+07 0.031010 0 20 20 2
26 0xfb5ec60663a0ce2f8eb74dd49e2e4afb669a38039340... 6640 0 4295886 280000 279531 1 21000000000 2017-09-20 20:24:05 create ... 21.000000 0.000000e+00 0.118621 5.870151e+06 7.194889e+07 0.081588 0 20 20 2
27 0xf9ee35f861882d9a68ea0997888e2bc5d563fc1760ef... 0 381456920000000000 4295886 110000 80904 0 20000000000 2017-09-20 20:24:05 tx ... 20.000000 3.814569e+08 0.034332 1.618080e+06 7.194889e+07 0.022489 0 20 20 2
28 0xf49964747b4dec0363f66dd5741da753e1d826e5a5ec... 1689924 0 4295886 177209 702 0 51000000000 2017-09-20 20:24:05 call ... 51.000000 0.000000e+00 0.000298 3.580200e+04 7.194889e+07 0.000498 0 20 20 2
29 0xf136f2da84ca307de7146ddb8166ab6540410f04cabf... 1689920 0 4295886 187118 37117 0 51000000000 2017-09-20 20:24:05 tx ... 51.000000 0.000000e+00 0.015751 1.892967e+06 7.194889e+07 0.026310 0 20 20 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1494249 0x9537ba74cd5bb0c1c2e3857fb792ef8a59f2e5553370... 56064 0 4296215 1747606 150519 0 22000001337 2017-09-20 22:35:44 call ... 22.000001 0.000000e+00 0.055012 3.311418e+06 1.385286e+09 0.002390 0 20 22 2
1494250 0x92b590beb8aff25e4b1dfe6ece2b16ed4f8a0ced9115... 94 0 4296215 89334 89334 0 21000000000 2017-09-20 22:35:44 call ... 21.000000 0.000000e+00 0.032650 1.876014e+06 1.385286e+09 0.001354 0 20 22 2
1494251 0x92690d47f4146f54bd9218a1582ef7b612cca65a8c47... 1691160 0 4296215 163089 13902 0 51000000000 2017-09-20 22:35:44 call ... 51.000000 0.000000e+00 0.005081 7.090020e+05 1.385286e+09 0.000512 0 20 22 2
1494252 0x92389c16a9a7f072de31b27dc9aae614f8c12275845e... 1691160 0 4296215 171215 786 0 51000000000 2017-09-20 22:35:44 call ... 51.000000 0.000000e+00 0.000287 4.008600e+04 1.385286e+09 0.000029 0 20 22 2
1494253 0x9036d7686917eef12e73e840747e2d7b301b38ab62cf... 33 0 4296215 200000 51984 0 50000000000 2017-09-20 22:35:44 tx ... 50.000000 0.000000e+00 0.018999 2.599200e+06 1.385286e+09 0.001876 0 20 22 2
1494254 0x8fba7c6f8fc2043a26ae7d0c0c4d973822335edd9710... 1691159 0 4296215 187159 37158 0 51000000000 2017-09-20 22:35:44 tx ... 51.000000 0.000000e+00 0.013581 1.895058e+06 1.385286e+09 0.001368 0 20 22 2
1494255 0x8bca4f22a5b2b82a301fe6f1a861738dbd6ec7c03b06... 1691160 0 4296215 174069 765 0 51000000000 2017-09-20 22:35:44 call ... 51.000000 0.000000e+00 0.000280 3.901500e+04 1.385286e+09 0.000028 0 20 22 2
1494256 0x8a7fb65e54dccb2628b43f3baddb6a11f5ea5e92241a... 82 0 4296215 569 387 0 21000000000 2017-09-20 22:35:44 call ... 21.000000 0.000000e+00 0.000141 8.127000e+03 1.385286e+09 0.000006 0 20 22 2
1494257 0x89d7856b90b0163cef77318e487b857c6437b26b933d... 5 5084651480000000000 4296215 21000 21000 0 21000000000 2017-09-20 22:35:44 tx ... 21.000000 5.084651e+09 0.007675 4.410000e+05 1.385286e+09 0.000318 0 20 22 2
1494258 0x888143341279183b49c2f4a41db622efa0a901026d04... 657 466725000000000 4296215 21000 21000 0 21000000000 2017-09-20 22:35:44 tx ... 21.000000 4.667250e+05 0.007675 4.410000e+05 1.385286e+09 0.000318 0 20 22 2
1494259 0x869c673c8f138246f5b6496eb4c587d3cdaf134d6b6d... 208 0 4296215 144805 9042 0 8395237322010 2017-09-20 22:35:44 call ... 8395.237322 0.000000e+00 0.003305 7.590974e+07 1.385286e+09 0.054797 0 20 22 2
1494260 0x862e37d048eecc3b41aa44f0d28c93923211cd573cb1... 512 0 4296215 99999 51504 0 21000000000 2017-09-20 22:35:44 tx ... 21.000000 0.000000e+00 0.018824 1.081584e+06 1.385286e+09 0.000781 0 20 22 2
1494261 0x813bb4baa995a1d6ac6b49ed5af806fce559910354c1... 146 0 4296215 320299 854 0 197872040000 2017-09-20 22:35:44 call ... 197.872040 0.000000e+00 0.000312 1.689827e+05 1.385286e+09 0.000122 0 20 22 2
1494262 0x803347682926e3c41670f40e4e9754431581910d3273... 34920 2451680000000000000 4296215 90000 21000 0 55000000000 2017-09-20 22:35:44 tx ... 55.000000 2.451680e+09 0.007675 1.155000e+06 1.385286e+09 0.000834 0 20 22 2
1494263 0x7fc867b4c72c41d0b378126cba8a9f83d930c46aede4... 82 49462462040100000 4296215 47008 31984 0 21000000000 2017-09-20 22:35:44 tx ... 21.000000 4.946246e+07 0.011690 6.716640e+05 1.385286e+09 0.000485 0 20 22 2
1494264 0x7c93edca1b591eade2e1d7e15181b26b7ea882bb94af... 146 0 4296215 338791 824 0 197872040000 2017-09-20 22:35:44 call ... 197.872040 0.000000e+00 0.000301 1.630466e+05 1.385286e+09 0.000118 0 20 22 2
1494265 0x7a9fd2f6bb2a7940c4f8497fca6c0c34dc62331c009a... 1 500000000000000000 4296215 90000 21000 0 21000000000 2017-09-20 22:35:44 tx ... 21.000000 5.000000e+08 0.007675 4.410000e+05 1.385286e+09 0.000318 0 20 22 2
1494266 0x7a43c432d8da55e486079f51afc11ddf720078d4a17e... 243046 0 4296215 250000 129139 0 20000000000 2017-09-20 22:35:44 tx ... 20.000000 0.000000e+00 0.047198 2.582780e+06 1.385286e+09 0.001864 0 20 22 2
1494267 0x789084de775ff3228c4a2acee35fe75e5e68c46b776d... 1691160 0 4296215 165781 831 0 51000000000 2017-09-20 22:35:44 call ... 51.000000 0.000000e+00 0.000304 4.238100e+04 1.385286e+09 0.000031 0 20 22 2
1494268 0x7812fd24460de903b7ffd3822f2e4290d9055bf40c31... 146 0 4296215 329586 824 0 197872040000 2017-09-20 22:35:44 call ... 197.872040 0.000000e+00 0.000301 1.630466e+05 1.385286e+09 0.000118 0 20 22 2
1494269 0x7610481a6a738ece76df87ad6fc1cdbaa0ebc2183668... 0 173020000000000000 4296215 90000 21000 0 21000000000 2017-09-20 22:35:44 tx ... 21.000000 1.730200e+08 0.007675 4.410000e+05 1.385286e+09 0.000318 0 20 22 2
1494270 0x73ed25c8eb07fd151f0b65d7015bbef022f261b373e0... 162837 -9223372036854775808 4296215 100000 21000 0 21000000000 2017-09-20 22:35:44 tx ... 21.000000 -9.223372e+09 0.007675 4.410000e+05 1.385286e+09 0.000318 0 20 22 2
1494271 0x73c40f6754297dd26e5c96542cee0b3c5a6199d3480c... 1 3515056840000000000 4296215 90000 21000 0 21000000000 2017-09-20 22:35:44 tx ... 21.000000 3.515057e+09 0.007675 4.410000e+05 1.385286e+09 0.000318 0 20 22 2
1494272 0x71deab40b52347b19d3a696e7fd02cf9d79d6c3ed5f0... 208 0 4296215 185000 48146 0 8395237322010 2017-09-20 22:35:44 tx ... 8395.237322 0.000000e+00 0.017597 4.041971e+08 1.385286e+09 0.291779 0 20 22 2
1494273 0x719ead26c34bbc534055e1cc4ef23970edf6002b62cf... 963 0 4296215 125000 93050 0 8594998120540 2017-09-20 22:35:44 tx ... 8594.998121 0.000000e+00 0.034008 7.997646e+08 1.385286e+09 0.577328 0 20 22 2
1494274 0x6fde3be8085dae36426db6e015876c243c8b22c702ec... 0 90000000000000000 4296215 90000 90000 0 21000000000 2017-09-20 22:35:44 tx ... 21.000000 9.000000e+07 0.032893 1.890000e+06 1.385286e+09 0.001364 0 20 22 2
1494275 0x6f4788c9bd73359be8c730712389fe95073f2079774e... 3325849 100009484642000000 4296215 2300 0 0 21000000000 2017-09-20 22:35:44 call ... 21.000000 1.000095e+08 0.000000 0.000000e+00 1.385286e+09 0.000000 1 20 22 2
1494276 0x6e952bcf1c52470512b9cb58a64524f7a84d95196f0e... 0 -9223372036854775808 4296215 90000 21000 0 21000000000 2017-09-20 22:35:44 tx ... 21.000000 -9.223372e+09 0.007675 4.410000e+05 1.385286e+09 0.000318 0 20 22 2
1494277 0x6e040b0e892599454e22957540842e4a1f0bef409e8b... 63705 160425050000000000 4296215 50000 21000 0 30000000000 2017-09-20 22:35:44 tx ... 30.000000 1.604250e+08 0.007675 6.300000e+05 1.385286e+09 0.000455 0 20 22 2
1494278 0x6cd81393fe04e728fba4f6617b40409c7406d47f9e88... 56064 0 4296215 1615368 1798 0 22000001337 2017-09-20 22:35:44 call ... 22.000001 0.000000e+00 0.000657 3.955600e+04 1.385286e+09 0.000029 0 20 22 2

1494279 rows × 30 columns


In [ ]: