In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
The transaction data and block data are stored in separate JSON files
In [8]:
transactions1 = pd.read_json('./../data/transactions.json', lines=True)
In [9]:
transactions2 = pd.read_json('./../data/new_transactions.json', lines=True)
In [10]:
transactions1.head()
Out[10]:
In [11]:
transactions2.head()
Out[11]:
In [12]:
transactions1.shape, transactions2.shape
Out[12]:
Concatenate the transaction data into a single dataframe
In [13]:
# stack the two transaction dataframes row-wise
transactions = pd.concat([transactions1, transactions2])
In [14]:
transactions.shape
Out[14]:
In [15]:
transactions['hash'].values.shape
Out[15]:
In [16]:
np.unique(transactions['hash'].values).shape
Out[16]:
In [17]:
transactions.head()
Out[17]:
The number of unique transaction hashes equals the number of rows, so there are no duplicate transactions
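A more direct check (a minimal sketch against the combined transactions dataframe above) gives the same answer:

# True when every transaction hash occurs exactly once
transactions['hash'].is_unique
# number of repeated hashes; 0 means no duplicates
transactions['hash'].duplicated().sum()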
In [18]:
blocks1 = pd.read_json('./../data/blocks.json', lines=True)
In [19]:
blocks2 = pd.read_json('./../data/blocks_more.json', lines=True)
In [20]:
blocks3 = pd.read_json('./../data/blocks_2.json', lines=True)
In [21]:
blocks1.shape, blocks2.shape, blocks3.shape
Out[21]:
Concatenate the block data into a single dataframe
In [22]:
# stack the three block dataframes row-wise
blocks = pd.concat([blocks1, blocks2, blocks3])
In [23]:
blocks.shape
Out[23]:
In [24]:
blocks['number'].head()
Out[24]:
In [25]:
blocks.head()
Out[25]:
In [26]:
blocks.columns
Out[26]:
Create a new block_id column by converting the float block numbers to integers
In [27]:
blocks['block_id'] = blocks['number'].apply(lambda x: int(round(x)))
In [28]:
blocks['number'].values.shape
Out[28]:
In [29]:
np.unique(blocks['number'].values).shape
Out[29]:
In [30]:
blocks['block_id'].unique().shape
Out[30]:
In [31]:
blocks[['number', 'block_id']].head()
Out[31]:
Drop duplicate blocks
In [32]:
blocks.drop_duplicates(subset='block_id', inplace=True)
In [33]:
blocks.shape
Out[33]:
In [34]:
blocks['number'].values.shape
Out[34]:
In [35]:
np.unique(blocks['number'].values).shape
Out[35]:
In [36]:
blocks.head()
Out[36]:
Now each block appears exactly once in the dataframe
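As a sanity check (a sketch, not part of the original run), the uniqueness can be asserted directly:

# raises an AssertionError if any block_id is still duplicated
assert blocks['block_id'].is_unique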
In [37]:
transactions.columns
Out[37]:
In [38]:
blocks.columns
Out[38]:
Merge the dataframes on block_id
In [39]:
merged_df = transactions.merge(blocks, on='block_id',
                               suffixes=('_t', '_b'))
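Since every transaction should map to exactly one block, the merge can also be validated explicitly; a sketch using pandas' built-in check (not part of the original run):

# validate='many_to_one' raises a MergeError if block_id is not unique in blocks
merged_check = transactions.merge(blocks, on='block_id',
                                  suffixes=('_t', '_b'),
                                  validate='many_to_one')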
In [40]:
transactions.shape
Out[40]:
In [41]:
blocks.shape
Out[41]:
In [42]:
merged_df.shape
Out[42]:
In [43]:
merged_df['hash_t'].unique().shape
Out[43]:
In [44]:
merged_df.columns
Out[44]:
Select a subset of columns to work with (a quick check on these column names follows the list below)
In [46]:
sel_cols = ['hash_t',
'accountNonce',
'amount',
'block_id',
'gasLimit_t',
'gasUsed_t',
'isContractTx',
'newContract',
'price',
'time_t',
'txIndex',
'type',
'blockTime',
'difficulty',
'gasLimit_b',
'gasUsed_b',
'reward',
'size',
'time_b',
'totalFee',
'tx_count',
'uncle_count']
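Before selecting, a minimal sketch to confirm that every name in sel_cols actually exists in merged_df:

# list any selected columns missing from the merged dataframe
missing = [c for c in sel_cols if c not in merged_df.columns]
assert not missing, missing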
In [47]:
# copy so the datetime conversions below modify df rather than a view of merged_df
df = merged_df[sel_cols].copy()
In [48]:
print('no. transactions: {}, no. blocks: {}'.format(np.unique(df['hash_t'].values).shape[0],
np.unique(df['block_id'].values).shape[0]))
Convert dates to datetime
In [49]:
df.loc[:,'time_t'] = pd.to_datetime(df.time_t, yearfirst=True)
In [50]:
df.loc[:,'time_b'] = pd.to_datetime(df.time_b, yearfirst=True)
In [51]:
dates = df['time_t'].values.astype('datetime64[D]')
In [52]:
np.unique(dates)
Out[52]:
The blockchain data spans the dates 2017-09-18 through 2017-09-24
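The range can be confirmed directly from the converted timestamps (a sketch):

# earliest and latest transaction timestamps in the dataset
df['time_t'].min(), df['time_t'].max()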
In [58]:
df['time_t'].head()
Out[58]:
In [53]:
np.unique(df['hash_t'].values).shape
Out[53]:
In [54]:
df.shape
Out[54]:
Check for duplicate rows
In [55]:
df.duplicated(subset='hash_t').value_counts()
Out[55]:
No duplicates found
Create a CSV file of the final dataset
In [57]:
df.to_csv('./../data/data.csv')
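When the CSV is read back later, the datetime columns will load as plain strings unless parsed explicitly; a minimal sketch (same path and column names as above):

# reload the saved dataset with the index and timestamps restored
data = pd.read_csv('./../data/data.csv', index_col=0,
                   parse_dates=['time_t', 'time_b'])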