Load library
In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import networkx as nx
import pygraphviz as pgv
import pydot as pyd
from networkx.drawing.nx_agraph import graphviz_layout
from networkx.drawing.nx_agraph import write_dot
Load data
In [2]:
%%time
# Read the three input tables. The edge and node files carry a leftover
# 'Unnamed: 0' column, which is dropped right after loading.
edges = pd.read_csv('../data/edges.csv').drop('Unnamed: 0', axis=1)
nodes = pd.read_csv('../data/nodes.csv').drop('Unnamed: 0', axis=1)
rogues = pd.read_csv('../data/rogues.csv')
Build proper edge array
In [3]:
%%time
# Simple way (non parallel computing).
# Build one (source, target, attributes) tuple per transaction. Only the first
# MAX_EDGES rows are used; head() copes with files that have fewer rows, where
# the former fixed-size index loop raised a KeyError.
MAX_EDGES = 1000000
edges_subset = edges.head(MAX_EDGES)
# Iterating the columns in lock-step avoids a label lookup per cell.
edge_array = [
    (source, target, {'value': value, 'time': timestamp, 'hash': tx_hash})
    for source, target, value, timestamp, tx_hash in zip(
        edges_subset['from'],
        edges_subset['to'],
        edges_subset['value'],
        edges_subset['timestamp'],
        edges_subset['hash'],
    )
]
Generate a MultiDiGraph with networkx from the edge array
In [4]:
%%time
# Directed multigraph: several transactions may link the same pair of nodes.
TG = nx.MultiDiGraph()
# Each tuple is (from, to, attribute dict); add_weighted_edges_from stores the
# third element under the 'weight' key, so the per-transaction dict is read
# back later as data['weight']['value'] / ['time'] / ['hash'].
TG.add_weighted_edges_from(edge_array, weight='weight')
In [5]:
%%time
# Network Characteristics
print 'Number of nodes:', TG.number_of_nodes()
print 'Number of edges:', TG.number_of_edges()
print 'Number of connected components:', nx.number_connected_components(TG.to_undirected())
# Degree
degree_sequence = TG.degree().values()
degree_out_sequence = TG.out_degree().values()
degree_in_sequence = TG.in_degree().values()
print "Min degree ", np.min(degree_sequence)
print "Max degree ", np.max(degree_sequence)
print "Median degree ", np.median(degree_sequence)
print "Mean degree ", np.mean(degree_sequence)
print "Min degree IN", np.min(degree_in_sequence)
print "Max degree IN", np.max(degree_in_sequence)
print "Median degree IN", np.median(degree_in_sequence)
print "Mean degree IN", np.mean(degree_in_sequence)
print "Min degree OUT", np.min(degree_out_sequence)
print "Max degree OUT", np.max(degree_out_sequence)
print "Median degree OUT", np.median(degree_out_sequence)
print "Mean degree OUT", np.mean(degree_out_sequence)
In [6]:
%%time
# Degree distribution on log-log axes.
# y[d] is the number of nodes whose degree equals d.
y = nx.degree_histogram(TG)
plt.figure(1)
plt.loglog(y, color='b', linestyle='-', marker='o')
plt.xlabel("Degree")
plt.ylabel("Frequency")
plt.draw()
plt.show()
In [7]:
# New dataframe for feature engineering: one row per graph node
df = pd.DataFrame({'nodes': TG.nodes()})
Features description
# | Description | Variable |
---|---|---|
I | Degree | total_degree |
II | Degree in | degree_in |
III | Degree out | degree_out |
IV | Number of unique predecessors | unique_predecessors |
V | Number of unique successors | unique_successors |
VI | Mean ether amount in incoming transactions | mean_value_in |
VII | Mean ether amount in outgoing transactions | mean_value_out |
VIII | Std ether amount in incoming transactions | std_value_in |
IX | Std ether amount in outgoing transactions | std_value_out |
X | Ratio of the number of incoming transactions to the number of unique timestamps | ratio_in_timestamp |
XI | Ratio of the number of outgoing transactions to the number of unique timestamps | ratio_out_timestamp |
XII | Frequency of incoming transactions | frequency_in |
XIII | Frequency of outgoing transactions | frequency_out |
XIV | Ether balance of the node | balance |
XVI | Average velocity in | mean_velocity_in |
XVII | Average velocity out | mean_velocity_out |
XVIII | Std velocity in | std_velocity_in |
XIX | Std velocity out | std_velocity_out |
XX | Average acceleration in | mean_acceleration_in |
XXI | Average acceleration out | mean_acceleration_out |
α | Min path to a rogue node | min_path_to_rogue |
β | Min path from a rogue node | min_path_from_rogue |
δ | Amount of ether on the min path to a rogue node | amount_to_rogue |
ε | Amount of ether on the min path from a rogue node | amount_from_rogue |
1 | Average neighbours velocity out | - |
2 | Average neighbours acceleration out | - |
In [8]:
# Feature I: total degree (in + out) of every node
df['total_degree'] = df['nodes'].map(TG.degree)
In [9]:
# Features II and III: number of incoming / outgoing transactions per node
df['degree_in'] = df['nodes'].map(TG.in_degree)
df['degree_out'] = df['nodes'].map(TG.out_degree)
In [10]:
# Features V and IV: number of distinct successors / predecessors per node
# (len() is applied directly, so successors()/predecessors() must return a sized collection)
df['unique_successors'] = [len(TG.successors(address)) for address in df['nodes']]
df['unique_predecessors'] = [len(TG.predecessors(address)) for address in df['nodes']]
In [11]:
def get_mean_value_in(node):
    '''
    Return the mean value of the transactions entering `node`.

    Reads the global graph TG; every edge stores its transaction dict under
    the 'weight' attribute. A node without incoming edges yields the mean of
    an empty list (NaN, with a numpy warning).
    '''
    incoming = TG.in_edges_iter(node, keys=False, data=True)
    amounts = [float(attributes['weight']['value']) for _, _, attributes in incoming]
    return np.average(amounts)
In [12]:
%%time
# Add the feature: mean value of incoming transactions
df['mean_value_in'] = df['nodes'].map(get_mean_value_in)
In [13]:
#Write a function
def get_mean_value_out(node):
    '''
    Return the mean value of the transactions leaving `node`.

    Reads the global graph TG; every edge stores its transaction dict under
    the 'weight' attribute. A node without outgoing edges yields the mean of
    an empty list (NaN, with a numpy warning).
    '''
    outgoing = TG.out_edges_iter(node, keys=False, data=True)
    amounts = [float(attributes['weight']['value']) for _, _, attributes in outgoing]
    return np.average(amounts)
In [14]:
%%time
# Add the feature: mean value of outgoing transactions
df['mean_value_out'] = df['nodes'].map(get_mean_value_out)
In [15]:
#Write a function
def get_std_value_in(node):
    '''
    Return the standard deviation of the values of the transactions entering `node`.

    Reads the global graph TG; every edge stores its transaction dict under
    the 'weight' attribute. np.std is used with its defaults (population std).
    '''
    incoming = TG.in_edges_iter(node, keys=False, data=True)
    amounts = [float(attributes['weight']['value']) for _, _, attributes in incoming]
    return np.std(amounts)
In [16]:
%%time
# Add the feature: std of the value of incoming transactions
df['std_value_in'] = df['nodes'].map(get_std_value_in)
In [17]:
#Write a function
def get_std_value_out(node):
    '''
    Return the standard deviation of the values of the transactions leaving `node`.

    Reads the global graph TG; every edge stores its transaction dict under
    the 'weight' attribute. np.std is used with its defaults (population std).
    '''
    outgoing = TG.out_edges_iter(node, keys=False, data=True)
    amounts = [float(attributes['weight']['value']) for _, _, attributes in outgoing]
    return np.std(amounts)
In [18]:
%%time
# Add the feature: std of the value of outgoing transactions
df['std_value_out'] = df['nodes'].map(get_std_value_out)
In [19]:
#Write a function
def get_ratio_in_timestamp(node):
    '''
    Return (#incoming transactions) / (#distinct timestamps among them).

    Reads the global graph TG. Returns NaN when the node has no incoming
    transaction (zero distinct timestamps).
    '''
    incoming = TG.in_edges(node, keys=False, data=True)
    stamps = [attributes['weight']['time'] for _, _, attributes in incoming]
    distinct_count = float(len(np.unique(stamps)))
    transaction_count = float(len(incoming))
    # Guard against the division by zero for nodes without incoming edges
    if distinct_count == 0:
        return np.nan
    return transaction_count / distinct_count
In [20]:
%%time
# Add the feature: incoming transactions per distinct timestamp
df['ratio_in_timestamp'] = df['nodes'].map(get_ratio_in_timestamp)
In [21]:
#Write a function
def get_ratio_out_timestamp(node):
    '''
    Return (#outgoing transactions) / (#distinct timestamps among them).

    Reads the global graph TG. Returns NaN when the node has no outgoing
    transaction (zero distinct timestamps).
    '''
    outgoing = TG.out_edges(node, keys=False, data=True)
    stamps = [attributes['weight']['time'] for _, _, attributes in outgoing]
    distinct_count = float(len(np.unique(stamps)))
    transaction_count = float(len(outgoing))
    # Guard against the division by zero for nodes without outgoing edges
    if distinct_count == 0:
        return np.nan
    return transaction_count / distinct_count
In [22]:
%%time
# Add the feature: outgoing transactions per distinct timestamp
df['ratio_out_timestamp'] = df['nodes'].map(get_ratio_out_timestamp)
In [23]:
#write function
def get_in_frequency(node):
    '''
    Return the incoming transaction frequency of `node`:
    #incoming transactions / (latest timestamp - earliest timestamp), per second.

    Reads the global graph TG. Returns NaN when all incoming transactions
    share one timestamp (zero-length time span).
    '''
    incoming = TG.in_edges(node, keys=False, data=True)
    stamps = [attributes['weight']['time'] for _, _, attributes in incoming]
    # Parse the raw timestamps so that their difference is a Timedelta
    moments = pd.to_datetime(pd.Series(stamps))
    span_seconds = (moments.max() - moments.min()).total_seconds()
    if span_seconds == 0:
        return np.nan
    return len(incoming) / span_seconds
In [24]:
%%time
# Add the feature: frequency of incoming transactions
df['frequency_in'] = df['nodes'].map(get_in_frequency)
In [25]:
#write function
def get_out_frequency(node):
    '''
    Return the outgoing transaction frequency of `node`:
    #outgoing transactions / (latest timestamp - earliest timestamp), per second.

    Reads the global graph TG. Returns NaN when all outgoing transactions
    share one timestamp (zero-length time span).
    '''
    outgoing = TG.out_edges(node, keys=False, data=True)
    stamps = [attributes['weight']['time'] for _, _, attributes in outgoing]
    # Parse the raw timestamps so that their difference is a Timedelta
    moments = pd.to_datetime(pd.Series(stamps))
    span_seconds = (moments.max() - moments.min()).total_seconds()
    if span_seconds == 0:
        return np.nan
    return len(outgoing) / span_seconds
In [26]:
%%time
# Add the feature: frequency of outgoing transactions
df['frequency_out'] = df['nodes'].map(get_out_frequency)
In [27]:
#write function
def get_balance(node):
    '''
    Return the balance (in wei) of a given node: the sum of the values it
    received minus the sum of the values it sent, over the edges of the
    global graph TG.
    '''
    received = [
        float(attributes['weight']['value'])
        for _, _, attributes in TG.in_edges(node, keys=False, data=True)
    ]
    sent = [
        float(attributes['weight']['value'])
        for _, _, attributes in TG.out_edges(node, keys=False, data=True)
    ]
    return np.sum(received) - np.sum(sent)
In [28]:
%%time
# Add the feature: ether balance of the node
df['balance'] = df['nodes'].map(get_balance)
In [29]:
#write function
def get_mean_velocity_in(node):
    """
    Return the average ether velocity incoming into the node in wei/s.

    The velocity at interior transaction i is the central difference
    |value[i+1] - value[i-1]| / (time[i+1] - time[i-1]); samples whose two
    neighbours share a timestamp are skipped. Reads the global graph TG.

    NOTE(review): transactions are taken in the order TG.in_edges returns
    them, without sorting by timestamp - confirm that order is chronological.
    """
    incoming = TG.in_edges(node, keys=False, data=True)
    records = [attributes['weight'] for _, _, attributes in incoming]
    amounts = [float(record['value']) for record in records]
    moments = pd.to_datetime(pd.Series([record['time'] for record in records]))

    speeds = []
    for centre in range(1, len(incoming) - 1):
        before, after = centre - 1, centre + 1
        if moments[after] != moments[before]:
            elapsed = (moments[after] - moments[before]).total_seconds()
            speeds.append(np.absolute(amounts[after] - amounts[before]) / elapsed)
    # The time difference may be negative, hence the final absolute value
    return np.average(np.absolute(speeds))
In [30]:
%%time
# Add the feature: average incoming velocity
df['mean_velocity_in'] = df['nodes'].map(get_mean_velocity_in)
In [31]:
#write function
def get_mean_velocity_out(node):
    """
    Return the average ether velocity outgoing from the node in wei/s.

    The velocity at interior transaction i is the central difference
    |value[i+1] - value[i-1]| / (time[i+1] - time[i-1]); samples whose two
    neighbours share a timestamp are skipped. Reads the global graph TG.

    NOTE(review): transactions are taken in the order TG.out_edges returns
    them, without sorting by timestamp - confirm that order is chronological.
    """
    outgoing = TG.out_edges(node, keys=False, data=True)
    records = [attributes['weight'] for _, _, attributes in outgoing]
    amounts = [float(record['value']) for record in records]
    moments = pd.to_datetime(pd.Series([record['time'] for record in records]))

    speeds = []
    for centre in range(1, len(outgoing) - 1):
        before, after = centre - 1, centre + 1
        if moments[after] != moments[before]:
            elapsed = (moments[after] - moments[before]).total_seconds()
            speeds.append(np.absolute(amounts[after] - amounts[before]) / elapsed)
    # The time difference may be negative, hence the final absolute value
    return np.average(np.absolute(speeds))
In [32]:
%%time
# Add the feature: average outgoing velocity
df['mean_velocity_out'] = df['nodes'].map(get_mean_velocity_out)
In [38]:
#write function
def get_std_velocity_in(node):
    """
    Return the std of the ether velocity incoming into the node in wei/s.

    The velocity at interior transaction i is the central difference
    |value[i+1] - value[i-1]| / (time[i+1] - time[i-1]); samples whose two
    neighbours share a timestamp are skipped. Reads the global graph TG.

    NOTE(review): transactions are taken in the order TG.in_edges returns
    them, without sorting by timestamp - confirm that order is chronological.
    """
    incoming = TG.in_edges(node, keys=False, data=True)
    records = [attributes['weight'] for _, _, attributes in incoming]
    amounts = [float(record['value']) for record in records]
    moments = pd.to_datetime(pd.Series([record['time'] for record in records]))

    speeds = []
    for centre in range(1, len(incoming) - 1):
        before, after = centre - 1, centre + 1
        if moments[after] != moments[before]:
            elapsed = (moments[after] - moments[before]).total_seconds()
            speeds.append(np.absolute(amounts[after] - amounts[before]) / elapsed)
    # The time difference may be negative, hence the absolute value before the std
    return np.std(np.absolute(speeds))
In [39]:
%%time
# Add the feature: std of the incoming velocity
df['std_velocity_in'] = df['nodes'].map(get_std_velocity_in)
In [40]:
#write function
def get_std_velocity_out(node):
    """
    Return the std of the ether velocity outgoing from the node in wei/s.

    The velocity at interior transaction i is the central difference
    |value[i+1] - value[i-1]| / (time[i+1] - time[i-1]); samples whose two
    neighbours share a timestamp are skipped. Reads the global graph TG.

    NOTE(review): transactions are taken in the order TG.out_edges returns
    them, without sorting by timestamp - confirm that order is chronological.
    """
    outgoing = TG.out_edges(node, keys=False, data=True)
    records = [attributes['weight'] for _, _, attributes in outgoing]
    amounts = [float(record['value']) for record in records]
    moments = pd.to_datetime(pd.Series([record['time'] for record in records]))

    speeds = []
    for centre in range(1, len(outgoing) - 1):
        before, after = centre - 1, centre + 1
        if moments[after] != moments[before]:
            elapsed = (moments[after] - moments[before]).total_seconds()
            speeds.append(np.absolute(amounts[after] - amounts[before]) / elapsed)
    # The time difference may be negative, hence the absolute value before the std
    return np.std(np.absolute(speeds))
In [41]:
%%time
# Add the feature: std of the outgoing velocity
df['std_velocity_out'] = df['nodes'].map(get_std_velocity_out)
In [52]:
#write function
def get_mean_acceleration_in(node):
    """
    Return the average ether acceleration incoming into the node in wei.s-2.

    Velocities are central differences of the transaction values over time,
    and accelerations are central differences of those velocities. Each
    velocity keeps the timestamp of the transaction it is centred on, so the
    acceleration divides by the time elapsed between the two velocity samples
    it actually compares. Previously the velocity-list index was reused to
    index the transaction timestamps, which is shifted by one and drifts
    further whenever a velocity sample is skipped.

    Reads the global graph TG. Returns NaN when the node has too few incoming
    transactions to form an acceleration sample.

    NOTE(review): transactions are taken in the order TG.in_edges returns
    them, without sorting by timestamp - confirm that order is chronological.
    """
    # Collect values and timestamps of the incoming transactions
    values_in = []
    timestamps = []
    for edge in TG.in_edges(node, keys=False, data=True):
        values_in.append(float(edge[2]['weight']['value']))
        timestamps.append(edge[2]['weight']['time'])
    dates = pd.to_datetime(pd.Series(timestamps))

    # Velocity at every interior transaction, remembered with its own date
    velocities = []
    velocity_dates = []
    for i in range(1, len(values_in) - 1):
        if dates[i + 1] != dates[i - 1]:
            elapsed = (dates[i + 1] - dates[i - 1]).total_seconds()
            velocities.append(np.absolute((values_in[i + 1] - values_in[i - 1]) / elapsed))
            velocity_dates.append(dates[i])

    # Acceleration at every interior velocity sample
    accelerations = []
    for j in range(1, len(velocities) - 1):
        if velocity_dates[j + 1] != velocity_dates[j - 1]:
            elapsed = (velocity_dates[j + 1] - velocity_dates[j - 1]).total_seconds()
            accelerations.append(np.absolute(velocities[j + 1] - velocities[j - 1]) / elapsed)

    # Same value as averaging an empty list, without the numpy warning
    if not accelerations:
        return np.nan
    return np.average(np.absolute(accelerations))
In [43]:
%%time
# Add the feature: average incoming acceleration
df['mean_acceleration_in'] = df['nodes'].map(get_mean_acceleration_in)
In [44]:
#write function
def get_mean_acceleration_out(node):
    """
    Return the average ether acceleration outgoing from the node in wei.s-2.

    Velocities are central differences of the transaction values over time,
    and accelerations are central differences of those velocities. Each
    velocity keeps the timestamp of the transaction it is centred on, so the
    acceleration divides by the time elapsed between the two velocity samples
    it actually compares. Previously the velocity-list index was reused to
    index the transaction timestamps, which is shifted by one and drifts
    further whenever a velocity sample is skipped.

    Reads the global graph TG. Returns NaN when the node has too few outgoing
    transactions to form an acceleration sample.

    NOTE(review): transactions are taken in the order TG.out_edges returns
    them, without sorting by timestamp - confirm that order is chronological.
    """
    # Collect values and timestamps of the outgoing transactions
    values_out = []
    timestamps = []
    for edge in TG.out_edges(node, keys=False, data=True):
        values_out.append(float(edge[2]['weight']['value']))
        timestamps.append(edge[2]['weight']['time'])
    dates = pd.to_datetime(pd.Series(timestamps))

    # Velocity at every interior transaction, remembered with its own date
    velocities = []
    velocity_dates = []
    for i in range(1, len(values_out) - 1):
        if dates[i + 1] != dates[i - 1]:
            elapsed = (dates[i + 1] - dates[i - 1]).total_seconds()
            velocities.append(np.absolute((values_out[i + 1] - values_out[i - 1]) / elapsed))
            velocity_dates.append(dates[i])

    # Acceleration at every interior velocity sample
    accelerations = []
    for j in range(1, len(velocities) - 1):
        if velocity_dates[j + 1] != velocity_dates[j - 1]:
            elapsed = (velocity_dates[j + 1] - velocity_dates[j - 1]).total_seconds()
            accelerations.append(np.absolute(velocities[j + 1] - velocities[j - 1]) / elapsed)

    # Same value as averaging an empty list, without the numpy warning
    if not accelerations:
        return np.nan
    return np.average(np.absolute(accelerations))
In [45]:
%%time
# Add the feature: average outgoing acceleration
df['mean_acceleration_out'] = df['nodes'].map(get_mean_acceleration_out)
In [46]:
# Known rogue addresses, read from disk
rogues = pd.read_csv("../data/rogues.csv")
rogues_id = np.array(rogues['id'])
# Hard-coded addresses that the path features below are computed against
fake_rogues = [
    '0x223294182093bfc6b11e8ef5722d496f066036c2',
    '0xec1ebac9da3430213281c80fa6d46378341a96ae',
    '0xe6447ae67346b5fb7ebd65ebfc4c7e6521b21f8a',
]
In [47]:
#write function
def min_path_to_rogue(node, rogues):
    """
    Return the length of the shortest directed path from `node` to any of
    the `rogues`, measured on the global graph TG.

    Returns NaN when no rogue is reachable from `node`.
    """
    reachable = [rogue for rogue in rogues if nx.has_path(TG, node, rogue)]
    if not reachable:
        return np.nan
    return np.min([nx.shortest_path_length(TG, node, rogue) for rogue in reachable])
In [48]:
%%time
# Add the feature: shortest path length from each node to a rogue address
df['min_path_to_rogue'] = df['nodes'].apply(min_path_to_rogue, args=(fake_rogues,))
In [51]:
#write function
def min_path_from_rogue(node, rogues):
    """
    Return the length of the shortest directed path from any of the `rogues`
    to `node`, measured on the global graph TG.

    Returns NaN when `node` cannot be reached from any rogue.
    """
    # shortest_path_length lives in the networkx namespace; the bare name was
    # never imported and raised a NameError on the first reachable rogue.
    paths_lengths = [
        nx.shortest_path_length(TG, rogue, node)
        for rogue in rogues
        if nx.has_path(TG, rogue, node)
    ]
    if not paths_lengths:
        return np.nan
    return np.min(paths_lengths)
In [50]:
%%time
# Add the feature: shortest path length from a rogue address to each node
df['min_path_from_rogue'] = df['nodes'].apply(min_path_from_rogue, args=(fake_rogues,))
In [53]:
# Persist the engineered feature table (one row per node) next to the input data.
# No `index` argument is passed, so pandas also writes the row index as a column.
df.to_csv('../data/features.csv')
In [ ]: