Load libraries
In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import networkx as nx
import pylab as plt
Load data
In [2]:
# Load the transaction edges and the node table.
# 'Unnamed: 0' is presumably the index column written by an earlier to_csv
# call (TODO confirm); it carries no information, so it is dropped.
# `axis` is passed by keyword: the positional form is deprecated and is
# rejected by recent pandas releases.
edges = pd.read_csv('../data/edges.csv').drop('Unnamed: 0', axis=1)
nodes = pd.read_csv('../data/nodes.csv').drop('Unnamed: 0', axis=1)
Build proper edge array
In [6]:
# Simple way (non-parallel computing).
# Build one (source, target, attributes) tuple per transaction row.
# The columns are walked in lockstep through their underlying arrays, which
# avoids five label lookups per row and does not depend on the DataFrame
# index being exactly 0..n-1.
edge_array = [
    (source, target, {'value': value, 'time': timestamp, 'hash': tx_hash})
    for source, target, value, timestamp, tx_hash in zip(
        edges['from'].values,
        edges['to'].values,
        edges['value'].values,
        edges['timestamp'].values,
        edges['hash'].values,
    )
]
Generate a MultiDiGraph with networkx from the edge array
In [8]:
%%time
# Directed multigraph: parallel edges are kept, one per transaction.
TG = nx.MultiDiGraph()
# NOTE: add_weighted_edges_from treats the third tuple element as the edge
# weight, so each transaction's attribute dict ends up nested under the
# 'weight' key. The feature functions in this notebook rely on that layout
# (they read edge_data['weight']['value']).
TG.add_weighted_edges_from(edge_array, weight='weight')
In [9]:
%%time
# Persist the graph to disk in GML format.
gml_path = '../data/graph.gml'
nx.write_gml(TG, gml_path)
In [10]:
%%time
# Network characteristics
# Each line is printed as one pre-formatted string, so the text is the same
# under the Python 2 print statement and the Python 3 print function.
print('Number of nodes: %s' % TG.number_of_nodes())
print('Number of edges: %s' % TG.number_of_edges())
print('Number of connected components: %s'
      % nx.number_connected_components(TG.to_undirected()))

# Degree
# dict(...) accepts both return styles of degree(): the {node: degree} dict
# of networkx 1.x and the (node, degree) view of networkx 2.x. list(...)
# gives numpy a real sequence on Python 3 as well.
degree_sequence = list(dict(TG.degree()).values())
degree_out_sequence = list(dict(TG.out_degree()).values())
degree_in_sequence = list(dict(TG.in_degree()).values())

# Labels reproduce the original headings exactly, including the trailing
# space in "degree " for the total-degree lines.
for label, sequence in (('degree ', degree_sequence),
                        ('degree IN', degree_in_sequence),
                        ('degree OUT', degree_out_sequence)):
    for stat_name, stat in (('Min', np.min), ('Max', np.max),
                            ('Median', np.median), ('Mean', np.mean)):
        print('%s %s %s' % (stat_name, label, stat(sequence)))
In [11]:
%%time
# Degree distribution: y[d] is the number of nodes whose degree equals d.
y = nx.degree_histogram(TG)

# Draw on the axes of figure 1 explicitly instead of going through the
# implicit "current axes" of the pyplot state machine.
fig = plt.figure(1)
ax = fig.gca()
ax.loglog(y, 'b-', marker='o')
ax.set_ylabel("Frequency")
ax.set_xlabel("Degree")

plt.draw()
plt.show()
In [12]:
%%time
# New dataframe for feature engineering: one row per graph node. The feature
# columns are added by the cells that follow.
df = pd.DataFrame({'nodes': list(TG.nodes())})
Add total degree [I]
In [13]:
%%time
# Total degree = number of edges (in + out) attached to the node; parallel
# edges each count once.
df['total_degree'] = [TG.degree(node) for node in df['nodes']]
Add degree in and degree out [II] [III]
In [14]:
%%time
# Number of incoming and outgoing edges per node.
for column, degree_of in (('degree_in', TG.in_degree),
                          ('degree_out', TG.out_degree)):
    df[column] = [degree_of(node) for node in df['nodes']]
Add unique predecessors and unique successors (each must be <= the corresponding degree_in / degree_out) [IV] [V]
In [15]:
%%time
# successors()/predecessors() report each neighbouring node once, however
# many parallel edges connect the pair, so these are counts of distinct
# counterparties.
# list(...) makes len() valid whether networkx hands back a list (1.x) or an
# iterator (2.x).
df['unique_successors'] = df['nodes'].map(lambda x: len(list(TG.successors(x))))
df['unique_predecessors'] = df['nodes'].map(lambda x: len(list(TG.predecessors(x))))
Add mean ether value going into the node [VI]
In [16]:
#Write a function
def get_mean_value_in(node, graph=None):
    '''
    Return the mean value of all the in transactions of a given node.

    node  -- node identifier to inspect.
    graph -- graph to read from; defaults to the notebook-level TG. It must
             provide in_edges_iter (networkx 1.x) or in_edges (networkx 2.x).

    Returns the arithmetic mean as a float, or NaN when the node has no
    incoming edge.
    '''
    if graph is None:
        graph = TG
    # networkx 2.x dropped the *_iter methods; use whichever is available.
    fetch_in_edges = getattr(graph, 'in_edges_iter', graph.in_edges)
    # Edges were loaded with add_weighted_edges_from, so the transaction
    # attributes sit one level down, under the 'weight' key.
    values = [float(data['weight']['value'])
              for _, _, data in fetch_in_edges(node, keys=False, data=True)]
    # Explicit guard: numpy would also give NaN for an empty list, but only
    # after emitting a RuntimeWarning for every such node.
    if not values:
        return np.nan
    return np.average(values)
In [17]:
%%time
# Add the feature: mean incoming transaction value per node.
# The function is passed directly; no wrapping lambda is needed.
df['mean_value_in'] = df['nodes'].map(get_mean_value_in)
Add mean ether value going out of the node [VII]
In [18]:
#Write a function
def get_mean_value_out(node, graph=None):
    '''
    Return the mean value of all the out transactions of a given node.

    node  -- node identifier to inspect.
    graph -- graph to read from; defaults to the notebook-level TG. It must
             provide out_edges_iter (networkx 1.x) or out_edges (networkx 2.x).

    Returns the arithmetic mean as a float, or NaN when the node has no
    outgoing edge.
    '''
    if graph is None:
        graph = TG
    # networkx 2.x dropped the *_iter methods; use whichever is available.
    fetch_out_edges = getattr(graph, 'out_edges_iter', graph.out_edges)
    # Edges were loaded with add_weighted_edges_from, so the transaction
    # attributes sit one level down, under the 'weight' key.
    values = [float(data['weight']['value'])
              for _, _, data in fetch_out_edges(node, keys=False, data=True)]
    # Explicit guard: numpy would also give NaN for an empty list, but only
    # after emitting a RuntimeWarning for every such node.
    if not values:
        return np.nan
    return np.average(values)
In [19]:
%%time
# Add the feature: mean outgoing transaction value per node.
# The function is passed directly; no wrapping lambda is needed.
df['mean_value_out'] = df['nodes'].map(get_mean_value_out)
Add standard deviation of ether value going into the node [VIII]
In [20]:
#Write a function
def get_std_value_in(node, graph=None):
    '''
    Return the std value of all the in transactions of a given node.

    node  -- node identifier to inspect.
    graph -- graph to read from; defaults to the notebook-level TG. It must
             provide in_edges_iter (networkx 1.x) or in_edges (networkx 2.x).

    Returns the population standard deviation (numpy default, ddof=0) as a
    float, or NaN when the node has no incoming edge.
    '''
    if graph is None:
        graph = TG
    # networkx 2.x dropped the *_iter methods; use whichever is available.
    fetch_in_edges = getattr(graph, 'in_edges_iter', graph.in_edges)
    # Edges were loaded with add_weighted_edges_from, so the transaction
    # attributes sit one level down, under the 'weight' key.
    values = [float(data['weight']['value'])
              for _, _, data in fetch_in_edges(node, keys=False, data=True)]
    # Explicit guard: numpy would also give NaN for an empty list, but only
    # after emitting a RuntimeWarning for every such node.
    if not values:
        return np.nan
    return np.std(values)
In [21]:
%%time
# Add the feature: standard deviation of incoming transaction values per node.
# The function is passed directly; no wrapping lambda is needed.
df['std_value_in'] = df['nodes'].map(get_std_value_in)
Add standard deviation of ether value going out of the node [IX]
In [22]:
#Write a function
def get_std_value_out(node, graph=None):
    '''
    Return the std value of all the out transactions of a given node.

    node  -- node identifier to inspect.
    graph -- graph to read from; defaults to the notebook-level TG. It must
             provide out_edges_iter (networkx 1.x) or out_edges (networkx 2.x).

    Returns the population standard deviation (numpy default, ddof=0) as a
    float, or NaN when the node has no outgoing edge.
    '''
    if graph is None:
        graph = TG
    # networkx 2.x dropped the *_iter methods; use whichever is available.
    fetch_out_edges = getattr(graph, 'out_edges_iter', graph.out_edges)
    # Edges were loaded with add_weighted_edges_from, so the transaction
    # attributes sit one level down, under the 'weight' key.
    values = [float(data['weight']['value'])
              for _, _, data in fetch_out_edges(node, keys=False, data=True)]
    # Explicit guard: numpy would also give NaN for an empty list, but only
    # after emitting a RuntimeWarning for every such node.
    if not values:
        return np.nan
    return np.std(values)
In [23]:
%%time
# Add the feature: standard deviation of outgoing transaction values per node.
# The function is passed directly; no wrapping lambda is needed.
df['std_value_out'] = df['nodes'].map(get_std_value_out)
In [24]:
# Preview the last ten rows of the engineered feature table.
df.tail(n=10)
Out[24]:
In [ ]: