Graph Construction and feature engineering

Load library


In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import networkx as nx
import pylab as plt

Load data


In [2]:
edges = pd.read_csv('../data/edges.csv').drop('Unnamed: 0',1)
nodes = pd.read_csv('../data/nodes.csv').drop('Unnamed: 0',1)

Graph generation & analysis

Build proper edge array


In [6]:
#Simple way (non parallel computing)
edge_array = []
for i in range(0,edges.shape[0]):
    edge_array.append((edges['from'][i],edges['to'][i],{'value':edges['value'][i],'time':edges['timestamp'][i],'hash':edges['hash'][i]}))

Generate a MultiDigraph with networkx and edge array


In [8]:
%%time
TG=nx.MultiDiGraph()
TG.add_weighted_edges_from(edge_array)


CPU times: user 52.3 s, sys: 1.65 s, total: 54 s
Wall time: 53.9 s

In [9]:
%%time
nx.write_gml(TG,'../data/graph.gml')


CPU times: user 4min 59s, sys: 2.83 s, total: 5min 1s
Wall time: 5min 3s

In [10]:
%%time
# Network Characteristics
print 'Number of nodes:', TG.number_of_nodes() 
print 'Number of edges:', TG.number_of_edges() 
print 'Number of connected components:', nx.number_connected_components(TG.to_undirected())

# Degree
degree_sequence = TG.degree().values()
degree_out_sequence = TG.out_degree().values()
degree_in_sequence = TG.in_degree().values()

print "Min degree ", np.min(degree_sequence)
print "Max degree ", np.max(degree_sequence)
print "Median degree ", np.median(degree_sequence)
print "Mean degree ", np.mean(degree_sequence)

print "Min degree IN", np.min(degree_in_sequence)
print "Max degree IN", np.max(degree_in_sequence)
print "Median degree IN", np.median(degree_in_sequence)
print "Mean degree IN", np.mean(degree_in_sequence)

print "Min degree OUT", np.min(degree_out_sequence)
print "Max degree OUT", np.max(degree_out_sequence)
print "Median degree OUT", np.median(degree_out_sequence)
print "Mean degree OUT", np.mean(degree_out_sequence)


Number of nodes: 453155
Number of edges: 9440492
Number of connected components: 97
Min degree  1
Max degree  1240383
Median degree  2.0
Mean degree  41.6656199314
Min degree IN 0
Max degree IN 1084279
Median degree IN 1.0
Mean degree IN 20.8328099657
Min degree OUT 0
Max degree OUT 1240380
Median degree OUT 1.0
Mean degree OUT 20.8328099657
CPU times: user 3min 11s, sys: 4.41 s, total: 3min 15s
Wall time: 3min 15s

In [11]:
%%time
# Degree distribution
y=nx.degree_histogram(TG)
plt.figure(1)
plt.loglog(y,'b-',marker='o')
plt.ylabel("Frequency")
plt.xlabel("Degree")
plt.draw()
plt.show()


CPU times: user 3.45 s, sys: 1.05 s, total: 4.5 s
Wall time: 3.44 s

Features Engineering


In [12]:
%%time
#New dataframe for feature engineering
df = pd.DataFrame()
df['nodes']=TG.nodes()


CPU times: user 184 ms, sys: 4 ms, total: 188 ms
Wall time: 186 ms

Add total degree [I]


In [13]:
%%time
df['total_degree']=df['nodes'].map(lambda x: TG.degree(x))


CPU times: user 2.59 s, sys: 12 ms, total: 2.6 s
Wall time: 2.6 s

Add degree in and degree out [II] [III]


In [14]:
%%time
df['degree_in']=df['nodes'].map(lambda x: TG.in_degree(x))
df['degree_out']=df['nodes'].map(lambda x: TG.out_degree(x))


CPU times: user 3.9 s, sys: 12 ms, total: 3.92 s
Wall time: 3.91 s

Add unique predecessors and unique successors (must be < degree_in and out) [IV][V]


In [15]:
%%time
df['unique_successors']=df['nodes'].map(lambda x: len((TG.successors(x))))
df['unique_predecessors']=df['nodes'].map(lambda x: len((TG.predecessors(x))))


CPU times: user 1.93 s, sys: 4 ms, total: 1.93 s
Wall time: 1.93 s

Add mean ether value going in the node [VI]


In [16]:
#Write a function
def get_mean_value_in(node):
    '''
    Return the mean value of all the in transactions of a given node
    '''
    #Get the in edges list
    edges = TG.in_edges_iter(node, keys=False, data=True)
    #Build a list of all the values of the in edges list
    values=[]
    for edge in edges:
        values.append(float(edge[2]['weight']['value']))
    #Compute the mean of this list
    mean = np.average(values)
    
    return mean

In [17]:
%%time
#Add the feature
df['mean_value_in']=df['nodes'].map(lambda x: get_mean_value_in(x))


/home/julien_ha/.local/lib/python2.7/site-packages/numpy/core/_methods.py:59: RuntimeWarning: Mean of empty slice.
  warnings.warn("Mean of empty slice.", RuntimeWarning)
CPU times: user 20.6 s, sys: 104 ms, total: 20.7 s
Wall time: 20.6 s

Add mean ether value going out the node [VII]


In [18]:
#Write a function
def get_mean_value_out(node):
    '''
    Return the mean value of all the out transactions of a given node
    '''
    #Get the out edges list
    edges = TG.out_edges_iter(node, keys=False, data=True)
    #Build a list of all the values of the out edges list
    values=[]
    for edge in edges:
        values.append(float(edge[2]['weight']['value']))
    #Compute the mean of this list
    mean = np.average(values)
    return mean

In [19]:
%%time
#Add the feature
df['mean_value_out']=df['nodes'].map(lambda x: get_mean_value_out(x))


CPU times: user 20 s, sys: 80 ms, total: 20 s
Wall time: 20 s

Add std ether value going in the node [VIII]


In [20]:
#Write a function
def get_std_value_in(node):
    '''
    Return the std value of all the in transactions of a given node
    '''
    #Get the in edges list
    edges = TG.in_edges_iter(node, keys=False, data=True)
    #Build a list of all the values of the in edges list
    values=[]
    for edge in edges:
        values.append(float(edge[2]['weight']['value']))
    #Compute the std of this list
    std = np.std(values)
    
    return std

In [21]:
%%time
#Add the feature
df['std_value_in']=df['nodes'].map(lambda x: get_std_value_in(x))


/home/julien_ha/.local/lib/python2.7/site-packages/numpy/core/_methods.py:82: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
CPU times: user 31 s, sys: 144 ms, total: 31.1 s
Wall time: 31 s

Add std ether value going out the node [IX]


In [22]:
#Write a function
def get_std_value_out(node):
    '''
    Return the std value of all the out transactions of a given node
    '''
    #Get the out edges list
    edges = TG.out_edges_iter(node, keys=False, data=True)
    #Build a list of all the values of the out edges list
    values=[]
    for edge in edges:
        values.append(float(edge[2]['weight']['value']))
    #Compute the std of this list
    std = np.std(values)
    
    return std

In [23]:
%%time
#Add the feature
df['std_value_out']=df['nodes'].map(lambda x: get_std_value_out(x))


CPU times: user 29.4 s, sys: 96 ms, total: 29.5 s
Wall time: 29.5 s

In [24]:
df.tail(10)


Out[24]:
nodes total_degree degree_in degree_out unique_successors unique_predecessors mean_value_in mean_value_out std_value_in std_value_out
453145 0xf61f8c3a516d168ffb0af2a73efe38e5f4d91c89 2 1 1 1 1 4.916000e+19 4.906000e+19 0.000000e+00 0.000000e+00
453146 0x3afbe386c9a7175273a59073bff95b8d6227af8c 7 6 1 1 3 2.390808e+20 1.000000e+18 3.491246e+20 0.000000e+00
453147 0xf9e44886996df8c2faa8b1c994fa93db2f6370b8 8 4 4 1 1 6.545107e+19 6.545002e+19 5.508868e+19 5.508868e+19
453148 0xebb0745ea25ebcfc8515fcffe9050b70853a20ca 2 2 0 0 1 5.076577e+15 NaN 5.076577e+15 NaN
453149 0x3979be329bfc1ee0f34a038661adb14fc1777805 9 5 4 2 3 4.473706e+18 5.591084e+18 5.300123e+18 5.415933e+18
453150 0x028433ba0094fde690f4d8638d222b790e7d2ee0 10 6 4 1 3 6.516618e+15 3.130702e+15 5.001146e+15 1.478797e+15
453151 0xac2bbdf9a2488e9274639377a3e5ced716131c42 2 1 1 1 1 3.869389e+17 3.865189e+17 0.000000e+00 0.000000e+00
453152 0x518228781b079d9919eef7660bad73bf5e6917a3 2 1 1 1 1 1.350173e+18 1.349573e+18 0.000000e+00 0.000000e+00
453153 0xb8d0bf3023c7bb941be874c3b9f80eff21808a6d 2 1 1 1 1 9.121600e+14 0.000000e+00 0.000000e+00 0.000000e+00
453154 0x9f18a06c42373ca61f81937593665126e795a0ec 2 1 1 1 1 6.600000e+18 6.599580e+18 0.000000e+00 0.000000e+00

In [ ]: