In [17]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
In [12]:
%matplotlib inline
In [4]:
df = pd.read_csv('dns.txt', names=['time', 'src', 'dest'])
In [8]:
df = df[df.time < 3600]
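A quick sanity check on the loaded frame can confirm the assumed three-column layout of dns.txt (a timestamp in seconds plus source and destination hosts) before and after the one-hour cut; this is only an inspection sketch, not part of the original analysis.

# Inspect the parsed columns and the rows that survive the 1-hour filter
print df.dtypes
print df.head()
print "Rows in first hour:", len(df), "time range:", df.time.min(), "-", df.time.max()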
In [30]:
G = nx.from_pandas_dataframe(df, 'src', 'dest', create_using=nx.DiGraph())
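nx.from_pandas_dataframe belongs to the NetworkX 1.x API and was later removed; if you rerun this notebook on NetworkX 2.x or newer, the equivalent call is from_pandas_edgelist. A sketch of the replacement:

# NetworkX >= 2.0 equivalent of from_pandas_dataframe
G = nx.from_pandas_edgelist(df, source='src', target='dest', create_using=nx.DiGraph())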
In [79]:
plt.figure(figsize=(14,14))
nx.draw(G)
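nx.draw with default settings tends to produce an unreadable hairball on a graph of this size. A sketch of a slightly more legible rendering, with a precomputed layout and smaller, translucent nodes (the parameter values are guesses to be tuned against the actual node count):

# Precompute positions, shrink nodes, hide arrowheads for readability
pos = nx.spring_layout(G, k=0.1)  # k (optimal node spacing) is a guess
plt.figure(figsize=(14, 14))
nx.draw(G, pos, node_size=10, alpha=0.4, with_labels=False, arrows=False)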
In [32]:
N, K = G.order(), G.size()
avg_deg = float(K) / N
print "Nodes: ", N
print "Edges: ", K
print "Average degree: ", avg_deg
print "SCC: ", nx.number_strongly_connected_components(G)
print "WCC: ", nx.number_weakly_connected_components(G)
In [47]:
in_degrees = G.in_degree() # dictionary node:degree
in_values = sorted(set(in_degrees.values()))
in_hist = [in_degrees.values().count(x) for x in in_values]
out_degrees = G.out_degree() # dictionary node:degree
out_values = sorted(set(out_degrees.values()))
out_hist = [out_degrees.values().count(x) for x in out_values]
plt.figure(figsize=(10,8))
plt.grid(True)
plt.loglog(in_values, in_hist, 'ro-') # in-degree
plt.loglog(out_values, out_hist, 'bv-') # out-degree
plt.legend(['In-degree', 'Out-degree'])
plt.xlabel('Degree')
plt.ylabel('Number of nodes')
plt.title('DNS requests for 1 hour')
plt.xlim([0.9, 2*10**2]) # lower limit must be positive on a log axis
#plt.savefig('./output/cam_net_degree_distribution.pdf')
#plt.close()
Out[47]:
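The list-comprehension histogram above is quadratic in the number of distinct degree values. A pandas value_counts sketch produces the same in-/out-degree frequency tables more directly (assuming the NetworkX 1.x API, where in_degree() returns a dict):

# Same degree histograms via pandas
in_counts = pd.Series(in_degrees.values()).value_counts().sort_index()
out_counts = pd.Series(out_degrees.values()).value_counts().sort_index()
plt.figure(figsize=(10, 8))
plt.loglog(in_counts.index, in_counts.values, 'ro-')
plt.loglog(out_counts.index, out_counts.values, 'bv-')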
In [41]:
print in_hist
print out_hist
In [46]:
sorted(in_degrees.values(), reverse=True)[:20]
Out[46]:
In [48]:
sorted(out_degrees.values(), reverse=True)[:20]
Out[48]:
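The sorted degree values show how skewed the distribution is, but not which hosts are responsible. A sketch that keeps the node identifiers (presumably the host/IP strings from dns.txt) next to the counts:

# Top talkers by degree, keeping the node labels
top_in = sorted(in_degrees.items(), key=lambda kv: kv[1], reverse=True)[:20]
top_out = sorted(out_degrees.items(), key=lambda kv: kv[1], reverse=True)[:20]
print top_in
print top_out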
In [62]:
G_ud = G.to_undirected()
# Clustering coefficient of all nodes (in a dictionary)
clust_coefficients = nx.clustering(G_ud)
# Average clustering coefficient
avg_clust = sum(clust_coefficients.values()) / len(clust_coefficients)
print avg_clust
# Or use directly the built-in method
print nx.average_clustering(G_ud)
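Alongside the average (local) clustering coefficient, the global transitivity, the ratio of triangles to connected triples, is a common companion figure; a one-line comparison sketch:

# Global transitivity for comparison with the average clustering coefficient
print nx.transitivity(G_ud)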
In [67]:
# Extract the undirected connected components and keep the largest one
G_components = list(nx.connected_component_subgraphs(G_ud))
G_mc = max(G_components, key=len)
# Betweenness centrality
bet_cen = nx.betweenness_centrality(G_mc)
#print bet_cen
# Closeness centrality
clo_cen = nx.closeness_centrality(G_mc)
#print clo_cen
# Eigenvector centrality
eig_cen = nx.eigenvector_centrality(G_mc, max_iter=10000)
#print eig_cen
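Exact betweenness centrality is expensive on the largest component (roughly O(V*E)). NetworkX can approximate it by sampling source nodes via the k parameter, trading accuracy for speed; the sample size below is a guess:

# Approximate betweenness by sampling source nodes
bet_cen_approx = nx.betweenness_centrality(G_mc, k=min(100, G_mc.order()))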
In [68]:
def get_top_keys(dictionary, top):
    items = dictionary.items()
    items.sort(reverse=True, key=lambda x: x[1])
    return map(lambda x: x[0], items[:top])
top_bet_cen = get_top_keys(bet_cen,10)
top_clo_cen = get_top_keys(clo_cen,10)
top_eig_cent = get_top_keys(eig_cen,10)
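An equivalent top-N selection can avoid sorting the whole dictionary by using heapq.nlargest; a sketch of the same helper:

# Equivalent top-N selection without a full sort
import heapq
def get_top_keys_heap(dictionary, top):
    return [k for k, v in heapq.nlargest(top, dictionary.items(), key=lambda kv: kv[1])]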
In [69]:
print top_bet_cen
print top_clo_cen
print top_eig_cent
In [75]:
reader = pd.read_csv('flows.txt', names=['time', 'duration', 'src', 'src_port', 'dest',
                                         'dest_port', 'protocol', 'packet_count', 'byte_count'],
                     chunksize=102170)
for chunk in reader:
    df2 = chunk
    break
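The loop only ever consumes the first chunk before breaking; reading the same number of rows with nrows (or pulling a single chunk with next) is equivalent and a bit clearer. A sketch:

# Equivalent ways to take only the first 102170 rows of flows.txt
cols = ['time', 'duration', 'src', 'src_port', 'dest',
        'dest_port', 'protocol', 'packet_count', 'byte_count']
df2 = pd.read_csv('flows.txt', names=cols, nrows=102170)
# or: df2 = next(iter(pd.read_csv('flows.txt', names=cols, chunksize=102170)))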
In [76]:
df2.describe()
Out[76]:
In [78]:
G2 = nx.from_pandas_dataframe(df2, 'src', 'dest', create_using=nx.DiGraph())
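The flow records also carry per-edge quantities; from_pandas_dataframe can attach one of them (here byte_count, a column named above) as an edge attribute, which would allow weighted analyses later. Note that for repeated src/dest pairs in a DiGraph only the last row's attribute is kept. A sketch:

# Keep byte_count on the edges for weighted analyses
G2w = nx.from_pandas_dataframe(df2, 'src', 'dest', edge_attr='byte_count',
                               create_using=nx.DiGraph())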
In [80]:
plt.figure(figsize=(16,16))
nx.draw(G2)