Graphistry is great -- Graphistry and RAPIDS/BlazingDB is better!
This tutorial series visually analyzes Zeek/Bro network connection logs using different compute engines:
Part II Contents:
Time a full ETL & visual analysis flow using the GPU-based RAPIDS Python cuDF bindings and Graphistry:
TIP: If you get out-of-memory errors, you usually must restart the kernel and refresh the page.
In [ ]:
#!pip install graphistry -q
import pandas as pd
import numpy as np
import cudf
import graphistry
#graphistry.register(key='MY_KEY', protocol='https', server='graphistry.site.com')
graphistry.__version__
In [ ]:
%%time
# Download the gzipped conn.log from secrepo.com (maccdc2012 path), decompress it
# into ./conn.log in the current working directory, then print the first three
# lines as a quick sanity check. The %%time cell magic must stay on the first line.
!curl https://www.secrepo.com/maccdc2012/conn.log.gz | gzip -d > conn.log
!head -n 3 conn.log
In [ ]:
# OPTIONAL: For slow devices, work on a subset
#!awk 'NR % 20 == 0' < conn.log > conn-5pc.log
#!awk 'NR % 100 == 0' < conn.log > conn-1pc.log
#!nvidia-smi
In [ ]:
# Column schema for conn.log as (column name, parser dtype) pairs, in file order.
# Keeping each name next to its dtype makes it harder for the two to drift apart.
# NOTE(review): 'duration' is parsed as 'int' -- confirm that is intended if the
# log stores fractional seconds.
_CONN_LOG_SCHEMA = [
    ("time", 'date'),
    ("uid", 'str'),
    ("id.orig_h", 'str'),
    ("id.orig_p", 'int'),
    ("id.resp_h", 'str'),
    ("id.resp_p", 'int'),
    ("proto", 'str'),
    ("service", 'str'),
    ("duration", 'int'),
    ("orig_bytes", 'int'),
    ("resp_bytes", 'int'),
    ("conn_state", 'str'),
    ("local_orig", 'str'),
    ("missed_bytes", 'int'),
    ("history", 'str'),
    ("orig_pkts", 'int'),
    ("orig_ip_bytes", 'int'),
    ("resp_pkts", 'int'),
    ("resp_ip_bytes", 'int'),
    ("tunnel_parents", 'str'),
]

# Load the tab-separated, header-less log into a cuDF DataFrame; '-' marks a
# missing value in the file and is read as null.
cdf = cudf.read_csv(
    "./conn.log",
    sep="\t",
    header=None,
    names=[_name for _name, _dtype in _CONN_LOG_SCHEMA],
    dtype=[_dtype for _name, _dtype in _CONN_LOG_SCHEMA],
    na_values=['-'],
    index_col=False,
)
In [ ]:
# fillna: replace nulls with 0 in every column except the ones listed below
# (the columns that were read with dtype 'str').
_SKIP_FILL_COLUMNS = ['uid', 'id.orig_h', 'id.resp_h', 'proto', 'service', 'conn_state', 'history', 'tunnel_parents', 'local_orig']
for column_name in cdf.columns:
    if column_name not in _SKIP_FILL_COLUMNS:
        cdf[column_name] = cdf[column_name].fillna(0)
In [ ]:
# Report how many rows were loaded, then display the first three
# (the trailing expression is the cell's output).
loaded_row_count = len(cdf)
print('# rows', loaded_row_count)
cdf.head(3)
In [ ]:
# Upper bound on the number of leading rows fed into the summary step (12 million).
LIMIT = 12_000_000
In [ ]:
# Per-group statistics to compute, keyed by source column.
_SUMMARY_AGGREGATIONS = {
    'time': ['min', 'max', 'count'],
    'id.resp_p': ['count'],
    'uid': ['count'],
    'duration': ['min', 'max', 'mean'],
    'orig_bytes': ['min', 'max', 'sum', 'mean'],
    'resp_bytes': ['min', 'max', 'sum', 'mean'],
    'sum_bytes': ['min', 'max', 'sum', 'mean'],
}

# Pipeline: take the first LIMIT rows, derive an int64 'sum_bytes' column from
# 'orig_bytes' and 'resp_bytes' via the row kernel, then aggregate per
# (origin host, responder host, connection state) group.
# NOTE(review): `sum_bytes` (the kernel passed to apply_rows) is not defined in
# the visible part of this notebook -- confirm it is defined before this cell runs.
# The chain is kept as one expression so no intermediate frames stay referenced.
cdf_summary = (
    cdf.head(LIMIT)
    .apply_rows(
        sum_bytes,
        incols=['orig_bytes', 'resp_bytes'],
        outcols={'sum_bytes': np.int64},
        kwargs={},
    )
    .groupby(['id.orig_h', 'id.resp_h', 'conn_state'])
    .agg(_SUMMARY_AGGREGATIONS)
)
In [ ]:
# Report how many summary groups were produced, then display the first three
# as a pandas DataFrame (the trailing expression is the cell's output).
summary_row_count = len(cdf_summary)
print('# rows', summary_row_count)
cdf_summary.head(3).to_pandas()
In [ ]:
# The two host-address columns used as graph entities.
_ENDPOINT_COLUMNS = ('id.orig_h', 'id.resp_h')

# Build a hypergraph from the summary table, copied to pandas first.
# Both endpoint columns are listed under a single 'ip' category in the options.
# NOTE(review): presumably this makes an address appearing in either column map
# to the same node, and direct=True links the entities to each other directly --
# confirm against the PyGraphistry hypergraph documentation.
hg = graphistry.hypergraph(
    cdf_summary.to_pandas(),
    list(_ENDPOINT_COLUMNS),
    direct=True,
    opts={'CATEGORIES': {'ip': list(_ENDPOINT_COLUMNS)}},
)
In [ ]:
# Render the hypergraph's 'graph' entry; the plot() result is the cell's output.
hypergraph_plotter = hg['graph']
hypergraph_plotter.plot()
In [ ]: