In [0]:
!pip install graphistry -q
!pip install awscli -q
In [0]:
# Configure the AWS CLI used by the shell cells below: region first, then the
# access key pair. Replace the FILL_ME_IN placeholders before running.
# NOTE(review): values typed here are stored in the notebook file -- clear
# them again before sharing or committing the notebook.
!aws configure set region us-west-2
!aws configure set aws_access_key_id "FILL_ME_IN"
!aws configure set aws_secret_access_key "FILL_ME_IN"
In [0]:
import pandas as pd
import json
import graphistry
#graphistry.register(key='FILL_ME_IN', server='FILL_ME_IN')
In [113]:
# List the CloudWatch log groups visible to the configured credentials; the
# next cell reads from the group named VPCFlowDemo.
!aws logs describe-log-groups
In [40]:
# Dump the events of the VPCFlowDemo log group into data.json (the CLI's
# output is redirected to the file), then list the file to confirm it exists.
!aws logs filter-log-events --log-group-name VPCFlowDemo > data.json
!ls -al data.json
In [108]:
# Load the exported log events and split each event's space-separated
# `message` string into its fields, producing one DataFrame row per event.
with open('data.json', 'r') as f:
    data = json.load(f)

# Column names applied, in order, to the fields of every message.
cols = ['version', 'accountid', 'interfaceid', 'src_ip', 'dest_ip', 'src_port', 'dest_port', 'protocol', 'packets', 'bytes', 'time_start', 'time_end', 'action', 'status']

# Supplying `columns=` at construction (instead of assigning `df.columns`
# afterwards) keeps the frame well-formed when the export holds no events;
# the post-hoc assignment raises a length-mismatch error on an empty frame.
df = pd.DataFrame(
    [x['message'].split(" ") for x in data['events']],
    columns=cols,
)
print('# rows', len(df))

# `sample(n)` raises when n exceeds the row count, so cap the preview size.
df.sample(min(3, len(df)))
Out[108]:
In [114]:
# Aggregate the raw flow records into one row per
# (src_ip, dest_ip, interfaceid, dest_port, protocol, action, status) group,
# with min/max of the time columns and min/max/sum/count of packets and bytes.

# Int->Float for precision errors
df2 = df.copy()
for numeric_col in ['packets', 'bytes']:
    # VPC flow-log records with NODATA / SKIPDATA status carry '-' instead of
    # a number in these fields; a plain astype(float) raises ValueError on
    # them. Coerce such placeholders to NaN so the aggregations skip them.
    df2[numeric_col] = pd.to_numeric(df2[numeric_col], errors='coerce').astype(float)

summary_df = (
    df2
    .groupby(['src_ip', 'dest_ip', 'interfaceid', 'dest_port', 'protocol', 'action', 'status'])
    .agg({
        'time_start': ['min', 'max'],
        'time_end': ['min', 'max'],
        'packets': ['min', 'max', 'sum', 'count'],
        'bytes': ['min', 'max', 'sum', 'count'],
    })
    .reset_index()
)

# Flatten the two-level column index: ('bytes', 'sum') -> 'bytes_sum', while
# the group keys, whose second level is empty, keep their plain name.
summary_df.columns = ["_".join(part for part in col if part) for col in summary_df.columns]
print('# rows', len(summary_df))

# `sample(n)` raises when n exceeds the row count, so cap the preview size.
summary_df.sample(min(3, len(summary_df)))
Out[114]:
In [110]:
# Build a hypergraph from the summarized flows, using the source and
# destination IPs as the entity columns, then render its graph with each
# edge titled by the summed byte count.
node_columns = ['src_ip', 'dest_ip']  # 'dest_port', 'interfaceid', 'action', ...
hg = graphistry.hypergraph(summary_df, entity_types=node_columns, direct=True)

flow_plotter = hg['graph'].bind(edge_title='bytes_sum')
flow_plotter.plot()
Out[110]:
In [0]: