In [ ]:
!wget -Pq data/ https://blazingsql-colab.s3.amazonaws.com/netflow_parquet/1_0_0.parquet
!wget -Pq data/ https://blazingsql-colab.s3.amazonaws.com/netflow_parquet/1_1_0.parquet
!wget -Pq data/ https://blazingsql-colab.s3.amazonaws.com/netflow_parquet/1_2_0.parquet
!wget -Pq data/ https://blazingsql-colab.s3.amazonaws.com/netflow_parquet/1_3_0.parquet
In [1]:
!ls -alh data
In [2]:
from blazingsql import BlazingContext
# Single BlazingSQL context shared by the rest of the notebook: later cells
# use it to register the parquet table and to run the SQL query.
bc = BlazingContext()
In [3]:
# Capture the current working directory from the shell. IPython returns the
# command output as a list of lines, so the path itself is `local_path[0]`.
local_path = !pwd
# Echo the captured value as this cell's output.
local_path
Out[3]:
In [4]:
# Build the absolute wildcard path covering the downloaded shards, then
# register it with the BlazingSQL context as the table `netflow`.
# NOTE(review): relies on create_table accepting a `*` wildcard path — confirm
# against the installed BlazingSQL version.
netflow_glob = local_path[0] + '/data/*_0.parquet'
bc.create_table('netflow', netflow_glob)
Out[4]:
In [5]:
%%time
result = bc.sql('''
SELECT
a.firstSeenSrcIp as source,
a.firstSeenDestIp as destination,
count(a.firstSeenDestPort) as targetPorts,
SUM(a.firstSeenSrcTotalBytes) as bytesOut,
SUM(a.firstSeenDestTotalBytes) as bytesIn,
SUM(a.durationSeconds) as durationSeconds,
MIN(parsedDate) as firstFlowDate,
MAX(parsedDate) as lastFlowDate,
COUNT(*) as attemptCount
FROM
netflow a
GROUP BY
a.firstSeenSrcIp,
a.firstSeenDestIp
''').get()
# Keep the result's `columns` payload as the edge table used by later cells.
# NOTE(review): presumably a GPU dataframe, since later cells call
# `to_pandas()` on it — confirm for the installed BlazingSQL version.
gdf = result.columns
# Preview the first three aggregated rows as the cell output.
gdf.head(n=3)
Out[5]:
In [6]:
import graphistry
# Register with Graphistry using API version 2, which (per the original note)
# uploads data as protobuf instead of JSON.
graphistry.register(api=2)
In [7]:
# Number of aggregated (source, destination) rows that will be plotted.
# The length is taken on `gdf` directly rather than on `gdf.to_pandas()`:
# converting the whole table to pandas just to count rows is unnecessary work.
# Assumes `gdf` is a cuDF DataFrame, whose `len()` is its row count.
len(gdf)
Out[7]:
In [8]:
# Tell Graphistry which columns hold the edge endpoints, convert the edge
# table to pandas, then plot; the plot call's return value is the cell output.
edge_plotter = graphistry.bind(source='source', destination='destination')
edges_pdf = gdf.to_pandas()
edge_plotter.plot(edges_pdf)
Out[8]: