Original: Dirk Loss, http://dirk-loss.de, @dloss. v1.1, 2013-06-02
Modified for Python3 on Win32 & further modified by: William George 2015-04-20
In [16]:
# This whole business is totally unnecessary if your path is set up right. But if it's not,
# this is probably easier than actually fixing it.
%load_ext autoreload
import os
# Prepend the separator so the new entry is not fused onto the end of the
# last existing PATH entry.  (The original appended a *trailing* separator
# instead, which breaks whenever PATH does not already end in one.)
wireshark_path = os.pathsep + "C:\\Program Files\\Wireshark\\"
# or, if it's under 'program files(x86)'...
# wireshark_path = os.pathsep + "C:\\Program Files (x86)\\Wireshark\\"
os.environ['path'] += wireshark_path
In [17]:
from utilities import *
from pprint import *
In [18]:
%autoreload
In [19]:
pcap_folder = 'C:\\Users\\william.george\\Desktop\\SUA-Test-Data\\'
os.chdir(pcap_folder)
os.getcwd()
!dir
In [77]:
pcap_file = pcap_folder + 'test_2_merge.pcap'
output_file = pcap_folder + 'frame.len'
In [ ]:
!tshark -n -r $pcap_file -T fields -Eheader=y -e frame.number -e frame.len > $output_file
Let's have a look at the file:
In [21]:
import pandas as pd
For a better overview, we plot the frame length over time.
We initialise IPython to show inline graphics:
In [22]:
%pylab inline
Set a figure size in inches:
In [23]:
figsize(17,10)
Pandas automatically uses Matplotlib for plotting. We plot with small dots and an alpha channel of 0.2:
So there are always lots of small packets (< 100 bytes) and lots of large packets (> 1400 bytes). Some bursts of packets with other sizes (around 400 bytes, 1000 bytes, etc.) can be clearly seen.
Passing all those arguments to tshark is quite cumbersome. Here is a convenience function that reads the given fields into a Pandas DataFrame:
In [60]:
import subprocess
import datetime
import pandas as pd
def read_pcap(filename, fields=None, display_filter=None,
              timeseries=False, strict=False, outfile=None):
    """ Read PCAP file into Pandas DataFrame object.
    Uses tshark command-line tool from Wireshark (must be on PATH).

    filename: Name or full path of the PCAP file to read
    fields: List of fields to include as columns
    display_filter: Additional filter(s) to restrict frames.
        Filters are concatenated with ' and '.  If one or more filters
        need to be ORed together, supply them as a single string, e.g.
        ['frame.len > 60', '(ip.addr == 10.10.10.10 or ip.addr == 20.20.20.20)']
        gives '-2 -R "frame.len > 60 and (ip.addr == 10.10.10.10 or ip.addr == 20.20.20.20)"'
    strict: Only include frames that contain all given fields
        (Default: False)
    timeseries: Create DatetimeIndex from frame.time_epoch
        (Default: False)
    outfile: If given, tshark's raw output is written to this file and
        the file name is returned instead of a DataFrame.  NOTE: the
        tshark subprocess is not waited on, so the file may still be
        growing when this function returns.

    Syntax for fields and display_filter is specified in
    Wireshark's Display Filter Reference:
    http://www.wireshark.org/docs/dfref/
    """
    # Use None sentinels instead of mutable default arguments, and copy any
    # caller-supplied lists: the original implementation aliased `fields`
    # into `display_filters` when strict=True and then appended to it,
    # silently mutating the caller's list.
    fields = list(fields) if fields else []
    display_filter = list(display_filter) if display_filter else []
    if timeseries:
        fields = ["frame.time_epoch"] + fields
    fieldspec = " ".join("-e %s" % f for f in fields)

    # strict: requiring every requested field to be present doubles as a
    # display filter.
    display_filters = list(fields) if strict else []
    display_filters += display_filter
    display_filters = [f for f in display_filters if f]
    # Only emit -2 -R when there is an actual filter: tshark rejects an
    # empty filter string ('-2 -R ""'), which the original produced when
    # no filters were given.
    filterspec = ('-2 -R "%s"' % " and ".join(display_filters)
                  if display_filters else '')

    options = "-r %s -n -T fields -Eheader=y" % filename
    cmd = "tshark %s %s %s" % (options, filterspec, fieldspec)
    print('filterspec:{0}\n'.format(filterspec),
          'display_filters:{0}\n'.format(display_filters),
          'options:{0}\n'.format(options),
          'cmd:{0}\n'.format(cmd)
          )
    # NOTE(review): cmd goes through the shell and `filename` is not
    # quoted -- paths containing spaces or shell metacharacters will
    # break.  Kept shell=True for compatibility with existing callers.
    proc_arguments = {'shell': True}
    if outfile is not None:
        # The child inherits its own handle to the file, so closing ours
        # right away is safe; the data just may not all be there yet.
        with open(outfile, 'w') as f:
            proc_arguments['stdout'] = f
            subprocess.Popen(cmd, **proc_arguments)
        return outfile
    proc_arguments['stdout'] = subprocess.PIPE
    proc = subprocess.Popen(cmd, **proc_arguments)
    if timeseries:
        df = pd.read_table(proc.stdout,
                           index_col="frame.time_epoch",
                           parse_dates=True,
                           date_parser=datetime.datetime.fromtimestamp)
    else:
        # The original passed parse_dates='frame.time_epoch' here even
        # though that column is only requested when timeseries=True,
        # making pandas choke on the missing column.
        df = pd.read_table(proc.stdout)
    return df
In [ ]:
# # original read call
# df=read_pcap(pcap_file, fields = ["frame.len", "ip.src", "ip.dst", 'tcp.stream', 'tcp.srcport', 'tcp.dstport'], timeseries=True).dropna()
# df
df=read_pcap(pcap_file, fields = ["frame.len", "ip.src", "ip.dst", 'tcp.stream', 'tcp.srcport', 'tcp.dstport'], display_filter=['ip', 'tcp'], timeseries=True, outfile=output_file)
In [154]:
# Re-read the dump that read_pcap wrote to disk, naming the columns by
# hand and skipping tshark's header row.
df = pd.read_table(output_file, names=['time','len','ip.src','ip.dst','stream','tcp.src', 'tcp.dst'], skiprows=1)
import dateutil
# Sanity-check the epoch -> datetime conversion on one known timestamp.
sample_time = 1429133053.239977000
print(pd.to_datetime(sample_time, unit='s'))
df.time = pd.to_datetime(df.time, unit='s')
# Display every row except those belonging to a handful of streams we
# have already looked at.
excluded_streams = [0, 1, 2, 3, 145, 141]
df[[stream_id not in excluded_streams for stream_id in df['stream']]]
Out[154]:
In [155]:
df2 = df.head(100)
In [158]:
df.head(100).to_json(date_unit='us')
Out[158]:
Then we re-sample the timeseries into buckets of 1 second, summing over the lengths of all frames that were captured in that second:
In [161]:
df[df.stream == 1]
Out[161]:
In [ ]:
# THIS WHOLE BLOCK IS COMMENTED OUT BECAUSE I DON'T TRUST IT RIGHT NOW. THIS IS THE OLD WAY.
# flows = framelen.groupby(('tcp.stream', 'ip.src'))
# keys = sorted(list(flows.groups.keys()), key=lambda x: x[0])
# #list_streams = []
# #for key in keys:( # zip (iter(x),...)
# def f(x):
# print('running one time!')
# return pd.Series({'frame.len':x[0],'ip.src':x[1]})
# def extract_flow(flow):
# ipdst = flow['ip.dst'][0]
# tcpstrm = flow['tcp.stream'][0]
# tcpsrc = flow['tcp.srcport'][0]
# tcpdst = flow['tcp.dstport'][0]
# flow_Bps = flow.resample("S", how="sum")
# flow_filter = np.isnan(flow_Bps['tcp.dstport']) == False
# flow_Bps.loc[flow_filter, "tcp.stream" : "tcp.dstport"] = (tcpstrm, tcpsrc, tcpdst)
# return flow_Bps.loc[flow_filter]
# flow_list = []
# for key in keys:
# flow_list.append(extract_flow(flows.get_group(key)))
# pprint(flow_list[0].head(2))
# #stream_df = pd.DataFrame.from_records(stream_list)
# # stream1 = streams.get_group(keys[4])
# # extract_stream(stream1)
# # stream1 = streams.get_group(keys[3])
# # ostrm = stream1['tcp.stream'][0]
# # tcpsrc = stream1['tcp.srcport'][0]
# # tcpdst = stream1['tcp.dstport'][0]
# # ipdst = stream1['ip.dst'][0]
# # stream_Bps = stream1.resample("S", how="sum")
# # stream_filter = np.isnan(stream_Bps['tcp.dstport']) == False
# # stream_filter# is np.float64(np.nan))
# # #stream_Bps['tcp.srcport'] = 80
# # stream_Bps.loc[stream_filter, "tcp.stream" :"tcp.dstport"] = (ostrm, tcpsrc, tcpdst)
# # stream_Bps.loc[stream_filter]
# # # #help(streams)
# # # #stream1
In [ ]:
# resample(..., how='sum') uses the removed `how=` keyword; the
# method-chain form is the supported equivalent.
# NOTE(review): `framelen` is not defined anywhere in this notebook as
# shown -- presumably a time-indexed DataFrame from read_pcap; confirm
# before running.
bytes_per_second = framelen.resample("S").sum()
help(framelen.resample)
Here are the first 5 rows. We get NaN for those timestamps where no frames were captured:
In [ ]:
# .sort() was removed from pandas; sort_values is the replacement.
bytes_per_second.sort_values('tcp.stream')
In [ ]:
# .sort() was removed from pandas; sort_values (not in-place by default)
# is the replacement.
framelen.sort_values('tcp.stream').dropna()
In [ ]:
#bytes_per_second.groupby("tcp.stream")["frame.len"].sum().sort('tcp.len',ascending=False,inplace=False).head(10)
#bytes_per_second.groupby('tcp.stream')['frame.len'].sum()
In [ ]:
plt = (bytes_per_second.groupby('tcp.stream')).plot()
ylabel('kbps')
xlabel('Time')
axhline(linewidth=2, color='r', y=2048)
time_zero = bytes_per_second.index[0]
annotate("2048 kbps",xy=(time_zero,2048), xycoords='data', xytext=(-30,30), textcoords='offset points', size=10,
bbox=dict(boxstyle="round", fc="0.8"),
arrowprops=dict(arrowstyle="simple"))
#plt.set_xlim(-1,100)
Let's try to replicate the TCP Time-Sequence Graph that is known from Wireshark (Statistics > TCP Stream Analysis > Time-Sequence Graph (Stevens).
In [ ]:
# No extra display filters; strict=True already restricts the output to
# frames that carry all of the requested TCP fields.
filters = []
fields=["tcp.stream", "ip.src", "ip.dst", "tcp.seq", "tcp.ack", "tcp.window_size", "tcp.len"]
#filters=["ip.addr eq 161.217.20.5"]
# Read the capture as a time-indexed DataFrame, one row per TCP frame.
ts=read_pcap(pcap_file, fields, display_filter = filters, timeseries=True, strict=True)
ts
Now we have to select a TCP stream to analyse. As an example, we just pick stream number 10:
In [ ]:
stream=ts[ts["tcp.stream"] == 0]
In [ ]:
stream
Pandas only prints the overview because the table is too wide. So we force a display:
In [ ]:
print(stream.to_string())
Add a column that shows who sent the packet (client or server).
The fancy lambda expression is a function that distinguishes between the client and the server side of the stream by comparing the source IP address with the source IP address of the first packet in the stream (for TCP streams that should have been sent by the client).
In [ ]:
stream["type"] = stream.apply(lambda x: "client" if x["ip.src"] == stream.irow(0)["ip.src"] else "server", axis=1)
In [ ]:
print(stream.to_string())
In [ ]:
client_stream=stream[stream.type == "client"]
In [ ]:
client_stream["tcp.seq"].plot(style="r-o")
Notice that the x-axis shows the real timestamps.
For comparison, change the x-axis to be the packet number in the stream:
In [ ]:
client_stream.index = arange(len(client_stream))
client_stream["tcp.seq"].plot(style="r-o")
Looks different of course.
In [ ]:
def most_bytes_per_stream(df):
    """Return the 10 TCP streams carrying the most payload bytes.

    df: DataFrame with 'tcp.stream' and 'tcp.len' columns.
    Returns a Series indexed by tcp.stream, sorted descending by the
    total tcp.len per stream, limited to the top 10.

    The original called Series.sort('tcp.len', ...), which is invalid
    (the groupby-sum result is a Series, and .sort() was removed from
    pandas); sort_values is the supported API.
    """
    per_stream_bytes = df.groupby("tcp.stream")["tcp.len"].sum()
    return per_stream_bytes.sort_values(ascending=False).head(10)
bytes_per_stream = most_bytes_per_stream(ts)
print(bytes_per_stream.index)
df_filter = ts['tcp.stream'].isin(bytes_per_stream.index)#[row in bytes_per_stream.index for row in ts['tcp.stream']]
streams = ts[df_filter]
streams.pivot(index=streams.index, columns='tcp.stream', values='tcp.seq')
#df[str(df.index) in str(bytes_per_stream.index)]
#bytes_per_stream.sort('tcp.len', inplace=False,ascending=False).head(5)
In [ ]:
per_stream=ts.groupby("tcp.stream")
per_stream.head()
In [ ]:
bytes_per_stream = per_stream["tcp.len"].sum()
bytes_per_stream.head()
In [ ]:
bytes_per_stream.plot(kind='bar')
In [ ]:
bytes_per_stream.max()
In [ ]:
biggest_stream=bytes_per_stream.idxmax()
biggest_stream
In [ ]:
# .ix was removed from pandas; idxmax() returns a *label*, so .loc is
# the correct replacement.
bytes_per_stream.loc[biggest_stream]