All of this data has been contributed to contagio by various sources. The methods of traffic capture, generation, and sandbox setup probably vary wildly between most (if not all) samples. This underscores the need for good data when doing analysis, but in this case we're going to make the best of what we've got.
Thanks to everybody who selflessly contributes data to sites like contagio. Keep up the great work!
In [1]:
import pandas as pd
import numpy as np
import string
import pylab
import re
import time
import os
import collections
import matplotlib
import struct
import socket
import json
from datetime import datetime
from netaddr import IPNetwork, IPAddress
%matplotlib inline
print pd.__version__
pylab.rcParams['figure.figsize'] = (16.0, 5.0)
In [2]:
# Mapping of fields of the files we want to read in and initial setup of pandas dataframes
logs_to_process = {
'conn.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','proto','service','duration','orig_bytes','resp_bytes','conn_state','local_orig','missed_bytes','history','orig_pkts','orig_ip_bytes','resp_pkts','resp_ip_bytes','tunnel_parents','threat','sample'],
'dns.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','proto','trans_id','query','qclass','qclass_name','qtype','qtype_name','rcode','rcode_name','AA','TC','RD','RA','Z','answers','TTLs','rejected','threat','sample'],
'files.log' : ['ts','fuid','tx_hosts','rx_hosts','conn_uids','source','depth','analyzers','mime_type','filename','duration','local_orig','is_orig','seen_bytes','total_bytes','missing_bytes','overflow_bytes','timedout','parent_fuid','md5','sha1','sha256','extracted','threat','sample'],
'ftp.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','user','password','command','arg','mime_type','file_size','reply_code','reply_msg','data_channel.passive','data_channel.orig_h','data_channel.resp_h','data_channel.resp_p','fuid','threat','sample'],
'http.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','trans_depth','method','host','uri','referrer','user_agent','request_body_len','response_body_len','status_code','status_msg','info_code','info_msg','filename','tags','username','password','proxied','orig_fuids','orig_mime_types','resp_fuids','resp_mime_types','threat','sample'],
'notice.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','fuid','file_mime_type','file_desc','proto','note','msg','sub','src','dst','p','n','peer_descr','actions','suppress_for','dropped','remote_location.country_code','remote_location.region','remote_location.city','remote_location.latitude','remote_location.longitude','threat','sample'],
'signatures.log' : ['ts','src_addr','src_port','dst_addr','dst_port','note','sig_id','event_msg','sub_msg','sig_count','host_count','threat','sample'],
'smtp.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','trans_depth','helo','mailfrom','rcptto','date','from','to','reply_to','msg_id','in_reply_to','subject','x_originating_ip','first_received','second_received','last_reply','path','user_agent','fuids','is_webmail','threat','sample'],
'ssl.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','version','cipher','server_name','session_id','subject','issuer_subject','not_valid_before','not_valid_after','last_alert','client_subject','client_issuer_subject','cert_hash','validation_status','threat','sample'],
'tunnel.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','tunnel_type','action','threat','sample'],
'weird.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','name','addl','notice','peer','threat','sample']
}
conndf = pd.DataFrame(columns=logs_to_process['conn.log'])
dnsdf = pd.DataFrame(columns=logs_to_process['dns.log'])
filesdf = pd.DataFrame(columns=logs_to_process['files.log'])
ftpdf = pd.DataFrame(columns=logs_to_process['ftp.log'])
httpdf = pd.DataFrame(columns=logs_to_process['http.log'])
noticedf = pd.DataFrame(columns=logs_to_process['notice.log'])
sigdf = pd.DataFrame(columns=logs_to_process['signatures.log'])
smtpdf = pd.DataFrame(columns=logs_to_process['smtp.log'])
ssldf = pd.DataFrame(columns=logs_to_process['ssl.log'])
tunneldf = pd.DataFrame(columns=logs_to_process['tunnel.log'])
weirddf = pd.DataFrame(columns=logs_to_process['weird.log'])
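Since every frame starts out empty with a known column list, the same setup can be written more compactly; a sketch, not used in the rest of the notebook, which keeps the individual named variables for readability:
# hypothetical compact alternative to the block above
frames = {log : pd.DataFrame(columns=cols) for log, cols in logs_to_process.items()}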
In [3]:
# Process the directory structure
# If you download the complete PCAP zip from Contagio and unzip it, a structure like:
# PCAPS_TRAFFIC_PATTERNS
# |->CRIME
# |-> <sample>
# |->APT
# |-> <sample>
# |->METASPLOIT
# |-> <sample>
#
# will appear. This is the structure the os.walk() below traverses: CRIME/APT/METASPLOIT will make their way into the
# "threat" tag, while the sample/PCAP name will wind up in "sample".
#
# Bro data was generated via the "run_bro.sh" shell script (this places all Bro output in the respective sample
# directories and contributes to the directory structure above).
for dirName, subdirList, fileList in os.walk('.'):
    #print('Found directory: %s' % dirName)
    for fname in fileList:
        tags = dirName.split('/')
        if len(tags) == 4 and fname in logs_to_process:
            #print ('%s/%s' % (dirName, fname))
            try:
                tempdf = pd.read_csv(dirName+'/'+fname, sep='\t', skiprows=8, header=None,
                                     names=logs_to_process[fname][:-2], skipfooter=1)
                tempdf['threat'] = tags[2]
                tempdf['sample'] = tags[3]
                if tags[2] == "0":
                    print ('%s/%s' % (dirName, fname))
                if fname == 'conn.log':
                    conndf = conndf.append(tempdf)
                if fname == 'dns.log':
                    dnsdf = dnsdf.append(tempdf)
                if fname == 'files.log':
                    filesdf = filesdf.append(tempdf)
                if fname == 'ftp.log':
                    ftpdf = ftpdf.append(tempdf)
                if fname == 'http.log':
                    httpdf = httpdf.append(tempdf)
                if fname == 'notice.log':
                    noticedf = noticedf.append(tempdf)
                if fname == 'signatures.log':
                    sigdf = sigdf.append(tempdf)
                if fname == 'smtp.log':
                    smtpdf = smtpdf.append(tempdf)
                if fname == 'ssl.log':
                    ssldf = ssldf.append(tempdf)
                if fname == 'tunnel.log':
                    tunneldf = tunneldf.append(tempdf)
                if fname == 'weird.log':
                    weirddf = weirddf.append(tempdf)
            except Exception as e:
                print "[*] error: %s, on %s/%s" % (str(e), dirName, fname)
# Read in and configure the maxmind db (free ASN)
maxmind = pd.read_csv("./GeoIPASNum2.csv", sep=',', header=None, names=['low','high','asn'])
maxmind['low'] = maxmind['low'].astype(int)
maxmind['high'] = maxmind['high'].astype(int)
In [4]:
# Helper Functions
def ip2int(addr):
    try:
        return struct.unpack("!I", socket.inet_aton(addr))[0]
    except Exception as e:
        #print "Error: %s - %s" % (str(e), addr)
        pass
    return 0
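# e.g. ip2int("8.8.8.8") == 134744072, since the four octets pack
# big-endian into 0x08080808; anything unparseable comes back as 0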
maxcache = {}
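# NOTE: each uncached lookup below is a linear scan over all the Maxmind
# ranges, so maxcache memoizes results to keep repeated IPs cheap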
def maxmind_lookup(ip):
    if ip in maxcache:
        return maxcache[ip]
    i = ip2int(ip)
    if i == 0:
        return "UNKNOWN"
    # the Maxmind ranges are inclusive on both ends
    results = list(maxmind.loc[(maxmind["low"] <= i) & (maxmind['high'] >= i)]['asn'])
    if len(results) > 0:
        maxcache[ip] = results[0]
        return results[0]
    maxcache[ip] = "UNKNOWN"
    return "UNKNOWN"
def box_plot_df_setup(series_a, series_b):
    # Count up all the times that a category from series_a
    # matches up with a category from series_b. This is
    # basically a gigantic contingency table
    cont_table = collections.defaultdict(lambda : collections.Counter())
    for val_a, val_b in zip(series_a.values, series_b.values):
        cont_table[val_a][val_b] += 1
    # Create a dataframe with keys from series_a as the index,
    # series_b keys as the columns and the counts as the values.
    dataframe = pd.DataFrame(cont_table.values(), index=cont_table.keys())
    dataframe.fillna(0, inplace=True)
    return dataframe
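# e.g. box_plot_df_setup(pd.Series(['a','a','b']), pd.Series(['x','y','x']))
# returns a frame indexed by {a, b} with columns {x, y} and counts:
#   a -> x:1, y:1    b -> x:1, y:0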
def is_ip(ip):
    try:
        socket.inet_aton(ip)
        return True
    except socket.error:
        return False
In [5]:
# misc cleanup of the Bro conn.log dataframe ('-' is Bro's marker for an unset field)
try:
    conndf.orig_bytes[conndf.orig_bytes == '-'] = 0
except Exception as e:
    pass
try:
    conndf.resp_bytes[conndf.resp_bytes == '-'] = 0
except Exception as e:
    pass
conndf['orig_bytes'] = conndf['orig_bytes'].astype(long)
conndf['resp_bytes'] = conndf['resp_bytes'].astype(long)
conndf['total_bytes'] = conndf['orig_bytes'] + conndf['resp_bytes']
# and augmentation (asn)
conndf['maxmind_asn'] = conndf['id.resp_h'].map(maxmind_lookup)
# add date
good_datetime = [datetime.fromtimestamp(float(date)) for date in conndf['ts'].values]
conndf['date'] = pd.Series(good_datetime, index=conndf.index)
# reset the dataframe indexes (the appends above leave duplicate index values behind)
conndf = conndf.reset_index(drop=True)
httpdf = httpdf.reset_index(drop=True)
dnsdf = dnsdf.reset_index(drop=True)
noticedf = noticedf.reset_index(drop=True)
filesdf = filesdf.reset_index(drop=True)
smtpdf = smtpdf.reset_index(drop=True)
In [7]:
for threat in ['APT', 'CRIME']:
    subset = conndf[conndf['threat'] == threat][['date','sample']]
    subset['count'] = 1
    pivot = pd.pivot_table(subset, values='count', rows=['date'], cols=['sample'], fill_value=0)
    # group the DatetimeIndex into (year, month) buckets
    by = lambda x: lambda y: getattr(y, x)
    grouped = pivot.groupby([by('year'), by('month')]).sum()
    ax = grouped.plot()
    pylab.ylabel('Connections')
    pylab.xlabel('Date Recorded')
    patches, labels = ax.get_legend_handles_labels()
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=2, title="Sample Name")
In [8]:
print "Total Samples: %s" % conndf['sample'].nunique()
print ""
print "APT Samples: %s" % conndf[conndf['threat'] == 'APT']['sample'].nunique()
print "Crime Samples: %s" % conndf[conndf['threat'] == 'CRIME']['sample'].nunique()
print "Metasploit Samples: %s" % conndf[conndf['threat'] == 'METASPLOIT']['sample'].nunique()
print ""
print "Connection Log Entries: %s" % conndf.shape[0]
print "DNS Log Entries: %s" % dnsdf.shape[0]
print "HTTP Log Entries: %s" % httpdf.shape[0]
print "Files Log Entries: %s" % filesdf.shape[0]
print "SMTP Log Entries: %s" % smtpdf.shape[0]
print "Weird Log Entries: %s" % weirddf.shape[0]
print "SSL Log Entries: %s" % ssldf.shape[0]
print "Notice Log Entries: %s" % noticedf.shape[0]
print "Tunnel Log Entries: %s" % tunneldf.shape[0]
print "Signature Log Entries: %s" % sigdf.shape[0]
This is an example of how to go from an alert (in this case a Bro signature match) to gathering information about the alert via the other data sources in just a few lines of code.
We'll want to grab the destination addresses from the signature hits, then pull the related flow, HTTP, and file information for each one.
In [9]:
# Get all the destination addresses from all the signature hits, in this case it's only one.
sig_dst_ips = sigdf['dst_addr'].tolist()
sigdf[['dst_addr', 'dst_port','sig_id','sub_msg','threat','sample']]
Out[9]:
In [10]:
# Let's see what other information we can gather about the network sessions surrounding that signature
for ip in sig_dst_ips:
    print "**** IP: %s ****" % ip
    print " ** Flow Information **"
    print conndf[conndf['id.resp_h'] == ip][['id.resp_p','proto','service','duration','conn_state','orig_ip_bytes','resp_ip_bytes']]
    print " ** HTTP Information **"
    print httpdf[httpdf['id.resp_h'] == ip][['method','host','uri','user_agent']]
    files = httpdf[httpdf['id.resp_h'] == ip]['orig_fuids']
    flist = files.append(httpdf[httpdf['id.resp_h'] == ip]['resp_fuids']).tolist()
    # We use SHA1 because that's what gets tossed in the Bro notice.log for the Team Cymru MHR alerts
    print " ** File SHA1 **"
    for f in flist:
        if f != '-':
            sha1 = filesdf[filesdf['fuid'] == f]['sha1'].tolist()
            for m in sha1:
                print "Sample Hash: %s" % m
                hits = noticedf[noticedf['sub'].str.contains(m)][['sub','sample']]
                if hits.shape[0] > 0:
                    print hits
                print "Filename: %s mime-type: %s" % (filesdf[filesdf['sha1'] == m]['filename'].tolist()[0], filesdf[filesdf['sha1'] == m]['mime_type'].tolist()[0])
    print ""
In [11]:
print dnsdf.qtype_name.value_counts()
In [12]:
for q in dnsdf['qtype_name'].unique().tolist():
    print "Query Type: %s" % q
    print dnsdf[dnsdf['qtype_name'] == q]['query'].value_counts().head(5)
    print ""
In [13]:
dnsdf['rcode_name'].value_counts()
Out[13]:
Want to take a guess at which one(s) possibly use a DGA to find and connect to C2 domains?
In [14]:
dnsdf[dnsdf['rcode_name'] == 'NXDOMAIN']['sample'].value_counts().head(10)
Out[14]:
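One rough way to back up a guess like that, a quick sketch that wasn't part of the original walkthrough: DGA-generated names tend to have higher character entropy than human-registered ones, so scoring each sample's NXDOMAIN queries can help separate the two.
import math

def shannon_entropy(s):
    # character-level entropy of a string, in bits
    if not s:
        return 0.0
    probs = [float(s.count(c)) / len(s) for c in set(s)]
    return -sum(p * math.log(p, 2) for p in probs)

nxdf = dnsdf[dnsdf['rcode_name'] == 'NXDOMAIN'][['sample','query']].copy()
nxdf['entropy'] = nxdf['query'].map(lambda q: shannon_entropy(str(q)))
print nxdf.groupby('sample')['entropy'].mean().order(ascending=False).head(10)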
Generally in HTTP traffic we expect to see a value in the 'Host' header; it indicates the virtual host the client is requesting on a given IP address. It could be interesting to see which hostnames are present in the HTTP 'Host' header with no corresponding DNS query. Keep in mind this could be as simple as the query not being recorded/included in the PCAP, or the lookup may genuinely never have happened.
In [15]:
# hostnames seen in both the HTTP Host header and a DNS query
intersect_hostnames = set(httpdf['host']).intersection(set(dnsdf['query']))
interesting = []
tempdf = pd.DataFrame()
for hn in set(httpdf['host']):
    if hn not in intersect_hostnames and not is_ip(hn):
        #print hn
        interesting.append(hn)
        tempdf = tempdf.append(httpdf[httpdf['host'] == hn])
In [16]:
tempdf['count'] = 1
tempdf[['host', 'id.resp_h', 'sample', 'count']].groupby(['sample', 'host', 'id.resp_h']).sum().sort('count', ascending=0)
Out[16]:
From the looks of the above, it seems that for some of the samples DNS traffic simply wasn't logged, as opposed to the malware doing something tricky. We can verify (below) that there really doesn't appear to be any DNS traffic related to one of the domains.
In [17]:
print dnsdf[dnsdf['query'] == "dgyqimolcqm.cm"]
print dnsdf[dnsdf.answers.str.contains('dgyqimolcqm.cm')]
print dnsdf[dnsdf['sample'] == "BIN_ZeroAccess_Sirefef_C2A9CCC8C6A6DF1CA1725F9"]['query'].value_counts().head(50)
Well, at least there's always HTTP traffic to look at for the above domain.
In [18]:
httpdf[httpdf['host'] == "dgyqimolcqm.cm"][['id.orig_h','id.orig_p','id.resp_h','id.resp_p','uri','sample','threat']]
Out[18]:
In [19]:
print "%s Unique User-Agents in %s samples." % (httpdf['user_agent'].nunique(), httpdf['sample'].nunique())
In [20]:
tempdf = pd.DataFrame(columns=['sample','num_ua'])
for sample in set(httpdf['sample']):
    tempdf = tempdf.append({'sample' : sample, 'num_ua' : httpdf[httpdf['sample'] == sample]['user_agent'].nunique()}, ignore_index=True)
tempdf.sort('num_ua', ascending=0).head()
Out[20]:
Let's check one of the samples that doesn't completely stand out: purplehaze.
In [21]:
# Well, at least we know what UA this sample uses for C2, and it seems we can see some other OS activity as well
tsample = 'purplehaze'
httpdf[httpdf['sample'] == tsample].user_agent.value_counts()
Out[21]:
In [22]:
httpdf['count'] = 1
grouped = httpdf[httpdf['sample'] == tsample][['sample','user_agent','host','count']].groupby(['sample', 'user_agent', 'host']).sum()
grouped.sort('count', ascending = 0).head(10)
Out[22]:
Now for something more interesting: Dirtjumper.
In [23]:
tsample = 'BIN_dirtjumper_2011-10'
httpdf[httpdf['sample'] == tsample].user_agent.value_counts()
Out[23]:
In [24]:
grouped = httpdf[httpdf['sample'] == tsample][['sample','user_agent','host','count']].groupby(['sample', 'host']).sum()
grouped.sort('count', ascending = 0)
Out[24]:
In [25]:
grouped = httpdf[httpdf['sample'] == tsample][['sample','user_agent','host','count']].groupby(['sample', 'user_agent', 'host']).sum()
grouped.sort('count', ascending = 0)
Out[25]:
First we can check the coverage of the ASN database (thanks again MaxMind). The coverage appears to be pretty good, leaving only RFC1918, broadcast, and similar special-purpose addresses unmarked. However, we can also see a few routable IP addresses that aren't covered. Oh well. :)
Keep in mind we're only looking at destination IP addresses.
In [26]:
conndf[conndf['maxmind_asn'] == "UNKNOWN"]['id.resp_h'].value_counts()
Out[26]:
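To confirm the UNKNOWNs really are special-purpose space, we can split them out with the netaddr import from the first cell; a quick sketch:
# which UNKNOWN destinations are *not* RFC1918/multicast/broadcast space?
special = [IPNetwork('10.0.0.0/8'), IPNetwork('172.16.0.0/12'),
           IPNetwork('192.168.0.0/16'), IPNetwork('224.0.0.0/4'),
           IPNetwork('255.255.255.255/32')]
unknown_ips = set(conndf[conndf['maxmind_asn'] == "UNKNOWN"]['id.resp_h'])
print [ip for ip in unknown_ips
       if is_ip(ip) and not any(IPAddress(ip) in net for net in special)]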
In [27]:
ax = box_plot_df_setup(conndf[conndf['threat'] == 'APT']['sample'], conndf[conndf['threat'] == 'APT']['maxmind_asn']).T.plot(kind='bar', stacked=True)
pylab.ylabel('Sample Occurrences')
pylab.xlabel('ASN (Autonomous System Number)')
patches, labels = ax.get_legend_handles_labels()
ax.legend(patches, labels, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="Sample Name")
Out[27]:
Just showing how to look at all the various ASNs contacted by a single sample. It also points out that the multiple parking hits above on Level 3 might be due to DNS requests.
In [29]:
conndf[conndf['sample'] == "BIN_8202_6d2c12085f0018daeb9c1a53e53fd4d1"][['maxmind_asn','id.resp_h']]
Out[29]:
In [30]:
conndf['count'] = 1
grouped = conndf.groupby(['sample', 'id.resp_p']).sum()
grouped.sort('total_bytes', ascending = 0).head(10)
Out[30]:
In [31]:
# That port 1935 from above might be interesting, where's it going?
conndf[conndf['id.resp_p'] == 1935][['id.resp_h','proto']]
Out[31]:
In [32]:
# Same with port 336
conndf[conndf['id.resp_p'] == 336][['id.resp_h','proto']]
Out[32]:
In [33]:
smtpdf.sample.value_counts()
Out[33]:
Looks like we've found quite a few open relays (wonder if they still work). I'm also amazed at how many are included with one sample. There are at most 3 other hosts in this list not related to the Asprox sample, but it seems that Asprox sends quite a bit of email and includes a pretty good list of open relays.
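To put a number on that, here's a quick sketch counting the distinct SMTP destination hosts each sample talks to:
# distinct SMTP servers contacted per sample (a sketch)
print smtpdf[['sample','id.resp_h']].drop_duplicates().groupby('sample').size().order(ascending=False)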
In [34]:
print "Unique Hosts found as the HELO portion of SMTP traffic: %s" % smtpdf.helo.nunique()
print ""
print "Some of the examples"
print smtpdf.helo.value_counts().head(10)
In [35]:
smtpdf['count'] = 1
grouped = smtpdf[smtpdf['from'] != "-"][['from','subject','count']].groupby(['from', 'subject']).sum()
grouped.sort('count', ascending = 0).head(20)
Out[35]:
Network traffic is fine and dandy, but now it's time for some more eye-candy!
It's nice to have the context around how systems communicate. We've got some great stats/data surrounding the C2 and delivery mechanisms, so let's see how they relate to the files that get transferred. Bro can extract files from IRC, SMTP, HTTP, and FTP out of the box.
What's the most popular, and what does it look like per-protocol?
In [36]:
ax = box_plot_df_setup(filesdf['source'], filesdf['mime_type']).T.plot(kind='bar', stacked=True)
pylab.xlabel('Mime-Type')
pylab.ylabel('Number of Files')
patches, labels = ax.get_legend_handles_labels()
ax.legend(patches, labels, title="Service Type")
Out[36]:
In [37]:
notweb = filesdf.loc[(filesdf["mime_type"] != 'text/html') & (filesdf['mime_type'] != 'text/plain')]
ax = box_plot_df_setup(notweb['source'], notweb['mime_type']).T.plot(kind='bar', stacked=True)
pylab.xlabel('Mime-Type')
pylab.ylabel('Number of Files')
patches, labels = ax.get_legend_handles_labels()
ax.legend(patches, labels, title="Service Type")
Out[37]:
In [38]:
filesdf['count'] = 1
filesdf[filesdf['filename'] != '-'][['source','mime_type','seen_bytes','count']].groupby(['source','mime_type']).sum().sort('count', ascending=0).head(10)
Out[38]:
In [39]:
filesdf[filesdf['filename'] != '-'][['source','mime_type','filename','count']].groupby(['source','mime_type','filename']).sum().sort('count', ascending=0).head(10)
Out[39]:
In [40]:
filesdf[filesdf['filename'] != '-'][['sample','mime_type','filename','count']].groupby(['sample','mime_type','filename']).sum().sort('count', ascending=0).head(10)
Out[40]:
In [41]:
noticedf['count'] = 1
noticedf[['note','msg','count']].groupby(['note','msg']).sum().sort('count', ascending=0)
Out[41]:
In [42]:
# We can get a slightly different look at the world by throwing some ports into the mix! Looks like we might have some winners here.
noticedf[['note','msg','id.resp_p','count']].groupby(['note','msg','id.resp_p']).sum().sort('count', ascending=0)
Out[42]:
In [43]:
noticedf[noticedf['note'] == 'Scan::Address_Scan']['sample']
Out[43]:
We've come full circle: it looks like we've got more confirmation that some of these malware samples are really good at SPAM, and they display it by connecting to lots of hosts in rapid succession.
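We can make that link explicit with a quick sketch, checking whether the samples that tripped the address-scan notice are the same ones generating SMTP traffic:
# samples that both tripped Scan::Address_Scan and appear in smtp.log
scanners = set(noticedf[noticedf['note'] == 'Scan::Address_Scan']['sample'])
print scanners & set(smtpdf['sample'])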
In [44]:
ssldf['id.resp_p'].value_counts()
Out[44]:
In [45]:
ssldf.subject.value_counts().head(10)
Out[45]:
In [46]:
ssldf['count'] = 1
ssldf[['version','cipher','count']].groupby(['version','cipher']).sum().sort('count', ascending=0)
Out[46]:
Since we've got a decent grasp on the ports used, the types of ciphers present as well as popular certs that were seen in malware, perhaps there are a couple of ways we can begin to relate that information back to samples to get an idea of what the sample might be doing or how it works.
In [47]:
ssldf[['sample','server_name','id.resp_p','count']].groupby(['sample','id.resp_p','server_name']).sum().sort('count', ascending=0)
Out[47]:
To run the visualization in your web browser:
In [48]:
# Build a nested sample -> port -> server_name hierarchy for D3
data = {'name' : 'ssl'}
samples = list(set(ssldf['sample'].tolist()))
data['children'] = list()
sampleindex = 0
for sample in samples:
    data['children'].append({'name' : sample, 'children' : list()})
    ports = set(ssldf[ssldf['sample'] == sample]['id.resp_p'].tolist())
    portindex = 0
    for port in ports:
        data['children'][sampleindex]['children'].append({'name' : str(port), 'children' : list()})
        hostnames = set(ssldf.loc[(ssldf['id.resp_p'] == int(port)) & (ssldf['sample'] == sample)]['server_name'])
        for hostname in hostnames:
            data['children'][sampleindex]['children'][portindex]['children'].append({'name' : hostname, 'size' : 1})
        portindex += 1
    sampleindex += 1
json.dump(data, open('ssl.json', 'w'))
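ssl.json has to be fetched over HTTP for the D3 page to be able to load it, so serve the directory holding the JSON and HTML files. Something like the following one-liner (an assumption about the original setup, which evidently listened on port 9999) will do:
python -m SimpleHTTPServer 9999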
To show how easy D3 can be once you have the JSON output, you can also point your browser (after following the steps above) to http://localhost:9999/ssl_cartesian.html
And you'll get some output similar to:
Note: If you run these at home, you can zoom in and out with your browser hot-keys to get a much nicer view of the graph.
In [49]:
# Ports per sample
ax = box_plot_df_setup(ssldf['id.resp_p'], ssldf['sample']).T.plot(kind='bar', stacked=True)
pylab.ylabel('Total # of connections')
pylab.xlabel('Samples')
patches, labels = ax.get_legend_handles_labels()
ax.legend(patches, labels, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="Port")
Out[49]:
In [50]:
# Or as you might see it in an operational sense...
ax = box_plot_df_setup(ssldf['id.resp_p'], ssldf['id.orig_h']).T.plot(kind='bar', stacked=True)
pylab.ylabel('Total # of connections')
pylab.xlabel('Source IP')
patches, labels = ax.get_legend_handles_labels()
ax.legend(patches, labels, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="Port")
Out[50]:
Tbot uses Tor for communication. (http://contagiodump.blogspot.com/2012/12/dec-2012-skynet-tor-botnet-trojantbot.html)
The weird.log shows protocol issues/anomalies as well as information pertaining to possible data loss, etc. We weren't able to find anything exciting in there, but that doesn't mean you won't!
In [51]:
weirddf.name.value_counts()
Out[51]: