In [2]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn
import datetime
from dateutil import parser
from pymongo import MongoClient
In [61]:
client = MongoClient()
db = client['rf_test']
col_entries = db['entries']
col_inst = db['inst']
col_inds = db['inds']
col_nouns = db['nouns']
In [16]:
def list_indicators():
d = col_inst.distinct("attributes.indicator")
return [x for x in d]
def compute_time_series(indicator):
"""
Convert the time stamp in the db into something that we can plot.
"""
c = col_inst.find({"attributes.indicator": indicator})
c = [x for x in c]
c_dates = [parser.parse(a['start']) for a in c]
c_dates = [mdates.date2num(b) for b in c_dates]
return c_dates
In [19]:
def plot_dist(s_dates):
"""
Plot a time series.
"""
locator = mdates.AutoDateLocator()
locator.intervald[mdates.DAILY] = [2]
formatter = mdates.AutoDateFormatter(locator)
fig, ax = plt.subplots()
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(formatter=formatter)
ax = seaborn.distplot(s_dates, ax=ax)
seaborn.plt.show()
return (fig, ax)
In [7]:
inds = list_indicators()
In [14]:
stux_dates = compute_time_series("Stuxnet")
In [21]:
try:
plot_dist(stux_dates)
except:
print("something's wrong")
In [22]:
superfish_dates = compute_time_series("Superfish")
In [23]:
try:
plot_dist(superfish_dates)
except:
print("something's wrong")
In [31]:
def analyse_inds(indicators):
i = 0
for ind in indicators:
dates = compute_time_series(ind)
size = len(dates)
data = {
"_id": ind,
"dates": dates,
"size": size,
}
col_inds.insert_one(data)
i += 1
print("{}/{} Inserted {}".format(i,len(indicators),ind))
In [32]:
# Test with the first thirty indicators
analyse_inds(inds[20:30])
In [116]:
def print_sorted(col, attr):
a = col.find({}).sort(attr, -1)
xs = []
ys = []
for ind in a:
print("{} | {}".format(ind[attr], ind['_id']))
xs.append(ind['_id'])
ys.append(ind[attr])
return (xs, ys)
def plot_ranking(data, limit = 10):
"""
Bar plot the data, but have some limit,
since if we plot all of it, we can't read the x-axis' label
"""
xs, ys = data
seaborn.barplot(x = xs[:limit], y = ys[:limit])
seaborn.plt.show()
In [117]:
plot_ranking(print_sorted(col_inds, "size"))
In [47]:
from nltk.tag import pos_tag
In [48]:
def find_nouns(sentence):
words = sentence.split()
tagged = pos_tag(words)
return [w for w, t in tagged if t == 'NNP']
In [54]:
# Test:
find_nouns("I love McDonald, and also the hammer. Superfish is a good friend of Stuxnet")
Out[54]:
In [109]:
# We go through the sentences in the dataset.
# Using the indicator as the id, we find all of the words associated with it.
# Let's test on a small one, try "Regin malware"
def other_nouns(indicator):
c = col_inst.find({"attributes.indicator": indicator})
c = [a for a in c] # I think this could be optimize for memory usage
noun_count = {}
for item in c:
s = item['item_fragment']
nouns = find_nouns(s)
for noun in nouns:
try:
noun_count[noun].append(item['id'])
except KeyError:
noun_count[noun] = [item['id']]
#print("Added {}".format(s))
r = {}
for k, v in noun_count.items():
r[k] = {
"ids": v,
"size": len(v),
}
noun_count = sanitize(r)
col_nouns.insert_one({"_id": indicator, "associated_nouns": noun_count})
return noun_count
def sanitize(d):
r = {}
for k, v in d.items():
try:
k.index(".")
except ValueError:
r[k] = v
else:
nk = k.replace(".", "")
r[nk] = v
return r
def associated_nouns(indicator):
noun_dict = other_nouns(indicator)
ranking = sorted([(k, v['size']) for k, v in regin.items()], key= lambda x: x[1])[::-1]
for n,s in ranking:
print("{} | {}".format(s, n))
return ranking
def plot_associated_nouns(ranks):
"""
Bar plot for the ranking table of nouns.
However, this is worthless since all of the label runs together if you tried with the whole table.
"""
seaborn.barplot(x = [i[0] for i in ranks], y = [i[1] for i in ranks])
seaborn.plt.show()
In [110]:
# Much better if you only look at a limited set
plot_associated_nouns(ranking[:10])
In [ ]: