In [ ]:
# numpy, scipy, scipy.stats.stats, collections, random, time
from collections import Counter, defaultdict
from itertools import chain, combinations
from time import localtime
import json
from scipy import std, mean
from scipy.stats.stats import pearsonr
from matplotlib import pyplot as plt
In [ ]:
# This lesson requires the following modules
from __future__ import unicode_literals, print_function, division
# And %pylab features
%pylab
# Show plots into browser
%matplotlib inline
In [ ]:
#
# Exercise: use this frame for the exercise
#
def ping_rtt(seconds=10):
    """@return: a list of ping RTT"""
    from course import sh
    # get sample output
    # find a solution in ipython
    # test and paste the code
    raise NotImplementedError

!ping -c3 www.google.it || ping -n3 www.google.it
In [ ]:
# %load course/ping_rtt.py
__author__ = 'rpolli'


def ping_rtt():
    """
    goal: slicing data
    goal: using zip to transpose data
    """
    from course import sh
    import sys
    cmd = "ping -c10 www.google.it"
    if 'win' in sys.platform:
        cmd = "ping -n10 www.google.it"
    ping_output = sh(cmd)
    # keep only the per-packet lines: the summary line contains "time" too,
    # so match on "time="
    filter_lines = (x.split() for x in ping_output if 'time=' in x)
    if 'win' in sys.platform:
        ping_output = [x[6::2] for x in filter_lines]
    else:
        ping_output = [x[-4:-1:2] for x in filter_lines]
    ttl, rtt = zip(*ping_output)
    return [float(x.split("=")[1]) for x in rtt]
In [ ]:
rtt = ping_rtt()
print(*rtt)
In [ ]:
# We can generate a distribution using
from collections import Counter
distro = Counter(rtt)
print("Un-bucketed distribution:", distro)
In [ ]:
# If we split data into 1-millisecond buckets (truncating the decimals)...
distro = Counter(int(i) for i in rtt)
print("One millisecond bucket distribution:", distro)
Exercise: what is the behavior of defaultdict?
In [ ]:
# defaultdict can be used to implement more complex behaviors.
from collections import defaultdict
distro = defaultdict(int)
# This works even if rtt is a generator
for x in rtt:
    distro[x] += 1
print(distro)
In [ ]:
# or list.count if rtt is in-memory
distro2 = {x: rtt.count(x) for x in set(rtt)}
print(distro2)
In [ ]:
# Exercise: use this cell to benchmark those three approaches while increasing the size of rtt
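A possible benchmark sketch (assuming rtt is the in-memory list built in the cells above; the statements mirror the three approaches and exact timings will vary):
In [ ]:
# Compare Counter, defaultdict and list.count on the same data
import timeit
setup = "from __main__ import rtt; from collections import Counter, defaultdict"
stmts = {
    "Counter": "Counter(rtt)",
    "defaultdict": "d = defaultdict(int)\nfor x in rtt: d[x] += 1",
    "list.count": "{x: rtt.count(x) for x in set(rtt)}",
}
for name, stmt in stmts.items():
    print(name, timeit.timeit(stmt, setup=setup, number=1000))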
In [ ]:
from scipy import std, mean
fair = [1, 1] # chickens
unfair = [0, 2] # chickens
# Same mean!
assert mean(fair) == mean(unfair)
In [ ]:
# Use standard deviation!
print("is unfair?", std(fair)) # 0
print("is unfair?", std(unfair)) # 1
In [ ]:
# Check your computed values against the $\sigma$ returned by ping
"""
goal: remember to convert to numeric / float
goal: use scipy
goal: check stdev
"""
from scipy import std, mean  # max, min are builtin
rtt = ping_rtt()
fmt_s = 'stdev: {:0.2}, mean: {}, min: {}, max: {}'
rtt_std, rtt_mean = std(rtt), mean(rtt)
rtt_max, rtt_min = max(rtt), min(rtt)
print(fmt_s.format(rtt_std, rtt_mean, rtt_min, rtt_max))
In [ ]:
# The data/maillog file contains lines like the following
ret = !cat ../data/maillog
print(*ret[:3], sep='\n')
In [ ]:
# Use the following cell to solve the exercise.
In [ ]:
#
# Time distribution exercise frame
#
solution = "cmV0ID0gIWdyZXAgcmVtb3ZlZCBkYXRhL21haWxsb2cKaG91cnMgPSBbeFs6Ml0gZm9yIHggaW4g\ncmV0LmZpZWxkcygyKV0K"
from course import show_solution
show_solution(solution)
In [ ]:
ret = !grep removed ../data/maillog
#print(ret.fields(2))
hours = [int(x[:2]) for x in ret.fields(2)]
print(hours[:10], "..")
time_d = Counter(hours)
print("The distribution is",time_d.viewitems())
In [ ]:
# Exercise: find the 3 and then the 4 most common values of time_d
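One possible approach, a sketch using Counter.most_common:
In [ ]:
# most_common(n) returns the n (value, count) pairs with the highest counts
print(time_d.most_common(3))
print(time_d.most_common(4))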
In [ ]:
# To plot data..
from matplotlib import pyplot as plt
# and set the interactive mode
plt.ion()
# Plotting a histogram...
frequency, slots, _ = plt.hist(hours)
plt.title("Hourly distribution")
In [ ]:
# .. hist() returns the frequencies, the bucket edges and the patches.
# We can pair frequencies and edges into an ordered mapping:
from collections import OrderedDict
distribution = OrderedDict(zip(slots, frequency))
In [ ]:
# Print it nicely with
import json
print(json.dumps(distribution, indent=1))
size_d = {   # mail size between
    0: xxx,  # 0 - 10k
    1: xxx,  # 10k - 20k
    ..
}
In [ ]:
#
# Exercise frame
#
In [ ]:
# Get the sigma_size from maillog
solution = b'CnJldCA9ICFncmVwIHNpemUgZGF0YS9tYWlsbG9nCnNpemVzID0gW2ludCh4WzU6LTFdKSBmb3Ig\neCBpbiByZXQuZmllbGRzKDcpXQoK\n'
from course import show_solution
# show_solution(solution)
def maillog_size():
    ret = !grep size ../data/maillog
    sizes = [int(x[5:-1]) for x in ret.fields(7)]
    return sizes
sizes = maillog_size()
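A sketch of how size_d (see the template above) could be filled from sizes, assuming 10 kB-wide buckets:
In [ ]:
# Bucket mail sizes into 10 kB slots: key 0 -> [0, 10k), key 1 -> [10k, 20k), ...
size_d = Counter(s // 10000 for s in sizes)
print(json.dumps(size_d, indent=1))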
In [ ]:
frequency, slots, _ = plt.hist(sizes, bins=5)
In [ ]:
# We can use our time_d to simulate the load during the day
from time import localtime
hour = localtime().tm_hour
mail_per_minute = time_d[hour] / 60 # minutes in hour
print("Estimated frequency is", mail_per_minute)
In [ ]:
# And sizes to create a mail load generator
mean_size, sigma_size = mean(sizes), std(sizes)
print(mean_size, sigma_size)
In [ ]:
# A mail load generator creating attachments of a given size...
from random import gauss
def mail_size_g():
    mu = min(sizes)
    x = gauss(mean_size, sigma_size)  # a random number
    # as the gaussian can return negative values,
    # we clamp them (or we could skip them) up to the minimum observed size
    return x if x > mu else mu
In [ ]:
# How far is our estimate from reality?
for emails in 5000, 10000, 100000:
    X = [mail_size_g() for x in xrange(emails)]
    print(emails, mean(X) / mean_size, std(X) / sigma_size)
# Homework: write a script to simulate the load of your mailserver and tweak your mail_size_g
# Hint: create a set of attachment files: 1k.out, 10k.out, 100k.out, 1000k.out, 5000k.out
# Hint: use mail_size_g() to select one of those files instead of creating a given email
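A sketch of the file-selection step from the hints above (the filenames follow the hint; the helper name pick_attachment is illustrative):
In [ ]:
# Map nominal attachment sizes (bytes) to the files suggested in the hints
ATTACHMENTS = {1000: "1k.out", 10000: "10k.out", 100000: "100k.out",
               1000000: "1000k.out", 5000000: "5000k.out"}


def pick_attachment():
    """Return the attachment whose size is closest to a generated mail size."""
    size = mail_size_g()
    closest = min(ATTACHMENTS, key=lambda s: abs(s - size))
    return ATTACHMENTS[closest]

print(pick_attachment())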
To better simulate mail size, consider using a Cauchy distribution.
See www.ieee802.org/16/tgm/contrib/C80216m-07_122.pdf
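A minimal sketch of a Cauchy-based generator (numpy is available via %pylab; the location and scale parameters are illustrative assumptions):
In [ ]:
import numpy as np


def mail_size_cauchy():
    # standard_cauchy() draws a unit Cauchy sample; shift and scale it around the observed mean
    x = mean_size + (sigma_size / 2) * np.random.standard_cauchy()
    # clamp to the minimum observed size, as in mail_size_g
    return max(x, min(sizes))

print(mail_size_cauchy())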
In [ ]:
# Let's plot the following datasets
# taken from a 4-hour distribution
mail_sent = [1, 5, 500, 250, 100, 7]
kB_s = [70, 300, 29000, 12500, 450, 500]
# A scatter plot can suggest relations
# between data
plt.scatter(mail_sent, kB_s)
plt.title("I am a SCATTER plot!")
The Pearson coefficient $\rho$ measures how strongly two data series are linearly correlated.
In [ ]:
from scipy.stats.stats import pearsonr
ret = pearsonr(mail_sent, kB_s)
print(ret)
#>(0.9823, 0.0004)
correlation, probability = ret
A *combination* is a way of selecting members from a grouping, such that the order of selection does not matter.
In [ ]:
# Given a table with many data series
from course import table
#table = {...
# 'cpu_usr': [10, 23, 55, ..],
# 'byte_in': [2132, 3212, 3942, ..],
# }
print(*table.keys(),sep='\n')
# We can combine all their names with
from itertools import combinations
list(combinations(table,2))[:10]
In [ ]:
# We can try every combination between data series
# and check if there's some correlation
from itertools import combinations
from scipy.stats.stats import pearsonr
from course import table
for k1, k2 in combinations(table, 2):
    r_coeff, probability = pearsonr(table[k1], table[k2])
    # I'm *still* not interested in data under this threshold
    if r_coeff < 0.6:
        continue
    print("linear correlation between {:8} and {:8} is {:3}".format(
        k1, k2, r_coeff))
Coloring a scatter plot gives us one more dimension in a 2D plot: time!
We have to mark our ordered data series with a tick series, e.g.
| tick | 0 | 0 | 0 | 1 | 1 | .. |
|---|---|---|---|---|---|---|
| cpu_usr | 10 | 23 | 55 | 33 | 10 | .. |
| byte_in | 2132 | 3212 | 3942 | 321 | 42 | .. |
Elements with the same tick are in the same bucket.
We will now group our series into 3 buckets of 8 hours each. We'll use the itertools.chain function, which
flattens a list of iterators.
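For example:
In [ ]:
from itertools import chain
# chain concatenates the given iterables into a single flat sequence
print(list(chain([0, 0], [8, 8], [16, 16])))  # [0, 0, 8, 8, 16, 16]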
In [ ]:
from itertools import chain # magic function to flatten a list of iterators
# Dividing samples in 3 buckets.
BUCKETS = ["morning", "afternoon", "night"]
samples = len(table['byte_in'])
# scale samples on 24 hours
ticks = list(chain(*(
    [x] * (samples // len(BUCKETS))             # generates a [ 0 ] * bucket_size list.
    for x in range(0, 24, 24 // len(BUCKETS))   # rescale all samples on 24 hours
)))
print(ticks)
In [ ]:
# Any doubts? Just check the ticks distribution ;)
print(Counter(ticks))
# Exercise: use json.dumps to prettify the distribution.
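A possible solution sketch (json converts the integer tick keys to strings):
In [ ]:
# Prettify the tick distribution; keys become strings in the JSON output
print(json.dumps(Counter(ticks), indent=1))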
In [ ]:
# Create a formatter associating a label to every color point:
# - we just take into account the tick value, ignoring other values
# - the tick is divided by 8 (the bucket_size) to implement the following mapping
# - [0, 8) -> morning
# - [8, 16) -> afternoon
# - [16, 24) -> night
formatter = plt.FuncFormatter(lambda tick, *ignored_args: BUCKETS[int(tick) // 8])
plt.scatter(table['byte_in'], table['csw'], c=ticks)
plt.colorbar(ticks = range(0, 28, 8), format=formatter)
In [ ]:
from scipy import arange
# arange(0, 3, 1/samples) yields `samples` points per unit over [0, 3);
# truncating to int shows the resulting group labels {0, 1, 2}
set(map(int, arange(0, 3, 1/samples)))
In [ ]:
# netfishing_correlation_plot():
# Consider using pandas.tools.plotting.scatter_matrix
from itertools import combinations
from scipy.stats.stats import pearsonr
from course import table
from matplotlib import pyplot as plt
for (k1, v1), (k2, v2) in combinations(table.items(), 2):
    r_coeff, probability = pearsonr(table[k1], table[k2])
    if r_coeff < 0.6:
        continue
    plt.scatter(v1, v2, c=ticks)
    plt.xlabel(k1); plt.ylabel(k2)
    plt.title("r={:0.3f} p={:0.3f}".format(r_coeff, probability))
    plt.colorbar(ticks=range(0, 28, 8), format=formatter)
    plt.show()
    # Uncomment the following lines to save the plot on disk
    # plt.savefig("colorbar_{}_{}.png".format(k1, k2))
    # plt.close()
In [ ]:
from __future__ import print_function, unicode_literals


def ping_rtt(seconds=10):
    """
    goal: slicing data
    goal: using zip to transpose data
    """
    import sys
    from course import sh
    cmd = "ping -c{seconds} www.google.it"
    if 'win' in sys.platform:
        cmd = "ping -n{seconds} www.google.it"
    ping_output = sh(cmd.format(seconds=seconds))
    # Filter out uninteresting lines
    ping_output = [x.replace(b"=", b" ") for x in ping_output if b'from' in x]
    if 'win' in sys.platform:
        ping_output = [x.split()[6::2] for x in ping_output]
    else:
        ping_output = [x.split()[-4:-1:2] for x in ping_output]
    ttl, rtt = zip(*ping_output)
    return map(float, rtt)

print(*ping_rtt(3))
In [ ]:
# Exercise II solution
# delivered emails are like the following
# May 14 16:00:04 rpolli postfix/qmgr[122]: 4DC3DA: removed
ret = !grep removed ../data/maillog  # get the interesting lines
ts = ret.fields(2)  # find the timestamp (3rd column)
hours = [int(x[:2]) for x in ts]
time_d = {x: hours.count(x) for x in set(hours)}