In [59]:
from collections import defaultdict
import cPickle
import datetime
import glob
import os.path
import re
from analyze import PDict
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib.patches import Rectangle
import numpy as np
#
# Utility functions
#
DATE = re.compile(r'.*(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})_\w+_day.pkl')
PATH = '<PATH FOR RESULTS>/datamining/dbenv/results'
ISO = re.compile(r'^.*/([^/]+.iso(?:torrent|metalink)?)(?:\?\d*)?$')
MONTH = {
'01': 'Jan',
'02': 'Feb',
'03': 'Mar',
'04': 'Apr',
'05': 'May',
'06': 'Jun',
'07': 'Jul',
'08': 'Aug',
'09': 'Sep',
'10': 'Oct',
'11': 'Nov',
'12': 'Dec',
}
def day_name(date_):
"""Name of the daily file. Return YYYYMMDD string."""
return '%04d%02d%02d'%(date_.year, date_.month, date_.day)
def week_name(date_):
"""Name the weekly file. Return YYYYWW string."""
# Group the date at the end of the week, so some dates can appears
# in a different year. For example, 32/12/2012 will appear as a
# 201301, the first week of 2013.
delta = datetime.timedelta(days=6-date_.weekday())
week_date = date_ + delta
return '%04d%02d'%(week_date.year, week_date.isocalendar()[1])
def month_name(date_):
"""Name of the monthly file. Return YYYYMM string."""
return '%04d%02d'%(date_.year, date_.month)
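# Quick illustrative check of the naming helpers (arbitrary sample date):
# 31/12/2012 is grouped into ISO week 1 of 2013, while its daily and
# monthly names stay in 2012.
_sample = datetime.date(2012, 12, 31)
print day_name(_sample), week_name(_sample), month_name(_sample)
# -> 20121231 201301 201212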
def accumulate(dict_a, dict_b):
"""Accumulate the counts or sets from dict_b in dict_a"""
for key, value in dict_b.iteritems():
if type(value) is int:
dict_a[key] = dict_a.get(key, 0) + value
elif type(value) is set:
dict_a[key] = dict_a.get(key, set()) | value
else:
raise TypeError('Value is neither int nor set.')
def collapse(table, products, other='other'):
"""Collapse each inner dict of table by mapping its keys through products.
Unmapped keys are counted under `other`; if `other` is falsy they are dropped."""
def _collapse(dict_):
d = defaultdict(int)
for (k, v) in dict_.iteritems():
p = products[k] if k in products else other
if p:
d[p] = d[p] + v
return d
return {k: _collapse(v) for (k, v) in table.iteritems()}
def row(table, dates, product):
"""Return the counts of `product` for each date in `dates` as a numpy array."""
return np.array([table[d][product] for d in dates])
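# Minimal sketch (toy data, hypothetical names) of how the helpers above
# compose: accumulate() merges daily dicts, collapse() maps raw keys to
# product groups (unmapped keys land in 'other'), and row() extracts one
# product's time series.
_acc = {}
accumulate(_acc, {'openSUSE-12.2-DVD-x86_64.iso': 3, 'uuids': set(['a'])})
accumulate(_acc, {'openSUSE-12.2-DVD-x86_64.iso': 2, 'uuids': set(['b'])})
print _acc  # counts are summed, sets are unioned (key order may differ)
_toy_table = {'20130101': {'openSUSE-12.2-DVD-x86_64.iso': 3, 'foo.iso': 1},
              '20130102': {'openSUSE-12.2-DVD-x86_64.iso': 2}}
_toy_groups = {'openSUSE-12.2-DVD-x86_64.iso': 'openSUSE-12.2'}
_toy = collapse(_toy_table, _toy_groups)
print row(_toy, sorted(_toy), 'openSUSE-12.2')  # -> [3 2]
print row(_toy, sorted(_toy), 'other')          # -> [1 0]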
In [60]:
#
# Process downloads
#
# ISO = re.compile(r'^.*/([^/]+.iso(?:torrent|metalink)?)(?:\?\d*)?$')
def curate_iso(downloads):
"""Extract the name of the ISO"""
cured = defaultdict(int)
for key, value in downloads.iteritems():
newkey = ISO.findall(key)
if newkey:
cured[newkey[0]] = cured[newkey[0]] + value
return cured
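# Illustrative call with made-up mirror URLs (the real keys come from the
# download logs); the ISO regexp keeps the file name and drops the '?NNN'
# mirror suffix, so both variants below are counted together.
_sample_downloads = {
    'http://example.org/distribution/12.3/iso/openSUSE-12.3-DVD-x86_64.iso?1234': 5,
    'http://example.org/distribution/12.3/iso/openSUSE-12.3-DVD-x86_64.iso': 2,
    'http://example.org/README.txt': 7,
}
print curate_iso(_sample_downloads)
# -> defaultdict(<type 'int'>, {'openSUSE-12.3-DVD-x86_64.iso': 7})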
day_table, week_table, month_table = {}, {}, {}
acc_week, acc_month = None, None
cur_week, cur_month = None, None
acc_full = {}
for dname in sorted(glob.glob(os.path.join(PATH, '*_download_day.pkl'))):
year, month, day = (int(x) for x in DATE.match(dname).groups())
date_ = datetime.date(year, month, day)
acc_day, cur_day = curate_iso(cPickle.load(open(dname, 'rb'))), day_name(date_)
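# acc_week / acc_month keep accumulating while the week / month name stays
# the same and are reset to a fresh dict on the first day of a new period;
# the *_table entries hold references to these dicts, so the accumulate()
# calls below keep updating the rows already stored in the tables.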
acc_week, cur_week = acc_week if cur_week == week_name(date_) else {}, week_name(date_)
acc_month, cur_month = acc_month if cur_month == month_name(date_) else {}, month_name(date_)
day_table[cur_day], week_table[cur_week], month_table[cur_month] = acc_day, acc_week, acc_month
accumulate(acc_week, acc_day)
accumulate(acc_month, acc_day)
accumulate(acc_full, acc_day)
In [61]:
#
# Detect the names of the products (downloads)
#
product_names = acc_full.keys()
main_products = sorted(p for p in product_names if acc_full[p] > 10)
others = set(product_names) - set(main_products)
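# Group ISO names by distribution and version, e.g.
# 'openSUSE-12.3-DVD-x86_64.iso' -> 'openSUSE-12.3'. Names whose second
# field has no dotted version, and the old 10.x series, are left out of
# the map and therefore end up in the 'other' bucket after collapse().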
product_groups = { p: '-'.join(p.split('-')[:2]) for p in main_products
if '.' in p.split('-')[1] and '10' not in p.split('-')[1] }
main_products = sorted(set(product_groups.values()))
day_table = collapse(day_table, product_groups)
week_table = collapse(week_table, product_groups)
month_table = collapse(month_table, product_groups)
main_products.insert(0, 'other')
print 'Main products:', main_products
# print 'Grouping', product_groups
In [65]:
#
# Draw download dataset
#
cmap = mpl.cm.jet
norm = mpl.colors.Normalize(vmin=0, vmax=len(main_products))
smap = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
for name, dataset in (#('Daily', day_table),):
#('Weekly', week_table),):
('Monthly', month_table),):
dates = sorted(dataset)[:-1]  # drop the last, still incomplete, period
x = np.arange(len(dates))
y = np.row_stack([row(dataset, dates, p) for p in main_products])
colors = [smap.to_rgba(i) for i in range(len(main_products))]
plt.stackplot(x, y, colors=colors, edgecolor='none', alpha=0.7)
legend_rec = [Rectangle((0, 0), 1, 1, fc=c) for c in colors]
plt.legend(legend_rec, main_products)
# Linear regression on the stacked total
y_stack = np.cumsum(y, axis=0)
p = np.poly1d(np.polyfit(x, y_stack[-1, :], 1))
slope = p[1]  # coefficient of x, i.e. the slope of the trend line
print p[0]  # intercept of the trend line
plt.plot(x, p(x), '--k')
plt.xlabel('Date', fontsize='x-large')
plt.ylabel('Downloads', fontsize='x-large')
plt.title('Number of downloads (%s)'%name, fontsize='xx-large')
if name == 'Daily':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s/%s/%s'%(dates[int(i)][6:], dates[int(i)][4:6], dates[int(i)][:4]) for i in xt[:-1]], rotation=30, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dK'%(i/1000) for i in yt], fontsize='large')
plt.annotate('Slope = %0.2f'%slope,
xy=(500, 20000),
xytext=(5, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
# plt.xlim(0, len(dates)-1)
# plt.ylim(0, max(y_stack[-1, :]))
plt.axis('tight')
elif name == 'Weekly':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s (Week %s)'%(dates[int(i)][:4], dates[int(i)][4:]) for i in xt[:-1]], rotation=30, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dK'%(i/1000) for i in yt], fontsize='large')
plt.annotate('Slope = %0.2f'%slope,
xy=(dates.index(week_name(datetime.date(year=2011, month=8, day=1))), 110000),
xytext=(-5, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 11.3',
xy=(dates.index(week_name(datetime.date(year=2010, month=7, day=13))), 460000),
xytext=(5, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 11.4',
xy=(dates.index(week_name(datetime.date(year=2011, month=3, day=10))), 250000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.1',
xy=(dates.index(week_name(datetime.date(year=2011, month=11, day=16))), 360000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('Open Build Service',
xy=(dates.index(week_name(datetime.date(year=2012, month=5, day=27))), 320000),
xytext=(-80, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.2',
xy=(dates.index(week_name(datetime.date(year=2012, month=9, day=5))), 230000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.3',
xy=(dates.index(week_name(datetime.date(year=2013, month=3, day=13))), 220000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
# plt.annotate('Hackweek 9',
# xy=(dates.index(week_name(datetime.date(year=2013, month=5, day=5))), 240000),
# xytext=(-40, 30), textcoords='offset points',
# arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'))
plt.xlim(0, len(dates)-1)
plt.ylim(0, max(y_stack[-1, :])+200000)
elif name == 'Monthly':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s %s'%(MONTH[dates[int(i)][4:]], dates[int(i)][:4]) for i in xt[:-1]], rotation=30, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dK'%(i/1000) for i in yt], fontsize='large')
plt.annotate('Slope = %0.2f'%slope,
xy=(17, 480000),
xytext=(-25, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 11.3', xy=(6, 920000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 11.4', xy=(14, 680000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.1', xy=(22, 810000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('OBS / openSUSE 12.2 Delay ann. (?)', xy=(28, 820000),
xytext=(-80, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.2', xy=(32, 590000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.3', xy=(38, 620000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('osC10', xy=(9, 420000),
xytext=(-5, 30), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('osC11', xy=(20, 330000),
xytext=(-25, 30), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('osC12', xy=(33, 490000),
xytext=(-5, 30), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.xlim(0, len(dates)-1)
plt.ylim(0, max(y_stack[-1, :])+300000)
xl, yl = plt.xlim()[1], plt.ylim()[1]
#plt.axes().set_aspect(0.7*xl/yl)
#plt.savefig('%s.png'%name, dpi=250, bbox_inches='tight')
#plt.tight_layout()
plt.show()
# plt.close()
# f = open('table-count-partial.txt', 'w')
# for i, date in enumerate(dates):
# print >>f, '"{} {}"'.format(date[:4], m[date[4:]]), i,
# for product in main_products:
# print >>f, plot_table[date, product],
# print >>f
# f.close()
In [66]:
#
# Process UUIDs
#
def compact(dict_):
return {
'openSUSE-%s'%k if not k.lower().startswith('opensuse') else k: len(v)
for k, v in dict_.iteritems() if v != set([None])
}
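# Illustrative call with made-up UUID sets: compact() normalizes the
# product key and replaces each set of UUIDs by its cardinality, dropping
# entries whose only member is None (order of the result may differ).
print compact({'12.3': set(['u1', 'u2']),
               'openSUSE-12.2': set(['u3']),
               'Factory': set([None])})
# -> {'openSUSE-12.3': 2, 'openSUSE-12.2': 1}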
day_table, week_table, month_table = {}, {}, {}
acc_week, cur_week, prev_week = None, None, None
acc_month, cur_month, prev_month = None, None, None
for dname in sorted(glob.glob(os.path.join(PATH, '*_uuid_day.pkl'))):
year, month, day = (int(x) for x in DATE.match(dname).groups())
date_ = datetime.date(year, month, day)
acc_day, cur_day = cPickle.load(open(dname, 'rb')), day_name(date_)
acc_week, prev_week, cur_week = acc_week if cur_week == week_name(date_) else {}, cur_week, week_name(date_)
acc_month, prev_month, cur_month = acc_month if cur_month == month_name(date_) else {}, cur_month, month_name(date_)
day_table[cur_day], week_table[cur_week], month_table[cur_month] = acc_day, acc_week, acc_month
accumulate(acc_week, acc_day)
accumulate(acc_month, acc_day)
day_table[cur_day] = compact(day_table[cur_day])
if prev_week and prev_week != cur_week:
week_table[prev_week] = compact(week_table[prev_week])
if prev_month and prev_month != cur_month:
month_table[prev_month] = compact(month_table[prev_month])
week_table[cur_week] = compact(week_table[cur_week])
month_table[cur_month] = compact(month_table[cur_month])
In [67]:
#
# Detect the names of the products (UUIDs)
#
# acc_full is a waste of memory, use month_table instead to get
# an approximation
acc_full = {}
for d in month_table:
accumulate(acc_full, month_table[d])
product_names = acc_full.keys()
main_products = sorted(p for p in product_names if acc_full[p] > 10)
others = set(product_names) - set(main_products)
product_groups = { p: '-'.join(p.split('-')[:2]) for p in main_products
if ('.' in p.split('-')[1] or 'factory' == p.split('-')[1]) and '10' not in p.split('-')[1] }
product_groups['openSUSE-11.5'] = 'openSUSE-12.1'  # fold the 11.5 (pre-12.1 development) entries into 12.1
main_products = sorted(set(product_groups.values()))
day_table_ = collapse(day_table, product_groups)
week_table_ = collapse(week_table, product_groups)
month_table_ = collapse(month_table, product_groups)
main_products.insert(0, 'other')
print 'Main products:', main_products
In [68]:
#
# Draw UUID dataset
#
cmap = mpl.cm.jet
norm = mpl.colors.Normalize(vmin=0, vmax=len(main_products))
smap = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
for name, dataset in (#('Daily', day_table_),):
#('Weekly', week_table_),):
('Monthly', month_table_),):
dates = sorted(dataset)[:-1]
x = np.arange(len(dates))
y = np.row_stack([row(dataset, dates, p) for p in main_products])
colors = [smap.to_rgba(i) for i in range(len(main_products))]
plt.stackplot(x, y, colors=colors, edgecolor='none', alpha=0.7)
legend_rec = [Rectangle((0, 0), 1, 1, fc=c) for c in colors]
plt.legend(legend_rec, main_products)
# Linear regression
y_stack = np.cumsum(y, axis=0)
p = np.poly1d(np.polyfit(x, y_stack[-1, :], 1))
slope = p[1]
print p[1], p[0]  # slope, intercept
plt.plot(x, p(x), '--k')
plt.xlabel('Date', fontsize='x-large')
plt.ylabel('UUIDs', fontsize='x-large')
plt.title('Number of UUIDs (%s)'%name, fontsize='xx-large')
if name == 'Daily':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s/%s/%s'%(dates[int(i)][6:], dates[int(i)][4:6], dates[int(i)][:4]) for i in xt[:-1]], rotation=30, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dK'%(i/1000) for i in yt], fontsize='large')
plt.annotate('Slope = %0.2f'%slope,
xy=(550, 99000),
xytext=(5, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.xlim(0, len(dates)-1)
plt.ylim(0, max(y_stack[-1, :])+80000)
#plt.axis('tight')
elif name == 'Weekly':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s (Week %s)'%(dates[int(i)][:4], dates[int(i)][4:]) for i in xt[:-1]], rotation=30, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dK'%(i/1000) for i in yt], fontsize='large')
plt.annotate('Slope = %0.2f'%slope,
xy=(dates.index(week_name(datetime.date(year=2011, month=7, day=1))), 220000),
xytext=(-5, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 11.3',
xy=(dates.index(week_name(datetime.date(year=2010, month=7, day=13))), 216000),
xytext=(5, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 11.4',
xy=(dates.index(week_name(datetime.date(year=2011, month=3, day=10))), 250000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.1',
xy=(dates.index(week_name(datetime.date(year=2011, month=11, day=16))), 260000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.2',
xy=(dates.index(week_name(datetime.date(year=2012, month=9, day=5))), 230000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.3',
xy=(dates.index(week_name(datetime.date(year=2013, month=3, day=13))), 260000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.xlim(0, len(dates)-1)
plt.ylim(0, max(y_stack[-1, :])+200000)
elif name == 'Monthly':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s %s'%(MONTH[dates[int(i)][4:]], dates[int(i)][:4]) for i in xt[:-1]], rotation=30, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dK'%(i/1000) for i in yt], fontsize='large')
plt.annotate('Slope = %0.2f'%slope,
xy=(18, 420000),
xytext=(-25, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 11.3', xy=(6, 430000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 11.4', xy=(14, 510000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.1', xy=(22, 480000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.2', xy=(32, 430000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.3', xy=(38, 460000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
#plt.annotate('osC10', xy=(9, 420000),
# xytext=(-5, 30), textcoords='offset points',
# arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'))
#plt.annotate('osC11', xy=(20, 330000),
# xytext=(-25, 30), textcoords='offset points',
# arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'))
#plt.annotate('osC12', xy=(33, 490000),
# xytext=(-5, 30), textcoords='offset points',
# arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'))
plt.xlim(0, len(dates)-1)
plt.ylim(0, max(y_stack[-1, :])+300000)
plt.show()
In [26]:
#
# Process medium
#
day_table, week_table, month_table = {}, {}, {}
acc_week, acc_month = None, None
cur_week, cur_month = None, None
acc_full = {}
for dname in sorted(glob.glob(os.path.join(PATH, '*_medium_day.pkl'))):
year, month, day = (int(x) for x in DATE.match(dname).groups())
date_ = datetime.date(year, month, day)
acc_day, cur_day = cPickle.load(open(dname, 'rb')), day_name(date_)
acc_week, cur_week = acc_week if cur_week == week_name(date_) else {}, week_name(date_)
acc_month, cur_month = acc_month if cur_month == month_name(date_) else {}, month_name(date_)
day_table[cur_day], week_table[cur_week], month_table[cur_month] = acc_day, acc_week, acc_month
accumulate(acc_week, acc_day)
accumulate(acc_month, acc_day)
accumulate(acc_full, acc_day)
In [27]:
#
# Detect the names of the media
#
product_names = acc_full.keys()
main_products = sorted(p for p in product_names if acc_full[p] > 100)
others = set(product_names) - set(main_products)
# Create the map manually
product_groups = {
'DVD': 'dvd',
'EC2': 'EC2',
'MINI': 'mini',
'mini': 'mini',
'biarch': 'dvd-biarch',
'cd': 'cd',
'dvd': 'dvd',
'dvd, dvd': 'dvd',
'dvd-biarch': 'dvd-biarch',
'dvd-promo': 'dvd-promo',
'promodvd': 'dvd-promo',
'ftp': 'ftp',
'livecd-gnome': 'livecd-gnome',
'livetree-gnome': 'livecd-gnome',
'usb-gnome': 'livecd-gnome',
'livecd-kde': 'livecd-kde',
'livecd-kde3': 'livecd-kde',
'livetree-kde': 'livecd-kde',
'usb-kde': 'livecd-kde',
}
main_products = sorted(set(product_groups.values()))
medium_day_table = collapse(day_table, product_groups)
medium_week_table = collapse(week_table, product_groups)
medium_month_table = collapse(month_table, product_groups)
main_products.insert(0, 'other')
medium_main_products = main_products
print 'Main products:', main_products
In [28]:
#
# Process arch
#
day_table, week_table, month_table = {}, {}, {}
acc_week, acc_month = None, None
cur_week, cur_month = None, None
acc_full = {}
for dname in sorted(glob.glob(os.path.join(PATH, '*_arch_day.pkl'))):
year, month, day = (int(x) for x in DATE.match(dname).groups())
date_ = datetime.date(year, month, day)
acc_day, cur_day = cPickle.load(open(dname, 'rb')), day_name(date_)
acc_week, cur_week = acc_week if cur_week == week_name(date_) else {}, week_name(date_)
acc_month, cur_month = acc_month if cur_month == month_name(date_) else {}, month_name(date_)
day_table[cur_day], week_table[cur_week], month_table[cur_month] = acc_day, acc_week, acc_month
accumulate(acc_week, acc_day)
accumulate(acc_month, acc_day)
accumulate(acc_full, acc_day)
In [29]:
#
# Detect different archs
#
product_names = acc_full.keys()
main_products = sorted(p for p in product_names if acc_full[p] > 100)
others = set(product_names) - set(main_products)
product_groups = {
# 'armv5tel': 'arm',
# 'armv7hl': 'arm',
'i586': 'i586',
'i686': 'i586',
# 'ia64': 'ia64',
# 'noarch': 'noarch',
# 'ppc': 'ppc',
# 'ppc64': 'ppc',
'x86_64': 'x86_64'
}
for arch in ('i586', 'x86_64'):
product_groups.update({ p: arch for p in main_products if arch in p })
main_products = sorted(set(product_groups.values()))
arch_day_table = collapse(day_table, product_groups, other=None)
arch_week_table = collapse(week_table, product_groups, other=None)
arch_month_table = collapse(month_table, product_groups, other=None)
arch_main_products = main_products
print 'Main products:', main_products
In [34]:
#
# Draw subplots Medium and Arch
#
cmap = mpl.cm.jet
norm = mpl.colors.Normalize(vmin=0, vmax=len(medium_main_products))
smap_medium = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
norm = mpl.colors.Normalize(vmin=0, vmax=len(arch_main_products))
smap_arch = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
for name, dataset_medium, dataset_arch in (#('Daily', medium_day_table, arch_day_table),):
('Weekly', medium_week_table, arch_week_table),):
#('Monthly', medium_month_table, arch_month_table),):
dates = sorted(dataset_medium)[:-1]
x = np.arange(len(dates))
y_medium = np.row_stack([row(dataset_medium, dates, p) for p in medium_main_products])
y_arch = np.row_stack([row(dataset_arch, dates, p) for p in arch_main_products])
colors_medium = [smap_medium.to_rgba(i) for i in range(len(medium_main_products))]
colors_arch = [smap_arch.to_rgba(i) for i in range(len(arch_main_products))]
plt.subplot(2, 1, 1)
plt.stackplot(x, y_medium, colors=colors_medium, edgecolor='none', alpha=0.7)
legend_rec = [Rectangle((0, 0), 1, 1, fc=c) for c in colors_medium]
plt.legend(legend_rec, medium_main_products, loc='upper left')
if name == 'Daily':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s/%s/%s'%(dates[int(i)][6:], dates[int(i)][4:6], dates[int(i)][:4]) for i in xt[:-1]], rotation=15, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dM'%(i/1000000) for i in yt], fontsize='large')
elif name == 'Weekly':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s (Week %s)'%(dates[int(i)][:4], dates[int(i)][4:]) for i in xt[:-1]], rotation=15, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dM'%(i/1000000) for i in yt], fontsize='large')
elif name == 'Monthly':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s %s'%(MONTH[dates[int(i)][4:]], dates[int(i)][:4]) for i in xt[:-1]], rotation=15, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dM'%(i/1000000) for i in yt], fontsize='x-large')
plt.xlabel('Date', fontsize='x-large')
plt.ylabel('Hits', fontsize='x-large')
plt.title('Hits by medium (%s)'%name, fontsize='xx-large')
plt.axis('tight')
plt.subplot(2, 1, 2)
plt.stackplot(x, y_arch, colors=colors_arch, edgecolor='none', alpha=0.7)
legend_rec = [Rectangle((0, 0), 1, 1, fc=c) for c in colors_arch]
plt.legend(legend_rec, arch_main_products)
if name == 'Daily':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s/%s/%s'%(dates[int(i)][6:], dates[int(i)][4:6], dates[int(i)][:4]) for i in xt[:-1]], rotation=15, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dM'%(i/1000000) for i in yt], fontsize='large')
elif name == 'Weekly':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s (Week %s)'%(dates[int(i)][:4], dates[int(i)][4:]) for i in xt[:-1]], rotation=15, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dM'%(i/1000000) for i in yt], fontsize='large')
elif name == 'Monthly':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s %s'%(MONTH[dates[int(i)][4:]], dates[int(i)][:4]) for i in xt[:-1]], rotation=15, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dM'%(i/1000000) for i in yt], fontsize='large')
plt.xlabel('Date', fontsize='x-large')
plt.ylabel('Hits', fontsize='x-large')
plt.title('Hits by architecture (%s)'%name, fontsize='xx-large')
plt.axis('tight')
plt.show()
In [ ]:
#
# Process IPs - WIP
#
day_table, week_table, month_table = {}, {}, {}
acc_week, cur_week, prev_week = None, None, None
acc_month, cur_month, prev_month = None, None, None
for dname in sorted(glob.glob(os.path.join(PATH, '*_ip_day.pkl'))):
year, month, day = (int(x) for x in DATE.match(dname).groups())
date_ = datetime.date(year, month, day)
acc_day, cur_day = cPickle.load(open(dname, 'rb')), day_name(date_)
acc_week, prev_week, cur_week = acc_week if cur_week == week_name(date_) else {}, cur_week, week_name(date_)
acc_month, prev_month, cur_month = acc_month if cur_month == month_name(date_) else {}, cur_month, month_name(date_)
day_table[cur_day], week_table[cur_week], month_table[cur_month] = acc_day, acc_week, acc_month
accumulate(acc_week, acc_day)
accumulate(acc_month, acc_day)
day_table[cur_day] = compact(day_table[cur_day])
if prev_week and prev_week != cur_week:
week_table[prev_week] = compact(week_table[prev_week])
if prev_month and prev_month != cur_month:
month_table[prev_month] = compact(month_table[prev_month])
week_table[cur_week] = compact(week_table[cur_week])
month_table[cur_month] = compact(month_table[cur_month])
In [10]:
#
# Detect the names of the products (IPs) - WIP
#
# acc_full is a waste of memory, use month_table instead to get
# an approximation
acc_full = {}
for d in month_table:
accumulate(acc_full, month_table[d])
product_names = acc_full.keys()
main_products = sorted(p for p in product_names if acc_full[p] > 100)
others = set(product_names) - set(main_products)
product_groups = { p: '-'.join(p.split('-')[:2]) for p in main_products
if ('.' in p.split('-')[1] or 'factory' == p.split('-')[1]) and '10' not in p.split('-')[1] }
product_groups['openSUSE-11.5'] = 'openSUSE-12.1'  # fold the 11.5 (pre-12.1 development) entries into 12.1
main_products = sorted(set(product_groups.values()))
# Save the grouping for later
product_groups_for_ips = product_groups
day_table_ = collapse(day_table, product_groups)
# week_table_ = collapse(week_table, product_groups)
# month_table_ = collapse(month_table, product_groups)
main_products.insert(0, 'other')
print 'Main products:', main_products
In [87]:
#
# Read Fedora data for IPs
#
import csv
FEDORA = '/suse/aplanas/Documents/01-Downloads_Tracker/dvd-1.0/fedora_ips.csv'
yum_table, last_version, table = {}, None, None
for line in csv.reader(open(FEDORA, 'r')):
if not line[0]:
continue
if line[0].lower().startswith('fedora'):
last_version = line[0]
table = []
yum_table[last_version] = table
continue
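# Remaining rows are assumed to be: label, 'YYYY-MM-DD -- YYYY-MM-DD'
# interval, new IPs in the interval, accumulated IPs; the counts use '.'
# as thousands separator, hence the replace() below.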
table.append((line[0],
datetime.datetime.strptime(line[1].split('--')[0].strip(), '%Y-%m-%d'),
datetime.datetime.strptime(line[1].split('--')[1].strip(), '%Y-%m-%d'),
int(line[2].replace('.', '')),
int(line[3].replace('.', ''))))
# Fix Fedora 15: its first two rows apparently cover 4- and 2-week
# intervals, so split them into evenly distributed per-week rows.
f15 = yum_table['Fedora 15']
line = f15[0]
del f15[0]
nips = line[3] / 4
nips_acc = line[4]
lines = [['Week %d'%(i+1), line[1]+datetime.timedelta(days=7*i), line[1]+datetime.timedelta(days=7*(i+1)), nips, nips*(i+1)] for i in range(4)]
line = f15[0]
del f15[0]
nips = line[3] / 2
lines2 = [['Week %d'%(i+5), line[1]+datetime.timedelta(days=7*i), line[1]+datetime.timedelta(days=7*(i+1)), nips, nips_acc+nips*(i+1)] for i in range(2)]
import itertools
yum_table['Fedora 15'] = list(itertools.chain(lines, lines2, f15))
In [88]:
#
# Count the new IPs per week/interval for openSUSE, following the same approach used by Fedora:
# http://fedoraproject.org/wiki/Statistics
# http://fedoraproject.org/wiki/Statistics/Commands
#
import operator
import socket
def opensuse_day(day, products):
def fix(ver):
return 'openSUSE-%s'%ver if not ver.lower().startswith('opensuse') else ver
try:
d = cPickle.load(open(os.path.join(PATH, '%04d%02d%02d_ip_day.pkl'%(day.year, day.month, day.day)), 'rb'))
except Exception:  # missing or unreadable daily pickle
print 'Error reading', day
d = {}
return { fix(k): v for k, v in d.iteritems() if fix(k) in products }
timeline = (
(datetime.date(year=2010, month=7, day=15), 'openSUSE-11.3'),
(datetime.date(year=2011, month=3, day=10), 'openSUSE-11.4'),
(datetime.date(year=2011, month=11, day=16), 'openSUSE-12.1'),
(datetime.date(year=2012, month=9, day=5), 'openSUSE-12.2'),
(datetime.date(year=2013, month=3, day=13), 'openSUSE-12.3'),
)
from_, to = timeline[0][0], datetime.date.today()
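# Walk every day from the 11.3 release until today; for each day load that
# day's *_ip_day.pkl and keep only the openSUSE version that was current
# on that date according to the timeline above.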
data = (opensuse_day(day, set(([p for d, p in timeline if d <= day][-1],))) for day in (from_+datetime.timedelta(days=d) for d in range((to-from_).days+1)))
acc_ips = defaultdict(set)
count_ips = defaultdict(list)
for d in data:
for product, ips in d.iteritems():
prev = len(acc_ips[product])
# Pack IPv4 strings into 4 bytes to save memory; skip the one IPv6 address
# that shows up in the logs, which inet_aton cannot parse.
acc_ips[product].update(socket.inet_aton(ip) for ip in ips if ip != '2001:720:c1c:1200:3ed9:2bff:fe61:117a')
count_ips[product].append(len(acc_ips[product]) - prev)
# Free memory
acc_ips = None
In [89]:
# Group count_ips by weeks
zypper_table = {}
for product in count_ips:
label_data = ('Week-%s'%(i+1) for i in range(len(count_ips[product])))
week_data = map(sum, zip(*([iter(count_ips[product])]*7)))
acc_week_data = np.cumsum(week_data)
from_ = [t[0] for t in timeline if t[1]==product][0]
dates = [from_+datetime.timedelta(days=i) for i in range(len(count_ips[product]))]
from_date = (d[0] for d in zip(*([iter(dates)]*7)))
to_date = (d[-1] for d in zip(*([iter(dates)]*7)))
zypper_table[product] = zip(label_data, from_date, to_date, week_data, acc_week_data)
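# The zip(*([iter(seq)]*7)) idiom above chunks a flat daily sequence into
# consecutive groups of 7, dropping a trailing partial week. A minimal
# sketch with dummy numbers:
_days = range(1, 17)                     # 16 daily counts
print map(sum, zip(*([iter(_days)]*7)))  # -> [28, 77]; days 15-16 dropped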
In [90]:
#
# Draw timeframes with rectangles
def tox(d):
from_ = datetime.date(year=2010, month=1, day=1)
return (d-from_).days
from_ = datetime.date(year=2010, month=1, day=1)
to = datetime.date(year=2013, month=7, day=2)
days = (to - from_).days
plt.xlim(0, days)
plt.ylim(0, 3)
plt.xlabel('Date', fontsize='x-large')
plt.title('Releases openSUSE / Fedora', fontsize='xx-large')
xt,_ = plt.xticks()
plt.xticks(xt[:-1], [from_+datetime.timedelta(days=i) for i in xt[:-1]], rotation=30, fontsize='large')
plt.yticks([1, 2], ['Fedora', 'openSUSE'], fontsize='large')
cmap = mpl.cm.jet
norm = mpl.colors.Normalize(vmin=0, vmax=len(zypper_table))
smap = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
#font = "sans-serif"
for i, product in enumerate(zypper_table):
plt.axhspan(ymin=2-0.3, ymax=2+0.3,
xmin=tox(zypper_table[product][0][1])/float(days), xmax=tox(zypper_table[product][-1][2])/float(days),
facecolor=smap.to_rgba(i), alpha=0.5)
plt.text(tox(zypper_table[product][0][1]) + (tox(zypper_table[product][-1][2]) - tox(zypper_table[product][0][1]))/2, 2, product, ha='center', fontsize='large')#, family=font, size=14)
cmap = mpl.cm.summer
norm = mpl.colors.Normalize(vmin=0, vmax=len(yum_table))
smap = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
for i, product in enumerate(yum_table):
plt.axhspan(ymin=1-0.3, ymax=1+0.3,
xmin=tox(yum_table[product][0][1].date())/float(days), xmax=tox(yum_table[product][-1][2].date())/float(days),
facecolor=smap.to_rgba(i), alpha=0.5)
plt.text(tox(yum_table[product][0][1].date()) + (tox(yum_table[product][-1][2].date()) - tox(yum_table[product][0][1].date()))/2, 1, product, ha='center', fontsize='large')#, family=font, size=14)
plt.show()
In [91]:
#
# Draw IP datasets (openSUSE and Fedora)
#
from mpl_toolkits.mplot3d import Axes3D
cmap = mpl.cm.jet
norm = mpl.colors.Normalize(vmin=0, vmax=len(zypper_table))
smap = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
fig = plt.figure()
ax = fig.add_subplot(121, projection='3d')
for color, z, label in zip((smap.to_rgba(i) for i in range(len(zypper_table))),
range(len(zypper_table)),
sorted(zypper_table)):
xs = np.arange(min(len(zypper_table[label]), 25))
ys = np.array([l[3] for l in zypper_table[label]][:25])
ax.bar(xs, ys, zs=z, zdir='y', color=color, alpha=0.8)
ax.set_xlabel('Date', fontsize='large')
ax.set_ylabel('Distribution', fontsize='large')
ax.set_zlabel('IPs', fontsize='large')
ax.set_title('New IPs in openSUSE (Zypper) Repository')
ax.set_zlim(0, 250000)
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['Week %s'%int(i) for i in xt[:-1]])
plt.yticks(np.arange(len(zypper_table)), sorted(zypper_table))
zt = ax.get_zticks()
ax.set_zticklabels(['%dK'%(i/1000) for i in zt])
cmap = mpl.cm.summer
norm = mpl.colors.Normalize(vmin=0, vmax=len(yum_table))
smap = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
ax = fig.add_subplot(122, projection='3d')
for color, z, label in zip((smap.to_rgba(i) for i in range(len(yum_table))),
range(len(yum_table)),
sorted(yum_table)):
xs = np.arange(min(len(yum_table[label]), 25))
ys = np.array([l[3] for l in yum_table[label]][:25])
ax.bar(xs, ys, zs=z, zdir='y', color=color, alpha=0.8)
ax.set_xlabel('Date', fontsize='large')
ax.set_ylabel('Distribution', fontsize='large')
ax.set_zlabel('IPs', fontsize='large')
ax.set_title('New IPs in Fedora (Yum) Repository')
ax.set_zlim(0, 250000)
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['Week %s'%int(i) for i in xt[:-1]])
plt.yticks(np.arange(len(yum_table)), sorted(yum_table))
zt = ax.get_zticks()
ax.set_zticklabels(['%dK'%(i/1000) for i in zt])
plt.show()
In [92]:
#
# Downloads grouped by IP, using Fedora metric
#
FEDORA_DOWNLOADS = '/suse/aplanas/Documents/01-Downloads_Tracker/dvd-1.0/fedora_downloads.csv'
fedora_download_table, last_version, table = {}, None, None
for line in csv.reader(open(FEDORA_DOWNLOADS, 'r')):
if not line[0]:
continue
if line[0].lower().startswith('fedora'):
last_version = line[0]
table = []
fedora_download_table[last_version] = table
continue
table.append((line[0],
datetime.datetime.strptime(line[1].split('--')[0].strip(), '%Y-%m-%d'),
datetime.datetime.strptime(line[1].split('--')[1].strip(), '%Y-%m-%d'),
int(line[2].replace('.', '')),
int(line[3].replace('.', ''))))
In [93]:
#
# Read the downloads grouped by IP from openSUSE
#
def curate_iso_set(downloads):
"""Extract the name of the ISO"""
cured = defaultdict(set)
for key, value in downloads.iteritems():
newkey = ISO.findall(key)
if newkey:
newkey = '-'.join(newkey[0].split('-')[:2])
cured[newkey].update(value)
return cured
def opensuse_downloads_day(day, products):
try:
d = curate_iso_set(cPickle.load(open(os.path.join(PATH, '%04d%02d%02d_download_ip_day.pkl'%(day.year, day.month, day.day)), 'rb')))
except Exception:  # missing or unreadable daily pickle
print 'Error reading', day
d = {}
return { k: v for k, v in d.iteritems() if k in products }
timeline = (
(datetime.date(year=2010, month=7, day=15), 'openSUSE-11.3'),
(datetime.date(year=2011, month=3, day=10), 'openSUSE-11.4'),
(datetime.date(year=2011, month=11, day=16), 'openSUSE-12.1'),
(datetime.date(year=2012, month=9, day=5), 'openSUSE-12.2'),
(datetime.date(year=2013, month=3, day=13), 'openSUSE-12.3'),
)
from_, to = timeline[0][0], datetime.date.today()
data = (opensuse_downloads_day(day, set(([p for d, p in timeline if d <= day][-1],))) for day in (from_+datetime.timedelta(days=d) for d in range((to-from_).days+1)))
count_dips = defaultdict(list)
for d in data:
for product, ips in d.iteritems():
count_dips[product].append(len(ips))
In [94]:
# Group count_dips by weeks
opensuse_download_table = {}
for product in count_dips:
label_data = ('Week-%s'%(i+1) for i in range(len(count_dips[product])))
week_data = map(sum, zip(*([iter(count_dips[product])]*7)))
acc_week_data = np.cumsum(week_data)
from_ = [t[0] for t in timeline if t[1]==product][0]
dates = [from_+datetime.timedelta(days=i) for i in range(len(count_dips[product]))]
from_date = (d[0] for d in zip(*([iter(dates)]*7)))
to_date = (d[-1] for d in zip(*([iter(dates)]*7)))
opensuse_download_table[product] = zip(label_data, from_date, to_date, week_data, acc_week_data)
In [95]:
#
# Draw download IP datasets (openSUSE and Fedora)
#
from mpl_toolkits.mplot3d import Axes3D
cmap = mpl.cm.jet
norm = mpl.colors.Normalize(vmin=0, vmax=len(opensuse_download_table))
smap = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
fig = plt.figure()
ax = fig.add_subplot(121, projection='3d')
for color, z, label in zip((smap.to_rgba(i) for i in range(len(opensuse_download_table))),
range(len(opensuse_download_table)),
sorted(opensuse_download_table)):
xs = np.arange(min(len(opensuse_download_table[label]), 25))
ys = np.array([l[3] for l in opensuse_download_table[label]][:25])
ax.bar(xs, ys, zs=z, zdir='y', color=color, alpha=0.8)
ax.set_xlabel('Date', fontsize='large')
ax.set_ylabel('Distribution', fontsize='large')
ax.set_zlabel('Downloads', fontsize='large')
ax.set_title('openSUSE downloads')
ax.set_zlim(0, 250000)
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['Week %s'%int(i) for i in xt[:-1]])
plt.yticks(np.arange(len(opensuse_download_table)), sorted(opensuse_download_table))
zt = ax.get_zticks()
ax.set_zticklabels(['%dK'%(i/1000) for i in zt])
cmap = mpl.cm.summer
norm = mpl.colors.Normalize(vmin=0, vmax=len(yum_table))
smap = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
ax = fig.add_subplot(122, projection='3d')
for color, z, label in zip((smap.to_rgba(i) for i in range(len(fedora_download_table))),
range(len(fedora_download_table)),
sorted(fedora_download_table)):
xs = np.arange(min(len(fedora_download_table[label]), 25))
ys = np.array([l[3] for l in fedora_download_table[label]][:25])
ax.bar(xs, ys, zs=z, zdir='y', color=color, alpha=0.8)
ax.set_xlabel('Date', fontsize='large')
ax.set_ylabel('Distribution', fontsize='large')
ax.set_zlabel('Downloads', fontsize='large')
ax.set_title('Fedora downloads')
ax.set_zlim(0, 250000)
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['Week %s'%int(i) for i in xt[:-1]])
plt.yticks(np.arange(len(fedora_download_table)), sorted(fedora_download_table))
zt = ax.get_zticks()
ax.set_zticklabels(['%dK'%(i/1000) for i in zt])
plt.show()
In [86]:
import itertools
factory = [int(i) for i in open('factory', 'r')]
tumbleweed = [int(i) for i in open('tumbleweed', 'r')]
plt.plot(np.arange(len(factory)), factory, 'r', np.arange(len(tumbleweed)), tumbleweed, 'b', lw=2)
plt.title('Factory - Tumbleweed', fontsize='xx-large')
plt.xlabel('Date', fontsize='x-large')
plt.ylabel('UUIDs', fontsize='x-large')
plt.legend(['Factory', 'Tumbleweed'])
dates = ['%s %s'%(m, y) for y in [2010, 2011, 2012, 2013] for m in ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')]
xt,_ = plt.xticks()
plt.xticks(xt, [dates[int(i)] for i in xt], rotation=30, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, [int(i) for i in yt], fontsize='large')
plt.grid()
plt.axis('tight')
plt.show()
In [106]:
import csv
with open('SocialMediaMetrics.csv', 'rb') as csvfile:
reader = csv.reader(csvfile)
dates = reader.next()
data_suse = {}
data_fedora = {}
labels_suse = []
labels_fedora = []
for row in reader:
if 'Fedora' in row[0]:
labels_fedora.append(row[0])
data_fedora[row[0]] = [int(i) if i else 0 for i in row[1:]]
else:
labels_suse.append(row[0])
data_suse[row[0]] = [int(i) if i else 0 for i in row[1:]]
plt.subplot(211)
cmap = mpl.cm.jet
norm = mpl.colors.Normalize(vmin=0, vmax=len(labels_suse))
smap = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
colors = {l: smap.to_rgba(i) for i,l in enumerate(labels_suse)}
bottom = [0] * len(dates)
for l in labels_suse:
plt.bar(np.arange(len(dates)), data_suse[l], bottom=bottom, label=l, color=colors[l])
bottom = [bottom[i] + data_suse[l][i] for i in range(len(dates))]
plt.title('openSUSE Social Media Direct Reach', fontsize='xx-large')
plt.xlabel('Date', fontsize='x-large')
plt.ylabel('Followers', fontsize='x-large')
plt.legend(loc='upper left')
xt,_ = plt.xticks()
plt.xticks(xt[:-1], [dates[int(i)] for i in xt[:-1]], rotation=30, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, [int(i) for i in yt], fontsize='large')
plt.grid()
plt.subplot(212)
cmap = mpl.cm.winter
norm = mpl.colors.Normalize(vmin=0, vmax=len(labels_fedora))
smap = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
colors = {l: smap.to_rgba(i) for i,l in enumerate(labels_fedora)}
bottom = [0] * len(dates)
for l in labels_fedora:
plt.bar(np.arange(len(dates)), data_fedora[l], bottom=bottom, label=l, color=colors[l])
bottom = [bottom[i] + data_fedora[l][i] for i in range(len(dates))]
plt.title('Fedora Social Media Direct Reach', fontsize='xx-large')
plt.xlabel('Date', fontsize='x-large')
plt.ylabel('Followers', fontsize='x-large')
plt.legend(loc='upper left')
xt,_ = plt.xticks()
plt.xticks(xt[:-1], [dates[int(i)] for i in xt[:-1]], rotation=30, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, [int(i) for i in yt], fontsize='large')
plt.grid()
plt.show()
In [ ]: