In [59]:
from collections import defaultdict
import cPickle
import datetime
import glob
import os.path
import re
from analyze import PDict
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib.patches import Rectangle
import numpy as np
#
# Utility functions
#
DATE = re.compile(r'.*(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})_\w+_day.pkl')
PATH = '<PATH FOR RESULTS>/datamining/dbenv/results'
ISO = re.compile(r'^.*/([^/]+.iso(?:torrent|metalink)?)(?:\?\d*)?$')
MONTH = {
'01': 'Jan',
'02': 'Feb',
'03': 'Mar',
'04': 'Apr',
'05': 'May',
'06': 'Jun',
'07': 'Jul',
'08': 'Aug',
'09': 'Sep',
'10': 'Oct',
'11': 'Nov',
'12': 'Dec',
}
def day_name(date_):
"""Name of the daily file. Return YYYYMMDD string."""
return '%04d%02d%02d'%(date_.year, date_.month, date_.day)
def week_name(date_):
"""Name the weekly file. Return YYYYWW string."""
# Group the date at the end of the week, so some dates can appears
# in a different year. For example, 32/12/2012 will appear as a
# 201301, the first week of 2013.
delta = datetime.timedelta(days=6-date_.weekday())
week_date = date_ + delta
return '%04d%02d'%(week_date.year, week_date.isocalendar()[1])
def month_name(date_):
"""Name of the monthly file. Return YYYYMM string."""
return '%04d%02d'%(date_.year, date_.month)
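# Quick illustrative check of the naming helpers (arbitrary sample date):
# 31/12/2012 is grouped into ISO week 1 of 2013, while its daily and
# monthly names stay in 2012.
_sample = datetime.date(2012, 12, 31)
print day_name(_sample), week_name(_sample), month_name(_sample)
# -> 20121231 201301 201212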
def accumulate(dict_a, dict_b):
"""Accumulate the counts or sets from dict_b in dict_a"""
for key, value in dict_b.iteritems():
if type(value) is int:
dict_a[key] = dict_a.get(key, 0) + value
elif type(value) is set:
dict_a[key] = dict_a.get(key, set()) | value
else:
raise TypeError('Value is neither int nor set.')
def collapse(table, products, other='other'):
"""Collapse each inner dict of table by mapping its keys through products.
Unmapped keys are counted under `other`; if `other` is falsy they are dropped."""
def _collapse(dict_):
d = defaultdict(int)
for (k, v) in dict_.iteritems():
p = products[k] if k in products else other
if p:
d[p] = d[p] + v
return d
return {k: _collapse(v) for (k, v) in table.iteritems()}
def row(table, dates, product):
"""Return the counts of `product` for each date in `dates` as a numpy array."""
return np.array([table[d][product] for d in dates])
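# Minimal sketch (toy data, hypothetical names) of how the helpers above
# compose: accumulate() merges daily dicts, collapse() maps raw keys to
# product groups (unmapped keys land in 'other'), and row() extracts one
# product's time series.
_acc = {}
accumulate(_acc, {'openSUSE-12.2-DVD-x86_64.iso': 3, 'uuids': set(['a'])})
accumulate(_acc, {'openSUSE-12.2-DVD-x86_64.iso': 2, 'uuids': set(['b'])})
print _acc  # counts are summed, sets are unioned (key order may differ)
_toy_table = {'20130101': {'openSUSE-12.2-DVD-x86_64.iso': 3, 'foo.iso': 1},
              '20130102': {'openSUSE-12.2-DVD-x86_64.iso': 2}}
_toy_groups = {'openSUSE-12.2-DVD-x86_64.iso': 'openSUSE-12.2'}
_toy = collapse(_toy_table, _toy_groups)
print row(_toy, sorted(_toy), 'openSUSE-12.2')  # -> [3 2]
print row(_toy, sorted(_toy), 'other')          # -> [1 0]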
In [60]:
#
# Process downloads
#
# ISO = re.compile(r'^.*/([^/]+.iso(?:torrent|metalink)?)(?:\?\d*)?$')
def curate_iso(downloads):
"""Extract the name of the ISO"""
cured = defaultdict(int)
for key, value in downloads.iteritems():
newkey = ISO.findall(key)
if newkey:
cured[newkey[0]] = cured[newkey[0]] + value
return cured
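# Illustrative call with made-up mirror URLs (the real keys come from the
# download logs); the ISO regexp keeps the file name and drops the '?NNN'
# mirror suffix, so both variants below are counted together.
_sample_downloads = {
    'http://example.org/distribution/12.3/iso/openSUSE-12.3-DVD-x86_64.iso?1234': 5,
    'http://example.org/distribution/12.3/iso/openSUSE-12.3-DVD-x86_64.iso': 2,
    'http://example.org/README.txt': 7,
}
print curate_iso(_sample_downloads)
# -> defaultdict(<type 'int'>, {'openSUSE-12.3-DVD-x86_64.iso': 7})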
day_table, week_table, month_table = {}, {}, {}
acc_week, acc_month = None, None
cur_week, cur_month = None, None
acc_full = {}
for dname in sorted(glob.glob(os.path.join(PATH, '*_download_day.pkl'))):
year, month, day = (int(x) for x in DATE.match(dname).groups())
date_ = datetime.date(year, month, day)
acc_day, cur_day = curate_iso(cPickle.load(open(dname, 'rb'))), day_name(date_)
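# acc_week / acc_month keep accumulating while the week / month name stays
# the same and are reset to a fresh dict on the first day of a new period;
# the *_table entries hold references to these dicts, so the accumulate()
# calls below keep updating the rows already stored in the tables.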
acc_week, cur_week = acc_week if cur_week == week_name(date_) else {}, week_name(date_)
acc_month, cur_month = acc_month if cur_month == month_name(date_) else {}, month_name(date_)
day_table[cur_day], week_table[cur_week], month_table[cur_month] = acc_day, acc_week, acc_month
accumulate(acc_week, acc_day)
accumulate(acc_month, acc_day)
accumulate(acc_full, acc_day)
In [61]:
#
# Detect the names of the products (downloads)
#
product_names = acc_full.keys()
main_products = sorted(p for p in product_names if acc_full[p] > 10)
others = set(product_names) - set(main_products)
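# Group ISO names by distribution and version, e.g.
# 'openSUSE-12.3-DVD-x86_64.iso' -> 'openSUSE-12.3'. Names whose second
# field has no dotted version, and the old 10.x series, are left out of
# the map and therefore end up in the 'other' bucket after collapse().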
product_groups = { p: '-'.join(p.split('-')[:2]) for p in main_products
if '.' in p.split('-')[1] and '10' not in p.split('-')[1] }
main_products = sorted(set(product_groups.values()))
day_table = collapse(day_table, product_groups)
week_table = collapse(week_table, product_groups)
month_table = collapse(month_table, product_groups)
main_products.insert(0, 'other')
print 'Main products:', main_products
# print 'Grouping', product_groups
In [65]:
#
# Draw download dataset
#
cmap = mpl.cm.jet
norm = mpl.colors.Normalize(vmin=0, vmax=len(main_products))
smap = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
for name, dataset in (#('Daily', day_table),):
#('Weekly', week_table),):
('Monthly', month_table),):
dates = sorted(dataset)[:-1]  # drop the last, still incomplete, period
x = np.arange(len(dates))
y = np.row_stack([row(dataset, dates, p) for p in main_products])
colors = [smap.to_rgba(i) for i in range(len(main_products))]
plt.stackplot(x, y, colors=colors, edgecolor='none', alpha=0.7)
legend_rec = [Rectangle((0, 0), 1, 1, fc=c) for c in colors]
plt.legend(legend_rec, main_products)
# Linear regression on the stacked total
y_stack = np.cumsum(y, axis=0)
p = np.poly1d(np.polyfit(x, y_stack[-1, :], 1))
slope = p[1]  # coefficient of x, i.e. the slope of the trend line
print p[0]  # intercept of the trend line
plt.plot(x, p(x), '--k')
plt.xlabel('Date', fontsize='x-large')
plt.ylabel('Downloads', fontsize='x-large')
plt.title('Number of downloads (%s)'%name, fontsize='xx-large')
if name == 'Daily':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s/%s/%s'%(dates[int(i)][6:], dates[int(i)][4:6], dates[int(i)][:4]) for i in xt[:-1]], rotation=30, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dK'%(i/1000) for i in yt], fontsize='large')
plt.annotate('Slope = %0.2f'%slope,
xy=(500, 20000),
xytext=(5, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
# plt.xlim(0, len(dates)-1)
# plt.ylim(0, max(y_stack[-1, :]))
plt.axis('tight')
elif name == 'Weekly':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s (Week %s)'%(dates[int(i)][:4], dates[int(i)][4:]) for i in xt[:-1]], rotation=30, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dK'%(i/1000) for i in yt], fontsize='large')
plt.annotate('Slope = %0.2f'%slope,
xy=(dates.index(week_name(datetime.date(year=2011, month=8, day=1))), 110000),
xytext=(-5, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 11.3',
xy=(dates.index(week_name(datetime.date(year=2010, month=7, day=13))), 460000),
xytext=(5, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 11.4',
xy=(dates.index(week_name(datetime.date(year=2011, month=3, day=10))), 250000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.1',
xy=(dates.index(week_name(datetime.date(year=2011, month=11, day=16))), 360000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('Open Build Service',
xy=(dates.index(week_name(datetime.date(year=2012, month=5, day=27))), 320000),
xytext=(-80, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.2',
xy=(dates.index(week_name(datetime.date(year=2012, month=9, day=5))), 230000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.3',
xy=(dates.index(week_name(datetime.date(year=2013, month=3, day=13))), 220000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
# plt.annotate('Hackweek 9',
# xy=(dates.index(week_name(datetime.date(year=2013, month=5, day=5))), 240000),
# xytext=(-40, 30), textcoords='offset points',
# arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'))
plt.xlim(0, len(dates)-1)
plt.ylim(0, max(y_stack[-1, :])+200000)
elif name == 'Monthly':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s %s'%(MONTH[dates[int(i)][4:]], dates[int(i)][:4]) for i in xt[:-1]], rotation=30, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dK'%(i/1000) for i in yt], fontsize='large')
plt.annotate('Slope = %0.2f'%slope,
xy=(17, 480000),
xytext=(-25, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 11.3', xy=(6, 920000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 11.4', xy=(14, 680000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.1', xy=(22, 810000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('OBS / openSUSE 12.2 Delay ann. (?)', xy=(28, 820000),
xytext=(-80, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.2', xy=(32, 590000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.3', xy=(38, 620000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('osC10', xy=(9, 420000),
xytext=(-5, 30), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('osC11', xy=(20, 330000),
xytext=(-25, 30), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('osC12', xy=(33, 490000),
xytext=(-5, 30), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.xlim(0, len(dates)-1)
plt.ylim(0, max(y_stack[-1, :])+300000)
xl, yl = plt.xlim()[1], plt.ylim()[1]
#plt.axes().set_aspect(0.7*xl/yl)
#plt.savefig('%s.png'%name, dpi=250, bbox_inches='tight')
#plt.tight_layout()
plt.show()
# plt.close()
# f = open('table-count-partial.txt', 'w')
# for i, date in enumerate(dates):
# print >>f, '"{} {}"'.format(date[:4], m[date[4:]]), i,
# for product in main_products:
# print >>f, plot_table[date, product],
# print >>f
# f.close()
In [66]:
#
# Process UUIDs
#
def compact(dict_):
return {
'openSUSE-%s'%k if not k.lower().startswith('opensuse') else k: len(v)
for k, v in dict_.iteritems() if v != set([None])
}
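# Illustrative call with made-up UUID sets: compact() normalizes the
# product key and replaces each set of UUIDs by its cardinality, dropping
# entries whose only member is None (order of the result may differ).
print compact({'12.3': set(['u1', 'u2']),
               'openSUSE-12.2': set(['u3']),
               'Factory': set([None])})
# -> {'openSUSE-12.3': 2, 'openSUSE-12.2': 1}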
day_table, week_table, month_table = {}, {}, {}
acc_week, cur_week, prev_week = None, None, None
acc_month, cur_month, prev_month = None, None, None
for dname in sorted(glob.glob(os.path.join(PATH, '*_uuid_day.pkl'))):
year, month, day = (int(x) for x in DATE.match(dname).groups())
date_ = datetime.date(year, month, day)
acc_day, cur_day = cPickle.load(open(dname, 'rb')), day_name(date_)
acc_week, prev_week, cur_week = acc_week if cur_week == week_name(date_) else {}, cur_week, week_name(date_)
acc_month, prev_month, cur_month = acc_month if cur_month == month_name(date_) else {}, cur_month, month_name(date_)
day_table[cur_day], week_table[cur_week], month_table[cur_month] = acc_day, acc_week, acc_month
accumulate(acc_week, acc_day)
accumulate(acc_month, acc_day)
day_table[cur_day] = compact(day_table[cur_day])
if prev_week and prev_week != cur_week:
week_table[prev_week] = compact(week_table[prev_week])
if prev_month and prev_month != cur_month:
month_table[prev_month] = compact(month_table[prev_month])
week_table[cur_week] = compact(week_table[cur_week])
month_table[cur_month] = compact(month_table[cur_month])
In [67]:
#
# Detect the names of the products (UUIDs)
#
# acc_full is a waste of memory, use month_table instead to get
# an approximation
acc_full = {}
for d in month_table:
accumulate(acc_full, month_table[d])
product_names = acc_full.keys()
main_products = sorted(p for p in product_names if acc_full[p] > 10)
others = set(product_names) - set(main_products)
product_groups = { p: '-'.join(p.split('-')[:2]) for p in main_products
if ('.' in p.split('-')[1] or 'factory' == p.split('-')[1]) and '10' not in p.split('-')[1] }
product_groups['openSUSE-11.5'] = 'openSUSE-12.1'  # fold the 11.5 (pre-12.1 development) entries into 12.1
main_products = sorted(set(product_groups.values()))
day_table_ = collapse(day_table, product_groups)
week_table_ = collapse(week_table, product_groups)
month_table_ = collapse(month_table, product_groups)
main_products.insert(0, 'other')
print 'Main products:', main_products
In [68]:
#
# Draw UUID dataset
#
cmap = mpl.cm.jet
norm = mpl.colors.Normalize(vmin=0, vmax=len(main_products))
smap = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
for name, dataset in (#('Daily', day_table_),):
#('Weekly', week_table_),):
('Monthly', month_table_),):
dates = sorted(dataset)[:-1]
x = np.arange(len(dates))
y = np.row_stack([row(dataset, dates, p) for p in main_products])
colors = [smap.to_rgba(i) for i in range(len(main_products))]
plt.stackplot(x, y, colors=colors, edgecolor='none', alpha=0.7)
legend_rec = [Rectangle((0, 0), 1, 1, fc=c) for c in colors]
plt.legend(legend_rec, main_products)
# Linear regression
y_stack = np.cumsum(y, axis=0)
p = np.poly1d(np.polyfit(x, y_stack[-1, :], 1))
slope = p[1]
print p[1], p[0]  # slope, intercept
plt.plot(x, p(x), '--k')
plt.xlabel('Date', fontsize='x-large')
plt.ylabel('UUIDs', fontsize='x-large')
plt.title('Number of UUIDs (%s)'%name, fontsize='xx-large')
if name == 'Daily':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s/%s/%s'%(dates[int(i)][6:], dates[int(i)][4:6], dates[int(i)][:4]) for i in xt[:-1]], rotation=30, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dK'%(i/1000) for i in yt], fontsize='large')
plt.annotate('Slope = %0.2f'%slope,
xy=(550, 99000),
xytext=(5, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.xlim(0, len(dates)-1)
plt.ylim(0, max(y_stack[-1, :])+80000)
#plt.axis('tight')
elif name == 'Weekly':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s (Week %s)'%(dates[int(i)][:4], dates[int(i)][4:]) for i in xt[:-1]], rotation=30, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dK'%(i/1000) for i in yt], fontsize='large')
plt.annotate('Slope = %0.2f'%slope,
xy=(dates.index(week_name(datetime.date(year=2011, month=7, day=1))), 220000),
xytext=(-5, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 11.3',
xy=(dates.index(week_name(datetime.date(year=2010, month=7, day=13))), 216000),
xytext=(5, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 11.4',
xy=(dates.index(week_name(datetime.date(year=2011, month=3, day=10))), 250000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.1',
xy=(dates.index(week_name(datetime.date(year=2011, month=11, day=16))), 260000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.2',
xy=(dates.index(week_name(datetime.date(year=2012, month=9, day=5))), 230000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.3',
xy=(dates.index(week_name(datetime.date(year=2013, month=3, day=13))), 260000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.xlim(0, len(dates)-1)
plt.ylim(0, max(y_stack[-1, :])+200000)
elif name == 'Monthly':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s %s'%(MONTH[dates[int(i)][4:]], dates[int(i)][:4]) for i in xt[:-1]], rotation=30, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dK'%(i/1000) for i in yt], fontsize='large')
plt.annotate('Slope = %0.2f'%slope,
xy=(18, 420000),
xytext=(-25, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 11.3', xy=(6, 430000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 11.4', xy=(14, 510000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.1', xy=(22, 480000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.2', xy=(32, 430000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
plt.annotate('openSUSE 12.3', xy=(38, 460000),
xytext=(-60, 25), textcoords='offset points',
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'), fontsize='x-large')
#plt.annotate('osC10', xy=(9, 420000),
# xytext=(-5, 30), textcoords='offset points',
# arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'))
#plt.annotate('osC11', xy=(20, 330000),
# xytext=(-25, 30), textcoords='offset points',
# arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'))
#plt.annotate('osC12', xy=(33, 490000),
# xytext=(-5, 30), textcoords='offset points',
# arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=.2'))
plt.xlim(0, len(dates)-1)
plt.ylim(0, max(y_stack[-1, :])+300000)
plt.show()
In [26]:
#
# Process medium
#
day_table, week_table, month_table = {}, {}, {}
acc_week, acc_month = None, None
cur_week, cur_month = None, None
acc_full = {}
for dname in sorted(glob.glob(os.path.join(PATH, '*_medium_day.pkl'))):
year, month, day = (int(x) for x in DATE.match(dname).groups())
date_ = datetime.date(year, month, day)
acc_day, cur_day = cPickle.load(open(dname, 'rb')), day_name(date_)
acc_week, cur_week = acc_week if cur_week == week_name(date_) else {}, week_name(date_)
acc_month, cur_month = acc_month if cur_month == month_name(date_) else {}, month_name(date_)
day_table[cur_day], week_table[cur_week], month_table[cur_month] = acc_day, acc_week, acc_month
accumulate(acc_week, acc_day)
accumulate(acc_month, acc_day)
accumulate(acc_full, acc_day)
In [27]:
#
# Detect the names of the media
#
product_names = acc_full.keys()
main_products = sorted(p for p in product_names if acc_full[p] > 100)
others = set(product_names) - set(main_products)
# Create the map manually
product_groups = {
'DVD': 'dvd',
'EC2': 'EC2',
'MINI': 'mini',
'mini': 'mini',
'biarch': 'dvd-biarch',
'cd': 'cd',
'dvd': 'dvd',
'dvd, dvd': 'dvd',
'dvd-biarch': 'dvd-biarch',
'dvd-promo': 'dvd-promo',
'promodvd': 'dvd-promo',
'ftp': 'ftp',
'livecd-gnome': 'livecd-gnome',
'livetree-gnome': 'livecd-gnome',
'usb-gnome': 'livecd-gnome',
'livecd-kde': 'livecd-kde',
'livecd-kde3': 'livecd-kde',
'livetree-kde': 'livecd-kde',
'usb-kde': 'livecd-kde',
}
main_products = sorted(set(product_groups.values()))
medium_day_table = collapse(day_table, product_groups)
medium_week_table = collapse(week_table, product_groups)
medium_month_table = collapse(month_table, product_groups)
main_products.insert(0, 'other')
medium_main_products = main_products
print 'Main products:', main_products
In [28]:
#
# Process arch
#
day_table, week_table, month_table = {}, {}, {}
acc_week, acc_month = None, None
cur_week, cur_month = None, None
acc_full = {}
for dname in sorted(glob.glob(os.path.join(PATH, '*_arch_day.pkl'))):
year, month, day = (int(x) for x in DATE.match(dname).groups())
date_ = datetime.date(year, month, day)
acc_day, cur_day = cPickle.load(open(dname, 'rb')), day_name(date_)
acc_week, cur_week = acc_week if cur_week == week_name(date_) else {}, week_name(date_)
acc_month, cur_month = acc_month if cur_month == month_name(date_) else {}, month_name(date_)
day_table[cur_day], week_table[cur_week], month_table[cur_month] = acc_day, acc_week, acc_month
accumulate(acc_week, acc_day)
accumulate(acc_month, acc_day)
accumulate(acc_full, acc_day)
In [29]:
#
# Detect different archs
#
product_names = acc_full.keys()
main_products = sorted(p for p in product_names if acc_full[p] > 100)
others = set(product_names) - set(main_products)
product_groups = {
# 'armv5tel': 'arm',
# 'armv7hl': 'arm',
'i586': 'i586',
'i686': 'i586',
# 'ia64': 'ia64',
# 'noarch': 'noarch',
# 'ppc': 'ppc',
# 'ppc64': 'ppc',
'x86_64': 'x86_64'
}
for arch in ('i586', 'x86_64'):
product_groups.update({ p: arch for p in main_products if arch in p })
main_products = sorted(set(product_groups.values()))
arch_day_table = collapse(day_table, product_groups, other=None)
arch_week_table = collapse(week_table, product_groups, other=None)
arch_month_table = collapse(month_table, product_groups, other=None)
arch_main_products = main_products
print 'Main products:', main_products
In [34]:
#
# Draw subplots Medium and Arch
#
cmap = mpl.cm.jet
norm = mpl.colors.Normalize(vmin=0, vmax=len(medium_main_products))
smap_medium = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
norm = mpl.colors.Normalize(vmin=0, vmax=len(arch_main_products))
smap_arch = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
for name, dataset_medium, dataset_arch in (#('Daily', medium_day_table, arch_day_table),):
('Weekly', medium_week_table, arch_week_table),):
#('Monthly', medium_month_table, arch_month_table),):
dates = sorted(dataset_medium)[:-1]
x = np.arange(len(dates))
y_medium = np.row_stack([row(dataset_medium, dates, p) for p in medium_main_products])
y_arch = np.row_stack([row(dataset_arch, dates, p) for p in arch_main_products])
colors_medium = [smap_medium.to_rgba(i) for i in range(len(medium_main_products))]
colors_arch = [smap_arch.to_rgba(i) for i in range(len(arch_main_products))]
plt.subplot(2, 1, 1)
plt.stackplot(x, y_medium, colors=colors_medium, edgecolor='none', alpha=0.7)
legend_rec = [Rectangle((0, 0), 1, 1, fc=c) for c in colors_medium]
plt.legend(legend_rec, medium_main_products, loc='upper left')
if name == 'Daily':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s/%s/%s'%(dates[int(i)][6:], dates[int(i)][4:6], dates[int(i)][:4]) for i in xt[:-1]], rotation=15, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dM'%(i/1000000) for i in yt], fontsize='large')
elif name == 'Weekly':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s (Week %s)'%(dates[int(i)][:4], dates[int(i)][4:]) for i in xt[:-1]], rotation=15, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dM'%(i/1000000) for i in yt], fontsize='large')
elif name == 'Monthly':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s %s'%(MONTH[dates[int(i)][4:]], dates[int(i)][:4]) for i in xt[:-1]], rotation=15, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dM'%(i/1000000) for i in yt], fontsize='x-large')
plt.xlabel('Date', fontsize='x-large')
plt.ylabel('Hits', fontsize='x-large')
plt.title('Hits by medium (%s)'%name, fontsize='xx-large')
plt.axis('tight')
plt.subplot(2, 1, 2)
plt.stackplot(x, y_arch, colors=colors_arch, edgecolor='none', alpha=0.7)
legend_rec = [Rectangle((0, 0), 1, 1, fc=c) for c in colors_arch]
plt.legend(legend_rec, arch_main_products)
if name == 'Daily':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s/%s/%s'%(dates[int(i)][6:], dates[int(i)][4:6], dates[int(i)][:4]) for i in xt[:-1]], rotation=15, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dM'%(i/1000000) for i in yt], fontsize='large')
elif name == 'Weekly':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s (Week %s)'%(dates[int(i)][:4], dates[int(i)][4:]) for i in xt[:-1]], rotation=15, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dM'%(i/1000000) for i in yt], fontsize='large')
elif name == 'Monthly':
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['%s %s'%(MONTH[dates[int(i)][4:]], dates[int(i)][:4]) for i in xt[:-1]], rotation=15, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, ['%dM'%(i/1000000) for i in yt], fontsize='large')
plt.xlabel('Date', fontsize='x-large')
plt.ylabel('Hits', fontsize='x-large')
plt.title('Hits by architecture (%s)'%name, fontsize='xx-large')
plt.axis('tight')
plt.show()
In [ ]:
#
# Process IPs - WIP
#
day_table, week_table, month_table = {}, {}, {}
acc_week, cur_week, prev_week = None, None, None
acc_month, cur_month, prev_month = None, None, None
for dname in sorted(glob.glob(os.path.join(PATH, '*_ip_day.pkl'))):
year, month, day = (int(x) for x in DATE.match(dname).groups())
date_ = datetime.date(year, month, day)
acc_day, cur_day = cPickle.load(open(dname, 'rb')), day_name(date_)
acc_week, prev_week, cur_week = acc_week if cur_week == week_name(date_) else {}, cur_week, week_name(date_)
acc_month, prev_month, cur_month = acc_month if cur_month == month_name(date_) else {}, cur_month, month_name(date_)
day_table[cur_day], week_table[cur_week], month_table[cur_month] = acc_day, acc_week, acc_month
accumulate(acc_week, acc_day)
accumulate(acc_month, acc_day)
day_table[cur_day] = compact(day_table[cur_day])
if prev_week and prev_week != cur_week:
week_table[prev_week] = compact(week_table[prev_week])
if prev_month and prev_month != cur_month:
month_table[prev_month] = compact(month_table[prev_month])
week_table[cur_week] = compact(week_table[cur_week])
month_table[cur_month] = compact(month_table[cur_month])
In [10]:
#
# Detect the names of the products (IPs) - WIP
#
# acc_full is a waste of memory, use month_table instead to get
# an approximation
acc_full = {}
for d in month_table:
accumulate(acc_full, month_table[d])
product_names = acc_full.keys()
main_products = sorted(p for p in product_names if acc_full[p] > 100)
others = set(product_names) - set(main_products)
product_groups = { p: '-'.join(p.split('-')[:2]) for p in main_products
if ('.' in p.split('-')[1] or 'factory' == p.split('-')[1]) and '10' not in p.split('-')[1] }
product_groups['openSUSE-11.5'] = 'openSUSE-12.1'  # fold the 11.5 (pre-12.1 development) entries into 12.1
main_products = sorted(set(product_groups.values()))
# Save the grouping for later
product_groups_for_ips = product_groups
day_table_ = collapse(day_table, product_groups)
# week_table_ = collapse(week_table, product_groups)
# month_table_ = collapse(month_table, product_groups)
main_products.insert(0, 'other')
print 'Main products:', main_products
In [87]:
#
# Read Fedora data for IPs
#
import csv
FEDORA = '/suse/aplanas/Documents/01-Downloads_Tracker/dvd-1.0/fedora_ips.csv'
yum_table, last_version, table = {}, None, None
for line in csv.reader(open(FEDORA, 'r')):
if not line[0]:
continue
if line[0].lower().startswith('fedora'):
last_version = line[0]
table = []
yum_table[last_version] = table
continue
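# Remaining rows are assumed to be: label, 'YYYY-MM-DD -- YYYY-MM-DD'
# interval, new IPs in the interval, accumulated IPs; the counts use '.'
# as thousands separator, hence the replace() below.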
table.append((line[0],
datetime.datetime.strptime(line[1].split('--')[0].strip(), '%Y-%m-%d'),
datetime.datetime.strptime(line[1].split('--')[1].strip(), '%Y-%m-%d'),
int(line[2].replace('.', '')),
int(line[3].replace('.', ''))))
# Fix Fedora 15: its first two rows apparently cover 4- and 2-week
# intervals, so split them into evenly distributed per-week rows.
f15 = yum_table['Fedora 15']
line = f15[0]
del f15[0]
nips = line[3] / 4
nips_acc = line[4]
lines = [['Week %d'%(i+1), line[1]+datetime.timedelta(days=7*i), line[1]+datetime.timedelta(days=7*(i+1)), nips, nips*(i+1)] for i in range(4)]
line = f15[0]
del f15[0]
nips = line[3] / 2
lines2 = [['Week %d'%(i+5), line[1]+datetime.timedelta(days=7*i), line[1]+datetime.timedelta(days=7*(i+1)), nips, nips_acc+nips*(i+1)] for i in range(2)]
import itertools
yum_table['Fedora 15'] = list(itertools.chain(lines, lines2, f15))
In [88]:
#
# Count the new IPs per week/interval for openSUSE, following the same approach used by Fedora:
# http://fedoraproject.org/wiki/Statistics
# http://fedoraproject.org/wiki/Statistics/Commands
#
import operator
import socket
def opensuse_day(day, products):
def fix(ver):
return 'openSUSE-%s'%ver if not ver.lower().startswith('opensuse') else ver
try:
d = cPickle.load(open(os.path.join(PATH, '%04d%02d%02d_ip_day.pkl'%(day.year, day.month, day.day)), 'rb'))
except Exception:  # missing or unreadable daily pickle
print 'Error reading', day
d = {}
return { fix(k): v for k, v in d.iteritems() if fix(k) in products }
timeline = (
(datetime.date(year=2010, month=7, day=15), 'openSUSE-11.3'),
(datetime.date(year=2011, month=3, day=10), 'openSUSE-11.4'),
(datetime.date(year=2011, month=11, day=16), 'openSUSE-12.1'),
(datetime.date(year=2012, month=9, day=5), 'openSUSE-12.2'),
(datetime.date(year=2013, month=3, day=13), 'openSUSE-12.3'),
)
from_, to = timeline[0][0], datetime.date.today()
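# Walk every day from the 11.3 release until today; for each day load that
# day's *_ip_day.pkl and keep only the openSUSE version that was current
# on that date according to the timeline above.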
data = (opensuse_day(day, set(([p for d, p in timeline if d <= day][-1],))) for day in (from_+datetime.timedelta(days=d) for d in range((to-from_).days+1)))
acc_ips = defaultdict(set)
count_ips = defaultdict(list)
for d in data:
for product, ips in d.iteritems():
prev = len(acc_ips[product])
# Pack IPv4 strings into 4 bytes to save memory; skip the one IPv6 address
# that shows up in the logs, which inet_aton cannot parse.
acc_ips[product].update(socket.inet_aton(ip) for ip in ips if ip != '2001:720:c1c:1200:3ed9:2bff:fe61:117a')
count_ips[product].append(len(acc_ips[product]) - prev)
# Free memory
acc_ips = None
In [89]:
# Group count_ips by weeks
zypper_table = {}
for product in count_ips:
label_data = ('Week-%s'%(i+1) for i in range(len(count_ips[product])))
week_data = map(sum, zip(*([iter(count_ips[product])]*7)))
acc_week_data = np.cumsum(week_data)
from_ = [t[0] for t in timeline if t[1]==product][0]
dates = [from_+datetime.timedelta(days=i) for i in range(len(count_ips[product]))]
from_date = (d[0] for d in zip(*([iter(dates)]*7)))
to_date = (d[-1] for d in zip(*([iter(dates)]*7)))
zypper_table[product] = zip(label_data, from_date, to_date, week_data, acc_week_data)
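# The zip(*([iter(seq)]*7)) idiom above chunks a flat daily sequence into
# consecutive groups of 7, dropping a trailing partial week. A minimal
# sketch with dummy numbers:
_days = range(1, 17)                     # 16 daily counts
print map(sum, zip(*([iter(_days)]*7)))  # -> [28, 77]; days 15-16 dropped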
In [90]:
#
# Draw timeframes with rectangles
def tox(d):
from_ = datetime.date(year=2010, month=1, day=1)
return (d-from_).days
from_ = datetime.date(year=2010, month=1, day=1)
to = datetime.date(year=2013, month=7, day=2)
days = (to - from_).days
plt.xlim(0, days)
plt.ylim(0, 3)
plt.xlabel('Date', fontsize='x-large')
plt.title('Releases openSUSE / Fedora', fontsize='xx-large')
xt,_ = plt.xticks()
plt.xticks(xt[:-1], [from_+datetime.timedelta(days=i) for i in xt[:-1]], rotation=30, fontsize='large')
plt.yticks([1, 2], ['Fedora', 'openSUSE'], fontsize='large')
cmap = mpl.cm.jet
norm = mpl.colors.Normalize(vmin=0, vmax=len(zypper_table))
smap = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
#font = "sans-serif"
for i, product in enumerate(zypper_table):
plt.axhspan(ymin=2-0.3, ymax=2+0.3,
xmin=tox(zypper_table[product][0][1])/float(days), xmax=tox(zypper_table[product][-1][2])/float(days),
facecolor=smap.to_rgba(i), alpha=0.5)
plt.text(tox(zypper_table[product][0][1]) + (tox(zypper_table[product][-1][2]) - tox(zypper_table[product][0][1]))/2, 2, product, ha='center', fontsize='large')#, family=font, size=14)
cmap = mpl.cm.summer
norm = mpl.colors.Normalize(vmin=0, vmax=len(yum_table))
smap = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
for i, product in enumerate(yum_table):
plt.axhspan(ymin=1-0.3, ymax=1+0.3,
xmin=tox(yum_table[product][0][1].date())/float(days), xmax=tox(yum_table[product][-1][2].date())/float(days),
facecolor=smap.to_rgba(i), alpha=0.5)
plt.text(tox(yum_table[product][0][1].date()) + (tox(yum_table[product][-1][2].date()) - tox(yum_table[product][0][1].date()))/2, 1, product, ha='center', fontsize='large')#, family=font, size=14)
plt.show()
In [91]:
#
# Draw IP datasets (openSUSE and Fedora)
#
from mpl_toolkits.mplot3d import Axes3D
cmap = mpl.cm.jet
norm = mpl.colors.Normalize(vmin=0, vmax=len(zypper_table))
smap = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
fig = plt.figure()
ax = fig.add_subplot(121, projection='3d')
for color, z, label in zip((smap.to_rgba(i) for i in range(len(zypper_table))),
range(len(zypper_table)),
sorted(zypper_table)):
xs = np.arange(min(len(zypper_table[label]), 25))
ys = np.array([l[3] for l in zypper_table[label]][:25])
ax.bar(xs, ys, zs=z, zdir='y', color=color, alpha=0.8)
ax.set_xlabel('Date', fontsize='large')
ax.set_ylabel('Distribution', fontsize='large')
ax.set_zlabel('IPs', fontsize='large')
ax.set_title('New IPs in openSUSE (Zypper) Repository')
ax.set_zlim(0, 250000)
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['Week %s'%int(i) for i in xt[:-1]])
plt.yticks(np.arange(len(zypper_table)), sorted(zypper_table))
zt = ax.get_zticks()
ax.set_zticklabels(['%dK'%(i/1000) for i in zt])
cmap = mpl.cm.summer
norm = mpl.colors.Normalize(vmin=0, vmax=len(yum_table))
smap = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
ax = fig.add_subplot(122, projection='3d')
for color, z, label in zip((smap.to_rgba(i) for i in range(len(yum_table))),
range(len(yum_table)),
sorted(yum_table)):
xs = np.arange(min(len(yum_table[label]), 25))
ys = np.array([l[3] for l in yum_table[label]][:25])
ax.bar(xs, ys, zs=z, zdir='y', color=color, alpha=0.8)
ax.set_xlabel('Date', fontsize='large')
ax.set_ylabel('Distribution', fontsize='large')
ax.set_zlabel('IPs', fontsize='large')
ax.set_title('New IPs in Fedora (Yum) Repository')
ax.set_zlim(0, 250000)
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['Week %s'%int(i) for i in xt[:-1]])
plt.yticks(np.arange(len(yum_table)), sorted(yum_table))
zt = ax.get_zticks()
ax.set_zticklabels(['%dK'%(i/1000) for i in zt])
plt.show()
In [92]:
#
# Downloads grouped by IP, using Fedora metric
#
FEDORA_DOWNLOADS = '/suse/aplanas/Documents/01-Downloads_Tracker/dvd-1.0/fedora_downloads.csv'
fedora_download_table, last_version, table = {}, None, None
for line in csv.reader(open(FEDORA_DOWNLOADS, 'r')):
if not line[0]:
continue
if line[0].lower().startswith('fedora'):
last_version = line[0]
table = []
fedora_download_table[last_version] = table
continue
table.append((line[0],
datetime.datetime.strptime(line[1].split('--')[0].strip(), '%Y-%m-%d'),
datetime.datetime.strptime(line[1].split('--')[1].strip(), '%Y-%m-%d'),
int(line[2].replace('.', '')),
int(line[3].replace('.', ''))))
In [93]:
#
# Read the downloads grouped by IP from openSUSE
#
def curate_iso_set(downloads):
"""Extract the name of the ISO"""
cured = defaultdict(set)
for key, value in downloads.iteritems():
newkey = ISO.findall(key)
if newkey:
newkey = '-'.join(newkey[0].split('-')[:2])
cured[newkey].update(value)
return cured
def opensuse_downloads_day(day, products):
try:
d = curate_iso_set(cPickle.load(open(os.path.join(PATH, '%04d%02d%02d_download_ip_day.pkl'%(day.year, day.month, day.day)), 'rb')))
except Exception:  # missing or unreadable daily pickle
print 'Error reading', day
d = {}
return { k: v for k, v in d.iteritems() if k in products }
timeline = (
(datetime.date(year=2010, month=7, day=15), 'openSUSE-11.3'),
(datetime.date(year=2011, month=3, day=10), 'openSUSE-11.4'),
(datetime.date(year=2011, month=11, day=16), 'openSUSE-12.1'),
(datetime.date(year=2012, month=9, day=5), 'openSUSE-12.2'),
(datetime.date(year=2013, month=3, day=13), 'openSUSE-12.3'),
)
from_, to = timeline[0][0], datetime.date.today()
data = (opensuse_downloads_day(day, set(([p for d, p in timeline if d <= day][-1],))) for day in (from_+datetime.timedelta(days=d) for d in range((to-from_).days+1)))
count_dips = defaultdict(list)
for d in data:
for product, ips in d.iteritems():
count_dips[product].append(len(ips))
In [94]:
# Group count_dips by weeks
opensuse_download_table = {}
for product in count_dips:
label_data = ('Week-%s'%(i+1) for i in range(len(count_dips[product])))
week_data = map(sum, zip(*([iter(count_dips[product])]*7)))
acc_week_data = np.cumsum(week_data)
from_ = [t[0] for t in timeline if t[1]==product][0]
dates = [from_+datetime.timedelta(days=i) for i in range(len(count_dips[product]))]
from_date = (d[0] for d in zip(*([iter(dates)]*7)))
to_date = (d[-1] for d in zip(*([iter(dates)]*7)))
opensuse_download_table[product] = zip(label_data, from_date, to_date, week_data, acc_week_data)
In [95]:
#
# Draw download IP datasets (openSUSE and Fedora)
#
from mpl_toolkits.mplot3d import Axes3D
cmap = mpl.cm.jet
norm = mpl.colors.Normalize(vmin=0, vmax=len(opensuse_download_table))
smap = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
fig = plt.figure()
ax = fig.add_subplot(121, projection='3d')
for color, z, label in zip((smap.to_rgba(i) for i in range(len(opensuse_download_table))),
range(len(opensuse_download_table)),
sorted(opensuse_download_table)):
xs = np.arange(min(len(opensuse_download_table[label]), 25))
ys = np.array([l[3] for l in opensuse_download_table[label]][:25])
ax.bar(xs, ys, zs=z, zdir='y', color=color, alpha=0.8)
ax.set_xlabel('Date', fontsize='large')
ax.set_ylabel('Distribution', fontsize='large')
ax.set_zlabel('Downloads', fontsize='large')
ax.set_title('openSUSE downloads')
ax.set_zlim(0, 250000)
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['Week %s'%int(i) for i in xt[:-1]])
plt.yticks(np.arange(len(opensuse_download_table)), sorted(opensuse_download_table))
zt = ax.get_zticks()
ax.set_zticklabels(['%dK'%(i/1000) for i in zt])
cmap = mpl.cm.summer
norm = mpl.colors.Normalize(vmin=0, vmax=len(yum_table))
smap = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
ax = fig.add_subplot(122, projection='3d')
for color, z, label in zip((smap.to_rgba(i) for i in range(len(fedora_download_table))),
range(len(fedora_download_table)),
sorted(fedora_download_table)):
xs = np.arange(min(len(fedora_download_table[label]), 25))
ys = np.array([l[3] for l in fedora_download_table[label]][:25])
ax.bar(xs, ys, zs=z, zdir='y', color=color, alpha=0.8)
ax.set_xlabel('Date', fontsize='large')
ax.set_ylabel('Distribution', fontsize='large')
ax.set_zlabel('Downloads', fontsize='large')
ax.set_title('Fedora downloads')
ax.set_zlim(0, 250000)
xt,_ = plt.xticks()
plt.xticks(xt[:-1], ['Week %s'%int(i) for i in xt[:-1]])
plt.yticks(np.arange(len(fedora_download_table)), sorted(fedora_download_table))
zt = ax.get_zticks()
ax.set_zticklabels(['%dK'%(i/1000) for i in zt])
plt.show()
In [86]:
import itertools
factory = [int(i) for i in open('factory', 'r')]
tumbleweed = [int(i) for i in open('tumbleweed', 'r')]
plt.plot(np.arange(len(factory)), factory, 'r', np.arange(len(tumbleweed)), tumbleweed, 'b', lw=2)
plt.title('Factory - Tumbleweed', fontsize='xx-large')
plt.xlabel('Date', fontsize='x-large')
plt.ylabel('UUIDs', fontsize='x-large')
plt.legend(['Factory', 'Tumbleweed'])
dates = ['%s %s'%(m, y) for y in [2010, 2011, 2012, 2013] for m in ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')]
xt,_ = plt.xticks()
plt.xticks(xt, [dates[int(i)] for i in xt], rotation=30, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, [int(i) for i in yt], fontsize='large')
plt.grid()
plt.axis('tight')
plt.show()
In [106]:
import csv
with open('SocialMediaMetrics.csv', 'rb') as csvfile:
reader = csv.reader(csvfile)
dates = reader.next()
data_suse = {}
data_fedora = {}
labels_suse = []
labels_fedora = []
for row in reader:
if 'Fedora' in row[0]:
labels_fedora.append(row[0])
data_fedora[row[0]] = [int(i) if i else 0 for i in row[1:]]
else:
labels_suse.append(row[0])
data_suse[row[0]] = [int(i) if i else 0 for i in row[1:]]
plt.subplot(211)
cmap = mpl.cm.jet
norm = mpl.colors.Normalize(vmin=0, vmax=len(labels_suse))
smap = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
colors = {l: smap.to_rgba(i) for i,l in enumerate(labels_suse)}
bottom = [0] * len(dates)
for l in labels_suse:
plt.bar(np.arange(len(dates)), data_suse[l], bottom=bottom, label=l, color=colors[l])
bottom = [bottom[i] + data_suse[l][i] for i in range(len(dates))]
plt.title('openSUSE Social Media Direct Reach', fontsize='xx-large')
plt.xlabel('Date', fontsize='x-large')
plt.ylabel('Followers', fontsize='x-large')
plt.legend(loc='upper left')
xt,_ = plt.xticks()
plt.xticks(xt[:-1], [dates[int(i)] for i in xt[:-1]], rotation=30, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, [int(i) for i in yt], fontsize='large')
plt.grid()
plt.subplot(212)
cmap = mpl.cm.winter
norm = mpl.colors.Normalize(vmin=0, vmax=len(labels_fedora))
smap = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
colors = {l: smap.to_rgba(i) for i,l in enumerate(labels_fedora)}
bottom = [0] * len(dates)
for l in labels_fedora:
plt.bar(np.arange(len(dates)), data_fedora[l], bottom=bottom, label=l, color=colors[l])
bottom = [bottom[i] + data_fedora[l][i] for i in range(len(dates))]
plt.title('Fedora Social Media Direct Reach', fontsize='xx-large')
plt.xlabel('Date', fontsize='x-large')
plt.ylabel('Followers', fontsize='x-large')
plt.legend(loc='upper left')
xt,_ = plt.xticks()
plt.xticks(xt[:-1], [dates[int(i)] for i in xt[:-1]], rotation=30, fontsize='large')
yt,_ = plt.yticks()
plt.yticks(yt, [int(i) for i in yt], fontsize='large')
plt.grid()
plt.show()
In [ ]: