In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import sys
from natsort import natsorted, ns
%matplotlib inline

In [2]:
results = dict()

with open('/home/felipe/queirozfcom/files/stackoverflow-data-explorer-count-by-tag-2016-2017-taken-on-2017-10-26.txt','r') as f:
    for line in f:
                
        tag,count,yearmonth =line.split()
        
        tag = tag.strip()
        count = count.strip()
        yearmonth = yearmonth.strip()

        # pad numbers
        yearmonth = re.sub(r'^(\d{4})-(\d{1})$',r'\1-0\2',yearmonth)
        
        if yearmonth=='2017-10':
            continue
            # month has not ended 
        
        if results.get(tag,None) is None:
            results[tag] = [(yearmonth,int(count) )]
        else:
            results[tag] = results[tag] + [(yearmonth, int(count) )]

In [3]:
results['pymc']


Out[3]:
[('2016-01', 3),
 ('2016-10', 7),
 ('2016-11', 8),
 ('2016-12', 6),
 ('2016-02', 4),
 ('2016-03', 5),
 ('2016-04', 2),
 ('2016-05', 8),
 ('2016-06', 2),
 ('2016-07', 2),
 ('2016-08', 4),
 ('2016-09', 6),
 ('2017-01', 6),
 ('2017-02', 6),
 ('2017-03', 2),
 ('2017-04', 9),
 ('2017-05', 8),
 ('2017-06', 4),
 ('2017-07', 7),
 ('2017-08', 5),
 ('2017-09', 9)]

In [4]:
periods = ['2016-01','2016-02','2016-03','2016-04','2016-05','2016-06',
           '2016-07','2016-08','2016-09','2016-10','2016-11','2016-12',
           '2017-01','2017-02','2017-03','2017-04','2017-05','2017-06',
           '2017-07','2017-08','2017-09']

In [5]:
list(results.keys())


Out[5]:
['numpy',
 'pymc',
 'scrapy',
 'statsmodels',
 'beautifulsoup',
 'mxnet',
 'tensorflow',
 'seaborn',
 'theano',
 'opencv',
 'spacy',
 'networkx',
 'matplotlib',
 'pymc3',
 'nltk',
 'scikit-learn',
 'bokeh',
 'pytorch',
 'keras',
 'scipy',
 'pandas',
 'cntk',
 'gensim']

In [6]:
for key in list(results.keys()):
    existing_dates = [pair[0] for pair in results[key]]
    
    missing = set(periods) - set(existing_dates)
    
    for period in missing:
        results[key] = results[key] + [(period,0)]

In [7]:
with open('/home/felipe/queirozfcom/files/crossvalidated-data-explorer-count-by-tag-2016-2017-taken-on-2017-10-26.txt','r') as f:
    for line in f:
                
        tag,count,yearmonth =line.split()

        tag = tag.strip()
        count = count.strip()
        yearmonth = yearmonth.strip()
        
        # pad numbers
        yearmonth = re.sub(r'^(\d{4})-(\d{1})$',r'\1-0\2',yearmonth)
        
        if yearmonth=='2017-10':
            continue
            # month has not ended 
        
        current_counts_for_this_tag = results[tag]
        
        current_pair = [(date,cnt) for date,cnt in current_counts_for_this_tag if date == yearmonth][0]
                          
        _,current_count = current_pair
                    
        new_count = current_count + int(count)
        
        current_counts_for_this_tag.remove(current_pair)
        
                
        new_pair = (yearmonth,new_count)
        
        current_counts_for_this_tag.append(new_pair)
                                    
        results[tag] = current_counts_for_this_tag       
        
        break

In [8]:
# merge pymc and pymc3
pymc = results['pymc3']
pymc


Out[8]:
[('2016-01', 6),
 ('2016-10', 5),
 ('2016-11', 13),
 ('2016-12', 9),
 ('2016-02', 6),
 ('2016-03', 7),
 ('2016-04', 1),
 ('2016-05', 8),
 ('2016-06', 5),
 ('2016-07', 9),
 ('2016-08', 7),
 ('2016-09', 11),
 ('2017-01', 12),
 ('2017-02', 17),
 ('2017-03', 14),
 ('2017-04', 19),
 ('2017-05', 20),
 ('2017-06', 10),
 ('2017-07', 8),
 ('2017-08', 20),
 ('2017-09', 11)]

In [9]:
new_pairs = []

for (date,count) in results['pymc3']:
    
    count_pymc = [pair[1] for pair in pymc if pair[0]==date]
    
    if count_pymc:
        new_count = count + count_pymc[0]
    else:
        new_count = count
          
    new_pairs.append((date,new_count))
    
new_pairs


Out[9]:
[('2016-01', 12),
 ('2016-10', 10),
 ('2016-11', 26),
 ('2016-12', 18),
 ('2016-02', 12),
 ('2016-03', 14),
 ('2016-04', 2),
 ('2016-05', 16),
 ('2016-06', 10),
 ('2016-07', 18),
 ('2016-08', 14),
 ('2016-09', 22),
 ('2017-01', 24),
 ('2017-02', 34),
 ('2017-03', 28),
 ('2017-04', 38),
 ('2017-05', 40),
 ('2017-06', 20),
 ('2017-07', 16),
 ('2017-08', 40),
 ('2017-09', 22)]

In [10]:
results.pop('pymc3',None)
results.pop('pymc',None)

results['pymc/pymc3'] = new_pairs

In [11]:
for key in list(results.keys()):
    existing_values = [pair[0] for pair in results[key]]
    
    missing = set(periods) - set(existing_values)
    
    for period in missing:
        results[key] = results[key] + [(period,0)]

In [12]:
for key in list(results.keys()):
    
    results[key] = natsorted(results[key],key=lambda pair: pair[0])

In [13]:
results


Out[13]:
{'beautifulsoup': [('2016-01', 150),
  ('2016-02', 183),
  ('2016-03', 178),
  ('2016-04', 165),
  ('2016-05', 176),
  ('2016-06', 184),
  ('2016-07', 185),
  ('2016-08', 171),
  ('2016-09', 151),
  ('2016-10', 195),
  ('2016-11', 189),
  ('2016-12', 233),
  ('2017-01', 250),
  ('2017-02', 243),
  ('2017-03', 261),
  ('2017-04', 232),
  ('2017-05', 249),
  ('2017-06', 269),
  ('2017-07', 285),
  ('2017-08', 262),
  ('2017-09', 254)],
 'bokeh': [('2016-01', 32),
  ('2016-02', 22),
  ('2016-03', 35),
  ('2016-04', 35),
  ('2016-05', 47),
  ('2016-06', 39),
  ('2016-07', 39),
  ('2016-08', 53),
  ('2016-09', 51),
  ('2016-10', 34),
  ('2016-11', 47),
  ('2016-12', 51),
  ('2017-01', 83),
  ('2017-02', 82),
  ('2017-03', 71),
  ('2017-04', 69),
  ('2017-05', 79),
  ('2017-06', 73),
  ('2017-07', 78),
  ('2017-08', 57),
  ('2017-09', 76)],
 'cntk': [('2016-01', 0),
  ('2016-02', 3),
  ('2016-03', 0),
  ('2016-04', 2),
  ('2016-05', 5),
  ('2016-06', 1),
  ('2016-07', 1),
  ('2016-08', 0),
  ('2016-09', 2),
  ('2016-10', 1),
  ('2016-11', 9),
  ('2016-12', 11),
  ('2017-01', 74),
  ('2017-02', 32),
  ('2017-03', 40),
  ('2017-04', 28),
  ('2017-05', 27),
  ('2017-06', 21),
  ('2017-07', 17),
  ('2017-08', 20),
  ('2017-09', 17)],
 'gensim': [('2016-01', 10),
  ('2016-02', 12),
  ('2016-03', 28),
  ('2016-04', 19),
  ('2016-05', 11),
  ('2016-06', 17),
  ('2016-07', 13),
  ('2016-08', 10),
  ('2016-09', 13),
  ('2016-10', 21),
  ('2016-11', 30),
  ('2016-12', 23),
  ('2017-01', 20),
  ('2017-02', 26),
  ('2017-03', 37),
  ('2017-04', 35),
  ('2017-05', 37),
  ('2017-06', 36),
  ('2017-07', 42),
  ('2017-08', 39),
  ('2017-09', 39)],
 'keras': [('2016-01', 27),
  ('2016-02', 18),
  ('2016-03', 37),
  ('2016-04', 52),
  ('2016-05', 63),
  ('2016-06', 60),
  ('2016-07', 59),
  ('2016-08', 66),
  ('2016-09', 83),
  ('2016-10', 122),
  ('2016-11', 172),
  ('2016-12', 148),
  ('2017-01', 227),
  ('2017-02', 252),
  ('2017-03', 404),
  ('2017-04', 358),
  ('2017-05', 427),
  ('2017-06', 407),
  ('2017-07', 474),
  ('2017-08', 412),
  ('2017-09', 401)],
 'matplotlib': [('2016-01', 465),
  ('2016-02', 470),
  ('2016-03', 530),
  ('2016-04', 528),
  ('2016-05', 486),
  ('2016-06', 501),
  ('2016-07', 499),
  ('2016-08', 483),
  ('2016-09', 427),
  ('2016-10', 473),
  ('2016-11', 502),
  ('2016-12', 410),
  ('2017-01', 466),
  ('2017-02', 580),
  ('2017-03', 700),
  ('2017-04', 731),
  ('2017-05', 669),
  ('2017-06', 677),
  ('2017-07', 682),
  ('2017-08', 579),
  ('2017-09', 547)],
 'mxnet': [('2016-01', 1),
  ('2016-02', 1),
  ('2016-03', 5),
  ('2016-04', 0),
  ('2016-05', 6),
  ('2016-06', 0),
  ('2016-07', 5),
  ('2016-08', 6),
  ('2016-09', 4),
  ('2016-10', 2),
  ('2016-11', 10),
  ('2016-12', 14),
  ('2017-01', 20),
  ('2017-02', 21),
  ('2017-03', 20),
  ('2017-04', 7),
  ('2017-05', 11),
  ('2017-06', 20),
  ('2017-07', 17),
  ('2017-08', 25),
  ('2017-09', 18)],
 'networkx': [('2016-01', 29),
  ('2016-02', 42),
  ('2016-03', 46),
  ('2016-04', 34),
  ('2016-05', 23),
  ('2016-06', 15),
  ('2016-07', 18),
  ('2016-08', 27),
  ('2016-09', 26),
  ('2016-10', 31),
  ('2016-11', 33),
  ('2016-12', 34),
  ('2017-01', 27),
  ('2017-02', 53),
  ('2017-03', 53),
  ('2017-04', 49),
  ('2017-05', 50),
  ('2017-06', 40),
  ('2017-07', 45),
  ('2017-08', 42),
  ('2017-09', 53)],
 'nltk': [('2016-01', 76),
  ('2016-02', 61),
  ('2016-03', 92),
  ('2016-04', 73),
  ('2016-05', 60),
  ('2016-06', 73),
  ('2016-07', 77),
  ('2016-08', 54),
  ('2016-09', 72),
  ('2016-10', 75),
  ('2016-11', 102),
  ('2016-12', 86),
  ('2017-01', 65),
  ('2017-02', 85),
  ('2017-03', 94),
  ('2017-04', 102),
  ('2017-05', 95),
  ('2017-06', 96),
  ('2017-07', 100),
  ('2017-08', 96),
  ('2017-09', 84)],
 'numpy': [('2016-01', 686),
  ('2016-02', 753),
  ('2016-03', 807),
  ('2016-04', 823),
  ('2016-05', 692),
  ('2016-06', 770),
  ('2016-07', 791),
  ('2016-08', 759),
  ('2016-09', 769),
  ('2016-10', 809),
  ('2016-11', 874),
  ('2016-12', 782),
  ('2017-01', 833),
  ('2017-02', 963),
  ('2017-03', 1146),
  ('2017-04', 1070),
  ('2017-05', 1032),
  ('2017-06', 1015),
  ('2017-07', 1072),
  ('2017-08', 1032),
  ('2017-09', 993)],
 'opencv': [('2016-01', 528),
  ('2016-02', 554),
  ('2016-03', 636),
  ('2016-04', 718),
  ('2016-05', 673),
  ('2016-06', 651),
  ('2016-07', 551),
  ('2016-08', 504),
  ('2016-09', 412),
  ('2016-10', 583),
  ('2016-11', 620),
  ('2016-12', 573),
  ('2017-01', 646),
  ('2017-02', 654),
  ('2017-03', 823),
  ('2017-04', 759),
  ('2017-05', 786),
  ('2017-06', 752),
  ('2017-07', 780),
  ('2017-08', 747),
  ('2017-09', 694)],
 'pandas': [('2016-01', 914),
  ('2016-02', 1041),
  ('2016-03', 1117),
  ('2016-04', 1199),
  ('2016-05', 1281),
  ('2016-06', 1284),
  ('2016-07', 1473),
  ('2016-08', 1376),
  ('2016-09', 1318),
  ('2016-10', 1363),
  ('2016-11', 1555),
  ('2016-12', 1466),
  ('2017-01', 1584),
  ('2017-02', 1638),
  ('2017-03', 2078),
  ('2017-04', 1885),
  ('2017-05', 1983),
  ('2017-06', 2068),
  ('2017-07', 2172),
  ('2017-08', 2249),
  ('2017-09', 2081)],
 'pymc/pymc3': [('2016-01', 12),
  ('2016-02', 12),
  ('2016-03', 14),
  ('2016-04', 2),
  ('2016-05', 16),
  ('2016-06', 10),
  ('2016-07', 18),
  ('2016-08', 14),
  ('2016-09', 22),
  ('2016-10', 10),
  ('2016-11', 26),
  ('2016-12', 18),
  ('2017-01', 24),
  ('2017-02', 34),
  ('2017-03', 28),
  ('2017-04', 38),
  ('2017-05', 40),
  ('2017-06', 20),
  ('2017-07', 16),
  ('2017-08', 40),
  ('2017-09', 22)],
 'pytorch': [('2016-01', 0),
  ('2016-02', 0),
  ('2016-03', 0),
  ('2016-04', 0),
  ('2016-05', 0),
  ('2016-06', 0),
  ('2016-07', 0),
  ('2016-08', 0),
  ('2016-09', 0),
  ('2016-10', 0),
  ('2016-11', 0),
  ('2016-12', 0),
  ('2017-01', 4),
  ('2017-02', 3),
  ('2017-03', 15),
  ('2017-04', 26),
  ('2017-05', 25),
  ('2017-06', 58),
  ('2017-07', 34),
  ('2017-08', 37),
  ('2017-09', 43)],
 'scikit-learn': [('2016-01', 186),
  ('2016-02', 196),
  ('2016-03', 206),
  ('2016-04', 214),
  ('2016-05', 224),
  ('2016-06', 219),
  ('2016-07', 222),
  ('2016-08', 223),
  ('2016-09', 188),
  ('2016-10', 229),
  ('2016-11', 291),
  ('2016-12', 254),
  ('2017-01', 265),
  ('2017-02', 254),
  ('2017-03', 325),
  ('2017-04', 303),
  ('2017-05', 268),
  ('2017-06', 293),
  ('2017-07', 359),
  ('2017-08', 321),
  ('2017-09', 290)],
 'scipy': [('2016-01', 145),
  ('2016-02', 171),
  ('2016-03', 197),
  ('2016-04', 178),
  ('2016-05', 187),
  ('2016-06', 156),
  ('2016-07', 159),
  ('2016-08', 157),
  ('2016-09', 159),
  ('2016-10', 196),
  ('2016-11', 217),
  ('2016-12', 178),
  ('2017-01', 150),
  ('2017-02', 191),
  ('2017-03', 238),
  ('2017-04', 243),
  ('2017-05', 210),
  ('2017-06', 215),
  ('2017-07', 194),
  ('2017-08', 184),
  ('2017-09', 159)],
 'scrapy': [('2016-01', 149),
  ('2016-02', 162),
  ('2016-03', 158),
  ('2016-04', 138),
  ('2016-05', 170),
  ('2016-06', 165),
  ('2016-07', 164),
  ('2016-08', 133),
  ('2016-09', 141),
  ('2016-10', 145),
  ('2016-11', 150),
  ('2016-12', 133),
  ('2017-01', 191),
  ('2017-02', 197),
  ('2017-03', 158),
  ('2017-04', 199),
  ('2017-05', 221),
  ('2017-06', 210),
  ('2017-07', 249),
  ('2017-08', 214),
  ('2017-09', 208)],
 'seaborn': [('2016-01', 34),
  ('2016-02', 47),
  ('2016-03', 50),
  ('2016-04', 48),
  ('2016-05', 49),
  ('2016-06', 40),
  ('2016-07', 36),
  ('2016-08', 62),
  ('2016-09', 34),
  ('2016-10', 51),
  ('2016-11', 40),
  ('2016-12', 35),
  ('2017-01', 48),
  ('2017-02', 50),
  ('2017-03', 64),
  ('2017-04', 68),
  ('2017-05', 56),
  ('2017-06', 56),
  ('2017-07', 77),
  ('2017-08', 81),
  ('2017-09', 73)],
 'spacy': [('2016-01', 3),
  ('2016-02', 1),
  ('2016-03', 2),
  ('2016-04', 3),
  ('2016-05', 4),
  ('2016-06', 9),
  ('2016-07', 3),
  ('2016-08', 8),
  ('2016-09', 8),
  ('2016-10', 8),
  ('2016-11', 11),
  ('2016-12', 14),
  ('2017-01', 9),
  ('2017-02', 17),
  ('2017-03', 24),
  ('2017-04', 24),
  ('2017-05', 22),
  ('2017-06', 15),
  ('2017-07', 17),
  ('2017-08', 22),
  ('2017-09', 31)],
 'statsmodels': [('2016-01', 13),
  ('2016-02', 11),
  ('2016-03', 26),
  ('2016-04', 23),
  ('2016-05', 23),
  ('2016-06', 15),
  ('2016-07', 14),
  ('2016-08', 16),
  ('2016-09', 8),
  ('2016-10', 16),
  ('2016-11', 15),
  ('2016-12', 19),
  ('2017-01', 19),
  ('2017-02', 27),
  ('2017-03', 27),
  ('2017-04', 19),
  ('2017-05', 21),
  ('2017-06', 28),
  ('2017-07', 37),
  ('2017-08', 40),
  ('2017-09', 19)],
 'tensorflow': [('2016-01', 170),
  ('2016-02', 229),
  ('2016-03', 242),
  ('2016-04', 272),
  ('2016-05', 332),
  ('2016-06', 496),
  ('2016-07', 491),
  ('2016-08', 506),
  ('2016-09', 448),
  ('2016-10', 583),
  ('2016-11', 748),
  ('2016-12', 736),
  ('2017-01', 857),
  ('2017-02', 1036),
  ('2017-03', 1325),
  ('2017-04', 1203),
  ('2017-05', 1281),
  ('2017-06', 1405),
  ('2017-07', 1509),
  ('2017-08', 1436),
  ('2017-09', 1220)],
 'theano': [('2016-01', 75),
  ('2016-02', 71),
  ('2016-03', 87),
  ('2016-04', 76),
  ('2016-05', 79),
  ('2016-06', 67),
  ('2016-07', 72),
  ('2016-08', 61),
  ('2016-09', 73),
  ('2016-10', 80),
  ('2016-11', 93),
  ('2016-12', 119),
  ('2017-01', 77),
  ('2017-02', 88),
  ('2017-03', 123),
  ('2017-04', 74),
  ('2017-05', 92),
  ('2017-06', 76),
  ('2017-07', 66),
  ('2017-08', 43),
  ('2017-09', 53)]}

In [17]:
plt.clf()

img = plt.gcf()

ax = plt.gca()
for key in list(results.keys()):
    x = periods
    y = [pair[1] for pair in results[key]]
    plt.plot(np.array(x),np.array(y),label=key)
    plt.xticks(rotation=45)
    
plt.legend(loc='lower right')
plt.grid(True)

plt.yscale('symlog')


img.set_size_inches(20,12)



In [ ]: