In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import sys
from natsort import natsorted, ns
%matplotlib inline

In [9]:
results = dict()

with open('/home/felipe/queirozfcom/files/stackoverflow-data-explorer-count-by-tag-2016-2017-taken-on-2017-10-26.txt','r') as f:
    for line in f:
                
        tag,count,yearmonth =line.split()
        
        tag = tag.strip()
        count = count.strip()
        yearmonth = yearmonth.strip()

        # pad numbers
        yearmonth = re.sub(r'^(\d{4})-(\d{1})$',r'\1-0\2',yearmonth)
        
        if yearmonth=='2017-10':
            continue
            # month has not ended 
        
        if results.get(tag,None) is None:
            results[tag] = [(yearmonth,int(count) )]
        else:
            results[tag] = results[tag] + [(yearmonth, int(count) )]

In [10]:
results['pymc']


Out[10]:
[('2016-01', 3),
 ('2016-10', 7),
 ('2016-11', 8),
 ('2016-12', 6),
 ('2016-02', 4),
 ('2016-03', 5),
 ('2016-04', 2),
 ('2016-05', 8),
 ('2016-06', 2),
 ('2016-07', 2),
 ('2016-08', 4),
 ('2016-09', 6),
 ('2017-01', 6),
 ('2017-02', 6),
 ('2017-03', 2),
 ('2017-04', 9),
 ('2017-05', 8),
 ('2017-06', 4),
 ('2017-07', 7),
 ('2017-08', 5),
 ('2017-09', 9)]

In [11]:
periods = ['2016-01','2016-02','2016-03','2016-04','2016-05','2016-06',
           '2016-07','2016-08','2016-09','2016-10','2016-11','2016-12',
           '2017-01','2017-02','2017-03','2017-04','2017-05','2017-06',
           '2017-07','2017-08','2017-09']

In [12]:
list(results.keys())


Out[12]:
['pandas',
 'spacy',
 'tensorflow',
 'scrapy',
 'theano',
 'pymc',
 'mxnet',
 'pymc3',
 'pytorch',
 'numpy',
 'opencv',
 'bokeh',
 'gensim',
 'beautifulsoup',
 'statsmodels',
 'keras',
 'networkx',
 'matplotlib',
 'scipy',
 'scikit-learn',
 'cntk',
 'nltk',
 'seaborn']

In [13]:
for key in list(results.keys()):
    existing_dates = [pair[0] for pair in results[key]]
    
    missing = set(periods) - set(existing_dates)
    
    for period in missing:
        results[key] = results[key] + [(period,0)]

In [14]:
with open('/home/felipe/queirozfcom/files/crossvalidated-data-explorer-count-by-tag-2016-2017-taken-on-2017-10-26.txt','r') as f:
    for line in f:
                
        tag,count,yearmonth =line.split()

        tag = tag.strip()
        count = count.strip()
        yearmonth = yearmonth.strip()
        
        # pad numbers
        yearmonth = re.sub(r'^(\d{4})-(\d{1})$',r'\1-0\2',yearmonth)
        
        if yearmonth=='2017-10':
            continue
            # month has not ended 
        
        current_counts_for_this_tag = results[tag]
        
        current_pair = [(date,cnt) for date,cnt in current_counts_for_this_tag if date == yearmonth][0]
                          
        _,current_count = current_pair
                    
        new_count = current_count + int(count)
        
        current_counts_for_this_tag.remove(current_pair)
        
                
        new_pair = (yearmonth,new_count)
        
        current_counts_for_this_tag.append(new_pair)
                                    
        results[tag] = current_counts_for_this_tag       
        
        break


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-14-bb1bd547dfc8> in <module>()
     17         current_counts_for_this_tag = results[tag]
     18 
---> 19         current_pair = [(date,cnt) for date,cnt in current_counts_for_this_tag if date == yearmonth][0]
     20 
     21 

<ipython-input-14-bb1bd547dfc8> in <listcomp>(.0)
     17         current_counts_for_this_tag = results[tag]
     18 
---> 19         current_pair = [(date,cnt) for date,cnt in current_counts_for_this_tag if date == yearmonth][0]
     20 
     21 

ValueError: not enough values to unpack (expected 2, got 1)

In [15]:
# merge pymc and pymc3
pymc = results['pymc3']
pymc


Out[15]:
[('2016-01', 6),
 ('2016-10', 5),
 ('2016-11', 13),
 ('2016-12', 9),
 ('2016-02', 6),
 ('2016-03', 7),
 ('2016-04', 1),
 ('2016-05', 8),
 ('2016-06', 5),
 ('2016-07', 9),
 ('2016-08', 7),
 ('2016-09', 11),
 ('2017-01', 12),
 ('2017-02', 17),
 ('2017-03', 14),
 ('2017-04', 19),
 ('2017-05', 20),
 ('2017-06', 10),
 ('2017-07', 8),
 ('2017-08', 20),
 ('2017-09', 11)]

In [16]:
new_pairs = []

for (date,count) in results['pymc3']:
    
    count_pymc = [pair[1] for pair in pymc if pair[0]==date]
    
    if count_pymc:
        new_count = count + count_pymc[0]
    else:
        new_count = count
          
    new_pairs.append((date,new_count))
    
new_pairs


Out[16]:
[('2016-01', 12),
 ('2016-10', 10),
 ('2016-11', 26),
 ('2016-12', 18),
 ('2016-02', 12),
 ('2016-03', 14),
 ('2016-04', 2),
 ('2016-05', 16),
 ('2016-06', 10),
 ('2016-07', 18),
 ('2016-08', 14),
 ('2016-09', 22),
 ('2017-01', 24),
 ('2017-02', 34),
 ('2017-03', 28),
 ('2017-04', 38),
 ('2017-05', 40),
 ('2017-06', 20),
 ('2017-07', 16),
 ('2017-08', 40),
 ('2017-09', 22)]

In [17]:
results.pop('pymc3',None)
results.pop('pymc',None)

results['pymc/pymc3'] = new_pairs

In [ ]:
for key in list(results.keys()):
    existing_values = [pair[0] for pair in results[key]]
    
    missing = set(periods) - set(existing_values)
    
    for period in missing:
        results[key] = results[key] + [(period,0)]

In [ ]:
for key in list(results.keys()):
    
    results[key] = natsorted(results[key],key=lambda pair: pair[0])

In [ ]:
results

In [ ]:
plt.clf()

img = plt.gcf()

ax = plt.gca()
for key in list(results.keys()):
    x = periods
    y = [pair[1] for pair in results[key]]
    plt.plot(np.array(x),np.array(y),label=key)
    
plt.legend(loc='lower right')
plt.grid(True)

plt.yscale('symlog')
    
img.set_size_inches(20,12)

In [ ]: