In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import sys
from natsort import natsorted, ns
%matplotlib inline
In [9]:
# Per-tag monthly counts read from the Stack Overflow export.
# Maps tag -> list of (yearmonth, count) tuples, yearmonth formatted 'YYYY-MM'.
results = dict()
with open('/home/felipe/queirozfcom/files/stackoverflow-data-explorer-count-by-tag-2016-2017-taken-on-2017-10-26.txt','r') as f:
    for line in f:
        if not line.strip():
            # tolerate blank lines (e.g. a trailing newline at the end of the
            # export) instead of failing on the 3-way unpack below
            continue
        # split() without arguments already drops surrounding whitespace,
        # so the individual fields need no further strip()
        tag, count, yearmonth = line.split()
        # pad numbers: a single-digit month such as '2016-1' becomes '2016-01'
        yearmonth = re.sub(r'^(\d{4})-(\d{1})$',r'\1-0\2',yearmonth)
        if yearmonth=='2017-10':
            # month has not ended
            continue
        # append in place rather than rebuilding the list for every row
        results.setdefault(tag, []).append((yearmonth, int(count)))
In [10]:
# Inspect the (yearmonth, count) pairs loaded for the 'pymc' tag.
results['pymc']
Out[10]:
In [11]:
# Every month from 2016-01 up to and including 2017-09, as zero-padded
# 'YYYY-MM' strings in chronological order.
periods = [
    '{}-{:02d}'.format(year, month)
    for year in (2016, 2017)
    for month in range(1, 13)
    if (year, month) <= (2017, 9)
]
In [12]:
# Show every tag that has been loaded so far.
list(results)
Out[12]:
In [13]:
# Give every tag an explicit (period, 0) entry for each month in `periods`
# that it has no row for, so that all tags cover the same set of months.
for key, pairs in results.items():
    existing_dates = {pair[0] for pair in pairs}
    # Walk `periods` itself instead of a set difference: the zero entries are
    # then appended in chronological order, identically on every run, whereas
    # set iteration order for strings changes between interpreter sessions.
    zero_pairs = [(period, 0) for period in periods if period not in existing_dates]
    # Re-assigning an existing key does not resize the dict, so this is safe
    # while iterating over results.items().
    results[key] = pairs + zero_pairs
In [14]:
# Add the per-tag monthly counts of the Cross Validated export on top of the
# counts already held in `results`.
# NOTE: this cell is not idempotent -- running it a second time adds the
# Cross Validated counts again.
with open('/home/felipe/queirozfcom/files/crossvalidated-data-explorer-count-by-tag-2016-2017-taken-on-2017-10-26.txt','r') as f:
    for line in f:
        if not line.strip():
            # tolerate blank lines instead of failing on the unpack below
            continue
        # split() without arguments already drops surrounding whitespace
        tag, count, yearmonth = line.split()
        # pad numbers: a single-digit month such as '2016-1' becomes '2016-01'
        yearmonth = re.sub(r'^(\d{4})-(\d{1})$',r'\1-0\2',yearmonth)
        if yearmonth=='2017-10':
            # month has not ended
            continue
        # A tag that only exists in this export starts from an empty list
        # rather than raising KeyError.
        current_counts_for_this_tag = results.setdefault(tag, [])
        for position, (date, cnt) in enumerate(current_counts_for_this_tag):
            if date == yearmonth:
                # same tag and month already present: sum both sources
                current_counts_for_this_tag[position] = (date, cnt + int(count))
                break
        else:
            # month not present yet for this tag: start it with this count
            current_counts_for_this_tag.append((yearmonth, int(count)))
        # The original ended the loop body with a bare `break`, which stopped
        # the merge after the first processed row; every row is merged now.
In [15]:
# merge pymc and pymc3
# `pymc` must hold the counts of the plain 'pymc' tag: the following cell
# iterates over results['pymc3'] and adds the matching `pymc` count to each
# month. Reading results['pymc3'] here would double the pymc3 counts and
# ignore the 'pymc' tag altogether.
pymc = results['pymc']
pymc
Out[15]:
In [16]:
# For each (date, count) of the 'pymc3' tag, add the count of the first pair
# in `pymc` carrying the same date; dates without a match keep their count.
new_pairs = []
for date, count in results['pymc3']:
    extra = next((pair[1] for pair in pymc if pair[0] == date), 0)
    new_pairs.append((date, count + extra))
new_pairs
Out[16]:
In [17]:
# Drop the two individual tags (if present) and store the combined series
# under a single key.
for old_tag in ('pymc3', 'pymc'):
    results.pop(old_tag, None)
results['pymc/pymc3'] = new_pairs
In [ ]:
# Pad each tag with a (period, 0) pair for every month of `periods` that it
# does not contain yet.
wanted_periods = set(periods)
for tag in list(results.keys()):
    dates_present = set(entry[0] for entry in results[tag])
    gaps = wanted_periods - dates_present
    results[tag] = results[tag] + [(period, 0) for period in gaps]
In [ ]:
# Order the pairs of every tag by their 'YYYY-MM' string (first tuple item).
for tag, pairs in list(results.items()):
    results[tag] = natsorted(pairs, key=lambda item: item[0])
In [ ]:
# Display the whole tag -> [(yearmonth, count), ...] mapping.
results
In [ ]:
# Plot one line per tag: its monthly counts over `periods`, on a symlog
# y axis (linear around zero, logarithmic further out) so that tags with
# zero-count months can share a figure with much larger tags.
fig, ax = plt.subplots(figsize=(20, 12))
img = fig  # keep the figure reachable under its original name

x = np.array(periods)  # identical for every tag, so build it once
for key, pairs in results.items():
    # assumes each tag's pairs are already sorted and cover exactly `periods`
    y = np.array([pair[1] for pair in pairs])
    ax.plot(x, y, label=key)

ax.set_title('Monthly count per tag, {} to {}'.format(periods[0], periods[-1]))
ax.set_xlabel('month')
ax.set_ylabel('count (symlog scale)')
ax.set_yscale('symlog')
ax.grid(True)
ax.legend(loc='lower right');
In [ ]: