In [208]:
%matplotlib inline
import os
import glob
import pylab
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 5)
import seaborn as sns
sns.set_style('whitegrid')
from matplotlib.dates import date2num
from datetime import datetime
from pysurvey.plot import setup_sns as setup
from pysurvey.plot import minmax, icolorbar, density, legend, text, dateticks
Had to impose a limit so that I did not have to spin up a dataset or something with google.
Decided to spin up a dataset and see how well this really works. Google -- thanks for the free trial!
There are a few cohort tables that are a bit more detailed and would be good to use. They have been built for the following months:
I have some decent compute nodes at my disposal so lets do the clean up and visualization here.
In [76]:
# From: https://www.reddit.com/r/autowikibot/wiki/redditbots
bots = open('/Users/ajmendez/data/reddit/botlist.txt', 'r').read().splitlines()
In [20]:
filename
Out[20]:
In [209]:
titles = []
dfs = []
for filename in glob.glob('/Users/ajmendez/data/reddit/*.csv'):
# make sure we are just reading the year csvs
if '20' not in filename:
continue
title = int(os.path.splitext(os.path.basename(filename))[0])
titles.append(title)
df = pd.read_csv(filename)
tmp = df['author'].str.extract(u'(\d+)')
df['year'] = title
df['number'] = tmp.apply(lambda x: int(x) if isinstance(x, (str, unicode)) else np.nan)
df['nlength'] = tmp.apply(lambda x: len(x) if isinstance(x, (str,unicode)) else 0)
# df['botname'] = df['author'].str.extract(u'({})'.format('|'.join(bots))) # slow
df['botname'] = df['author'].str.extract(u'(.*[Bb]ot$)')
df['isbot'] = df['botname'].notnull()
dfs.append(df)
print('Found Years: {}'.format(', '.join(map(str, titles))))
In [180]:
reddit = pd.concat(dfs)
In [85]:
# Save it to disk.
reddit.to_csv('/Users/ajmendez/data/reddit/all.csv')
In [22]:
bins = np.arange(0, 101,1)
setup(figsize=(12, 18))
for i, (title,df) in enumerate(zip(titles,dfs)):
ax = setup(subplt=(len(titles),1,i+1),
ylabel=title,
xlabel='Two Digit Number', xticks=(i==len(titles)-1))
isdigits = (df['nlength'] == 2)
x = df.loc[isdigits, 'number'].as_matrix()
pylab.axvline(titles[i]-2000 + 0.5, lw=2, alpha=0.8, zorder=-2, color='k')
pylab.axvline(titles[i]-22-1900 + 0.5, lw=2, alpha=0.8, zorder=-2, color='r')
pylab.hist(x, bins=bins, alpha=0.6, lw=0)
v,l = np.histogram(x, bins)
ii = np.argsort(-v)[:10]
print title, l[ii], v[ii]
pylab.tight_layout()
In [5]:
bins = np.arange(1960, 2020, 1)
setup(figsize=(12, 18))
for i, (title, df) in enumerate(zip(titles, dfs)):
ax = setup(subplt=(len(titles),1,i+1),
ylabel=titles[i], #ylog=True, yr=[1, 1e4],
xr=minmax(bins), xlabel='Four Digit Number', xticks=i==len(titles)-1)
isdigits = (df['nlength'] == 4)
pylab.axvline(title + 0.5, zorder=-2, lw=2, color='k')
pylab.hist(df.loc[isdigits, 'number'].as_matrix(), bins=bins, alpha=0.6, lw=0)
pylab.tight_layout()
In [6]:
def make_bins(xr, yr, ndigit=2):
x,y = np.arange(xr[0],xr[1]+2), np.arange(yr[0], yr[1]+2),
X,Y = np.meshgrid(x*1.0,y*1.0)
Z = np.zeros( (len(x)-1, len(y)-1) )
for i,df in enumerate(dfs):
isdigits = (df['nlength'] == ndigit)
tmp = df.loc[isdigits, 'number'].as_matrix()
Z[:, i] = np.histogram(tmp, bins=x, density=True)[0]
return X, Y, np.ma.MaskedArray(Z, Z==0)
xr = [0,101]
yr = [2007, 2014]
X,Y,Z = make_bins(xr, yr)
In [25]:
years = np.arange(2007, 2015)
setup(figsize=(18,5),
xr=[0,100], xlabel='Two Digit Number',
xtickv=np.arange(0,101, 10) + 0.5, xticknames=np.arange(0,101, 10),
ylabel='Comment Year', ytickv=years+0.5, yticknames=titles,)
pcm = pylab.pcolormesh(X,Y,Z.T, label='Fraction of Users / bin',
vmin=0, vmax=0.04,
zorder=-2, alpha=0.9, cmap=pylab.cm.Spectral_r)
icolorbar(pcm, loc=1, borderpad=-4, tickfmt='{:.2f}', width='10%')
years = np.arange(2007, 2016)
pylab.plot(years-2000, years, lw=2, alpha=0.2)
numbers = [42, 69, 77, 33, 22, 88, 55]
for number in numbers:
pylab.text(number+0.5, 2015, number, ha='center', va='bottom', alpha=0.5)
In [26]:
NZ = np.ma.MaskedArray(Z.T/np.mean(Z, axis=1), Z.T==0)
setup(figsize=(18,5),
xr=[0,100], xlabel='Two Digit Number',
xtickv=np.arange(0,101, 10) + 0.5, xticknames=np.arange(0,101, 10),
ylabel='Comment Year', ytickv=years+0.5, yticknames=titles,)
pcm = pylab.pcolormesh(X,Y,NZ, label='Excess Users / bin',
vmin=0.3, vmax=1.8,
zorder=-2, alpha=0.9, cmap=pylab.cm.Spectral_r)
icolorbar(pcm, loc=1, borderpad=-4, tickfmt='{:.1f}', width='10%')
numbers = [42, 69, 77, 33, 22, 88, 55]
for number in numbers:
pylab.text(number+0.5, 2015, number, ha='center', va='bottom', alpha=0.5)
In [13]:
four_xr = [1960,2021]
four_X,four_Y,four_Z = make_bins(four_xr, yr, ndigit=4)
In [15]:
years = np.arange(2007, 2015)
setup(figsize=(18,5),
xr=four_xr, xlabel='Four Digit Number',
xtickv=np.arange(1960,2021, 10) + 0.5, xticknames=np.arange(1960,2021, 10),
yr=yr, ylabel='Comment Year', ytickv=years+0.5, yticknames=titles,)
pcm = pylab.pcolormesh(four_X, four_Y, four_Z.T, label='Fractional Users / bin',
vmin=0, vmax=0.04,
zorder=-2, alpha=0.9, cmap=pylab.cm.Spectral_r)
icolorbar(pcm, loc=1, borderpad=-4, tickfmt='{:.2f}')
years = np.arange(2007, 2016)
pylab.plot(years, years, lw=2, alpha=0.5)
# numbers = [42, 69, 77, 33, 23, 88, 55]
# for number in numbers:
# pylab.text(number+0.5, 2015, number, ha='center', va='bottom')
Out[15]:
In [16]:
years = np.arange(2007, 2015)
four_NZ = np.ma.MaskedArray(four_Z.T/np.mean(four_Z, axis=1), four_Z.T==0)
setup(figsize=(18,5),
xr=four_xr, xlabel='Four Digit Number',
xtickv=np.arange(1960,2021, 10) + 0.5, xticknames=np.arange(1960,2021, 10),
yr=yr, ylabel='Comment Year', ytickv=years+0.5, yticknames=titles,)
pcm = pylab.pcolormesh(four_X, four_Y, four_NZ, label='Excess Users / bin',
vmin=0.2, vmax=2.8,
zorder=-2, alpha=0.9, cmap=pylab.cm.Spectral_r)
icolorbar(pcm, loc=1, borderpad=-4, tickfmt='{:.2f}')
pylab.plot(years, years, lw=4, alpha=0.9)
pylab.plot(years-25, years, lw=4, alpha=0.9, color='k')
Out[16]:
In [17]:
df = dfs[-1]
isgood = ( (df['nlength'] == 4) & (dfs[-1]['number'] == 1960) )
df.loc[isgood, 'author']
Out[17]:
In [19]:
narr = np.arange(0,101)
numbers = [42, 69, 92]
setup(figsize=(12,4), subplt=(1,3,1), title='"Interesting" numbers',
xr=yr, xlabel='Year', xtickv=np.arange(2007,2015,2), xticknames=np.arange(2007,2015,2),
yr=[0,0.04], ytickv=np.arange(0,0.041,0.01), ylabel='Fraction of users', )
for number in numbers:
i = np.where(number == narr)[0]
pylab.plot(Y[:-1,i], Z.T[:,i], lw=3, alpha=0.8, label=number)
legend(loc=1, alpha=0.8)
numbers = [1980, 1982, 1986, 1988, 1990, 1992]
# people use 99 in excess of years, so disable for plot
Z99 = Z.T.view()
Z99[:,-3:] = np.nan
setup(subplt=(1,3,2), yticks=False, title='Two Digit Cohort (age)',
xr=yr, xlabel='Year', xtickv=np.arange(2007,2015,2), xticknames=np.arange(2007,2015,2),
yr=[0,0.04], ytickv=np.arange(0,0.041,0.01), ylabel='Fraction of users', )
for k,number in enumerate(numbers):
i = np.arange(8)
j = np.where(number-1900 == narr)[0]
pylab.plot(Y[:-1,j], Z99[i,j+i], lw=3, alpha=0.8,
label=2007-number, color=pylab.cm.Blues_r(k*1.0/len(numbers)))
legend(loc=2, alpha=0.8)
narr = np.arange(1960,2020)
setup(subplt=(1,3,3), yticks=False, title='Four Digit Cohort (age)',
xr=yr, xlabel='Year', xtickv=np.arange(2007,2015,2), xticknames=np.arange(2007,2015,2),
yr=[0,0.04], ytickv=np.arange(0,0.041,0.01), ylabel='Fraction of users', )
for k,number in enumerate(numbers):
i = np.arange(8)
j = np.where(number == narr)[0]
pylab.plot(Y[:-1,j], four_Z.T[i,j+i], lw=3, alpha=0.8,
label=2007-number, color=pylab.cm.Blues_r(k*1.0/len(numbers)))
# legend(loc=2, alpha=0.8)
pylab.tight_layout()
In [410]:
2015-1992, 2015-1980
Out[410]:
In [135]:
isgood = (dfall['nlength'] == 4)
setup(figsize=(16,5), ytickv=np.arange(2007,2015)+0.5, yticknames=np.arange(2007,2015))
den = density(dfall.loc[isgood, 'number'].astype(int),
dfall.loc[isgood, 'year'].astype(int),
weights=np.ones(isgood.sum())*1e2, vmin=0, vmax=4,
bins=[np.arange(1950, 2021), np.arange(2007,2015)],
ynorm=True, xnorm=False, cmap=pylab.cm.Spectral_r, colorbar=False)
icolorbar(den)
Out[135]:
In [55]:
wiki = pd.read_csv('/Users/ajmendez/data/usernames/wikipedia.csv')
tmp = df['author'].str.extract(u'(\d+)')
wiki['number'] = tmp.apply(lambda x: int(x) if isinstance(x, (str, unicode)) else np.nan)
wiki['nlength'] = tmp.apply(lambda x: len(x) if isinstance(x, (str,unicode)) else 0)
print len(wiki)
In [171]:
def add_line(year, label, y=11500, color='k',offset=0, **kwargs):
pylab.axvline(year+0.5, color=color, zorder=-1, alpha=0.2, **kwargs)
text(year+0.6+offset, y, label, color=color, fontsize=12,
ha='center', va='top', multialignment='right', rotation=90,
**kwargs)
isdigit = (wiki['nlength'] == 2)
setup(figsize=(12,5), title='Wikipedia Revisions',
ylabel='Number / bin',
xlabel='Two digit number')
_ = pylab.hist(wiki.loc[isdigit, 'number'].as_matrix(),
bins=np.arange(0,101), alpha=0.7, lw=0)
add_line(13, '2013?', color='g', offset=1.5)
add_line(23, '23 years old')
add_line(69, '69', color='DarkBlue')
add_line(42, '42', color='DarkBlue')
add_line(88, '26 years old', color='r')
add_line(92, '22 years old')
In [132]:
hn = pd.read_csv('/Users/ajmendez/data/usernames/hackernews.csv')
tmp = df['author'].str.extract(u'(\d+)')
hn['number'] = tmp.apply(lambda x: int(x) if isinstance(x, (str, unicode)) else np.nan)
hn['nlength'] = tmp.apply(lambda x: len(x) if isinstance(x, (str,unicode)) else 0)
print len(hn)
In [175]:
isdigit = (hn['nlength'] == 2)
setup(figsize=(12,5), title='Hacker News Comments',
ylabel='Number / bin',
xlabel='Two digit number')
_ = pylab.hist(hn.loc[isdigit, 'number'].as_matrix(),
bins=np.arange(0,101), alpha=0.7, lw=0)
add_line(13, '2013?', y=650, color='g', offset=1.5)
add_line(23, '23 years old', y=650, offset=1.5)
add_line(69, '69', color='DarkBlue', y=650)
add_line(42, '42', color='DarkBlue', y=650)
add_line(88, '27 years old', color='r', y=650)
add_line(92, '23 years old', y=650)
In [178]:
isdigit = (wiki['nlength'] == 4)
setup(figsize=(12,5), title='Wikipedia Revisions',
ylabel='Number / bin',
xlabel='Two digit number')
_ = pylab.hist(wiki.loc[isdigit, 'number'].as_matrix(),
bins=np.arange(1960,2021), alpha=0.7, lw=0)
add_line(1991, '23 years old', y=1900)
add_line(2014, '2014', y=1900)
In [176]:
isdigit = (hn['nlength'] == 4)
setup(figsize=(12,5), title='Hacker News Comments',
ylabel='Number / bin',
xlabel='Four digit number')
_ = pylab.hist(hn.loc[isdigit, 'number'].as_matrix(),
bins=np.arange(1960,2021), alpha=0.7, lw=0)
add_line(1991, '24 years old', y=130)
add_line(1984, '31 years old', y=130)
add_line(2014, '2014', y=130)
In [186]:
y=9000
isdigit = (reddit['nlength'] == 4)
setup(figsize=(12,5), title='Hacker News Comments',
ylabel='Number / bin',
xlabel='Four digit number')
_ = pylab.hist(reddit.loc[isdigit, 'number'].as_matrix(),
bins=np.arange(1960,2021), alpha=0.7, lw=0)
add_line(1990, '24 years old', y=y)
add_line(2014, '2014', y=y)
In [201]:
def make_hist(df, label):
isdigit = (df['nlength'] == 4)
color = next(pylab.gca()._get_lines.color_cycle)
pylab.hist(df.loc[isdigit, 'number'].as_matrix(), label=label, histtype='step',
bins=np.arange(1960,2021), alpha=0.7, lw=2, color=color, normed=True)
pylab.hist(df.loc[isdigit, 'number'].as_matrix(),
bins=np.arange(1960,2021), alpha=0.1, lw=0, color=color, normed=True)
setup(figsize=(12,5),
ylabel='Fraction',
xlabel='Two digit number')
make_hist(wiki, 'Wikipedia')
make_hist(reddit, 'Reddit')
make_hist(hn, 'Hacker News')
legend(loc=2)
Out[201]:
In [202]:
def make_hist(df, label):
isdigit = (df['nlength'] == 2)
color = next(pylab.gca()._get_lines.color_cycle)
pylab.hist(df.loc[isdigit, 'number'].as_matrix(), label=label, histtype='step',
bins=np.arange(0,101), alpha=0.7, lw=2, color=color, normed=True)
pylab.hist(df.loc[isdigit, 'number'].as_matrix(),
bins=np.arange(0,101), alpha=0.1, lw=0, color=color, normed=True)
setup(figsize=(12,5),
ylabel='Fraction',
xlabel='Two digit number')
make_hist(wiki, 'Wikipedia')
make_hist(reddit, 'Reddit')
make_hist(hn, 'Hacker News')
legend(loc=2)
Out[202]: