In [1]:

    
%matplotlib inline
import os
import glob
import pylab
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 5)
import seaborn as sns
sns.set_style('whitegrid')

import matplotlib.colors
from matplotlib.dates import date2num
from datetime import datetime

from pysurvey.plot import setup_sns as setup
from pysurvey.plot import minmax, icolorbar, density, legend, text, dateticks

# histogram rather than groups SELECT subreddit, created_year as year, INTEGER(ROUND(number, 0)) AS bin, count(*) as count, sum(nauthor) as nauthor FROM ( SELECT author, subreddit, YEAR(sec_to_timestamp(created_utc)) as created_year, COUNT(*) as nauthor, LENGTH(REGEXP_EXTRACT(author, r'(\d+)')) as number_length, INTEGER(REGEXP_EXTRACT(author, r'(\d+)')) as number FROM TABLE_QUERY([fh-bigquery:reddit_comments], "table_id CONTAINS '20' AND LENGTH(table_id)< 8" ) WHERE (REGEXP_MATCH(author, r'\d{4}') ) GROUP BY author, subreddit, number, number_length, created_year ORDER BY nauthor DESC ) WHERE (number > 0) and (number < 2020) and (number_length == 2) GROUP BY bin, subreddit, year order by year desc, subreddit desc, bin asc



In [42]:

    
# Only includes up to 2014
# df = pd.read_csv('/Users/ajmendez/data/reddit/subreddit_numbers_eachyear.csv')
# includes out to 2016
# df = pd.read_csv('/Users/ajmendez/data/reddit/subreddit_2_eachyear.csv')
df = pd.read_csv('/Users/ajmendez/data/reddit/subreddit_2_eachyear_v1.csv')
df









    Out[42]:






  
    
      
      Unnamed: 0
      subreddit
      year
      bin
      count
      nauthor
      age
      ntotal
      nunique
      nyear
      ngoodyear
    
  
  
    
      0
      0
      zen
      2016
      1
      1
      2
      2015
      1
      1
      7
      0
    
    
      1
      1
      youtubecomments
      2016
      26
      1
      2
      1990
      1
      1
      2
      0
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      11604
      11604
      WTF
      2008
      46
      1
      1
      1962
      1
      1
      9
      0
    
    
      11605
      11605
      reddit.com
      2007
      76
      1
      1
      1931
      1
      1
      5
      0
    
  

11606 rows × 11 columns



In [47]:

    
df['age'] = df['year'] - df['bin']



In [4]:

    
# Update the unique, and totals for simple filtering.
df['ntotal'] = df['nunique'] = 0
for (subreddit, year), d in df.groupby(['subreddit', 'year']):
    isgood = (df['subreddit'] == subreddit) & (df['year'] == year)
    df.loc[isgood, ['ntotal', 'nunique']]  = np.sum(d['count']), len(d)



In [5]:

    
df['nyear'] = 0
for subreddit, d in df.groupby('subreddit'):
    isgood = (df['subreddit'] == subreddit)
    df.loc[isgood, 'nyear'] = len(np.unique(d['year']))



In [6]:

    
df['ngoodyear'] = 0
isgood = (df['ntotal'] > 10) & (df['nunique'] > 5)
for subreddit,d in df[isgood].groupby('subreddit'):
    df.loc[isgood & (df['subreddit'] == subreddit), 'ngoodyear'] = len(np.unique(d['year']))



In [7]:

    
# df.to_csv('/Users/ajmendez/data/reddit/subreddit_numbers_eachyear_v1.csv')
# df.to_csv('/Users/ajmendez/data/reddit/subreddit_2_eachyear_v1.csv')



In [26]:

    
ax = setup(figsize=(12,6))
ax.get_yaxis().get_major_formatter().set_useOffset(False)
den = density(df['bin'], df['year'], weights=df['count'], 
              bins=(np.arange(0, 100, 2), np.arange(2007, 2017,1)),
#               ynorm=True,
              cmap=pylab.cm.jet, logvrange=True,
                   )



In [21]:

    
numbers = df[df['year'] != 2007]



In [33]:

    
yearnorm = matplotlib.colors.Normalize(2007,2016)
bins = np.arange(0, 100, 1)
setup(figsize=(12,6))
values = [33,44,66,77,88,69,42]

for k,(year, d) in enumerate(numbers.groupby('year'),1):
    color = pylab.cm.Spectral(yearnorm(year))
    ax = setup(xlabel='Year', ylabel='Fraction')
    v,l = np.histogram(d['bin'], bins=bins, weights=d['count'])
    pylab.plot(l[:-1], v*1.0/np.sum(v),  '-s', alpha=0.7, color=color, label=year)
    for value in values:
        pylab.axvline(value, color='k', alpha=0.5, zorder=-2)
        text(value, 0.45, value, rotation=90, ha='center', va='bottom')
legend(ax=ax, loc=2)
pylab.tight_layout()



In [312]:

    
yearnorm = matplotlib.colors.Normalize(2007,2016)
bins = np.arange(1960, 2000, 2)

V,L = np.histogram(numbers['bin'], bins=bins, weights=numbers['count'])
setup(figsize=(12,8))
for k,(year, d) in enumerate(numbers.groupby('year'),1):
    color = pylab.cm.Spectral(yearnorm(year))
    ax = setup(subplt=(2,1,1), xlabel='Year', ylabel='Fraction', xticks=False)
    v,l = np.histogram(d['bin'], bins=bins, weights=d['count'])
    pylab.plot(year-l[:-1], v*1.0/np.sum(v),  '-s', alpha=0.7, color=color, label=year)
    
    
    ax2 = setup(subplt=(2,1,2), xlabel='Year', ylabel='Relative Fraction')
    pylab.plot(year-l[:-1], v*1.0/np.sum(v) / (V*1.0 / np.sum(V)) - 1,  '-s', alpha=0.7, color=color, label=year)
    
legend(ax=ax, loc=2)
pylab.tight_layout()



In [303]:

    
## yearnorm = matplotlib.colors.Normalize(2007,2016)


setup(figsize=(12,12))
out = np.zeros( (3, 10) )
for k,(year, d) in enumerate(numbers.groupby('year')):
    ax = setup(subplt=(2,1,1), xlabel='Age', ylabel='Cumulative Fraction', xticks=True)
#     pylab.axhline(0.5)
    color = pylab.cm.Spectral(yearnorm(year))
    bins = np.arange(13,60,1)
    v,l = np.histogram(d['year'] - d['bin'], bins=bins, weights=d['count'])
    
    x = l[:-1]
    y = np.cumsum(v)*1.0/np.sum(v)
    
#     avg = np.average(year-d['bin'], weights=d['count'])
    avg = np.interp([0.25, 0.5, 0.75], y, x)
#     pylab.plot(avg, 0.5, 's', markersize=20, color=color)
    out[:, k] = year, avg[1], (avg[2]-avg[0])/2.0
#     pylab.axvline(avg, lw=2, alpha=0.5, color=color)
    pylab.plot(x, y,  '-s', alpha=0.7,
                   color=color, label=year)

    
legend(loc=2)
pylab.tight_layout()



In [290]:

    
ax = setup(figsize=(8,6), 
           xlabel='Year', xr=[2006.5, 2016.5], 
           ylabel='Median Age', yr=[22, 32])
ax.get_xaxis().get_major_formatter().set_useOffset(False)
pylab.plot(out[0], out[1], '-s')









    Out[290]:





[<matplotlib.lines.Line2D at 0x119873b90>]



In [313]:

    
numbers = df[(df['ngoodyear'] == 10) & (df['year'] != 2000) & (df['year'] != 2001) ]
print np.unique(numbers['subreddit']), len(np.unique(numbers['subreddit']))









    



['entertainment' 'politics' 'programming' 'science'] 4



In [314]:

    
yearnorm = matplotlib.colors.Normalize(2007,2014)
setup(figsize=(8,8))
for k,(subreddit, d) in enumerate(numbers.groupby('subreddit'),1):
    ax = setup(subplt=(2,2,k), title=subreddit, autoticks=True,
              xlabel='age', xr=[10,50], 
               ylabel='Fraction', yr=[0,1])
#     pylab.axvspan(-20, 20, color='0.5', alpha=0.3, zorder=-2),
    for i, (year, e) in enumerate(d.groupby('year')):
        bins = np.arange(year-1998,60, 2)
        v,l = np.histogram(e['year'] - e['bin'], bins=bins, weights=e['count'])
        y = np.cumsum(v)*1.0/v.sum()
        age = np.interp([0.5], y, l[:-1])[0]
        pylab.plot(l[:-1], y,
                   color=pylab.cm.jet(yearnorm(year)), 
                   label='{}: {:0.1f}'.format(year, age))
    legend(loc=4)
#     break
pylab.tight_layout()



In [315]:

    
yearnorm = matplotlib.colors.Normalize(2007,2014)
setup(figsize=(12,5))
for k,(subreddit, d) in enumerate(numbers.groupby('subreddit'),1):
    ax = setup(subplt=(1,4,k), title=subreddit, autoticks=True,
              xlabel='year', xr=[2006.5,2016.5], 
               ylabel='Fraction', yr=[20,34])
    ax.get_xaxis().get_major_formatter().set_useOffset(False)
    
    out = np.zeros( (2, 10) )
    for i, (year, e) in enumerate(d.groupby('year')):
        bins = np.arange(year-1999,60, 1)
        v,l = np.histogram(e['year'] - e['bin'], bins=bins, weights=e['count'])
        x, y = l[:-1], np.cumsum(v)*1.0/np.sum(v)
        avg = np.interp([0.5], y, x)
        out[:, i] = year, avg
    pylab.plot(out[0], out[1], '-s', label='Average: {:0.1f}'.format(np.mean(out[1])))
    legend(loc=4)
pylab.tight_layout()



In [44]:

    
numbers = df[df['ngoodyear'] >= 5]
print np.unique(numbers['subreddit']), len(np.unique(numbers['subreddit']))









    



['AskReddit' 'IAmA' 'WTF' 'funny' 'gaming' 'pics' 'politics'
 'todayilearned' 'videos' 'worldnews'] 10



In [45]:

    
subreddit_ages = []
for k,(subreddit, d) in enumerate(numbers.groupby('subreddit')):
    out = np.zeros( (3, 10) )
    for i, (year, e) in enumerate(d.groupby('year')):
        bins = np.arange(year-1999,60, 2)
        v,l = np.histogram(e['year'] - e['bin'], bins=bins, weights=e['count'])
        x, y = l[:-1], np.cumsum(v)*1.0/np.sum(v)
        avg = np.interp([0.25, 0.5, 0.75], y, x)
        out[:, i] = year, avg[1], avg[2] - avg[0]
    tmp = np.mean(out[1][out[1] > 0])
    tmp2 = np.mean(out[2][out[2] > 0])
    subreddit_ages.append([subreddit, tmp, tmp2, len(d)])
for i,(subreddit, age, std, nobs) in enumerate(sorted(subreddit_ages, key=lambda x: -x[1])):
    print '{:15s} {: 4.1f} {:4.1f} {:8d}'.format(subreddit, age, std, nobs)









    



AskReddit        nan  nan      312
IAmA             nan  nan      129
WTF              nan  nan      133
funny            nan  nan      196
gaming           nan  nan      118
pics             nan  nan      196
politics         nan  nan       93
todayilearned    nan  nan      137
videos           nan  nan      146
worldnews        nan  nan      148



In [46]:

    
out









    Out[46]:





array([[ 2011.,  2012.,  2013.,  2014.,  2015.,  2016.,     0.,     0.,
            0.,     0.],
       [   nan,    nan,    nan,    nan,    nan,    nan,     0.,     0.,
            0.,     0.],
       [   nan,    nan,    nan,    nan,    nan,    nan,     0.,     0.,
            0.,     0.]])



In [ ]:

	Unnamed: 0	subreddit	year	bin	count	nauthor	age	ntotal	nunique	nyear	ngoodyear
0	0	zen	2016	1	1	2	2015	1	1	7	0
1	1	youtubecomments	2016	26	1	2	1990	1	1	2	0
...	...	...	...	...	...	...	...	...	...	...	...
11604	11604	WTF	2008	46	1	1	1962	1	1	9	0
11605	11605	reddit.com	2007	76	1	1	1931	1	1	5	0