In [2]:
%matplotlib inline
import os
import glob
import pylab
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 5)
import seaborn as sns
sns.set_style('whitegrid')
import matplotlib.colors
from matplotlib.dates import date2num
from datetime import datetime
from pysurvey.plot import setup_sns as setup
from pysurvey.plot import minmax, icolorbar, density, legend, text, dateticks
In [4]:
# Earlier input file, kept for reference:
# df = pd.read_csv('/Users/ajmendez/data/reddit/subreddit_numbers.csv')
# Read the v1 table; the df.to_csv cell further down writes to this same path.
csv_path = '/Users/ajmendez/data/reddit/subreddit_numbers_v1.csv'
df = pd.read_csv(csv_path)
Out[4]:
In [14]:
# Update the unique, and totals for simple filtering.
#   ntotal  : sum of `count` over all rows sharing the row's (subreddit, year)
#   nunique : number of rows sharing the row's (subreddit, year)
# groupby().transform() broadcasts each group's value back onto its own rows in
# one pass; the previous Python loop rebuilt a full-length boolean mask for
# every group, which scales as (number of groups) x (number of rows).
counts_by_group = df.groupby(['subreddit', 'year'])['count']
df['ntotal'] = counts_by_group.transform('sum')
df['nunique'] = counts_by_group.transform('size')
# Rows whose subreddit or year is missing belong to no group. The old loop left
# them at the initial 0, so do the same instead of leaving NaN behind.
df[['ntotal', 'nunique']] = df[['ntotal', 'nunique']].fillna(0)
In [105]:
# nyear: number of distinct `year` values seen for the row's subreddit, repeated
# on every row of that subreddit. Computed with a single grouped transform
# rather than one full-frame mask per subreddit. Missing years are not counted
# as a year.
df['nyear'] = df.groupby('subreddit')['year'].transform('nunique')
# Rows without a subreddit fall outside every group; keep the previous default of 0.
df['nyear'] = df['nyear'].fillna(0)
# The thresholded count of years (ngoodyear) is computed in its own cell.
In [29]:
# ngoodyear: for rows with ntotal > 10 and nunique > 5, the number of distinct
# `year` values among the qualifying rows of the same subreddit. Rows that fail
# either threshold keep 0.
df['ngoodyear'] = 0
isgood = (df['ntotal'] > 10) & (df['nunique'] > 5)
# One grouped transform over the qualifying rows replaces the per-subreddit
# loop; the assignment aligns on the row index, so only `isgood` rows change.
good_years = df[isgood].groupby('subreddit')['year'].transform('nunique')
df.loc[isgood, 'ngoodyear'] = good_years
# Qualifying rows with a missing subreddit belong to no group; keep them at 0.
df['ngoodyear'] = df['ngoodyear'].fillna(0)
In [113]:
# Persist the augmented table. index=False keeps the row index out of the file:
# this path is the one loaded with a plain pd.read_csv at the top of the
# notebook, so a written index would come back as an extra 'Unnamed: 0' column
# and accumulate on every save/load cycle.
df.to_csv('/Users/ajmendez/data/reddit/subreddit_numbers_v1.csv', index=False)
In [71]:
# Count-weighted distribution of `bin` (10-wide bins, edges 1900..2030) for each
# year, drawn against the distribution over all of df (top panel), and the
# per-year deviation from that reference (bottom panel).
yearnorm = matplotlib.colors.Normalize(2007,2016)
bins = np.arange(1900, 2031, 10)

# Reference histogram: every row of df, weighted by `count`.
V,L = np.histogram(df['bin'], bins=bins, weights=df['count'])
overall = V*1.0/np.sum(V)

# This cell used to iterate over the global `numbers`, which is only assigned by
# a cell further down the notebook, so a top-to-bottom run failed with a
# NameError. Build the selection here instead, with the same thresholds as the
# first `numbers` cell below, under a cell-local name so nothing is clobbered.
# NOTE(review): the reference curve uses all of df while the per-year curves use
# this subset -- confirm that is the intended comparison.
selected = df[(df['ntotal'] > 100) & (df['nunique'] > 10) & (df['ngoodyear'] > 7)]

setup(figsize=(8,6))
for k,(year, d) in enumerate(selected.groupby('year'),1):
    color = pylab.cm.Spectral(yearnorm(year))
    v,l = np.histogram(d['bin'], bins=bins, weights=d['count'])
    frac = v*1.0/np.sum(v)

    # Top panel: fraction per bin, plotted at the left bin edges.
    ax = setup(subplt=(2,1,1), xlabel='Year', ylabel='Fraction', xticks=False)
    if k == 1:
        # Draw the all-years reference once, on the first pass only.
        pylab.plot(L[:-1], overall, lw=2, color='k')
    pylab.plot(l[:-1], frac, '-s', alpha=0.7,
               color=color, label=year)

    # Bottom panel: fractional deviation from the reference. Bins that are
    # empty in the reference divide by zero and come out as inf/nan.
    ax = setup(subplt=(2,1,2), xlabel='Year', ylabel='Relative Fraction')
    if k == 1:
        pylab.axhline(0, color='k', lw=2)
    pylab.plot(l[:-1], frac / overall - 1, '-s', alpha=0.7,
               color=color, label=year)
legend(loc=2)
pylab.tight_layout()
In [76]:
# Same comparison as the per-year `bin` histogram, but with the x axis expressed
# as (year - left bin edge), labelled 'Age', and using every row of df.
# The colour normalisation is defined here rather than inherited from whichever
# cell last assigned `yearnorm`, so the cell gives the same colours regardless
# of execution order.
yearnorm = matplotlib.colors.Normalize(2007,2016)
bins = np.arange(1900, 2006, 10)

# Reference histogram: every row of df, weighted by `count`.
V,L = np.histogram(df['bin'], bins=bins, weights=df['count'])
overall = V*1.0/np.sum(V)

setup(figsize=(12,12))
for k,(year, d) in enumerate(df.groupby('year'),1):
    print(year)
    color = pylab.cm.Spectral(yearnorm(year))
    v,l = np.histogram(d['bin'], bins=bins, weights=d['count'])
    frac = v*1.0/np.sum(v)

    ax = setup(subplt=(2,1,1), xlabel='Age', ylabel='Fraction', xticks=False)
    if k == 1:
        # The all-years reference is placed on the age axis using a fixed
        # reference year of 2014; it is drawn once, on the first pass.
        pylab.plot(2014-L[:-1], overall, lw=2, color='k')
    pylab.plot(year-l[:-1], frac, '-s', alpha=0.7,
               color=color, label=year)

    # Fractional deviation from the reference; bins that are empty in the
    # reference divide by zero and come out as inf/nan.
    ax = setup(subplt=(2,1,2), xlabel='Age', ylabel='Relative Fraction')
    if k == 1:
        pylab.axhline(0, color='k', lw=2)
    pylab.plot(year-l[:-1], frac / overall - 1, '-s', alpha=0.7,
               color=color, label=year)
legend(loc=2)
pylab.tight_layout()
In [66]:
# numbers = df[(df['nyear'] > 7)] # too much noise
numbers = df[ (df['ntotal'] > 100) & (df['nunique'] > 10) & (df['ngoodyear'] > 7)]
tmp = np.unique(numbers['subreddit'])
print tmp, len(tmp)
numbers
Out[66]:
In [38]:
# One panel per subreddit in `numbers`, one curve per year: the share of that
# year's `count` falling at each value of (year - bin).
yearnorm = matplotlib.colors.Normalize(2007,2016)
setup(figsize=(18,6))
# NOTE(review): subplt=(1,3,k) is presumably a 1x3 grid, i.e. this assumes the
# current `numbers` selection holds at most three subreddits -- confirm.
for k,(subreddit, d) in enumerate(numbers.groupby('subreddit'),1):
    # The x values are year - bin, the quantity the notebook labels 'Age' in the
    # per-year age histogram cell, so the axis is labelled 'Age' here too (it
    # previously said 'Year').
    ax = setup(subplt=(1,3,k), title=subreddit, xlabel='Age', ylabel='Fraction')
    # pylab.axvspan(1990, 2020, color='0.5', alpha=0.3, zorder=-2),
    for year, e in d.groupby('year'):
        pylab.plot(e['year'] - e['bin'], (e['count'])/np.sum(e['count']), '-s',
                   color=pylab.cm.Blues_r(yearnorm(year)), label=year)
    legend(loc=2)
    # break
pylab.tight_layout()
In [40]:
# numbers = df[(df['nyear'] > 7)] # too much noise
numbers = df[ (df['ntotal'] > 100) & (df['nunique'] > 10) & (df['ngoodyear'] > 6)]
tmp = np.unique(numbers['subreddit'])
print tmp, len(tmp)
numbers
Out[40]:
In [43]:
# One panel per subreddit in `numbers` (panels numbered from 1 and passed as
# subplt=(5,5,panel)), one curve per year: the share of that year's `count` at
# each value of (year - bin).
yearnorm = matplotlib.colors.Normalize(2007,2014)
setup(figsize=(18,18))
panel = 0
for subreddit, d in numbers.groupby('subreddit'):
    panel += 1
    ax = setup(subplt=(5,5,panel), title=subreddit)
    # Grey band over x in [-20, 20], drawn behind the curves.
    pylab.axvspan(-20, 20, color='0.5', alpha=0.3, zorder=-2)
    for year, e in d.groupby('year'):
        offset = e['year'] - e['bin']
        share = (e['count'])/np.sum(e['count'])
        pylab.plot(offset, share, '-s',
                   color=pylab.cm.jet(yearnorm(year)), label=year)
    legend(loc=2)
    # break
pylab.tight_layout()
In [ ]: