In [42]:
%load_ext autoreload
%autoreload 2
import datetime
import statistics
import sys

import numpy as np
import pandas as PD
from matplotlib import pyplot as pp

# Project modules are imported from a local checkout, so extend the path first.
sys.path.append('/Users/arimorcos/Github/getRedditDataset/')
from celebReddit import countMisspellings, countWords
import redditDB
%matplotlib inline
In [91]:
"""
Get post count for each subreddit for each day in the date range
"""
# Range bounds, written as YYMMDDhhmmss (two-digit year).
minDate = '140401000000'
maxDate = '150319235959'

# convert to datetime objects
minDateObj = datetime.datetime.strptime(minDate, '%y%m%d%H%M%S')
maxDateObj = datetime.datetime.strptime(maxDate, '%y%m%d%H%M%S')

# get nDays
# NOTE(review): .days truncates the trailing 23:59:59, so the calendar day of
# maxDate itself gets no window below -- confirm that this is intended.
nDays = (maxDateObj - minDateObj).days

# Build the [start, end) stamps for every day once; they do not depend on the
# subreddit. Query stamps use a four-digit year (YYYYMMDDhhmmss).
stampFormat = '%Y%m%d%H%M%S'
dayWindows = []
for day in range(nDays):
    startDate = (minDateObj + datetime.timedelta(days=day)).strftime(stampFormat)
    endDate = (minDateObj + datetime.timedelta(days=day + 1)).strftime(stampFormat)
    dayWindows.append((startDate, endDate))

# Start stamp of each window; defined even when there are no subreddits.
date = [windowStart for windowStart, _ in dayWindows]

# initialize database
defaultDB = redditDB.RedditDB('oneYearDefaults')

# get subreddit list
subList = defaultDB.getSubreddits()

# nPosts[sub][i] is the value getNPosts returns for window i of that subreddit
nPosts = {}
for sub in subList:
    nPosts[sub] = [
        defaultDB.getNPosts(sub, startDate, endDate)
        for startDate, endDate in dayWindows
    ]
In [144]:
""" Smooth posts """
def smoothList(tempList, span=10):
    """Return the trailing rolling mean of a sequence of numbers as a list.

    Parameters
    ----------
    tempList : sequence of numbers
        Values to smooth.
    span : int, optional
        Number of consecutive values averaged for each output entry.

    Returns
    -------
    list
        Same length as ``tempList``. The first ``span - 1`` entries are NaN
        because a full window is not yet available for them.
    """
    # PD.rolling_mean was removed from pandas; Series.rolling is its replacement.
    # Forcing a float dtype keeps an empty input from becoming an object Series.
    series = PD.Series(tempList, dtype=float)
    return list(series.rolling(span).mean())
# Smooth every subreddit's normalised series with a 10-sample window.
# NOTE: normPosts is built by the "Normalize posts" cell, which sits further
# down in this notebook -- that cell has to be run before this one.
smoothPosts = {}
for sub, normalised in normPosts.items():
    smoothPosts[sub] = smoothList(normalised, 10)
In [95]:
""" Convert to date format for plots """
origDate = datetime.date(1, 1, 1)
dayDate = []  # not used by any visible cell; kept so the namespace is unchanged


def _toDayNumber(stamp):
    """Return the number of days between origDate and a 'YYYYMMDD...' stamp.

    Integers are returned unchanged: they are treated as already converted,
    which lets this cell be re-run without failing on its own output.
    """
    if isinstance(stamp, int):
        return stamp
    stampDate = datetime.date(int(stamp[0:4]), int(stamp[4:6]), int(stamp[6:8]))
    return (stampDate - origDate).days


# Replace each stamp with its day offset from 0001-01-01 (that date maps to 0).
# NOTE(review): matplotlib interprets numeric dates relative to its own epoch,
# which may not match this origin -- verify the tick dates shown on the plots.
date[:] = [_toDayNumber(stamp) for stamp in date]
In [102]:
""" Normalize posts """
# Number of leading days whose mean serves as each subreddit's baseline.
baselineDays = 30

# normPosts[sub][i] = nPosts[sub][i] divided by that subreddit's baseline mean
normPosts = {}
for sub in nPosts:
    meanPost = statistics.mean(nPosts[sub][0:baselineDays])
    if meanPost == 0:
        # Fail with a message that names the offending series instead of a
        # bare ZeroDivisionError from the division below.
        raise ValueError(
            'Cannot normalize %r: mean of the first %d days is zero'
            % (sub, baselineDays))
    normPosts[sub] = [float(postNum) / meanPost for postNum in nPosts[sub]]
In [130]:
""" Mean of all subs """
# For each day, average the normalised values across every subreddit.
meanSub = []
for day in range(len(date)):
    meanSub.append(
        statistics.mean([normPosts[sub][day] for sub in subList])
    )

# Reference date, expressed as a day offset from 0001-01-01 like the entries
# of `date`; the plotting cell draws a vertical line at this x position.
crossDate = (datetime.date(2014, 5, 7) - datetime.date(1, 1, 1)).days
In [160]:
# Smooth the cross-subreddit mean with a 5-sample window and plot it over time.
smoothMeanSub = smoothList(meanSub, 5)

fig, ax = pp.subplots(figsize=(16, 16))

# Smoothed mean as a continuous line without markers.
ax.plot_date(date, smoothMeanSub, xdate=True, ydate=False, ls='-', marker=None)

# Dashed black vertical line at crossDate; it extends past the visible y range.
ax.plot_date([crossDate, crossDate], [0, 500], xdate=True, ydate=False, ls='--', color='k')

ax.set_ylim(0, 3.5)
ax.tick_params(labelsize=15)

# Tilt the x tick labels.
labels = ax.get_xticklabels()
pp.setp(labels, rotation=-45)
Out[160]:
In [107]:
# Draw one marker-less line per subreddit on the current axes.
lineOptions = dict(xdate=True, ydate=False, ls='-', marker=None)
for sub in subList:
    hand = pp.plot_date(date, smoothPosts[sub], **lineOptions)
In [ ]: