In [1]:
import praw
import datetime
import pandas as pd

In [2]:
user_agent = 'User-Agent: windows:reddit_analytics:v1 (by /u/willycs40)'
r = praw.Reddit(user_agent=user_agent)

In [3]:
start_dt = datetime.datetime(2014,8,1)
end_dt = datetime.datetime(2015,8,14)
day_delta = datetime.timedelta(days=10)

timespans = []
current_dt = start_dt
while current_dt <= end_dt:
    timespans.append(current_dt)
    current_dt += day_delta

epochs = [int((x - datetime.datetime(1970,1,1)).total_seconds()) for x in timespans]
epochs = zip(epochs[0:-1],epochs[1:])
print('{} epochs in total.'.format(len(epochs)))


37
[(1406851200, 1407715200), (1407715200, 1408579200), (1408579200, 1409443200), (1409443200, 1410307200), (1410307200, 1411171200), (1411171200, 1412035200), (1412035200, 1412899200), (1412899200, 1413763200), (1413763200, 1414627200), (1414627200, 1415491200), (1415491200, 1416355200), (1416355200, 1417219200), (1417219200, 1418083200), (1418083200, 1418947200), (1418947200, 1419811200), (1419811200, 1420675200), (1420675200, 1421539200), (1421539200, 1422403200), (1422403200, 1423267200), (1423267200, 1424131200), (1424131200, 1424995200), (1424995200, 1425859200), (1425859200, 1426723200), (1426723200, 1427587200), (1427587200, 1428451200), (1428451200, 1429315200), (1429315200, 1430179200), (1430179200, 1431043200), (1431043200, 1431907200), (1431907200, 1432771200), (1432771200, 1433635200), (1433635200, 1434499200), (1434499200, 1435363200), (1435363200, 1436227200), (1436227200, 1437091200), (1437091200, 1437955200), (1437955200, 1438819200)]

In [8]:
def process_submissions(submissions):
    '''A method to take a set of submissions and return of lists of lists with selected fields only'''
    posts = []
    for post in submissions:
        all_posts.append([post.id
               , post.title
               , post.created_utc
               , post.author
               , post.url
               , post.domain
               , post.permalink
               , post.ups
               , post.downs
               , post.score
               , post.is_self
               , post.num_comments
               , post.subreddit
               , post.thumbnail
               ])
    return posts

In [5]:
all_posts = []
for epoch in epochs:
    query = 'timestamp:{}..{}'.format(epoch[0], epoch[1]-1)
    print(query)
    submissions = r.search(query, subreddit='machinelearning', sort='new', limit=100, syntax='cloudsearch')
    all_posts.extend(process_submissions(submissions))
print('{} posts retrieved in total'.format(len(all_posts)))


timestamp:1406851200..1407715199
timestamp:1407715200..1408579199
timestamp:1408579200..1409443199
timestamp:1409443200..1410307199
timestamp:1410307200..1411171199
timestamp:1411171200..1412035199
timestamp:1412035200..1412899199
timestamp:1412899200..1413763199
timestamp:1413763200..1414627199
timestamp:1414627200..1415491199
timestamp:1415491200..1416355199
timestamp:1416355200..1417219199
timestamp:1417219200..1418083199
timestamp:1418083200..1418947199
timestamp:1418947200..1419811199
timestamp:1419811200..1420675199
timestamp:1420675200..1421539199
timestamp:1421539200..1422403199
timestamp:1422403200..1423267199
timestamp:1423267200..1424131199
timestamp:1424131200..1424995199
timestamp:1424995200..1425859199
timestamp:1425859200..1426723199
timestamp:1426723200..1427587199
timestamp:1427587200..1428451199
timestamp:1428451200..1429315199
timestamp:1429315200..1430179199
timestamp:1430179200..1431043199
timestamp:1431043200..1431907199
timestamp:1431907200..1432771199
timestamp:1432771200..1433635199
timestamp:1433635200..1434499199
timestamp:1434499200..1435363199
timestamp:1435363200..1436227199
timestamp:1436227200..1437091199
timestamp:1437091200..1437955199
timestamp:1437955200..1438819199
c:/Users/will/Documents/Projects/reddit_analytics/venv\Lib\site-packages\requests\packages\urllib3\util\ssl_.py:90: InsecurePlatformWarning: A true SSLContext object is not available. This prevents urllib3 from configuring SSL appropriately and may cause certain SSL connections to fail. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning.
  InsecurePlatformWarning

Convert list to dataframe, convert created epoch to date, simplify thumbnail, create a title length and is_question columns


In [20]:
df = pd.DataFrame(all_posts, columns=['id','title','created','author','url','domain','permalink','ups','downs','score', 'is_self', 'num_comments','subreddit', 'thumbnail'])
df['thumbnail'] = df['thumbnail'].apply(lambda x: False if x in ['self','default'] else True)
df['created'] = df['created'].apply(lambda x: datetime.datetime(1970,1,1) + datetime.timedelta(seconds=x))
df['title_length'] = df['title'].apply(lambda x: len(x))
df['is_question'] =  df['title'].apply(lambda x: True if '?' in x else False)

In [21]:
df.head()


Out[21]:
id title created author url domain permalink ups downs score is_self num_comments subreddit thumbnail title_length is_question
0 2d5io0 Resources for learning about ensemble learning? 2014-08-10 15:45:08 rovingr http://www.reddit.com/r/MachineLearning/commen... self.MachineLearning https://www.reddit.com/r/MachineLearning/comme... 11 0 11 True 10 MachineLearning False 47 True
1 2d4y8z how cross validation help to gain a better model? 2014-08-10 09:44:01 phoenixbai http://www.reddit.com/r/MachineLearning/commen... self.MachineLearning https://www.reddit.com/r/MachineLearning/comme... 0 0 0 True 12 MachineLearning False 49 True
2 2czt7i Machine Learning Theory: An Introductory Primer 2014-08-08 17:00:53 hs613 http://www.toptal.com/machine-learning/machine... toptal.com https://www.reddit.com/r/MachineLearning/comme... 51 0 51 False 5 MachineLearning True 47 False
3 2czekz Using scikit-learn Pipelines and FeatureUnions 2014-08-08 14:41:59 eloisius http://zacstewart.com/2014/08/05/pipelines-of-... zacstewart.com https://www.reddit.com/r/MachineLearning/comme... 15 0 15 False 0 MachineLearning True 46 False
4 2czdp2 Implementation of Developmental Learning MOOC ... 2014-08-08 14:33:25 alexgmcm http://liris.cnrs.fr/ideal/mooc/ liris.cnrs.fr https://www.reddit.com/r/MachineLearning/comme... 1 0 1 False 5 MachineLearning True 70 False

In [18]:
df = df.drop(['title','url','domain','permalink','author','subreddit'], axis=1)

In [24]:
df.to_csv('machinedreaming_posts_full.csv', encoding='utf-8', index=False)