In [1]:
import praw
import datetime
import pandas as pd
In [2]:
# Identification string handed to the Reddit client below.
# NOTE(review): the literal begins with "User-Agent: ", which looks like the
# HTTP header *name* embedded in the header *value* — confirm that is intended.
user_agent = 'User-Agent: windows:reddit_analytics:v1 (by /u/willycs40)'
# `r` is the client object the later search cell uses (`r.search(...)`).
r = praw.Reddit(user_agent=user_agent)
In [3]:
# Split [start_dt, end_dt] into consecutive search windows of at most
# `day_delta`, expressed as (start_epoch, end_epoch) pairs of Unix seconds.
start_dt = datetime.datetime(2014, 8, 1)
end_dt = datetime.datetime(2015, 8, 14)
day_delta = datetime.timedelta(days=10)

# Window boundaries: start_dt, start_dt + 10d, ... while not past end_dt.
timespans = []
current_dt = start_dt
while current_dt <= end_dt:
    timespans.append(current_dt)
    current_dt += day_delta

# When the range is not an exact multiple of day_delta the loop stops short of
# end_dt; add end_dt as the final boundary so the trailing partial window is
# searched instead of being silently dropped.
if timespans and timespans[-1] < end_dt:
    timespans.append(end_dt)

# Naive datetimes are treated as UTC offsets from the Unix epoch.
epochs = [int((x - datetime.datetime(1970, 1, 1)).total_seconds()) for x in timespans]

# Pair each boundary with the next one. Materialise as a list: on Python 3
# `zip` returns a one-shot iterator that has no len() and would be exhausted
# after a single pass.
epochs = list(zip(epochs[0:-1], epochs[1:]))
print('{} epochs in total.'.format(len(epochs)))
In [8]:
def process_submissions(submissions):
    '''Extract a fixed set of fields from each submission.

    Parameters
    ----------
    submissions : iterable
        Objects exposing the attributes read below (``id``, ``title``,
        ``created_utc``, ``author``, ``url``, ``domain``, ``permalink``,
        ``ups``, ``downs``, ``score``, ``is_self``, ``num_comments``,
        ``subreddit``, ``thumbnail``).

    Returns
    -------
    list of list
        One inner list per submission, with the fields in the order listed
        above. Empty input yields an empty list.
    '''
    posts = []
    for post in submissions:
        # Collect into the local list that is returned; the function must not
        # depend on (or mutate) a module-level list.
        posts.append([
            post.id,
            post.title,
            post.created_utc,
            post.author,
            post.url,
            post.domain,
            post.permalink,
            post.ups,
            post.downs,
            post.score,
            post.is_self,
            post.num_comments,
            post.subreddit,
            post.thumbnail,
        ])
    return posts
In [5]:
# Run one search per time window and gather every returned row in one list.
all_posts = []
for epoch in epochs:
    window_start, window_end = epoch
    # Adjacent windows share a boundary value, so the upper bound is pulled
    # back by one second (presumably the search range is inclusive — verify).
    query = 'timestamp:{}..{}'.format(window_start, window_end - 1)
    print(query)
    # NOTE(review): each window is capped at 100 results; confirm no window
    # has more posts than that.
    submissions = r.search(
        query,
        subreddit='machinelearning',
        sort='new',
        limit=100,
        syntax='cloudsearch',
    )
    all_posts.extend(process_submissions(submissions))
print('{} posts retrieved in total'.format(len(all_posts)))
Convert the list to a DataFrame, convert the `created` epoch to a datetime, simplify `thumbnail` to a boolean, and add `title_length` and `is_question` columns.
In [20]:
# Load the collected rows; the column names follow the field order produced
# by process_submissions.
df = pd.DataFrame(
    all_posts,
    columns=[
        'id', 'title', 'created', 'author', 'url', 'domain', 'permalink',
        'ups', 'downs', 'score', 'is_self', 'num_comments', 'subreddit',
        'thumbnail',
    ],
)

# thumbnail -> True unless it holds one of the placeholder values.
df['thumbnail'] = df['thumbnail'].apply(
    lambda thumb: thumb not in ['self', 'default']
)

# created -> datetime, computed as an offset in seconds from 1970-01-01.
df['created'] = df['created'].apply(
    lambda secs: datetime.datetime(1970, 1, 1) + datetime.timedelta(seconds=secs)
)

# Title-derived features: character count and presence of a question mark.
df['title_length'] = df['title'].apply(len)
df['is_question'] = df['title'].apply(lambda title: '?' in title)
In [21]:
# Preview the first five rows as a sanity check.
df.head(n=5)
Out[21]:
In [18]:
# Remove the free-text / identifier columns from the frame.
# NOTE(review): this cell's execution count (18) is lower than the cells above
# it, and it rebinds `df` — when the notebook runs top to bottom, the CSV
# written by the next cell will not contain these columns even though its
# filename says "full". Confirm that is intended.
df = df.drop(
    labels=['title', 'url', 'domain', 'permalink', 'author', 'subreddit'],
    axis='columns',
)
In [24]:
# Persist the current frame as UTF-8 CSV, without the positional index column.
df.to_csv(
    'machinedreaming_posts_full.csv',
    index=False,
    encoding='utf-8',
)