notebook.community

Edit and run



In [1]:

    
import praw
import datetime
import pandas as pd



In [2]:

    
# Reddit asks API clients to identify themselves with a descriptive value of
# the form "<platform>:<app ID>:<version> (by /u/<username>)".  The value must
# not repeat the "User-Agent: " header name; that is added when the header is sent.
user_agent = 'windows:reddit_analytics:v1 (by /u/willycs40)'

# API client shared by the cells that query Reddit.
r = praw.Reddit(user_agent=user_agent)



In [3]:

    
# Build consecutive (start, end) pairs of Unix timestamps so the search can be
# issued one time slice at a time.
start_dt = datetime.datetime(2014,8,1)
end_dt = datetime.datetime(2015,8,14)
day_delta = datetime.timedelta(days=10)

# Window boundaries: start_dt, start_dt + 10 days, ... while not past end_dt.
# NOTE: boundaries advance in whole 10-day steps, so with these dates the last
# boundary is 2015-08-06 and the days between it and end_dt are not covered.
timespans = []
current_dt = start_dt
while current_dt <= end_dt:
    timespans.append(current_dt)
    current_dt += day_delta

# The naive datetimes are measured against a naive 1970-01-01 origin, i.e. they
# are treated as UTC.
unix_origin = datetime.datetime(1970, 1, 1)
boundaries = [int((x - unix_origin).total_seconds()) for x in timespans]

# Materialise the pairs as a list: on Python 3 zip() returns a one-shot
# iterator that has no len() and would be empty after the first loop over it.
epochs = list(zip(boundaries[:-1], boundaries[1:]))
print('{} epochs in total.'.format(len(epochs)))









    



37
[(1406851200, 1407715200), (1407715200, 1408579200), (1408579200, 1409443200), (1409443200, 1410307200), (1410307200, 1411171200), (1411171200, 1412035200), (1412035200, 1412899200), (1412899200, 1413763200), (1413763200, 1414627200), (1414627200, 1415491200), (1415491200, 1416355200), (1416355200, 1417219200), (1417219200, 1418083200), (1418083200, 1418947200), (1418947200, 1419811200), (1419811200, 1420675200), (1420675200, 1421539200), (1421539200, 1422403200), (1422403200, 1423267200), (1423267200, 1424131200), (1424131200, 1424995200), (1424995200, 1425859200), (1425859200, 1426723200), (1426723200, 1427587200), (1427587200, 1428451200), (1428451200, 1429315200), (1429315200, 1430179200), (1430179200, 1431043200), (1431043200, 1431907200), (1431907200, 1432771200), (1432771200, 1433635200), (1433635200, 1434499200), (1434499200, 1435363200), (1435363200, 1436227200), (1436227200, 1437091200), (1437091200, 1437955200), (1437955200, 1438819200)]



In [8]:

    
def process_submissions(submissions):
    '''Take a set of submissions and return a list of lists with selected fields only.

    Args:
        submissions: iterable of submission objects exposing the attributes
            read below (id, title, created_utc, ...).

    Returns:
        A new list with one inner list per submission, holding the fields in
        this order: id, title, created_utc, author, url, domain, permalink,
        ups, downs, score, is_self, num_comments, subreddit, thumbnail.
        An empty iterable yields an empty list.
    '''
    # Collect into the local list rather than a notebook-level one, so the
    # function works on a fresh kernel and actually returns what it gathered.
    posts = []
    for post in submissions:
        posts.append([
            post.id,
            post.title,
            post.created_utc,
            post.author,
            post.url,
            post.domain,
            post.permalink,
            post.ups,
            post.downs,
            post.score,
            post.is_self,
            post.num_comments,
            post.subreddit,
            post.thumbnail,
        ])
    return posts



In [5]:

    
# Query the subreddit one time window at a time and pool the extracted rows.
all_posts = []
for window_start, window_end in epochs:
    # Stop one second short of the next window's start so that adjacent
    # queries do not share a boundary timestamp.
    query = 'timestamp:{}..{}'.format(window_start, window_end - 1)
    print(query)
    # At most 100 results are requested per window.
    # NOTE(review): a window holding more posts than that would presumably be
    # truncated - confirm against the search API.
    submissions = r.search(
        query,
        subreddit='machinelearning',
        sort='new',
        limit=100,
        syntax='cloudsearch'
    )
    all_posts.extend(process_submissions(submissions))
print('{} posts retrieved in total'.format(len(all_posts)))









    



timestamp:1406851200..1407715199
timestamp:1407715200..1408579199
timestamp:1408579200..1409443199
timestamp:1409443200..1410307199
timestamp:1410307200..1411171199
timestamp:1411171200..1412035199
timestamp:1412035200..1412899199
timestamp:1412899200..1413763199
timestamp:1413763200..1414627199
timestamp:1414627200..1415491199
timestamp:1415491200..1416355199
timestamp:1416355200..1417219199
timestamp:1417219200..1418083199
timestamp:1418083200..1418947199
timestamp:1418947200..1419811199
timestamp:1419811200..1420675199
timestamp:1420675200..1421539199
timestamp:1421539200..1422403199
timestamp:1422403200..1423267199
timestamp:1423267200..1424131199
timestamp:1424131200..1424995199
timestamp:1424995200..1425859199
timestamp:1425859200..1426723199
timestamp:1426723200..1427587199
timestamp:1427587200..1428451199
timestamp:1428451200..1429315199
timestamp:1429315200..1430179199
timestamp:1430179200..1431043199
timestamp:1431043200..1431907199
timestamp:1431907200..1432771199
timestamp:1432771200..1433635199
timestamp:1433635200..1434499199
timestamp:1434499200..1435363199
timestamp:1435363200..1436227199
timestamp:1436227200..1437091199
timestamp:1437091200..1437955199
timestamp:1437955200..1438819199






    



c:/Users/will/Documents/Projects/reddit_analytics/venv\Lib\site-packages\requests\packages\urllib3\util\ssl_.py:90: InsecurePlatformWarning: A true SSLContext object is not available. This prevents urllib3 from configuring SSL appropriately and may cause certain SSL connections to fail. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning.
  InsecurePlatformWarning

Convert the list to a DataFrame, convert the `created` epoch to a datetime, simplify `thumbnail` to a boolean flag, and add `title_length` and `is_question` columns.



In [20]:

    
# Tabulate the collected rows, one column per extracted field.
df = pd.DataFrame(all_posts, columns=[
    'id', 'title', 'created', 'author', 'url', 'domain', 'permalink',
    'ups', 'downs', 'score', 'is_self', 'num_comments', 'subreddit', 'thumbnail',
])

# Boolean flag: False when the thumbnail field is 'self' or 'default', True otherwise.
df['thumbnail'] = ~df['thumbnail'].isin(['self', 'default'])

# Epoch seconds -> timestamps, converted for the whole column at once.
df['created'] = pd.to_datetime(df['created'], unit='s')

# Character count of each title.
df['title_length'] = df['title'].str.len()

# regex=False makes '?' a literal character instead of a regex quantifier.
df['is_question'] = df['title'].str.contains('?', regex=False)



In [21]:

    
# Display the first rows of the frame to check the columns built so far.
df.head()









    Out[21]:






  
    
      
      id
      title
      created
      author
      url
      domain
      permalink
      ups
      downs
      score
      is_self
      num_comments
      subreddit
      thumbnail
      title_length
      is_question
    
  
  
    
      0
      2d5io0
      Resources for learning about ensemble learning?
      2014-08-10 15:45:08
      rovingr
      http://www.reddit.com/r/MachineLearning/commen...
      self.MachineLearning
      https://www.reddit.com/r/MachineLearning/comme...
      11
      0
      11
      True
      10
      MachineLearning
      False
      47
      True
    
    
      1
      2d4y8z
      how cross validation help to gain a better model?
      2014-08-10 09:44:01
      phoenixbai
      http://www.reddit.com/r/MachineLearning/commen...
      self.MachineLearning
      https://www.reddit.com/r/MachineLearning/comme...
      0
      0
      0
      True
      12
      MachineLearning
      False
      49
      True
    
    
      2
      2czt7i
      Machine Learning Theory: An Introductory Primer
      2014-08-08 17:00:53
      hs613
      http://www.toptal.com/machine-learning/machine...
      toptal.com
      https://www.reddit.com/r/MachineLearning/comme...
      51
      0
      51
      False
      5
      MachineLearning
      True
      47
      False
    
    
      3
      2czekz
      Using scikit-learn Pipelines and FeatureUnions
      2014-08-08 14:41:59
      eloisius
      http://zacstewart.com/2014/08/05/pipelines-of-...
      zacstewart.com
      https://www.reddit.com/r/MachineLearning/comme...
      15
      0
      15
      False
      0
      MachineLearning
      True
      46
      False
    
    
      4
      2czdp2
      Implementation of Developmental Learning MOOC ...
      2014-08-08 14:33:25
      alexgmcm
      http://liris.cnrs.fr/ideal/mooc/
      liris.cnrs.fr
      https://www.reddit.com/r/MachineLearning/comme...
      1
      0
      1
      False
      5
      MachineLearning
      True
      70
      False



In [18]:

    
# Remove the title, link, author and subreddit columns from the frame.
# NOTE(review): this rebinds `df`, so running the cell a second time raises
# KeyError, and any later cell that reads `df` no longer sees these columns -
# confirm the intended cell order.
text_columns = ['title','url','domain','permalink','author','subreddit']
df = df.drop(text_columns, axis=1)



In [24]:

    
# Write the current contents of `df` to a UTF-8 CSV file in the working
# directory; index=False leaves the row index out of the file.
df.to_csv('machinedreaming_posts_full.csv', encoding='utf-8', index=False)

	id	title	created	author	url	domain	permalink	ups	score	is_self	num_comments	subreddit	thumbnail	title_length	is_question
0	2d5io0	Resources for learning about ensemble learning?	2014-08-10 15:45:08	rovingr	http://www.reddit.com/r/MachineLearning/commen...	self.MachineLearning	https://www.reddit.com/r/MachineLearning/comme...	11	11	True	10	MachineLearning	False	47	True
1	2d4y8z	how cross validation help to gain a better model?	2014-08-10 09:44:01	phoenixbai	http://www.reddit.com/r/MachineLearning/commen...	self.MachineLearning	https://www.reddit.com/r/MachineLearning/comme...	0	0	True	12	MachineLearning	False	49	True
2	2czt7i	Machine Learning Theory: An Introductory Primer	2014-08-08 17:00:53	hs613	http://www.toptal.com/machine-learning/machine...	toptal.com	https://www.reddit.com/r/MachineLearning/comme...	51	51	False	5	MachineLearning	True	47	False
3	2czekz	Using scikit-learn Pipelines and FeatureUnions	2014-08-08 14:41:59	eloisius	http://zacstewart.com/2014/08/05/pipelines-of-...	zacstewart.com	https://www.reddit.com/r/MachineLearning/comme...	15	15	False	0	MachineLearning	True	46	False
4	2czdp2	Implementation of Developmental Learning MOOC ...	2014-08-08 14:33:25	alexgmcm	http://liris.cnrs.fr/ideal/mooc/	liris.cnrs.fr	https://www.reddit.com/r/MachineLearning/comme...	1	1	False	5	MachineLearning	True	70	False