In [22]:
import requests
from bs4 import BeautifulSoup

In [23]:
# Grab the Reddit homepage.
# A browser-like User-Agent is sent instead of the requests default
# (presumably so the request is not rejected as an automated client -- confirm).
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
# Without a timeout, requests.get can block forever on an unresponsive server;
# 30 seconds bounds how long this cell can hang.
response = requests.get("http://www.reddit.com", headers=headers, timeout=30)
# Fail loudly on an HTTP error status instead of silently parsing an error page
# in the cells below.
response.raise_for_status()

In [24]:
# Parse the downloaded page text into a searchable tree, using the
# 'html.parser' backend.
reddit = BeautifulSoup(markup=response.text, features='html.parser')

In [25]:
#reddit

In [26]:
# Every <div class="clearleft"> on the page; the element that follows each
# one is looked up in the next cell.
one_sibling_up = reddit.find_all(name='div', attrs={'class': 'clearleft'})

In [27]:
# Only every other clearleft div is followed by a post, so look up the next
# <div> sibling of each one and keep just the lookups that found something.
posts = [
    following_div
    for following_div in (tag.find_next_sibling('div') for tag in one_sibling_up)
    if following_div
]

In [90]:
def vote_count_int(x):
    """Convert a scraped vote count into an int.

    Parameters
    ----------
    x : str, None, or anything ``int()`` accepts
        The text of a score element. The placeholder bullet ``'•'`` stands
        in for a score that is not shown.

    Returns
    -------
    int
        ``0`` for ``None``, for an empty/whitespace-only string and for the
        ``'•'`` placeholder (with or without surrounding whitespace);
        otherwise ``int(x)``.

    Raises
    ------
    ValueError
        If ``x`` is a non-empty string that is neither the placeholder nor
        an integer literal.
    """
    # A tag's .string is None when the tag has no single text child; treat
    # that as "no score" rather than crashing inside int().
    if x is None:
        return 0
    if isinstance(x, str):
        x = x.strip()
        if x in ('', '•'):
            return 0
    # Non-string inputs fall through unchanged so int(x) behaves as before.
    return int(x)

In [91]:
def _extract_story(article):
    """Build one story record from a post tag, or return None if a field is missing.

    Parameters
    ----------
    article : tag-like object exposing ``find(name, attrs)``
        One entry of ``posts``.

    Returns
    -------
    dict or None
        A dict with the keys 'subreddit', 'title', 'time', 'URL',
        'thumbnail-URL' and 'votes', or ``None`` when the subreddit link,
        title link, timestamp, thumbnail image or score element cannot be
        found in ``article``.
    """
    subreddit_tag = article.find("a", {'class': 'subreddit hover may-blank'})
    # NOTE: the trailing space in the class string is kept exactly as it was;
    # the lookup matches against that literal value.
    title_tag = article.find("a", {'class': 'title may-blank '})
    time_tag = article.find("time", {'class': 'live-timestamp'})
    thumbnail_tag = article.find('img')
    score_tag = article.find("div", {'class': 'score unvoted'})

    required_tags = (subreddit_tag, title_tag, time_tag, thumbnail_tag, score_tag)
    if any(tag is None for tag in required_tags):
        return None

    return {
        'subreddit': subreddit_tag.string,
        'title': title_tag.string,
        'time': time_tag.get('datetime'),
        'URL': title_tag.get('href'),
        'thumbnail-URL': thumbnail_tag.get('src'),
        'votes': vote_count_int(score_tag.string),
    }


# Incomplete posts are skipped rather than removed from `posts`: removing items
# from a list while iterating over it skips elements, and the previous version
# still appended a record built from the prior iteration's values afterwards.
# `posts` itself is left unmodified.
# The timestamp is also no longer stored in a global named `time`, so it cannot
# collide with the `time` module imported later in the notebook.
all_stories = [
    story
    for story in (_extract_story(article) for article in posts)
    if story is not None
]

In [92]:
# Display the list of scraped story records (last expression of the cell).
all_stories


Out[92]:
[{'URL': 'http://www.usnews.com/news/politics/articles/2016-06-23/the-latest-business-leaders-endorse-clinton',
  'subreddit': '/r/politics',
  'thumbnail-URL': '//b.thumbs.redditmedia.com/tc5Ut1c5FOyNLBdK0BpkuEZ-aC2eEu252qOcLU6j7Hw.jpg',
  'time': '2016-06-23T18:54:46+00:00',
  'title': "Presumptive GOP nominee Donald Trump is commending the Supreme Court's decision blocking President Barack Obama's efforts to shield millions living in the U.S. illegally from deportation",
  'votes': 0},
 {'URL': 'http://abcnews.go.com/Politics/state-dept-appears-disabled-security-features-blocking-clintons/story?id=40078931',
  'subreddit': '/r/politics',
  'thumbnail-URL': '//b.thumbs.redditmedia.com/4Mgue1kjmHJzI_WgzMnd921rog-Fn20BWZIMaqYTNco.jpg',
  'time': '2016-06-23T18:46:14+00:00',
  'title': "State Dept. Appears to Have Disabled Security Features Blocking Clinton's Private Email",
  'votes': 0},
 {'URL': 'http://thehill.com/blogs/congress-blog/judicial/284611-chaos-following-scotuss-4-4-split-on-texass-amnesty-challenge',
  'subreddit': '/r/politics',
  'thumbnail-URL': '//b.thumbs.redditmedia.com/9k9rxJY8kk71kWMXMoAEzQhdShimq3RVlM_dQy088SI.jpg',
  'time': '2016-06-23T18:48:32+00:00',
  'title': "Chaos following SCOTUS’s 4-4 split on Texas's amnesty-challenge looks inevitable",
  'votes': 0},
 {'URL': 'https://www.washingtonpost.com/posteverything/wp/2016/06/23/bernie-sanders-heres-what-we-want/',
  'subreddit': '/r/politics',
  'thumbnail-URL': '//b.thumbs.redditmedia.com/IYKB7NfTYswU4bReO8meGQkdT5pawuicEWYLEBYjILY.jpg',
  'time': '2016-06-23T11:28:57+00:00',
  'title': 'Bernie Sanders: Here’s what we want "We want an economy that is not based on uncontrollable greed, monopolistic practices and illegal behavior. We want an economy that protects the human needs and dignity of all people — children, the elderly, the sick, working people and the poor."',
  'votes': 5150},
 {'URL': 'http://nypost.com/2016/06/22/clinton-staff-disabled-security-of-private-email-server-report/',
  'subreddit': '/r/politics',
  'thumbnail-URL': '//b.thumbs.redditmedia.com/24K9u-poEnhjgUEfxMwqLXW6yY_9O99_Io20CEUjh_Q.jpg',
  'time': '2016-06-23T13:46:50+00:00',
  'title': 'Clinton staff disabled security of private email server',
  'votes': 2125},
 {'URL': 'http://arstechnica.com/information-technology/2016/06/clintons-private-e-mail-was-blocked-by-spam-filters-so-state-it-turned-them-off/',
  'subreddit': '/r/politics',
  'thumbnail-URL': '//b.thumbs.redditmedia.com/Gub8eW2F2whyH3svfAmR27ocSS6SyJs_-xaPBKmV9oY.jpg',
  'time': '2016-06-23T16:53:56+00:00',
  'title': 'Clinton’s private e-mail was blocked by spam filters—so State IT turned them off',
  'votes': 750},
 {'URL': 'http://www.chron.com/news/politics/article/Emails-Key-security-features-disabled-on-8319055.php',
  'subreddit': '/r/politics',
  'thumbnail-URL': '//b.thumbs.redditmedia.com/zGkiPbVriz8PB0nX-T3yUAtiFRjbYUd4lDrrEPxMxhE.jpg',
  'time': '2016-06-23T12:32:01+00:00',
  'title': "State Dept. scrambled on trouble on Clinton's server",
  'votes': 1125},
 {'URL': 'https://www.denverpost.com/2016/06/22/imf-downgrades-outlook-us-economy/',
  'subreddit': '/r/politics',
  'thumbnail-URL': '//b.thumbs.redditmedia.com/wceBPJCsANRQ2MSPj7wy2Ls3k7KrjCqr21jyg8mooKg.jpg',
  'time': '2016-06-23T14:22:07+00:00',
  'title': 'U.S. should raise minimum wage, overhaul taxes and offer paid maternity leave, IMF says',
  'votes': 775},
 {'URL': 'http://www.politico.com/blogs/2016-dem-primary-live-updates-and-results/2016/06/superdelegates-maine-convention-224703',
  'subreddit': '/r/politics',
  'thumbnail-URL': '//a.thumbs.redditmedia.com/0edRQMR0d7yUF95Adeb7pbtvNEdtz9vGnfV6nMRm7U0.jpg',
  'time': '2016-06-23T13:07:26+00:00',
  'title': 'Maine Democrat moves to abolish superdelegates',
  'votes': 854},
 {'URL': 'http://thehill.com/regulation/284468-gop-fails-to-block-obamas-financial-adviser-rule',
  'subreddit': '/r/politics',
  'thumbnail-URL': '//b.thumbs.redditmedia.com/XZlSyiKFPWo2KU40KyZz85fEiOO9OJy2qGdyz5cO_TI.jpg',
  'time': '2016-06-23T03:22:03+00:00',
  'title': "GOP fails to block Obama's financial adviser rule requiring financial advisers to act in the best interest of their clients who are saving for retirement.",
  'votes': 4595},
 {'URL': 'http://www.newsmax.com/CalThomas/honesty-integrity/2016/06/23/id/735281/',
  'subreddit': '/r/politics',
  'thumbnail-URL': '//b.thumbs.redditmedia.com/x3UBEZDHrpKraFmVKzSgdowPlIesmFMsR_Ow6QELPwU.jpg',
  'time': '2016-06-23T15:35:25+00:00',
  'title': 'Democrats Should Demand Better Than Hillary',
  'votes': 431},
 {'URL': 'http://thehill.com/policy/national-security/284604-clinton-aide-joked-about-funders-appointment-to-intelligence-board',
  'subreddit': '/r/politics',
  'thumbnail-URL': '//b.thumbs.redditmedia.com/N_92gaU-4sgymupI1hBoO-mO3Oszi5Eotay9UCh-yVQ.jpg',
  'time': '2016-06-23T16:51:34+00:00',
  'title': 'HRC aide jokes about donor appointment via email',
  'votes': 293}]

In [106]:
import pandas as pd

In [107]:
#convert to Pandas

In [109]:
#date string import

In [116]:
# One row per scraped story; the dict keys become the column names.
stories_df = pd.DataFrame(data=all_stories)
# Preview the first two rows only.
stories_df.head(n=2)


Out[116]:
URL subreddit thumbnail-URL time title votes
0 http://www.usnews.com/news/politics/articles/2... /r/politics //b.thumbs.redditmedia.com/tc5Ut1c5FOyNLBdK0Bp... 2016-06-23T18:54:46+00:00 Presumptive GOP nominee Donald Trump is commen... 0
1 http://abcnews.go.com/Politics/state-dept-appe... /r/politics //b.thumbs.redditmedia.com/4Mgue1kjmHJzI_WgzMn... 2016-06-23T18:46:14+00:00 State Dept. Appears to Have Disabled Security ... 0

In [119]:
# Columns, in the order they are written to the CSV file.
csv_columns = ['URL', 'subreddit', 'thumbnail-URL', 'time', 'title', 'votes']
CSV_FORMAT = stories_df[csv_columns]

In [110]:
import time

In [111]:
# Date stamp for the output filename, built from the current local time:
# two-digit month, abbreviated month name, two-digit day (e.g. '06-Jun-23').
# %b is the documented strftime directive for the abbreviated month name; the
# previous %h is a platform-specific alias for it that Python does not document.
# NOTE(review): the pattern has no year and encodes the month twice -- confirm
# this is the intended filename format.
datestring = time.strftime("%m-%b-%d")
datestring


Out[111]:
'06-Jun-23'

In [112]:
#creating .csv file

In [120]:
# Build the dated output name, then write the selected columns to it without
# the DataFrame index.
filename = "".join(["reddit-frontpage-", datestring, ".csv"])
CSV_FORMAT.to_csv(path_or_buf=filename, index=False)

In [ ]:


In [ ]: