In [7]:
import requests
from bs4 import BeautifulSoup

In [5]:
!pip install requests


Collecting requests
  Downloading requests-2.10.0-py2.py3-none-any.whl (506kB)
Installing collected packages: requests
Successfully installed requests-2.10.0

In [12]:
# Grab the NYT homepage.
# timeout= keeps the cell from hanging forever if the site is slow;
# raise_for_status() fails loudly on a 4xx/5xx instead of letting a
# broken response flow silently into the parsing cells below.
response = requests.get("http://www.nytimes.com", timeout=30)
response.raise_for_status()

In [13]:
# Parse the downloaded HTML with Python's built-in html.parser backend
doc = BeautifulSoup(markup=response.text, features='html.parser')

In [15]:
# Pull out every <article> tagged with the "story" class
stories = doc.find_all('article', class_='story')
# Show how many articles we found
len(stories)


Out[15]:
141

In [20]:
# Build a list of {'headline': ..., 'byline': ...} dicts, one per story
all_stories = []
for story in stories:
    # The headline lives in an <h2 class="story-heading">
    heading = story.find('h2', {'class': 'story-heading'})
    if not heading:
        # Some <article> elements are layout shells with no headline — skip them
        continue
    # The raw text carries loads of whitespace; strip it off
    record = {'headline': heading.text.strip()}
    # Bylines are optional, so only add the key when one is present
    byline = story.find('p', {'class': 'byline'})
    if byline:
        record['byline'] = byline.text.strip()
    all_stories.append(record)

In [21]:
# TODO: Save the headlines and bylines to a timestamped CSV (not yet implemented below)

In [22]:
import time

In [ ]: