In [7]:
import requests
from bs4 import BeautifulSoup
In [5]:
!pip install requests
In [12]:
# Grab the NYT homepage.
# The timeout keeps this cell from hanging forever if the server never
# answers; raise_for_status() stops right here on a 4xx/5xx response instead
# of letting later cells parse an error page as if it were the homepage.
response = requests.get("http://www.nytimes.com", timeout=30)
response.raise_for_status()
In [13]:
# Parse the downloaded page body into a BeautifulSoup tree,
# naming the parser explicitly so the choice is visible.
doc = BeautifulSoup(markup=response.text, features='html.parser')
In [15]:
# Collect every <article> tag that carries the "story" class
stories = doc.find_all("article", attrs={'class': 'story'})
# Last expression of the cell: display how many were matched
len(stories)
Out[15]:
In [20]:
# Build one dict per story: always a 'headline', plus a 'byline' when the
# story has one.
all_stories = []
for story in stories:
    # Look for the story's <h2 class="story-heading">
    headline = story.find('h2', attrs={'class': 'story-heading'})
    # Stories without a headline are skipped entirely
    if not headline:
        continue

    # The raw text is padded with whitespace, so trim it
    headline_text = headline.text.strip()
    this_story = {'headline': headline_text}

    # The byline is optional; only record the key when the tag is present
    byline = story.find('p', attrs={'class': 'byline'})
    if byline:
        byline_text = byline.text.strip()
        this_story['byline'] = byline_text

    all_stories.append(this_story)
In [21]:
# TODO: save the headlines and bylines to a timestamped CSV (not yet implemented in this cell)
In [22]:
import time
In [ ]: