notebook.community

Edit and run



In [7]:

    
import requests
from bs4 import BeautifulSoup



In [5]:

    
!pip install requests









    



Collecting requests
  Downloading requests-2.10.0-py2.py3-none-any.whl (506kB)
Installing collected packages: requests
Successfully installed requests-2.10.0



In [12]:

    
#Grab the NYT homepage
# Fetch over HTTPS and set a timeout: requests waits indefinitely by default,
# so a stalled connection would otherwise hang the kernel.
response = requests.get("https://www.nytimes.com", timeout=30)
# Fail here on a 4xx/5xx reply instead of parsing an error page as if it were the homepage.
response.raise_for_status()



In [13]:

    
# Parse the downloaded page text with Python's built-in HTML parser
doc = BeautifulSoup(markup=response.text, features='html.parser')



In [15]:

    
# Select every <article> element carrying the "story" class
stories = doc.find_all(name="article", attrs={'class': 'story'})
# Last expression of the cell: display how many were found
len(stories)









    Out[15]:





141



In [20]:

    
# Build one dict per story: always a 'headline' key, plus a 'byline' key when one is present
all_stories = []
for story in stories:
    # Look for the <h2 class="story-heading"> inside this story
    headline = story.find('h2', {'class': 'story-heading'})
    # No headline found: nothing to record for this story
    if not headline:
        continue
    # The heading text comes padded with whitespace, so trim it
    headline_text = headline.text.strip()
    this_story = {'headline': headline_text}
    # The byline is optional; only store it when the <p class="byline"> exists
    byline = story.find('p', {'class': 'byline'})
    if byline:
        byline_text = byline.text.strip()
        this_story['byline'] = byline_text
    all_stories.append(this_story)



In [21]:

    
#Save the headlines and bylines to a timestamped CSV



In [22]:

    
import time



In [ ]: