Scrape articles from Medium.com

I originally planned to scrape all articles from a particular topic/tag on Medium.com (e.g. technology or politics), but many of those articles had few likes/recommends and lacked a 'top highlight', presumably because not enough readers had highlighted sentences to establish a consensus.

Instead, I scraped the 30 most popular articles per day from Medium.com, which are far more likely to include a 'top highlight', and then removed duplicates from the corpus. I first sampled dates 10 days apart, then ran the script again with the dates offset by 5 days. The final dataset therefore contains unique articles drawn from Medium.com's daily most-popular lists at an effective spacing of 5 days.
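
The deduplication itself isn't shown in this notebook; below is a minimal sketch of how the URL lists from the two runs could be merged and deduplicated. The filenames are hypothetical placeholders, not the actual output files.

# Sketch: merge the URL lists from the two scraping runs (10-day spacing,
# offset by 5 days) and drop duplicate articles by URL.
# 'urls_run1.txt' / 'urls_run2.txt' / 'urls_merged.txt' are placeholder filenames.
import pandas as pd

cols = ['date', 'id', 'url']
run1 = pd.read_csv('urls_run1.txt', sep='\t', names=cols)
run2 = pd.read_csv('urls_run2.txt', sep='\t', names=cols)

merged = pd.concat([run1, run2]).drop_duplicates(subset='url')
merged.to_csv('urls_merged.txt', sep='\t', index=False, header=False)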

I used Selenium WebDriver (with the PhantomJS headless browser) and BeautifulSoup to fetch the rendered HTML and extract each article's full text and highlights.
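
Before the full scraper, here is a minimal sketch of the fetch-and-parse pattern it relies on, assuming PhantomJS is installed and on the PATH. The URL is one of the test cases listed in the code below, and the 'markup--quote' selector mirrors the one used there.

# Sketch: render one article with PhantomJS and locate its highlight spans.
from selenium import webdriver
from bs4 import BeautifulSoup
import time

driver = webdriver.PhantomJS()
driver.get('https://medium.com/@emmalindsay/whose-pain-counts-6e6b3dd287f5')
time.sleep(10)                                        # let JavaScript-rendered content load
soup = BeautifulSoup(driver.page_source, 'lxml')
highlights = soup.find_all(class_='markup--quote')    # spans marking highlighted text
print(len(highlights))
driver.quit()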


In [ ]:
#!/usr/bin/python

from selenium import webdriver
from bs4 import BeautifulSoup, SoupStrainer
import re
import os, time
from datetime import timedelta, date


#### Estimate the number of articles the scrape will yield
# 817 days between 3/11/2015 and 6/5/2017 -- 817/10 (every 10 days) ~= 82 scraped days * 30 (articles per scraped day) ~= 2460 articles
# 82 scraped days * 6 (min per 30-article day) / 60 (min/hour) ~= 8.2 hr

#### URL formats for automated group scraping
## format for scraping by a topic/tag:
# https://medium.com/tag/politics/archive/2015/03/11
# https://medium.com/tag/technology/archive/2015/03/11
## format for scraping popular articles:
# https://medium.com/browse/top/march-11-2016

#### URLs for specific test cases
# webpage = 'https://medium.com/microsoft-design/if-you-want-to-be-creative-dont-be-data-driven-55db74078eda'
# webpage = 'https://medium.com/startup-grind/fueling-the-ai-gold-rush-7ae438505bc2'
# webpage = 'https://betterhumans.coach.me/the-day-reading-died-c8fd8da7814'
# webpage = 'https://electricliterature.com/men-recommend-david-foster-wallace-to-me-7889a9dc6f03'
# webpage = 'https://medium.com/dualcores-studio/make-an-android-custom-view-publish-and-open-source-99a3d86df228'
# webpage = 'https://medium.com/@emmalindsay/whose-pain-counts-6e6b3dd287f5'
# webpage = 'https://backchannel.com/how-the-trendiest-grilled-cheese-venture-got-burnt-aa627b0c7ae1'
# webpage = 'https://medium.com/@bindingwave/florida-man-goes-undercover-at-a-trump-rally-51ec77e08eed'


# Generator: step backward from end_date toward start_date in 10-day increments
def daterange(start_date, end_date):
    print( range(0, int((end_date - start_date).days), 10) )
    for n in range(0, int((end_date - start_date).days), 10):
        yield end_date - timedelta(n)

class HighlightScraper(object):

    def __init__(self, dates, outhigh, outurls, outfull, outhtml):
        # Headless PhantomJS browser; the remaining arguments are passed again to scrape()
        self.driver = webdriver.PhantomJS()
#         self.driver.implicitly_wait(60)    # alternative to time.sleep: wait implicitly for JavaScript to load

    def scrape_highlight(self, dates, outhigh, outurls, outfull, outhtml):

        start_date = dates[0]
        end_date = dates[1]

        # Running counters for URLs collected and articles scraped
        idurl = 1
        idscraped = 1

        # For each date, collect links to that day's 30 most popular articles
        for single_date in daterange(start_date, end_date):
            date_str = single_date.strftime('%B-%d-%Y').lower()   # e.g. march-11-2016
            print( date_str )
            link = 'https://medium.com/browse/top/'+date_str+'?limit=30'
            self.driver.get(link)

            html1 = self.driver.page_source
            # The "Read more..." buttons hold the links to the actual articles
            readmore = SoupStrainer('a', {'class': 'button button--smaller button--chromeless u-baseColor--buttonNormal'})
            urls = []
            for sub in BeautifulSoup(html1, 'lxml', parse_only=readmore).find_all('a', href=True):
                urls.append(sub['href'])
                outurls.write(date_str+'\t'+str(idurl)+'\t'+sub['href']+'\n')
                idurl += 1
                
            # Visit each article: save raw HTML, the top highlight, and the full text
            for url in urls:
                self.driver.get(url)
                time.sleep(10)    # allow JavaScript-rendered content to load
                html2 = self.driver.execute_script('return document.documentElement.innerHTML;')
                soup = BeautifulSoup(html2, 'lxml')
                outhtml.write(date_str+'\t'+str(idscraped)+'\t'+str(soup)+'\n\n')

                # Highlighted passages carry the 'markup--quote' class;
                # data-creator-ids='anon' marks the aggregate top highlight
                txt1 = soup.find_all(class_='markup--quote', attrs={'data-creator-ids': 'anon'})
                txt2 = re.sub('<[^>]+>', '', str(txt1))[1:-1]    # strip tags and the enclosing list brackets
                outhigh.write(date_str+'\t'+str(idscraped)+'\t'+txt2+'\n')

                # Full text: strip all tags from the rendered page
                full1 = re.sub('<[^>]+>', '', str(soup))
                outfull.write(date_str+'\t'+str(idscraped)+'\t'+full1+'\n')
                idscraped += 1
            print( date_str+' Done!' )


    def scrape(self, dates, outhigh, outurls, outfull, outhtml):
        self.scrape_highlight(dates, outhigh, outurls, outfull, outhtml)


if __name__ == '__main__':
    currdir = os.getcwd()
    timest = time.strftime('%Y%m%d_%H-%M-%S')
    # Timestamped output files: top highlights, article URLs, full text, and raw HTML
    outhigh = open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/highlights_'+timest+'.txt','w')
    outurls = open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/urls_'+timest+'.txt','w')
    outfull = open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/fulltext_'+timest+'.txt','w')
    outhtml = open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/fullhtml_'+timest+'.txt','w')
    
#    dates = [date(2015,3,11), date(2017,6,5)]
    dates = [date(2015,3,11), date(2017,5,31)]    # scrape backward from 5/31/2017 to 3/11/2015

    print( timest )
    scraper = HighlightScraper(dates, outhigh, outurls, outfull, outhtml)
    scraper.scrape(dates, outhigh, outurls, outfull, outhtml)
    
    outhigh.close()
    outurls.close()
    outfull.close()
    outhtml.close()
    
    # Record the finish time to gauge total run length
    timest = time.strftime('%Y%m%d_%H-%M-%S')
    print( timest )


20170606_10-45-58
range(0, 812, 10)
may-31-2017
may-31-2017 Done!
may-21-2017
may-21-2017 Done!
may-11-2017
may-11-2017 Done!
may-01-2017
may-01-2017 Done!
april-21-2017
april-21-2017 Done!
april-11-2017
april-11-2017 Done!
april-01-2017
april-01-2017 Done!
march-22-2017
march-22-2017 Done!
march-12-2017
march-12-2017 Done!
march-02-2017
march-02-2017 Done!
february-20-2017
february-20-2017 Done!
february-10-2017
february-10-2017 Done!
january-31-2017
january-31-2017 Done!
january-21-2017
january-21-2017 Done!
january-11-2017
january-11-2017 Done!
january-01-2017
january-01-2017 Done!
december-22-2016
december-22-2016 Done!
december-12-2016
december-12-2016 Done!
december-02-2016
december-02-2016 Done!
november-22-2016
november-22-2016 Done!
november-12-2016
november-12-2016 Done!
november-02-2016
november-02-2016 Done!
october-23-2016
october-23-2016 Done!
october-13-2016
october-13-2016 Done!
october-03-2016
october-03-2016 Done!
september-23-2016
september-23-2016 Done!
september-13-2016
september-13-2016 Done!
september-03-2016
september-03-2016 Done!
august-24-2016
august-24-2016 Done!
august-14-2016
august-14-2016 Done!
august-04-2016
august-04-2016 Done!
july-25-2016
july-25-2016 Done!
july-15-2016
july-15-2016 Done!
july-05-2016
july-05-2016 Done!
june-25-2016
june-25-2016 Done!
june-15-2016
june-15-2016 Done!
june-05-2016
june-05-2016 Done!
may-26-2016
may-26-2016 Done!
may-16-2016
may-16-2016 Done!
may-06-2016
may-06-2016 Done!
april-26-2016
april-26-2016 Done!
april-16-2016
april-16-2016 Done!
april-06-2016
april-06-2016 Done!
march-27-2016
march-27-2016 Done!
march-17-2016
march-17-2016 Done!
march-07-2016
march-07-2016 Done!
february-26-2016
february-26-2016 Done!
february-16-2016
february-16-2016 Done!
february-06-2016
february-06-2016 Done!
january-27-2016
january-27-2016 Done!
january-17-2016
january-17-2016 Done!
january-07-2016
january-07-2016 Done!
december-28-2015
december-28-2015 Done!
december-18-2015
december-18-2015 Done!
december-08-2015
december-08-2015 Done!
november-28-2015
november-28-2015 Done!
november-18-2015
november-18-2015 Done!
november-08-2015
november-08-2015 Done!
october-29-2015
october-29-2015 Done!
october-19-2015
october-19-2015 Done!
october-09-2015
october-09-2015 Done!
september-29-2015
september-29-2015 Done!
september-19-2015
september-19-2015 Done!
september-09-2015
september-09-2015 Done!
august-30-2015
august-30-2015 Done!
august-20-2015
august-20-2015 Done!
august-10-2015
august-10-2015 Done!
july-31-2015
july-31-2015 Done!
july-21-2015
july-21-2015 Done!
july-11-2015
july-11-2015 Done!
july-01-2015
july-01-2015 Done!
june-21-2015
june-21-2015 Done!
june-11-2015
june-11-2015 Done!
june-01-2015
june-01-2015 Done!
may-22-2015
may-22-2015 Done!
may-12-2015

In [ ]:
## Isolate the tab-delimited record lines from the fullhtml file

infile = '/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/fullhtml_20170606_10-45-58_edit.txt'
outfile = '/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/fullhtml_20170606_10-45-58_edit_isolate.txt'

with open(infile, 'r') as fhtml:
    fullh = [line for line in fhtml if '\t' in line]      # keep only lines containing a tab (record lines)

with open(outfile, 'w') as fhtmlfile:
    fhtmlfile.write(''.join(fullh))
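
Each isolated line should be a tab-delimited record (date, id, html), so the file can be loaded into a DataFrame for downstream work. A minimal sketch, assuming each kept line starts a record; the column names are my own labels, not ones defined elsewhere in the notebook.

# Sketch: parse the isolated fullhtml lines into a DataFrame.
import pandas as pd

rows = []
with open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/fullhtml_20170606_10-45-58_edit_isolate.txt') as f:
    for line in f:
        parts = line.rstrip('\n').split('\t', 2)   # date, id, html; the html itself may contain tabs
        if len(parts) == 3:
            rows.append(parts)

fullhtml = pd.DataFrame(rows, columns=['date', 'id', 'html'])
print(fullhtml.shape)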