I originally planned to scrape all articles from a particular topic/tag on Medium.com (e.g. technology or politics), but I found that many articles did not have many likes/recommends and lacked 'top highlights', presumably because not enough people highlighted sentences to allow for a consensus.
Instead, I scraped the 30 most popular articles per day from Medium.com to ensure that most articles would include a 'top highlight', then removed duplicates from the corpus. Originally I spaced the scraped dates 10 days apart; then I ran the script again using dates offset by 5 days. Thus, the final dataset consists of unique articles scraped from Medium.com's daily list of the most popular articles, sampled at a spacing of 5 days.
I used Selenium WebDriver and BeautifulSoup to scrape the HTML and identify the full text and highlights.
In [ ]:
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup, SoupStrainer
import re
import os, time
import pandas as pd
from datetime import timedelta, date
import html5lib
#### Estimate of the number of articles the scrape will collect (and how long it will take)
# 818 days between 3/11/2015 and 6/5/2017 -- 818/10 (every 10 days) * 30 (articles per 10-days) =2454 articles
# 818 / 10 (every 10 days) * 6 (min per 30-article day) / 60 (min/hour) = 8.18 hr
#### URL formats for automated group scraping
## format for scraping by a topic/tag:
# https://medium.com/tag/politics/archive/2015/03/11
# https://medium.com/tag/technology/archive/2015/03/11
## format for scraping popular articles:
# https://medium.com/browse/top/march-11-2016
#### URLs for specific test cases
# webpage = 'https://medium.com/microsoft-design/if-you-want-to-be-creative-dont-be-data-driven-55db74078eda'
# webpage = 'https://medium.com/startup-grind/fueling-the-ai-gold-rush-7ae438505bc2'
# webpage = 'https://betterhumans.coach.me/the-day-reading-died-c8fd8da7814'
# webpage = 'https://electricliterature.com/men-recommend-david-foster-wallace-to-me-7889a9dc6f03'
# webpage = 'https://medium.com/dualcores-studio/make-an-android-custom-view-publish-and-open-source-99a3d86df228'
# webpage = 'https://medium.com/@emmalindsay/whose-pain-counts-6e6b3dd287f5'
# webpage = 'https://backchannel.com/how-the-trendiest-grilled-cheese-venture-got-burnt-aa627b0c7ae1'
# webpage = 'https://medium.com/@bindingwave/florida-man-goes-undercover-at-a-trump-rally-51ec77e08eed'
# Set a date range to scrape
def daterange(start_date, end_date, step=10):
    """Yield dates walking backwards from end_date towards start_date.

    The first date yielded is end_date itself; each following date is
    `step` days earlier. Offsets are taken from
    range(0, (end_date - start_date).days, step), so start_date is never
    yielded and nothing is yielded when end_date <= start_date.

    Args:
        start_date: Earliest bound of the window (exclusive).
        end_date: Latest bound of the window (inclusive; yielded first).
        step: Spacing between yielded dates, in days. Defaults to 10,
            the spacing this function originally hard-coded.

    Yields:
        Dates in descending order.
    """
    # Build the offsets once so the debug print and the loop cannot drift apart.
    offsets = range(0, (end_date - start_date).days, step)
    print( offsets )
    for n in offsets:
        yield end_date - timedelta(n)
class HighlightScraper(object):
    """Scrape Medium.com's daily "top" lists and pull each article's highlights.

    For every date produced by daterange() the scraper loads
    https://medium.com/browse/top/<date>?limit=30, collects the article
    links behind the "Read more..." buttons, then loads each article and
    extracts the highlighted spans, the tag-stripped page text and the raw HTML.
    """

    # Matches any markup tag; used to reduce HTML to text.
    _TAG_RE = re.compile('<[^>]+>')

    # CSS classes of the "Read more..." button that links to the actual article.
    _READMORE_CLASS = 'button button--smaller button--chromeless u-baseColor--buttonNormal'

    def __init__(self, dates, outhigh, outurls, outfull, outhtml):
        """Start the headless browser used for all page loads.

        The arguments mirror those of scrape() but are not stored; the same
        values have to be passed to scrape() again.
        """
        # NOTE(review): PhantomJS support is deprecated in newer Selenium
        # releases -- confirm the pinned Selenium version before upgrading.
        self.driver = webdriver.PhantomJS()

    @staticmethod
    def _write_record(handle, record_id, text):
        """Write one `<id>TAB<text>` record followed by a newline."""
        # NOTE(review): the record layout is an assumption (a later notebook
        # cell filters the fullhtml file on '\t') -- confirm against the
        # format the downstream processing expects. `text` is written as-is,
        # so payloads containing newlines span several lines.
        handle.write(str(record_id) + '\t' + text + '\n')

    def scrape_highlight(self, dates, outhigh, outurls, outfull, outhtml):
        """Scrape every date in the window and write the results.

        Args:
            dates: Two-item sequence (start_date, end_date) passed to daterange().
            outhigh: Writable text handle receiving the highlight text per article.
            outurls: Writable text handle receiving each article URL found.
            outfull: Writable text handle receiving the tag-stripped page text.
            outhtml: Writable text handle receiving the raw page HTML.
        """
        start_date = dates[0]
        end_date = dates[1]
        idurl = 1       # running id of article links discovered
        idscraped = 1   # running id of articles actually scraped
        # Only parse the "Read more..." anchors of the listing page.
        readmore = SoupStrainer('a', {'class': self._READMORE_CLASS})

        for single_date in daterange(start_date, end_date):
            date_str = single_date.strftime('%B-%d-%Y').lower() # e.g. march-11-2016
            print( date_str )
            link = 'https://medium.com/browse/top/'+date_str+'?limit=30'
            self.driver.get(link)
            html1 = self.driver.page_source

            # Get list of links for single date
            urls = []
            for sub in BeautifulSoup(html1, 'lxml', parse_only=readmore):
                # Skip the node that compares equal to 'html' (the original
                # code special-cased it with an empty branch).
                if sub == 'html':
                    continue
                if sub.has_attr('href'):
                    urls.append( sub['href'] )
                    self._write_record(outurls, idurl, sub['href'])
                    idurl += 1

            # Get highlight for each link (article) per single date
            for url in urls:
                # Load the article itself; without this the listing page
                # loaded above would be re-parsed for every url.
                self.driver.get(url)
                html2 = self.driver.execute_script('return document.documentElement.innerHTML;')
                soup = BeautifulSoup(html2, 'lxml')
                # 'markup--quote' = HTML label for highlights
                txt1 = soup.find_all(class_='markup--quote', attrs={'data-creator-ids':'anon'})
                # str() of the result list is "[<span ..>..</span>, ...]":
                # drop the tags, then the surrounding brackets.
                txt2 = self._TAG_RE.sub('', str(txt1))[1:-1]
                # Tag-stripped text of the whole page (reuses the parse above).
                full1 = self._TAG_RE.sub('', str(soup))

                self._write_record(outhigh, idscraped, txt2)
                self._write_record(outfull, idscraped, full1)
                self._write_record(outhtml, idscraped, html2)
                idscraped += 1
            print( date_str+' Done!' )

    def scrape(self, dates, outhigh, outurls, outfull, outhtml):
        """Public entry point; delegates to scrape_highlight()."""
        self.scrape_highlight(dates, outhigh, outurls, outfull, outhtml)
if __name__ == '__main__':
    # Script entry point: scrape the configured date window into four
    # timestamped text files (highlights, urls, full text, full HTML).
    currdir = os.getcwd()
    timest = time.strftime('%Y%m%d_%H-%M-%S')
    datadir = '/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/'
    # dates = [date(2015,3,11), date(2017,6,5)]
    dates = [date(2015,3,11), date(2017,5,31)]
    print( timest )  # start time
    # The with-block guarantees the output files are flushed and closed even
    # if the scrape aborts part-way through.
    with open(datadir+'highlights_'+timest+'.txt','w') as outhigh, \
         open(datadir+'urls_'+timest+'.txt','w') as outurls, \
         open(datadir+'fulltext_'+timest+'.txt','w') as outfull, \
         open(datadir+'fullhtml_'+timest+'.txt','w') as outhtml:
        scraper = HighlightScraper(dates, outhigh, outurls, outfull, outhtml)
        try:
            scraper.scrape(dates, outhigh, outurls, outfull, outhtml)
        finally:
            # Shut down the browser process started in HighlightScraper.__init__.
            scraper.driver.quit()
    timest = time.strftime('%Y%m%d_%H-%M-%S')
    print( timest )  # end time
In [ ]:
## isolate fullhtml lines from fullhtml file
import os
import pandas as pd
fhtmlfile = open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/fullhtml_20170606_10-45-58_edit_isolate.txt','w')
with open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/fullhtml_20170606_10-45-58_edit.txt','r') as fhtml:
fullh = [line for line in fhtml if '\t' in line] # Isolate fullhtml lines from fullhtml file