I originally planned to scrape all articles under a particular Medium.com topic/tag (e.g. technology or politics), but I found that many of those articles had few likes/recommends and no 'top highlight', presumably because too few readers had highlighted sentences for a consensus to emerge.
Instead, I scraped the 30 most popular articles per day from Medium.com, since popular articles almost always include a 'top highlight', and then removed duplicates from the corpus. I first sampled scrape dates 10 days apart, then reran the script with the dates offset by 5 days. The final dataset therefore contains unique articles drawn from Medium.com's daily most-popular list at an effective spacing of 5 days (see the sketch below).
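As a rough sketch of this sampling scheme (same date range as the run below; sample_dates is a hypothetical stand-in for the daterange generator used in the scraper):

from datetime import date, timedelta

def sample_dates(end_date, start_date, step=10, offset=0):
    # Step backward from end_date toward start_date in `step`-day increments.
    d = end_date - timedelta(days=offset)
    while d >= start_date:
        yield d
        d -= timedelta(days=step)

# Two passes over the same range: a 10-day grid, then the same grid offset by 5 days.
first_pass = set(sample_dates(date(2017, 5, 31), date(2015, 3, 11)))
second_pass = set(sample_dates(date(2017, 5, 31), date(2015, 3, 11), offset=5))

# Interleaving the two grids yields an effective 5-day spacing:
combined = sorted(first_pass | second_pass)
print((combined[1] - combined[0]).days)  # 5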
I used Selenium WebDriver (driving the PhantomJS headless browser) and BeautifulSoup to fetch the rendered HTML and extract each article's full text and highlights.
In [ ]:
#!/usr/bin/python
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup, SoupStrainer
import re
import os, time
import pandas as pd
from datetime import timedelta, date
import html5lib
#### Estimate the number of articles this will yield
# 818 days between 3/11/2015 and 6/5/2017 -- 818 / 10 (scrape every 10th day) * 30 (articles per scraped day) = 2454 articles
# 818 / 10 (scrape every 10th day) * 6 (min per 30-article day) / 60 (min/hour) = 8.18 hr
#### URL formats for automated group scraping
## format for scraping by a topic/tag:
# https://medium.com/tag/politics/archive/2015/03/11
# https://medium.com/tag/technology/archive/2015/03/11
## format for scraping popular articles:
# https://medium.com/browse/top/march-11-2016
#### URLs for specific test cases
# webpage = 'https://medium.com/microsoft-design/if-you-want-to-be-creative-dont-be-data-driven-55db74078eda'
# webpage = 'https://medium.com/startup-grind/fueling-the-ai-gold-rush-7ae438505bc2'
# webpage = 'https://betterhumans.coach.me/the-day-reading-died-c8fd8da7814'
# webpage = 'https://electricliterature.com/men-recommend-david-foster-wallace-to-me-7889a9dc6f03'
# webpage = 'https://medium.com/dualcores-studio/make-an-android-custom-view-publish-and-open-source-99a3d86df228'
# webpage = 'https://medium.com/@emmalindsay/whose-pain-counts-6e6b3dd287f5'
# webpage = 'https://backchannel.com/how-the-trendiest-grilled-cheese-venture-got-burnt-aa627b0c7ae1'
# webpage = 'https://medium.com/@bindingwave/florida-man-goes-undercover-at-a-trump-rally-51ec77e08eed'
# Generate the dates to scrape: step backward from end_date in 10-day increments
def daterange(start_date, end_date):
    print( range(0, int((end_date - start_date).days), 10) )  # debug: show the day offsets
    for n in range(0, int((end_date - start_date).days), 10):
        yield end_date - timedelta(n)
class HighlightScraper(object):
    def __init__(self, dates, outhigh, outurls, outfull, outhtml):
        self.driver = webdriver.PhantomJS()  # headless browser, so Medium's JavaScript can run
        # self.driver.implicitly_wait(60) # testing implicitly_wait to allow html javascript to load
    def scrape_highlight(self, dates, outhigh, outurls, outfull, outhtml):
        start_date = dates[0]
        end_date = dates[1]
        idurl = 1
        idscraped = 1
        # Get the list of article links for each date
        for single_date in daterange(start_date, end_date):
            date_str = single_date.strftime('%B-%d-%Y').lower()  # e.g. march-11-2016
            print( date_str )
            link = 'https://medium.com/browse/top/'+date_str+'?limit=30'
            self.driver.get(link)
            html1 = self.driver.page_source
            # "Read more..." buttons link to the actual articles
            readmore = SoupStrainer('a', {'class': 'button button--smaller button--chromeless u-baseColor--buttonNormal'})
            urls = []
            for sub in BeautifulSoup(html1, 'lxml', parse_only=readmore):
                if sub == 'html':
                    continue
                if sub.has_attr('href'):
                    urls.append( sub['href'] )
                    outurls.write(date_str+'\t'+str(idurl)+'\t'+str(sub['href'])+'\n')
                    idurl += 1
            # Get the highlight for each article on this date
            for url in urls:
                self.driver.get(url)
                time.sleep(10)  # give the page's JavaScript time to render the highlight markup
                html2 = self.driver.execute_script('return document.documentElement.innerHTML;')
                soup = BeautifulSoup(html2, 'lxml')
                outhtml.write(date_str+'\t'+str(idscraped)+'\t'+str(soup)+'\n\n')
                # 'markup--quote' spans with data-creator-ids='anon' mark the top highlight
                txt1 = soup.find_all(class_='markup--quote', attrs={'data-creator-ids': 'anon'})
                txt2 = re.sub('<[^>]+>', '', str(txt1))[1:-1]  # strip tags and the surrounding list brackets
                outhigh.write(date_str+'\t'+str(idscraped)+'\t'+txt2+'\n')
                full1 = re.sub('<[^>]+>', '', str(soup))  # tag-stripped full text of the page
                outfull.write(date_str+'\t'+str(idscraped)+'\t'+full1+'\n')
                idscraped += 1
            print( date_str+' Done!' )
    def scrape(self, dates, outhigh, outurls, outfull, outhtml):
        self.scrape_highlight(dates, outhigh, outurls, outfull, outhtml)
if __name__ == '__main__':
    currdir = os.getcwd()
    timest = time.strftime('%Y%m%d_%H-%M-%S')
    outhigh = open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/highlights_'+timest+'.txt','w')
    outurls = open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/urls_'+timest+'.txt','w')
    outfull = open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/fulltext_'+timest+'.txt','w')
    outhtml = open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/fullhtml_'+timest+'.txt','w')
    # dates = [date(2015,3,11), date(2017,6,5)]
    dates = [date(2015,3,11), date(2017,5,31)]
    print( timest )
    scraper = HighlightScraper(dates, outhigh, outurls, outfull, outhtml)
    scraper.scrape(dates, outhigh, outurls, outfull, outhtml)
    outhigh.close()
    outurls.close()
    outfull.close()
    outhtml.close()
    timest = time.strftime('%Y%m%d_%H-%M-%S')
    print( timest )  # end timestamp, to gauge total runtime
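The duplicate-removal step mentioned above is not shown in this notebook. A minimal sketch of how duplicates could be dropped across the two runs' urls_*.txt files (the output file name and the URL-normalization rule are my assumptions):

import glob

seen = set()
unique_rows = []
for path in sorted(glob.glob('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/urls_*.txt')):
    with open(path) as f:
        for line in f:
            # Each row is date<TAB>id<TAB>url, as written by scrape_highlight above
            date_str, idurl, url = line.rstrip('\n').split('\t')
            key = url.split('?')[0]  # drop query strings before comparing URLs
            if key not in seen:
                seen.add(key)
                unique_rows.append(line)

with open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/urls_unique.txt', 'w') as out:
    out.writelines(unique_rows)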
In [ ]:
## Isolate the tab-delimited article records from the fullhtml file
fhtmlfile = open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/fullhtml_20170606_10-45-58_edit_isolate.txt','w')
with open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/fullhtml_20170606_10-45-58_edit.txt','r') as fhtml:
    # Keep only lines containing a tab (the date/id-prefixed records)
    fullh = [line for line in fhtml if '\t' in line]
    fhtmlfile.write(''.join(fullh))
fhtmlfile.close()
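For downstream analysis, the isolated records can be read into a pandas DataFrame. A quick sketch (the column names are my own labels for the three tab-separated fields; if the html field itself ever contains tabs, those rows would need extra handling):

import pandas as pd

fullhtml = pd.read_csv(
    '/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/fullhtml_20170606_10-45-58_edit_isolate.txt',
    sep='\t',
    header=None,
    names=['date', 'id', 'html'],  # my labels for the three tab-separated fields
)
print(fullhtml.shape)
print(fullhtml.head())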