Use asynchronous IO to speed up crawling

  • The task is IO-bound: nearly all the time is spent waiting on network responses, not on CPU work
  • Use the built-in asyncio event loop together with the popular aiohttp client library

In [6]:
import aiohttp, asyncio, async_timeout
import pandas as pd
from util import AIO_get_data_from_soup

In [10]:
def wu_url(date, icao):
    """
    Construct a wunderground daily-history URL for a given date
    and airport ICAO code.
    """
    return ("https://www.wunderground.com/history/airport/"
            "{icao}/{year}/{month}/{day}/DailyHistory.html".format(
                icao=icao, year=date.year, month=date.month, day=date.day))
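
A quick sanity check of the URL builder; the date and the ICAO code ('KSFO') below are arbitrary examples, not values the crawl depends on.

In [ ]:
# Example only: any date and ICAO code would do
print(wu_url(pd.Timestamp('2016-07-04'), 'KSFO'))
# -> https://www.wunderground.com/history/airport/KSFO/2016/7/4/DailyHistory.html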

In [ ]:
async def fetch(session, url):
    # give up on any single request after 10 seconds
    async with async_timeout.timeout(10):
        async with session.get(url) as response:
            return await response.text()

async def main(loop, time_range, icao):
    async with aiohttp.ClientSession(loop=loop) as session:
        # fire one request per day and wait for all pages concurrently
        tasks = [fetch(session, wu_url(date, icao)) for date in time_range]
        return await asyncio.gather(*tasks)

loop = asyncio.get_event_loop()
time_range = pd.date_range('2016-01-01', '2016-01-07')  # example week
pages = loop.run_until_complete(main(loop, time_range, 'KSFO'))  # example ICAO
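
Firing every request at once can get the crawler throttled or banned. A common remedy is to cap the number of in-flight requests with a semaphore; the sketch below assumes a limit of 10, which is a guess rather than a value tuned for wunderground.

In [ ]:
# Sketch: bound concurrency with a semaphore (the limit of 10 is an
# assumption, not a tuned value)
sem = asyncio.Semaphore(10)

async def bounded_fetch(session, url):
    # at most 10 coroutines are inside this block at any moment;
    # the rest wait here until a slot frees up
    async with sem:
        return await fetch(session, url)

Swapping fetch for bounded_fetch inside main is the only change needed to enable the cap.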