Scraping weather data

This notebook provides a few functions to scrape hourly weather data from the JMA (Japan Meteorological Agency) website.

Source: Japan Meteorological Agency website (https://www.jma.go.jp/en/amedas_h/yesterday-44132.html?areaCode=000&groupCode=30)


In [62]:
import requests
import bs4
import re
import pandas as pd
import numpy as np

In [63]:
page = requests.get('https://www.jma.go.jp/en/amedas_h/yesterday-44132.html?areaCode=000&groupCode=30')

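A quick sanity check on the response is cheap insurance before parsing; this guard is a suggested addition, not part of the original run:

In [ ]:
page.raise_for_status()  # raises requests.HTTPError for a 4xx/5xx response
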
In [64]:
bs = bs4.BeautifulSoup(page.content, 'html.parser')
tables = bs.find_all('table')
t = tables[4]  # the hourly observations table
assert t.tr.td.text == 'Time', 'the page layout might have changed, could not find hourly data table'

In [65]:
id = re.sub(r'\xa0+| +', '-', tables[3].tr.td.text)  # page title, e.g. '16-July-2019-Tokyo'
assert re.search(r'2019', id), 'the page layout might have changed, could not extract title'

In [66]:
headers = [td.text for td in t.tr.find_all('td')]
headers


Out[66]:
['Time',
 'Temperature',
 'Precipitation',
 'WindDirection',
 'WindSpeed',
 'SunshineDuration',
 'Humidity',
 'Pressure']

In [67]:
def sanitize_unit(name):
    name = name.replace('°', 'deg').replace('%', 'percent')
    return re.sub(r'\W', '_', name)

units = [sanitize_unit(td.text) for td in (t.find_all('tr')[1].find_all('td'))]
units


Out[67]:
['Hour', 'degC', 'mm', '16compasspoints', 'm_s', 'h', 'percent', 'hPa']

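The raw unit strings on the page contain characters that are awkward in column names ('°C', 'm/s', '%', and so on, judging by the sanitized output above); a quick illustrative check of what sanitize_unit does to them:

In [ ]:
[sanitize_unit(s) for s in ['°C', 'm/s', '%']]
# -> ['degC', 'm_s', 'percent']
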
In [68]:
headers_with_units = [h + '_' + u for h, u in zip(headers, units)]
headers_with_units


Out[68]:
['Time_Hour',
 'Temperature_degC',
 'Precipitation_mm',
 'WindDirection_16compasspoints',
 'WindSpeed_m_s',
 'SunshineDuration_h',
 'Humidity_percent',
 'Pressure_hPa']

In [69]:
# '\xa0' (non-breaking space) marks a missing value on the page; map it to 'NaN'
values = [[td.text.replace('\xa0', 'NaN') for td in tr.find_all('td')] for tr in t.find_all('tr')[2:]]
values


Out[69]:
[['1', '20.3', '1.0', 'ENE', '3.5', 'NaN', '100', '1004.6'],
 ['2', '20.2', '0.5', 'NE', '1.6', 'NaN', '100', '1004.5'],
 ['3', '20.1', '0.0', 'NE', '2.1', 'NaN', '100', '1004.8'],
 ['4', '20.0', '0.5', 'NE', '2.6', '0.0', '100', '1005.1'],
 ['5', '19.9', '0.5', 'NE', '3.3', '0.0', '100', '1005.4'],
 ['6', '20.0', '1.0', 'NE', '2.4', '0.0', '100', '1006.1'],
 ['7', '20.2', '1.0', 'NE', '2.1', '0.0', '100', '1006.4'],
 ['8', '20.4', '0.5', 'NE', '1.7', '0.0', '100', '1007.1'],
 ['9', '20.6', '0.0', 'NE', '2.3', '0.0', '99', '1007.5'],
 ['10', '21.1', '0.5', 'NE', '1.8', '0.0', '99', '1007.8'],
 ['11', '21.8', '0.5', 'NNE', '2.1', '0.0', '98', '1008.0'],
 ['12', '22.4', '0.0', 'ENE', '3.3', '0.0', '91', '1007.6'],
 ['13', '23.0', '0.0', 'NE', '1.9', '0.1', '85', '1007.5'],
 ['14', '24.6', '0.0', 'ENE', '2.9', '0.1', '79', '1007.7'],
 ['15', '23.7', '0.0', 'NE', '3.4', '0.0', '82', '1008.0'],
 ['16', '24.1', '0.0', 'E', '1.7', '0.0', '82', '1008.2'],
 ['17', '22.9', '0.0', 'E', '2.1', '0.0', '82', '1008.5'],
 ['18', '22.2', '0.0', 'ENE', '1.6', '0.0', '85', '1009.2'],
 ['19', '21.8', '0.0', 'NE', '2.8', '0.0', '86', '1009.8'],
 ['20', '21.2', '0.0', 'NE', '3.6', '0.0', '84', '1010.4'],
 ['21', '20.9', '0.0', 'NE', '2.5', 'NaN', '87', '1010.9'],
 ['22', '20.7', '0.0', 'ENE', '2.9', 'NaN', '88', '1011.6'],
 ['23', '19.6', '0.5', 'NE', '2.7', 'NaN', '98', '1011.8'],
 ['24', '19.5', '0.5', 'NE', '2.3', 'NaN', '100', '1011.8']]

In [70]:
df = pd.DataFrame(columns=headers_with_units, data=values)
df


Out[70]:
Time_Hour Temperature_degC Precipitation_mm WindDirection_16compasspoints WindSpeed_m_s SunshineDuration_h Humidity_percent Pressure_hPa
0 1 20.3 1.0 ENE 3.5 NaN 100 1004.6
1 2 20.2 0.5 NE 1.6 NaN 100 1004.5
2 3 20.1 0.0 NE 2.1 NaN 100 1004.8
3 4 20.0 0.5 NE 2.6 0.0 100 1005.1
4 5 19.9 0.5 NE 3.3 0.0 100 1005.4
5 6 20.0 1.0 NE 2.4 0.0 100 1006.1
6 7 20.2 1.0 NE 2.1 0.0 100 1006.4
7 8 20.4 0.5 NE 1.7 0.0 100 1007.1
8 9 20.6 0.0 NE 2.3 0.0 99 1007.5
9 10 21.1 0.5 NE 1.8 0.0 99 1007.8
10 11 21.8 0.5 NNE 2.1 0.0 98 1008.0
11 12 22.4 0.0 ENE 3.3 0.0 91 1007.6
12 13 23.0 0.0 NE 1.9 0.1 85 1007.5
13 14 24.6 0.0 ENE 2.9 0.1 79 1007.7
14 15 23.7 0.0 NE 3.4 0.0 82 1008.0
15 16 24.1 0.0 E 1.7 0.0 82 1008.2
16 17 22.9 0.0 E 2.1 0.0 82 1008.5
17 18 22.2 0.0 ENE 1.6 0.0 85 1009.2
18 19 21.8 0.0 NE 2.8 0.0 86 1009.8
19 20 21.2 0.0 NE 3.6 0.0 84 1010.4
20 21 20.9 0.0 NE 2.5 NaN 87 1010.9
21 22 20.7 0.0 ENE 2.9 NaN 88 1011.6
22 23 19.6 0.5 NE 2.7 NaN 98 1011.8
23 24 19.5 0.5 NE 2.3 NaN 100 1011.8

In [71]:
df.dtypes


Out[71]:
Time_Hour                        object
Temperature_degC                 object
Precipitation_mm                 object
WindDirection_16compasspoints    object
WindSpeed_m_s                    object
SunshineDuration_h               object
Humidity_percent                 object
Pressure_hPa                     object
dtype: object

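Every column comes back as object (string) dtype. For analysis rather than archiving, a minimal conversion sketch (column names as produced above; WindDirection stays categorical text, and the copy keeps the CSV written below byte-identical to the original run):

In [ ]:
numeric_cols = [c for c in df.columns if c != 'WindDirection_16compasspoints']
df_num = df.copy()
df_num[numeric_cols] = df_num[numeric_cols].apply(pd.to_numeric, errors='coerce')
df_num.dtypes
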
In [72]:
df.to_csv('data/%s-hourly.csv' % id, index=False)

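Reading the file back, pandas parses the 'NaN' sentinel written above as a proper missing value:

In [ ]:
pd.read_csv('data/%s-hourly.csv' % id).head()
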
In [107]:
def SanitizeUnit(name):
    name = name.replace('°', 'deg').replace('%', 'percent')
    return re.sub(r'\W', '_', name)

def ScrapeWeatherPage(content):
    bs = bs4.BeautifulSoup(content, 'html.parser')
    tables = bs.find_all('table')
    t = tables[4]  # the hourly observations table
    assert t.tr.td.text == 'Time', 'the page layout might have changed, could not find hourly data table'
    id = re.sub(r'\xa0+| +', '-', tables[3].tr.td.text)  # page title, e.g. '16-July-2019-Tokyo'
    assert re.search(r'2019', id), 'the page layout might have changed, could not extract title'
    headers = [td.text for td in t.tr.find_all('td')]
    units = [SanitizeUnit(td.text) for td in t.find_all('tr')[1].find_all('td')]
    headers_with_units = [h + '_' + u for h, u in zip(headers, units)]
    # '\xa0' (non-breaking space) marks a missing value on the page; map it to 'NaN'
    values = [[td.text.replace('\xa0', 'NaN') for td in tr.find_all('td')] for tr in t.find_all('tr')[2:]]
    df = pd.DataFrame(columns=headers_with_units, data=values)
    return df, id

def ScrapeAndSave(link=None, content=None, htmlfile=None):
    if link:
        page = requests.get(link)
        page.raise_for_status()
        content = page.content
    elif htmlfile:
        with open(htmlfile) as f:
            content = f.read()
    if content is None:
        raise ValueError('one of link, content or htmlfile must be given')
    df, id = ScrapeWeatherPage(content)
    assert len(df) == 24, 'unexpected number of hours: %d != 24' % len(df)
    df.to_csv('data/%s-hourly.csv' % id, index=False)
    print('scraped %d rows to data/%s-hourly.csv' % (len(df), id))

In [103]:
del df, id  # drop the notebook globals so the functions above are exercised end to end

In [97]:
ScrapeAndSave('https://www.jma.go.jp/en/amedas_h/today-44166.html?areaCode=000&groupCode=30')


scraped 24 rows to data/17-July-2019-Haneda-(Tokyo-International-Airport)-hourly.csv

In [93]:
ScrapeAndSave('https://www.jma.go.jp/en/amedas_h/yesterday-44166.html?areaCode=000&groupCode=30')


scraped 24 rows to data/16-July-2019-Haneda-(Tokyo-International-Airport)-hourly.csv

In [94]:
ScrapeAndSave('https://www.jma.go.jp/en/amedas_h/today-44132.html?areaCode=000&groupCode=30')


scraped 24 rows to data/17-July-2019-Tokyo-hourly.csv

In [95]:
ScrapeAndSave('https://www.jma.go.jp/en/amedas_h/yesterday-44132.html?areaCode=000&groupCode=30')


scraped 24 rows to data/16-July-2019-Tokyo-hourly.csv

In [108]:
ScrapeAndSave(htmlfile="data/oshima-2019-07-09.html")


scraped 24 rows to data/09-July-2019-Oshima-hourly.csv

In [109]:
ScrapeAndSave(htmlfile='data/setagaya-2019-07-09.html')


scraped 24 rows to data/09-July-2019-Setagaya-hourly.csv

In [110]:
ScrapeAndSave(htmlfile='data/tokyo-2019-07-09.html')


scraped 24 rows to data/09-July-2019-Tokyo-hourly.csv

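Once a few days or stations have been scraped, the per-day files can be combined into one frame; a minimal sketch assuming the data/ layout used above:

In [ ]:
import glob

frames = []
for path in sorted(glob.glob('data/*-hourly.csv')):
    frame = pd.read_csv(path)
    frame['source'] = path  # remember which file each row came from
    frames.append(frame)
combined = pd.concat(frames, ignore_index=True)
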
In [ ]: