This notebook provides a few functions for scraping hourly weather data from the JMA (Japan Meteorological Agency) website.
Source: Japan Meteorological Agency (https://www.jma.go.jp/en/amedas_h/yesterday-44132.html?areaCode=000&groupCode=30)
In [62]:
import requests
import bs4
import re
import pandas as pd
import numpy as np
In [63]:
page = requests.get('https://www.jma.go.jp/en/amedas_h/yesterday-44132.html?areaCode=000&groupCode=30')
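Optionally, the response can be checked before parsing; raise_for_status is part of the requests API and raises if the download failed:
In [ ]:
page.raise_for_status()  # no-op on success, raises requests.HTTPError otherwise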
In [64]:
bs = bs4.BeautifulSoup(page.content, 'html.parser')
tables = bs.find_all('table')
t = tables[4]  # the hourly data table is the fifth <table> on the page
assert t.tr.td.text == 'Time', 'the page layout might have changed, could not find hourly data table'
In [65]:
# Build a file identifier (station name and date) from the title table, replacing whitespace runs with dashes.
id = re.sub(r'\xa0+| +', '-', tables[3].tr.td.text)
assert re.search(r'2019', id), 'the page layout might have changed, could not extract title'
In [66]:
headers = [td.text for td in t.tr.find_all('td')]
headers
Out[66]:
In [67]:
def sanitize_unit(name):
    name = name.replace('°', 'deg').replace('%', 'percent')
    return re.sub(r'\W', '_', name)
units = [sanitize_unit(td.text) for td in t.find_all('tr')[1].find_all('td')]
units
Out[67]:
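For illustration, a few hypothetical unit strings and how sanitize_unit maps them (these inputs are examples, not taken from the scraped page):
In [ ]:
sanitize_unit('°C'), sanitize_unit('m/s'), sanitize_unit('%')
# expected: ('degC', 'm_s', 'percent')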
In [68]:
headers_with_units = [h + '_' + u for h, u in zip(headers, units)]
headers_with_units
Out[68]:
In [69]:
# Missing readings appear as non-breaking spaces (\xa0); replace them with 'NaN' so pandas can parse them later.
values = [[td.text.replace('\xa0', 'NaN') for td in tr.find_all('td')] for tr in t.find_all('tr')[2:]]
values
Out[69]:
In [70]:
df = pd.DataFrame(columns=headers_with_units, data=values)
df
Out[70]:
In [71]:
df.dtypes
Out[71]:
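Every column is scraped as text, so all dtypes come back as object. A minimal sketch for converting them, assuming the first column is Time and the rest are numeric; under errors='coerce' the 'NaN' placeholders become real NaN values:
In [ ]:
# Sketch only: coerce everything except the Time column to numeric.
numeric_df = df.drop(columns=df.columns[0]).apply(pd.to_numeric, errors='coerce')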
In [72]:
df.to_csv('data/%s-hourly.csv' % id, index=False)
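Note that to_csv does not create the data/ directory; if it might not exist yet, it can be created first:
In [ ]:
import os
os.makedirs('data', exist_ok=True)  # create the output directory if needed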
In [107]:
def SanitizeUnit(name):
    name = name.replace('°', 'deg').replace('%', 'percent')
    return re.sub(r'\W', '_', name)

def ScrapeWeatherPage(content):
    bs = bs4.BeautifulSoup(content, 'html.parser')
    tables = bs.find_all('table')
    t = tables[4]  # the hourly data table is the fifth <table> on the page
    assert t.tr.td.text == 'Time', 'the page layout might have changed, could not find hourly data table'
    # Build a file identifier (station name and date) from the title table.
    id = re.sub(r'\xa0+| +', '-', tables[3].tr.td.text)
    assert re.search(r'2019', id), 'the page layout might have changed, could not extract title'
    headers = [td.text for td in t.tr.find_all('td')]
    units = [SanitizeUnit(td.text) for td in t.find_all('tr')[1].find_all('td')]
    headers_with_units = [h + '_' + u for h, u in zip(headers, units)]
    # Missing readings appear as non-breaking spaces (\xa0); replace them with 'NaN'.
    values = [[td.text.replace('\xa0', 'NaN') for td in tr.find_all('td')] for tr in t.find_all('tr')[2:]]
    df = pd.DataFrame(columns=headers_with_units, data=values)
    return df, id

def ScrapeAndSave(link=None, content=None, htmlfile=None):
    if link:
        page = requests.get(link)
        content = page.content
    elif htmlfile:
        with open(htmlfile) as f:
            content = f.read()
    df, id = ScrapeWeatherPage(content)
    assert len(df) == 24, 'unexpected number of hours: %d != 24' % len(df)
    df.to_csv('data/%s-hourly.csv' % id, index=False)
    print('scraped %d rows to data/%s-hourly.csv' % (len(df), id))
In [103]:
del df, id
In [97]:
ScrapeAndSave('https://www.jma.go.jp/en/amedas_h/today-44166.html?areaCode=000&groupCode=30')
In [93]:
ScrapeAndSave('https://www.jma.go.jp/en/amedas_h/yesterday-44166.html?areaCode=000&groupCode=30')
In [94]:
ScrapeAndSave('https://www.jma.go.jp/en/amedas_h/today-44132.html?areaCode=000&groupCode=30')
In [95]:
ScrapeAndSave('https://www.jma.go.jp/en/amedas_h/yesterday-44132.html?areaCode=000&groupCode=30')
In [108]:
ScrapeAndSave(htmlfile="data/oshima-2019-07-09.html")
In [109]:
ScrapeAndSave(htmlfile='data/setagaya-2019-07-09.html')
In [110]:
ScrapeAndSave(htmlfile='data/tokyo-2019-07-09.html')
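The per-page calls above can also be wrapped in a loop; a sketch assuming the same URL pattern and the two station codes (44132, 44166) used in this notebook:
In [ ]:
for station in (44132, 44166):
    for day in ('today', 'yesterday'):
        ScrapeAndSave('https://www.jma.go.jp/en/amedas_h/%s-%d.html?areaCode=000&groupCode=30'
                      % (day, station))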
In [ ]: