This notebook provides a few functions for scraping hourly weather data from the JMA (Japan Meteorological Agency) website.
Source: Japan Meteorological Agency (https://www.jma.go.jp/en/amedas_h/yesterday-44132.html?areaCode=000&groupCode=30)
In [62]:
import requests
import bs4
import re
import pandas as pd
import numpy as np
In [63]:
page = requests.get('https://www.jma.go.jp/en/amedas_h/yesterday-44132.html?areaCode=000&groupCode=30')
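Optionally, the response can be checked before parsing; raise_for_status is part of the requests API and raises if the download failed:
In [ ]:
page.raise_for_status()  # no-op on success, raises requests.HTTPError otherwise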
In [64]:
bs = bs4.BeautifulSoup(page.content, 'html.parser')
tables = bs.find_all('table')
t = tables[4]  # the hourly data table is the fifth <table> on the page
assert t.tr.td.text == 'Time', 'the page layout might have changed, could not find hourly data table'
In [65]:
# Build a file identifier (station name and date) from the title table, replacing whitespace runs with dashes.
id = re.sub(r'\xa0+| +', '-', tables[3].tr.td.text)
assert re.search(r'2019', id), 'the page layout might have changed, could not extract title'
In [66]:
headers = [td.text for td in t.tr.find_all('td')]
headers
Out[66]:
In [67]:
def sanitize_unit(name):
    name = name.replace('°', 'deg').replace('%', 'percent')
    return re.sub(r'\W', '_', name)
units = [sanitize_unit(td.text) for td in t.find_all('tr')[1].find_all('td')]
units
Out[67]:
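For illustration, a few hypothetical unit strings and how sanitize_unit maps them (these inputs are examples, not taken from the scraped page):
In [ ]:
sanitize_unit('°C'), sanitize_unit('m/s'), sanitize_unit('%')
# expected: ('degC', 'm_s', 'percent')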
In [68]:
headers_with_units = [h + '_' + u for h, u in zip(headers, units)]
headers_with_units
Out[68]:
In [69]:
# Missing readings appear as non-breaking spaces (\xa0); replace them with 'NaN' so pandas can parse them later.
values = [[td.text.replace('\xa0', 'NaN') for td in tr.find_all('td')] for tr in t.find_all('tr')[2:]]
values
Out[69]:
In [70]:
df = pd.DataFrame(columns=headers_with_units, data=values)
df
Out[70]:
In [71]:
df.dtypes
Out[71]:
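Every column is scraped as text, so all dtypes come back as object. A minimal sketch for converting them, assuming the first column is Time and the rest are numeric; under errors='coerce' the 'NaN' placeholders become real NaN values:
In [ ]:
# Sketch only: coerce everything except the Time column to numeric.
numeric_df = df.drop(columns=df.columns[0]).apply(pd.to_numeric, errors='coerce')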
In [72]:
df.to_csv('data/%s-hourly.csv' % id, index=False)
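Note that to_csv does not create the data/ directory; if it might not exist yet, it can be created first:
In [ ]:
import os
os.makedirs('data', exist_ok=True)  # create the output directory if needed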
In [107]:
def SanitizeUnit(name):
    name = name.replace('°', 'deg').replace('%', 'percent')
    return re.sub(r'\W', '_', name)

def ScrapeWeatherPage(content):
    bs = bs4.BeautifulSoup(content, 'html.parser')
    tables = bs.find_all('table')
    t = tables[4]  # the hourly data table is the fifth <table> on the page
    assert t.tr.td.text == 'Time', 'the page layout might have changed, could not find hourly data table'
    # Build a file identifier (station name and date) from the title table.
    id = re.sub(r'\xa0+| +', '-', tables[3].tr.td.text)
    assert re.search(r'2019', id), 'the page layout might have changed, could not extract title'
    headers = [td.text for td in t.tr.find_all('td')]
    units = [SanitizeUnit(td.text) for td in t.find_all('tr')[1].find_all('td')]
    headers_with_units = [h + '_' + u for h, u in zip(headers, units)]
    # Missing readings appear as non-breaking spaces (\xa0); replace them with 'NaN'.
    values = [[td.text.replace('\xa0', 'NaN') for td in tr.find_all('td')] for tr in t.find_all('tr')[2:]]
    df = pd.DataFrame(columns=headers_with_units, data=values)
    return df, id

def ScrapeAndSave(link=None, content=None, htmlfile=None):
    if link:
        page = requests.get(link)
        content = page.content
    elif htmlfile:
        with open(htmlfile) as f:
            content = f.read()
    df, id = ScrapeWeatherPage(content)
    assert len(df) == 24, 'unexpected number of hours: %d != 24' % len(df)
    df.to_csv('data/%s-hourly.csv' % id, index=False)
    print('scraped %d rows to data/%s-hourly.csv' % (len(df), id))
In [103]:
del df, id
In [97]:
ScrapeAndSave('https://www.jma.go.jp/en/amedas_h/today-44166.html?areaCode=000&groupCode=30')
In [93]:
ScrapeAndSave('https://www.jma.go.jp/en/amedas_h/yesterday-44166.html?areaCode=000&groupCode=30')
In [94]:
ScrapeAndSave('https://www.jma.go.jp/en/amedas_h/today-44132.html?areaCode=000&groupCode=30')
In [95]:
ScrapeAndSave('https://www.jma.go.jp/en/amedas_h/yesterday-44132.html?areaCode=000&groupCode=30')
In [108]:
ScrapeAndSave(htmlfile="data/oshima-2019-07-09.html")
In [109]:
ScrapeAndSave(htmlfile='data/setagaya-2019-07-09.html')
In [110]:
ScrapeAndSave(htmlfile='data/tokyo-2019-07-09.html')
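The per-page calls above can also be wrapped in a loop; a sketch assuming the same URL pattern and the two station codes (44132, 44166) used in this notebook:
In [ ]:
for station in (44132, 44166):
    for day in ('today', 'yesterday'):
        ScrapeAndSave('https://www.jma.go.jp/en/amedas_h/%s-%d.html?areaCode=000&groupCode=30'
                      % (day, station))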
In [ ]: