Source (web page with data): https://www.wunderground.com/history/airport/
Last access: Nov 2017
In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
In [3]:
# This example gets the data from Campinas Airport (São Paulo, Brazil)
# on Jan 1st, 2017.
# The historical data provided by the Wunderground website has URLs formatted like this:
# -------------------------------------------------------------------------------------------------
# "https://www.wunderground.com/history/airport/<airport_4letter_code>/
# <YYYY>/<MONTH>/<DAY>/DailyHistory.html"
# -------------------------------------------------------------------------------------------------
page = requests.get("https://www.wunderground.com/history/airport/SBKP/2017/1/1/DailyHistory.html")
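Since the URL pattern is fixed, a small helper can build the address for any airport and date. This is just a sketch; the function name daily_history_url is mine, not part of the original notebook:

def daily_history_url(airport_code, year, month, day):
    # Builds a daily-history URL following the pattern described above.
    return ("https://www.wunderground.com/history/airport/"
            "{}/{}/{}/{}/DailyHistory.html".format(airport_code, year, month, day))

# The same page requested above:
daily_history_url("SBKP", 2017, 1, 1)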
In [4]:
# Check whether the page was downloaded successfully (status code 200)
page.status_code
Out[4]:
200
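Alternatively, requests can raise an exception on a failed download instead of relying on a manual check of the status code:

# Raises requests.exceptions.HTTPError for 4xx/5xx responses;
# does nothing when the download succeeded.
page.raise_for_status()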
In [5]:
soup = BeautifulSoup(page.content, "html.parser")
In [6]:
# Just in case: if it is necessary to save the page as a file
# with open("page.html", "w") as file:
#     file.write(str(soup))
In [7]:
weather_cols = ['hour', 'tempC', 'index-hot', 'dew-point', 'humidity-perc',
                'atm-press', 'visibility', 'wind-dir', 'wind-speed',
                'wind-highest-speed', 'precip', 'weather-events',
                'weather-overall-conditions']
In [8]:
# The hourly observations table is marked with the class "high-res".
table = soup.find(class_="high-res")
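If Wunderground ever changes the page layout, find() returns None and the calls below fail with an AttributeError. A minimal guard (a sketch; the error message is mine):

if table is None:
    raise ValueError("Table with class 'high-res' not found; "
                     "the page layout may have changed.")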
In [9]:
tr = table.find_all("tr")   # table rows (the first row holds the headers)
td = table.find_all("td")   # all data cells, flattened across rows
In [10]:
weather_data_day = []
weather_data_hour = []
for cell in td:
    text = cell.get_text()
    if '12:00 AM' in text:
        # First hour of the day: start the first record.
        weather_data_hour.append(text)
    elif ('AM' in text) or ('PM' in text):
        # A new hour begins: store the finished record and start a new one.
        weather_data_day.append(weather_data_hour)
        weather_data_hour = []
        weather_data_hour.append(text)
    else:
        weather_data_hour.append(text)
# Do not drop the last hour of the day.
weather_data_day.append(weather_data_hour)
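A note on the design: grouping the flat list of <td> cells by spotting the hour strings works, but it depends on the time format. An equivalent sketch that uses the <tr> rows found above instead, so each table row maps directly to one record (weather_data_day_alt is a name I introduce here):

weather_data_day_alt = []
for table_row in tr[1:]:                  # skip the header row
    cells = [cell.get_text() for cell in table_row.find_all("td")]
    if cells:                             # keep only rows that have data cells
        weather_data_day_alt.append(cells)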
In [11]:
df = pd.DataFrame(weather_data_day, columns=weather_cols)
In [12]:
# Make sure every cell is a string before the .str cleanup below.
df = df.astype(str)
In [14]:
df = df.apply(lambda row: row.str.replace("\n", ""), axis=1)
df = df.apply(lambda row: row.str.replace("\t", " "), axis=1)
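The same idea extends to other cleanup steps; for instance, stripping the leading and trailing spaces left behind after replacing the tabs (a sketch):

df = df.apply(lambda col: col.str.strip())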
In [15]:
df.head()
Out[15]:
In [16]:
df['hour']
Out[16]:
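If the hour strings later need to become proper time objects, pandas can parse them. A sketch, assuming the cleaned column holds strings like "12:00 AM" (the '%I:%M %p' format is my assumption about how the cells look after the cleanup above):

pd.to_datetime(df['hour'].str.strip(), format='%I:%M %p').dt.time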