Weather data: web Scraping with BeautifulSoup

Capstone Project (Springboard Data Intensive course)

Source (web page with data): https://www.wunderground.com/history/airport/

Last access: nov, 2017


In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [3]:
# This example gets the data from Campinas  Airport (São Paulo, Brazil)
# on Jan, 1st, 2017.
# The historical data provided by Wundergrund website has URLs formation like this:
# -------------------------------------------------------------------------------------------------
# "https://www.wunderground.com/history/airport/<airport_4leters_code>/
#                                       <YYYY>/<MONTH>/<DAY>/DailyHistory.html"
# -------------------------------------------------------------------------------------------------                                             
page = requests.get("https://www.wunderground.com/history/airport/SBKP/2017/1/1/DailyHistory.html")

In [4]:
# Test if content of the webpage was available (return code = 200)
page.status_code


Out[4]:
200

In [5]:
soup = BeautifulSoup(page.content, "html.parser")

In [6]:
# Just if case: if it is necessary to save as a file
# with open("html.parser", "w") as file:
#    file.write(str(soup))

Get weather data from a HTML table


In [7]:
weather_cols = ['hour', 'tempC', 'index-hot', 'dew-point', 'humidity-perc',\
                'atm-press', 'visibility', 'wind-dir', 'wind-speed', \
                'wind-highest-speed', 'precip', 'weather-events', \
                'weather-overall-conditions']

In [8]:
table = soup.find(class_="high-res")

In [9]:
tr = table.find_all("tr")
td = table.find_all("td")

In [10]:
weather_data_day  = []
weather_data_hour = []
for row in td:
    row = row.get_text()
    if ('12:00 AM' in row):
        weather_data_hour.append(row)
    elif (('AM' in row) | ('PM' in row)):
        weather_data_day.append(weather_data_hour)
        weather_data_hour = []
        weather_data_hour.append(row)
    else:
        weather_data_hour.append(row)

In [11]:
df = pd.DataFrame(weather_data_day, columns = weather_cols)

In [12]:
df = df.astype(str)

In [14]:
df= df.apply(lambda row: row.str.replace("\n", ""), axis=1)
df= df.apply(lambda row: row.str.replace("\t", " "), axis=1)

In [15]:
df.head()


Out[15]:
hour tempC index-hot dew-point humidity-perc atm-press visibility wind-dir wind-speed wind-highest-speed precip weather-events weather-overall-conditions
0 12:00 AM 24.0 °C - 12.0 °C 47% 1016 hPa - NW 7.4 km/h / 2.1 m/s - N/A Clear
1 1:00 AM 23.0 °C - 13.0 °C 53% 1016 hPa - NE 7.4 km/h / 2.1 m/s - N/A Clear
2 2:00 AM 23.0 °C - 15.0 °C 61% 1015 hPa - East 3.7 km/h / 1.0 m/s - N/A Clear
3 3:00 AM 22.0 °C - 19.0 °C 83% 1015 hPa - Calm Calm - N/A Clear
4 4:00 AM 22.0 °C - 19.0 °C 83% 1015 hPa - ENE 3.7 km/h / 1.0 m/s - N/A Clear

In [16]:
df['hour']


Out[16]:
0     12:00 AM
1      1:00 AM
2      2:00 AM
3      3:00 AM
4      4:00 AM
5      5:00 AM
6      6:00 AM
7      7:00 AM
8      8:00 AM
9      9:00 AM
10    10:00 AM
11    11:00 AM
12    12:00 PM
13     1:00 PM
14     2:00 PM
15     3:00 PM
16     4:00 PM
17     4:29 PM
18     5:00 PM
19     6:00 PM
20     6:08 PM
21     6:34 PM
22     7:00 PM
23     8:00 PM
24     9:00 PM
25    10:00 PM
Name: hour, dtype: object

In [ ]: