Weather data: web Scraping with BeautifulSoup

Capstone Project (Springboard Data Intensive course)

Source (web page with data): https://www.wunderground.com/history/airport/

Last access: nov, 2017



In [2]:

    
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re



In [3]:

    
# This example gets the data from Campinas  Airport (São Paulo, Brazil)
# on Jan, 1st, 2017.
# The historical data provided by Wundergrund website has URLs formation like this:
# -------------------------------------------------------------------------------------------------
# "https://www.wunderground.com/history/airport/<airport_4leters_code>/
#                                       <YYYY>/<MONTH>/<DAY>/DailyHistory.html"
# -------------------------------------------------------------------------------------------------                                             
page = requests.get("https://www.wunderground.com/history/airport/SBKP/2017/1/1/DailyHistory.html")



In [4]:

    
# Test if content of the webpage was available (return code = 200)
page.status_code









    Out[4]:





200



In [5]:

    
soup = BeautifulSoup(page.content, "html.parser")



In [6]:

    
# Just if case: if it is necessary to save as a file
# with open("html.parser", "w") as file:
#    file.write(str(soup))

Get weather data from a HTML table



In [7]:

    
weather_cols = ['hour', 'tempC', 'index-hot', 'dew-point', 'humidity-perc',\
                'atm-press', 'visibility', 'wind-dir', 'wind-speed', \
                'wind-highest-speed', 'precip', 'weather-events', \
                'weather-overall-conditions']



In [8]:

    
table = soup.find(class_="high-res")



In [9]:

    
tr = table.find_all("tr")
td = table.find_all("td")



In [10]:

    
weather_data_day  = []
weather_data_hour = []
for row in td:
    row = row.get_text()
    if ('12:00 AM' in row):
        weather_data_hour.append(row)
    elif (('AM' in row) | ('PM' in row)):
        weather_data_day.append(weather_data_hour)
        weather_data_hour = []
        weather_data_hour.append(row)
    else:
        weather_data_hour.append(row)



In [11]:

    
df = pd.DataFrame(weather_data_day, columns = weather_cols)



In [12]:

    
df = df.astype(str)



In [14]:

    
df= df.apply(lambda row: row.str.replace("\n", ""), axis=1)
df= df.apply(lambda row: row.str.replace("\t", " "), axis=1)



In [15]:

    
df.head()









    Out[15]:







  
    
      
      hour
      tempC
      index-hot
      dew-point
      humidity-perc
      atm-press
      visibility
      wind-dir
      wind-speed
      wind-highest-speed
      precip
      weather-events
      weather-overall-conditions
    
  
  
    
      0
      12:00 AM
      24.0 °C
      -
      12.0 °C
      47%
      1016 hPa
      -
      NW
      7.4 km/h  / 2.1 m/s
      -
      N/A
      
      Clear
    
    
      1
      1:00 AM
      23.0 °C
      -
      13.0 °C
      53%
      1016 hPa
      -
      NE
      7.4 km/h  / 2.1 m/s
      -
      N/A
      
      Clear
    
    
      2
      2:00 AM
      23.0 °C
      -
      15.0 °C
      61%
      1015 hPa
      -
      East
      3.7 km/h  / 1.0 m/s
      -
      N/A
      
      Clear
    
    
      3
      3:00 AM
      22.0 °C
      -
      19.0 °C
      83%
      1015 hPa
      -
      Calm
      Calm
      -
      N/A
      
      Clear
    
    
      4
      4:00 AM
      22.0 °C
      -
      19.0 °C
      83%
      1015 hPa
      -
      ENE
      3.7 km/h  / 1.0 m/s
      -
      N/A
      
      Clear



In [16]:

    
df['hour']









    Out[16]:





0     12:00 AM
1      1:00 AM
2      2:00 AM
3      3:00 AM
4      4:00 AM
5      5:00 AM
6      6:00 AM
7      7:00 AM
8      8:00 AM
9      9:00 AM
10    10:00 AM
11    11:00 AM
12    12:00 PM
13     1:00 PM
14     2:00 PM
15     3:00 PM
16     4:00 PM
17     4:29 PM
18     5:00 PM
19     6:00 PM
20     6:08 PM
21     6:34 PM
22     7:00 PM
23     8:00 PM
24     9:00 PM
25    10:00 PM
Name: hour, dtype: object



In [ ]:

	hour	tempC	index-hot	dew-point	humidity-perc	atm-press	visibility	wind-dir	wind-speed	wind-highest-speed	precip	weather-overall-conditions
0	12:00 AM	24.0 °C	-	12.0 °C	47%	1016 hPa	-	NW	7.4 km/h / 2.1 m/s	-	N/A	Clear
1	1:00 AM	23.0 °C	-	13.0 °C	53%	1016 hPa	-	NE	7.4 km/h / 2.1 m/s	-	N/A	Clear
2	2:00 AM	23.0 °C	-	15.0 °C	61%	1015 hPa	-	East	3.7 km/h / 1.0 m/s	-	N/A	Clear
3	3:00 AM	22.0 °C	-	19.0 °C	83%	1015 hPa	-	Calm	Calm	-	N/A	Clear
4	4:00 AM	22.0 °C	-	19.0 °C	83%	1015 hPa	-	ENE	3.7 km/h / 1.0 m/s	-	N/A	Clear