In [1]:
import datetime
from os import rename
from os.path import splitext
from calendar import month_name
from httplib import HTTPConnection
from time import clock
from urllib import urlretrieve
import urllib2
from zipfile import ZipFile
import numpy as np
from bs4 import BeautifulSoup

from data_scraper import *

import os
import errno
import humanize
import sys
import json
import pandas as pd

In [2]:
def _parseCell(cells, convert, fallback, fromSpan=True):
    """Convert the value cell (second <td>) of one history-table row.

    cells    -- the <td> elements of the row
    convert  -- callable applied to the extracted text (e.g. int, float)
    fallback -- value returned when the cell is missing or cannot be converted
    fromSpan -- if True the text is read from the nested
                <span class="wx-value">, otherwise from the cell itself
    """
    try:
        cell = cells[1]
        if fromSpan:
            raw = cell.find('span', attrs={"class": "wx-value"}).text
        else:
            raw = cell.text
        return convert(raw)
    except (AttributeError, IndexError, TypeError, ValueError):
        # missing cell / missing span / non-numeric text --> use the sentinel
        return fallback


def parseHistoryTable(table):
    """Extract the daily weather summary from a parsed history table.

    table -- an element offering find_all('tr'); every row is expected to
             carry its label in the first <td> and its value in the second.

    Returns a dictionary with the keys 'temperature', 'events', 'humidity',
    'precipitation', 'snowfall', 'snowdepth', 'sealevelpressure',
    'visibility' and 'windspeed'. A field that never appears in the table
    keeps its default (0, or '' for events); a field that appears but cannot
    be parsed gets its fallback value (see the table below).
    """
    # row label -> (result key, converter, fallback on failure, value in wx-value span?)
    fields = {
        'Mean Temperature': ('temperature', int, 9999, True),        # in degree F
        'Average Humidity': ('humidity', int, -1, False),            # humidity in %
        'Precipitation': ('precipitation', float, -1., True),        # in inch
        'Snow': ('snowfall', float, 0., True),                       # in inch
        'Snow Depth': ('snowdepth', float, 0., True),                # in inch
        'Sea Level Pressure': ('sealevelpressure', float, -1., True),  # in inch
        'Visibility': ('visibility', float, -1., True),              # in miles
        'Wind Speed': ('windspeed', float, -1., True),               # in miles per hour
        'Events': ('events', lambda text: text, '?', False),         # some string i.e. rain
    }

    # The first row carrying one of these labels is skipped; only later
    # occurrences are parsed (presumably the first one is a section heading
    # with the same text -- TODO confirm against the page markup).
    skipFirst = set(['Precipitation', 'Snow', 'Sea Level Pressure'])

    result = {
        'temperature': 0,
        'events': '',
        'humidity': 0,
        'precipitation': 0,
        'snowfall': 0,
        'snowdepth': 0,
        'sealevelpressure': 0,
        'visibility': 0,
        'windspeed': 0,
    }

    # Find all the <tr> tag pairs, skip the first one, then inspect each.
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if not cells:
            # rows without any <td> carry no data; indexing them would
            # raise IndexError and abort the whole parse
            continue

        label = cells[0].text
        if label not in fields:
            continue
        if label in skipFirst:
            skipFirst.discard(label)
            continue

        key, convert, fallback, fromSpan = fields[label]
        result[key] = _parseCell(cells, convert, fallback, fromSpan)

    return result

def getWeather(year, month, day, airportcode):
    """Download and parse the daily weather history page for one airport.

    year, month, day -- the calendar date to query
    airportcode      -- airport code inserted into the query string (e.g. 'BOS')

    Returns the dictionary produced by parseHistoryTable for the page's
    table with id "historyTable".

    Raises whatever urllib2.urlopen raises on network/HTTP failure, and
    ValueError when the downloaded page contains no history table.
    """
    url = 'http://www.wunderground.com/cgi-bin/findweather/getForecast?airportorwmo=query&historytype=DailyHistory&backurl=%2Fhistory%2Findex.html&code={airportcode}&month={month}&day={day}&year={year}'
    response = urllib2.urlopen(url.format(year=year, day=day, month=month, airportcode=airportcode))
    try:
        html = response.read()
    finally:
        # always release the connection; this function is called hundreds of
        # times in a row when a whole year is scraped
        response.close()

    soup = BeautifulSoup(html, "html.parser")

    table = soup.find("table", attrs={"id": "historyTable"})
    if table is None:
        # fail with a clear message instead of an AttributeError deep inside
        # the table parser
        raise ValueError('no historyTable found for %s on %04d-%02d-%02d'
                         % (airportcode, year, month, day))

    return parseHistoryTable(table)

In [3]:
# Sample request: one day for one airport. The date variables are the ones
# actually passed to getWeather, so the displayed result matches them.
year = 2014
month = 1
day = 3
airportcode = 'BOS'

getWeather(year, month, day, airportcode)


Out[3]:
{'events': u'\nFog\n\t,\nSnow\n',
 'humidity': 66,
 'precipitation': 0.19,
 'sealevelpressure': 29.99,
 'snowdepth': 0,
 'snowfall': 0,
 'temperature': 8,
 'visibility': 5.0,
 'windspeed': 17.0}

In [10]:
# Location of the JSON cache holding the weather records fetched so far.
weatherFile = os.path.join('..', 'cache', 'weather_data.json')

# Start with an empty cache and replace it with the stored one, if a cache
# file from an earlier run exists.
weatherDict = {}
if file_exists(weatherFile):
    with open(weatherFile) as infile:
        weatherDict = json.load(infile)

# Airport list; the CSV has no header row.
airportsFile = os.path.join('..', 'data', 'airports.csv')
dfairports = pd.read_csv(airportsFile, header=None)

In [11]:
# Just get the JFK and BOS airports. Rows are selected by airport code (the
# first column) instead of by row position, so re-running this cell on the
# already reduced frame yields the same two rows instead of failing.
dfairports = dfairports[dfairports.iloc[:, 0].isin(['JFK', 'BOS'])]

In [12]:
from datetime import timedelta, date

# take 2014
year = 2014

# the date range is the same for every airport: each day of the year
start_date = date(year, 1, 1)
end_date = date(year, 12, 31)
delta = timedelta(days=1)

# number of rows, as a plain int (DataFrame.count() would give one count per column)
num_airports = len(dfairports)

# iterate over all airports; the airport code is the first value of each row
for pos, (_, item) in enumerate(dfairports.iterrows(), 1):
    airport = item.values[0]

    print('processing %s (%d/%d)...' % (airport, pos, num_airports))

    # continue from the records already cached for this airport, if any
    rows = weatherDict.get(airport, [])
    date_keys = set(el['date'] for el in rows)

    # iterate over one year to get the data from it
    d = start_date
    while d <= end_date:
        key = '%04d%02d%02d' % (d.year, d.month, d.day)

        # data already requested? --> skip!
        if key not in date_keys:
            print('GET %s' % key)
            try:
                rows.append({'data': getWeather(d.year, d.month, d.day, airport), 'date': key})
            except Exception:
                # best effort: a failed day is left out and requested again
                # on the next run; Ctrl-C is no longer swallowed here
                print('error for %s' % key)
        d += delta
    weatherDict[airport] = rows

    # save current JSON after every airport so finished airports are not lost
    with open(weatherFile, 'wb') as outfile:
        json.dump(weatherDict, outfile)

# save JSON! (also writes the cache file when there was no airport to process)
with open(weatherFile, 'wb') as outfile:
    json.dump(weatherDict, outfile)


processing JFK (1/2)...
GET 20140101
GET 20140102
GET 20140103
GET 20140104
GET 20140105
GET 20140106
GET 20140107
GET 20140108
GET 20140109
GET 20140110
GET 20140111
GET 20140112
GET 20140113
GET 20140114
GET 20140115
GET 20140116
GET 20140117
GET 20140118
GET 20140119
GET 20140120
GET 20140121
GET 20140122
GET 20140123
GET 20140124
GET 20140125
GET 20140126
GET 20140127
GET 20140128
GET 20140129
GET 20140130
GET 20140131
GET 20140201
GET 20140202
GET 20140203
GET 20140204
GET 20140205
GET 20140206
GET 20140207
GET 20140208
GET 20140209
GET 20140210
GET 20140211
GET 20140212
GET 20140213
GET 20140214
GET 20140215
GET 20140216
GET 20140217
GET 20140218
GET 20140219
GET 20140220
GET 20140221
GET 20140222
GET 20140223
GET 20140224
GET 20140225
GET 20140226
GET 20140227
GET 20140228
GET 20140301
GET 20140302
GET 20140303
GET 20140304
GET 20140305
GET 20140306
GET 20140307
GET 20140308
GET 20140309
GET 20140310
GET 20140311
GET 20140312
GET 20140313
GET 20140314
GET 20140315
GET 20140316
GET 20140317
GET 20140318
GET 20140319
GET 20140320
GET 20140321
GET 20140322
GET 20140323
GET 20140324
GET 20140325
GET 20140326
GET 20140327
GET 20140328
GET 20140329
GET 20140330
GET 20140331
GET 20140401
GET 20140402
GET 20140403
GET 20140404
GET 20140405
GET 20140406
GET 20140407
GET 20140408
GET 20140409
GET 20140410
GET 20140411
GET 20140412
GET 20140413
GET 20140414
GET 20140415
GET 20140416
GET 20140417
GET 20140418
GET 20140419
GET 20140420
GET 20140421
GET 20140422
GET 20140423
GET 20140424
GET 20140425
GET 20140426
GET 20140427
GET 20140428
GET 20140429
GET 20140430
GET 20140501
GET 20140502
GET 20140503
GET 20140504
GET 20140505
GET 20140506
GET 20140507
GET 20140508
GET 20140509
GET 20140510
GET 20140511
GET 20140512
GET 20140513
GET 20140514
GET 20140515
GET 20140516
GET 20140517
GET 20140518
GET 20140519
GET 20140520
GET 20140521
GET 20140522
GET 20140523
GET 20140524
GET 20140525
GET 20140526
GET 20140527
GET 20140528
GET 20140529
GET 20140530
GET 20140531
GET 20140601
GET 20140602
GET 20140603
GET 20140604
GET 20140605
GET 20140606
GET 20140607
GET 20140608
GET 20140609
GET 20140610
GET 20140611
GET 20140612
GET 20140613
GET 20140614
GET 20140615
GET 20140616
GET 20140617
GET 20140618
GET 20140619
GET 20140620
GET 20140621
GET 20140622
GET 20140623
GET 20140624
GET 20140625
GET 20140626
GET 20140627
GET 20140628
GET 20140629
GET 20140630
GET 20140701
GET 20140702
GET 20140703
GET 20140704
GET 20140705
GET 20140706
GET 20140707
GET 20140708
GET 20140709
GET 20140710
GET 20140711
GET 20140712
GET 20140713
GET 20140714
GET 20140715
GET 20140716
GET 20140717
GET 20140718
GET 20140719
GET 20140720
GET 20140721
GET 20140722
GET 20140723
GET 20140724
GET 20140725
GET 20140726
GET 20140727
GET 20140728
GET 20140729
GET 20140730
GET 20140731
GET 20140801
GET 20140802
GET 20140803
GET 20140804
GET 20140805
GET 20140806
GET 20140807
GET 20140808
GET 20140809
GET 20140810
GET 20140811
GET 20140812
GET 20140813
GET 20140814
GET 20140815
GET 20140816
GET 20140817
GET 20140818
GET 20140819
GET 20140820
GET 20140821
GET 20140822
GET 20140823
GET 20140824
GET 20140825
GET 20140826
GET 20140827
GET 20140828
GET 20140829
GET 20140830
GET 20140831
GET 20140901
GET 20140902
GET 20140903
GET 20140904
GET 20140905
GET 20140906
GET 20140907
GET 20140908
GET 20140909
GET 20140910
GET 20140911
GET 20140912
GET 20140913
GET 20140914
GET 20140915
GET 20140916
GET 20140917
GET 20140918
GET 20140919
GET 20140920
GET 20140921
GET 20140922
GET 20140923
GET 20140924
GET 20140925
GET 20140926
GET 20140927
GET 20140928
GET 20140929
GET 20140930
GET 20141001
GET 20141002
GET 20141003
GET 20141004
GET 20141005
GET 20141006
GET 20141007
GET 20141008
GET 20141009
GET 20141010
GET 20141011
GET 20141012
GET 20141013
GET 20141014
GET 20141015
GET 20141016
GET 20141017
GET 20141018
GET 20141019
GET 20141020
GET 20141021
GET 20141022
GET 20141023
GET 20141024
GET 20141025
GET 20141026
GET 20141027
GET 20141028
GET 20141029
GET 20141030
GET 20141031
GET 20141101
GET 20141102
GET 20141103
GET 20141104
GET 20141105
GET 20141106
GET 20141107
GET 20141108
GET 20141109
GET 20141110
GET 20141111
GET 20141112
GET 20141113
GET 20141114
GET 20141115
GET 20141116
GET 20141117
GET 20141118
GET 20141119
GET 20141120
GET 20141121
GET 20141122
GET 20141123
GET 20141124
GET 20141125
GET 20141126
GET 20141127
GET 20141128
GET 20141129
GET 20141130
GET 20141201
GET 20141202
GET 20141203
GET 20141204
GET 20141205
GET 20141206
GET 20141207
GET 20141208
GET 20141209
GET 20141210
GET 20141211
GET 20141212
GET 20141213
GET 20141214
GET 20141215
GET 20141216
GET 20141217
GET 20141218
GET 20141219
GET 20141220
GET 20141221
GET 20141222
GET 20141223
GET 20141224
GET 20141225
GET 20141226
GET 20141227
GET 20141228
GET 20141229
GET 20141230
GET 20141231
processing BOS (2/2)...
GET 20140101
GET 20140102
GET 20140103
GET 20140104
GET 20140105
GET 20140106
GET 20140107
GET 20140108
GET 20140109
GET 20140110
GET 20140111
GET 20140112
GET 20140113
GET 20140114
GET 20140115
GET 20140116
GET 20140117
GET 20140118
GET 20140119
GET 20140120
GET 20140121
GET 20140122
GET 20140123
GET 20140124
GET 20140125
GET 20140126
GET 20140127
GET 20140128
GET 20140129
GET 20140130
GET 20140131
GET 20140201
GET 20140202
GET 20140203
GET 20140204
GET 20140205
GET 20140206
GET 20140207
GET 20140208
GET 20140209
GET 20140210
GET 20140211
GET 20140212
GET 20140213
GET 20140214
GET 20140215
GET 20140216
GET 20140217
GET 20140218
GET 20140219
GET 20140220
GET 20140221
GET 20140222
GET 20140223
GET 20140224
GET 20140225
GET 20140226
GET 20140227
GET 20140228
GET 20140301
GET 20140302
GET 20140303
GET 20140304
GET 20140305
GET 20140306
GET 20140307
GET 20140308
GET 20140309
GET 20140310
GET 20140311
GET 20140312
GET 20140313
GET 20140314
GET 20140315
GET 20140316
GET 20140317
GET 20140318
GET 20140319
GET 20140320
GET 20140321
GET 20140322
GET 20140323
GET 20140324
GET 20140325
GET 20140326
GET 20140327
GET 20140328
GET 20140329
GET 20140330
GET 20140331
GET 20140401
GET 20140402
GET 20140403
GET 20140404
GET 20140405
GET 20140406
GET 20140407
GET 20140408
GET 20140409
GET 20140410
GET 20140411
GET 20140412
GET 20140413
GET 20140414
GET 20140415
GET 20140416
GET 20140417
GET 20140418
GET 20140419
GET 20140420
GET 20140421
GET 20140422
GET 20140423
GET 20140424
GET 20140425
GET 20140426
GET 20140427
GET 20140428
GET 20140429
GET 20140430
GET 20140501
GET 20140502
GET 20140503
GET 20140504
GET 20140505
GET 20140506
GET 20140507
GET 20140508
GET 20140509
GET 20140510
GET 20140511
GET 20140512
GET 20140513
GET 20140514
GET 20140515
GET 20140516
GET 20140517
GET 20140518
GET 20140519
GET 20140520
GET 20140521
GET 20140522
GET 20140523
GET 20140524
GET 20140525
GET 20140526
GET 20140527
GET 20140528
GET 20140529
GET 20140530
GET 20140531
GET 20140601
GET 20140602
GET 20140603
GET 20140604
GET 20140605
GET 20140606
GET 20140607
GET 20140608
GET 20140609
GET 20140610
GET 20140611
GET 20140612
GET 20140613
GET 20140614
GET 20140615
GET 20140616
GET 20140617
GET 20140618
GET 20140619
GET 20140620
GET 20140621
GET 20140622
GET 20140623
GET 20140624
GET 20140625
GET 20140626
GET 20140627
GET 20140628
GET 20140629
GET 20140630
GET 20140701
GET 20140702
GET 20140703
GET 20140704
GET 20140705
GET 20140706
GET 20140707
GET 20140708
GET 20140709
GET 20140710
GET 20140711
GET 20140712
GET 20140713
GET 20140714
GET 20140715
GET 20140716
GET 20140717
GET 20140718
GET 20140719
GET 20140720
GET 20140721
GET 20140722
GET 20140723
GET 20140724
GET 20140725
GET 20140726
GET 20140727
GET 20140728
GET 20140729
GET 20140730
GET 20140731
GET 20140801
GET 20140802
GET 20140803
GET 20140804
GET 20140805
GET 20140806
GET 20140807
GET 20140808
GET 20140809
GET 20140810
GET 20140811
GET 20140812
GET 20140813
GET 20140814
GET 20140815
GET 20140816
GET 20140817
GET 20140818
GET 20140819
GET 20140820
GET 20140821
GET 20140822
GET 20140823
GET 20140824
GET 20140825
GET 20140826
GET 20140827
GET 20140828
GET 20140829
GET 20140830
GET 20140831
GET 20140901
GET 20140902
GET 20140903
GET 20140904
GET 20140905
GET 20140906
GET 20140907
GET 20140908
GET 20140909
GET 20140910
GET 20140911
GET 20140912
GET 20140913
GET 20140914
GET 20140915
GET 20140916
GET 20140917
GET 20140918
GET 20140919
GET 20140920
GET 20140921
GET 20140922
GET 20140923
GET 20140924
GET 20140925
GET 20140926
GET 20140927
GET 20140928
GET 20140929
GET 20140930
GET 20141001
GET 20141002
GET 20141003
GET 20141004
GET 20141005
GET 20141006
GET 20141007
GET 20141008
GET 20141009
GET 20141010
GET 20141011
GET 20141012
GET 20141013
GET 20141014
GET 20141015
GET 20141016
GET 20141017
GET 20141018
GET 20141019
GET 20141020
GET 20141021
GET 20141022
GET 20141023
GET 20141024
GET 20141025
GET 20141026
GET 20141027
GET 20141028
GET 20141029
GET 20141030
GET 20141031
GET 20141101
GET 20141102
GET 20141103
GET 20141104
GET 20141105
GET 20141106
GET 20141107
GET 20141108
GET 20141109
GET 20141110
GET 20141111
GET 20141112
GET 20141113
GET 20141114
GET 20141115
GET 20141116
GET 20141117
GET 20141118
GET 20141119
GET 20141120
GET 20141121
GET 20141122
GET 20141123
GET 20141124
GET 20141125
GET 20141126
GET 20141127
GET 20141128
GET 20141129
GET 20141130
GET 20141201
GET 20141202
GET 20141203
GET 20141204
GET 20141205
GET 20141206
GET 20141207
GET 20141208
GET 20141209
GET 20141210
GET 20141211
GET 20141212
GET 20141213
GET 20141214
GET 20141215
GET 20141216
GET 20141217
GET 20141218
GET 20141219
GET 20141220
GET 20141221
GET 20141222
GET 20141223
GET 20141224
GET 20141225
GET 20141226
GET 20141227
GET 20141228
GET 20141229
GET 20141230
GET 20141231

In [ ]: