In [1]:
%matplotlib inline
import logging
import itertools
import json
import os
import pickle
import urllib2
import folium
import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
from datetime import date, datetime
from os import listdir
from os.path import isfile, join
from src.data.parse_dataset import parse_json_files, parse_json_file, get_file_list, parse_dir
from IPython.display import Image
logger = logging.getLogger()
logger.setLevel(logging.INFO)
Set the time period for which data will be collected
In [80]:
start_date = date(2016, 5, 15)
end_date = date(2016, 6, 28)
days = pd.date_range(start=start_date, end=end_date, closed='left')
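With closed='left' the end date itself is excluded, so days covers 15 May through 27 June 2016. A quick illustrative check:
In [ ]:
# closed='left' keeps start_date and drops end_date
print len(days)             # 44
print days[0], days[-1]     # 2016-05-15 00:00:00 2016-06-27 00:00:00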
Download the data from Wunderground
In [43]:
def download(url_string, file_name):
    """Download the given resource to the given file"""
    response = urllib2.urlopen(url_string)
    with open(file_name, "wb") as f:
        f.write(response.read())
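The API occasionally fails on individual requests; a hedged sketch of a retrying wrapper around download() (the retry count and delay are illustrative assumptions, not part of the original):
In [ ]:
def download_with_retries(url_string, file_name, retries=3, delay=5):
    """Illustrative retrying variant of download(); retries/delay are assumed values"""
    for attempt in range(retries):
        try:
            download(url_string, file_name)
            return
        except urllib2.HTTPError as e:
            logger.warning('Attempt %d failed with HTTP %d', attempt + 1, e.code)
            time.sleep(delay)
    raise IOError('Could not download %s after %d attempts' % (url_string, retries))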
In [44]:
path = '/home/jfconavarrete/Documents/Work/Dissertation/spts-uoe/data/raw/weather'
lhr_url = 'http://api.wunderground.com/api/8494fbcae3235601/history_%s/q/UK/London.json'
# iterate through all the days in the period
for i, day in enumerate(days):
    url_string = lhr_url % day.strftime('%Y%m%d')
    file_name = '%s/WEATHER-%s.json' % (path, day.strftime('%Y-%m-%d'))
    logger.info('Downloading %s', url_string)
    download(url_string, file_name)
    # sleep 60 seconds every 10 requests due to API rate limits
    if (i % 10) == 9:
        time.sleep(60)
In [2]:
def parse_weather(json_obj):
    """Parses a Wunderground API history JSON response"""
    return [parse_observation(element) for element in json_obj['history']['observations']]

def parse_observation(observation):
    """Parses a JSON observation object into a dictionary"""
    reading = {
        'Timestamp': observation['utcdate']['pretty'],
        'Temp': observation['tempm'],
        'DewPt': observation['dewptm'],
        'Humidity': observation['hum'],
        'WindSpeed': observation['wspdm'],
        'WindDirD': observation['wdird'],
        'Visibility': observation['vism'],
        'Pressure': observation['pressurem'],
        'WindChill': observation['windchillm'],
        'Precipitation': observation['precipm'],
        'Condition': observation['conds'],
        'Fog': observation['fog'],
        'Rain': observation['rain'],
        'Snow': observation['snow'],
        'Hail': observation['hail'],
        'Thunder': observation['thunder'],
        'Tornado': observation['tornado'],
    }
    return reading
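As an illustration, one day's file can be parsed on its own (the file name follows the naming scheme used in the download step):
In [ ]:
# illustrative: parse a single day's observations
with open('%s/WEATHER-2016-05-15.json' % path) as f:
    day_obs = parse_weather(json.load(f))
print len(day_obs)                   # number of observations that day
print day_obs[0]['Temp'], day_obs[0]['Condition']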
In [3]:
def get_file_date(file_name):
    """Gets the date encoded in the file's name"""
    file_basename = os.path.basename(file_name)
    idx = file_basename.find('-')
    file_date = file_basename[idx + 1:]
    return datetime.strptime(file_date, '%Y-%m-%d.json')
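A quick check against the naming scheme used in the download step:
In [ ]:
# e.g. 'WEATHER-2016-05-15.json' -> 2016-05-15 00:00:00
print get_file_date('WEATHER-2016-05-15.json')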
Convert the raw data to a Pandas DataFrame
In [4]:
records = parse_dir('/home/jfconavarrete/Documents/Work/Dissertation/spts-uoe/data/raw/weather',
parse_weather, sort_fn=get_file_date)
weather_dataset = pd.DataFrame(list(itertools.chain.from_iterable(records)))
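parse_dir returns one list of observation dicts per file; chain.from_iterable flattens them into a single sequence before the DataFrame is built. A toy illustration:
In [ ]:
# toy illustration: flattening a list of lists
print list(itertools.chain.from_iterable([[1, 2], [3], [4, 5]]))    # [1, 2, 3, 4, 5]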
In [5]:
# replace sentinel values with NaN ('T' marks a trace reading; -999/-9999 mark missing readings)
replace_empty = lambda col: col.replace('T', np.nan).replace('N/A', np.nan).replace('', np.nan)
replace_na = lambda col: col.replace('-9999.00', np.nan).replace('-9999', np.nan).replace('-9999.0', np.nan).replace('-999', np.nan)
weather_dataset = weather_dataset.apply(replace_empty).apply(replace_na)
# convert columns to their appropriate datatypes
weather_dataset['Fog'] = weather_dataset['Fog'].astype('int8')
weather_dataset['Hail'] = weather_dataset['Hail'].astype('int8')
weather_dataset['Rain'] = weather_dataset['Rain'].astype('int8')
weather_dataset['Snow'] = weather_dataset['Snow'].astype('int8')
weather_dataset['Tornado'] = weather_dataset['Tornado'].astype('int8')
weather_dataset['Thunder'] = weather_dataset['Thunder'].astype('int8')
weather_dataset['Precipitation'] = weather_dataset['Precipitation'].astype('float32')
weather_dataset['Visibility'] = weather_dataset['Visibility'].astype('float32')
weather_dataset['WindChill'] = weather_dataset['WindChill'].astype('float32')
weather_dataset['WindSpeed'] = weather_dataset['WindSpeed'].astype('float32')
weather_dataset['DewPt'] = weather_dataset['DewPt'].astype('float32')
weather_dataset['Humidity'] = weather_dataset['Humidity'].astype('float32')
weather_dataset['Pressure'] = weather_dataset['Pressure'].astype('float32')
weather_dataset['Temp'] = weather_dataset['Temp'].astype('float32')
weather_dataset['WindDirD'] = weather_dataset['WindDirD'].astype('float32')
weather_dataset['Timestamp'] = pd.to_datetime(weather_dataset['Timestamp'], format='%I:%M %p %Z on %B %d, %Y', errors='raise').dt.tz_localize('UTC')
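The repeated astype calls above can be written more compactly; an equivalent sketch (not meant to be run in addition to the cell above):
In [ ]:
# equivalent, more compact form of the dtype conversions above
flag_cols = ['Fog', 'Hail', 'Rain', 'Snow', 'Tornado', 'Thunder']
float_cols = ['Precipitation', 'Visibility', 'WindChill', 'WindSpeed',
              'DewPt', 'Humidity', 'Pressure', 'Temp', 'WindDirD']
weather_dataset[flag_cols] = weather_dataset[flag_cols].astype('int8')
weather_dataset[float_cols] = weather_dataset[float_cols].astype('float32')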
In [6]:
weather_dataset.sort_values(by=['Timestamp'], inplace=True)
In [7]:
weather_dataset.reset_index(inplace=True, drop=True)
In [8]:
weather_dataset.shape
Out[8]:
In [9]:
weather_dataset.info(memory_usage='deep')
In [10]:
weather_dataset.head()
Out[10]:
In [11]:
weather_dataset.describe()
Out[11]:
In [12]:
weather_dataset.apply(lambda x: x.nunique())
Out[12]:
In [13]:
weather_dataset.isnull().sum()
Out[13]:
In [14]:
weather_dataset.drop(['Precipitation', 'WindChill'], axis=1, inplace=True)
In [15]:
def get_missing_indexes(df, col_name):
    """Gets the indexes of the rows where col_name is missing"""
    return df[df[col_name].isnull()].index

def show_before_and_after(df, indexes, num=1):
    """Shows each missing value along with num rows on either side of it"""
    # build an index window around each missing value and concatenate the slices
    missing_slices = [(x - num, x + num) for x in indexes]
    return pd.concat([df.loc[s[0]:s[1]] for s in missing_slices])
In [16]:
missing_indexes = get_missing_indexes(weather_dataset, 'Visibility')
show_before_and_after(weather_dataset, missing_indexes)
Out[16]:
In [17]:
# use linear interpolation
weather_dataset['Visibility'].interpolate(inplace=True)
show_before_and_after(weather_dataset, missing_indexes)
Out[17]:
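Series.interpolate defaults to linear interpolation, filling each gap with evenly spaced values between the neighbouring readings; a toy example:
In [ ]:
# toy illustration of linear interpolation over missing values
s = pd.Series([10.0, np.nan, 14.0, np.nan, np.nan, 20.0])
print s.interpolate().tolist()    # [10.0, 12.0, 14.0, 16.0, 18.0, 20.0]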
In [18]:
# use linear interpolation
missing_indexes = get_missing_indexes(weather_dataset, 'Temp')
weather_dataset['Temp'].interpolate(inplace=True)
show_before_and_after(weather_dataset, missing_indexes)
Out[18]:
The interpolated values look reasonable.
In [19]:
# use linear interpolation
missing_indexes = get_missing_indexes(weather_dataset, 'Humidity')
weather_dataset['Humidity'].interpolate(inplace=True)
show_before_and_after(weather_dataset, missing_indexes)
Out[19]:
The interpolated values look reasonable.
In [20]:
# a pressure reading of zero is physically implausible, so treat it as missing
weather_dataset['Pressure'] = weather_dataset['Pressure'].replace(0.0, np.nan)
missing_indexes = get_missing_indexes(weather_dataset, 'Pressure')
weather_dataset['Pressure'].interpolate(inplace=True)
show_before_and_after(weather_dataset, missing_indexes)
Out[20]:
The interpolated values look reasonable.
In [21]:
missing_indexes = get_missing_indexes(weather_dataset, 'DewPt')
weather_dataset['DewPt'].interpolate(inplace=True)
show_before_and_after(weather_dataset, missing_indexes)
Out[21]:
The interpolated values look reasonable.
In [22]:
# use linear interpolation
missing_indexes = get_missing_indexes(weather_dataset, 'WindSpeed')
weather_dataset['WindSpeed'].interpolate(inplace=True)
show_before_and_after(weather_dataset, missing_indexes)
Out[22]:
The interpolated values look reasonable.
In [23]:
# use linear interpolation
missing_indexes = get_missing_indexes(weather_dataset, 'WindDirD')
weather_dataset['WindDirD'].interpolate(inplace=True)
show_before_and_after(weather_dataset, missing_indexes)
Out[23]:
In [24]:
missing_indexes = get_missing_indexes(weather_dataset, 'Condition')
show_before_and_after(weather_dataset, missing_indexes, 1)
Out[24]:
In [25]:
weather_dataset['Condition'].isnull().sum()
Out[25]:
In [26]:
weather_dataset['Condition'].value_counts()
Out[26]:
In [27]:
ax = sns.stripplot(data=weather_dataset, x='Timestamp', y='Condition', orient='h')
print ax.get_xlim()
ax.set_xlim((736097.65688844072, 736130.55144489254))
Out[27]:
In [28]:
g = sns.PairGrid(data=weather_dataset, hue='Condition', vars=['DewPt', 'Humidity', 'Pressure', 'Temp', 'Visibility', 'WindDirD'])
g = g.map(plt.scatter)
Could we fit a classification model to impute Condition? Yes.
Is it worth it? Probably not: Condition is essentially a 'proxy' feature that summarises the other weather variables.
Conclusion: drop the column.
In [29]:
weather_dataset.drop(['Condition'], axis=1, inplace=True)
In [30]:
# set_index is not in-place here, so this only previews the timestamp-indexed view
weather_dataset.set_index('Timestamp').head()
Out[30]:
In [31]:
weather_dataset.describe()
Out[31]:
In [32]:
weather_dataset.info(memory_usage='deep')
In [33]:
with open('data/parsed/weather_dataset_utc.p', 'wb') as f:
    pickle.dump(weather_dataset, f)
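The pickled DataFrame can be reloaded later with pandas directly:
In [ ]:
# reload the cleaned dataset, e.g. in a downstream notebook
weather_dataset = pd.read_pickle('data/parsed/weather_dataset_utc.p')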