Townsville temperature

7/28/2017 townsville-temp.ipynb

Set up


In [1]:
import os
from urllib.request import urlretrieve
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn')

In [2]:
#https://stackoverflow.com/questions/11936967/text-file-parsing-with-python
def clean_data(filename):
    """Convert a whitespace-delimited text file into a two-column CSV.

    Reads ``filename + '.txt'``, skips its first line, and writes
    ``filename + '.csv'`` with a ``Date,Temp`` header followed by one row
    per remaining input line, the whitespace-separated fields joined by
    commas. Every occurrence of the text ``99999.9`` (presumably the
    source's missing-value marker) is removed so the field is left empty.

    Parameters
    ----------
    filename : str
        Path of the data file without its extension.

    Raises
    ------
    OSError
        If the input file cannot be opened or the output cannot be written.
    """
    # Context managers close both files even if reading or writing fails
    # part-way through.
    with open(filename + '.txt') as inputfile, \
            open(filename + '.csv', 'w') as outputfile:
        outputfile.write('Date,Temp\n')
        # Discard the first line of the input; the default keeps an empty
        # file from raising StopIteration.
        next(inputfile, None)
        for line in inputfile:
            outputfile.write(','.join(line.split()).replace('99999.9', '') + '\n')

In [3]:
def get_data(url, filename, force=False):
    """Make sure the raw download and its cleaned CSV both exist on disk.

    Parameters
    ----------
    url : str
        Location to download the raw text file from.
    filename : str
        Base path (no extension) shared by the ``.txt`` and ``.csv`` files.
    force : bool, optional
        When true, download and clean again even if the files already exist.
    """
    raw_path = filename + '.txt'
    csv_path = filename + '.csv'

    # Step 1: fetch the raw file unless a copy is already present
    download_needed = force or not os.path.exists(raw_path)
    if download_needed:
        urlretrieve(url, raw_path)

    # Step 2: build the CSV unless one is already present
    cleaning_needed = force or not os.path.exists(csv_path)
    if cleaning_needed:
        clean_data(filename)

Get data


In [4]:
# Data source (ACORN-SAT daily minT / maxT files for station 032040):
# http://www.bom.gov.au/climate/change/acorn-sat/#tabs=Data-and-networks

# Base file names (no extension) and download locations for the two series
minFile = 'townsville-min'
minURL = 'http://www.bom.gov.au/climate/change/acorn/sat/data/acorn.sat.minT.032040.daily.txt'
maxFile = 'townsville-max'
maxURL = 'http://www.bom.gov.au/climate/change/acorn/sat/data/acorn.sat.maxT.032040.daily.txt'

# get_data skips the download / cleaning steps whose output already exists
get_data(url=minURL, filename=minFile)
get_data(url=maxURL, filename=maxFile)

In [5]:
# Load both cleaned CSVs with Date as the index, asking pandas to parse it as dates
minData = pd.read_csv('townsville-min.csv', parse_dates=True, index_col='Date')
maxData = pd.read_csv('townsville-max.csv', parse_dates=True, index_col='Date')

# Index-on-index merge (pandas' default inner join); the shared 'Temp'
# column becomes Temp_Min / Temp_Max via the suffixes
data = pd.merge(
    minData,
    maxData,
    left_index=True,
    right_index=True,
    suffixes=('_Min', '_Max'),
)

# Daily range: maximum minus minimum
data['Temp_Diff'] = data['Temp_Max'].sub(data['Temp_Min'])

In [6]:
# Size of the merged frame as (rows, columns)
data.shape


Out[6]:
(27891, 3)

In [7]:
# First five rows: the start of the merged record
data.head()


Out[7]:
Temp_Min Temp_Max Temp_Diff
Date
1940-10-20 18.8 28.9 10.1
1940-10-21 16.7 28.9 12.2
1940-10-22 16.4 28.9 12.5
1940-10-23 15.8 30.7 14.9
1940-10-24 19.8 29.5 9.7

In [8]:
# Last five rows: the end of the merged record
data.tail()


Out[8]:
Temp_Min Temp_Max Temp_Diff
Date
2017-02-24 25.9 32.4 6.5
2017-02-25 24.4 31.9 7.5
2017-02-26 23.6 31.9 8.3
2017-02-27 22.9 32.1 9.2
2017-02-28 23.7 32.2 8.5

In [9]:
# Summary statistics per column; count excludes missing values
data.describe()


Out[9]:
Temp_Min Temp_Max Temp_Diff
count 27819.000000 27864.000000 27807.000000
mean 19.506161 29.152218 9.646175
std 4.819540 2.827920 3.225318
min 0.000000 13.900000 0.500000
25% 16.200000 27.100000 7.300000
50% 20.600000 29.400000 9.000000
75% 23.400000 31.200000 11.700000
max 30.900000 44.500000 24.100000

Exploratory visualisation


In [10]:
def apply_common(title='', ax=None):
    """Apply the notebook's shared formatting to a temperature plot.

    Sets a fixed y-range, the title, both axis labels, and places the
    legend just outside the top-right corner of the axes.

    Parameters
    ----------
    title : str, optional
        Text for the plot title.
    ax : matplotlib Axes, optional
        Axes to format. When omitted, the current pyplot axes
        (``plt.gca()``) are used, so existing calls made right after
        ``ax = <something>.plot()`` in the same cell keep working without
        relying on a global variable named ``ax``.
    """
    if ax is None:
        ax = plt.gca()
    # Same y-range on every plot so the figures can be compared directly
    ax.set_ylim(-5, 45)
    ax.set_title(title)
    ax.set_xlabel('Date')
    ax.set_ylabel('°Centigrade')
    # loc=2 anchors the legend's upper-left corner at x=1.05 in axes
    # coordinates, i.e. to the right of the plotting area
    ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [11]:
# One histogram per column; the trailing semicolon keeps the cell from
# echoing the array-of-Axes repr above the figure
data.hist();


Out[11]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000001A094C0D1D0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001A097E5BA58>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001A097EC27F0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001A097F38630>]], dtype=object)

In [12]:
# Line plot of every column across the full record
ax = data.plot.line()
apply_common(title='All data')



In [13]:
# Scatter of maximum temperature (x) against date (y), labelled so the
# figure can be read on its own.
# NOTE(review): dates are on the y-axis; swap the first two arguments if a
# conventional time-on-x layout was intended.
plt.scatter(data['Temp_Max'], data.index, marker='.')
plt.xlabel('Maximum temperature (°C)')
plt.ylabel('Date')
plt.title('Maximum temperature by date')
plt.show()



In [14]:
# Scatter of minimum temperature (x) against date (y), labelled so the
# figure can be read on its own.
# NOTE(review): dates are on the y-axis; swap the first two arguments if a
# conventional time-on-x layout was intended.
plt.scatter(data['Temp_Min'], data.index, marker='.')
plt.xlabel('Minimum temperature (°C)')
plt.ylabel('Date')
plt.title('Minimum temperature by date')
plt.show()



In [15]:
# Scatter of the max-min temperature difference (x) against date (y),
# labelled so the figure can be read on its own.
# NOTE(review): dates are on the y-axis; swap the first two arguments if a
# conventional time-on-x layout was intended.
plt.scatter(data['Temp_Diff'], data.index, marker='.')
plt.xlabel('Temperature difference, max - min (°C)')
plt.ylabel('Date')
plt.title('Temperature difference by date')
plt.show()



In [16]:
# Keep only rows where every column has a value, so each series handed to
# boxplot is free of NaNs
filtered_data = data.dropna()

# Order here must match the tick labels below
boxplot_data = [
    filtered_data[column]
    for column in ('Temp_Max', 'Temp_Min', 'Temp_Diff')
]

plt.boxplot(boxplot_data)
# Boxes are drawn at positions 1..3; replace the numbers with readable names
plt.xticks([1, 2, 3], ['maximum', 'minimum', 'difference'])
plt.show()



In [17]:
# Mean of each calendar month over the whole record.
# 'M' is the canonical month-end alias; lower-case 'm' is easy to misread
# as minutes and is only accepted through a case-insensitive fallback.
ax = data.resample('M').mean().plot()
apply_common('Monthly mean')



In [18]:
# Resample to a regular daily grid, then smooth with a 365-row rolling mean.
# 'D' is the canonical daily alias (the original used lower-case 'd').
ax = data.resample('D').mean().rolling(365).mean().plot()
apply_common('Rolling 365-day mean')



In [19]:
# Monthly mean of Temp_Max from 2007-03-01 to the end of the record.
# 'M' is the canonical month-end alias (the original used lower-case 'm').
ax = data.loc['2007-3-1':, 'Temp_Max'].resample('M').mean().plot()
apply_common('Ten year monthly mean')



In [20]:
# Monthly mean of Temp_Max from 2016-03-01 to the end of the record.
# 'M' is the canonical month-end alias (the original used lower-case 'm').
ax = data.loc['2016-3-1':, 'Temp_Max'].resample('M').mean().plot()
apply_common('One year monthly mean')



In [21]:
# Weekly mean of Temp_Max over the final year of the record.
# The slice previously started at 2007-2-28, which plotted ten years under a
# 'One year' title; it now starts at 2016-3-1 like the other one-year plots.
# 'W' is the canonical weekly alias (the original used lower-case 'w').
ax = data.loc['2016-3-1':, 'Temp_Max'].resample('W').mean().plot()
apply_common('One year weekly mean')



In [22]:
# Raw daily Temp_Max values from 2016-03-01 to the end of the record
ax = data['Temp_Max'].loc['2016-3-1':].plot()
apply_common(title='One year daily')



In [23]:
#https://www.kaggle.com/miguelferia/visualization-of-temperature-data
#http://benalexkeen.com/resampling-time-series-data-with-pandas/
# Mean of every column per calendar year ('AS' = bins anchored at year start)
ax = data.resample('AS').mean().plot.line()
apply_common(title='Annual summary mean')



In [24]:
# Overlay every calendar year on a shared day-of-year axis, one panel per series.
data['day#'] = data.index.dayofyear
fig, (ax1, ax2, ax3) = plt.subplots(nrows = 3, ncols = 1)
# pd.Grouper replaces pd.TimeGrouper, which was deprecated and later removed
# from pandas; freq='AS' yields one group per calendar year.
for key, grp in data.groupby(pd.Grouper(freq='AS'), group_keys=False):
    # Low alpha lets the overlapping yearly lines show where values cluster
    ax1.plot(grp['day#'], grp['Temp_Max'], alpha=0.35)
    ax2.plot(grp['day#'], grp['Temp_Min'], alpha=0.35)
    ax3.plot(grp['day#'], grp['Temp_Diff'], alpha=0.35)
# Label each panel with the column it shows so the figure stands alone
ax1.set_ylabel('Temp_Max')
ax2.set_ylabel('Temp_Min')
ax3.set_ylabel('Temp_Diff')
ax3.set_xlabel('Day of year')
plt.show()