In [46]:
import os
from urllib.request import urlretrieve
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8' and 3.8 removed
# the old names, so prefer the new name and fall back to the old one on older versions.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
In [2]:
#https://stackoverflow.com/questions/11936967/text-file-parsing-with-python
def clean_data(filename):
    """Convert the whitespace-delimited ``<filename>.txt`` into ``<filename>.csv``.

    The first line of the text file is skipped and a fixed ``Date,Temp`` header
    is written instead. Every remaining line is split on whitespace and
    re-joined with commas; occurrences of ``99999.9`` (presumably the data
    set's missing-value marker) are replaced by an empty string, so those
    readings become empty CSV fields.

    Args:
        filename: Path of the data file without its extension.

    Raises:
        OSError: If the input cannot be opened or the output cannot be written.
    """
    # Context managers guarantee both files are closed even if a read/write fails.
    with open(filename + '.txt') as infile, open(filename + '.csv', 'w') as outfile:
        outfile.write('Date,Temp\n')
        # Skip the original header line (a no-op when the file is empty).
        next(infile, None)
        # Stream line by line instead of loading the whole file into memory.
        for line in infile:
            outfile.write(','.join(line.split()).replace('99999.9', '') + '\n')
In [3]:
def get_data(url, filename, force=False):
    """Make sure both the raw download and its cleaned CSV exist on disk.

    Args:
        url: Location the raw text file is fetched from.
        filename: Base path (no extension) used for ``.txt`` and ``.csv``.
        force: When true, download and convert again even if the files exist.
    """
    raw_path = filename + '.txt'
    csv_path = filename + '.csv'

    # Fetch the raw file unless a cached copy can be reused.
    if force or not os.path.exists(raw_path):
        urlretrieve(url, raw_path)

    # Build the CSV unless a cached copy can be reused.
    if force or not os.path.exists(csv_path):
        clean_data(filename)
In [4]:
#http://www.bom.gov.au/climate/change/acorn-sat/#tabs=Data-and-networks
# Daily maximum-temperature text file (station id 094029 per the URL), cached locally
# under the base name in maxFile.
maxURL = 'http://www.bom.gov.au/climate/change/acorn/sat/data/acorn.sat.maxT.094029.daily.txt'
maxFile = 'hobart-max'
get_data(maxURL, maxFile)
# Read the CSV that get_data() produced; the path is derived from maxFile so the
# download name and the file being read cannot drift apart.
data = pd.read_csv(maxFile + '.csv', index_col='Date', parse_dates=True)
In [5]:
# Dimensions of the loaded frame as (rows, columns).
data.shape
Out[5]:
In [6]:
# Preview the first rows of the frame (head() shows five by default).
data.head()
Out[6]:
In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()
Out[7]:
In [8]:
# Measures of variability, computed per column.
# Variance: the average squared deviation from the mean
# (pandas uses the sample estimate, ddof=1, by default).
column_variance = data.var()
print(column_variance)
# Standard deviation: the square root of the variance.
column_std = data.std()
print(column_std)
In [9]:
def apply_common(title='', ax=None):
    """Apply the shared title, axis labels and legend placement to a plot.

    Args:
        title: Text for the axes title.
        ax: Axes to decorate. Defaults to the current pyplot axes, so the
            existing ``ax = data.plot(); apply_common(...)`` usage keeps
            working without depending on a global variable named ``ax``.
    """
    if ax is None:
        ax = plt.gca()
    # Optional fixed y-range, left disabled: ax.set_ylim(-5, 45)
    ax.set_title(title)
    ax.set_xlabel('Date')
    ax.set_ylabel('°Centigrade')
    # Put the legend's upper-left corner at (1.05, 1) in axes coordinates,
    # i.e. just outside the right-hand edge of the plot area.
    ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
In [10]:
# Line plot of every column against the Date index, then apply the shared
# title, axis labels and legend placement.
ax = data.plot()
apply_common('All data')
In [11]:
# Scatter of the Temp readings over time: the date index goes on the x-axis
# (independent variable) and the temperature on the y-axis. The arguments were
# previously passed the other way round, which put time on the vertical axis.
plt.scatter(data.index, data['Temp'], marker='.')
plt.xlabel('Date')
plt.ylabel('Temp')
plt.show()
In [12]:
# Histogram of each numeric column in `data`.
# Author's reading of the distribution: unimodal, right-skewed.
data.hist()
Out[12]:
In [13]:
# A box plot summarizes a data set using five statistics while also plotting unusual observations.
# The box spans the middle 50% of the data; the line inside the box is the median (not the mean).
# The total length of the box is the interquartile range (IQR).
# Each whisker reaches to the furthest observation within 1.5 x IQR of the box edge (matplotlib's default).
# Observations beyond the whiskers are drawn individually as outliers.
# Missing readings are dropped first so NaN values are not passed to boxplot().
filtered_data = data.dropna()
boxplot_data = [filtered_data['Temp']]
plt.boxplot(boxplot_data)
# The single box sits at x position 1; label it.
plt.xticks([1], ['max temp'])
plt.show()
Bar plot, segmented bar plot, standardized segmented bar plot, mosaic plot, pie chart.
In [14]:
# Add the day of the week of each index date as a new column (pandas encodes
# Monday=0 ... Sunday=6). This mutates `data` in place, so every later cell that
# uses `data` sees the extra 'Day' column.
data['Day'] = data.index.dayofweek
In [15]:
# Preview the first rows again, now that the 'Day' column has been added.
data.head()
Out[15]:
In [16]:
# Summary statistics again; the numeric 'Day' column is now summarized alongside 'Temp'.
data.describe()
Out[16]:
In [23]:
# NOTE(review): unfinished experiment. The group-by and bar-plot lines are
# commented out, so this cell currently only prints an empty line. It looks like
# a per-weekday bar plot was intended; finish or remove it (TODO confirm).
#x = data.groupby(data.Day)
print()
#plt.bar(x,7)
#plt.show()
In [43]:
# Arithmetic mean of the Temp column (pandas skips NaN values by default).
temp_mean = data['Temp'].mean()
print('Mean is {0}'.format(temp_mean))
In [44]:
# Median (50th percentile) of the Temp column.
temp_median = data['Temp'].median()
print('Median is {0}'.format(temp_median))
In [45]:
# Series.mode() returns a Series because several values can tie for most frequent.
# Format the individual values so the message does not contain the Series repr
# (index labels and dtype line).
temp_modes = data.Temp.mode().tolist()
print('Mode is {0}'.format(', '.join(str(value) for value in temp_modes)))
In [42]:
# Smallest and largest Temp readings.
temp_low = data['Temp'].min()
temp_high = data['Temp'].max()
print('Range is {0} to {1}'.format(temp_low, temp_high))
In [52]:
# Interquartile range (75th minus 25th percentile); NaN readings are dropped
# before the values are handed to scipy.
temp_observed = data['Temp'].dropna()
temp_iqr = stats.iqr(temp_observed)
print('IQR is {0}'.format(temp_iqr))
In [53]:
# Sample variance of the Temp column (pandas default, ddof=1).
temp_variance = data['Temp'].var()
print('Variance is {0}'.format(temp_variance))
In [54]:
# Sample standard deviation of the Temp column: the square root of the variance.
temp_std = data['Temp'].std()
print('Standard deviation is {0}'.format(temp_std))
In [55]:
# Histogram of the Temp column only, as the basis for the shape notes that follow.
data.Temp.hist()
Out[55]:
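Symmetry: the distribution is unimodal and roughly bell-shaped (approximately normal).
Skewness: the distribution is very mildly right-skewed (positively skewed), i.e. the tail extends to the right, towards larger values. Mean > median.
Kurtosis: is the distribution mildly platykurtic (flatter than a normal distribution)?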
In [62]:
# Omnibus normality test (combines skewness and kurtosis); the null hypothesis is
# that the sample comes from a normal distribution. NaN readings are dropped first.
stats.normaltest(data.Temp.dropna())
Out[62]:
In [61]:
# Tests the null hypothesis that the sample's skewness matches that of a normal
# distribution. NaN readings are dropped first.
stats.skewtest(data.Temp.dropna())
Out[61]:
In [59]:
# Tests the null hypothesis that the sample's kurtosis matches that of a normal
# distribution. NaN readings are dropped first.
stats.kurtosistest(data.Temp.dropna())
Out[59]:
In [ ]: