Analysing Tabular Data

We are going to use a library called NumPy.


In [84]:
import numpy

In [85]:
# Read the comma-separated file into an array. Because the call is the last
# expression in the cell, the resulting array is echoed as the cell output.
numpy.loadtxt('data/weather-01.csv', delimiter=',')


Out[85]:
array([[ 0.,  0.,  1., ...,  3.,  0.,  0.],
       [ 0.,  1.,  2., ...,  1.,  0.,  1.],
       [ 0.,  1.,  1., ...,  2.,  1.,  1.],
       ..., 
       [ 0.,  1.,  1., ...,  1.,  1.,  1.],
       [ 0.,  0.,  0., ...,  0.,  2.,  0.],
       [ 0.,  0.,  1., ...,  1.,  1.,  0.]])

Variables


In [86]:
# Assign the integer 55 to the name weight_kg.
weight_kg = 55

In [87]:
# Show the value currently bound to weight_kg.
print(weight_kg)


55

In [88]:
# Convert on the fly using a factor of 2.2 pounds per kilogram.
print('Weight in pounds:', 2.2 * weight_kg)


Weight in pounds: 121.00000000000001

In [89]:
# Rebind weight_kg to a new value; the name now refers to a float.
weight_kg = 57.5

In [90]:
# Same conversion as before, now using the updated weight_kg.
print('New weight:', 2.2 * weight_kg)


New weight: 126.50000000000001

In [91]:
%whos


Variable              Type         Data/Info
--------------------------------------------
avg_plot              list         n=1
avg_plot_max          list         n=1
avg_temperature       ndarray      40: 40 elems, type `float64`, 320 bytes
avg_temperature_max   ndarray      40: 40 elems, type `float64`, 320 bytes
avg_temperature_min   ndarray      40: 40 elems, type `float64`, 320 bytes
data                  ndarray      60x40: 2400 elems, type `float64`, 19200 bytes
doublesmallchunk      ndarray      3x4: 12 elems, type `float64`, 96 bytes
image                 AxesImage    AxesImage(148.92,36;144.96x217.44)
matplotlib            module       <module 'matplotlib' from<...>/matplotlib/__init__.py'>
numpy                 module       <module 'numpy' from '/Li<...>kages/numpy/__init__.py'>
smallchunk            ndarray      3x4: 12 elems, type `float64`, 96 bytes
station_0             ndarray      40: 40 elems, type `float64`, 320 bytes
triplesmallchunk      ndarray      3x4: 12 elems, type `float64`, 96 bytes
weight_kg             float        57.5

In [92]:
# Load the CSV again, this time keeping the array under the name `data`
# so that later cells can work with it.
data = numpy.loadtxt('data/weather-01.csv', delimiter=',')

In [93]:
# Print the array; numpy abbreviates large arrays with "...".
print(data)


[[ 0.  0.  1. ...,  3.  0.  0.]
 [ 0.  1.  2. ...,  1.  0.  1.]
 [ 0.  1.  1. ...,  2.  1.  1.]
 ..., 
 [ 0.  1.  1. ...,  1.  1.  1.]
 [ 0.  0.  0. ...,  0.  2.  0.]
 [ 0.  0.  1. ...,  1.  1.  0.]]

In [94]:
# Ask Python what kind of object `data` refers to.
print(type(data))


<class 'numpy.ndarray'>

In [95]:
%whos


Variable              Type         Data/Info
--------------------------------------------
avg_plot              list         n=1
avg_plot_max          list         n=1
avg_temperature       ndarray      40: 40 elems, type `float64`, 320 bytes
avg_temperature_max   ndarray      40: 40 elems, type `float64`, 320 bytes
avg_temperature_min   ndarray      40: 40 elems, type `float64`, 320 bytes
data                  ndarray      60x40: 2400 elems, type `float64`, 19200 bytes
doublesmallchunk      ndarray      3x4: 12 elems, type `float64`, 96 bytes
image                 AxesImage    AxesImage(148.92,36;144.96x217.44)
matplotlib            module       <module 'matplotlib' from<...>/matplotlib/__init__.py'>
numpy                 module       <module 'numpy' from '/Li<...>kages/numpy/__init__.py'>
smallchunk            ndarray      3x4: 12 elems, type `float64`, 96 bytes
station_0             ndarray      40: 40 elems, type `float64`, 320 bytes
triplesmallchunk      ndarray      3x4: 12 elems, type `float64`, 96 bytes
weight_kg             float        57.5

In [96]:
# Find out the type of the elements stored inside the array
print(data.dtype)


float64

In [97]:
# Find out the shape of the array, reported as (rows, columns)
print(data.shape)


(60, 40)

In [98]:
# The shape (60, 40) means 60 rows by 40 columns

In [99]:
# Get a single number out of the array by indexing with [row, column]
print('First value in data:', data[0, 0])


First value in data: 0.0

In [100]:
# Index an element from the interior of the array.
# NOTE(review): the label below is misspelled ("mimddle"); it is kept as-is so
# the cell still matches its recorded output. Fix it when the cell is re-run.
print('A mimddle value:', data[30, 20])


A mimddle value: 13.0

In [101]:
# Let's get the first 10 columns for the first 4 rows.
# A slice such as 0:4 starts at index 0 and goes up to, but does not include, index 4.
print(data[0:4, 0:10])


[[ 0.  0.  1.  3.  1.  2.  4.  7.  8.  3.]
 [ 0.  1.  2.  1.  2.  1.  3.  2.  2.  6.]
 [ 0.  1.  1.  3.  3.  2.  6.  2.  5.  9.]
 [ 0.  0.  2.  0.  4.  2.  2.  1.  6.  7.]]

In [102]:
# A slice does not have to start at 0: rows 5-9 and columns 7-14
print(data[5:10, 7:15])


[[  1.   6.   4.   7.   6.   6.   9.   9.]
 [  5.   5.   8.   6.   5.  11.   9.   4.]
 [  3.   5.   3.   7.   8.   8.   5.  10.]
 [  5.   5.   8.   2.   4.  11.  12.  10.]
 [  3.   5.   8.   6.   8.  12.   5.  13.]]

In [103]:
# Either bound of a slice may be left out: a missing lower bound means
# "from the start" and a missing upper bound means "to the end".
smallchunk = data[:3, 36:]
print(smallchunk)


[[ 2.  3.  0.  0.]
 [ 1.  1.  0.  1.]
 [ 2.  2.  1.  1.]]

In [104]:
# Arithmetic on arrays is applied element by element:
# every value in smallchunk is multiplied by 2.0.
doublesmallchunk = 2.0 * smallchunk

In [105]:
# Each entry is twice the matching entry of smallchunk.
print(doublesmallchunk)


[[ 4.  6.  0.  0.]
 [ 2.  2.  0.  2.]
 [ 4.  4.  2.  2.]]

In [106]:
# Adding two arrays of the same shape adds them element by element.
triplesmallchunk = doublesmallchunk + smallchunk

In [107]:
# Each entry is three times the matching entry of smallchunk.
print(triplesmallchunk)


[[ 6.  9.  0.  0.]
 [ 3.  3.  0.  3.]
 [ 6.  6.  3.  3.]]

In [108]:
# Mean over every element of the array (no axis given).
print(numpy.mean(data))


6.14875

In [109]:
# Largest single value anywhere in the array.
print(numpy.max(data))


20.0

In [110]:
# Smallest single value anywhere in the array.
print(numpy.min(data))


0.0

In [111]:
# Get the data for the first station: row 0, every column
station_0 = data[0, :]

In [112]:
# Largest value recorded in that single row.
print(numpy.max(station_0))


18.0

In [113]:
# We don't need to create these 'temporary' array slices
# We can refer to what we call array axes instead

In [114]:
# axis=0 takes the mean down each column, so this is the mean temperature
# for each recording period
print(numpy.mean(data, axis=0))


[  0.           0.45         1.11666667   1.75         2.43333333   3.15
   3.8          3.88333333   5.23333333   5.51666667   5.95         5.9
   8.35         7.73333333   8.36666667   9.5          9.58333333
  10.63333333  11.56666667  12.35        13.25        11.96666667
  11.03333333  10.16666667  10.           8.66666667   9.15         7.25
   7.33333333   6.58333333   6.06666667   5.95         5.11666667   3.6
   3.3          3.56666667   2.48333333   1.5          1.13333333
   0.56666667]

In [115]:
# axis = 1 gets the mean across each row, so the mean temperature
# for each station (one value per row of data)
print (numpy.mean(data, axis = 1))


[ 5.45   5.425  6.1    5.9    5.55   6.225  5.975  6.65   6.625  6.525
  6.775  5.8    6.225  5.75   5.225  6.3    6.55   5.7    5.85   6.55
  5.775  5.825  6.175  6.1    5.8    6.425  6.05   6.025  6.175  6.55
  6.175  6.35   6.725  6.125  7.075  5.725  5.925  6.15   6.075  5.75
  5.975  5.725  6.3    5.9    6.75   5.925  7.225  6.15   5.95   6.275  5.7
  6.1    6.825  5.975  6.725  5.7    6.25   6.4    7.05   5.9  ]

In [116]:
# Do some simple Visualisations

In [117]:
import matplotlib.pyplot

In [118]:
%matplotlib inline

In [119]:
# Render the 2-D array as a colour-mapped image; the returned image object is
# kept in `image`.
image = matplotlib.pyplot.imshow(data)



In [120]:
%whos


Variable              Type         Data/Info
--------------------------------------------
avg_plot              list         n=1
avg_plot_max          list         n=1
avg_temperature       ndarray      40: 40 elems, type `float64`, 320 bytes
avg_temperature_max   ndarray      40: 40 elems, type `float64`, 320 bytes
avg_temperature_min   ndarray      40: 40 elems, type `float64`, 320 bytes
data                  ndarray      60x40: 2400 elems, type `float64`, 19200 bytes
doublesmallchunk      ndarray      3x4: 12 elems, type `float64`, 96 bytes
image                 AxesImage    AxesImage(148.92,36;144.96x217.44)
matplotlib            module       <module 'matplotlib' from<...>/matplotlib/__init__.py'>
numpy                 module       <module 'numpy' from '/Li<...>kages/numpy/__init__.py'>
smallchunk            ndarray      3x4: 12 elems, type `float64`, 96 bytes
station_0             ndarray      40: 40 elems, type `float64`, 320 bytes
triplesmallchunk      ndarray      3x4: 12 elems, type `float64`, 96 bytes
weight_kg             float        57.5

In [121]:
# Let's look at the average temperature over time:
# one mean per column, i.e. per recording period
avg_temperature = numpy.mean(data, axis=0)

In [122]:
# Draw the per-period means as a line plot; plot() returns a list of the line
# objects it created.
avg_plot = matplotlib.pyplot.plot(avg_temperature)



In [123]:
# Task:
# Produce maximum and minimum plots of this data
# What do you think?

In [124]:
# Highest value in each column (per recording period).
# NOTE(review): despite the avg_ prefix this holds maxima, not averages.
avg_temperature_max = numpy.max(data, axis=0)

In [125]:
# Line plot of the per-period maxima.
avg_plot_max = matplotlib.pyplot.plot(avg_temperature_max)



In [126]:
# Lowest value in each column (per recording period).
# NOTE(review): despite the avg_ prefix this holds minima, not averages.
avg_temperature_min = numpy.min(data, axis=0)

In [128]:
# Line plot of the per-period minima.
avg_plot_min = matplotlib.pyplot.plot(avg_temperature_min)



In [130]:
# Overlay the minimum and maximum curves on one set of axes.
# Passing the two arrays as plot(min, max) would use the minima as x-coordinates
# and the maxima as y-coordinates, drawing max-versus-min instead of two curves.
# Give each series an explicit x (the recording-period index) so that a single
# plot() call draws both lines and returns both line objects.
recording_period = numpy.arange(avg_temperature_min.size)
avg_combine_plot = matplotlib.pyplot.plot(
    recording_period, avg_temperature_min,
    recording_period, avg_temperature_max,
)



In [ ]: