Analysing tabular data

We are going to use a library called NumPy (imported as `numpy`) to load and analyse the data.


In [63]:
import numpy

In [64]:
# Read the comma-separated file into an array; the result is shown as the cell output
numpy.loadtxt(fname='data/weather-01.csv', delimiter=',')


Out[64]:
array([[ 0.,  0.,  1., ...,  3.,  0.,  0.],
       [ 0.,  1.,  2., ...,  1.,  0.,  1.],
       [ 0.,  1.,  1., ...,  2.,  1.,  1.],
       ..., 
       [ 0.,  1.,  1., ...,  1.,  1.,  1.],
       [ 0.,  0.,  0., ...,  0.,  2.,  0.],
       [ 0.,  0.,  1., ...,  1.,  1.,  0.]])

Variables


In [65]:
# Bind the name weight_kg to the integer value 55
weight_kg = 55

In [66]:
# Show the value currently bound to weight_kg
print(weight_kg)


55

In [67]:
# Convert kilograms to pounds (factor 2.2); the product is a float, so the
# printed value can show floating-point rounding in the last digits
print('Weight in pounds: ', weight_kg * 2.2)


Weight in pounds:  121.00000000000001

In [68]:
# Assigning again replaces the value previously held by weight_kg
weight_kg = 57

In [69]:
# A name can be rebound to a value of a different type (here a float instead of an int)
weight_kg = 57.5

In [70]:
# Recompute the weight in pounds from the updated weight_kg
print('New weight: ', weight_kg * 2.2)


New weight:  126.50000000000001

In [71]:
# IPython magic: list the names currently defined in the kernel, with their type and summary info
%whos


Variable           Type         Data/Info
-----------------------------------------
avg_plot           list         n=1
avg_temperature    ndarray      40: 40 elems, type `float64`, 320 bytes
data               ndarray      60x40: 2400 elems, type `float64`, 19200 bytes
doublesmallchunk   ndarray      3x4: 12 elems, type `float64`, 96 bytes
image              AxesImage    AxesImage(54,36;334.8x223.2)
matplotlib         module       <module 'matplotlib' from<...>matplotlib\\__init__.py'>
max_plot           list         n=1
max_temp           float64      20.0
min_plot           list         n=1
min_temp           float64      0.0
numpy              module       <module 'numpy' from 'C:\<...>ges\\numpy\\__init__.py'>
smallchunk         ndarray      3x4: 12 elems, type `float64`, 96 bytes
station_0          ndarray      40: 40 elems, type `float64`, 320 bytes
triplesmallchunk   ndarray      3x4: 12 elems, type `float64`, 96 bytes
weight_kg          float        57.5

In [72]:
# Load the CSV file again, this time keeping the array in a variable so it can be reused
data = numpy.loadtxt(fname='data/weather-01.csv', delimiter=',')

In [73]:
# Evaluating the bare name displays the array as the cell's output
data


Out[73]:
array([[ 0.,  0.,  1., ...,  3.,  0.,  0.],
       [ 0.,  1.,  2., ...,  1.,  0.,  1.],
       [ 0.,  1.,  1., ...,  2.,  1.,  1.],
       ..., 
       [ 0.,  1.,  1., ...,  1.,  1.,  1.],
       [ 0.,  0.,  0., ...,  0.,  2.,  0.],
       [ 0.,  0.,  1., ...,  1.,  1.,  0.]])

In [74]:
# Check what kind of object the name data refers to
print(type(data))


<class 'numpy.ndarray'>

In [75]:
# Without print(), the type is shown as the cell's output value
type(data)


Out[75]:
numpy.ndarray

In [76]:
# IPython magic: list the names currently defined in the kernel, with their type and summary info
%whos


Variable           Type         Data/Info
-----------------------------------------
avg_plot           list         n=1
avg_temperature    ndarray      40: 40 elems, type `float64`, 320 bytes
data               ndarray      60x40: 2400 elems, type `float64`, 19200 bytes
doublesmallchunk   ndarray      3x4: 12 elems, type `float64`, 96 bytes
image              AxesImage    AxesImage(54,36;334.8x223.2)
matplotlib         module       <module 'matplotlib' from<...>matplotlib\\__init__.py'>
max_plot           list         n=1
max_temp           float64      20.0
min_plot           list         n=1
min_temp           float64      0.0
numpy              module       <module 'numpy' from 'C:\<...>ges\\numpy\\__init__.py'>
smallchunk         ndarray      3x4: 12 elems, type `float64`, 96 bytes
station_0          ndarray      40: 40 elems, type `float64`, 320 bytes
triplesmallchunk   ndarray      3x4: 12 elems, type `float64`, 96 bytes
weight_kg          float        57.5

In [77]:
# Find out the type of the elements stored in the array
print(data.dtype)


float64

In [78]:
# Find out the shape of the array, reported as (rows, columns)
print(data.shape)


(60, 40)

In [79]:
# The shape (60, 40) means 60 rows by 40 columns

In [80]:
# Get a single number out of the array.
# Indices start at 0 (unlike R, where they start at 1), so [0, 0] is the
# first row, first column.
print('First value in data:', data[0, 0])


First value in data: 0.0

In [81]:
# Pick a value from the interior of the array: row index 30, column index 20
print('A middle value: ', data[30, 20])


A middle value:  13.0

In [82]:
# Let's get the first 10 columns for the first 4 rows
print(data[0:4, 0:10]) # a slice starts at 0 and goes up to, but doesn't include, 4 (i.e. rows 0-3; in R this would be 1:4)


[[ 0.  0.  1.  3.  1.  2.  4.  7.  8.  3.]
 [ 0.  1.  2.  1.  2.  1.  3.  2.  2.  6.]
 [ 0.  1.  1.  3.  3.  2.  6.  2.  5.  9.]
 [ 0.  0.  2.  0.  4.  2.  2.  1.  6.  7.]]

In [83]:
# Slices don't have to start at 0
print(data[5:10, 7:15])


[[  1.   6.   4.   7.   6.   6.   9.   9.]
 [  5.   5.   8.   6.   5.  11.   9.   4.]
 [  3.   5.   3.   7.   8.   8.   5.  10.]
 [  5.   5.   8.   2.   4.  11.  12.  10.]
 [  3.   5.   8.   6.   8.  12.   5.  13.]]

In [84]:
# We don't even need to include the upper and lower bounds:
# a missing lower bound means "from the start", a missing upper bound means "to the end"
smallchunk = data[:3, 36:]
print(smallchunk)


[[ 2.  3.  0.  0.]
 [ 1.  1.  0.  1.]
 [ 2.  2.  1.  1.]]

In [85]:
# Arithmetic on arrays is applied element by element
doublesmallchunk = smallchunk * 2.0

In [86]:
# Each element is twice the corresponding element of smallchunk
print(doublesmallchunk)


[[ 4.  6.  0.  0.]
 [ 2.  2.  0.  2.]
 [ 4.  4.  2.  2.]]

In [87]:
# Adding two arrays of the same shape adds their corresponding elements
triplesmallchunk = smallchunk + doublesmallchunk

In [88]:
# Each element is three times the corresponding element of smallchunk
print(triplesmallchunk)


[[ 6.  9.  0.  0.]
 [ 3.  3.  0.  3.]
 [ 6.  6.  3.  3.]]

In [89]:
# Mean of every value in the array
print(numpy.mean(data))


6.14875

In [90]:
# Without print(), the cell output shows the value's repr, which can display more digits
numpy.mean(data)


Out[90]:
6.1487499999999997

In [91]:
# Largest value anywhere in the array
print(numpy.max(data))


20.0

In [92]:
# Smallest value anywhere in the array
print(numpy.min(data))


0.0

In [93]:
# Get the data for the first station.
# In this data set the rows are weather stations and the columns are time intervals.
station_0 = data[0, :] # a bare ':' selects every column

In [94]:
# Largest value in the first station's row
print(numpy.max(station_0))


18.0

In [95]:
# We don't need to create 'temporary' array slices.
# Instead, we can refer to what we call array axes.

In [96]:
# axis=1 averages across the columns, giving one mean per row
print(numpy.mean(data, axis=1))


[ 5.45   5.425  6.1    5.9    5.55   6.225  5.975  6.65   6.625  6.525
  6.775  5.8    6.225  5.75   5.225  6.3    6.55   5.7    5.85   6.55
  5.775  5.825  6.175  6.1    5.8    6.425  6.05   6.025  6.175  6.55
  6.175  6.35   6.725  6.125  7.075  5.725  5.925  6.15   6.075  5.75
  5.975  5.725  6.3    5.9    6.75   5.925  7.225  6.15   5.95   6.275  5.7
  6.1    6.825  5.975  6.725  5.7    6.25   6.4    7.05   5.9  ]

In [97]:
# Do some simple visualisations

In [98]:
import matplotlib.pyplot

In [100]:
# IPython magic: render matplotlib figures inline, directly below the cell
%matplotlib inline

In [101]:
# Render the 2-D array as an image (a heat map of its values)
image = matplotlib.pyplot.imshow(data)



In [102]:
# Let's look at the average temperature over time:
# axis=0 averages down the rows, giving one mean per column
avg_temperature = numpy.mean(data, axis=0)

In [103]:
# Line plot of the values in avg_temperature against their index
avg_plot = matplotlib.pyplot.plot(avg_temperature)



In [ ]:
# Task: produce maximum and minimum plots

In [107]:
# Per-column extremes: axis=0 collapses the rows, leaving one value per column
max_temp = numpy.max(data, axis=0)
min_temp = numpy.min(data, axis=0)

In [108]:
# Line plot of the values in min_temp against their index
min_plot = matplotlib.pyplot.plot(min_temp)



In [109]:
# Line plot of the values in max_temp against their index
max_plot = matplotlib.pyplot.plot(max_temp)



In [111]:
# Draw the minimum and maximum curves on the same axes.
# Each line is labelled and a legend is added so the two curves can be told
# apart; the trailing ';' stops the Legend object's repr from being shown as
# the cell output.
min_plot = matplotlib.pyplot.plot(min_temp, label='min')
max_plot = matplotlib.pyplot.plot(max_temp, label='max')
matplotlib.pyplot.legend();



In [ ]: