Analysing tabular data

We are going to use a LIBRARY called nampy


In [5]:
import numpy

In [6]:
numpy.loadtxt(fname='data/weather-01.csv', delimiter = ',')


Out[6]:
array([[ 0.,  0.,  1., ...,  3.,  0.,  0.],
       [ 0.,  1.,  2., ...,  1.,  0.,  1.],
       [ 0.,  1.,  1., ...,  2.,  1.,  1.],
       ..., 
       [ 0.,  1.,  1., ...,  1.,  1.,  1.],
       [ 0.,  0.,  0., ...,  0.,  2.,  0.],
       [ 0.,  0.,  1., ...,  1.,  1.,  0.]])

Variables


In [7]:
weight_kg=55

In [8]:
print [weight_kg]


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-8-e78c484dffc2> in <module>()
----> 1 print [weight_kg]

TypeError: 'builtin_function_or_method' object is not subscriptable

In [9]:
print (weight_kg)


55

In [10]:
print ('weight in pounds: ', weight_kg * 2.2 )


weight in pounds:  121.00000000000001

In [11]:
weight_kg=57.5

In [12]:
print ('New weight: ', weight_kg * 2.2)


New weight:  126.50000000000001

In [13]:
%whos


Variable    Type      Data/Info
-------------------------------
numpy       module    <module 'numpy' from 'C:\<...>ges\\numpy\\__init__.py'>
weight_kg   float     57.5

In [14]:
data=numpy.loadtxt(fname='data/weather-01.csv', delimiter = ',')

In [15]:
print (data)


[[ 0.  0.  1. ...,  3.  0.  0.]
 [ 0.  1.  2. ...,  1.  0.  1.]
 [ 0.  1.  1. ...,  2.  1.  1.]
 ..., 
 [ 0.  1.  1. ...,  1.  1.  1.]
 [ 0.  0.  0. ...,  0.  2.  0.]
 [ 0.  0.  1. ...,  1.  1.  0.]]

In [16]:
print(type(data))


<class 'numpy.ndarray'>

In [17]:
#Find out the data type
print (data.dtype)


float64

In [18]:
#find out the shape
print (data.shape)


(60, 40)

In [15]:
#This is 60 rows * 40 columns

In [19]:
#Getting a single number out of the array
print ("First value in data: ", data [0, 0])


First value in data:  0.0

In [20]:
print ('A middle value: ', data[30, 20])


A middle value:  13.0

In [21]:
# Letsget the 1st 10 columns for the first 4 rows
print(data[0:4, 0:10])
# Start at index 0 and go up to BUT NOT INCLUDING index 4


[[ 0.  0.  1.  3.  1.  2.  4.  7.  8.  3.]
 [ 0.  1.  2.  1.  2.  1.  3.  2.  2.  6.]
 [ 0.  1.  1.  3.  3.  2.  6.  2.  5.  9.]
 [ 0.  0.  2.  0.  4.  2.  2.  1.  6.  7.]]

In [22]:
#We don't need to start slicingf at 0
print (data[5:10, 7:15])


[[  1.   6.   4.   7.   6.   6.   9.   9.]
 [  5.   5.   8.   6.   5.  11.   9.   4.]
 [  3.   5.   3.   7.   8.   8.   5.  10.]
 [  5.   5.   8.   2.   4.  11.  12.  10.]
 [  3.   5.   8.   6.   8.  12.   5.  13.]]

In [23]:
#We don't even need to include the UPPER or LOWER bounds
smallchunck = data [:3, 36:]
print (smallchunck)


[[ 2.  3.  0.  0.]
 [ 1.  1.  0.  1.]
 [ 2.  2.  1.  1.]]

In [24]:
#Arithmetic on arrays
doublesmallchunck = smallchunck * 2.0

In [25]:
print (doublesmallchunck)


[[ 4.  6.  0.  0.]
 [ 2.  2.  0.  2.]
 [ 4.  4.  2.  2.]]

In [26]:
triplesmallchunck = smallchunck + doublesmallchunck

In [27]:
print(triplesmallchunck)


[[ 6.  9.  0.  0.]
 [ 3.  3.  0.  3.]
 [ 6.  6.  3.  3.]]

In [28]:
print (numpy.mean(data))


6.14875

In [29]:
print (numpy.max(data))


20.0

In [30]:
print (numpy.min(data))


0.0

In [31]:
#Get a set of data for the first station 0 means everyting from the first row and : means all the columns
station_0 = data [0, :]

In [32]:
print (station_0)


[  0.   0.   1.   3.   1.   2.   4.   7.   8.   3.   3.   3.  10.   5.   7.
   4.   7.   7.  12.  18.   6.  13.  11.  11.   7.   7.   4.   6.   8.   8.
   4.   4.   5.   7.   3.   4.   2.   3.   0.   0.]

In [33]:
print (numpy.max(station_0))


18.0

In [34]:
#We don't need to create 'temporary' array slices
#We can refer to what we call array axes

In [35]:
# axis = 0 gets the mean down each column, so the mean temperature for each recording period
print (numpy.mean(data, axis = 0))


[  0.           0.45         1.11666667   1.75         2.43333333   3.15
   3.8          3.88333333   5.23333333   5.51666667   5.95         5.9
   8.35         7.73333333   8.36666667   9.5          9.58333333
  10.63333333  11.56666667  12.35        13.25        11.96666667
  11.03333333  10.16666667  10.           8.66666667   9.15         7.25
   7.33333333   6.58333333   6.06666667   5.95         5.11666667   3.6
   3.3          3.56666667   2.48333333   1.5          1.13333333
   0.56666667]

In [36]:
# axis = 1 gets the mean ACROSS each row, so the mean temperature for each station for all the periods
print (numpy.mean(data, axis = 1))


[ 5.45   5.425  6.1    5.9    5.55   6.225  5.975  6.65   6.625  6.525
  6.775  5.8    6.225  5.75   5.225  6.3    6.55   5.7    5.85   6.55
  5.775  5.825  6.175  6.1    5.8    6.425  6.05   6.025  6.175  6.55
  6.175  6.35   6.725  6.125  7.075  5.725  5.925  6.15   6.075  5.75
  5.975  5.725  6.3    5.9    6.75   5.925  7.225  6.15   5.95   6.275  5.7
  6.1    6.825  5.975  6.725  5.7    6.25   6.4    7.05   5.9  ]

In [37]:
# Do some simple visualisations

In [38]:
import matplotlib.pyplot

In [39]:
%matplotlib inline

In [40]:
image = matplotlib.pyplot.imshow(data)



In [38]:
#Let's look at the average temperature over time
avg_temperature = numpy.means(data, axis = 0)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-38-b5e73d2ff5c1> in <module>()
      1 #Let's look at the average temperature over time
----> 2 avg_temperature = numpy.means(data, axis = 0)

AttributeError: module 'numpy' has no attribute 'means'

In [41]:
#Let's look at the average temperature over time
avg_temperature = numpy.mean(data, axis = 0)

In [42]:
avg_plot = matplotlib.pyplot.plot(avg.temperature)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-42-e6f300a3394d> in <module>()
----> 1 avg_plot = matplotlib.pyplot.plot(avg.temperature)

NameError: name 'avg' is not defined

In [43]:
avg_plot = matplotlib.pyplot.plot(avg_temperature)


Task

Produce maximum and minimum plots of this data What do you think?


In [44]:
max_temperature = numpy.max (data, axis=0)

In [45]:
max_temperature = numpy.max (data, axis=0)
min_temperature = numpy.min (data, axis=0)

In [46]:
max_plot = matplotlib.pyplot.plot(max_temperature)
min_plot = matplotlib.pyplot.plot(min_temperature)



In [ ]: