Analysing tabular data

We are going to use a library called numpy.

(Note the spelling: numpy, not nump.)


In [2]:
import numpy

In [3]:
# NOTE: this call fails with FileNotFoundError - there is no file at 'data/weather-0.1.csv'.
# The load that succeeds later in this notebook uses 'Data/weather-01.csv'.
numpy.loadtxt(fname='data/weather-0.1.csv',delimiter=',')


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-3-f50cb4440abc> in <module>()
----> 1 numpy.loadtxt(fname='data/weather-0.1.csv',delimiter=',')

C:\Users\David\Anaconda3\lib\site-packages\numpy\lib\npyio.py in loadtxt(fname, dtype, comments, delimiter, converters, skiprows, usecols, unpack, ndmin)
    803                 fh = iter(open(fname, 'U'))
    804             else:
--> 805                 fh = iter(open(fname))
    806         else:
    807             fh = iter(fname)

FileNotFoundError: [Errno 2] No such file or directory: 'data/weather-0.1.csv'

In [3]:
# NOTE: this line is a SyntaxError - the keyword name is missing before '=' (it should be fname=...).
# The module name is also misspelt ('numpu' instead of 'numpy').
numpu.loadtxt(='data/weather-01.csv', delimiter = ',')


  File "<ipython-input-3-59dc7a225797>", line 1
    numpu.loadtxt(='data/weather-01.csv', delimiter = ',')
                  ^
SyntaxError: invalid syntax

In [4]:
# Load the comma-separated file into a numpy array.
# The result is only displayed here; it is not stored in a variable.
numpy.loadtxt(fname='Data/weather-01.csv', delimiter=',')


Out[4]:
array([[ 0.,  0.,  1., ...,  3.,  0.,  0.],
       [ 0.,  1.,  2., ...,  1.,  0.,  1.],
       [ 0.,  1.,  1., ...,  2.,  1.,  1.],
       ..., 
       [ 0.,  1.,  1., ...,  1.,  1.,  1.],
       [ 0.,  0.,  0., ...,  0.,  2.,  0.],
       [ 0.,  0.,  1., ...,  1.,  1.,  0.]])

Variables


In [8]:
# Assign the value 55 to the name weight_kg
weight_kg = 55

In [9]:
# Display the value currently held by weight_kg
print(weight_kg)


55

In [10]:
# 2.2 is the (approximate) kilograms-to-pounds conversion factor.
# The trailing ...01 in the printed result is floating-point rounding.
print('weight in pounds:', weight_kg * 2.2)


weight in pounds: 121.00000000000001

In [11]:
# Re-assigning replaces the old value (55); weight_kg now holds a float
weight_kg = 57.5

In [13]:
# Same conversion as before, using the updated weight_kg
print('New weight:', weight_kg * 2.2)


New weight: 126.50000000000001

In [14]:
# %whos is an IPython magic that lists the variables currently defined, with their type and value
%whos


Variable    Type      Data/Info
-------------------------------
numpy       module    <module 'numpy' from 'C:\<...>ges\\numpy\\__init__.py'>
weight_kg   float     57.5

In [6]:
# Load the file again, this time keeping the resulting array in the variable 'data'
data = numpy.loadtxt(fname='Data/weather-01.csv', delimiter=',')

In [16]:
# Display the array (numpy abbreviates large arrays with '...')
print(data)


[[ 0.  0.  1. ...,  3.  0.  0.]
 [ 0.  1.  2. ...,  1.  0.  1.]
 [ 0.  1.  1. ...,  2.  1.  1.]
 ..., 
 [ 0.  1.  1. ...,  1.  1.  1.]
 [ 0.  0.  0. ...,  0.  2.  0.]
 [ 0.  0.  1. ...,  1.  1.  0.]]

In [17]:
# Find out what kind of object 'data' is
print(type(data))


<class 'numpy.ndarray'>

In [18]:
# List the defined variables again; 'data' now appears with its shape and element type
%whos


Variable    Type       Data/Info
--------------------------------
data        ndarray    60x40: 2400 elems, type `float64`, 19200 bytes
numpy       module     <module 'numpy' from 'C:\<...>ges\\numpy\\__init__.py'>
weight_kg   float      57.5

In [19]:
# Find out the data type of the array's elements
print(data.dtype)


float64

In [20]:
# Find out the shape of the array: (number of rows, number of columns)
print(data.shape)


(60, 40)

In [21]:
# This is 60 rows by 40 columns

In [22]:
# Getting a single number out of the array: index it with [row, column]
print("First value in data:", data[0, 0])


First value in data: 0.0

In [26]:
# Indexing starts at 0 because an index counts the number of positions from the start:
# the first element is at index 0 and the last of n elements is at index n-1

In [27]:
# Pick out the element at row index 30, column index 20
print('A middle value:', data[30, 20])


A middle value: 13.0

In [28]:
# 'A middle value:' is not a variable: it is a text label that print shows before the value of data[30,20]

In [30]:
# First 10 columns of the first 4 rows: taking a section of an array like this is called a slice

In [31]:
print(data[0:4, 0:10])
# Rows: start at index 0 and go up to, but not including, 4.
# Columns: start at index 0 and go up to, but not including, 10.
# The result has 4 rows and 10 columns.


[[ 0.  0.  1.  3.  1.  2.  4.  7.  8.  3.]
 [ 0.  1.  2.  1.  2.  1.  3.  2.  2.  6.]
 [ 0.  1.  1.  3.  3.  2.  6.  2.  5.  9.]
 [ 0.  0.  2.  0.  4.  2.  2.  1.  6.  7.]]

In [32]:
# A slice doesn't have to start at 0
print(data[5:10, 7:15])


[[  1.   6.   4.   7.   6.   6.   9.   9.]
 [  5.   5.   8.   6.   5.  11.   9.   4.]
 [  3.   5.   3.   7.   8.   8.   5.  10.]
 [  5.   5.   8.   2.   4.  11.  12.  10.]
 [  3.   5.   8.   6.   8.  12.   5.  13.]]

In [33]:
# Number of rows/columns in a slice = upper bound minus lower bound

In [34]:
# Either bound of a slice can be left out: a missing lower bound means "from the first
# row/column" and a missing upper bound means "up to the last row/column".
smallchunk = data[:3, 36:]
print(smallchunk)
# Rows from 0 up to (but not including) 3, and columns from 36 to the end.


[[ 2.  3.  0.  0.]
 [ 1.  1.  0.  1.]
 [ 2.  2.  1.  1.]]

In [35]:
# Arithmetic on arrays
doublessmallchunk = smallchunk * 2.0
# multiplies every element of smallchunk by 2.0

In [36]:
print(doublessmallchunk)
# Tip: pressing Tab auto-completes names


[[ 4.  6.  0.  0.]
 [ 2.  2.  0.  2.]
 [ 4.  4.  2.  2.]]

In [38]:
triplesmallchunk = smallchunk + doublessmallchunk
# Adding two arrays of the same shape adds them element by element;
# here the result is the same as multiplying smallchunk by 3

In [39]:
# Display the element-wise sum
print(triplesmallchunk)


[[ 6.  9.  0.  0.]
 [ 3.  3.  0.  3.]
 [ 6.  6.  3.  3.]]

In [40]:
# Mean of all the values in the array
print(numpy.mean(data))


6.14875

In [41]:
# print just displays a value; it doesn't create a new variable

In [42]:
# Largest value anywhere in the array
print(numpy.max(data))


20.0

In [43]:
# Smallest value anywhere in the array
print(numpy.min(data))


0.0

In [44]:
# get a set of data for the first weather station
station_0 = data[0, :]
# i.e. the first row, all columns

In [45]:
# Display the one-dimensional array holding the first row
print(station_0)


[  0.   0.   1.   3.   1.   2.   4.   7.   8.   3.   3.   3.  10.   5.   7.
   4.   7.   7.  12.  18.   6.  13.  11.  11.   7.   7.   4.   6.   8.   8.
   4.   4.   5.   7.   3.   4.   2.   3.   0.   0.]

In [46]:
# Largest value in the first row
print(numpy.max(station_0))


18.0

In [47]:
# we don't need to create 'temporary' array slices
# we can refer to what we call array axes

In [48]:
# e.g. the mean taken along axis 0, which gives one value per column
print(numpy.mean(data, axis=0))


[  0.           0.45         1.11666667   1.75         2.43333333   3.15
   3.8          3.88333333   5.23333333   5.51666667   5.95         5.9
   8.35         7.73333333   8.36666667   9.5          9.58333333
  10.63333333  11.56666667  12.35        13.25        11.96666667
  11.03333333  10.16666667  10.           8.66666667   9.15         7.25
   7.33333333   6.58333333   6.06666667   5.95         5.11666667   3.6
   3.3          3.56666667   2.48333333   1.5          1.13333333
   0.56666667]

In [49]:
# The mean taken along axis 1, which gives one value per row
print(numpy.mean(data, axis=1))


[ 5.45   5.425  6.1    5.9    5.55   6.225  5.975  6.65   6.625  6.525
  6.775  5.8    6.225  5.75   5.225  6.3    6.55   5.7    5.85   6.55
  5.775  5.825  6.175  6.1    5.8    6.425  6.05   6.025  6.175  6.55
  6.175  6.35   6.725  6.125  7.075  5.725  5.925  6.15   6.075  5.75
  5.975  5.725  6.3    5.9    6.75   5.925  7.225  6.15   5.95   6.275  5.7
  6.1    6.825  5.975  6.725  5.7    6.25   6.4    7.05   5.9  ]

In [51]:
# axes are the array's dimensions: numpy.mean(data, axis=0) averages down the rows, giving one mean per column - the mean T for each time
# numpy.mean(data, axis=1) averages across the columns, giving one mean per row - the mean T of each station

In [52]:
# Visualisations
# matplotlib gives you matlab like plotting functions

In [9]:
import matplotlib.pyplot
# matplotlib is massive so just import small parts

In [10]:
%matplotlib inline
# plots are rendered inline, inside the notebook, rather than in a separate window

In [55]:
# Draw the whole 2-D array as an image, with colour representing each value
image = matplotlib.pyplot.imshow(data)



In [57]:
# This is a heat map of the data array. Don't know what it represents though

In [58]:
# Look at the average T over time: one mean per column of data
avg_Temp = numpy.mean(data, axis=0)

In [59]:
# Line plot of the per-column means
avg_plot = matplotlib.pyplot.plot(avg_Temp)



In [7]:
# Smallest value in each column of data
min_Temp = numpy.min(data, axis=0)

In [13]:
# Largest value in each column of data
max_Temp = numpy.max(data, axis=0)

In [11]:
# Line plot of the per-column minima
min_plot = matplotlib.pyplot.plot(min_Temp)



In [14]:
# Line plot of the per-column maxima
max_plot = matplotlib.pyplot.plot(max_Temp)



In [15]:
# Two plot calls in the same cell draw both lines on one graph
max_plot = matplotlib.pyplot.plot(max_Temp)
min_plot = matplotlib.pyplot.plot(min_Temp)
#plots on one graph



In [ ]: