Analysing tabular data

We are going to use a library called numpy.



In [2]:

    
import numpy



In [3]:

    
# NOTE: this call fails with FileNotFoundError (see the traceback below): there is no
# 'data/weather-0.1.csv'. The call that works later in this notebook reads
# 'Data/weather-01.csv'.
numpy.loadtxt(fname='data/weather-0.1.csv',delimiter=',')









    



---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-3-f50cb4440abc> in <module>()
----> 1 numpy.loadtxt(fname='data/weather-0.1.csv',delimiter=',')

C:\Users\David\Anaconda3\lib\site-packages\numpy\lib\npyio.py in loadtxt(fname, dtype, comments, delimiter, converters, skiprows, usecols, unpack, ndmin)
    803                 fh = iter(open(fname, 'U'))
    804             else:
--> 805                 fh = iter(open(fname))
    806         else:
    807             fh = iter(fname)

FileNotFoundError: [Errno 2] No such file or directory: 'data/weather-0.1.csv'



In [3]:

    
# NOTE: this line is a SyntaxError: the keyword name is missing before '=' (it should be
# fname=...), and 'numpu' is a misspelling of 'numpy'. The corrected call is in the next cell.
numpu.loadtxt(='data/weather-01.csv', delimiter = ',')









    



  File "<ipython-input-3-59dc7a225797>", line 1
    numpu.loadtxt(='data/weather-01.csv', delimiter = ',')
                  ^
SyntaxError: invalid syntax



In [4]:

    
# Read the comma-separated file into an array; as the last expression in the cell,
# the resulting array is displayed as the cell's output.
numpy.loadtxt('Data/weather-01.csv', delimiter=',')









    Out[4]:





array([[ 0.,  0.,  1., ...,  3.,  0.,  0.],
       [ 0.,  1.,  2., ...,  1.,  0.,  1.],
       [ 0.,  1.,  1., ...,  2.,  1.,  1.],
       ..., 
       [ 0.,  1.,  1., ...,  1.,  1.,  1.],
       [ 0.,  0.,  0., ...,  0.,  2.,  0.],
       [ 0.,  0.,  1., ...,  1.,  1.,  0.]])

Variables



In [8]:

    
# Store a value under the name weight_kg so later cells can refer to it
weight_kg = 55



In [9]:

    
# Show the value currently stored in weight_kg
print(weight_kg)



In [10]:

    
# Convert to pounds with the factor 2.2 and print the result after a text label
print('weight in pounds:', 2.2 * weight_kg)









    



weight in pounds: 121.00000000000001



In [11]:

    
# Re-assigning replaces the old value; weight_kg is now a float
weight_kg = 57.5



In [13]:

    
# Same conversion as before, now using the updated weight_kg
print('New weight:', 2.2 * weight_kg)









    



New weight: 126.50000000000001



In [14]:

    
# IPython magic: list every variable currently defined, with its type and a summary of its value
%whos









    



Variable    Type      Data/Info
-------------------------------
numpy       module    <module 'numpy' from 'C:\<...>ges\\numpy\\__init__.py'>
weight_kg   float     57.5



In [6]:

    
# Load the CSV again, this time keeping the array in a variable so later cells can reuse it
data = numpy.loadtxt('Data/weather-01.csv', delimiter=',')



In [16]:

    
# Print the array (numpy abbreviates large arrays with '...')
print(data)









    



[[ 0.  0.  1. ...,  3.  0.  0.]
 [ 0.  1.  2. ...,  1.  0.  1.]
 [ 0.  1.  1. ...,  2.  1.  1.]
 ..., 
 [ 0.  1.  1. ...,  1.  1.  1.]
 [ 0.  0.  0. ...,  0.  2.  0.]
 [ 0.  0.  1. ...,  1.  1.  0.]]



In [17]:

    
# Ask Python what kind of object 'data' refers to
print(type(data))









    



<class 'numpy.ndarray'>



In [18]:

    
# IPython magic: list the defined variables again; 'data' now appears with its shape and element type
%whos









    



Variable    Type       Data/Info
--------------------------------
data        ndarray    60x40: 2400 elems, type `float64`, 19200 bytes
numpy       module     <module 'numpy' from 'C:\<...>ges\\numpy\\__init__.py'>
weight_kg   float      57.5



In [19]:

    
# Find out the data type of the array's elements
print(data.dtype)









    



float64



In [20]:

    
# Find out the shape of the array: (number of rows, number of columns)
print(data.shape)



In [21]:

    
# This is 60 rows by 40 columns



In [22]:

    
# Getting a single number out of the array: index it with [row, column]
print("First value in data:", data[0, 0])









    



First value in data: 0.0



In [26]:

    
# An index counts the offset from the start of the array, i.e. the first element is at index 0
# and the last of n elements is at index n-1



In [27]:

    
# Read the element at row index 30, column index 20 and print it after a text label
print('A middle value:', data[30, 20])









    



A middle value: 13.0



In [28]:

    
# 'A middle value:' is only a text label passed to print; data[30,20] reads one element from the
# 'data' array (row index 30, column index 20). No new variable is created.



In [30]:

    
# First 10 columns of the first 4 rows: taking a section of an array like this is called a slice



In [31]:

    
# Rows: start at index 0 and go up to, but not including, 4.
# Columns: start at index 0 and go up to, but not including, 10.
# The result has 4 rows and 10 columns.
print(data[0:4, 0:10])









    



[[ 0.  0.  1.  3.  1.  2.  4.  7.  8.  3.]
 [ 0.  1.  2.  1.  2.  1.  3.  2.  2.  6.]
 [ 0.  1.  1.  3.  3.  2.  6.  2.  5.  9.]
 [ 0.  0.  2.  0.  4.  2.  2.  1.  6.  7.]]



In [32]:

    
# A slice doesn't have to start at 0: rows 5-9 and columns 7-14
print(data[5:10, 7:15])









    



[[  1.   6.   4.   7.   6.   6.   9.   9.]
 [  5.   5.   8.   6.   5.  11.   9.   4.]
 [  3.   5.   3.   7.   8.   8.   5.  10.]
 [  5.   5.   8.   2.   4.  11.  12.  10.]
 [  3.   5.   8.   6.   8.  12.   5.  13.]]



In [33]:

    
# Number of rows/columns in a slice = stop index minus start index (e.g. 5:10 gives 5 rows, 7:15 gives 8 columns)



In [34]:

    
# Either bound of a slice can be left out: a missing start means "from the first row/column"
# and a missing stop means "through the last row/column".
# Here: rows 0 up to (but not including) 3, and columns 36 through to the end.
smallchunk = data[:3, 36:]
print(smallchunk)









    



[[ 2.  3.  0.  0.]
 [ 1.  1.  0.  1.]
 [ 2.  2.  1.  1.]]



In [35]:

    
# Arithmetic on arrays: multiplying by a number multiplies every element of smallchunk by 2.0
doublessmallchunk = 2.0 * smallchunk



In [36]:

    
# (Tip: pressing Tab auto-completes long names such as this one.)
print(doublessmallchunk)









    



[[ 4.  6.  0.  0.]
 [ 2.  2.  0.  2.]
 [ 4.  4.  2.  2.]]



In [38]:

    
# Adding two arrays of the same shape adds them element by element;
# here the result is the same as multiplying smallchunk by 3.
triplesmallchunk = doublessmallchunk + smallchunk



In [39]:

    
# Check the element-wise sum
print(triplesmallchunk)









    



[[ 6.  9.  0.  0.]
 [ 3.  3.  0.  3.]
 [ 6.  6.  3.  3.]]



In [40]:

    
# Mean of every element in the array (a single number)
print(numpy.mean(data))



In [41]:

    
# print only displays a value; it doesn't store it in a new variable



In [42]:

    
# Largest value anywhere in the array
print(numpy.max(data))



In [43]:

    
# Smallest value anywhere in the array
print(numpy.min(data))

0.0



In [44]:

    
# Data for the first weather station: row 0, every column
station_0 = data[0, :]



In [45]:

    
# One row of the array: a 1-D array with one value per column
print(station_0)









    



[  0.   0.   1.   3.   1.   2.   4.   7.   8.   3.   3.   3.  10.   5.   7.
   4.   7.   7.  12.  18.   6.  13.  11.  11.   7.   7.   4.   6.   8.   8.
   4.   4.   5.   7.   3.   4.   2.   3.   0.   0.]



In [46]:

    
# Largest value recorded in the first row
print(numpy.max(station_0))



In [47]:

    
# we don't need to create 'temporary' array slices to get a statistic for each row or column;
# instead we can tell the function which array axis to work along



In [48]:

    
# e.g. the mean along axis 0: one value per column
print(numpy.mean(data, axis=0))









    



[  0.           0.45         1.11666667   1.75         2.43333333   3.15
   3.8          3.88333333   5.23333333   5.51666667   5.95         5.9
   8.35         7.73333333   8.36666667   9.5          9.58333333
  10.63333333  11.56666667  12.35        13.25        11.96666667
  11.03333333  10.16666667  10.           8.66666667   9.15         7.25
   7.33333333   6.58333333   6.06666667   5.95         5.11666667   3.6
   3.3          3.56666667   2.48333333   1.5          1.13333333
   0.56666667]



In [49]:

    
# The mean along axis 1: one value per row
print(numpy.mean(data, axis=1))









    



[ 5.45   5.425  6.1    5.9    5.55   6.225  5.975  6.65   6.625  6.525
  6.775  5.8    6.225  5.75   5.225  6.3    6.55   5.7    5.85   6.55
  5.775  5.825  6.175  6.1    5.8    6.425  6.05   6.025  6.175  6.55
  6.175  6.35   6.725  6.125  7.075  5.725  5.925  6.15   6.075  5.75
  5.975  5.725  6.3    5.9    6.75   5.925  7.225  6.15   5.95   6.275  5.7
  6.1    6.825  5.975  6.725  5.7    6.25   6.4    7.05   5.9  ]



In [51]:

    
# Axes are the array's dimensions. axis=0 runs down the rows, so the mean along axis=0 collapses the rows and gives
# one mean per column (mean T for each time). axis=1 runs across the columns, so the mean along axis=1 gives
# one mean per row (mean T of each station).



In [52]:

    
# Visualisations
# matplotlib gives you MATLAB-like plotting functions



In [9]:

    
import matplotlib.pyplot
# matplotlib is massive so just import small parts



In [10]:

    
%matplotlib inline
# IPython magic: figures are rendered inline in the notebook, directly below the cell that creates them



In [55]:

    
# Draw the 2-D array as an image: one coloured cell per array element
image = matplotlib.pyplot.imshow(data)



In [57]:

    
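# Heat map of the array: one coloured cell per element, with the colour showing the value. Rows of the image are
# rows of 'data' (stations) and columns are columns of 'data' (times). TODO: confirm what quantity the values measure.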



In [58]:

    
# Look at average T over time: the mean along axis 0 gives one value per column
avg_Temp = numpy.mean(data, axis=0)



In [59]:

    
# Line plot of the per-column averages (x axis is the column index)
avg_plot = matplotlib.pyplot.plot(avg_Temp)



In [7]:

    
# Minimum along axis 0: the smallest value in each column
min_Temp = numpy.min(data, axis=0)



In [13]:

    
# Maximum along axis 0: the largest value in each column
max_Temp = numpy.max(data, axis=0)



In [11]:

    
# Line plot of the per-column minima (x axis is the column index)
min_plot = matplotlib.pyplot.plot(min_Temp)



In [14]:

    
# Line plot of the per-column maxima (x axis is the column index)
max_plot = matplotlib.pyplot.plot(max_Temp)



In [15]:

    
# Two plot calls in the same cell draw onto the same axes, so both curves share one graph.
# Each line is labelled and a legend is added so the two curves can be told apart.
max_plot = matplotlib.pyplot.plot(max_Temp, label='max')
min_plot = matplotlib.pyplot.plot(min_Temp, label='min')
# The trailing ';' stops the notebook echoing the Legend object's repr
matplotlib.pyplot.legend();



In [ ]: