analysing tabular data


In [ ]:
import numpy

In [ ]:
numpy.loadtxt

In [ ]:
# Keyword arguments must be separated by a comma.
numpy.loadtxt(fname='data/weather-01.csv', delimiter=',')

In [ ]:
# The comma between `fname=...` and `delimiter=...` is required.
numpy.loadtxt(fname='data/weather-01.csv', delimiter=',')

In [ ]:
numpy.loadtxt(fname='data/weather-01.csv',delimiter=',')

variables


In [ ]:
weight_kg=55

In [ ]:
print (weight_kg)

In [ ]:
print('weight in pounds:',weight_kg*2.2)

In [ ]:
numpy.loadtxt(fname='data/weather-01.csv',delimiter=',')

In [ ]:
numpy.loadtxt(fname='data/weather-01.csv',delimiter=',')

In [ ]:
numpy.loadtxt(fname='data/weather-01.csv',delimiter=',')

In [ ]:
%whos

In [ ]:
data=numpy.loadtxt(fname='data/weather-01.csv',delimiter=',')

In [ ]:
%whos

In [ ]:
%whos

In [ ]:
print(data.dtype)

In [ ]:
print(data.shape)

The array is 60 by 40 (60 rows, 40 columns).


In [ ]:
print ("first value in data:",data [0,0])

In [ ]:
print ('A middle value:',data[30,20])

Let's get the first 10 columns for the first 4 rows:

`print(data[0:4, 0:10])`

The slice `0:4` means: start at index 0 and go up to, but not including, index 4.


In [ ]:
print (data[0:4, 0:10])

We don't need to start slicing at 0.


In [ ]:
print (data[5:10,7:15])

We don't even need to include the upper and lower limits.


In [ ]:
smallchunk=data[:3,36:]
print(smallchunk)

arithmetic on arrays


In [ ]:
doublesmallchunk=smallchunk*2.0

In [ ]:
print(doublesmallchunk)

In [ ]:
triplesmallchunk=smallchunk+doublesmallchunk

In [ ]:
print(triplesmallchunk)

In [ ]:
print(numpy.mean(data))

In [ ]:
print (numpy.max(data))

In [ ]:
print (numpy.min(data))

Get the data for the first station (row 0).

A bare `:` is shorthand for "all the columns".


In [ ]:
station_0=data[0,:]

In [ ]:
print(numpy.max(station_0))

We don't need to create *temporary* array slices.

Instead, we can refer to what are called array axes.


In [ ]:
print(numpy.mean(data, axis=0))

In [ ]:
print(numpy.mean(data, axis=1))

`axis=0` gets the mean down each column.

`axis=1` gets the mean across each row, i.e. the mean temperature

for each station over all periods

(see the output above).

Do some simple visualisations.


In [ ]:
import matplotlib.pyplot

In [ ]:
%matplotlib inline

In [ ]:
image=matplotlib.pyplot.imshow(data)

Let's look at the average temperature over time.


In [ ]:
avg_temperature=numpy.mean(data,axis=0)

In [ ]:
avg_plot=matplotlib.pyplot.plot(avg_temperature)

In [ ]:
import numpy

In [ ]:
import matplotlib.pyplot

In [ ]:
%matplotlib inline

In [ ]:
data=numpy.loadtxt(fname='data/weather-01.csv',delimiter=',')

create a wide figure to hold sub plots


In [ ]:
fig=matplotlib.pyplot.figure (figsize=(10.0,3.0))

create placeholders for plots


In [ ]:


In [ ]:
# One wide figure holding three side-by-side panels: the per-column
# mean, minimum and maximum of `data`.
fig = matplotlib.pyplot.figure(figsize=(10.0, 3.0))
subplot1 = fig.add_subplot(1, 3, 1)
subplot2 = fig.add_subplot(1, 3, 2)
subplot3 = fig.add_subplot(1, 3, 3)

subplot1.set_ylabel('average')
subplot1.plot(numpy.mean(data, axis=0))

subplot2.set_ylabel('minimum')
subplot2.plot(numpy.min(data, axis=0))

subplot3.set_ylabel('maximum')
subplot3.plot(numpy.max(data, axis=0))

# Keep the y-axis labels from overlapping the neighbouring panels.
fig.tight_layout()
matplotlib.pyplot.show()

This is fine for small numbers of datasets, but what if we have hundreds or thousands? We need more automation.

loops


In [ ]:
word='notebook'
print (word[4])

See above: note the difference between square brackets (indexing) and normal round brackets (function calls).


In [ ]:
for char in word:
    # The colon at the end of the `for` line and the indentation of the
    # body are both required; the convention is 4 spaces per level.
    # Comments alone do not count as a body, so `pass` keeps the loop valid.
    pass

In [ ]:
for char in word:
    print (char)

reading filenames

get a list of all the filenames from disk


In [ ]:
import glob

`glob` is short for "global": the module finds all file names that match a wildcard pattern.


In [ ]:
print(glob.glob('data/weather*.csv'))

putting it all together


In [ ]:
# Work on the first three weather files, in sorted (reproducible) order.
filenames = sorted(glob.glob('data/weather*.csv'))
filenames = filenames[0:3]

for f in filenames:
    print(f)
    data = numpy.loadtxt(fname=f, delimiter=',')

    # Everything below must stay indented so that it runs once per file.
    fig = matplotlib.pyplot.figure(figsize=(10.0, 3.0))
    subplot1 = fig.add_subplot(1, 3, 1)
    subplot2 = fig.add_subplot(1, 3, 2)
    subplot3 = fig.add_subplot(1, 3, 3)

    subplot1.set_ylabel('average')
    subplot1.plot(numpy.mean(data, axis=0))

    subplot2.set_ylabel('minimum')
    subplot2.plot(numpy.min(data, axis=0))

    subplot3.set_ylabel('maximum')
    subplot3.plot(numpy.max(data, axis=0))

    fig.tight_layout()
    # `show` has to be called; the bare name `matplotlib.pyplot.show` is an
    # expression that does nothing. Calling it here draws each figure
    # directly after its file name is printed.
    matplotlib.pyplot.show()

In [ ]:


In [ ]:
num=37
# Compare num with 100 and report which branch ran.
if num>100:
    print('greater')
else:
    print('not greater')
    # This print is indented under `else`, so it is part of the else branch
    # and only runs when num is not greater than 100.
    print ('done')

In [ ]:
num=107
# Same comparison with a value above 100: only the `if` branch runs.
if num>100:
    print('greater')
else:
    print('not greater')
    # This print is indented under `else`, so it belongs to the else branch;
    # with num=107 that branch is skipped and 'done' is not printed.
    print ('done')

It didn't print "done" because `print ('done')` is indented inside the `else` block, so it only runs when the `else` branch is taken.


In [ ]:
num=-3

# Classify the sign of num. The conditions are tested in order and only the
# first matching branch runs; the final `else` catches everything that the
# tests above did not match.
if num>0:
    print (num, "is positive")
elif num ==0:
    print (num, "is zero")
else:
    print (num, "is negative")

`elif` means "else if"; it is always good to finish a chain with an `else`.


In [ ]:
filenames=sorted(glob.glob('data/weather*.csv'))

In [ ]:
# Work on the first three weather files, in sorted (reproducible) order.
filenames = sorted(glob.glob('data/weather*.csv'))
filenames = filenames[0:3]

for f in filenames:
    print(f)
    # Keep the numeric array returned by loadtxt. Appending `== 0` to this
    # line would replace the readings with an array of True/False values,
    # which breaks both the checks and the plots below.
    data = numpy.loadtxt(fname=f, delimiter=',')

    # Simple sanity checks on the per-column maxima and minima.
    if numpy.max(data, axis=0)[0] == 0 and numpy.max(data, axis=0)[20] == 20:
        print('suspicious looking maxima')
    elif numpy.sum(numpy.min(data, axis=0)) == 0:
        print('minimum adds to zero')
    else:
        print('data looks ok')

    # Everything below must stay indented so that it runs once per file.
    fig = matplotlib.pyplot.figure(figsize=(10.0, 3.0))
    subplot1 = fig.add_subplot(1, 3, 1)
    subplot2 = fig.add_subplot(1, 3, 2)
    subplot3 = fig.add_subplot(1, 3, 3)

    subplot1.set_ylabel('average')
    subplot1.plot(numpy.mean(data, axis=0))

    subplot2.set_ylabel('minimum')
    subplot2.plot(numpy.min(data, axis=0))

    subplot3.set_ylabel('maximum')
    subplot3.plot(numpy.max(data, axis=0))

    fig.tight_layout()
    # `show` has to be called; the bare name is an expression that does nothing.
    matplotlib.pyplot.show()

something went wrong with the above


In [ ]:
def fahr_to_kelvin(temp):
    """Convert a temperature from degrees Fahrenheit to kelvin.

    The value is first rescaled to degrees Celsius, then shifted by 273.15.
    """
    celsius = (temp - 32) * (5 / 9)
    return celsius + 273.15

In [ ]:
print ('freezing point of water:', fahr_to_kelvin(32))

In [ ]:
print ('boiling point of water:', fahr_to_kelvin(212))

using functions


In [ ]:
def analyse(filename):
    """Plot the per-column average, minimum and maximum of one data file.

    Parameters
    ----------
    filename : str
        Path of a comma-separated file readable by ``numpy.loadtxt``.

    The function draws one figure with three panels and returns nothing.
    """
    data = numpy.loadtxt(fname=filename, delimiter=',')

    fig = matplotlib.pyplot.figure(figsize=(10.0, 3.0))
    subplot1 = fig.add_subplot(1, 3, 1)
    subplot2 = fig.add_subplot(1, 3, 2)
    subplot3 = fig.add_subplot(1, 3, 3)

    subplot1.set_ylabel('average')
    subplot1.plot(numpy.mean(data, axis=0))

    subplot2.set_ylabel('minimum')
    subplot2.plot(numpy.min(data, axis=0))

    subplot3.set_ylabel('maximum')
    subplot3.plot(numpy.max(data, axis=0))

    fig.tight_layout()
    matplotlib.pyplot.show()

Unfinished.


In [ ]:
def detect_problems(filename):
    """Read a comma-separated data file and print one line of diagnosis.

    Prints 'suspicious looking maxima' when the maximum of column 0 is 0 and
    the maximum of column 20 is 20, 'minimum adds to zero' when the
    per-column minima sum to zero, and 'data looks ok' otherwise.
    Returns nothing.
    """
    data = numpy.loadtxt(fname=filename, delimiter=',')
    column_maxima = numpy.max(data, axis=0)

    # Pick the message first, then print it once.
    if column_maxima[0] == 0 and column_maxima[20] == 20:
        verdict = 'suspicious looking maxima'
    elif numpy.sum(numpy.min(data, axis=0)) == 0:
        verdict = 'minimum adds to zero'
    else:
        verdict = 'data looks ok'
    print(verdict)

In [ ]:
# Plot and sanity-check at most the first five files in `filenames`.
for f in filenames[:5]:
    print(f)
    analyse(f)
    detect_problems(f)

In [ ]:
def analyse(filename):
    """Plot the per-column average, minimum and maximum of one data file.

    Parameters
    ----------
    filename : str
        Path of a comma-separated file readable by ``numpy.loadtxt``.

    The function draws one figure with three panels and returns nothing.
    """
    data = numpy.loadtxt(fname=filename, delimiter=',')

    fig = matplotlib.pyplot.figure(figsize=(10.0, 3.0))
    subplot1 = fig.add_subplot(1, 3, 1)
    subplot2 = fig.add_subplot(1, 3, 2)
    subplot3 = fig.add_subplot(1, 3, 3)

    subplot1.set_ylabel('average')
    subplot1.plot(numpy.mean(data, axis=0))

    subplot2.set_ylabel('minimum')
    subplot2.plot(numpy.min(data, axis=0))

    subplot3.set_ylabel('maximum')
    subplot3.plot(numpy.max(data, axis=0))

    fig.tight_layout()
    # `show` has to be called; the bare name is an expression that does nothing.
    matplotlib.pyplot.show()

In [ ]:
# Plot and sanity-check at most the first five files in `filenames`.
for f in filenames[:5]:
    print(f)
    analyse(f)
    detect_problems(f)

In [ ]:


In [ ]:


In [ ]:


In [ ]:
help(numpy.loadtxt)

In [ ]:
help(detect_problems)

In [ ]:
def detect_problems(filename):
    """Check a temperature file for known data problems.

    Some of our temperature files have problems. This function reads a
    comma-separated file and reports odd-looking maxima (the maximum of
    column 0 is 0 and the maximum of column 20 is 20) and per-column minima
    that add up to zero. The result is printed; the function does not
    return any data.

    Parameters
    ----------
    filename : str
        Path of a comma-separated file readable by ``numpy.loadtxt``.
    """
    data = numpy.loadtxt(fname=filename, delimiter=',')

    if numpy.max(data, axis=0)[0] == 0 and numpy.max(data, axis=0)[20] == 20:
        print('suspicious looking maxima')
    elif numpy.sum(numpy.min(data, axis=0)) == 0:
        print('minimum adds to zero')
    else:
        print('data looks ok')

In [ ]:
def analyse(filename):
    """Analyse a dataset and plot its per-column average, minimum and maximum.

    Parameters
    ----------
    filename : str
        Path of a comma-separated file readable by ``numpy.loadtxt``.

    The function draws one figure with three panels and returns nothing.
    """
    # The docstring must be the first statement in the function body,
    # otherwise help(analyse) cannot find it.
    data = numpy.loadtxt(fname=filename, delimiter=',')

    fig = matplotlib.pyplot.figure(figsize=(10.0, 3.0))
    subplot1 = fig.add_subplot(1, 3, 1)
    subplot2 = fig.add_subplot(1, 3, 2)
    subplot3 = fig.add_subplot(1, 3, 3)

    subplot1.set_ylabel('average')
    subplot1.plot(numpy.mean(data, axis=0))

    subplot2.set_ylabel('minimum')
    subplot2.plot(numpy.min(data, axis=0))

    subplot3.set_ylabel('maximum')
    subplot3.plot(numpy.max(data, axis=0))

    fig.tight_layout()
    # `show` has to be called; the bare name is an expression that does nothing.
    matplotlib.pyplot.show()

In [ ]: