array.array) types which are thin wrappers over C arrays. The example will use the idea of computing statistics on a person's scores in a class, as this is a pretty standard and understandable example.
In [29]:
import matplotlib.pyplot as plt
%matplotlib inline
In [13]:
import numpy as np
### Start by creating a numpy array of random scores to run the calculations on
# Use np.random.random(50) if floats are wanted, or np.random.random((50, 3)) for a 50 x 3 matrix.
# We'll assume the person was passing, so scores fall in 60-100. 51 elements (an odd count) are
# generated so that the median example further down has a single middle element.
# np.random.random_integers is deprecated; np.random.randint is its replacement, but randint's
# upper bound is exclusive, so 101 is passed to keep 100 as a possible score.
# (Call np.random.seed(<some int>) first if the same scores are wanted on every run.)
scores = np.random.randint(60, 101, 51)
print(scores)
print(type(scores))  # This will show this is not a list but an ndarray
In [14]:
### A few simple summary statistics: mean, highest score and lowest score
# These values are needed again later, so they are kept in a dict instead of being recomputed each time.
s_attribs = {
    "mean": np.mean(scores),
    "max": np.max(scores),
    "min": np.min(scores),
}
# Pull the three statistics out in the order the sentence mentions them.
print(
    "Your average grade was {}, highest score was {}, lowest was {}".format(
        *(s_attribs[stat] for stat in ("mean", "max", "min"))
    )
)
In [18]:
### Now to find the median value - first the manual way, then the numpy way
# scores.shape is a tuple of the array's dimension sizes; for this 1-D array shape[0] is the
# same number len(scores) would give. shape is used here just to show that option, since len()
# appears in many other examples.
elements = scores.shape[0]
# Sorts the array in place, so scores stays sorted for every later cell.
# Use np.sort(scores) (or sorted(scores) for a plain list) to get a sorted copy instead.
scores.sort()
middle = elements // 2
if elements % 2 == 0:
    # Even count: there is no single middle element, so average the two values around the midpoint
    print("The median was {}".format((scores[middle - 1] + scores[middle]) / 2))
else:
    # Odd count: the middle element is exactly the median
    print("The median was {}".format(scores[middle]))
# Or just let numpy do it - np.median does not need the array to be sorted first.
# The result is added to the dictionary alongside the other statistics.
s_attribs["median"] = np.median(scores)
print("The median was {}".format(s_attribs["median"]))
In [17]:
from collections import Counter # for the next section
In [28]:
### Counting how often each grade occurs
# The mode (most frequent value) is the obvious first thing to ask for, but numpy has no mode function:
##   s_attribs["mode"] = np.mode(scores)   <- raises an AttributeError
##   print("Most frequent grade was {}".format(s_attribs["mode"]))
# (scipy.stats does provide one.) Frequency counts are needed for other reasons anyway,
# so build them with a Counter, which maps each distinct grade to the number of times it occurs.
score_count = Counter(scores)
print(score_count)
# most_common(1) returns a one-element list holding a (grade, count) pair, so the pair has to be
# unpacked to reach the grade itself.
# Caveat: when several grades share the highest count, only one of them is reported.
top_grade, _top_count = score_count.most_common(1)[0]
print("The mode is {}".format(top_grade))
In [27]:
### When only the mode is wanted (and not the full set of counts), scipy.stats has a ready-made function
from scipy.stats import mode
grade_mode = mode(scores)  # scipy is covered properly in a later lesson
print(grade_mode)
In [41]:
### We now have grades and their frequencies, so draw a quick bar chart of them
# Sort the (grade, count) pairs so the bars run from the lowest to the highest grade regardless of
# the order the Counter stored them in, then split the pairs into bar labels and bar heights.
# zip(*pairs) uses the * (splat) operator to hand each pair to zip as a separate argument.
lbls, counts = zip(*sorted(score_count.items()))
# One x position per distinct grade: 0, 1, 2, ...
idx = np.arange(len(lbls))
plt.figure(figsize=(12,12)) # Change the graph size (play with this as you want)
# align='center' is passed explicitly because the default bar alignment differs between
# matplotlib versions; with it, each bar is centred on its idx value.
plt.bar(idx, counts, width=1, align='center')
plt.xlabel("Grade")
plt.ylabel("Number of times scored")
# Bars are centred on idx, so the tick labels go at idx as well (no half-width offset needed)
plt.xticks(idx, lbls)
Out[41]: