Statistics

Some matplotlib and pandas experiments based on "Head First Statistics" from O'Reilly Media, Inc.


In [207]:
from pandas import *

1. First Impression

Why learn statistics? (p. 3)


In [208]:
profit_data = [("jul", 2.0), ("aug", 2.1), ("sep", 2.2), ("okt", 2.1), ("nov", 2.3), ("dez", 2.4)]
profit_data = DataFrame(profit_data, columns=["month", "profit"])
profit_data


Out[208]:
month profit
0 jul 2.0
1 aug 2.1
2 sep 2.2
3 okt 2.1
4 nov 2.3
5 dez 2.4

In [209]:
profit_fig, profit_axes = plt.subplots(nrows=1, ncols=2)
profit_fig.set_size_inches(15,5)
profit_axes[0].set_ylim(0, 2.5)
profit_data.plot(ax = profit_axes[0], x="month").set_title("Stagnating profits.")
profit_axes[1].set_ylim(2.0, 2.5)
profit_data.plot(ax = profit_axes[1], x="month").set_title("Awesome profits!");


The figures use different scaling and offset of the x axes, thus giving a different impression of the profit.

Manic Mango's Diagrams


In [210]:
mango_data = [("sports", 27500, 0.99), ("strategy", 11500, 0.9), ("action", 6000, 0.85), ("shooter", 3500, 0.95), ("others", 1500, 0.85)]
mango_data = DataFrame(mango_data, columns=["type", "sales", "satisfaction"])
mango_data


Out[210]:
type sales satisfaction
0 sports 27500 0.99
1 strategy 11500 0.90
2 action 6000 0.85
3 shooter 3500 0.95
4 others 1500 0.85

In [211]:
mango_colors=["#F3E761", "#F0F0F0", "#97C2B9", "#E9ABC7", "#CAA882"]
mango_fig, mango_axes = subplots(nrows=1, ncols=2)
mango_fig.set_size_inches(16,7)
_mango_sales = mango_data.sort("sales")
mango_axes[0].pie(_mango_sales.sales, labels=_mango_sales.type + "\n" + _mango_sales.sales.astype(str), startangle=90, colors=mango_colors) # autopct='%1.1f%%'
mango_axes[0].set_title("sales by type");
mango_axes[1].pie(_mango_sales.satisfaction, labels=_mango_sales.type + "\n" + (_mango_sales.satisfaction*100).astype(str) + "%", startangle=90, colors=mango_colors)
mango_axes[1].set_title("satisfied players by type");


Pie charts are only useful if the values are not all similar, and if they are all parts of a common whole.


In [212]:
mango_regions = [("A", 1000), ("B", 5000), ("C", 7500), ("D", 8000), ("E", 9500)]
mango_regions = DataFrame(mango_regions, columns=["region", "sales"])
mango_regions


Out[212]:
region sales
0 A 1000
1 B 5000
2 C 7500
3 D 8000
4 E 9500

In [213]:
fig, axes = subplots(nrows=2, ncols=2)
fig.set_size_inches(16,11)
mango_regions_index = arange(len(mango_regions))
mango_width=0.8
axes[0,0].bar(mango_regions_index - mango_width/2, mango_regions.sales, width=mango_width, color="#cccccc")
axes[0,0].set_xticks(mango_regions_index)
axes[0,0].set_xticklabels(mango_regions.region);
axes[0,0].set_title("sales by region");
mango_data_index = arange(len(mango_data))

axes[0,1].barh(mango_data_index - mango_width/2, list(reversed(list(100*mango_data.satisfaction))), height=mango_width, color="#cccccc")
axes[0,1].set_yticks(mango_data_index)
axes[0,1].set_yticklabels(list(reversed(list(mango_data.type))))
axes[0,1].set_title("satisfaction by type");

mango_data_sat = list(reversed(list(mango_data.satisfaction*mango_data.sales)))
mango_data_notsat = list(reversed(list((1-mango_data.satisfaction)*mango_data.sales)))
mango_b1 = axes[1,0].barh(mango_data_index, mango_data_sat, height=mango_width/2, color="#8888ff")
mango_b2 = axes[1,0].barh(mango_data_index - mango_width/2, mango_data_notsat, height=mango_width/2, color="#88ff88")
axes[1,0].set_yticks(mango_data_index)
axes[1,0].set_yticklabels(list(reversed(list(mango_data.type))))
axes[1,0].set_title("satisfaction by type (absolute)");
axes[1,0].legend([mango_b1, mango_b2], ["satisfied", "unsatisfied"], loc="lower right");

mango_b3 = axes[1,1].barh(mango_data_index - mango_width/2, mango_data_sat, height=mango_width, color="#8888ff")
mango_b4 = axes[1,1].barh(mango_data_index - mango_width/2, mango_data_notsat, left=mango_data_sat, height=mango_width, color="#88ff88")
axes[1,1].set_yticks(mango_data_index)
axes[1,1].set_yticklabels(list(reversed(list(mango_data.type))))
axes[1,1].set_title("satisfaction by type (absolute)");
axes[1,1].legend([mango_b3, mango_b4], ["satisfied", "unsatisfied"], loc="lower right");


Histogram


In [214]:
score_data = [(0,199,5), (200,399,29), (400,599, 56), (600, 799, 17), (800,999,3)]
score_data = DataFrame(score_data, columns=("points_from", "points_to", "frequency"))
score_data


Out[214]:
points_from points_to frequency
0 0 199 5
1 200 399 29
2 400 599 56
3 600 799 17
4 800 999 3

In [215]:
score_fig, score_axes = subplots(nrows=1, ncols=2)
score_fig.set_size_inches(16,4)
score_axes[0].bar(score_data.points_from, score_data.frequency, width=score_data.points_to-score_data.points_from+1, color="#cccccc");
score_axes[0].set_title("Frequency of games by points");
score_axes[0].set_xlabel("Points");
score_axes[0].set_ylabel("Frequency");
score_list = (score_data.points_to+score_data.points_from)/2
score_scores = []
for score in score_list:
    score_scores.extend(repeat(score, score_data[(score_data.points_from <= score) & (score_data.points_to >= score)].frequency))
score_axes[1].hist(score_scores, bins=sorted(set(list(score_data.points_from) + list(score_data.points_to+1))), color="#cccccc")
score_axes[1].set_title("Frequency of games by points");
score_axes[1].set_xlabel("Points");
score_axes[1].set_ylabel("Frequency");



In [216]:
gamelen_data = [(0,1,4300), (1,3,6900), (3,5,4900), (5,10,2000), (10,24,2100)]
gamelen_data = DataFrame(gamelen_data, columns=("len_from", "len_to", "frequency"))
gamelen_data


Out[216]:
len_from len_to frequency
0 0 1 4300
1 1 3 6900
2 3 5 4900
3 5 10 2000
4 10 24 2100

In [217]:
gamelen_fig, gamelen_axis = subplots(nrows=1, ncols=2)
gamelen_fig.set_size_inches(16,4)
gamelen_axis[0].bar(gamelen_data.len_from, gamelen_data.frequency, width=gamelen_data.len_to-gamelen_data.len_from, color="#cccccc");
gamelen_axis[0].set_title("Frequency of games by length");
gamelen_axis[0].set_xlabel("Length/h");
gamelen_axis[0].set_ylabel("Frequency");
gamelen_axis[0].spines['right'].set_color('none')
gamelen_axis[0].spines['top'].set_color('none')
gamelen_axis[0].xaxis.set_ticks_position('bottom')
gamelen_axis[0].yaxis.set_ticks_position('left')
gamelen_list = (gamelen_data.len_to+gamelen_data.len_from)/2
gamelen_list2 = []
for gamelen in gamelen_list:
    gamelen_list2.extend(repeat(gamelen, gamelen_data[(gamelen_data.len_from <= gamelen) & (gamelen_data.len_to >= gamelen)].frequency))
gamelen_axis[1].hist(gamelen_list2, bins=sorted(set(list(gamelen_data.len_from) + list(gamelen_data.len_to))), color="#cccccc", normed=True)
gamelen_axis[1].set_title("Frequency of games by length (normalized)");
gamelen_axis[1].set_xlabel("Length/h");
gamelen_axis[1].set_ylabel("Frequency");
gamelen_axis[1].spines['right'].set_color('none')
gamelen_axis[1].spines['top'].set_color('none')
gamelen_axis[1].xaxis.set_ticks_position('bottom')
gamelen_axis[1].yaxis.set_ticks_position('left')



In [218]:
gamelen_fig, gamelen_axis = subplots(nrows=1, ncols=1)
gamelen_fig.set_size_inches(7,4)
gamelen_axis.plot([0] + list(gamelen_data.len_to), [0] + list(gamelen_data.frequency.cumsum()));
gamelen_axis.set_title("Cumulative sum of games by length");
gamelen_axis.set_xlabel("Length/h");
gamelen_axis.set_ylabel("Cumulative sum");
gamelen_axis.spines['right'].set_color('none')
gamelen_axis.spines['top'].set_color('none')
gamelen_axis.xaxis.set_ticks_position('bottom')
gamelen_axis.yaxis.set_ticks_position('left')
gamelen_axis.set_xlim(0)
gamelen_axis.set_ylim(0, 22500);
gamelen_axis.yaxis.set_major_locator(MultipleLocator(2500.0))


2. Averages

Arithmetic Mean


In [219]:
power_ages = DataFrame([19,20,20,20,21], columns=["age"])
power_ages


Out[219]:
age
0 19
1 20
2 20
3 20
4 21

Arithmetic mean: \begin{equation} \textrm{mean}(x_1,\...,x_n)=\overline x={1\over n}\sum_{i=1}^n x_i \end{equation}


In [220]:
mean(power_ages.age)


Out[220]:
20.0

In [221]:
power_agefreqs = DataFrame([(19, 1), (20, 3), (21, 1)], columns=["age", "frequency"])
power_agefreqs


Out[221]:
age frequency
0 19 1
1 20 3
2 21 1

In [222]:
def avg(frame):
    return sum(1. * frame.age * frame.frequency) / sum(frame.frequency)
avg(power_agefreqs)


Out[222]:
20.0

In [223]:
kungfu_agefreqs = DataFrame([(19, 3), (20, 5), (21, 3), (136, 1), (138, 1)], columns=["age", "frequency"])
kungfu_agefreqs


Out[223]:
age frequency
0 19 3
1 20 5
2 21 3
3 136 1
4 138 1

In [224]:
avg(kungfu_agefreqs)


Out[224]:
38.0

In [225]:
agefreq_fig, agefreq_axes = subplots(nrows=1, ncols=2)
agefreq_fig.set_size_inches(16,4)
agefreq_axes[0].set_title("Age in Power-Workout");
agefreq_axes[0].bar(power_agefreqs.age, power_agefreqs.frequency, width=1, color="#cccccc");
agefreq_axes[0].set_xlim(18,24)
agefreq_axes[0].set_ylim(0, 5)
agefreq_axes[1].set_title("Age in Kung-Fu");
agefreq_axes[1].get_xaxis().set_visible(False)
agefreq_axes[1].get_yaxis().set_visible(False)
agefreq_axes[1].spines['bottom'].set_visible(False)
agefreq_axes[1].spines['top'].set_visible(False)
# Subdivide the right axes vertically: 1 row, 4 columns, select 3rd and 4th.
agefreq1 = agefreq_fig.add_subplot(143)
agefreq2 = agefreq_fig.add_subplot(144)
agefreq1.set_ylim(0, 10)
agefreq2.set_ylim(0, 10)
agefreq1.set_xlim(18,24)
agefreq2.set_xlim(135,140)
agefreq1.bar(kungfu_agefreqs.age, kungfu_agefreqs.frequency, width=1, color="#cccccc");
agefreq2.bar(kungfu_agefreqs.age, kungfu_agefreqs.frequency, width=1, color="#cccccc");
agefreq1.spines['right'].set_visible(False)
agefreq1.get_yaxis().tick_left()
agefreq2.spines['left'].set_visible(False)
agefreq2.get_yaxis().tick_right()
agefreq2.tick_params(labelright='off')


Median

Note that the mean may not occur in the data. The median has the advantage that it exists as a data point:


In [226]:
kungfu_ages = []
for index, data in kungfu_agefreqs.iterrows():
    kungfu_ages.extend(repeat(data["age"], data["frequency"]))
mean(kungfu_ages), median(kungfu_ages)


Out[226]:
(38.0, 20.0)

However, if the number of data points is even, the media is the value halfway between the two values around the middle:


In [227]:
median([10,20])


Out[227]:
15.0

In [228]:
skew_fig, skew_axes = subplots(nrows=1, ncols=3)
skew_fig.set_size_inches(17,4)
skew1 = randn(1000000)
skew2 = exp(1+0.4*skew1)
skew3 = -skew2
skew_axes[0].set_title("Unskewed normal, mean = median")
skew_axes[0].hist(skew1, bins=30, normed=True, color="#cccccc");
skew_axes[0].plot(mean(skew1), 0.2, 'go');
skew_axes[0].plot(median(skew1), 0.15, 'rD');
skew_axes[0].set_xlim(-4,4)
skew_axes[1].set_title("Right-skewed log-normal, median < mean");
skew_axes[1].hist(skew2, bins=30, normed=True, color="#cccccc");
skew_axes[1].plot(mean(skew2), 0.2, 'go');
skew_axes[1].plot(median(skew2), 0.15, 'rD');
skew_axes[1].set_xlim(0,10)
skew_axes[2].set_title("Left-skewed (mirrored) log-normal, median > mean");
skew_axes[2].hist(skew3, bins=30, normed=True, color="#cccccc");
skew_axes[2].plot(mean(skew3), 0.2, 'go');
skew_axes[2].plot(median(skew3), 0.15, 'rD');
skew_axes[2].set_xlim(-10,0);


Modal/Mode


In [229]:
swim_agefreqs = DataFrame([(1, 3), (2, 4), (3, 2), (31, 2), (32, 4), (33, 3)], columns=["age", "frequency"])
swim_agefreqs


Out[229]:
age frequency
0 1 3
1 2 4
2 3 2
3 31 2
4 32 4
5 33 3

The mean and median are both misleading in this case:


In [230]:
swim_ages = []
for index, data in swim_agefreqs.iterrows():
    swim_ages.extend(repeat(data["age"], data["frequency"]))
mean(swim_ages), median(swim_ages)


Out[230]:
(17.0, 17.0)

In [231]:
import scipy.stats
val, cnt = scipy.stats.mstats.mode(array(swim_ages))
val


Out[231]:
array([ 2.])

Note that 32 is another mode.

3. Variance and Deviation

Span, Interquantildistance, Percentils


In [232]:
player1 = DataFrame([(7,1),(8,1),(9,2),(10,3),(11,2),(12,1),(13,1)], columns=("points", "frequency"))
player2 = DataFrame([(7,1),(9,2),(10,5),(11,2),(13,1)], columns=("points", "frequency"))
player3 = DataFrame([(3,2),(6,1),(7,2),(10,3),(11,1),(13,1),(30,1)], columns=("points", "frequency"))
player1["player"] = 1
player2["player"] = 2
player3["player"] = 3
player_stats = pandas.concat([player1, player2, player3])
player_stats


Out[232]:
points frequency player
0 7 1 1
1 8 1 1
2 9 2 1
3 10 3 1
4 11 2 1
5 12 1 1
6 13 1 1
0 7 1 2
1 9 2 2
2 10 5 2
3 11 2 2
4 13 1 2
0 3 2 3
1 6 1 3
2 7 2 3
3 10 3 3
4 11 1 3
5 13 1 3
6 30 1 3

In [233]:
player_data = []

def get_stats(pstats):
    player, stats = pstats
    vals = []
    for index, data in stats.iterrows():
        vals.extend(repeat(data["points"], data["frequency"]))
    vals = array(vals)
    player_data.append(vals)
    return player, mean(vals), median(vals), scipy.stats.mstats.mode(vals)[0][0], amin(vals), amax(vals), percentile(vals,25), percentile(vals,75)

player_avgs = DataFrame(map(get_stats, player_stats.groupby(player_stats.player)),
                        columns=("player", "mean", "median", "mode", "minpoints", "maxpoints", "q1","q3"))
player_avgs


Out[233]:
player mean median mode minpoints maxpoints q1 q3
0 1 10 10 10 7 13 9.0 11.0
1 2 10 10 10 7 13 9.5 10.5
2 3 10 10 10 3 30 6.5 10.5

In [234]:
player_avgs['span'] = player_avgs.maxpoints - player_avgs.minpoints
player_avgs['interq'] = player_avgs.q3 - player_avgs.q1
player_avgs[['player', 'span', 'interq']]


Out[234]:
player span interq
0 1 6 2
1 2 6 1
2 3 27 4

In [235]:
player_fig, player_axes = subplots()
player_fig.set_size_inches(7,4)
player_axes.boxplot(player_data, vert=False, whis=1.5);
player_axes.set_title("Results of basketball players");
player_axes.set_xlim(0,33)


Out[235]:
(0, 33)

Variance, Standard Deviation

\begin{equation} V(x) = {1 \over n} \sum_i(x_i-\overline x)^2 \end{equation}\begin{equation} \sigma(x) = \sqrt{V(x)} \end{equation}

A discussion for possible reasons why we prefer the mean square error can be found here: http://stats.stackexchange.com/questions/118/why-square-the-difference-instead-of-taking-the-absolute-value-in-standard-devia


In [236]:
player_avgs["var"] = var(player_data, axis=1)
player_avgs["std"] = std(player_data, axis=1)
player_avgs[["player", "var", "std"]]


Out[236]:
player var std
0 1 2.727273 1.651446
1 2 2.000000 1.414214
2 3 49.272727 7.019453

Standard Score (z-score)

\begin{equation} z(x) = \frac{ x - \overline{x} }{ \sigma(x) } \end{equation}

In [237]:
scipy.stats.mstats.zscore(player_data[0]);
player_points = player_avgs["mean"] + player_avgs["std"]
array([scipy.stats.zmap([10, 11, player_points[0], 12, 30],
 player_data[0]), scipy.stats.zmap([10, 11, 12, 30],
 player_data[1]), scipy.stats.zmap([10, 11, 12, 30], player_data[2])])


Out[237]:
array([ array([  0.        ,   0.60553007,   1.        ,   1.21106014,  12.11060142]),
       array([  0.        ,   0.70710678,   1.41421356,  14.14213562]),
       array([ 0.        ,  0.14246123,  0.28492247,  2.84922466])], dtype=object)

In [237]: