Some matplotlib and pandas experiments based on "Head First Statistics" from O'Reilly Media, Inc.
In [207]:
from pandas import *
In [208]:
# Profit per month, one (month label, profit) record per row.
profit_data = DataFrame(
    data=[
        ("jul", 2.0),
        ("aug", 2.1),
        ("sep", 2.2),
        ("okt", 2.1),
        ("nov", 2.3),
        ("dez", 2.4),
    ],
    columns=["month", "profit"],
)
profit_data
Out[208]:
In [209]:
# Draw the same profit series twice; the two panels differ only in their
# y-axis limits and in the title that goes with the resulting impression.
profit_fig, profit_axes = plt.subplots(nrows=1, ncols=2)
profit_fig.set_size_inches(15, 5)
profit_panels = [
    (profit_axes[0], (0, 2.5), "Stagnating profits."),
    (profit_axes[1], (2.0, 2.5), "Awesome profits!"),
]
for panel_ax, (y_low, y_high), panel_title in profit_panels:
    # Fix the limits first, then let pandas draw into the prepared axes.
    panel_ax.set_ylim(y_low, y_high)
    profit_data.plot(ax=panel_ax, x="month").set_title(panel_title)
The figures use different scaling and offset of the y axes, thus giving a different impression of the profit.
In [210]:
# One record per game type: (type, sales, satisfaction as a fraction of 1).
mango_data = DataFrame(
    data=[
        ("sports", 27500, 0.99),
        ("strategy", 11500, 0.9),
        ("action", 6000, 0.85),
        ("shooter", 3500, 0.95),
        ("others", 1500, 0.85),
    ],
    columns=["type", "sales", "satisfaction"],
)
mango_data
Out[210]:
In [211]:
# Two pie charts over the game types: absolute sales (left) and the
# satisfaction fraction (right), sharing slice order and colours.
mango_colors = ["#F3E761", "#F0F0F0", "#97C2B9", "#E9ABC7", "#CAA882"]
mango_fig, mango_axes = subplots(nrows=1, ncols=2)
mango_fig.set_size_inches(16, 7)

# Order the slices by ascending sales. DataFrame.sort() has been removed from
# pandas; sort_values() is its replacement.
_mango_sales = mango_data.sort_values("sales")

# Every slice is labelled with its type and, on a second line, its value.
_sales_labels = _mango_sales.type + "\n" + _mango_sales.sales.astype(str)
mango_axes[0].pie(_mango_sales.sales, labels=_sales_labels, startangle=90, colors=mango_colors)  # optionally: autopct='%1.1f%%'
mango_axes[0].set_title("sales by type");

_satisfaction_labels = _mango_sales.type + "\n" + (_mango_sales.satisfaction*100).astype(str) + "%"
mango_axes[1].pie(_mango_sales.satisfaction, labels=_satisfaction_labels, startangle=90, colors=mango_colors)
mango_axes[1].set_title("satisfied players by type");
Pie charts are only useful if the values are not all similar, and if they are all parts of a common whole.
In [212]:
# Sales per region, one (region, sales) record per row.
mango_regions = DataFrame(
    data=[
        ("A", 1000),
        ("B", 5000),
        ("C", 7500),
        ("D", 8000),
        ("E", 9500),
    ],
    columns=["region", "sales"],
)
mango_regions
Out[212]:
In [213]:
# 2x2 grid of bar charts over the Mango sales data.
fig, axes = subplots(nrows=2, ncols=2)
fig.set_size_inches(16, 11)
mango_width = 0.8
mango_regions_index = arange(len(mango_regions))
mango_data_index = arange(len(mango_data))
# The horizontal charts show the game types in reversed row order.
mango_types_reversed = list(reversed(list(mango_data.type)))

# Every bar()/barh() call below passes align="edge" explicitly: the first
# argument is the lower edge of each bar (tick position minus an offset).
# Matplotlib changed its default to align="center" in 2.0, which would move
# the bars away from their ticks if the alignment were left implicit.

# Top left: sales per region, bars centred on the region ticks.
axes[0,0].bar(mango_regions_index - mango_width/2, mango_regions.sales, width=mango_width, align="edge", color="#cccccc")
axes[0,0].set_xticks(mango_regions_index)
axes[0,0].set_xticklabels(mango_regions.region);
axes[0,0].set_title("sales by region");

# Top right: satisfaction per type in percent.
axes[0,1].barh(mango_data_index - mango_width/2, list(reversed(list(100*mango_data.satisfaction))), height=mango_width, align="edge", color="#cccccc")
axes[0,1].set_yticks(mango_data_index)
axes[0,1].set_yticklabels(mango_types_reversed)
axes[0,1].set_title("satisfaction by type");

# Absolute head counts: satisfied and unsatisfied share of the sales per type.
mango_data_sat = list(reversed(list(mango_data.satisfaction*mango_data.sales)))
mango_data_notsat = list(reversed(list((1-mango_data.satisfaction)*mango_data.sales)))

# Bottom left: two half-height bars per type, one above and one below the tick.
mango_b1 = axes[1,0].barh(mango_data_index, mango_data_sat, height=mango_width/2, align="edge", color="#8888ff")
mango_b2 = axes[1,0].barh(mango_data_index - mango_width/2, mango_data_notsat, height=mango_width/2, align="edge", color="#88ff88")
axes[1,0].set_yticks(mango_data_index)
axes[1,0].set_yticklabels(mango_types_reversed)
axes[1,0].set_title("satisfaction by type (absolute)");
axes[1,0].legend([mango_b1, mango_b2], ["satisfied", "unsatisfied"], loc="lower right");

# Bottom right: the same numbers stacked; the unsatisfied part starts where
# the satisfied part ends (left=mango_data_sat).
mango_b3 = axes[1,1].barh(mango_data_index - mango_width/2, mango_data_sat, height=mango_width, align="edge", color="#8888ff")
mango_b4 = axes[1,1].barh(mango_data_index - mango_width/2, mango_data_notsat, left=mango_data_sat, height=mango_width, align="edge", color="#88ff88")
axes[1,1].set_yticks(mango_data_index)
axes[1,1].set_yticklabels(mango_types_reversed)
axes[1,1].set_title("satisfaction by type (absolute)");
axes[1,1].legend([mango_b3, mango_b4], ["satisfied", "unsatisfied"], loc="lower right");
In [214]:
# Binned game scores: (lowest points in bin, highest points in bin, frequency).
score_data = DataFrame(
    data=[
        (0, 199, 5),
        (200, 399, 29),
        (400, 599, 56),
        (600, 799, 17),
        (800, 999, 3),
    ],
    columns=("points_from", "points_to", "frequency"),
)
score_data
Out[214]:
In [215]:
# The same binned score frequencies drawn two ways: bar() on the left,
# hist() over reconstructed samples on the right.
score_fig, score_axes = subplots(nrows=1, ncols=2)
score_fig.set_size_inches(16, 4)

# Left: one bar per bin, starting at points_from. align="edge" is spelled out
# because Matplotlib >= 2.0 would otherwise centre each bar on points_from.
score_axes[0].bar(score_data.points_from, score_data.frequency, width=score_data.points_to-score_data.points_from+1, align="edge", color="#cccccc");
score_axes[0].set_title("Frequency of games by points");
score_axes[0].set_xlabel("Points");
score_axes[0].set_ylabel("Frequency");

# Right: stand in for every game with the midpoint of its bin, repeated
# `frequency` times, and let hist() count the samples per bin.
score_list = (score_data.points_to+score_data.points_from)/2
score_scores = list(repeat(score_list.values, score_data.frequency.values))
# Bin edges: every lower bound plus one past every upper bound.
score_axes[1].hist(score_scores, bins=sorted(set(list(score_data.points_from) + list(score_data.points_to+1))), color="#cccccc")
score_axes[1].set_title("Frequency of games by points");
score_axes[1].set_xlabel("Points");
score_axes[1].set_ylabel("Frequency");
In [216]:
# Binned game lengths: (interval start, interval end, frequency).
gamelen_data = DataFrame(
    data=[
        (0, 1, 4300),
        (1, 3, 6900),
        (3, 5, 4900),
        (5, 10, 2000),
        (10, 24, 2100),
    ],
    columns=("len_from", "len_to", "frequency"),
)
gamelen_data
Out[216]:
In [217]:
# Game-length frequencies over unequal intervals: raw bar heights (left)
# versus a normalized histogram (right).
gamelen_fig, gamelen_axis = subplots(nrows=1, ncols=2)
gamelen_fig.set_size_inches(16, 4)

# Left: one bar per interval, spanning len_from..len_to. align="edge" is
# spelled out because Matplotlib >= 2.0 would otherwise centre each bar on
# len_from.
gamelen_axis[0].bar(gamelen_data.len_from, gamelen_data.frequency, width=gamelen_data.len_to-gamelen_data.len_from, align="edge", color="#cccccc");
gamelen_axis[0].set_title("Frequency of games by length");

# Right: stand in for every game with the midpoint of its interval, repeated
# `frequency` times, and histogram those samples over the interval edges.
gamelen_list = (gamelen_data.len_to+gamelen_data.len_from)/2
gamelen_list2 = list(repeat(gamelen_list.values, gamelen_data.frequency.values))
# density=True replaces the `normed=True` keyword, which Matplotlib removed.
gamelen_axis[1].hist(gamelen_list2, bins=sorted(set(list(gamelen_data.len_from) + list(gamelen_data.len_to))), color="#cccccc", density=True)
gamelen_axis[1].set_title("Frequency of games by length (normalized)");

# Shared cosmetics: axis labels, no top/right spines, ticks only on the
# remaining bottom/left sides.
for _panel_ax in gamelen_axis:
    _panel_ax.set_xlabel("Length/h")
    _panel_ax.set_ylabel("Frequency")
    _panel_ax.spines['right'].set_color('none')
    _panel_ax.spines['top'].set_color('none')
    _panel_ax.xaxis.set_ticks_position('bottom')
    _panel_ax.yaxis.set_ticks_position('left')
In [218]:
# Cumulative frequency curve: running total of games at each interval end,
# anchored at the origin.
gamelen_fig, gamelen_axis = subplots(nrows=1, ncols=1)
gamelen_fig.set_size_inches(7, 4)

cumulative_x = [0] + list(gamelen_data.len_to)
cumulative_y = [0] + list(gamelen_data.frequency.cumsum())
gamelen_axis.plot(cumulative_x, cumulative_y);

gamelen_axis.set_title("Cumulative sum of games by length");
gamelen_axis.set_xlabel("Length/h");
gamelen_axis.set_ylabel("Cumulative sum");

# Drop the top/right frame lines and keep ticks on the remaining sides only.
for _hidden_side in ('right', 'top'):
    gamelen_axis.spines[_hidden_side].set_color('none')
gamelen_axis.xaxis.set_ticks_position('bottom')
gamelen_axis.yaxis.set_ticks_position('left')

# Start both axes at zero and place a y tick every 2500 games.
gamelen_axis.set_xlim(0)
gamelen_axis.set_ylim(0, 22500);
gamelen_axis.yaxis.set_major_locator(MultipleLocator(2500.0))
In [219]:
# Individual ages, one row per person.
power_ages = DataFrame({"age": [19, 20, 20, 20, 21]})
power_ages
Out[219]:
Arithmetic mean: \begin{equation} \textrm{mean}(x_1,\ldots,x_n)=\overline{x}={1\over n}\sum_{i=1}^n x_i \end{equation}
In [220]:
# Arithmetic mean of the individual ages.
power_ages["age"].mean()
Out[220]:
In [221]:
# The same ages in compact form: (age, number of people with that age).
power_agefreqs = DataFrame(
    data=[
        (19, 1),
        (20, 3),
        (21, 1),
    ],
    columns=["age", "frequency"],
)
power_agefreqs
Out[221]:
In [222]:
def avg(frame, value_col="age", weight_col="frequency"):
    """Return the frequency-weighted arithmetic mean of one column.

    Parameters
    ----------
    frame : DataFrame
        Table with one row per distinct value.
    value_col : str, optional
        Column holding the values to average (default ``"age"``).
    weight_col : str, optional
        Column holding how often each value occurs (default ``"frequency"``).

    Returns
    -------
    float
        ``sum(value * weight) / sum(weight)``, or NaN when the weights sum
        to zero (for example for an empty table).
    """
    weights = frame[weight_col]
    # Use the Series' own sum() so the result does not depend on whether the
    # bare name `sum` is the builtin or one brought in by a star import.
    total_weight = weights.sum()
    if total_weight == 0:
        return float("nan")
    # The 1. factor forces float arithmetic even for integer columns.
    return (1. * frame[value_col] * weights).sum() / total_weight
# Weighted mean age computed from the (age, frequency) table.
avg(power_agefreqs)
Out[222]:
In [223]:
# Age table for the Kung-Fu group; note the two entries far above the rest.
kungfu_agefreqs = DataFrame(
    data=[
        (19, 3),
        (20, 5),
        (21, 3),
        (136, 1),
        (138, 1),
    ],
    columns=["age", "frequency"],
)
kungfu_agefreqs
Out[223]:
In [224]:
# Same weighted mean for the Kung-Fu table, which includes the ages 136 and 138.
avg(kungfu_agefreqs)
Out[224]:
In [225]:
# Age distributions side by side: Power-Workout on the left, Kung-Fu on the
# right. The Kung-Fu panel is drawn as two narrow axes so that the ages
# around 20 and the ages around 137 can both be shown.
agefreq_fig, agefreq_axes = subplots(nrows=1, ncols=2)
agefreq_fig.set_size_inches(16,4)

# align="edge" is spelled out on every bar() call: each bar starts at its age
# value, which Matplotlib >= 2.0 no longer does by default.
agefreq_axes[0].set_title("Age in Power-Workout");
agefreq_axes[0].bar(power_agefreqs.age, power_agefreqs.frequency, width=1, align="edge", color="#cccccc");
agefreq_axes[0].set_xlim(18,24)
agefreq_axes[0].set_ylim(0, 5)

# The right-hand axes only carries the title; its ticks and horizontal frame
# lines are hidden.
agefreq_axes[1].set_title("Age in Kung-Fu");
agefreq_axes[1].get_xaxis().set_visible(False)
agefreq_axes[1].get_yaxis().set_visible(False)
agefreq_axes[1].spines['bottom'].set_visible(False)
agefreq_axes[1].spines['top'].set_visible(False)

# Subdivide the right axes vertically: 1 row, 4 columns, select 3rd and 4th.
agefreq1 = agefreq_fig.add_subplot(143)
agefreq2 = agefreq_fig.add_subplot(144)
agefreq1.set_ylim(0, 10)
agefreq2.set_ylim(0, 10)
# Same data in both halves; the x limits pick the visible age range.
agefreq1.set_xlim(18,24)
agefreq2.set_xlim(135,140)
agefreq1.bar(kungfu_agefreqs.age, kungfu_agefreqs.frequency, width=1, align="edge", color="#cccccc");
agefreq2.bar(kungfu_agefreqs.age, kungfu_agefreqs.frequency, width=1, align="edge", color="#cccccc");

# Remove the frame lines where the two halves meet and keep y ticks on the
# outer sides only.
agefreq1.spines['right'].set_visible(False)
agefreq1.get_yaxis().tick_left()
agefreq2.spines['left'].set_visible(False)
agefreq2.get_yaxis().tick_right()
# Must be the boolean False: the string 'off' is truthy and would leave the
# right-hand tick labels switched on in current Matplotlib.
agefreq2.tick_params(labelright=False)
Note that the mean may not occur in the data. The median has the advantage that it exists as a data point:
In [226]:
# Expand the (age, frequency) table back into one entry per person, then
# compare mean and median of that flat list.
kungfu_ages = list(repeat(kungfu_agefreqs["age"].values, kungfu_agefreqs["frequency"].values))
mean(kungfu_ages), median(kungfu_ages)
Out[226]:
However, if the number of data points is even, the median is the value halfway between the two values around the middle:
In [227]:
# Two values only, so there is no single middle element to pick.
median([10,20])
Out[227]:
In [228]:
# Mean (green circle) versus median (red diamond) for a symmetric, a
# right-skewed and a left-skewed sample.
skew_fig, skew_axes = subplots(nrows=1, ncols=3)
skew_fig.set_size_inches(17,4)

# NOTE(review): the samples are drawn without a fixed seed, so the exact
# values change between runs.
skew1 = randn(1000000)
skew2 = exp(1+0.4*skew1)   # exponentiating the normal sample skews it to the right
skew3 = -skew2             # mirror image of skew2

_skew_panels = [
    (skew_axes[0], skew1, "Unskewed normal, mean = median", (-4, 4)),
    (skew_axes[1], skew2, "Right-skewed log-normal, median < mean", (0, 10)),
    (skew_axes[2], skew3, "Left-skewed (mirrored) log-normal, median > mean", (-10, 0)),
]
for _panel_ax, _sample, _panel_title, (_x_low, _x_high) in _skew_panels:
    _panel_ax.set_title(_panel_title)
    # density=True replaces the `normed=True` keyword, which Matplotlib removed.
    _panel_ax.hist(_sample, bins=30, density=True, color="#cccccc")
    # Markers sit at fixed heights so they stay apart when the values coincide.
    _panel_ax.plot(mean(_sample), 0.2, 'go')
    _panel_ax.plot(median(_sample), 0.15, 'rD')
    _panel_ax.set_xlim(_x_low, _x_high)
In [229]:
# Age table with two separate clusters: ages 1-3 and ages 31-33.
swim_agefreqs = DataFrame(
    data=[
        (1, 3),
        (2, 4),
        (3, 2),
        (31, 2),
        (32, 4),
        (33, 3),
    ],
    columns=["age", "frequency"],
)
swim_agefreqs
Out[229]:
The mean and median are both misleading in this case:
In [230]:
# Expand the (age, frequency) table back into one entry per person, then
# compare mean and median of that flat list.
swim_ages = list(repeat(swim_agefreqs["age"].values, swim_agefreqs["frequency"].values))
mean(swim_ages), median(swim_ages)
Out[230]:
In [231]:
import scipy.stats

# Most common age in the expanded list; the result unpacks into the modal
# value(s) and the matching count(s). Only the value is displayed.
_swim_mode_result = scipy.stats.mstats.mode(array(swim_ages))
val, cnt = _swim_mode_result
val
Out[231]:
Note that 32 is another mode.
In [232]:
# Score tables of three players as (points scored, number of games) rows.
_player_columns = ("points", "frequency")
player1 = DataFrame([(7,1),(8,1),(9,2),(10,3),(11,2),(12,1),(13,1)], columns=_player_columns)
player2 = DataFrame([(7,1),(9,2),(10,5),(11,2),(13,1)], columns=_player_columns)
player3 = DataFrame([(3,2),(6,1),(7,2),(10,3),(11,1),(13,1),(30,1)], columns=_player_columns)

# Tag every row with its player number so the tables can be stacked and
# grouped again later.
player1["player"] = 1
player2["player"] = 2
player3["player"] = 3

# Call concat directly: the notebook only does `from pandas import *`, which
# is not guaranteed to bind the module name `pandas` itself.
player_stats = concat([player1, player2, player3])
player_stats
Out[232]:
In [233]:
# Expanded score samples, one array per processed player, in call order.
player_data = []

def get_stats(pstats):
    """Summarise one player's scores.

    Parameters
    ----------
    pstats : tuple
        ``(player, stats)`` pair as produced by iterating a groupby, where
        ``stats`` is a table with ``points`` and ``frequency`` columns.

    Returns
    -------
    tuple
        ``(player, mean, median, mode, min, max, 25th percentile,
        75th percentile)`` of the expanded scores.

    Side effects
    ------------
    Appends the expanded score array to the module-level ``player_data``.
    """
    player, stats = pstats
    # Expand column-wise instead of row by row: iterating rows converts a
    # mixed int/float row to float, and repeat() rejects a float count.
    vals = repeat(array(stats["points"]), array(stats["frequency"]))
    player_data.append(vals)
    # mstats.mode returns arrays of modes and counts; take the modal value.
    modal_value = scipy.stats.mstats.mode(vals)[0][0]
    return (
        player,
        mean(vals),
        median(vals),
        modal_value,
        amin(vals),
        amax(vals),
        percentile(vals, 25),
        percentile(vals, 75),
    )
# One summary row per player: every (player, rows) group goes through
# get_stats, which also records the expanded scores in player_data.
_player_groups = player_stats.groupby(player_stats.player)
_summary_columns = ("player", "mean", "median", "mode", "minpoints", "maxpoints", "q1", "q3")
player_avgs = DataFrame(
    [get_stats(_group) for _group in _player_groups],
    columns=_summary_columns,
)
player_avgs
Out[233]:
In [234]:
# Spread per player: full range (max - min) and interquartile range (q3 - q1).
for _spread_col, _upper_col, _lower_col in (
    ("span", "maxpoints", "minpoints"),
    ("interq", "q3", "q1"),
):
    player_avgs[_spread_col] = player_avgs[_upper_col] - player_avgs[_lower_col]
player_avgs[["player", "span", "interq"]]
Out[234]:
In [235]:
# Horizontal box plots, one box per entry of player_data.
player_fig, player_axes = subplots()
player_fig.set_size_inches(7, 4)
player_axes.set_title("Results of basketball players");
_boxplot_options = dict(vert=False, whis=1.5)
player_axes.boxplot(player_data, **_boxplot_options);
# Set the limits after plotting so they are not overridden by autoscaling.
player_axes.set_xlim(0, 33)
Out[235]:
A discussion of possible reasons why we prefer the mean square error can be found here: http://stats.stackexchange.com/questions/118/why-square-the-difference-instead-of-taking-the-absolute-value-in-standard-devia
In [236]:
# Variance and standard deviation per player, computed along each row of
# player_data (axis=1) and stored as new summary columns.
for _spread_name, _spread_func in (("var", var), ("std", std)):
    player_avgs[_spread_name] = _spread_func(player_data, axis=1)
player_avgs[["player", "var", "std"]]
Out[236]:
In [237]:
# z-scores of the first player's own scores; the trailing semicolon keeps the
# result out of the cell output.
scipy.stats.mstats.zscore(player_data[0]);

# One standard deviation above each player's mean.
player_points = player_avgs["mean"] + player_avgs["std"]

# z-scores of a few reference results relative to each player's scores. The
# first list carries one extra entry (the first player's mean + std), so the
# three results have different lengths. They are shown as a plain list:
# wrapping ragged results in array(...) raises an error on current NumPy.
_reference_points = [10, 11, 12, 30]
[
    scipy.stats.zmap([10, 11, player_points[0], 12, 30], player_data[0]),
    scipy.stats.zmap(_reference_points, player_data[1]),
    scipy.stats.zmap(_reference_points, player_data[2]),
]
Out[237]:
In [237]: