In [101]:
import pandas as pd
from matplotlib import pylab as pl 
import string
import scipy.stats
%matplotlib inline

In [102]:
df_14 = pd.read_csv('../data/cmssusypaps/2014_SUS/wordcounts.txt')
df_11 = pd.read_csv('../data/cmssusypaps/2011_SUS/wordcounts.txt')
df_10 = pd.read_csv('../data/cmssusypaps/2010_tech/wordcounts.txt')

In [103]:
df_14.head(10)
df_14.shape


Out[103]:
(28, 2)

In [104]:
df_11.head(10)


Out[104]:
word norm_count
0 all 12
1 also 8
2 an 20
3 and 151
4 are 72
5 as 42
6 be 23
7 between 9
8 both 8
9 by 36

10 rows × 2 columns


In [105]:
df_10.head(10)


Out[105]:
word norm_count
0 all 20
1 also 9
2 an 20
3 and 181
4 are 63
5 as 46
6 be 31
7 between 18
8 both 7
9 by 44

10 rows × 2 columns


In [108]:
pl.bar(range(df_14.shape[0]), df_14.norm_count, align='center', )
pl.xticks(range(df_14.shape[0]), df_14.word, size='medium', rotation='vertical')


Out[108]:
([<matplotlib.axis.XTick at 0x7f0b0b17c250>,
  <matplotlib.axis.XTick at 0x7f0b0b123e90>,
  <matplotlib.axis.XTick at 0x7f0b0b043c10>,
  <matplotlib.axis.XTick at 0x7f0b0afd0410>,
  <matplotlib.axis.XTick at 0x7f0b0afd0b50>,
  <matplotlib.axis.XTick at 0x7f0b0afd92d0>,
  <matplotlib.axis.XTick at 0x7f0b0b148f90>,
  <matplotlib.axis.XTick at 0x7f0b15fe13d0>,
  <matplotlib.axis.XTick at 0x7f0b0b388b10>,
  <matplotlib.axis.XTick at 0x7f0b0b329e10>,
  <matplotlib.axis.XTick at 0x7f0b0b166750>,
  <matplotlib.axis.XTick at 0x7f0b0afd9fd0>,
  <matplotlib.axis.XTick at 0x7f0b0afe5750>,
  <matplotlib.axis.XTick at 0x7f0b0afe5e90>,
  <matplotlib.axis.XTick at 0x7f0b0afee610>,
  <matplotlib.axis.XTick at 0x7f0b0afeed50>,
  <matplotlib.axis.XTick at 0x7f0b0aff94d0>,
  <matplotlib.axis.XTick at 0x7f0b0aff9c10>,
  <matplotlib.axis.XTick at 0x7f0b0b004390>,
  <matplotlib.axis.XTick at 0x7f0b0b004ad0>,
  <matplotlib.axis.XTick at 0x7f0b0b00d250>,
  <matplotlib.axis.XTick at 0x7f0b0b00d990>,
  <matplotlib.axis.XTick at 0x7f0b0af97110>,
  <matplotlib.axis.XTick at 0x7f0b0af97850>,
  <matplotlib.axis.XTick at 0x7f0b0af97f90>,
  <matplotlib.axis.XTick at 0x7f0b0afa1710>,
  <matplotlib.axis.XTick at 0x7f0b0afa1e50>,
  <matplotlib.axis.XTick at 0x7f0b0afab5d0>],
 <a list of 28 Text xticklabel objects>)

In [123]:
fig, axs = pl.subplots(1,3)

df_14['norm_count'].plot(ax=axs[0])
df_11['norm_count'].plot(ax=axs[1])
df_10['norm_count'].plot(ax=axs[2])
#pl.xticks(range(df_10.shape[0]), df_10.word, size='medium', rotation='vertical')


Out[123]:
<matplotlib.axes.AxesSubplot at 0x7f0b0a707a90>

In [118]:
t1 = scipy.stats.kruskal(df_11.norm_count, df_14.norm_count)
t2 = scipy.stats.kruskal(df_11.norm_count, df_10.norm_count)
t3 = scipy.stats.kruskal(df_14.norm_count, df_10.norm_count)
print(t1)
print(t2)
print(t3)


(0.20322745350960505, 0.6521282542355169)
(0.16792785575056832, 0.68195999204499247)
(0.0043001294925139166, 0.94771589671310952)

In [112]:
scipy.stats.wilcoxon(df_11.norm_count, df_14.norm_count)  #assumes normality


Out[112]:
(36.5, 0.00068080358216353744)

In [127]:
scipy.stats.chisquare(df_14.norm_count, df_11.norm_count)


Out[127]:
(114.93246304572013, 8.0583522066896693e-13)