Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['load', 'datetime', 'info', 'unique', 'rc', 'save']
`%matplotlib` prevents importing * from pylab and numpy
- focused: ~1TB data; ~400GB html
- focused sentences: 49,642,285
- focused ngrams (1-5): 3,589,782,965
- non-focused: ~1TB data; ~700GB html
- non-focused sentences: 190,623,576
- non-focused ngrams (1-5): 10,813,167,232
/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval
/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval
# Produce the linear-scale plot of perplexity per split, one line per type.
# Alternative input (kept for quick switching between evaluation runs):
#f_eval = '/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/2015-01-15/lm_eval_out_1000.txt'
f_eval = '/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/2015-01-15/lm_eval_out.txt'

# The evaluation output is a header-less, tab-separated file with three columns.
df = read_csv(
    f_eval,
    sep='\t',
    header=None,
    names=['serviceid', 'file', 'perplexity'],
)

# 'split' is the single character right after the first '_' in serviceid.
# NOTE(review): only one character is parsed, so a split index >= 10 would be
# truncated to its first digit -- confirm that splits are always 0-9.
df['split'] = df['serviceid'].map(lambda x: int(x[x.find('_')+1]))

# 'type' is the text between the last '/' and the first '_' in serviceid.
df['type'] = df['serviceid'].map(lambda x: x[x.rfind('/')+1:x.find('_')])

# Reshape to one row per split and one column per type.  Keyword arguments are
# required here: DataFrame.pivot no longer accepts positional arguments in
# pandas >= 2.0, and the keyword form also works on older versions.
df_r = df.pivot(index='split', columns='type', values='perplexity')

df_r.plot()
plt.ylabel('Perplexity')

# Display the pivoted table as the cell result.
df_r
# Produce log-scaled plots of perplexity against split size for two
# evaluation runs.  "woov"/"noov" presumably mean with / without
# out-of-vocabulary words -- TODO confirm.
f_eval_woov = '/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/2015-01-29/lm_eval_out.woov.txt'
f_eval_noov = '/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/2015-01-29/lm_eval_out.noov.txt'

# Both files are header-less, tab-separated, with the same three columns.
df_woov = read_csv(
    f_eval_woov,
    sep='\t',
    header=None,
    names=['serviceid', 'file', 'perplexity'],
)
df_noov = read_csv(
    f_eval_noov,
    sep='\t',
    header=None,
    names=['serviceid', 'file', 'perplexity'],
)

# 'split' is everything after the first '_' in serviceid, written with an
# optional K/M suffix.  'M' is first rewritten to 'KK' and every 'K' to '000',
# so e.g. '10M' becomes '10000000' before the int() conversion.
df_woov['split'] = df_woov['serviceid'].map(lambda x: int(x[x.find('_')+1:].replace('M','KK').replace('K','000')))
df_noov['split'] = df_noov['serviceid'].map(lambda x: int(x[x.find('_')+1:].replace('M','KK').replace('K','000')))

# 'type' is the text between the last '/' and the first '_' in serviceid.
df_woov['type'] = df_woov['serviceid'].map(lambda x: x[x.rfind('/')+1:x.find('_')])
df_noov['type'] = df_noov['serviceid'].map(lambda x: x[x.rfind('/')+1:x.find('_')])

# Reshape to one row per split size and one column per type.  Keyword
# arguments are required: DataFrame.pivot is keyword-only in pandas >= 2.0.
df_r_woov = df_woov.pivot(index='split', columns='type', values='perplexity')
df_r_noov = df_noov.pivot(index='split', columns='type', values='perplexity')

# Index.get_values() was removed in pandas 1.0; .values yields the same array.
print('type values: ', df_r_woov.columns.values)
print('type values: ', df_r_noov.columns.values)

# 2x2 grid: top row = woov, bottom row = noov; left = f2/nf2, right = f5/nf5.
fig, axes = plt.subplots(nrows=2, ncols=2)
df_r_woov[['f2','nf2']].plot(ax=axes[0,0],logx=True,legend=False)
df_r_woov[['f5','nf5']].plot(ax=axes[0,1],logx=True,legend=False)
df_r_noov[['f2','nf2']].plot(ax=axes[1,0],logx=True,legend=False)
df_r_noov[['f5','nf5']].plot(ax=axes[1,1],logx=True,legend=False)

# Legends were suppressed above so they can all be placed in the same corner.
for ax in axes.flat:
    ax.legend(loc='upper right')
# Compare to Berkeley LM, using the "noov" evaluation output.
f_eval_noov = '/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/2015-01-29/test3.noov.txt'
df_noov = read_csv(
    f_eval_noov,
    sep='\t',
    header=None,
    names=['serviceid', 'file', 'perplexity'],
)

# 'ident' is the part of serviceid after the last '/'.
df_noov['ident'] = df_noov['serviceid'].map(lambda x: x[x.rfind('/')+1:])

# Temporarily store (ident, size_text) tuples in 'size'.  size_text is ident
# with '_m1' removed, sliced from four characters past the position of the
# first '_' in the original ident.
df_noov['size'] = df_noov['ident'].map(lambda x: (x, x.replace('_m1','')[x.find('_')+4:]))

# 'type' is ident with every occurrence of size_text stripped out.
df_noov['type'] = df_noov['size'].map(lambda x: x[0].replace(x[1],''))

# Replace the tuples by the numeric size: 'M' -> 'KK', then each 'K' -> '000'.
df_noov['size'] = df_noov['size'].map(lambda x: int(x[1].replace('M','KK').replace('K','000')))

# Reshape to one row per size and one column per type.  Keyword arguments are
# required: DataFrame.pivot is keyword-only in pandas >= 2.0.
df_noov = df_noov.pivot(index='size', columns='type', values='perplexity')

# Index.get_values() was removed in pandas 1.0; .values yields the same array.
print('type values: ', df_noov.columns.values)

#df_noov[['nf5_bkn_m1','nf5_pkn_m1']].plot(logx=True)
df_noov[['f3_pkn','nf3_pkn','f5_pkn','nf5_pkn']].plot(logx=True)
df_noov[['f3_pkn','nf3_pkn']].plot(logx=True)
df_noov.plot(logx=True)

# Display the pivoted table as the cell result.
df_noov
# Compare to Berkeley LM, using the "woov" evaluation output.
f_eval_woov = '/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/2015-01-29/test3.woov.txt'
df_woov = read_csv(
    f_eval_woov,
    sep='\t',
    header=None,
    names=['serviceid', 'file', 'perplexity'],
)

# 'ident' is the part of serviceid after the last '/'.
df_woov['ident'] = df_woov['serviceid'].map(lambda x: x[x.rfind('/')+1:])

# Temporarily store (ident, size_text) tuples in 'size'.  size_text is ident
# with '_m1' removed, sliced from four characters past the position of the
# first '_' in the original ident.
df_woov['size'] = df_woov['ident'].map(lambda x: (x, x.replace('_m1','')[x.find('_')+4:]))

# 'type' is ident with every occurrence of size_text stripped out.
df_woov['type'] = df_woov['size'].map(lambda x: x[0].replace(x[1],''))

# Replace the tuples by the numeric size: 'M' -> 'KK', then each 'K' -> '000'.
df_woov['size'] = df_woov['size'].map(lambda x: int(x[1].replace('M','KK').replace('K','000')))

# Reshape to one row per size and one column per type.  Keyword arguments are
# required: DataFrame.pivot is keyword-only in pandas >= 2.0.
df_woov = df_woov.pivot(index='size', columns='type', values='perplexity')

# Index.get_values() was removed in pandas 1.0; .values yields the same array.
print('type values: ', df_woov.columns.values)

# NOTE(review): the disabled line below still refers to df_noov, which looks
# like a leftover from the noov cell -- confirm before re-enabling.
#df_noov[['nf5_bkn_m1','nf5_pkn_m1']].plot(logx=True)
df_woov[['f3_pkn','nf3_pkn','f5_pkn','nf5_pkn']].plot(logx=True)
df_woov[['f3_pkn','nf3_pkn']].plot(logx=True)
df_woov.plot(logx=True)

# Display the pivoted table as the cell result.
df_woov
type values: ['f5_m1_woov' 'nf5_m1_woov' 'f5_m1_noov' 'nf5_m1_noov']
type f5_m1_woov nf5_m1_woov f5_m1_noov nf5_m1_noov
size
10000000 765.7 765.7 540.6 540.6
10100000 764.0 764.7 539.5 540.0
10300000 759.2 762.6 536.5 538.7
11000000 749.3 759.0 529.5 536.0
13000000 731.3 749.8 518.2 530.1
20000000 704.0 724.8 500.6 514.3
40000000 656.3 708.0 469.6 504.8
110000000 619.1 693.7 445.2 496.0
310000000 573.8 693.2 412.9 495.8
Out[19]:
Float64Index([10000000.0, 10100000.0, 10300000.0, 11000000.0, 13000000.0, 20000000.0, 40000000.0, 110000000.0, 310000000.0], dtype='float64')
Float64Index([10000000.0, 10100000.0, 10300000.0, 11000000.0, 13000000.0, 20000000.0, 40000000.0, 110000000.0, 310000000.0], dtype='float64')