In [4]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['load', 'datetime', 'info', 'unique', 'rc', 'save']
`%matplotlib` prevents importing * from pylab and numpy
  • focused: ~1TB data; ~400GB html
  • focused sentences: 49,642,285
  • focused ngrams (1-5): 3,589,782,965
  • non-focused: ~1TB data; ~700GB html
  • non-focused sentences: 190,623,576
  • non-focused ngrams (1-5): 10,813,167,232

In [5]:
# Plotting / data-handling imports shared by the cells below.
import matplotlib.pyplot as plt
from matplotlib import rc
# Optional font / LaTeX text configuration, currently disabled:
# #rc('font',**{'family':'sans-serif','sans-serif':['Helvetica'], 'size':16})
# ## for Palatino and other serif fonts use:
# #rc('font',**{'family':'serif','serif':['Palatino']})
# #rc('font',**{'family':'sans-serif','serif':['Computer Modern Sans serif']})
# rc('text', usetex=True)
# rc('font', size=16)
# rc('text', dvipnghack=True) # mac related
import numpy as np
# NOTE: the wildcard import is what brings `read_csv` into scope; the plotting
# cells below call it unqualified, so this line must stay as long as they do.
from pandas import *
import os
# Print the working directory before and after switching to the evaluation
# folder (a hard-coded, machine-specific absolute path).
print(os.getcwd())
os.chdir('/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval')
print(os.getcwd())


/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval
/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval

In [2]:

# produce linear splitted plot #f_eval = '/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/2015-01-15/lm_eval_out_1000.txt' f_eval = '/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/2015-01-15/lm_eval_out.txt' df = read_csv(f_eval, sep='\t', header=None, names=['serviceid', 'file', 'perplexity']) df['split'] = df['serviceid'].map(lambda x: int(x[x.find('_')+1])) df['type'] = df['serviceid'].map(lambda x: x[x.rfind('/')+1:x.find('_')]) df_r = df.pivot('split','type','perplexity') df_r.plot() plt.ylabel('Perplexity') df_r
# produce log splitted plot f_eval_woov = '/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/2015-01-29/lm_eval_out.woov.txt' f_eval_noov = '/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/2015-01-29/lm_eval_out.noov.txt' df_woov = read_csv(f_eval_woov, sep='\t', header=None, names=['serviceid', 'file', 'perplexity']) df_noov = read_csv(f_eval_noov, sep='\t', header=None, names=['serviceid', 'file', 'perplexity']) df_woov['split'] = df_woov['serviceid'].map(lambda x: int(x[x.find('_')+1:].replace('M','KK').replace('K','000'))) df_noov['split'] = df_noov['serviceid'].map(lambda x: int(x[x.find('_')+1:].replace('M','KK').replace('K','000'))) df_woov['type'] = df_woov['serviceid'].map(lambda x: x[x.rfind('/')+1:x.find('_')]) df_noov['type'] = df_noov['serviceid'].map(lambda x: x[x.rfind('/')+1:x.find('_')]) df_r_woov = df_woov.pivot('split','type','perplexity') df_r_noov = df_noov.pivot('split','type','perplexity') print('type values: ', df_r_woov.columns.get_values()) print('type values: ', df_r_noov.columns.get_values()) fig, axes = plt.subplots(nrows=2, ncols=2) df_r_woov[['f2','nf2']].plot(ax=axes[0,0],logx=True,legend=False) df_r_woov[['f5','nf5']].plot(ax=axes[0,1],logx=True,legend=False) df_r_noov[['f2','nf2']].plot(ax=axes[1,0],logx=True,legend=False) df_r_noov[['f5','nf5']].plot(ax=axes[1,1],logx=True,legend=False) for ax in axes.flat: ax.legend(loc='upper right')
# compare to berkeley lm f_eval_noov = '/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/2015-01-29/test3.noov.txt' df_noov = read_csv(f_eval_noov, sep='\t', header=None, names=['serviceid', 'file', 'perplexity']) df_noov['ident'] = df_noov['serviceid'].map(lambda x: x[x.rfind('/')+1:]) df_noov['size'] = df_noov['ident'].map(lambda x: (x, x.replace('_m1','')[x.find('_')+4:])) df_noov['type'] = df_noov['size'].map(lambda x: x[0].replace(x[1],'')) df_noov['size'] = df_noov['size'].map(lambda x: int(x[1].replace('M','KK').replace('K','000'))) df_noov = df_noov.pivot('size','type','perplexity') print('type values: ', df_noov.columns.get_values()) #df_noov[['nf5_bkn_m1','nf5_pkn_m1']].plot(logx=True) df_noov[['f3_pkn','nf3_pkn','f5_pkn','nf5_pkn']].plot(logx=True) df_noov[['f3_pkn','nf3_pkn']].plot(logx=True) df_noov.plot(logx=True) df_noov
# compare to berkeley lm f_eval_woov = '/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/2015-01-29/test3.woov.txt' df_woov = read_csv(f_eval_woov, sep='\t', header=None, names=['serviceid', 'file', 'perplexity']) df_woov['ident'] = df_woov['serviceid'].map(lambda x: x[x.rfind('/')+1:]) df_woov['size'] = df_woov['ident'].map(lambda x: (x, x.replace('_m1','')[x.find('_')+4:])) df_woov['type'] = df_woov['size'].map(lambda x: x[0].replace(x[1],'')) df_woov['size'] = df_woov['size'].map(lambda x: int(x[1].replace('M','KK').replace('K','000'))) df_woov = df_woov.pivot('size','type','perplexity') print('type values: ', df_woov.columns.get_values()) #df_noov[['nf5_bkn_m1','nf5_pkn_m1']].plot(logx=True) df_woov[['f3_pkn','nf3_pkn','f5_pkn','nf5_pkn']].plot(logx=True) df_woov[['f3_pkn','nf3_pkn']].plot(logx=True) df_woov.plot(logx=True) df_woov

In [6]:
# plot perplexities

# Evaluation result files: tab-separated, no header, columns
# (serviceid, file, perplexity). Exactly one woov/noov pair is active; the
# commented pairs are alternative runs kept for reference.

# f_eval_woov = '/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/2015-02-26/lmeval_1K_sbound_oovref_woov.txt'
# f_eval_noov = '/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/2015-02-26/lmeval_1K_sbound_oovref_noov.txt'

# (this pair used to be assigned and then immediately overwritten by the
# active pair below, so it never had any effect)
# f_eval_woov = '/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/2015-03-20/lmeval_1K_oovref_woov.txt'
# f_eval_noov = '/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/2015-03-20/lmeval_1K_oovref_noov.txt'

# f_eval_woov = '/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/2015-02-26/lmeval_1K_de_oovref_woov.txt'
# f_eval_noov = '/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/2015-02-26/lmeval_1K_de_oovref_noov.txt'

# f_eval_woov = '/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/2015-03-20/lmeval_de_oovref_woov.txt'
# f_eval_noov = '/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/2015-03-20/lmeval_de_oovref_noov.txt'

f_eval_woov = '/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/2015-02-26/lmeval_1K_depc_oovref_woov.txt'
f_eval_noov = '/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/2015-02-26/lmeval_1K_depc_oovref_noov.txt'

# Constant added to every parsed size before plotting; it also keeps the
# size-0 row strictly positive so it can be drawn on the logarithmic x axis.
# Original note: 1e5 amh, 1e7 pedocs
size_offset = 1e7


def _load_eval(path, suffix, offset):
    """Read one evaluation file and derive the columns needed for plotting.

    Parameters
    ----------
    path : str
        Tab-separated file without header; columns are read as
        ``serviceid``, ``file`` and ``perplexity``.
    suffix : str
        Text appended to every derived ``type`` value (``'_woov'`` /
        ``'_noov'``) so the two result sets stay distinguishable after joining.
    offset : number
        Added to every parsed size.

    Returns
    -------
    DataFrame
        The raw columns plus, in this order:
        ``ident`` (serviceid after its last ``/``),
        ``size``  (numeric size + ``offset``),
        ``xtick`` (the size label as written in the identifier, e.g. ``'100K'``),
        ``type``  (identifier without the ``_<size label>`` part, plus ``suffix``).
    """
    frame = read_csv(path,
                     sep='\t',
                     header=None,
                     names=['serviceid', 'file', 'perplexity'])

    def size_label(ident):
        # The '_m1s' / '_m1' marker is removed first, but the slice start is
        # still computed on the *unmodified* identifier (first '_' + 1).
        return ident.replace('_m1s', '').replace('_m1', '')[ident.find('_') + 1:]

    def to_number(label):
        # 'K' -> three zeros, 'M' -> six zeros (via the intermediate 'KK').
        return int(label.replace('M', 'KK').replace('K', '000'))

    frame['ident'] = frame['serviceid'].map(lambda sid: sid[sid.rfind('/') + 1:])
    labels = frame['ident'].map(size_label)
    frame['size'] = labels.map(lambda label: to_number(label) + offset)
    frame['xtick'] = labels
    frame['type'] = [ident.replace('_' + label, '') + suffix
                     for ident, label in zip(frame['ident'], labels)]
    return frame


df_woov = _load_eval(f_eval_woov, '_woov', size_offset)
df_noov = _load_eval(f_eval_noov, '_noov', size_offset)

# One row per size, one column per model type. Keyword arguments are required:
# positional arguments to DataFrame.pivot are rejected by pandas >= 2.0.
dfp_woov = df_woov.pivot(index='size', columns='type', values='perplexity')
dfp_noov = df_noov.pivot(index='size', columns='type', values='perplexity')

d = dfp_woov.join(dfp_noov)
# Index.get_values() was removed in pandas 1.0; .values yields the same array.
print('type values: ', d.columns.values)

ax = d[['f5_m1_woov','nf5_m1_woov','f5_m1_noov','nf5_m1_noov']].plot(
    logx=True, logy=False, legend=False, style=['bs:','ro-.', 'g+-', 'm>--'])

ax.legend(
    loc='lower center',
    labels=['$f (oov)$', '$nf (oov)$', '$f (no~oov)$', '$nf (no~oov)$'],
    bbox_to_anchor=(0.5, 1),
    ncol=2,
    fontsize=16)
ax.set_ylabel('$PP$')
# The nine tick labels are hard-coded and must match the number of rows in d.
plt.xticks(d.index, ['','','','$+1M$','','$+10M$','$+30M$','$+100M$','$+300M$'])
# Raw string: '\#' is not a valid Python escape sequence.
plt.xlabel(r'$additional~corpus~size~in~\#tokens$')
plt.ylim((400,1000))
print(d)

# plt.savefig('/Volumes/ExtendedHD/Users/stevo/git/ltbotpaper/pp_edu.pdf', bbox_inches='tight')
#plt.savefig('/Volumes/ExtendedHD/Users/stevo/git/ltbotpaper/pp_edu_dec.pdf', bbox_inches='tight')


type values:  ['f5_m1_woov' 'nf5_m1_woov' 'f5_m1_noov' 'nf5_m1_noov']
type       f5_m1_woov  nf5_m1_woov  f5_m1_noov  nf5_m1_noov
size                                                       
10000000        765.7        765.7       540.6        540.6
10100000        764.0        764.7       539.5        540.0
10300000        759.2        762.6       536.5        538.7
11000000        749.3        759.0       529.5        536.0
13000000        731.3        749.8       518.2        530.1
20000000        704.0        724.8       500.6        514.3
40000000        656.3        708.0       469.6        504.8
110000000       619.1        693.7       445.2        496.0
310000000       573.8        693.2       412.9        495.8

In [ ]:
# plot relative perplexity

# For each OOV setting, add to `d` (in place) two columns per model:
#   <model>_drop_<oov>      absolute difference  iv - PP
#   <model>_rel_drop_<oov>  -(drop / iv) * 100, i.e. the signed change in percent
# Both models are measured against the same baseline: the first-row value of
# the f5_m1 column of the respective OOV setting.
for oov in ('woov', 'noov'):
    # initial value
    iv = d.iloc[0].loc['f5_m1_' + oov]
    for model in ('nf5_m1', 'f5_m1'):
        drop_col = model + '_drop_' + oov
        d[drop_col] = iv - d[model + '_' + oov]
        d[model + '_rel_drop_' + oov] = (-(d[drop_col] / iv)) * 100

# Create the axes explicitly and hand them to pandas: DataFrame.plot() opens a
# figure of its own when no `ax` is given, so a bare plt.figure() beforehand
# would only leave an empty extra figure behind.
fig, ax = plt.subplots()
d[['f5_m1_rel_drop_woov','nf5_m1_rel_drop_woov','f5_m1_rel_drop_noov','nf5_m1_rel_drop_noov']].plot(
    ax=ax, logx=True, logy=False, legend=False, style=['bs:','ro-.', 'g+-', 'm>--'])
ax.legend(loc='lower left', labels=['focused (oov)', 'non-focused (oov)', 'focused (no oov)', 'non-focused (no oov)'])
ax.set_ylabel('% PP change')
# The nine tick labels are hard-coded and must match the number of rows in d.
plt.xticks(d.index, ['','','','+1M','','+10M','+30M','+100M','+300M'])
#plt.ylim((-50,50))

In [19]:



Out[19]:
Float64Index([10000000.0, 10100000.0, 10300000.0, 11000000.0, 13000000.0, 20000000.0, 40000000.0, 110000000.0, 310000000.0], dtype='float64')

In [8]:
# Plot the perplexities against hand-entered x positions per series instead of
# the nominal sizes stored in d.index (which is still used for the tick marks).
print(d.index)
# /Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/2015-02-26/lm_eval_de_pc_realsizes.txt
# # f:
# 11224     172870     100K_m1/sorted_sent_tok_de.txt
# 5677962   93799389   100M_m1/sorted_sent_tok_de.txt
# 560381    9122831    10M_m1/sorted_sent_tok_de.txt
# 56609     867686     1M_m1/sorted_sent_tok_de.txt
# 17083     267371     300K_m1/sorted_sent_tok_de.txt
# 16819241  277303711  300M_m1/sorted_sent_tok_de.txt
# 1661651   27783058   30M_m1/sorted_sent_tok_de.txt
# 170488    2661758    3M_m1/sorted_sent_tok_de.txt

# #
# # nf:
# 2498     34250     100K_m1/sorted_sent_tok_de.txt
# 1840409  26817917  100M_m1/sorted_sent_tok_de.txt
# 156302   2305689   10M_m1/sorted_sent_tok_de.txt
# 15155    216432    1M_m1/sorted_sent_tok_de.txt
# 6658     95167     300K_m1/sorted_sent_tok_de.txt
# 4194850  61122895  300M_m1/sorted_sent_tok_de.txt
# 647267   9452318   30M_m1/sorted_sent_tok_de.txt
# 38012    557515    3M_m1/sorted_sent_tok_de.txt


#d[['f5_m1_woov','nf5_m1_woov','f5_m1_noov','nf5_m1_noov']].plot(
#    logx=True, logy=False, legend=False, style=['bs:','ro-.', 'g+-', 'm>--'])

# Same constant that is added to the sizes stored in d.index.
base_offset = 10000000

# x positions per row of d, in ascending nominal size (0, 100K, 300K, ..., 300M).
# NOTE(review): the second entry of x_f (87105) does not match the 100K row of
# the "f" table above (172870); every other entry matches the second column of
# the tables. Confirm which value is correct.
x_f =np.array([0, 87105, 267371, 867686, 2661758, 9122831, 27783058, 93799389, 277303711]) + base_offset
x_nf=np.array([0, 34250, 95167, 216432, 557515, 2305689, 9452318, 26817917, 61122895]) + base_offset

fig, ax = plt.subplots()
# (x values, column of d, line format, legend label) -- drawn in legend order.
curves = [
    (x_f,  'f5_m1_woov',  'bs:',  '$f (oov)$'),
    (x_nf, 'nf5_m1_woov', 'ro-.', '$nf (oov)$'),
    (x_f,  'f5_m1_noov',  'g+-',  '$f (no~oov)$'),
    (x_nf, 'nf5_m1_noov', 'm>--', '$nf (no~oov)$'),
]
for xs, column, fmt, label in curves:
    ax.plot(xs, d[column], fmt, label=label)
#plt.plot(x_nf.T,m_nf,'ro-.')

ax.set_xscale('log')
ax.grid(True)

ax.legend(
    loc='lower center',
    bbox_to_anchor=(0.5, 1),
    ncol=2,
    fontsize=16)
ax.set_ylabel('$PP$')
# The nine tick labels are hard-coded and must match the number of rows in d.
plt.xticks(d.index, ['','','','$+1M$','','$+10M$','$+30M$','$+100M$','$+300M$'])
# Raw string: '\#' is not a valid Python escape sequence.
plt.xlabel(r'$additional~corpus~size~in~\#tokens$')
# Only the upper limit is set: a lower limit of 0 is not valid on a
# logarithmic axis.
ax.set_xlim(right=310000000)
plt.ylim((400,1000))
#plt.savefig('/Volumes/ExtendedHD/Users/stevo/git/ltbotpaper/pp_edu_dec_realsize.pdf', bbox_inches='tight')


Float64Index([10000000.0, 10100000.0, 10300000.0, 11000000.0, 13000000.0, 20000000.0, 40000000.0, 110000000.0, 310000000.0], dtype='float64')
Out[8]:
(400, 1000)

In [ ]: