In [50]:
%pylab
# %pylab pulls numpy (np) and matplotlib.pyplot (plt) into the namespace;
# use "%pylab inline" to render the figures inside the notebook instead of in separate windows
In [2]:
# np and plt are already provided by %pylab above
from pandas import *
In [45]:
# per-language sentence counts for the train/test splits and the focused vs. non-focused sets
langdist_train = open('/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/lang/edu-f08-nf10/langdist_train.txt').readlines()
langdist_test = open('/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/lang/edu-f08-nf10/langdist_test.txt').readlines()
langdist_f = open('/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/lang/edu-f08-nf10/langdist_f.txt').readlines()
##langdist_nf = open('/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/lang/edu-f08-nf10/langdist_nf.txt').readlines()
langdist_nfl = open('/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/lang/edu-f08-nf10/langdist_nf_limit.txt').readlines()
In [46]:
langdist_train = DataFrame(list(map(lambda x : x.strip().split(' '), langdist_train)), columns=['train','lang'])
langdist_test = DataFrame(list(map(lambda x : x.strip().split(' '), langdist_test)), columns=['test','lang'])
langdist_f = DataFrame(list(map(lambda x : x.strip().split(' '), langdist_f)),columns=['focused','lang'])
##langdist_nf = DataFrame(list(map(lambda x : x.strip().split(' '), langdist_nf)),columns=['non-focused-all','lang'])
langdist_nfl = DataFrame(list(map(lambda x : x.strip().split(' '), langdist_nfl)),columns=['non-focused','lang'])
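The langdist files are assumed to hold one whitespace-separated '<count> <iso-code>' pair per line, which is why the split above yields exactly the two columns named. A minimal sketch on a synthetic sample (the counts and codes are made up for illustration):
In [ ]:
# hedged sketch: synthetic lines in the assumed '<count> <iso-code>' format
sample = ['12345 en\n', '678 de\n', '90 fr\n']
print(DataFrame([l.strip().split(' ') for l in sample], columns=['count', 'lang']))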
In [103]:
# merge the per-language counts into one table, indexed by language code
langs = langdist_train
langs = merge(langs, langdist_test, on='lang', how='outer')
langs = merge(langs, langdist_f, on='lang', how='outer')
#langs = merge(langs, langdist_nf, on='lang', how='outer')
langs = merge(langs, langdist_nfl, on='lang', how='outer')
langs = langs[langs['lang'] != 'language1']  # drop the header row
langs.set_index('lang', inplace=True)
langs = langs.apply(to_numeric, errors='coerce')  # counts were read as strings
#print(langs)
print(langs.sort_values(by='non-focused', ascending=False))
print('sums:')
print(langs.sum())
# keep the six most relevant languages, collapse the rest into 'other'
langs_ = langs.loc[['en', 'de', 'fr', 'nl', 'es', 'it'], :].copy()
langs_.loc['other'] = langs.sum() - langs_.sum()
print(langs_)
print(((langs_ / langs_.sum()) * 100).to_latex(float_format=lambda x: '%2.3f' % x))
#langs=langs_
In [89]:
langs.plot(kind='bar',figsize=(18, 5),width=.9)
plt.ylabel('number of sentences')
plt.xlabel('language ISO-639-1')
#plt.savefig('/Volumes/ExtendedHD/Users/stevo/git/ltbotpaper/langdist_f_vs_nf.pdf', bbox_inches='tight')
Out[89]:
[figure: grouped bar chart of absolute sentence counts per language for train, test, focused and non-focused]
In [90]:
# same distribution, normalised to percentages per column
langsrel = langs / langs.sum() * 100
langsrel.plot(kind='bar', figsize=(18, 5), width=.9)
plt.ylabel('sentences in percent')
plt.xlabel('language (ISO 639-1)')
Out[90]:
[figure: grouped bar chart of per-language sentence percentages for train, test, focused and non-focused]
In [35]:
with open('/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/lang/edu-f08-nf10/sent_lang_f.txt') as f:
for i in range(int(1e0)): next(f);
langsents_f = f.readlines(int(1e5))
with open('/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/lang/edu-f08-nf10/sent_lang_nf.txt') as f:
for i in range(int(1e0)): next(f);
langsents_nf = f.readlines(int(1e5))
print(langsents_f[:10])
print(langsents_nf[:10])
In [36]:
# one row per sentence: language tag in the focused (lang_f) and non-focused (lang_nf) stream;
# Series assignment aligns by index in case the two streams differ in length
langsents = DataFrame([x.strip() for x in langsents_f], columns=['lang_f'])
langsents['lang_nf'] = Series([x.strip() for x in langsents_nf])
In [37]:
# map every observed language code to an integer so the tags can be plotted on a numeric axis
vals = np.unique(langsents.values.ravel())
vald = {x: i for i, x in enumerate(vals)}
langsents_num = langsents.replace(vald)
print(vald)
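For reference, a minimal sketch of what this mapping does on a toy frame (the codes here are made up for illustration):
In [ ]:
# hedged sketch on synthetic tags: replace() turns each code into its integer id
toy = DataFrame({'lang_f': ['en', 'de', 'en'], 'lang_nf': ['fr', 'en', 'de']})
toy_vals = np.unique(toy.values.ravel())
print(toy.replace({x: i for i, x in enumerate(toy_vals)}))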
In [41]:
# language of each sentence over time: focused stream in green, non-focused in red
langsents_num['lang_f'].plot(style='g.', figsize=(18, 5))
plt.yticks(np.arange(len(vals)), vals)
plt.ylim((-1, len(vals) + 1))
plt.xlabel('sentence by time')
plt.ylabel('language (ISO 639-1)')
plt.figure()
langsents_num['lang_nf'].plot(style='r.', figsize=(18, 5))
plt.yticks(np.arange(len(vals)), vals)
plt.ylim((-1, len(vals) + 1))
plt.xlabel('sentence by time')
plt.ylabel('language (ISO 639-1)')
Out[41]:
[figures: per-sentence language over time, focused (green) and non-focused (red) streams]