In [50]:
# %pylab pulls numpy (np) and matplotlib.pylab (plt) into the namespace;
# use '%pylab inline' to render plots inside the notebook instead of a separate window.
%pylab


Using matplotlib backend: MacOSX
Populating the interactive namespace from numpy and matplotlib

In [2]:
# np and plt are already provided by %pylab above
from pandas import *

In [45]:
# Per-language sentence counts, one 'count lang' pair per line, for the four corpus views.
langdist_train = open('/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/lang/edu-f08-nf10/langdist_train.txt').readlines()
langdist_test = open('/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/lang/edu-f08-nf10/langdist_test.txt').readlines()
langdist_f = open('/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/lang/edu-f08-nf10/langdist_f.txt').readlines()
##langdist_nf = open('/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/lang/edu-f08-nf10/langdist_nf.txt').readlines()
langdist_nfl = open('/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/lang/edu-f08-nf10/langdist_nf_limit.txt').readlines()

In [46]:
# Split each 'count lang' line into a two-column DataFrame per corpus view.
langdist_train = DataFrame(list(map(lambda x : x.strip().split(' '), langdist_train)), columns=['train','lang'])
langdist_test = DataFrame(list(map(lambda x : x.strip().split(' '), langdist_test)), columns=['test','lang'])
langdist_f = DataFrame(list(map(lambda x : x.strip().split(' '), langdist_f)),columns=['focused','lang'])
##langdist_nf = DataFrame(list(map(lambda x : x.strip().split(' '), langdist_nf)),columns=['non-focused-all','lang'])
langdist_nfl = DataFrame(list(map(lambda x : x.strip().split(' '), langdist_nfl)),columns=['non-focused','lang'])
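
As an aside, the readlines/split round trip could be folded into pandas' own parser. A minimal sketch, assuming the same single-space-separated 'count lang' layout and a single header line (the `!= 'language1'` filter in the next cell suggests one); the helper name and the skiprows handling are illustrative, not part of the run above:

# Sketch: load one 'count lang' distribution file directly with read_csv.
def read_langdist(path, count_col):
    # skiprows=1 drops the assumed header line instead of filtering 'language1' later;
    # sep=' ' mirrors the split(' ') used above.
    return read_csv(path, sep=' ', header=None, skiprows=1,
                    names=[count_col, 'lang'])

# e.g. langdist_f = read_langdist('.../langdist_f.txt', 'focused')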

In [103]:
# Outer-join the four distributions on the language code.
langs=langdist_train
langs=merge(langs, langdist_test, on='lang', how='outer')
langs=merge(langs, langdist_f, on='lang', how='outer')
#langs=merge(langs, langdist_nf, on='lang', how='outer')
langs = merge(langs, langdist_nfl, on='lang', how='outer')
# Drop the header row and coerce the count columns to numbers
# (convert_objects and DataFrame.sort are older pandas APIs, see the sketch below).
langs = langs[langs['lang'] != 'language1']
langs = langs.convert_objects(convert_numeric=True)
langs.set_index('lang',inplace = True)
#print(langs)
print(langs.sort(columns=['non-focused'], ascending=False))
print('sums:')
print(langs.sum())
# Keep six selected languages and collapse everything else into 'other'.
langs_ = langs.loc[['en','de','fr','nl','es','it'],:]
langs_.loc['other'] = langs.sum() - langs_.sum()
print(langs_)
# Relative shares in percent, emitted as a LaTeX table.
print(((langs_ / langs_.sum())*100).to_latex(float_format=lambda x : '%2.3f' % x))
#langs=langs_


          train    test   focused  non-focused
lang                                          
en          NaN       1   2072747     33745522
de       415156  415223  42558617      7151482
fr         2435    2393    284636      1424965
es            4       7    118103       729684
it           25      22    146631       535220
unknown   10826   10794    341257       487198
pt            7       4     65691       440835
ca           27      33     38261       282032
nl          102      97    102348       255225
se           80      68     82804       241111
dk           22      22     86241       180228
so           17      18     19346       135321
tr           70      73     18003       126612
fi            8       7     62850        95958
no           10      12     23479        89790
cs            5       7     26300        59951
ee          NaN       1     10873        31204
sk            6      10      7720        30337
sl            2       8      8852        27824
hu          NaN     NaN     12129        15459
mt           15      17     10778         7117
lt            4       4      3370         5151
lv            2       2      2962         3572
jp          NaN     NaN      1074         3225
kr          NaN     NaN        34           52
gr          NaN     NaN        16           47
sums:
train            428823
test             428823
focused        46105122
non-focused    46105122
dtype: float64
        train    test   focused  non-focused
en        NaN       1   2072747     33745522
de     415156  415223  42558617      7151482
fr       2435    2393    284636      1424965
nl        102      97    102348       255225
es          4       7    118103       729684
it         25      22    146631       535220
other   11101   11080    822040      2263024
\begin{tabular}{lrrrr}
\toprule
{} &  train &   test &  focused &  non-focused \\
\midrule
en    &    nan &  0.000 &    4.496 &       73.193 \\
de    & 96.813 & 96.829 &   92.308 &       15.511 \\
fr    &  0.568 &  0.558 &    0.617 &        3.091 \\
nl    &  0.024 &  0.023 &    0.222 &        0.554 \\
es    &  0.001 &  0.002 &    0.256 &        1.583 \\
it    &  0.006 &  0.005 &    0.318 &        1.161 \\
other &  2.589 &  2.584 &    1.783 &        4.908 \\
\bottomrule
\end{tabular}
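
convert_objects and DataFrame.sort have since been removed from pandas. A minimal sketch of the same aggregation with the current API (to_numeric, sort_values), purely illustrative and not part of the run above:

# Sketch: the merge/clean/sort steps above with non-deprecated pandas calls.
langs2 = langdist_train
for extra in (langdist_test, langdist_f, langdist_nfl):
    langs2 = merge(langs2, extra, on='lang', how='outer')
langs2 = langs2[langs2['lang'] != 'language1']                  # drop the header row
langs2 = langs2.set_index('lang').apply(to_numeric, errors='coerce')
print(langs2.sort_values(by='non-focused', ascending=False))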


In [89]:
# Absolute sentence counts per language, one bar group per language.
langs.plot(kind='bar',figsize=(18, 5),width=.9)
plt.ylabel('number of sentences')
plt.xlabel('language ISO-639-1')
#plt.savefig('/Volumes/ExtendedHD/Users/stevo/git/ltbotpaper/langdist_f_vs_nf.pdf', bbox_inches='tight')


Out[89]:
<matplotlib.text.Text at 0x10e1b4940>

In [90]:
# Normalise each column by its total to compare relative distributions.
langsrel=langs/langs.sum()
langsrel.plot(kind='bar',figsize=(18, 5),width=.9)
plt.ylabel('sentences in percent')
plt.xlabel('language ISO-639-1')


Out[90]:
<matplotlib.text.Text at 0x10e283ef0>

In [35]:
with open('/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/lang/edu-f08-nf10/sent_lang_f.txt') as f:
    next(f)  # skip the header line
    langsents_f = f.readlines(int(1e5))  # read roughly the first 1e5 characters' worth of lines
with open('/Volumes/ExtendedHD/Users/stevo/Workspaces/lm/eval/lang/edu-f08-nf10/sent_lang_nf.txt') as f:
    next(f)  # skip the header line
    langsents_nf = f.readlines(int(1e5))
print(langsents_f[:10])
print(langsents_nf[:10])


['de\t\n', 'de\t\n', 'de\t\n', 'de\t\n', 'de\t\n', 'de\t\n', 'de\t\n', 'de\t\n', 'en\t\n', 'en\t\n']
['en\t\n', 'en\t\n', 'en\t\n', 'en\t\n', 'en\t\n', 'en\t\n', 'en\t\n', 'en\t\n', 'en\t\n', 'en\t\n']
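
The size-hinted readlines(int(1e5)) above takes however many sentences fit into roughly 1e5 characters. If a fixed sentence count is preferable, a small helper like the following sketch would do (the name and the n=20000 default are illustrative):

# Sketch: read a fixed number of sentence-level language tags per view.
from itertools import islice

def read_sent_langs(path, n=20000):
    with open(path) as f:
        next(f)  # skip the header line, as above
        return Series([line.strip() for line in islice(f, n)], name='lang')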

In [36]:
# Strip the trailing tab/newline so only the ISO-639-1 tag remains, one row per sentence.
langsents = DataFrame(list(map(lambda x : x.strip(), langsents_f)), columns=['lang_f'])
langsents['lang_nf'] = DataFrame(list(map(lambda x : x.strip(), langsents_nf)), columns=['lang_nf'])

In [37]:
vals=unique(langsents.values.ravel())          # every language code seen in either column
vald=dict([(x,i) for i,x in enumerate(vals)])  # code -> integer position for plotting
langsents_num = langsents.replace(vald)
print(vald)


{nan: 23, 'se': 17, 'sk': 21, 'lv': 15, 'mt': 16, 'pt': 10, 'tr': 9, 'no': 5, 'lt': 14, 'sl': 22, 'cs': 6, 'jp': 19, 'so': 20, 'es': 13, 'it': 8, 'dk': 7, 'en': 1, 'nl': 11, 'fr': 4, 'de': 0, 'fi': 12, 'ee': 18, 'unknown': 2, 'ca': 3}
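
The unique/dict/replace combination above can also be expressed with pandas' factorize, which builds the integer codes and the label list in one call. A sketch (note that factorize assigns -1 to NaN rather than a regular code, unlike the dictionary above; langsents_num2 is just an illustrative name):

# Sketch: integer codes for plotting via factorize instead of a hand-built dict.
codes, labels = factorize(langsents.values.ravel())
langsents_num2 = DataFrame(codes.reshape(langsents.shape), columns=langsents.columns)
# 'labels' holds the distinct tags in order of first appearance and could feed
# plt.yticks(np.arange(len(labels)), labels) in the plots below.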

In [41]:
# One dot per sentence: x = position in the time-ordered stream, y = language tag.
langsents_num['lang_f'].plot(style=['g.'], figsize=(18,5))
plt.yticks(np.arange(len(vals)),vals)
plt.ylim((-1,len(vals)+1))
plt.figure()
langsents_num['lang_nf'].plot(style=['r.'], figsize=(18,5))
plt.yticks(np.arange(len(vals)),vals)
plt.ylim((-1,len(vals)+1))
plt.xlabel('sentence by time')
plt.ylabel('language ISO-639-1')


Out[41]:
<matplotlib.text.Text at 0x10c1abeb8>
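
The two dot plots could also share one figure and a common y axis, which makes the focused/non-focused comparison easier to eyeball. A sketch using subplots (the layout values are illustrative):

# Sketch: both views as stacked dot plots with shared axes.
fig, axes = plt.subplots(2, 1, figsize=(18, 10), sharex=True, sharey=True)
langsents_num['lang_f'].plot(ax=axes[0], style='g.')
langsents_num['lang_nf'].plot(ax=axes[1], style='r.')
for ax in axes:
    ax.set_yticks(np.arange(len(vals)))
    ax.set_yticklabels(vals)
    ax.set_ylim((-1, len(vals)+1))
    ax.set_ylabel('language ISO-639-1')
axes[1].set_xlabel('sentence by time')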
