Kostic, D. 1999. Frekvencijski recnik savremenog srpskog jezika (Frequency Dictionary of Contemporary Serbian Language). Institute for Experimental Phonetics and Speech Pathology & Laboratory of Experimental Psychology, University of Belgrade, Serbia.
Baayen, R. H., Milin, P., Filipovic Durdevic, D., Hendrix, P. and Marelli, M. 2011. "An amorphous model for morphological processing in visual comprehension based on naive discriminative learning." Psychological Review 118:438-482.
In [1]:
import pandas as pd
import pandas.rpy.common as com
import numpy as np
from sklearn.feature_extraction import DictVectorizer
%load_ext autoreload
%autoreload 2
%load_ext rmagic
%precision 2
Out[1]:
u'%.2f'
In [2]:
from ndl import *
In [3]:
%%R
library(ndl)
This is ndl version 0.2.16.
For an overview of the package, type 'help("ndl.package")'.
In [4]:
serbian = com.load_data('serbian')
serbian['Cues'] = orthoCoding(serbian.WordForm,grams=2)
serbian['Outcomes'] = [tuple(c.split('_')) for c in serbian.LemmaCase]
serbian.head()
Out[4]:
WordForm
LemmaCase
Frequency
Cues
Outcomes
1
yena
yena_nom_Sg
576
(#y, ye, en, na, a#)
(yena, nom, Sg)
2
yene
yena_gen_Sg
229
(#y, ye, en, ne, e#)
(yena, gen, Sg)
3
yeni
yena_dat_Sg
55
(#y, ye, en, ni, i#)
(yena, dat, Sg)
4
yenu
yena_acc_Sg
167
(#y, ye, en, nu, u#)
(yena, acc, Sg)
5
yenom
yena_ins_Sg
39
(#y, ye, en, no, om, m#)
(yena, ins, Sg)
5 rows × 5 columns
In [5]:
sw = ndl(serbian)
In [6]:
sw.head()
Out[6]:
Pl
Sg
acc
akademija
aparat
bitka
bog
boja
bol
bor
borac
brazda
brdo
brid
briga
brigada
brod
bura
cena
cesta
#a
-0.467714
0.952442
0.326035
0.471536
0.506614
-0.001006
-0.032020
0.037598
-0.028174
0.015433
-0.011195
0.006800
0.003051
0.000134
0.015739
-0.039569
0.017378
-0.002766
0.030046
-0.052284
...
#b
-0.069955
0.556007
0.095385
-0.005611
-0.012985
0.079436
0.121653
0.066181
0.254183
0.097360
0.109091
0.027724
-0.003189
0.009077
0.036710
0.022223
0.106195
0.000795
-0.000225
-0.019009
...
#c
-0.099439
0.585776
0.110130
0.002622
-0.021386
0.010436
-0.004947
0.016928
-0.016833
-0.005711
-0.021122
-0.009816
0.002113
-0.007284
0.023586
-0.012067
0.007555
0.000627
0.577554
0.344233
...
#d
-0.017994
0.500962
0.077627
-0.014584
-0.004215
-0.002597
0.015613
-0.035485
0.024504
0.006509
0.007348
-0.012459
-0.045132
-0.002656
0.027050
-0.018789
0.013864
0.000444
-0.025700
0.006292
...
#e
-0.465341
0.925997
-0.093245
0.038236
-0.056175
0.024248
0.112225
-0.042413
-0.031564
-0.030352
-0.037280
-0.028162
0.002051
0.010243
0.078581
0.000173
-0.073940
0.000415
-0.023138
0.004618
...
5 rows × 278 columns
In [7]:
num = ['Sg','Pl']
case = ['nom','gen','dat','acc','ins','loc']
infl = num + case
predict = [ ]
for cue in serbian.Cues:
A = activation(cue,sw)
A.sort(ascending=False)
res = [ None, None, None ]
for ind in A.index:
if ind in num:
res[2] = ind
elif ind in case:
res[1] = ind
else:
res[0] = ind
if not None in res:
break
predict.append(tuple(res))
serbian['Predicted'] = predict
In [8]:
serbian
Out[8]:
WordForm
LemmaCase
Frequency
Cues
Outcomes
Predicted
1
yena
yena_nom_Sg
576
(#y, ye, en, na, a#)
(yena, nom, Sg)
(yena, nom, Sg)
2
yene
yena_gen_Sg
229
(#y, ye, en, ne, e#)
(yena, gen, Sg)
(yena, nom, Pl)
3
yeni
yena_dat_Sg
55
(#y, ye, en, ni, i#)
(yena, dat, Sg)
(yena, nom, Sg)
4
yenu
yena_acc_Sg
167
(#y, ye, en, nu, u#)
(yena, acc, Sg)
(yena, acc, Sg)
5
yenom
yena_ins_Sg
39
(#y, ye, en, no, om, m#)
(yena, ins, Sg)
(yena, ins, Sg)
6
yeni
yena_loc_Sg
16
(#y, ye, en, ni, i#)
(yena, loc, Sg)
(yena, nom, Sg)
7
yene
yena_nom_Pl
415
(#y, ye, en, ne, e#)
(yena, nom, Pl)
(yena, nom, Pl)
8
yena
yena_gen_Pl
336
(#y, ye, en, na, a#)
(yena, gen, Pl)
(yena, nom, Sg)
9
yenama
yena_dat_Pl
33
(#y, ye, en, na, am, ma, a#)
(yena, dat, Pl)
(yena, loc, Pl)
10
yene
yena_acc_Pl
136
(#y, ye, en, ne, e#)
(yena, acc, Pl)
(yena, nom, Pl)
11
yenama
yena_ins_Pl
24
(#y, ye, en, na, am, ma, a#)
(yena, ins, Pl)
(yena, loc, Pl)
12
yenama
yena_loc_Pl
4
(#y, ye, en, na, am, ma, a#)
(yena, loc, Pl)
(yena, loc, Pl)
13
yeqa
yeqa_nom_Sg
179
(#y, ye, eq, qa, a#)
(yeqa, nom, Sg)
(yeqa, nom, Sg)
14
yeqe
yeqa_gen_Sg
54
(#y, ye, eq, qe, e#)
(yeqa, gen, Sg)
(yeqa, gen, Sg)
15
yeqi
yeqa_dat_Sg
7
(#y, ye, eq, qi, i#)
(yeqa, dat, Sg)
(yeqa, loc, Sg)
16
yequ
yeqa_acc_Sg
95
(#y, ye, eq, qu, u#)
(yeqa, acc, Sg)
(yeqa, acc, Sg)
17
yeqom
yeqa_ins_Sg
30
(#y, ye, eq, qo, om, m#)
(yeqa, ins, Sg)
(yeqa, ins, Sg)
18
yeqi
yeqa_loc_Sg
43
(#y, ye, eq, qi, i#)
(yeqa, loc, Sg)
(yeqa, loc, Sg)
19
yeqe
yeqa_nom_Pl
102
(#y, ye, eq, qe, e#)
(yeqa, nom, Pl)
(yeqa, gen, Sg)
20
yeqa
yeqa_gen_Pl
164
(#y, ye, eq, qa, a#)
(yeqa, gen, Pl)
(yeqa, nom, Sg)
21
yeqama
yeqa_dat_Pl
3
(#y, ye, eq, qa, am, ma, a#)
(yeqa, dat, Pl)
(yeqa, loc, Pl)
22
yeqe
yeqa_acc_Pl
84
(#y, ye, eq, qe, e#)
(yeqa, acc, Pl)
(yeqa, gen, Sg)
23
yeqama
yeqa_ins_Pl
14
(#y, ye, eq, qa, am, ma, a#)
(yeqa, ins, Pl)
(yeqa, loc, Pl)
24
yeqama
yeqa_loc_Pl
7
(#y, ye, eq, qa, am, ma, a#)
(yeqa, loc, Pl)
(yeqa, loc, Pl)
25
yivot
yivot_nom_Sg
991
(#y, yi, iv, vo, ot, t#)
(yivot, nom, Sg)
(yivot, nom, Sg)
26
yivota
yivot_gen_Sg
1004
(#y, yi, iv, vo, ot, ta, a#)
(yivot, gen, Sg)
(yivot, gen, Sg)
27
yivotu
yivot_dat_Sg
100
(#y, yi, iv, vo, ot, tu, u#)
(yivot, dat, Sg)
(yivot, loc, Sg)
28
yivot
yivot_acc_Sg
799
(#y, yi, iv, vo, ot, t#)
(yivot, acc, Sg)
(yivot, nom, Sg)
29
yivotom
yivot_ins_Sg
142
(#y, yi, iv, vo, ot, to, om, m#)
(yivot, ins, Sg)
(yivot, ins, Sg)
30
yivotu
yivot_loc_Sg
248
(#y, yi, iv, vo, ot, tu, u#)
(yivot, loc, Sg)
(yivot, loc, Sg)
31
yivoti
yivot_nom_Pl
22
(#y, yi, iv, vo, ot, ti, i#)
(yivot, nom, Pl)
(yivot, gen, Sg)
32
yivota
yivot_gen_Pl
30
(#y, yi, iv, vo, ot, ta, a#)
(yivot, gen, Pl)
(yivot, gen, Sg)
33
yivotima
yivot_dat_Pl
3
(#y, yi, iv, vo, ot, ti, im, ma, a#)
(yivot, dat, Pl)
(yivot, ins, Pl)
34
yivote
yivot_acc_Pl
52
(#y, yi, iv, vo, ot, te, e#)
(yivot, acc, Pl)
(yivot, gen, Sg)
35
yivotima
yivot_ins_Pl
5
(#y, yi, iv, vo, ot, ti, im, ma, a#)
(yivot, ins, Pl)
(yivot, ins, Pl)
36
yivotima
yivot_loc_Pl
2
(#y, yi, iv, vo, ot, ti, im, ma, a#)
(yivot, loc, Pl)
(yivot, ins, Pl)
37
{etwa
{etwa_nom_Sg
33
(#{, {e, et, tw, wa, a#)
({etwa, nom, Sg)
({etwa, gen, Sg)
38
{etwe
{etwa_gen_Sg
10
(#{, {e, et, tw, we, e#)
({etwa, gen, Sg)
({etwa, nom, Sg)
39
{etwi
{etwa_dat_Sg
1
(#{, {e, et, tw, wi, i#)
({etwa, dat, Sg)
({etwa, nom, Pl)
40
{etwu
{etwa_acc_Sg
29
(#{, {e, et, tw, wu, u#)
({etwa, acc, Sg)
({etwa, loc, Sg)
41
{etwom
{etwa_ins_Sg
5
(#{, {e, et, tw, wo, om, m#)
({etwa, ins, Sg)
({etwa, ins, Sg)
42
{etwi
{etwa_loc_Sg
12
(#{, {e, et, tw, wi, i#)
({etwa, loc, Sg)
({etwa, nom, Pl)
43
{etwe
{etwa_nom_Pl
6
(#{, {e, et, tw, we, e#)
({etwa, nom, Pl)
({etwa, nom, Sg)
44
{etwi
{etwa_gen_Pl
5
(#{, {e, et, tw, wi, i#)
({etwa, gen, Pl)
({etwa, nom, Pl)
45
{etwama
{etwa_dat_Pl
1
(#{, {e, et, tw, wa, am, ma, a#)
({etwa, dat, Pl)
({etwa, ins, Pl)
46
{etwe
{etwa_acc_Pl
11
(#{, {e, et, tw, we, e#)
({etwa, acc, Pl)
({etwa, nom, Sg)
47
{etwama
{etwa_ins_Pl
2
(#{, {e, et, tw, wa, am, ma, a#)
({etwa, ins, Pl)
({etwa, ins, Pl)
48
{etwama
{etwa_loc_Pl
2
(#{, {e, et, tw, wa, am, ma, a#)
({etwa, loc, Pl)
({etwa, ins, Pl)
49
{irina
{irina_nom_Sg
16
(#{, {i, ir, ri, in, na, a#)
({irina, nom, Sg)
({irina, gen, Sg)
50
{irine
{irina_gen_Sg
28
(#{, {i, ir, ri, in, ne, e#)
({irina, gen, Sg)
({irina, acc, Pl)
51
{irini
{irina_dat_Sg
3
(#{, {i, ir, ri, in, ni, i#)
({irina, dat, Sg)
({irina, loc, Sg)
52
{irinu
{irina_acc_Sg
17
(#{, {i, ir, ri, in, nu, u#)
({irina, acc, Sg)
({irina, acc, Sg)
53
{irinom
{irina_ins_Sg
20
(#{, {i, ir, ri, in, no, om, m#)
({irina, ins, Sg)
({irina, ins, Sg)
54
{irini
{irina_loc_Sg
17
(#{, {i, ir, ri, in, ni, i#)
({irina, loc, Sg)
({irina, loc, Sg)
55
{irine
{irina_nom_Pl
11
(#{, {i, ir, ri, in, ne, e#)
({irina, nom, Pl)
({irina, acc, Pl)
56
{irina
{irina_gen_Pl
12
(#{, {i, ir, ri, in, na, a#)
({irina, gen, Pl)
({irina, gen, Sg)
57
{irinama
{irina_dat_Pl
2
(#{, {i, ir, ri, in, na, am, ma, a#)
({irina, dat, Pl)
({irina, loc, Pl)
58
{irine
{irina_acc_Pl
23
(#{, {i, ir, ri, in, ne, e#)
({irina, acc, Pl)
({irina, acc, Pl)
59
{irinama
{irina_ins_Pl
2
(#{, {i, ir, ri, in, na, am, ma, a#)
({irina, ins, Pl)
({irina, loc, Pl)
60
{irinama
{irina_loc_Pl
3
(#{, {i, ir, ri, in, na, am, ma, a#)
({irina, loc, Pl)
({irina, loc, Pl)
...
...
...
...
...
...
3240 rows × 6 columns
In [9]:
sum(serbian.Outcomes == serbian.Predicted) / float(len(serbian.index))
Out[9]:
0.37
In [ ]:
Content source: rmalouf/learning
Similar notebooks: