In [ ]:
In [346]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os, sys
import numpy as np
import csv
import pandas as pd
sys.path.append("./pylinguistics/")
import Pylinguistics as pl
import json
import csv
import gc
from pandas import read_csv
# Location of the full FAPESP feature table. The original machine-specific path
# stays the default; set the FAPESP_FEATURES_CSV environment variable to run
# the notebook elsewhere.
FAPESP_FEATURES_CSV = os.environ.get(
    'FAPESP_FEATURES_CSV',
    '/home/vinicius/phd/experiments/fapesp/fapesp_features.csv',
)

# Row labels of the 20 documents kept for the comparison.
SELECTED_ROWS = [
    7, 55, 167, 201, 204, 267, 475, 807, 897, 1067,
    1152, 1157, 1486, 1568, 1618, 2185, 2481, 2537, 2611, 2807,
]

fapespall = pd.read_csv(FAPESP_FEATURES_CSV)
# 'Unnamed: 0' is presumably an index column written by an earlier to_csv();
# it is dropped without mutating the frame in place so the cell can be re-run.
fapespall = fapespall.drop('Unnamed: 0', axis=1)

# One label-based selection replaces twenty single-row frames glued together
# with concat: same rows, same order, same index labels, and the column dtypes
# are kept instead of every row being rebuilt from a transposed Series.
fapespResult = fapespall.loc[SELECTED_ROWS]
fapespResult.to_csv('pylinguisticsResults.csv', sep='\t')
In [347]:
# Both result tables are tab-separated files in the working directory.
TAB = '\t'
pyling = pd.read_csv('pylinguisticsResults.csv', sep=TAB)
cohmetrix = pd.read_csv('cohmetrixResults.csv', sep=TAB)
def calcMse(a, b):
    """Return the squared difference between ``a`` and ``b``.

    Despite the name, nothing is averaged here: the result is the squared
    error of a single pair of values.
    """
    diff = a - b
    return diff * diff
def calcPmedia(a, b):
    """Return the difference of ``a`` from ``b`` as a percentage of ``b``.

    Computes ``100 * a / b - 100``, so the result is 0 when both values are
    equal. ``b`` goes through ``float()`` first, which keeps the division from
    being an integer division.
    """
    scaled = 100 * a
    reference = float(b)
    return scaled / reference - 100
# Maps each Pylinguistics column name (key) to the Coh-Metrix column it is
# compared against (value). The key spellings, including 'redability' and
# 'LexicalDiversty', are the actual column names and must stay as they are.
dic = {
    # part-of-speech and readability features
    'adjectiveIncidence': 'basic_counts:adjectives',
    'advIncidence': 'basic_counts:adverbs',
    'contentIncidence': 'basic_counts:content_words',
    'redability': 'basic_counts:flesch',
    'functionalIncidence': 'basic_counts:function_words',
    'avg_word_per_sentence': 'basic_counts:words_per_sentence',
    'nounIncidence': 'basic_counts:nouns',
    # raw counts
    'sentence_count': 'basic_counts:sentences',
    'word_count': 'basic_counts:words',
    'pronIncidence': 'basic_counts:pronouns',
    'verbIncidence': 'basic_counts:verbs',
    # connectives, lexical diversity, logic operators
    'ConnectiveIncidence': 'connectives:conn_incidence',
    'LexicalDiversty': 'Tokens:ttr',
    'LogicOperatorsIncidence': 'logic_operators:logic_operators',
}
# Row-by-row comparison of the two tables. Rows are paired by index label, so
# row i of pyling is compared with row i of cohmetrix. Each list receives one
# dict per row, keyed by the Pylinguistics column name.
result = []    # squared differences (calcMse)
resultp = []   # percent differences relative to the cohmetrix value (calcPmedia)
for index in pyling.index:
    mse = {}
    pmedia = {}
    for key, coh_key in dic.items():
        # Scalar .loc[row, column] lookups: the previous chained
        # .loc[index][key] form built a whole row Series twice per feature,
        # and the row yielded by iterrows() was never used.
        ours = pyling.loc[index, key]
        theirs = cohmetrix.loc[index, coh_key]
        mse[key] = calcMse(ours, theirs)
        pmedia[key] = calcPmedia(ours, theirs)
    result.append(mse)
    resultp.append(pmedia)
In [348]:
# Tabulate the per-row percent differences: one row per dict in resultp,
# one column per feature key.
dfresult = pd.DataFrame(data=resultp)
dfresult.head(n=20)
Out[348]:
ConnectiveIncidence
LexicalDiversty
adjectiveIncidence
advIncidence
avg_word_per_sentence
contentIncidence
functionalIncidence
nounIncidence
pronIncidence
redability
sentence_count
verbIncidence
word_count
0
-15.789474
0.000000
66.666667
-9.090909
0.000000
-11.229947
12.631579
-18.181818
-15.088757
27.041368
0.000000
-13.793103
0.000000
1
-32.886923
-2.485189
11.440446
-31.712444
11.863557
-8.706476
0.531322
-10.968774
-6.712293
41.292408
-10.000000
-10.410622
0.677201
2
-52.941176
0.000000
54.166667
-11.111111
5.263158
-15.360502
7.608696
-19.730942
-2.501840
-4.360509
-5.000000
-26.984127
0.000000
3
-47.708886
-0.110840
25.651908
-13.979986
38.787499
-9.413587
6.819667
-13.086046
-4.990621
-16.327457
-27.659574
-24.112669
0.399467
4
-48.000000
-1.117318
23.076923
66.666667
0.000000
-4.191617
-3.225806
-10.091743
-10.000000
17.624334
0.000000
-13.793103
0.000000
5
-8.919753
4.579946
3157.870370
5.092593
-4.845815
-15.826312
189.004630
-24.896609
-15.456238
18.061547
0.000000
12.098765
-4.845815
6
-41.145997
-2.038565
32.082833
-11.292517
-5.069422
-8.064719
1.875000
-11.782614
-11.891892
4.320654
5.555556
-16.347539
0.204499
7
-28.972713
-1.284985
4.173355
-27.078652
-4.006163
-9.514433
-1.917773
-3.735357
-10.152284
-16.134081
4.761905
-22.278187
0.564972
8
-34.848623
-0.842783
17.227390
-16.963932
6.630370
-7.865479
-0.061916
-14.313269
-6.077210
-3.593980
-5.882353
1.471599
0.357995
9
-15.848261
-1.019282
30.530822
-20.438356
0.550964
-5.732649
0.936414
-7.222580
-15.972222
23.393607
0.000000
-11.598174
0.550964
10
-54.358836
-1.231555
25.380383
-34.549303
13.938990
-10.043486
-0.629598
-7.087827
-7.448487
-8.006978
-12.000000
-26.079213
0.266312
11
-39.536683
-1.809162
15.888024
-32.273233
9.061217
-12.208909
5.504143
-11.076512
4.667678
6.681690
-7.692308
-21.190671
0.671892
12
-53.155498
-2.031687
53.479624
-55.862069
11.882716
-13.385580
11.724138
-18.930331
-2.587519
-3.702174
-10.000000
-20.551724
0.694444
13
-28.029200
-2.508872
21.309274
-17.806075
2.162406
-10.826234
3.679090
-17.806075
-8.039937
10.857557
-2.564103
-4.636516
-0.457143
14
-21.907199
-0.713972
30.846981
-18.999488
8.227061
-10.242676
1.077562
-11.413437
-1.866334
7.679310
-7.317073
-23.764224
0.308008
15
-61.668322
-3.026838
14.762722
-7.093454
8.436296
-8.533798
1.214638
-11.492281
-5.052319
-0.948673
-7.142857
-12.846991
0.690846
16
-34.838643
-0.381739
19.245283
-21.188462
9.018987
-10.544940
0.329633
-10.245486
-7.776120
23.928010
-7.692308
-24.748122
0.632911
17
-41.136432
-1.707244
16.268344
-18.603591
1.489362
-10.557219
-0.102789
-11.181977
1.848863
31.327806
0.000000
-17.025268
1.489362
18
-52.481840
-1.114924
24.180791
-20.169492
0.212314
-10.413254
1.686279
-8.503715
-9.898465
8.649768
0.000000
-25.158898
0.212314
19
-50.905959
-3.028020
36.199019
4.971731
3.513708
-9.658649
3.118788
-13.682323
-0.395695
16.607238
-2.857143
-15.796783
0.556174
In [349]:
# Rows whose adjectiveIncidence value reaches this threshold are left out of
# the summary below. Expects dfresult to hold the percent-difference table.
ADJECTIVE_OUTLIER_THRESHOLD = 200
saida = dfresult[dfresult['adjectiveIncidence'] < ADJECTIVE_OUTLIER_THRESHOLD]

# Bar chart of the per-feature mean, with the per-feature standard deviation
# drawn as the error bar.
yerrs = saida.std()
ax = saida.mean().plot(kind='bar', yerr=yerrs)
ax.set_xlabel('feature')
ax.set_ylabel('mean percent difference (%)')
ax.set_title('Pylinguistics vs. Coh-Metrix: mean percent difference per feature');
Out[349]:
<matplotlib.axes.AxesSubplot at 0x7f54dfded310>
In [350]:
%matplotlib inline
import matplotlib.pyplot as plt
# Tabulate the per-row squared differences collected in result.
# NOTE: this rebinds dfresult, which until now held the table built from
# resultp; cells below this point see the squared-difference table.
dfresult = pd.DataFrame(data=result)
dfresult.head(n=20)
Out[350]:
ConnectiveIncidence
LexicalDiversty
adjectiveIncidence
advIncidence
avg_word_per_sentence
contentIncidence
functionalIncidence
nounIncidence
pronIncidence
redability
sentence_count
verbIncidence
word_count
0
109.264408
0.000000e+00
1214.048975
12.140490
0.000000
5353.955979
1748.230524
6992.922095
70.759234
8.689143
0
194.247836
0
1
754.470865
2.362721e-04
112.110474
131.187344
27.620864
2857.690272
3.916304
1648.907960
19.307690
15.206411
1
143.643582
9
2
1153.435386
0.000000e+00
601.637593
3.559986
1.945291
8547.525810
697.757209
6892.132431
0.721958
2.004087
1
1028.835885
0
3
2582.846822
2.622850e-07
492.931054
16.771771
38.412070
3382.722691
628.151844
2993.617669
3.219265
16.967331
169
409.158702
9
4
1632.486481
4.534685e-05
408.121620
45.346847
0.000000
555.498872
181.387387
1371.742112
45.346847
8.113866
0
181.387387
0
5
8.685133
4.276778e-04
4838.122842
0.113242
7.562500
21640.766928
17747.334521
48840.350957
1.043130
60.318460
0
15.979122
484
6
1076.878417
1.285788e-04
497.605923
4.319652
1.896661
2319.137340
54.198502
1902.053387
36.962744
0.603409
1
516.779018
1
7
525.155427
4.428659e-05
2.724082
58.512351
1.026143
3105.257289
63.132186
174.900471
28.658573
9.317337
1
1332.321829
9
8
467.616584
1.700244e-05
109.923691
23.604102
2.670581
2123.856987
0.062366
2749.580153
28.026330
1.138021
4
3.663902
9
9
128.853641
3.510268e-05
181.093852
31.701417
0.023669
1110.356494
11.949011
878.909137
55.951957
33.454055
0
132.303385
4
10
1823.746384
4.586989e-05
139.911227
216.719833
17.533253
3768.031027
5.276531
839.504571
20.814782
5.767720
9
871.258294
4
11
933.245127
7.766594e-05
72.932200
63.216055
4.304738
5151.923774
434.020209
1716.311344
2.141977
4.084413
9
824.436556
36
12
2392.231201
1.090064e-04
938.770860
76.185809
11.711605
6691.365364
1932.966706
6483.411784
1.471125
0.065785
4
537.871174
16
13
460.632834
1.591645e-04
166.599206
45.097051
0.235376
4513.759321
172.097218
4214.068913
9.194266
15.839322
1
53.471907
16
14
182.120272
1.290141e-05
410.834923
38.964208
3.819792
3875.719896
15.862491
1951.631020
0.530186
6.264198
9
842.991686
9
15
3685.673356
2.410650e-04
131.643905
14.423870
3.043299
2925.901761
19.223355
1467.472551
10.994909
0.050658
4
472.822226
16
16
723.648977
3.773386e-06
213.647061
60.497955
7.509708
3952.536097
1.683941
1309.240237
25.200672
13.737867
4
1041.129558
25
17
1135.478661
5.572101e-05
74.880888
20.720181
0.135734
3705.458120
0.155831
1783.356437
0.650312
68.646162
0
426.326067
196
18
2190.155121
2.736740e-05
133.433300
18.337836
0.002268
3537.004783
43.396324
992.582156
15.944139
12.124284
0
1096.793339
4
19
2001.115028
2.216372e-04
343.075637
0.990925
0.814544
3581.160095
127.121535
2935.623165
0.025108
12.718463
1
579.507504
25
In [351]:
# Show the Coh-Metrix column that 'redability' is compared against.
flesch_column = dic['redability']
cohmetrix[flesch_column].head(n=20)
Out[351]:
0 10.900836
1 -9.443721
2 32.465425
3 25.228313
4 16.162227
5 -43.000171
6 17.978625
7 18.919153
8 29.682436
9 24.724482
10 29.993934
11 30.246734
12 6.928000
13 36.655250
14 32.591965
15 23.725006
16 15.490058
17 26.447105
18 40.255350
19 21.474345
Name: basic_counts:flesch, dtype: float64
In [352]:
# Show the first 20 rows of the Pylinguistics table loaded above.
pyling.head(n=20)
Out[352]:
Unnamed: 0
ConnectiveAdditiveIncidence
ConnectiveCasualIncidence
ConnectiveIncidence
ConnectiveLogicIncidence
ConnectiveTemporalIncidence
ContentDiversty
LexicalDiversty
LogicAndIncidence
LogicIfIncidence
...
avg_word_per_sentence
contentIncidence
functionalIncidence
nounIncidence
pronIncidence
redability
sentence_count
syllable_count
verbIncidence
word_count
0
7
41.811847
3.484321
55.749129
38.327526
3.484321
0.819277
0.641115
38.327526
3.484321
...
41.000000
578.397213
372.822300
376.306620
47.337278
13.848571
7
656
87.108014
287
1
55
38.116592
13.452915
56.053812
31.390135
2.242152
0.824000
0.603139
31.390135
4.484305
...
49.555556
560.538117
374.439462
329.596413
61.068702
-13.343261
9
1117
103.139013
446
2
167
26.415094
3.773585
30.188679
24.528302
0.000000
0.588889
0.441509
22.641509
5.660377
...
27.894737
509.433962
373.584906
337.735849
33.112583
31.049767
19
1187
86.792453
530
3
201
46.419098
6.631300
55.702918
46.419098
0.000000
0.639810
0.461538
42.440318
1.326260
...
22.176471
559.681698
392.572944
363.395225
34.157833
21.109171
34
1829
63.660477
754
4
204
33.670034
6.734007
43.771044
30.303030
3.367003
0.818750
0.595960
30.303030
0.000000
...
29.700000
538.720539
404.040404
329.966330
60.606061
19.010712
10
701
84.175084
297
5
267
23.148148
4.629630
30.092593
23.148148
2.314815
0.541420
0.472222
23.148148
0.000000
...
54.000000
782.407407
203.703704
666.666667
5.586592
-50.766667
8
1250
37.037037
432
6
475
40.816327
4.081633
46.938776
38.775510
0.000000
0.769517
0.544898
36.734694
4.081633
...
25.789474
548.979592
400.000000
326.530612
45.045045
18.755419
19
1181
116.326531
490
7
807
39.325843
3.745318
56.179775
41.198502
3.745318
0.706714
0.511236
29.962547
3.745318
...
24.272727
529.962547
406.367041
340.823970
47.377327
15.866721
22
1315
127.340824
534
8
897
28.537455
10.701546
40.428062
26.159334
0.000000
0.711454
0.485137
26.159334
5.945303
...
26.281250
539.833532
403.091558
313.912010
81.818182
28.615655
32
1924
131.985731
841
9
1067
35.616438
13.698630
60.273973
24.657534
2.739726
0.755000
0.575342
21.917808
5.479452
...
28.076923
547.945205
372.602740
380.821918
39.351852
30.508430
13
819
87.671233
365
10
1152
30.544489
3.984064
35.856574
27.888446
0.000000
0.731884
0.543161
25.232404
2.656042
...
34.227273
549.800797
362.549801
379.814077
56.689342
27.592326
22
1660
83.665339
753
11
1157
28.921023
6.674082
46.718576
26.696329
8.898776
0.685345
0.478309
25.583982
1.112347
...
24.972222
516.129032
399.332592
332.591769
32.818533
32.267727
36
2032
106.785317
899
12
1486
37.931034
5.172414
43.103448
36.206897
1.724138
0.716612
0.503448
36.206897
6.896552
...
32.222222
529.310345
418.965517
344.827586
45.662100
6.671513
18
1436
89.655172
580
13
1568
36.739380
12.629162
55.109070
35.591274
3.444317
0.672199
0.490241
33.295063
1.148106
...
22.921053
553.386912
369.690011
299.655568
34.682081
40.635114
38
1904
150.401837
871
14
1618
33.776868
9.211873
48.106448
29.682702
0.000000
0.707317
0.499488
26.612078
4.094166
...
25.710526
545.547595
373.592631
342.886387
38.286235
35.094804
38
2167
93.142272
977
15
2185
24.013722
6.861063
37.735849
18.867925
0.000000
0.656805
0.497427
13.722127
22.298456
...
22.423077
579.759863
365.351630
295.025729
62.314540
23.499934
26
1396
147.512864
583
16
2481
37.735849
7.547170
50.314465
31.446541
3.773585
0.728774
0.506918
23.899371
5.031447
...
33.125000
533.333333
394.968553
316.981132
59.536935
19.196521
24
1842
98.113208
795
17
2537
34.591195
6.289308
48.218029
34.591195
2.096436
0.607724
0.429769
31.446541
3.144654
...
25.105263
515.723270
383.647799
335.429769
44.423440
34.732403
38
2127
100.628931
954
18
2611
37.076271
1.059322
42.372881
38.135593
0.000000
0.674948
0.463983
29.661017
4.237288
...
22.476190
511.652542
397.245763
338.983051
36.346692
43.737345
42
2034
98.516949
944
19
2807
33.185841
7.743363
43.141593
29.867257
0.000000
0.669960
0.476770
28.761062
2.212389
...
26.588235
559.734513
372.787611
341.814159
39.886040
25.040640
34
2103
128.318584
904
20 rows × 26 columns
In [353]:
# Show the first 20 rows of whatever table dfresult currently holds.
dfresult.head(n=20)
Out[353]:
ConnectiveIncidence
LexicalDiversty
adjectiveIncidence
advIncidence
avg_word_per_sentence
contentIncidence
functionalIncidence
nounIncidence
pronIncidence
redability
sentence_count
verbIncidence
word_count
0
109.264408
0.000000e+00
1214.048975
12.140490
0.000000
5353.955979
1748.230524
6992.922095
70.759234
8.689143
0
194.247836
0
1
754.470865
2.362721e-04
112.110474
131.187344
27.620864
2857.690272
3.916304
1648.907960
19.307690
15.206411
1
143.643582
9
2
1153.435386
0.000000e+00
601.637593
3.559986
1.945291
8547.525810
697.757209
6892.132431
0.721958
2.004087
1
1028.835885
0
3
2582.846822
2.622850e-07
492.931054
16.771771
38.412070
3382.722691
628.151844
2993.617669
3.219265
16.967331
169
409.158702
9
4
1632.486481
4.534685e-05
408.121620
45.346847
0.000000
555.498872
181.387387
1371.742112
45.346847
8.113866
0
181.387387
0
5
8.685133
4.276778e-04
4838.122842
0.113242
7.562500
21640.766928
17747.334521
48840.350957
1.043130
60.318460
0
15.979122
484
6
1076.878417
1.285788e-04
497.605923
4.319652
1.896661
2319.137340
54.198502
1902.053387
36.962744
0.603409
1
516.779018
1
7
525.155427
4.428659e-05
2.724082
58.512351
1.026143
3105.257289
63.132186
174.900471
28.658573
9.317337
1
1332.321829
9
8
467.616584
1.700244e-05
109.923691
23.604102
2.670581
2123.856987
0.062366
2749.580153
28.026330
1.138021
4
3.663902
9
9
128.853641
3.510268e-05
181.093852
31.701417
0.023669
1110.356494
11.949011
878.909137
55.951957
33.454055
0
132.303385
4
10
1823.746384
4.586989e-05
139.911227
216.719833
17.533253
3768.031027
5.276531
839.504571
20.814782
5.767720
9
871.258294
4
11
933.245127
7.766594e-05
72.932200
63.216055
4.304738
5151.923774
434.020209
1716.311344
2.141977
4.084413
9
824.436556
36
12
2392.231201
1.090064e-04
938.770860
76.185809
11.711605
6691.365364
1932.966706
6483.411784
1.471125
0.065785
4
537.871174
16
13
460.632834
1.591645e-04
166.599206
45.097051
0.235376
4513.759321
172.097218
4214.068913
9.194266
15.839322
1
53.471907
16
14
182.120272
1.290141e-05
410.834923
38.964208
3.819792
3875.719896
15.862491
1951.631020
0.530186
6.264198
9
842.991686
9
15
3685.673356
2.410650e-04
131.643905
14.423870
3.043299
2925.901761
19.223355
1467.472551
10.994909
0.050658
4
472.822226
16
16
723.648977
3.773386e-06
213.647061
60.497955
7.509708
3952.536097
1.683941
1309.240237
25.200672
13.737867
4
1041.129558
25
17
1135.478661
5.572101e-05
74.880888
20.720181
0.135734
3705.458120
0.155831
1783.356437
0.650312
68.646162
0
426.326067
196
18
2190.155121
2.736740e-05
133.433300
18.337836
0.002268
3537.004783
43.396324
992.582156
15.944139
12.124284
0
1096.793339
4
19
2001.115028
2.216372e-04
343.075637
0.990925
0.814544
3581.160095
127.121535
2935.623165
0.025108
12.718463
1
579.507504
25
In [354]:
# Box plot of the word_count column of dfresult.
word_count_column = dfresult['word_count']
word_count_column.plot(kind='box')
Out[354]:
<matplotlib.axes.AxesSubplot at 0x7f54dfd27090>
Content source: vwoloszyn/pylinguistics
Similar notebooks: