In [ ]:


In [1]:
#!/usr/bin/python
# -*- coding: utf-8 -*-

import os, sys
import numpy as np
import csv
import pandas as pd
sys.path.append("./pylinguistics/")
import Pylinguistics as pl
import json
import csv
import gc
from pandas import read_csv



# Location of the full feature table. Kept in one place so it can be
# repointed without touching the selection logic below.
FAPESP_FEATURES_CSV = '/home/vinicius/phd/experiments/fapesp/fapesp_features.csv'

# Index labels of the 20 rows exported for the comparison.
SELECTED_ROW_LABELS = [
    7, 55, 167, 201, 204,
    267, 475, 807, 897, 1067,
    1152, 1157, 1486, 1568, 1618,
    2185, 2481, 2537, 2611, 2807,
]

fapespall = pd.read_csv(FAPESP_FEATURES_CSV)
# 'Unnamed: 0' is a leftover column in the CSV; it is not a feature.
fapespall = fapespall.drop('Unnamed: 0', axis=1)

# Selecting with a list of labels returns the rows in the listed order, keeps
# the original index labels and preserves each column's dtype (building one
# transposed single-row frame per label forces a common dtype per row).
fapespResult = fapespall.loc[SELECTED_ROW_LABELS]

# Tab-separated export; the index (original row labels) is written as the
# first, unnamed column.
fapespResult.to_csv('pylinguisticsResults.csv', sep='\t')

In [2]:
# Both result tables are tab-separated files in the working directory.
# The comparison further down pairs their rows by index label, so the two
# files are presumably in the same row order -- TODO confirm.
pyling = pd.read_csv('pylinguisticsResults.csv', delimiter='\t')
cohmetrix = pd.read_csv('cohmetrixResults.csv', delimiter='\t')

def calcMse(a, b):
    """Return the squared difference between ``a`` and ``b``.

    Despite the name, no mean is taken here: the result is simply
    ``(a - b) * (a - b)`` for the two values passed in.
    """
    diff = a - b
    return diff * diff

def calcPmedia(a, b):
    """Return the signed percent difference of ``a`` relative to ``b``.

    Computed as ``100 * a / b - 100``: 0 when the values match, negative
    when ``a`` is below ``b``, positive when it is above. ``b`` is passed
    through ``float()`` so the division is never an integer division.
    There is no guard for ``b == 0``.
    """
    scaled = 100 * a
    reference = float(b)
    return scaled / reference - 100
    
# Maps each column name used in the `pyling` table (key) to the name of the
# column it is compared against in the `cohmetrix` table (value).
# The spellings 'redability' and 'LexicalDiversty' are the actual column
# names and must not be "corrected" here.
dic = {
    'adjectiveIncidence': 'basic_counts:adjectives',
    'advIncidence': 'basic_counts:adverbs',
    'contentIncidence': 'basic_counts:content_words',
    'redability': 'basic_counts:flesch',
    'functionalIncidence': 'basic_counts:function_words',
    'avg_word_per_sentence': 'basic_counts:words_per_sentence',
    'nounIncidence': 'basic_counts:nouns',
    'sentence_count': 'basic_counts:sentences',
    'word_count': 'basic_counts:words',
    'pronIncidence': 'basic_counts:pronouns',
    'verbIncidence': 'basic_counts:verbs',
    'ConnectiveIncidence': 'connectives:conn_incidence',
    'LexicalDiversty': 'Tokens:ttr',
    'LogicOperatorsIncidence': 'logic_operators:logic_operators',
}

# Row-by-row comparison of the two tables for every metric listed in `dic`.
# result  -> one dict per row: squared difference per metric (calcMse)
# resultp -> one dict per row: signed percent difference per metric (calcPmedia)
result = []
resultp = []
for index, row in pyling.iterrows():
    # Rows are paired by index label. Fetch the matching cohmetrix row once
    # per iteration instead of re-slicing both frames for every metric.
    reference = cohmetrix.loc[index]
    mse = {}
    pmedia = {}
    for key, column in dic.items():
        mse[key] = calcMse(row[key], reference[column])
        pmedia[key] = calcPmedia(row[key], reference[column])

    result.append(mse)
    resultp.append(pmedia)

In [3]:
# Percent differences: one row per compared row, one column per metric key.
dfresult = pd.DataFrame(data=resultp)
dfresult.head(n=20)


Out[3]:
ConnectiveIncidence LexicalDiversty LogicOperatorsIncidence adjectiveIncidence advIncidence avg_word_per_sentence contentIncidence functionalIncidence nounIncidence pronIncidence redability sentence_count verbIncidence word_count
0 -15.789474 0.000000 33.333333 66.666667 -9.090909 0.000000 -11.229947 12.631579 -18.181818 -15.088757 27.041368 0.000000 -13.793103 0.000000
1 -32.886923 -2.485189 12.571001 11.440446 -31.712444 11.863557 -8.706476 0.531322 -10.968774 -6.712293 41.292408 -10.000000 -10.410622 0.677201
2 -52.941176 0.000000 21.428571 54.166667 -11.111111 5.263158 -15.360502 7.608696 -19.730942 -2.501840 -4.360509 -5.000000 -26.984127 0.000000
3 -47.708886 -0.110840 10.669024 25.651908 -13.979986 38.787499 -9.413587 6.819667 -13.086046 -4.990621 -16.327457 -27.659574 -24.112669 0.399467
4 -48.000000 -1.117318 0.000000 23.076923 66.666667 0.000000 -4.191617 -3.225806 -10.091743 -10.000000 17.624334 0.000000 -13.793103 0.000000
5 -8.919753 4.579946 110.185185 3157.870370 5.092593 -4.845815 -15.826312 189.004630 -24.896609 -15.456238 18.061547 0.000000 12.098765 -4.845815
6 -41.145997 -2.038565 9.775510 32.082833 -11.292517 -5.069422 -8.064719 1.875000 -11.782614 -11.891892 4.320654 5.555556 -16.347539 0.204499
7 -28.972713 -1.284985 13.643660 4.173355 -27.078652 -4.006163 -9.514433 -1.917773 -3.735357 -10.152284 -16.134081 4.761905 -22.278187 0.564972
8 -34.848623 -0.842783 18.095741 17.227390 -16.963932 6.630370 -7.865479 -0.061916 -14.313269 -6.077210 -3.593980 -5.882353 1.471599 0.357995
9 -15.848261 -1.019282 21.552511 30.530822 -20.438356 0.550964 -5.732649 0.936414 -7.222580 -15.972222 23.393607 0.000000 -11.598174 0.550964
10 -54.358836 -1.231555 13.334541 25.380383 -34.549303 13.938990 -10.043486 -0.629598 -7.087827 -7.448487 -8.006978 -12.000000 -26.079213 0.266312
11 -39.536683 -1.809162 7.279199 15.888024 -32.273233 9.061217 -12.208909 5.504143 -11.076512 4.667678 6.681690 -7.692308 -21.190671 0.671892
12 -53.155498 -2.031687 30.671506 53.479624 -55.862069 11.882716 -13.385580 11.724138 -18.930331 -2.587519 -3.702174 -10.000000 -20.551724 0.694444
13 -28.029200 -2.508872 6.368609 21.309274 -17.806075 2.162406 -10.826234 3.679090 -17.806075 -8.039937 10.857557 -2.564103 -4.636516 -0.457143
14 -21.907199 -0.713972 19.631525 30.846981 -18.999488 8.227061 -10.242676 1.077562 -11.413437 -1.866334 7.679310 -7.317073 -23.764224 0.308008
15 -61.668322 -3.026838 51.335457 14.762722 -7.093454 8.436296 -8.533798 1.214638 -11.492281 -5.052319 -0.948673 -7.142857 -12.846991 0.690846
16 -34.838643 -0.381739 13.566936 19.245283 -21.188462 9.018987 -10.544940 0.329633 -10.245486 -7.776120 23.928010 -7.692308 -24.748122 0.632911
17 -41.136432 -1.707244 13.022568 16.268344 -18.603591 1.489362 -10.557219 -0.102789 -11.181977 1.848863 31.327806 0.000000 -17.025268 1.489362
18 -52.481840 -1.114924 10.875706 24.180791 -20.169492 0.212314 -10.413254 1.686279 -8.503715 -9.898465 8.649768 0.000000 -25.158898 0.212314
19 -50.905959 -3.028020 16.592920 36.199019 4.971731 3.513708 -9.658649 3.118788 -13.682323 -0.395695 16.607238 -2.857143 -15.796783 0.556174

In [4]:
# Rows whose adjectiveIncidence percent difference reaches this value are
# left out of the summary plot -- presumably to drop outliers such as the
# row at ~3158% in the table displayed above.
ADJECTIVE_INCIDENCE_LIMIT = 200

# NOTE: assumes dfresult still holds the percent differences built from
# `resultp`; the name is rebound to the squared differences in a later cell.
saida = dfresult[dfresult.adjectiveIncidence < ADJECTIVE_INCIDENCE_LIMIT]

# Per-metric standard deviation, drawn as the error bars.
yerrs = saida.std()

ax = saida.mean().plot(kind='bar', yerr=yerrs)
ax.set_title('Mean percent difference per metric (pyling vs. cohmetrix)')
ax.set_ylabel('percent difference (error bars: std)')
ax


Out[4]:
<matplotlib.axes.AxesSubplot at 0x7fb4a11437d0>

In [5]:
%matplotlib inline
import matplotlib.pyplot as plt

# NOTE: this rebinds dfresult. From here on it holds the squared differences
# collected in `result`, no longer the percent differences from `resultp`.
dfresult = pd.DataFrame(data=result)
dfresult.head(n=20)


Out[5]:
ConnectiveIncidence LexicalDiversty LogicOperatorsIncidence adjectiveIncidence advIncidence avg_word_per_sentence contentIncidence functionalIncidence nounIncidence pronIncidence redability sentence_count verbIncidence word_count
0 109.264408 0.000000e+00 109.264408 1214.048975 12.140490 0.000000 5353.955979 1748.230524 6992.922095 70.759234 8.689143 0 194.247836 0
1 754.470865 2.362721e-04 18.118191 112.110474 131.187344 27.620864 2857.690272 3.916304 1648.907960 19.307690 15.206411 1 143.643582 9
2 1153.435386 0.000000e+00 32.039872 601.637593 3.559986 1.945291 8547.525810 697.757209 6892.132431 0.721958 2.004087 1 1028.835885 0
3 2582.846822 2.622850e-07 26.156194 492.931054 16.771771 38.412070 3382.722691 628.151844 2993.617669 3.219265 16.967331 169 409.158702 9
4 1632.486481 4.534685e-05 0.000000 408.121620 45.346847 0.000000 555.498872 181.387387 1371.742112 45.346847 8.113866 0 181.387387 0
5 8.685133 4.276778e-04 147.256582 4838.122842 0.113242 7.562500 21640.766928 17747.334521 48840.350957 1.043130 60.318460 0 15.979122 484
6 1076.878417 1.285788e-04 15.985313 497.605923 4.319652 1.896661 2319.137340 54.198502 1902.053387 36.962744 0.603409 1 516.779018 1
7 525.155427 4.428659e-05 29.114632 2.724082 58.512351 1.026143 3105.257289 63.132186 174.900471 28.658573 9.317337 1 1332.321829 9
8 467.616584 1.700244e-05 33.993216 109.923691 23.604102 2.670581 2123.856987 0.062366 2749.580153 28.026330 1.138021 4 3.663902 9
9 128.853641 3.510268e-05 28.554038 181.093852 31.701417 0.023669 1110.356494 11.949011 878.909137 55.951957 33.454055 0 132.303385 4
10 1823.746384 4.586989e-05 15.258843 139.911227 216.719833 17.533253 3768.031027 5.276531 839.504571 20.814782 5.767720 9 871.258294 4
11 933.245127 7.766594e-05 4.152831 72.932200 63.216055 4.304738 5151.923774 434.020209 1716.311344 2.141977 4.084413 9 824.436556 36
12 2392.231201 1.090064e-04 102.360511 938.770860 76.185809 11.711605 6691.365364 1932.966706 6483.411784 1.471125 0.065785 4 537.871174 16
13 460.632834 1.591645e-04 6.123940 166.599206 45.097051 0.235376 4513.759321 172.097218 4214.068913 9.194266 15.839322 1 53.471907 16
14 182.120272 1.290141e-05 49.765257 410.834923 38.964208 3.819792 3875.719896 15.862491 1951.631020 0.530186 6.264198 9 842.991686 9
15 3685.673356 2.410650e-04 346.670053 131.643905 14.423870 3.043299 2925.901761 19.223355 1467.472551 10.994909 0.050658 4 472.822226 16
16 723.648977 3.773386e-06 23.122003 213.647061 60.497955 7.509708 3952.536097 1.683941 1309.240237 25.200672 13.737867 4 1041.129558 25
17 1135.478661 5.572101e-05 22.186835 74.880888 20.720181 0.135734 3705.458120 0.155831 1783.356437 0.650312 68.646162 0 426.326067 196
18 2190.155121 2.736740e-05 17.275003 133.433300 18.337836 0.002268 3537.004783 43.396324 992.582156 15.944139 12.124284 0 1096.793339 4
19 2001.115028 2.216372e-04 28.649845 343.075637 0.990925 0.814544 3581.160095 127.121535 2935.623165 0.025108 12.718463 1 579.507504 25

In [6]:
# First 20 values of the cohmetrix column paired with 'redability'
# (i.e. 'basic_counts:flesch').
cohmetrix.loc[:, dic['redability']].head(n=20)


Out[6]:
0     10.900836
1     -9.443721
2     32.465425
3     25.228313
4     16.162227
5    -43.000171
6     17.978625
7     18.919153
8     29.682436
9     24.724482
10    29.993934
11    30.246734
12     6.928000
13    36.655250
14    32.591965
15    23.725006
16    15.490058
17    26.447105
18    40.255350
19    21.474345
Name: basic_counts:flesch, dtype: float64

In [7]:
# Inspect the first 20 rows of the pyling table as loaded from disk.
pyling.head(n=20)


Out[7]:
Unnamed: 0 ConnectiveAdditiveIncidence ConnectiveCasualIncidence ConnectiveIncidence ConnectiveLogicIncidence ConnectiveTemporalIncidence ContentDiversty LexicalDiversty LogicAndIncidence LogicIfIncidence ... avg_word_per_sentence contentIncidence functionalIncidence nounIncidence pronIncidence redability sentence_count syllable_count verbIncidence word_count
0 7 41.811847 3.484321 55.749129 38.327526 3.484321 0.819277 0.641115 38.327526 3.484321 ... 41.000000 578.397213 372.822300 376.306620 47.337278 13.848571 7 656 87.108014 287
1 55 38.116592 13.452915 56.053812 31.390135 2.242152 0.824000 0.603139 31.390135 4.484305 ... 49.555556 560.538117 374.439462 329.596413 61.068702 -13.343261 9 1117 103.139013 446
2 167 26.415094 3.773585 30.188679 24.528302 0.000000 0.588889 0.441509 22.641509 5.660377 ... 27.894737 509.433962 373.584906 337.735849 33.112583 31.049767 19 1187 86.792453 530
3 201 46.419098 6.631300 55.702918 46.419098 0.000000 0.639810 0.461538 42.440318 1.326260 ... 22.176471 559.681698 392.572944 363.395225 34.157833 21.109171 34 1829 63.660477 754
4 204 33.670034 6.734007 43.771044 30.303030 3.367003 0.818750 0.595960 30.303030 0.000000 ... 29.700000 538.720539 404.040404 329.966330 60.606061 19.010712 10 701 84.175084 297
5 267 23.148148 4.629630 30.092593 23.148148 2.314815 0.541420 0.472222 23.148148 0.000000 ... 54.000000 782.407407 203.703704 666.666667 5.586592 -50.766667 8 1250 37.037037 432
6 475 40.816327 4.081633 46.938776 38.775510 0.000000 0.769517 0.544898 36.734694 4.081633 ... 25.789474 548.979592 400.000000 326.530612 45.045045 18.755419 19 1181 116.326531 490
7 807 39.325843 3.745318 56.179775 41.198502 3.745318 0.706714 0.511236 29.962547 3.745318 ... 24.272727 529.962547 406.367041 340.823970 47.377327 15.866721 22 1315 127.340824 534
8 897 28.537455 10.701546 40.428062 26.159334 0.000000 0.711454 0.485137 26.159334 5.945303 ... 26.281250 539.833532 403.091558 313.912010 81.818182 28.615655 32 1924 131.985731 841
9 1067 35.616438 13.698630 60.273973 24.657534 2.739726 0.755000 0.575342 21.917808 5.479452 ... 28.076923 547.945205 372.602740 380.821918 39.351852 30.508430 13 819 87.671233 365
10 1152 30.544489 3.984064 35.856574 27.888446 0.000000 0.731884 0.543161 25.232404 2.656042 ... 34.227273 549.800797 362.549801 379.814077 56.689342 27.592326 22 1660 83.665339 753
11 1157 28.921023 6.674082 46.718576 26.696329 8.898776 0.685345 0.478309 25.583982 1.112347 ... 24.972222 516.129032 399.332592 332.591769 32.818533 32.267727 36 2032 106.785317 899
12 1486 37.931034 5.172414 43.103448 36.206897 1.724138 0.716612 0.503448 36.206897 6.896552 ... 32.222222 529.310345 418.965517 344.827586 45.662100 6.671513 18 1436 89.655172 580
13 1568 36.739380 12.629162 55.109070 35.591274 3.444317 0.672199 0.490241 33.295063 1.148106 ... 22.921053 553.386912 369.690011 299.655568 34.682081 40.635114 38 1904 150.401837 871
14 1618 33.776868 9.211873 48.106448 29.682702 0.000000 0.707317 0.499488 26.612078 4.094166 ... 25.710526 545.547595 373.592631 342.886387 38.286235 35.094804 38 2167 93.142272 977
15 2185 24.013722 6.861063 37.735849 18.867925 0.000000 0.656805 0.497427 13.722127 22.298456 ... 22.423077 579.759863 365.351630 295.025729 62.314540 23.499934 26 1396 147.512864 583
16 2481 37.735849 7.547170 50.314465 31.446541 3.773585 0.728774 0.506918 23.899371 5.031447 ... 33.125000 533.333333 394.968553 316.981132 59.536935 19.196521 24 1842 98.113208 795
17 2537 34.591195 6.289308 48.218029 34.591195 2.096436 0.607724 0.429769 31.446541 3.144654 ... 25.105263 515.723270 383.647799 335.429769 44.423440 34.732403 38 2127 100.628931 954
18 2611 37.076271 1.059322 42.372881 38.135593 0.000000 0.674948 0.463983 29.661017 4.237288 ... 22.476190 511.652542 397.245763 338.983051 36.346692 43.737345 42 2034 98.516949 944
19 2807 33.185841 7.743363 43.141593 29.867257 0.000000 0.669960 0.476770 28.761062 2.212389 ... 26.588235 559.734513 372.787611 341.814159 39.886040 25.040640 34 2103 128.318584 904

20 rows × 26 columns


In [8]:
# Show the first 20 rows of whatever dfresult is currently bound to.
dfresult.head(n=20)


Out[8]:
ConnectiveIncidence LexicalDiversty LogicOperatorsIncidence adjectiveIncidence advIncidence avg_word_per_sentence contentIncidence functionalIncidence nounIncidence pronIncidence redability sentence_count verbIncidence word_count
0 109.264408 0.000000e+00 109.264408 1214.048975 12.140490 0.000000 5353.955979 1748.230524 6992.922095 70.759234 8.689143 0 194.247836 0
1 754.470865 2.362721e-04 18.118191 112.110474 131.187344 27.620864 2857.690272 3.916304 1648.907960 19.307690 15.206411 1 143.643582 9
2 1153.435386 0.000000e+00 32.039872 601.637593 3.559986 1.945291 8547.525810 697.757209 6892.132431 0.721958 2.004087 1 1028.835885 0
3 2582.846822 2.622850e-07 26.156194 492.931054 16.771771 38.412070 3382.722691 628.151844 2993.617669 3.219265 16.967331 169 409.158702 9
4 1632.486481 4.534685e-05 0.000000 408.121620 45.346847 0.000000 555.498872 181.387387 1371.742112 45.346847 8.113866 0 181.387387 0
5 8.685133 4.276778e-04 147.256582 4838.122842 0.113242 7.562500 21640.766928 17747.334521 48840.350957 1.043130 60.318460 0 15.979122 484
6 1076.878417 1.285788e-04 15.985313 497.605923 4.319652 1.896661 2319.137340 54.198502 1902.053387 36.962744 0.603409 1 516.779018 1
7 525.155427 4.428659e-05 29.114632 2.724082 58.512351 1.026143 3105.257289 63.132186 174.900471 28.658573 9.317337 1 1332.321829 9
8 467.616584 1.700244e-05 33.993216 109.923691 23.604102 2.670581 2123.856987 0.062366 2749.580153 28.026330 1.138021 4 3.663902 9
9 128.853641 3.510268e-05 28.554038 181.093852 31.701417 0.023669 1110.356494 11.949011 878.909137 55.951957 33.454055 0 132.303385 4
10 1823.746384 4.586989e-05 15.258843 139.911227 216.719833 17.533253 3768.031027 5.276531 839.504571 20.814782 5.767720 9 871.258294 4
11 933.245127 7.766594e-05 4.152831 72.932200 63.216055 4.304738 5151.923774 434.020209 1716.311344 2.141977 4.084413 9 824.436556 36
12 2392.231201 1.090064e-04 102.360511 938.770860 76.185809 11.711605 6691.365364 1932.966706 6483.411784 1.471125 0.065785 4 537.871174 16
13 460.632834 1.591645e-04 6.123940 166.599206 45.097051 0.235376 4513.759321 172.097218 4214.068913 9.194266 15.839322 1 53.471907 16
14 182.120272 1.290141e-05 49.765257 410.834923 38.964208 3.819792 3875.719896 15.862491 1951.631020 0.530186 6.264198 9 842.991686 9
15 3685.673356 2.410650e-04 346.670053 131.643905 14.423870 3.043299 2925.901761 19.223355 1467.472551 10.994909 0.050658 4 472.822226 16
16 723.648977 3.773386e-06 23.122003 213.647061 60.497955 7.509708 3952.536097 1.683941 1309.240237 25.200672 13.737867 4 1041.129558 25
17 1135.478661 5.572101e-05 22.186835 74.880888 20.720181 0.135734 3705.458120 0.155831 1783.356437 0.650312 68.646162 0 426.326067 196
18 2190.155121 2.736740e-05 17.275003 133.433300 18.337836 0.002268 3537.004783 43.396324 992.582156 15.944139 12.124284 0 1096.793339 4
19 2001.115028 2.216372e-04 28.649845 343.075637 0.990925 0.814544 3581.160095 127.121535 2935.623165 0.025108 12.718463 1 579.507504 25

In [9]:
# Distribution of the word_count column across rows.
# NOTE: assumes dfresult is the frame built from `result` (squared
# differences), as rebound in an earlier cell -- confirm before reading the
# scale of this plot.
ax = dfresult.word_count.plot(kind='box')
ax.set_title('Squared difference in word_count (pyling vs. cohmetrix)')
ax.set_ylabel('squared difference')
ax


Out[9]:
<matplotlib.axes.AxesSubplot at 0x7fb4cc1e0e10>