In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.mlab as mlab

In [4]:
# Load the raw candidate features; the file is comma-separated, which is
# read_csv's default delimiter.
dfraw = pd.read_csv("features.csv")
dfraw


Out[4]:
article offset alias entity link_score entity_score entity_score_rank entity_score_dtop entity_score_dsuc cosine_sim cosine_sim_rank cosine_sim_dtop cosine_sim_dsuc correct
0 Albert Wenk 174 Artesis Hogeschool Antwerpen Artesis Hogeschool Antwerpen 1.000000 1.000000 1 Infinity Infinity 0.027930 1 Infinity Infinity True
1 Logan Ramsey 1408 Anne Ramsey Anne Ramsey 1.000000 1.000000 1 Infinity 0.44082840236686394 0.266522 1 Infinity 1.2665216650036508 True
2 Spektralmethode 171 Ansatzfunktionen Ansatz (Mathematik) 1.000000 0.333333 2 0.3333333333333333 Infinity 0.181334 2 0.06183150029798601 Infinity False
3 Spektralmethode 171 Ansatzfunktionen Ansatzfunktion 1.000000 0.666667 1 Infinity 0.3333333333333333 0.243165 1 Infinity 0.06183150029798601 True
4 Élencourt 138 Arrondissement Beauvais Arrondissement Beauvais 1.000000 1.000000 1 Infinity Infinity 0.301277 1 Infinity Infinity True
5 Victor Eftimiu 74 Albanien Abgeordneter 0.690233 0.000310 21 0.8176178660049628 Infinity 0.021549 25 0.062188452380636604 7.688920440697689E-4 False
6 Victor Eftimiu 74 Albanien Adel 0.690233 0.000620 14 0.8173076923076923 3.1017369727047146E-4 0.042905 8 0.040833356179805165 8.115629768289251E-4 False
7 Victor Eftimiu 74 Albanien Albania 0.690233 0.000310 21 0.8176178660049628 Infinity 0.021708 24 0.062029720876732426 1.5873150390417756E-4 False
8 Victor Eftimiu 74 Albanien Albania (Begriffsklärung) 0.690233 0.000310 21 0.8176178660049628 Infinity 0.010401 52 0.07333645051479792 2.3759949270757072E-4 False
9 Victor Eftimiu 74 Albanien Albanien 0.690233 0.817928 1 Infinity 0.7127791563275434 0.042093 9 0.04164491915663409 0.006297353533244558 True
10 Victor Eftimiu 74 Albanien Albanien beim Eurovision Song Contest 0.690233 0.001551 10 0.8163771712158809 6.20347394540943E-4 0.017750 30 0.06598805585270895 1.859480380340152E-4 False
11 Victor Eftimiu 74 Albanien Albanien im Mittelalter 0.690233 0.000310 21 0.8176178660049628 Infinity 0.048738 5 0.035000069318407157 0.00257018576116215 False
12 Victor Eftimiu 74 Albanien Albanien und die Europäische Union 0.690233 0.000310 21 0.8176178660049628 Infinity 0.023830 20 0.059908392912543226 0.0012089604671199211 False
13 Victor Eftimiu 74 Albanien Albanische Basketballnationalmannschaft 0.690233 0.000931 12 0.8169975186104218 3.1017369727047146E-4 0.008832 57 0.07490604260713715 3.241390543284057E-4 False
14 Victor Eftimiu 74 Albanien Albanische Fußballnationalmannschaft 0.690233 0.105149 2 0.7127791563275434 0.08839950372208435 0.018334 28 0.0654035387286466 4.627369311335887E-4 False
15 Victor Eftimiu 74 Albanien Albanische Fußballnationalmannschaft (U-21-Män... 0.690233 0.005273 7 0.8126550868486353 3.101736972704718E-4 0.013050 45 0.07068793601226803 5.816615723109406E-4 False
16 Victor Eftimiu 74 Albanien Albanische Fußballnationalmannschaft der Frauen 0.690233 0.005583 6 0.8123449131513648 3.101736972704709E-4 0.006187 64 0.07755136764625414 7.005092154502152E-4 False
17 Victor Eftimiu 74 Albanien Albanische Küche 0.690233 0.000310 21 0.8176178660049628 Infinity 0.020781 26 0.06295734442470638 0.0022057811059381614 False
18 Victor Eftimiu 74 Albanien Albanische Luftstreitkräfte 0.690233 0.000310 21 0.8176178660049628 Infinity 0.012468 46 0.07126959758457897 6.367698125467718E-4 False
19 Victor Eftimiu 74 Albanien Albanische Sprache 0.690233 0.001551 10 0.8163771712158809 6.20347394540943E-4 0.028659 14 0.05507846712369462 2.095543805849115E-4 False
20 Victor Eftimiu 74 Albanien Albanische Streitkräfte 0.690233 0.000310 21 0.8176178660049628 Infinity 0.014950 39 0.06878755322423608 1.196445324029051E-4 False
21 Victor Eftimiu 74 Albanien Albanische Volleyballnationalmannschaft der Fr... 0.690233 0.000310 21 0.8176178660049628 Infinity 0.004611 68 0.07912667919437812 0.004611242542187034 False
22 Victor Eftimiu 74 Albanien Albanisches Heer 0.690233 0.000620 14 0.8173076923076923 3.1017369727047146E-4 0.010517 51 0.07322095762279718 1.1549289200075107E-4 False
23 Victor Eftimiu 74 Albanien Arbeitsministerium 0.690233 0.000310 21 0.8176178660049628 Infinity 0.007345 61 0.07639332280492074 6.709663625261804E-4 False
24 Victor Eftimiu 74 Albanien Autobahn 0.690233 0.000310 21 0.8176178660049628 Infinity 0.025260 18 0.05847752842615518 2.446038215260897E-4 False
25 Victor Eftimiu 74 Albanien Außenminister 0.690233 0.000310 21 0.8176178660049628 Infinity 0.013065 44 0.07067334429995525 1.4591712312772565E-5 False
26 Victor Eftimiu 74 Albanien Bildungsministerium 0.690233 0.000310 21 0.8176178660049628 Infinity 0.006603 63 0.07713489228035474 4.164753658994066E-4 False
27 Victor Eftimiu 74 Albanien Botschafter der Vereinigten Staaten 0.690233 0.000310 21 0.8176178660049628 Infinity 0.009494 56 0.07424419468341709 6.61847923720071E-4 False
28 Victor Eftimiu 74 Albanien Bunker in Albanien 0.690233 0.000310 21 0.8176178660049628 Infinity 0.017564 31 0.06617400389074296 4.863493127612653E-4 False
29 Victor Eftimiu 74 Albanien Ehrenbürger 0.690233 0.000310 21 0.8176178660049628 Infinity 0.015137 38 0.06860082720994617 1.8672601428990597E-4 False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
99970 Poi 5595 Australien Skilanglauf 0.361357 0.000052 76 0.8897001708870592 Infinity 0.030089 6 0.09699993073495518 3.364883882109697E-4 False
99971 Poi 5595 Australien Skulpturengarten 0.361357 0.000052 76 0.8897001708870592 Infinity 0.009880 126 0.11720947816076607 1.3165887229213066E-4 False
99972 Poi 5595 Australien Snowboard 0.361357 0.000052 76 0.8897001708870592 Infinity 0.022825 24 0.10426386982836983 5.013551672455874E-5 False
99973 Poi 5595 Australien Socialist Equality Party (Australien) 0.361357 0.000052 76 0.8897001708870592 Infinity 0.002339 227 0.12475045053270237 1.0999596407621198E-4 False
99974 Poi 5595 Australien Speerwurf 0.361357 0.000052 76 0.8897001708870592 Infinity 0.016442 59 0.11064735815033228 2.262556929298748E-4 False
99975 Poi 5595 Australien Sportschießen 0.361357 0.000052 76 0.8897001708870592 Infinity 0.024457 18 0.10263262052901435 4.115742396461723E-4 False
99976 Poi 5595 Australien Staaten und Territorien Australiens 0.361357 0.000104 55 0.8896483869297291 5.178395733001916E-5 0.017975 48 0.10911444525912233 5.530440016748156E-5 False
99977 Poi 5595 Australien Staatsanwalt 0.361357 0.000052 76 0.8897001708870592 Infinity 0.012908 96 0.11418138078906255 2.7090196738142544E-4 False
99978 Poi 5595 Australien Staatswappen 0.361357 0.000052 76 0.8897001708870592 Infinity 0.012637 97 0.11445228275644398 1.300597421058109E-4 False
99979 Poi 5595 Australien Stabhochsprung 0.361357 0.000052 76 0.8897001708870592 Infinity 0.015831 62 0.11125806966224214 8.18188513813066E-5 False
99980 Poi 5595 Australien Stadtautobahn 0.361357 0.000052 76 0.8897001708870592 Infinity 0.007762 163 0.11932685393693625 1.6910090196933444E-5 False
99981 Poi 5595 Australien Straßenbrücke 0.361357 0.000052 76 0.8897001708870592 Infinity 0.013153 92 0.11393587943974878 1.0225289407302801E-5 False
99982 Poi 5595 Australien Streik 0.361357 0.000052 76 0.8897001708870592 Infinity 0.014374 79 0.1127148483059093 1.6470953838542725E-4 False
99983 Poi 5595 Australien Sträflingskolonie Australien 0.361357 0.000052 76 0.8897001708870592 Infinity 0.019449 42 0.10763987892692117 3.212116600587525E-5 False
99984 Poi 5595 Australien Surfen 0.361357 0.000052 76 0.8897001708870592 Infinity 0.008331 147 0.11875833117780274 1.136414581218853E-6 False
99985 Poi 5595 Australien Tennis 0.361357 0.000052 76 0.8897001708870592 Infinity 0.021951 31 0.1051386842330542 1.5784039993157334E-4 False
99986 Poi 5595 Australien Tennis Australia 0.361357 0.000052 76 0.8897001708870592 Infinity 0.011284 112 0.115804938623654 6.74437800435336E-5 False
99987 Poi 5595 Australien Triathlon 0.361357 0.000052 76 0.8897001708870592 Infinity 0.020084 39 0.10700572185681456 4.199613273206948E-4 False
99988 Poi 5595 Australien Uniting Church in Australia 0.361357 0.000052 76 0.8897001708870592 Infinity 0.002991 225 0.12409877074149459 2.8853217209194506E-4 False
99989 Poi 5595 Australien Uranabbau in Australien 0.361357 0.000052 76 0.8897001708870592 Infinity 0.022842 23 0.10424731714061378 1.655268775603569E-5 False
99990 Poi 5595 Australien Verkehrsministerium 0.361357 0.000052 76 0.8897001708870592 Infinity 0.005468 205 0.12162148295008021 2.340843574462721E-4 False
99991 Poi 5595 Australien Verteidigungsministerium 0.361357 0.000052 76 0.8897001708870592 Infinity 0.019262 44 0.10782737542036844 1.7756460585487133E-4 False
99992 Poi 5595 Australien Victoria (Australien) 0.361357 0.000155 50 0.8895966029723992 5.178395733001915E-5 0.021498 33 0.10559129812603801 5.362677300902244E-5 False
99993 Poi 5595 Australien Victorianischer Goldrausch 0.361357 0.000052 76 0.8897001708870592 Infinity 0.017750 51 0.10933941529740171 6.461619302169583E-5 False
99994 Poi 5595 Australien Vizeadmiral 0.361357 0.000052 76 0.8897001708870592 Infinity 0.007805 161 0.11928433118220252 2.2331633532672937E-5 False
99995 Poi 5595 Australien Volleyball 0.361357 0.000052 76 0.8897001708870592 Infinity 0.021793 32 0.10529652463298576 2.9477349305225E-4 False
99996 Poi 5595 Australien Vulkan 0.361357 0.000052 76 0.8897001708870592 Infinity 0.016763 56 0.11032599839505572 1.6833581562428201E-4 False
99997 Poi 5595 Australien Vulkanismus 0.361357 0.000052 76 0.8897001708870592 Infinity 0.009748 127 0.11734113703305821 2.5103621446075433E-5 False
99998 Poi 5595 Australien Wahlkreis 0.361357 0.000052 76 0.8897001708870592 Infinity 0.016487 58 0.11060195935586797 4.539879446431236E-5 False
99999 Poi 5595 Australien Wasserball 0.361357 0.000052 76 0.8897001708870592 Infinity 0.029753 7 0.09733641912316615 1.2739047771093484E-4 False

100000 rows × 14 columns


In [4]:
# Distance features that are cast to float explicitly so that infinite
# entries compare equal to float("inf").
columns = ["entity_score_dtop", "entity_score_dsuc", "cosine_sim_dtop", "cosine_sim_dsuc"]
dfraw[columns] = dfraw[columns].astype(float)

# Finite stand-in for infinite values. It is negative so that it lands in the
# extra bin the *_dtop / *_dsuc histograms reserve to the left of zero.
inf_replacement = -0.04

# Vectorised replacement: same result as mapping a Python lambda over every
# cell with applymap, without the per-cell overhead (and without relying on
# applymap, which recent pandas releases deprecate).
df = dfraw.replace(float("inf"), inf_replacement)
df


Out[4]:
article offset alias entity link_score entity_score entity_score_rank entity_score_dtop entity_score_dsuc cosine_sim cosine_sim_rank cosine_sim_dtop cosine_sim_dsuc correct
0 Albert Wenk 174 Artesis Hogeschool Antwerpen Artesis Hogeschool Antwerpen 1.000000 1.000000 1 -0.040000 -0.040000 0.027930 1 -0.040000 -0.040000 True
1 Logan Ramsey 1408 Anne Ramsey Anne Ramsey 1.000000 1.000000 1 -0.040000 0.440828 0.266522 1 -0.040000 1.266522 True
2 Spektralmethode 171 Ansatzfunktionen Ansatz (Mathematik) 1.000000 0.333333 2 0.333333 -0.040000 0.181334 2 0.061832 -0.040000 False
3 Spektralmethode 171 Ansatzfunktionen Ansatzfunktion 1.000000 0.666667 1 -0.040000 0.333333 0.243165 1 -0.040000 0.061832 True
4 Élencourt 138 Arrondissement Beauvais Arrondissement Beauvais 1.000000 1.000000 1 -0.040000 -0.040000 0.301277 1 -0.040000 -0.040000 True
5 Victor Eftimiu 74 Albanien Abgeordneter 0.690233 0.000310 21 0.817618 -0.040000 0.021549 25 0.062188 0.000769 False
6 Victor Eftimiu 74 Albanien Adel 0.690233 0.000620 14 0.817308 0.000310 0.042905 8 0.040833 0.000812 False
7 Victor Eftimiu 74 Albanien Albania 0.690233 0.000310 21 0.817618 -0.040000 0.021708 24 0.062030 0.000159 False
8 Victor Eftimiu 74 Albanien Albania (Begriffsklärung) 0.690233 0.000310 21 0.817618 -0.040000 0.010401 52 0.073336 0.000238 False
9 Victor Eftimiu 74 Albanien Albanien 0.690233 0.817928 1 -0.040000 0.712779 0.042093 9 0.041645 0.006297 True
10 Victor Eftimiu 74 Albanien Albanien beim Eurovision Song Contest 0.690233 0.001551 10 0.816377 0.000620 0.017750 30 0.065988 0.000186 False
11 Victor Eftimiu 74 Albanien Albanien im Mittelalter 0.690233 0.000310 21 0.817618 -0.040000 0.048738 5 0.035000 0.002570 False
12 Victor Eftimiu 74 Albanien Albanien und die Europäische Union 0.690233 0.000310 21 0.817618 -0.040000 0.023830 20 0.059908 0.001209 False
13 Victor Eftimiu 74 Albanien Albanische Basketballnationalmannschaft 0.690233 0.000931 12 0.816998 0.000310 0.008832 57 0.074906 0.000324 False
14 Victor Eftimiu 74 Albanien Albanische Fußballnationalmannschaft 0.690233 0.105149 2 0.712779 0.088400 0.018334 28 0.065404 0.000463 False
15 Victor Eftimiu 74 Albanien Albanische Fußballnationalmannschaft (U-21-Män... 0.690233 0.005273 7 0.812655 0.000310 0.013050 45 0.070688 0.000582 False
16 Victor Eftimiu 74 Albanien Albanische Fußballnationalmannschaft der Frauen 0.690233 0.005583 6 0.812345 0.000310 0.006187 64 0.077551 0.000701 False
17 Victor Eftimiu 74 Albanien Albanische Küche 0.690233 0.000310 21 0.817618 -0.040000 0.020781 26 0.062957 0.002206 False
18 Victor Eftimiu 74 Albanien Albanische Luftstreitkräfte 0.690233 0.000310 21 0.817618 -0.040000 0.012468 46 0.071270 0.000637 False
19 Victor Eftimiu 74 Albanien Albanische Sprache 0.690233 0.001551 10 0.816377 0.000620 0.028659 14 0.055078 0.000210 False
20 Victor Eftimiu 74 Albanien Albanische Streitkräfte 0.690233 0.000310 21 0.817618 -0.040000 0.014950 39 0.068788 0.000120 False
21 Victor Eftimiu 74 Albanien Albanische Volleyballnationalmannschaft der Fr... 0.690233 0.000310 21 0.817618 -0.040000 0.004611 68 0.079127 0.004611 False
22 Victor Eftimiu 74 Albanien Albanisches Heer 0.690233 0.000620 14 0.817308 0.000310 0.010517 51 0.073221 0.000115 False
23 Victor Eftimiu 74 Albanien Arbeitsministerium 0.690233 0.000310 21 0.817618 -0.040000 0.007345 61 0.076393 0.000671 False
24 Victor Eftimiu 74 Albanien Autobahn 0.690233 0.000310 21 0.817618 -0.040000 0.025260 18 0.058478 0.000245 False
25 Victor Eftimiu 74 Albanien Außenminister 0.690233 0.000310 21 0.817618 -0.040000 0.013065 44 0.070673 0.000015 False
26 Victor Eftimiu 74 Albanien Bildungsministerium 0.690233 0.000310 21 0.817618 -0.040000 0.006603 63 0.077135 0.000416 False
27 Victor Eftimiu 74 Albanien Botschafter der Vereinigten Staaten 0.690233 0.000310 21 0.817618 -0.040000 0.009494 56 0.074244 0.000662 False
28 Victor Eftimiu 74 Albanien Bunker in Albanien 0.690233 0.000310 21 0.817618 -0.040000 0.017564 31 0.066174 0.000486 False
29 Victor Eftimiu 74 Albanien Ehrenbürger 0.690233 0.000310 21 0.817618 -0.040000 0.015137 38 0.068601 0.000187 False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
99970 Poi 5595 Australien Skilanglauf 0.361357 0.000052 76 0.889700 -0.040000 0.030089 6 0.097000 0.000336 False
99971 Poi 5595 Australien Skulpturengarten 0.361357 0.000052 76 0.889700 -0.040000 0.009880 126 0.117209 0.000132 False
99972 Poi 5595 Australien Snowboard 0.361357 0.000052 76 0.889700 -0.040000 0.022825 24 0.104264 0.000050 False
99973 Poi 5595 Australien Socialist Equality Party (Australien) 0.361357 0.000052 76 0.889700 -0.040000 0.002339 227 0.124750 0.000110 False
99974 Poi 5595 Australien Speerwurf 0.361357 0.000052 76 0.889700 -0.040000 0.016442 59 0.110647 0.000226 False
99975 Poi 5595 Australien Sportschießen 0.361357 0.000052 76 0.889700 -0.040000 0.024457 18 0.102633 0.000412 False
99976 Poi 5595 Australien Staaten und Territorien Australiens 0.361357 0.000104 55 0.889648 0.000052 0.017975 48 0.109114 0.000055 False
99977 Poi 5595 Australien Staatsanwalt 0.361357 0.000052 76 0.889700 -0.040000 0.012908 96 0.114181 0.000271 False
99978 Poi 5595 Australien Staatswappen 0.361357 0.000052 76 0.889700 -0.040000 0.012637 97 0.114452 0.000130 False
99979 Poi 5595 Australien Stabhochsprung 0.361357 0.000052 76 0.889700 -0.040000 0.015831 62 0.111258 0.000082 False
99980 Poi 5595 Australien Stadtautobahn 0.361357 0.000052 76 0.889700 -0.040000 0.007762 163 0.119327 0.000017 False
99981 Poi 5595 Australien Straßenbrücke 0.361357 0.000052 76 0.889700 -0.040000 0.013153 92 0.113936 0.000010 False
99982 Poi 5595 Australien Streik 0.361357 0.000052 76 0.889700 -0.040000 0.014374 79 0.112715 0.000165 False
99983 Poi 5595 Australien Sträflingskolonie Australien 0.361357 0.000052 76 0.889700 -0.040000 0.019449 42 0.107640 0.000032 False
99984 Poi 5595 Australien Surfen 0.361357 0.000052 76 0.889700 -0.040000 0.008331 147 0.118758 0.000001 False
99985 Poi 5595 Australien Tennis 0.361357 0.000052 76 0.889700 -0.040000 0.021951 31 0.105139 0.000158 False
99986 Poi 5595 Australien Tennis Australia 0.361357 0.000052 76 0.889700 -0.040000 0.011284 112 0.115805 0.000067 False
99987 Poi 5595 Australien Triathlon 0.361357 0.000052 76 0.889700 -0.040000 0.020084 39 0.107006 0.000420 False
99988 Poi 5595 Australien Uniting Church in Australia 0.361357 0.000052 76 0.889700 -0.040000 0.002991 225 0.124099 0.000289 False
99989 Poi 5595 Australien Uranabbau in Australien 0.361357 0.000052 76 0.889700 -0.040000 0.022842 23 0.104247 0.000017 False
99990 Poi 5595 Australien Verkehrsministerium 0.361357 0.000052 76 0.889700 -0.040000 0.005468 205 0.121621 0.000234 False
99991 Poi 5595 Australien Verteidigungsministerium 0.361357 0.000052 76 0.889700 -0.040000 0.019262 44 0.107827 0.000178 False
99992 Poi 5595 Australien Victoria (Australien) 0.361357 0.000155 50 0.889597 0.000052 0.021498 33 0.105591 0.000054 False
99993 Poi 5595 Australien Victorianischer Goldrausch 0.361357 0.000052 76 0.889700 -0.040000 0.017750 51 0.109339 0.000065 False
99994 Poi 5595 Australien Vizeadmiral 0.361357 0.000052 76 0.889700 -0.040000 0.007805 161 0.119284 0.000022 False
99995 Poi 5595 Australien Volleyball 0.361357 0.000052 76 0.889700 -0.040000 0.021793 32 0.105297 0.000295 False
99996 Poi 5595 Australien Vulkan 0.361357 0.000052 76 0.889700 -0.040000 0.016763 56 0.110326 0.000168 False
99997 Poi 5595 Australien Vulkanismus 0.361357 0.000052 76 0.889700 -0.040000 0.009748 127 0.117341 0.000025 False
99998 Poi 5595 Australien Wahlkreis 0.361357 0.000052 76 0.889700 -0.040000 0.016487 58 0.110602 0.000045 False
99999 Poi 5595 Australien Wasserball 0.361357 0.000052 76 0.889700 -0.040000 0.029753 7 0.097336 0.000127 False

100000 rows × 14 columns


In [23]:
# Legend text, colour and opacity for the two classes
# (p_* = valid links, n_* = invalid links).
p_label, n_label = "Valid links", "Invalid links"
p_color, n_color = "green", "blue"
p_alpha, n_alpha = 0.8, 0.5

# Axis-label building blocks, written as matplotlib mathtext.
relative_frequency = r"$f$"
ls, es, cs = r"$ls$", r"$es$", r"$cs$"
rank = r"$r$"
dtop = r"$\Delta top$"
dsuc = r"$\Delta succ$"
es_index, cs_index = r"$_{es}$", r"$_{cs}$"

# Derived labels: base symbol followed by the subscript of the score it
# was computed from.
es_rank, es_dtop, es_dsuc = [base + es_index for base in (rank, dtop, dsuc)]
cs_rank, cs_dtop, cs_dsuc = [base + cs_index for base in (rank, dtop, dsuc)]

# https://stackoverflow.com/questions/12444716/how-do-i-set-the-figure-title-and-axes-labels-font-size-in-matplotlib
params = dict.fromkeys(("axes.labelsize", "axes.titlesize"), "x-large")
plt.rcParams.update(params)

In [6]:
# Class balance of the data set: (number of valid links, number of invalid links).
# .values replaces DataFrame.as_matrix(), which pandas 1.0 removed; summing the
# boolean column counts the True rows without a Python-level pass over all rows.
x = df[["correct"]].values
links = int(x.sum())
no_links = len(x) - links
(links, no_links)


Out[6]:
(8357, 91643)

In [28]:
# Regenerate every figure in one go, in the same order as before.
# NOTE: the plotting functions are defined in the cells further down this
# notebook, so on a fresh kernel those cells must be executed before this one.
for make_plot in (
    composite_features,
    link_scores,
    entity_scores,
    entity_score_ranks,
    entity_score_dtop,
    entity_score_dsuc,
    cosine_sims,
    cosine_sim_ranks,
    cosine_sim_dtop,
    cosine_sim_dsuc,
):
    make_plot()
"done"


Out[28]:
'done'

In [8]:
def composite_features():
    """Scatter ``cosine_sim`` against ``entity_score``, coloured by class.

    Reads the module-level ``df``. Invalid links are drawn first, then valid
    links, on a logarithmic y axis limited to [1e-4, 1]. The figure is shown
    and then written to ``plots/composite_features.png`` at 600 dpi; the
    ``plots/`` directory is expected to exist.
    """
    # Plain column arrays and a boolean mask replace DataFrame.as_matrix(),
    # which pandas 1.0 removed.
    is_valid = df["correct"].values.astype(bool)
    entity_score = df["entity_score"].values
    cosine_sim = df["cosine_sim"].values

    class_names = ['Invalid link', 'Valid link']
    colors = [n_color, p_color]
    masks = [~is_valid, is_valid]

    fig, axes = plt.subplots()
    for color, mask, class_name in zip(colors, masks, class_names):
        axes.scatter(entity_score[mask], cosine_sim[mask], color=color,
                     alpha=0.2, label=class_name, marker=".")

    axes.set_xlim([0.0, 1.0])
    axes.set_ylim([0.0001, 1.0])
    axes.set_yscale('log')

    axes.set_xlabel(es)
    axes.set_ylabel(cs)
    axes.legend(loc='lower right', shadow=False, scatterpoints=1)
    plt.show()
    fig.savefig("plots/composite_features.png", bbox_inches="tight", dpi=600)

composite_features()
"done"


Out[8]:
'done'

In [13]:
def link_scores(logx=False):
    """Plot per-class relative frequencies of ``link_score``.

    Rows of the module-level ``df`` are split on the ``correct`` column and
    each class is weighted by 1 / (class size), so bar heights are relative
    frequencies within that class. The figure is shown and then written to
    ``plots/link_scores.pdf``; the ``plots/`` directory is expected to exist.

    Args:
        logx: If true, use 30 log-spaced bin edges between 0.1 and 1 and a
            logarithmic x axis; otherwise 30 equal-width bins on [0, 1].
    """
    # Boolean masks replace DataFrame.as_matrix() (removed in pandas 1.0) and
    # the row-by-row filter/map pass over the whole frame.
    is_valid = df["correct"].values.astype(bool)
    scores = df["link_score"].values
    tp = scores[is_valid]
    tn = scores[~is_valid]

    # create bin boundaries
    number_bins = 30
    bins = np.arange(0.0, 1.0, 1.0 / number_bins)
    bins = np.append(bins, 1.0)
    if logx:
        bins = np.logspace(-1, 0, number_bins)

    # normalize histogram: show relative frequencies of classes
    p_weights = np.ones_like(tp) / float(len(tp))
    n_weights = np.ones_like(tn) / float(len(tn))

    fig, ax = plt.subplots()
    # Use the shared legend labels, like the other histogram functions do.
    ax.hist(tp, weights=p_weights, bins=bins, alpha=p_alpha, label=p_label, color=p_color)
    ax.hist(tn, weights=n_weights, bins=bins, alpha=n_alpha, label=n_label, color=n_color)

    if logx:
        ax.set_xlim([0.1, 1.0])
        ax.set_xscale('log')
    ax.set_ylim([0.0, 1.0])

    ax.set_xlabel(ls)
    ax.set_ylabel(relative_frequency)
    ax.legend(loc='best')
    plt.show()
    fig.savefig("plots/link_scores.pdf", bbox_inches="tight")

link_scores(False)



In [14]:
def entity_scores(normalize=True):
    """Plot the per-class histogram of ``entity_score``.

    Rows of the module-level ``df`` are split on the ``correct`` column and
    binned into 30 equal-width bins on [0, 1]. The figure is shown and then
    written to ``plots/entity_scores.pdf``; the ``plots/`` directory is
    expected to exist.

    Args:
        normalize: If true, weight each class by 1 / (class size) and fix the
            y axis to [0, 1]. If false, plot raw counts on a logarithmic
            y axis.
    """
    # https://stackoverflow.com/questions/6871201/plot-two-histograms-at-the-same-time-with-matplotlib

    # Boolean masks replace DataFrame.as_matrix() (removed in pandas 1.0) and
    # the row-by-row filter/map pass over the whole frame.
    is_valid = df["correct"].values.astype(bool)
    scores = df["entity_score"].values
    tp = scores[is_valid]
    tn = scores[~is_valid]

    # create bin boundaries
    number_bins = 30
    bins = np.arange(0.0, 1.0, 1.0 / number_bins)
    bins = np.append(bins, 1.0)

    # normalize histogram: show relative frequencies of classes
    if normalize:
        p_weights = np.ones_like(tp) / float(len(tp))
        n_weights = np.ones_like(tn) / float(len(tn))
    else:
        p_weights = [1] * len(tp)
        n_weights = [1] * len(tn)

    fig, ax = plt.subplots()
    ax.hist(tp, weights=p_weights, bins=bins, alpha=p_alpha, label=p_label, color=p_color)
    ax.hist(tn, weights=n_weights, bins=bins, alpha=n_alpha, label=n_label, color=n_color)
    if normalize:
        ax.set_ylim([0.0, 1.0])
    else:
        ax.set_yscale("log")

    ax.set_xlabel(es)
    # NOTE(review): the y label reads "relative frequency" even when
    # normalize=False plots absolute counts -- confirm whether that is intended.
    ax.set_ylabel(relative_frequency)
    ax.legend(loc='best')
    plt.show()
    fig.savefig("plots/entity_scores.pdf", bbox_inches="tight")

entity_scores(True)



In [27]:
def cosine_sims(logx=True, logy=True):
    """Plot per-class relative frequencies of ``cosine_sim`` (context score).

    Rows of the module-level ``df`` are split on the ``correct`` column and
    each class is weighted by 1 / (class size). The figure is shown and then
    written to ``plots/context_scores.pdf``; the ``plots/`` directory is
    expected to exist.

    Args:
        logx: If true, use 35 log-spaced bin edges between 1e-4 and 1 and a
            logarithmic x axis; otherwise 35 equal-width bins on [0, 1].
        logy: If true, use a logarithmic y axis limited to [1e-5, 1];
            otherwise a linear y axis limited to [0, 1].
    """
    # Boolean masks replace DataFrame.as_matrix() (removed in pandas 1.0) and
    # the row-by-row filter/map pass over the whole frame.
    is_valid = df["correct"].values.astype(bool)
    scores = df["cosine_sim"].values
    tp = scores[is_valid]
    tn = scores[~is_valid]

    # create bin boundaries
    number_bins = 35
    bins = np.arange(0.0, 1.0, 1.0 / number_bins)
    bins = np.append(bins, 1.0)
    if logx:
        bins = np.logspace(-4, 0, number_bins)

    # normalize histogram: show relative frequencies of classes
    p_weights = np.ones_like(tp) / float(len(tp))
    n_weights = np.ones_like(tn) / float(len(tn))

    fig, ax = plt.subplots()
    ax.hist(tp, weights=p_weights, bins=bins, alpha=p_alpha, label=p_label, color=p_color)
    ax.hist(tn, weights=n_weights, bins=bins, alpha=n_alpha, label=n_label, color=n_color)
    if logx:
        ax.set_xlim([0.0001, 1.0])
        ax.set_xscale('log')
    if logy:
        ax.set_ylim([0.00001, 1.0])
        ax.set_yscale('log')
    else:
        ax.set_ylim([0.0, 1.0])

    ax.set_xlabel(cs)
    ax.set_ylabel(relative_frequency)
    ax.legend(loc='best')
    plt.show()
    fig.savefig("plots/context_scores.pdf", bbox_inches="tight")

cosine_sims(True, True)



In [16]:
def entity_score_ranks():
    """Plot per-class relative frequencies of ``entity_score_rank`` for ranks 1-29.

    Rows of the module-level ``df`` are split on the ``correct`` column and
    each class is weighted by 1 / (class size); the weights use the full
    class size, so ranks above 29 are not drawn but still count towards the
    denominator. The figure is shown and then written to
    ``plots/entity_scores_ranks.pdf``; the ``plots/`` directory is expected
    to exist.
    """
    # Boolean masks replace DataFrame.as_matrix() (removed in pandas 1.0) and
    # the row-by-row filter/map pass over the whole frame.
    is_valid = df["correct"].values.astype(bool)
    ranks = df["entity_score_rank"].values
    tp = ranks[is_valid]
    tn = ranks[~is_valid]

    # create bin boundaries
    # Half-integer edges give every rank k in 1..29 its own bin [k-0.5, k+0.5),
    # centred on k. With integer edges 1..30 the last histogram bin is closed
    # on the right, so rank 30 was being added to the bar drawn at 29.
    number_bins = 29
    bins = np.arange(1, number_bins + 2) - 0.5

    # normalize histogram: show relative frequencies of classes
    p_weights = np.ones_like(tp) / float(len(tp))
    n_weights = np.ones_like(tn) / float(len(tn))

    fig, ax = plt.subplots()
    ax.hist(tp, weights=p_weights, bins=bins, alpha=p_alpha, label=p_label, color=p_color)
    ax.hist(tn, weights=n_weights, bins=bins, alpha=n_alpha, label=n_label, color=n_color)

    # Ticks at the first rank, every fifth rank, and the last rank shown.
    ticks = [1] + list(range(5, number_bins + 1, 5)) + [number_bins]
    ax.set_xticks(ticks)
    ax.set_ylim([0.0, 1.0])

    ax.set_xlabel(es_rank)
    ax.set_ylabel(relative_frequency)
    ax.legend(loc='upper right')
    plt.show()
    fig.savefig("plots/entity_scores_ranks.pdf", bbox_inches="tight")

entity_score_ranks()



In [17]:
def cosine_sim_ranks():
    """Plot per-class relative frequencies of ``cosine_sim_rank`` for ranks 1-29.

    Rows of the module-level ``df`` are split on the ``correct`` column and
    each class is weighted by 1 / (class size); the weights use the full
    class size, so ranks above 29 are not drawn but still count towards the
    denominator. The figure is shown and then written to
    ``plots/context_scores_ranks.pdf``; the ``plots/`` directory is expected
    to exist.
    """
    # Boolean masks replace DataFrame.as_matrix() (removed in pandas 1.0) and
    # the row-by-row filter/map pass over the whole frame.
    is_valid = df["correct"].values.astype(bool)
    ranks = df["cosine_sim_rank"].values
    tp = ranks[is_valid]
    tn = ranks[~is_valid]

    # create bin boundaries
    # Half-integer edges give every rank k in 1..29 its own bin [k-0.5, k+0.5),
    # centred on k. With integer edges 1..30 the last histogram bin is closed
    # on the right, so rank 30 was being added to the bar drawn at 29.
    number_bins = 29
    bins = np.arange(1, number_bins + 2) - 0.5

    # normalize histogram: show relative frequencies of classes
    p_weights = np.ones_like(tp) / float(len(tp))
    n_weights = np.ones_like(tn) / float(len(tn))

    fig, ax = plt.subplots()
    ax.hist(tp, weights=p_weights, bins=bins, alpha=p_alpha, label=p_label, color=p_color)
    ax.hist(tn, weights=n_weights, bins=bins, alpha=n_alpha, label=n_label, color=n_color)

    # Ticks at the first rank, every fifth rank, and the last rank shown.
    ticks = [1] + list(range(5, number_bins + 1, 5)) + [number_bins]
    ax.set_xticks(ticks)
    ax.set_ylim([0.0, 1.0])

    ax.set_xlabel(cs_rank)
    ax.set_ylabel(relative_frequency)
    ax.legend(loc='upper right')
    plt.show()
    fig.savefig("plots/context_scores_ranks.pdf", bbox_inches="tight")

cosine_sim_ranks()



In [18]:
def entity_score_dtop():
    """Plot per-class relative frequencies of ``entity_score_dtop``.

    Rows of the module-level ``df`` are split on the ``correct`` column and
    each class is weighted by 1 / (class size). Besides 30 equal-width bins
    on [0, 1], two bins of the same width are added to the left of zero; they
    catch the negative sentinel that replaced infinite values when ``df`` was
    built. The figure is shown and then written to
    ``plots/entity_scores_dtop.pdf``; the ``plots/`` directory is expected
    to exist.
    """
    # Boolean masks replace DataFrame.as_matrix() (removed in pandas 1.0) and
    # the row-by-row filter/map pass over the whole frame.
    is_valid = df["correct"].values.astype(bool)
    values = df["entity_score_dtop"].values
    tp = values[is_valid]
    tn = values[~is_valid]

    # create bin boundaries
    number_bins = 30
    bin_start, bin_end = 0.0, 1.0
    bin_width = (bin_end - bin_start) / number_bins
    bins = np.arange(bin_start, bin_end, bin_width)
    bins = np.append(bins, 1.0)
    # Number of bin widths reserved to the left of bin_start.
    space = 2
    bins = np.append([bin_start - space * bin_width, bin_start - bin_width], bins)

    # normalize histogram: show relative frequencies of classes
    p_weights = np.ones_like(tp) / float(len(tp))
    n_weights = np.ones_like(tn) / float(len(tn))

    fig, ax = plt.subplots()
    ax.hist(tp, weights=p_weights, bins=bins, alpha=p_alpha, label=p_label, color=p_color)
    ax.hist(tn, weights=n_weights, bins=bins, alpha=n_alpha, label=n_label, color=n_color)

    ax.set_xlim(-space * bin_width, 1.0)
    ax.set_ylim([0.0, 1.0])

    ax.set_xlabel(es_dtop)
    ax.set_ylabel(relative_frequency)
    ax.legend(loc='best')
    plt.show()
    fig.savefig("plots/entity_scores_dtop.pdf", bbox_inches="tight")

entity_score_dtop()



In [19]:
def cosine_sim_dtop():
    """Plot per-class relative frequencies of ``cosine_sim_dtop``.

    Rows of the module-level ``df`` are split on the ``correct`` column and
    each class is weighted by 1 / (class size). Besides 30 equal-width bins
    on [0, 1], two bins of the same width are added to the left of zero; they
    catch the negative sentinel that replaced infinite values when ``df`` was
    built. The figure is shown and then written to
    ``plots/context_scores_dtop.pdf``; the ``plots/`` directory is expected
    to exist.
    """
    # Boolean masks replace DataFrame.as_matrix() (removed in pandas 1.0) and
    # the row-by-row filter/map pass over the whole frame.
    is_valid = df["correct"].values.astype(bool)
    values = df["cosine_sim_dtop"].values
    tp = values[is_valid]
    tn = values[~is_valid]

    # create bin boundaries
    number_bins = 30
    bin_start, bin_end = 0.0, 1.0
    bin_width = (bin_end - bin_start) / number_bins
    bins = np.arange(bin_start, bin_end, bin_width)
    bins = np.append(bins, 1.0)
    bins = np.append([bin_start - 2 * bin_width, bin_start - bin_width], bins)

    # normalize histogram: show relative frequencies of classes
    p_weights = np.ones_like(tp) / float(len(tp))
    n_weights = np.ones_like(tn) / float(len(tn))

    fig, ax = plt.subplots()
    ax.hist(tp, weights=p_weights, bins=bins, alpha=p_alpha, label=p_label, color=p_color)
    ax.hist(tn, weights=n_weights, bins=bins, alpha=n_alpha, label=n_label, color=n_color)

    ax.set_xlim(-2 * bin_width, 1.0)
    ax.set_ylim([0.0, 1.0])

    ax.set_xlabel(cs_dtop)
    ax.set_ylabel(relative_frequency)
    ax.legend(loc='best')
    plt.show()
    fig.savefig("plots/context_scores_dtop.pdf", bbox_inches="tight")

cosine_sim_dtop()



In [20]:
def entity_score_dsuc():
    """Plot per-class relative frequencies of ``entity_score_dsuc``.

    Rows of the module-level ``df`` are split on the ``correct`` column and
    each class is weighted by 1 / (class size). Besides 30 equal-width bins
    on [0, 1], two bins of the same width are added to the left of zero; they
    catch the negative sentinel that replaced infinite values when ``df`` was
    built. The figure is shown and then written to
    ``plots/entity_scores_dsuc.pdf``; the ``plots/`` directory is expected
    to exist.
    """
    # Boolean masks replace DataFrame.as_matrix() (removed in pandas 1.0) and
    # the row-by-row filter/map pass over the whole frame.
    is_valid = df["correct"].values.astype(bool)
    values = df["entity_score_dsuc"].values
    tp = values[is_valid]
    tn = values[~is_valid]

    # create bin boundaries
    number_bins = 30
    bin_start, bin_end = 0.0, 1.0
    bin_width = (bin_end - bin_start) / number_bins
    bins = np.arange(bin_start, bin_end, bin_width)
    bins = np.append(bins, 1.0)
    bins = np.append([bin_start - 2 * bin_width, bin_start - bin_width], bins)

    # normalize histogram: show relative frequencies of classes
    p_weights = np.ones_like(tp) / float(len(tp))
    n_weights = np.ones_like(tn) / float(len(tn))

    fig, ax = plt.subplots()
    ax.hist(tp, weights=p_weights, bins=bins, alpha=p_alpha, label=p_label, color=p_color)
    ax.hist(tn, weights=n_weights, bins=bins, alpha=n_alpha, label=n_label, color=n_color)

    ax.set_xlim(-2 * bin_width, 1.0)
    ax.set_ylim([0.0, 1.0])

    ax.set_xlabel(es_dsuc)
    ax.set_ylabel(relative_frequency)
    ax.legend(loc='best')
    plt.show()
    fig.savefig("plots/entity_scores_dsuc.pdf", bbox_inches="tight")

entity_score_dsuc()



In [21]:
def cosine_sim_dsuc():
    """Plot per-class relative frequencies of ``cosine_sim_dsuc``.

    Rows of the module-level ``df`` are split on the ``correct`` column and
    each class is weighted by 1 / (class size). Besides 30 equal-width bins
    on [0, 1], two bins of the same width are added to the left of zero; they
    catch the negative sentinel that replaced infinite values when ``df`` was
    built. Values above 1 fall outside every bin and are not drawn. The
    figure is shown and then written to ``plots/context_scores_dsuc.pdf``;
    the ``plots/`` directory is expected to exist.
    """
    # Boolean masks replace DataFrame.as_matrix() (removed in pandas 1.0) and
    # the row-by-row filter/map pass over the whole frame.
    is_valid = df["correct"].values.astype(bool)
    values = df["cosine_sim_dsuc"].values
    tp = values[is_valid]
    tn = values[~is_valid]

    # create bin boundaries
    number_bins = 30
    bin_start, bin_end = 0.0, 1.0
    bin_width = (bin_end - bin_start) / number_bins
    bins = np.arange(bin_start, bin_end, bin_width)
    bins = np.append(bins, 1.0)
    bins = np.append([bin_start - 2 * bin_width, bin_start - bin_width], bins)

    # normalize histogram: show relative frequencies of classes
    p_weights = np.ones_like(tp) / float(len(tp))
    n_weights = np.ones_like(tn) / float(len(tn))

    fig, ax = plt.subplots()
    ax.hist(tp, weights=p_weights, bins=bins, alpha=p_alpha, label=p_label, color=p_color)
    ax.hist(tn, weights=n_weights, bins=bins, alpha=n_alpha, label=n_label, color=n_color)

    ax.set_xlim(-2 * bin_width, 1.0)
    ax.set_ylim([0.0, 1.0])

    ax.set_xlabel(cs_dsuc)
    ax.set_ylabel(relative_frequency)
    ax.legend(loc='best')
    plt.show()
    fig.savefig("plots/context_scores_dsuc.pdf", bbox_inches="tight")

cosine_sim_dsuc()