Testing Yan et al. (2010, 2011) on the Radio Galaxy Zoo

Let's run the crowd learning algorithm on the Radio Galaxy Zoo.


In [1]:
from pprint import pprint
import sys

from astropy.coordinates import SkyCoord
import h5py
import numpy
import sklearn.neighbors
import seaborn

sys.path.insert(1, '..')
import crowdastro.active_learning.active_crowd as active_crowd
import crowdastro.active_learning.passive_crowd as passive_crowd
import crowdastro.active_learning.active_crowd_scalar as active_crowd_scalar

CROWDASTRO_H5_PATH = '../data/crowdastro.h5'
TRAINING_H5_PATH = '../data/training.h5'
NORRIS_DAT_PATH = '../data/norris_2006_atlas_classifications_ra_dec_only.dat'

# Load Norris labels.
with h5py.File(TRAINING_H5_PATH, 'r') as training_f:
    ir_positions = training_f['positions'].value
ir_tree = sklearn.neighbors.KDTree(ir_positions)

with open(NORRIS_DAT_PATH, 'r') as norris_dat:
    norris_coords = [r.strip().split('|') for r in norris_dat]

norris_labels = numpy.zeros((len(ir_positions)))
for ra, dec in norris_coords:
    # Find a neighbour.
    skycoord = SkyCoord(ra=ra, dec=dec, unit=('hourangle', 'deg'))
    ra = skycoord.ra.degree
    dec = skycoord.dec.degree
    ((dist,),), ((ir,),) = ir_tree.query([(ra, dec)])
    if dist < 0.1:
        norris_labels[ir] = 1

How many annotators do we have? How many labels are anonymously contributed?

At the moment, I can only use the algorithm for non-anonymous users. How many are there?


In [2]:
with h5py.File(CROWDASTRO_H5_PATH) as f_h5:
    print(sum(1 for i in f_h5['/atlas/cdfs/']['classification_usernames'] if not i)
          / len(f_h5['/atlas/cdfs/']['classification_usernames']))


0.14638396747022944

Only 15% of labels are contributed by anonymous users! That's great for the algorithm. How many users are there?


In [3]:
with h5py.File(CROWDASTRO_H5_PATH) as f_h5:
    print(len({i for i in f_h5['/atlas/cdfs/']['classification_usernames'] if i}))


1193

There are 1193 labellers. That's big but hopefully my code can handle it (and if not I'll have to change my methodology a bit).

Retrieving labels

Let's pull out some labels. This involves matching each IR object to a label for each annotator. If a IR object never appears in a subject that the annotator has labelled, then it should be masked.


In [4]:
with h5py.File(CROWDASTRO_H5_PATH) as f_h5:
    annotators = sorted({i for i in f_h5['/atlas/cdfs/classification_usernames'] if i})
    n_annotators = len(annotators)
    annotator_to_index = {j:i for i, j in enumerate(annotators)}
    
    n_examples = f_h5['/wise/cdfs/numeric'].shape[0]
    
    ir_tree = sklearn.neighbors.KDTree(f_h5['/wise/cdfs/numeric'][:, :2], metric='chebyshev')

In [5]:
with h5py.File(CROWDASTRO_H5_PATH) as f_h5:
    labels = numpy.ma.MaskedArray(numpy.zeros((n_annotators, n_examples)),
                                  mask=numpy.ones((n_annotators, n_examples)))
    
    for (atlas_idx, ra, dec), c_user in zip(
            f_h5['/atlas/cdfs/classification_positions'],
            f_h5['/atlas/cdfs/classification_usernames'],
    ):
        if not c_user:
            continue

        t = annotator_to_index[c_user]
        
        atlas_ra, atlas_dec = f_h5['/atlas/cdfs/numeric'][atlas_idx, :2]
        
        # t has seen this ATLAS subject, so unmask everything within 1' Chebyshev distance (the radius of an RGZ subject).
        nearby = ir_tree.query_radius([[atlas_ra, atlas_dec]], 1 / 60)[0]
        labels.mask[t, nearby] = 0

        # Label the point nearest the classification as 1.
        # (The others are 0 by default.)
        if numpy.isnan(ra) or numpy.isnan(dec):
            continue

        point = ir_tree.query([[ra, dec]], return_distance=False)[0]
        labels[t, point] = 1

In [6]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(15, 10))
plt.subplot(1, 2, 1)
plt.hist((~labels.mask).sum(axis=0))
plt.xlabel('Number of viewings')
plt.ylabel('IR objects')
plt.subplot(1, 2, 2)
plt.hist((~labels.mask).sum(axis=1), bins=numpy.linspace(0, 25000, 200))
plt.xlim((0, 1000))
plt.xlabel('Number of annotations')
plt.ylabel('Annotators')


Out[6]:
<matplotlib.text.Text at 0x2274b83a748>

How good are the annotators? #127

What is the distribution of balanced accuracies for each annotator? Can we estimate $p(y_i^{(t)} | x_i, z_i)$?


In [7]:
import sklearn.metrics

accuracies = []
annotator_to_accuracy = {}
for t in range(n_annotators):
    mask = labels[t].mask
    cm = sklearn.metrics.confusion_matrix(norris_labels[~mask], labels[t, ~mask]).astype(float)
    if cm.shape == (1, 1):
        continue

    tp = cm[1, 1]
    n, p = cm.sum(axis=1)
    tn = cm[0, 0]
    if not (n and p):
        continue

    ba = (tp / p + tn / n) / 2
    accuracies.append(ba)
    annotator_to_accuracy[t] = ba

print('{:.02%} of labellers have a balanced accuracy.'.format(len(accuracies) / n_annotators))


74.27% of labellers have a balanced accuracy.

In [8]:
plt.hist(accuracies, color='grey', bins=20)
plt.xlabel('Balanced accuracy')
plt.ylabel('Number of annotators')
plt.show()

print('Average: ({:.02f} +- {:.02f})%'.format(numpy.mean(accuracies) * 100, numpy.std(accuracies) * 100))


Average: (71.03 +- 18.29)%

How many annotators are experts?


In [9]:
experts = ("42jkb", "ivywong", "stasmanian", "klmasters", "Kevin",
           "akapinska", "enno.middelberg", "xDocR", "DocR", "vrooje", "KWillett")
print([expert for expert in experts if expert.encode('ascii') in annotators])


['KWillett']

How many positive examples have the top 10 annotators labelled?


In [10]:
counts = [(numpy.ma.sum(labels[t] == 1), t) for t in range(n_annotators)]
counts.sort()
pprint([(annotator_to_accuracy[t], t, count) for count, t in counts[-10:]])

top_10 = sorted([t for _, t in counts[-10:]])


[(0.75797582455910884, 1169, 519),
 (0.75699271733750673, 135, 683),
 (0.75924796624019708, 64, 962),
 (0.69394703116773282, 314, 1500),
 (0.7682293723857847, 662, 1893),
 (0.77407966065131606, 1062, 1916),
 (0.80921568705428282, 604, 1967),
 (0.81688934170995275, 1051, 1970),
 (0.82859181461409082, 978, 2006),
 (0.83511567214955051, 490, 2233)]

In [11]:
for annotator, count in [(annotators[t], count) for count, t in reversed(counts)]:
    print(annotator.decode('utf-8'), '\t', count)


antikodon 	 2233
planetari7 	 2006
sisifolibre 	 1970
csunjoto 	 1967
snowysky 	 1916
equidad1 	 1893
PattyD 	 1500
ChaMei 	 962
Explorer15 	 683
willy71 	 519
cripple85 	 506
JeanTate 	 430
Mitch56 	 427
Mimocarino 	 398
HelmutU 	 358
Dolorous Edd 	 337
ControlledChaos 	 268
arianna99 	 260
shocko61 	 219
Shade017@umn.edu 	 205
rm3082 	 196
tbrockhaus 	 179
ChrisMolloy 	 172
graham d 	 171
pamelaann 	 165
G.Costalunga 	 137
brandon.mariano 	 126
Weissoniq 	 124
1001G 	 118
flecmm03 	 116
parkerm126 	 115
ruprechtludwig 	 113
ashleysuzanne 	 105
kobo123 	 102
Gary Michael 	 100
tom324232 	 99
Prothon 	 95
romulous_ErrorDupUsername 	 93
bobk47 	 92
alinejojo 	 91
mixu74 	 90
Rock Beauty 	 86
Jimg3d 	 84
Swann2501 	 83
fiskmannen 	 76
gavinrider 	 70
WizardHowl 	 67
WEBs in space 	 65
armybro1 	 64
Engel1587 	 62
dhiana 	 60
chiliking 	 60
area53 	 59
legacyhardware 	 58
gwallingford 	 58
Missybee35 	 58
olbo 	 57
ehinterm 	 57
radek45 	 56
spaceygeek 	 55
Codo 	 54
Vic7or 	 52
cjunco 	 51
ruthparker 	 50
Ashlynn123 	 50
star-light@cox.net 	 48
LeCorndog 	 47
lemisiak 	 46
honeyeyedchild 	 46
Rilo 	 46
victorialepp 	 45
j317182123 	 45
FRA1MSP 	 45
scmorris 	 43
anncopal 	 40
Shill2008 	 40
rylas007 	 39
orionsam1 	 38
jennifer bjornstrom 	 38
paweltluscik 	 37
McMeeps 	 37
Jordan.Botta 	 37
steve giess 	 34
nico775 	 34
cosmonaut42 	 33
Wald86 	 33
eddaw 	 32
alphaxpress 	 32
Flatlegs 	 32
Chouchou Bidou 	 32
alycerethmann 	 31
SPevely 	 31
starquake 	 29
rcmills1707@gmail.com 	 29
fernandoitri 	 29
called2serve249 	 29
davestringer 	 28
chairgaf 	 28
skysmyzer 	 27
peterw143 	 27
namla 	 27
marcostudios 	 27
m161014 	 27
dylantowers 	 27
abnaxusofvenar 	 27
DEP 	 27
blueOrion 	 26
a_rusin 	 26
PetM 	 26
papitanovana 	 25
chester_ltp 	 25
burtonpl 	 25
Nightowl1955 	 25
graham_d 	 24
cokeomo 	 24
RobMatMar 	 24
Jessielucas 	 24
vargajiri 	 23
meus 	 23
avaughan 	 23
Kira7940 	 23
suzyq0810 	 22
seeingdublin2 	 22
dbrockwa 	 22
Loulouuse 	 22
jeffsluggett 	 21
b.umberto 	 21
aussiegoodstuff 	 21
Tonya_Kiissah_Nox 	 21
Kavaec94 	 21
titaNium2 	 20
Griff2142 	 20
ElisabethB 	 20
lilbethfay 	 19
ebanuelos9 	 19
dresslikeice 	 19
Selaene 	 19
JSeim 	 19
Angelocieco 	 19
Dolorous_Edd 	 18
Chezburgar7300 	 18
Alpha-Orionis 	 18
gsullivan4 	 17
gfitzs321 	 17
alanalas 	 17
Wizardlord 	 17
Ptd 	 17
Heffy 	 17
Andy_Arg 	 17
voidstar 	 16
matt234pm 	 16
N5bz 	 16
DANIEL_GARCIA 	 16
stargazer375 	 15
randymcvey77 	 15
paris.antonio 	 15
jojoba 	 15
despertaferro 	 15
btaraybill 	 15
alexthepink95 	 15
PRivate_pAuLa 	 15
Fenja 	 15
EdGi 	 15
tjones3 	 14
soretim 	 14
robynet 	 14
nerilldp 	 14
maxpra99 	 14
letsfaceit 	 14
giazira 	 14
chris_sasaurus 	 14
adamovichsv 	 14
Woko69 	 14
Skydiver33 	 14
vigliottac 	 13
jmarinelli 	 13
eperozzi 	 13
ba12003127 	 13
Zwolf20 	 13
Slice0matic 	 13
Krannin 	 13
JoeCool 	 13
CrystalFigueroa05 	 13
snowflakesarewhite 	 12
siasac 	 12
lwd630 	 12
jaimevictoria 	 12
Sollieboy 	 12
MarcinKlimczewski 	 12
Hotpepperz 	 12
Heheherbata 	 12
Bwalk2121 	 12
Ariaiyc 	 12
Alex_Chekh 	 12
AZooKeeper0001 	 12
stellatami 	 11
kaylamariee_13 	 11
iamme.cy@gmail.com 	 11
firejuggler 	 11
eylovesoccer 	 11
debbiethetruong 	 11
choplen 	 11
c196948 	 11
beccavasaurus 	 11
awinfrey 	 11
avivgoll 	 11
ariana22c 	 11
Torrencrack 	 11
Tommyedlobster 	 11
ColormaticLlama 	 11
Amaranta09 	 11
Alegria25 	 11
tomhinde 	 10
stam8 	 10
mandi1978 	 10
hbriggs@uwo.ca 	 10
genghiskhen 	 10
arerainingcats 	 10
aomeara1 	 10
Vatilass 	 10
Teo9 	 10
Monkeyscript 	 10
Maroon2790 	 10
Gopach 	 10
DeamCole 	 10
DTSULLY 	 10
Artur Korshak 	 10
wtaskew 	 9
true_Nic 	 9
thundercough 	 9
speakeasysky 	 9
markbakovic 	 9
lreadtan@uwo.ca 	 9
jarmo 	 9
jaredk 	 9
enderwigout 	 9
eclements1980 	 9
aspro82 	 9
abhargava 	 9
Werner  	 9
TheTallestPurple 	 9
RedDawn84 	 9
J3300918 	 9
HighDry 	 9
wenson91 	 8
vtenhunen 	 8
vmax3000 	 8
moonbasealpha 	 8
martakot227 	 8
gr357435 	 8
dcm2324 	 8
cserfalvi 	 8
coreyfrey 	 8
aquitanian 	 8
alphamoonbase 	 8
WesleyPeters 	 8
Vendrov 	 8
Ryzasia 	 8
Rcdyson987 	 8
Ravno 	 8
Papertigers 	 8
Mario1967 	 8
Kristie14 	 8
Incarlina 	 8
Hoff810 	 8
GalacticChemist3 	 8
Edeac 	 8
Dougaluk 	 8
Dave_Reynolds 	 8
zorkin.denis 	 7
wstengrasmcke 	 7
toxic0 	 7
strikerusrus 	 7
starchildjuss 	 7
ssh 	 7
smorga45@uwo.ca 	 7
mrmiloslav 	 7
metalmania 	 7
markoramiusiii 	 7
marielouisedonohoe 	 7
lexistirman 	 7
jimfite 	 7
itsfullofstars 	 7
irgendwaslanges 	 7
gartral 	 7
dr. schlau 	 7
dennispattensr 	 7
davilaelijah 	 7
brentdmorris 	 7
avoigt 	 7
antsaa79 	 7
akbaker1 	 7
acisco0006 	 7
Ushiromiya Xyrius 	 7
User3110 	 7
Stooth 	 7
SpaceHuntress 	 7
Sharkounet 	 7
Romulous 	 7
NicoleFayK 	 7
Mspinelli91 	 7
MisterToughWhite 	 7
MisBi 	 7
Mike Beer 	 7
Mht 	 7
FairyRed 	 7
Ewert Strmberg 	 7
Ellery 	 7
Colmena 	 7
Cautator 	 7
Alien Rast 	 7
zwanenburg 	 6
zoolodjee 	 6
zabbott2 	 6
xvnopill 	 6
wgoltz 	 6
tvirian 	 6
trackrat148 	 6
themoffitt 	 6
taras mykytyuk 	 6
stella stasinopoulou 	 6
sparfyonova@gmail.com 	 6
scara 	 6
rangers1221 	 6
micfly25 	 6
mforkin 	 6
metastrophe 	 6
mbabiera@uwo.ca 	 6
mary.e.savage 	 6
lnbradsh@asu.edu 	 6
kopec705 	 6
khafak 	 6
kernewe@uwo.ca 	 6
irene.mp 	 6
hjmedrud 	 6
hardan 	 6
griever41269 	 6
gleffeer 	 6
genogenie 	 6
foxxravin 	 6
eaglefire 	 6
dazzle34 	 6
cyannn 	 6
bhowidi@uwo.ca 	 6
ameliaharkins 	 6
achedore@uwo.ca 	 6
Zeladra 	 6
Trs8682 	 6
Tanzo 	 6
Smkm724 	 6
Shairlyn 	 6
Scienceprof 	 6
PsyberDave 	 6
Mietek62 	 6
Mars_to_earth 	 6
Kieran Davies 	 6
Jeb Kerman 	 6
Intestereo 	 6
Hoganwpearson 	 6
Fuzzie47 	 6
Evilshrimp 	 6
Dumortier 	 6
BigRed1 	 6
Amy Marie 	 6
waprior 	 5
wakaleo 	 5
therealstarwars 	 5
szymondrag 	 5
szheng86 	 5
sk10071609 	 5
sandra pilkington 	 5
robinstratford 	 5
pppb 	 5
mwitteri 	 5
mli627@uwo.ca 	 5
meyeusername 	 5
mckbenbow 	 5
mauromarussi 	 5
logantbowhay 	 5
lena3rika 	 5
krobe46@uwo.ca 	 5
kedems29 	 5
kb969676 	 5
jmacph2@uwo.ca 	 5
jferre7@uwo.ca 	 5
jcoulth2@uwo.ca 	 5
jackalsnight 	 5
ian hodgson 	 5
frq 	 5
fido2 	 5
dontus 	 5
deubeuliou 	 5
ashleyrenee13 	 5
ap1101 	 5
ann_otlivanovaa 	 5
anastasija 	 5
Radioastro 	 5
Qlex 	 5
Omega Goon 	 5
Nique_2005 	 5
NeilDeGrassTyson 	 5
KWillett 	 5
JuliaIaniro 	 5
Josh Bosh 	 5
John De Rouck 	 5
JanaW 	 5
Jamiejson 	 5
JRod1994 	 5
Frostman1187 	 5
Foraldo 	 5
Craig_Sour 	 5
Chinabob 	 5
ArtValenti 	 5
AndyDaley 	 5
wabuswell 	 4
tjohns94@uwo.ca 	 4
sthaler 	 4
sidneyarham 	 4
shood1 	 4
sergiompereira 	 4
sarahvalentin86 	 4
sarah_ash9 	 4
rreilly@uwo.ca 	 4
rmounce 	 4
rfdomingos 	 4
realcarolineb 	 4
rbhatna4 	 4
pytolsky 	 4
pvjones 	 4
pontifex 	 4
phlogos 	 4
nilium 	 4
mnowy 	 4
mind123 	 4
mewking 	 4
martymacs 	 4
kmarskel@uwo.ca 	 4
kekkyojin 	 4
kayleehairel 	 4
karpitto 	 4
jriccion 	 4
josefajardo 	 4
jlesarg2@uwo.ca 	 4
jeffrey_c_schneider@yahoo.com 	 4
jddavis007 	 4
jchromcz 	 4
hgolden95 	 4
dileo3081 	 4
cnygaard08 	 4
cmacd48 	 4
bluerigel314 	 4
billlaw 	 4
bedginto 	 4
bdomingues 	 4
astronomas 	 4
arvvie 	 4
amiltenb@uwo.ca 	 4
akil hashim 	 4
agartland 	 4
aaudette@uwo.ca 	 4
Zliu423@uwo.ca 	 4
Zenstar 	 4
Xito 	 4
Wilfred-Cackowski 	 4
Tulsiqt 	 4
ThePolyscope 	 4
TFurby 	 4
Suddenlybears 	 4
SpaceMonkeyMafia 	 4
Smashrock 	 4
SingMeToSleep 	 4
SMCCOR26 	 4
Praetot6554 	 4
Ozosorio 	 4
Nggoodlet 	 4
Neri_Riker 	 4
MatthewLaj 	 4
Maridan 	 4
MIke Stephen 	 4
Lord Crc 	 4
Lightningfrog 	 4
Leverd01 	 4
KristinaStrelchenko 	 4
Jenniferq32 	 4
Jdesrosier8204025 	 4
Isajah 	 4
Hruk 	 4
Heliosiah 	 4
DocSki2010 	 4
Destice 	 4
DaFaja 	 4
Captain_Shepard 	 4
Baptiste81 	 4
B4sti4n 	 4
Anne1954 	 4
zombig 	 3
zoliveir@uwo.ca 	 3
wr355509 	 3
wouterburs 	 3
websmith77 	 3
tete29999 	 3
tdw1203 	 3
tayal 	 3
stump17jcw 	 3
stargazerqa 	 3
srinivasanranjani 	 3
spencercoffey12 	 3
spacenavy90 	 3
soallard 	 3
slam255@uwo.ca 	 3
sienzala 	 3
sfenix 	 3
serenity613 	 3
saxonturnbuckle 	 3
rws 	 3
rwicken 	 3
rff 	 3
philou92 	 3
pashley108 	 3
pamigo 	 3
ozark576 	 3
nojabr'88 	 3
nijssen 	 3
niascott 	 3
mwong533@uwo.ca 	 3
mscalley03 	 3
mrpaulcromwell 	 3
mlab81 	 3
mercness 	 3
mcv1 	 3
mandy93 	 3
lme 	 3
likyanny 	 3
lecrop 	 3
ladyt 	 3
kukker 	 3
kskok@uwo.ca 	 3
kfontaine 	 3
keev 	 3
keepsie 	 3
kakiszn 	 3
kaelee 	 3
jrich58 	 3
jmarin8@uwo.ca 	 3
jayzz 	 3
jacksonccm 	 3
insaini007 	 3
impossibleiseasy 	 3
homeoflostcauses 	 3
hbrough2 	 3
halorazer 	 3
gigidudu 	 3
gdhill6@uwo.ca 	 3
gbear605 	 3
gapper301 	 3
fredpl 	 3
evaneerd 	 3
emptylica 	 3
dtorresglez 	 3
dspanier 	 3
dramji 	 3
dnbaumgartner 	 3
dgrupultsnik 	 3
deepbhusal 	 3
davevenne 	 3
davest 	 3
cupoftea222 	 3
col281 	 3
codyaaron3 	 3
chrisawesomewright 	 3
cell- 	 3
broseli 	 3
bpa4bes 	 3
blubkeks 	 3
bkn21 	 3
bicanovsky 	 3
astronomicom1 	 3
amahjour@uwo.ca 	 3
alwarren56 	 3
adamack 	 3
Wandering_Recluse 	 3
Stewart Walker 	 3
StellaFrost 	 3
Steckles 	 3
Stained_Spoon 	 3
SorAzolam 	 3
RumblesNZ 	 3
RoyalOcean 	 3
REYNACE 	 3
PlanetMira 	 3
Notri 	 3
Northrop 	 3
Montexes 	 3
MaxKa 	 3
Makc21 	 3
Lynzw 	 3
Junior Guide 	 3
Jagdeep9 	 3
IOAsclepiusOI 	 3
HHMI183 	 3
Gurmatonk 	 3
Guildmaster84 	 3
GrantAR 	 3
Gibas 	 3
Flying_J 	 3
EcceruElme 	 3
Dr_Fra 	 3
Dounat 	 3
Derinox 	 3
Demian Wright 	 3
Dclow 	 3
Daniel222 	 3
Daisy_of_Doom 	 3
Clefairy131 	 3
Chocoholic_Jedi 	 3
Aphelion94 	 3
AngusGeorge 	 3
AlexRaycon 	 3
Aitor Robleto 	 3
ARThompson 	 3
25gabo 	 3
zextriqs 	 2
zardoz99 	 2
ysrivast@uwo.ca 	 2
wmouhtou@uwo.ca 	 2
weegreenblobbie 	 2
vineetp 	 2
vcheun33@uwo.ca 	 2
vanrenter 	 2
vacomano 	 2
uwfshane 	 2
urbanart 	 2
trinicy 	 2
tomssmot 	 2
tokoivun 	 2
tmt2018 	 2
tmccreight25 	 2
titus28 	 2
theodoros71 	 2
theisis 	 2
the wwwyzzerdd 	 2
teufelabgott 	 2
stephen.peace4 	 2
steenz 	 2
starchitect 	 2
smoreng 	 2
sloboje 	 2
silkwire 	 2
seralupin 	 2
scottv 	 2
schhabr7 	 2
sagely06 	 2
rtuttle@uwo.ca 	 2
rocketj9000 	 2
rmz4160 	 2
rmcfry 	 2
rhearen 	 2
rezoloot 	 2
rcowan5@uwo.ca 	 2
pmrab 	 2
peterooch 	 2
paulbistr 	 2
paulbelluscio 	 2
pauetegalopa 	 2
otinokyad 	 2
offshoremike 	 2
nmcawthorne 	 2
nirgut 	 2
naythe 	 2
nakman123 	 2
nakke 	 2
msulli49 	 2
mprange4 	 2
mooreorless 	 2
mjunge 	 2
mgal 	 2
mediafashion 	 2
mdelic 	 2
mcosmo 	 2
mbrow346 	 2
martinpgilson 	 2
martin.kyngas 	 2
madisonparker 	 2
lucilleg1 	 2
lionni 	 2
kfung82 	 2
kennyallau 	 2
jukkamc 	 2
jrb4 	 2
joecag91 	 2
jmacle54@uwo.ca 	 2
jlam323 	 2
jebediah-MTCQ 	 2
jack9761 	 2
ivonameow 	 2
hspring28 	 2
hgojmera 	 2
helicity 	 2
gonzalezdececiliaana 	 2
gly2@uwo.ca 	 2
gguizzo@uwo.ca 	 2
geekess 	 2
fr667 	 2
flipit4u 	 2
ericcrozier 	 2
elshek 	 2
electriclemons 	 2
eikwar 	 2
dudemanppl 	 2
drewcifer90 	 2
donquasar 	 2
dnm2189 	 2
dlppld 	 2
dhruvbhatia1234 	 2
dbalmada 	 2
darkened 	 2
cuttlefish25 	 2
curlcollector 	 2
craigflynn 	 2
cpchandler 	 2
copups 	 2
clloyd 	 2
chasecharlebois 	 2
charlottepingu 	 2
ceyberg 	 2
celticlord88 	 2
cedelong 	 2
cberg 	 2
catelyn 	 2
carolinebowers 	 2
calebfewell 	 2
cLeShEr 	 2
brendamtz96 	 2
bogdanleon 	 2
bharri48@uwo.ca 	 2
berndiesel 	 2
awasae 	 2
avielord1 	 2
austinp9200 	 2
andreanr 	 2
akanner 	 2
agiagiar 	 2
aanastac 	 2
Zarkark 	 2
Zanna640 	 2
Yana Kolba 	 2
Yaksack 	 2
Wonderholic 	 2
Venomex 	 2
VARF 	 2
Uni2013a 	 2
Tobias Heiligensetzer 	 2
Themistocles17 	 2
ThE_jOKe 	 2
Tepf 	 2
TelmaVahey 	 2
TMDG 	 2
Storm610 	 2
Stevie_Ray 	 2
Sonofartemis 	 2
Sleepinbeauty 	 2
Siihq 	 2
Seshins 	 2
Rymang 	 2
RobHutch 	 2
Ray42 	 2
R.i.p. 	 2
Ps359064 	 2
PlaTinum1 	 2
Petrusperes 	 2
Peterbrit 	 2
Olly314 	 2
Nodr 	 2
Mithridate Eupator 	 2
Milbucell 	 2
MichaelRosen 	 2
MavericK75NL 	 2
Mark10189 	 2
MarciAurila 	 2
Madak13 	 2
Libithina 	 2
LeeRy30 	 2
Kylemay15 	 2
Kadik 	 2
KELO76 	 2
Junkieturtle 	 2
JungleHyena 	 2
Joshua_Greschler 	 2
Joshjoshjosh490 	 2
John.obispo 	 2
Isissea 	 2
IronFossile 	 2
Idler 	 2
Icon77 	 2
Huskynator 	 2
Gtar 	 2
Greg the Jew 	 2
GeXeS 	 2
Forsooth 	 2
FooBat 	 2
Foliant 	 2
Fatimajaber97 	 2
DeepWatch 	 2
Dee-Moe 	 2
DarylMusashi 	 2
CzFool 	 2
Crispin.S 	 2
Clue4fun4 	 2
Bowchicawowzers 	 2
Belgabor 	 2
BaibaBond 	 2
B??kitna Czarownica 	 2
AthensHunter1 	 2
Andro1d 	 2
AmadShizzle 	 2
Alejandrohernandez 	 2
2175204 	 2
10740011 	 2
zzzio 	 1
zzutom 	 1
yrawashd@uwo.ca 	 1
yomec 	 1
yellowpecora 	 1
wintrymix 	 1
willjohnboy 	 1
vx100 	 1
vulgothiago 	 1
vihareve 	 1
vaschi 	 1
varunzoo 	 1
vaerth 	 1
tyu93@uwo.ca 	 1
tyonemit@uwo.ca 	 1
turelli 	 1
tprime2@uwo.ca 	 1
tom2316 	 1
tmille55 	 1
tim.astronomy 	 1
thezohar 	 1
therealredwedge 	 1
tess.mcdonald 	 1
tenfiveoh 	 1
swurmser 	 1
susan pyne 	 1
supcom 	 1
stevepm 	 1
sterngucker 	 1
stella_diver 	 1
stardustymd 	 1
sstawins10 	 1
spencerw90 	 1
sonnybayy 	 1
somekindofoctopus 	 1
solarist 	 1
skhan643@uwo.ca 	 1
sjellifo@uwo.ca 	 1
sirk.emerson 	 1
sghahari@uwo.ca 	 1
sbruzzone 	 1
sawman 	 1
sat666/leg 	 1
saradrml 	 1
salmarutha 	 1
sHinE9Y 	 1
ruali12 	 1
rosche2000 	 1
romane954 	 1
rollingronnie 	 1
robtkorb 	 1
rlohman@uwo.ca 	 1
rebeccajane 	 1
rdxts 	 1
pwilling 	 1
ptimlick 	 1
psi22 	 1
pokoirl 	 1
paradyne 	 1
oosime 	 1
omicronCeti 	 1
npettes 	 1
nkapfere@uwo.ca 	 1
njmeluch 	 1
niffe 	 1
nicholastrode 	 1
nicholascross 	 1
ndadams 	 1
nbarth0516 	 1
nbarret6@uwo.ca 	 1
myamar 	 1
mwhitle@uwo.ca 	 1
mudkip201 	 1
msk1711 	 1
mr_seeker 	 1
mlgd 	 1
mlewis69 	 1
mjedm6 	 1
mirandavirabeth 	 1
mgouws 	 1
mdesche2 	 1
mattwj8907 	 1
masamunecyrus 	 1
martmeos 	 1
lupechales 	 1
lmokgeth@uwo.ca 	 1
littletofu 	 1
lindzdeal812 	 1
lindek 	 1
lexagons 	 1
leeatschool 	 1
lechaussette 	 1
lawrence281192 	 1
kpostlet@uwo.ca 	 1
kleineguzzista 	 1
klapushnizza 	 1
kirish2@uwo.ca 	 1
kidbrejo 	 1
kheperas 	 1
katmos 	 1
katayfaya 	 1
kamdeno333 	 1
justmelissa 	 1
justinbronze 	 1
juliapot11 	 1
jturns28 	 1
jstol@uwo.ca 	 1
jruddoc@uwo.ca 	 1
jpdurham 	 1
jononeill82 	 1
jones.joshua 	 1
jonbark 	 1
jonathanstcroix 	 1
jojohase 	 1
johnv2 	 1
johanger 	 1
jo-luc 	 1
jntingler 	 1
jmills224 	 1
jmccab2@uwo.ca 	 1
jlfqam 	 1
jferna29@uwo.ca 	 1
jeray 	 1
jarphys 	 1
jaroslavp 	 1
jande63 	 1
jamesjantw 	 1
jake99340 	 1
isra26 	 1
isabelledechabannes 	 1
irishfury 	 1
insta 	 1
ic5152 	 1
iakey 	 1
i find secrets 	 1
hummel 	 1
hufanean 	 1
harshy 	 1
harlock1970 	 1
haleyh2211 	 1
hacketju 	 1
gthethi@uwo.ca 	 1
gregwolf824  	 1
got_milk 	 1
glupia 	 1
ghostBiologist 	 1
galaxies00 	 1
fratchergirl 	 1
fractalvibes 	 1
forrana 	 1
fermat24 	 1
fenrisulf75 	 1
fengist 	 1
fajarbudhi17 	 1
emil169 	 1
elre233 	 1
ellu87 	 1
eleri5 	 1
dominikabulat 	 1
d.gonsior 	 1
curtevans 	 1
csulli27@uwo.ca 	 1
cplummerabb 	 1
cosminman 	 1
collaudo 	 1
clinton1 	 1
cchoi73 	 1
calfaroc 	 1
bschuckenbrock 	 1
breezyplatypus 	 1
bibliophile4 	 1
bgamble7 	 1
bfrink 	 1
bevalorous 	 1
bdodd@uwo.ca 	 1
bc2callhome 	 1
baskus64 	 1
b0409d 	 1
aussieneptune 	 1
auld 	 1
atayl82@uwo.ca 	 1
astroscientist42 	 1
astrophysicist2215 	 1
aransom@uwo.ca 	 1
apeng5@uwo.ca 	 1
aomiotek 	 1
annals 	 1
andreeags 	 1
andiparis 	 1
ambsnyder 	 1
alucardvladis 	 1
alan1001 	 1
akocher2112 	 1
ajohn343@uwo.ca 	 1
airbears 	 1
agnanase 	 1
aglista 	 1
afinkbe@uwo.ca 	 1
adi dinu 	 1
aclev 	 1
ZachWilliams 	 1
YakupTopal 	 1
Whoandwhatitis 	 1
TomSun 	 1
Thomas Toml 	 1
TheStentley 	 1
TheLegend55 	 1
Taylanokan 	 1
Stof52 	 1
Steven127 	 1
Stefandist 	 1
St0mper 	 1
Spotter55 	 1
Sophie37 	 1
Somau5 	 1
Sniwa 	 1
Sizow 	 1
Silver Cat 	 1
SIRTFkid 	 1
Rogerht 	 1
RocketPoweredTennisBall 	 1
Robillard 	 1
RipperSB 	 1
ROE_OpenDays 	 1
Peter Dzwig 	 1
Paymygasbill77 	 1
Paulus1962 	 1
PILIRANO 	 1
P.Schutte 	 1
OooortCloud 	 1
OmK 	 1
ObrienDave 	 1
Nrodrigo777 	 1
Noctiferix 	 1
Nik05 	 1
NicholasR 	 1
Neo99 	 1
Nemesis38s 	 1
Neha Palekar 	 1
NGXalpha 	 1
NGC48100 	 1
NASAFinder 	 1
MrMWallace 	 1
Montag46 	 1
Milenn 	 1
MikeWain 	 1
Michael Doll 	 1
Mel751 	 1
MarieTran 	 1
MJSuperstar 	 1
MD27 	 1
Liz217 	 1
LauraGuenthner 	 1
LEpauli 	 1
KhalilaRedBird 	 1
Jonathanmoreno 	 1
Johnthewig 	 1
Johnarmysf  	 1
Joanne Kennedy 	 1
Jahei09 	 1
JT03_X 	 1
JRonca 	 1
JMuller 	 1
Isotronic 	 1
IsaacGuasp 	 1
ILikeYourGenes 	 1
HeatherAH 	 1
Gl0rphoid 	 1
Gigapants 	 1
GalacticSKL 	 1
FromKepler186f 	 1
Freebird714 	 1
Felix93300 	 1
Elaine  	 1
Ekukhta 	 1
Edwige_Nlassa 	 1
Ectoplasm 	 1
Dugg44 	 1
DrewsK 	 1
DoveLove 	 1
DeepPastry 	 1
DeathDealer315 	 1
DaynanCrull 	 1
DavidCurren 	 1
DarkStillWater 	 1
DanFrench2012 	 1
Cybrspin 	 1
Courtneym128 	 1
Corcaroli 	 1
ConnorPickett 	 1
Colloid 	 1
Cicero84 	 1
ChunkPlanet 	 1
ChrisRogue 	 1
Chibbs 	 1
Chappers34 	 1
Cferreri 	 1
Ccolvin968 	 1
CavassuarY 	 1
CaptAirhead 	 1
Bryanlynch 	 1
BrokenFibula 	 1
Blonglor 	 1
Bioman94 	 1
Binsh 	 1
Barbalbero 	 1
Bally34 	 1
Baccha 	 1
BOSSNESS 	 1
Ashar83 	 1
Arthemis 	 1
Anindita.Nandy 	 1
Andrecompadre 	 1
Anderson_Blake 	 1
Algar 	 1
Alecs92 	 1
Aaron101 	 1
12walch 	 1
 Aleks 	 1
ztzanete@uwo.ca 	 0
winniechan7 	 0
wguan6 	 0
vbkk3000 	 0
spittma2@uwo.ca 	 0
snaki 	 0
reese.amber 	 0
pbroseme@uwo.ca 	 0
pastafarian14 	 0
ogrocks12 	 0
niedzwiedzlebiedz 	 0
nbrown89@uwo.ca 	 0
naumenko.pavlik65 	 0
mraitt 	 0
mobile 	 0
miertje 	 0
lavalob@uwo.ca 	 0
jrutsobe 	 0
jlalani2 	 0
jfriend4@uwo.ca 	 0
jfoxton@uwo.ca 	 0
jbarr29@uwo.ca 	 0
jamethon 	 0
ilham.mammadov 	 0
gkim233@uwo.ca 	 0
fmacgill 	 0
flucia@uwo.ca 	 0
firecatstef 	 0
emhcul 	 0
deedee123 	 0
dcampb93@uwo.ca 	 0
cxie26@uwo.ca 	 0
crobi22 	 0
cptak4@uwo.ca 	 0
cosler@uwo.ca 	 0
bpearson 	 0
bluesonthru 	 0
betelgeuse1960 	 0
bert willard 	 0
awong575@uwo.ca 	 0
anucum 	 0
anthriscus 	 0
acrawf29@uwo.ca 	 0
WedoOx 	 0
Valdez Johan 	 0
VBLANC 	 0
MistyArt 	 0
Maxsam 	 0
MasonHaynes 	 0
Joe Darcy 	 0
Fezman92 	 0
Erik Israel 	 0
Darkskinmale 	 0
AtomixSpace 	 0
Astroboy98 	 0
Alisonglockzin 	 0

Running the algorithm on the top 10 annotators

Let's throw in just the top 10 annotators #126 and see how it goes. First, I'll upsample the positive examples. I'll count a "negative" example as anything that doesn't have any positive classifications.


In [33]:
top_labels = labels[top_10]

positive_bool = numpy.any(top_labels, axis=0)
positives = numpy.arange(top_labels.shape[1])[positive_bool]
non_positives = numpy.arange(top_labels.shape[1])[~positive_bool]

while positives.shape[0] < non_positives.shape[0]:
    new_positives = positives[:]
    numpy.random.shuffle(new_positives)
    positives = numpy.concatenate([positives, new_positives])

positives = positives[:non_positives.shape[0]]

upsampled = numpy.concatenate([positives, non_positives])
upsampled.sort()

In [34]:
upsampled_train, upsampled_test = sklearn.cross_validation.train_test_split(downsampled)
upsampled_train.sort()
upsampled_test.sort()

Now I can run the algorithm.


In [35]:
print(upsampled_train.shape, upsampled_test.shape)

with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features'][upsampled_train, :]
    res = passive_crowd.train(x, top_labels.astype(bool)[:, upsampled_train], lr_init=True)


(5229,) (1743,)
K:\Languages\Python35_64\lib\site-packages\numpy\ma\core.py:4139: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

In [40]:
import sklearn.metrics

with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    pred = passive_crowd.predict(res[0], res[1], x[upsampled_test, :])
    cm = sklearn.metrics.confusion_matrix(norris_labels[upsampled_test], pred)
    
    tp = cm[1, 1]
    n, p = cm.sum(axis=1)
    tn = cm[0, 0]

    ba = (tp / p + tn / n) / 2
    print(ba)
    print(cm)


0.640485951405
[[1518   88]
 [  91   46]]

In [39]:
import seaborn, matplotlib.pyplot as plt
%matplotlib inline

with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    pred = passive_crowd.logistic_regression(res[0], res[1], x[upsampled_test, :])
    pos_pred = pred[norris_labels[upsampled_test] == 1]
    neg_pred = pred[norris_labels[upsampled_test] == 0]
    assert pos_pred.shape[0] + neg_pred.shape[0] == pred.shape[0]
    plt.figure(figsize=(10, 5))
    seaborn.distplot(pos_pred, rug=True, hist=False, color='green', rug_kws={'alpha': 0.1})
    seaborn.distplot(neg_pred, rug=True, hist=False, color='red', rug_kws={'alpha': 0.1})
    plt.xlim((0, 1))
    plt.show()


Running the algorithm on simulated labellers, no noise

I'll use the Norris labels to generate true labels for a fully observed crowd labelling scenario.


In [41]:
simulated_norris_labels = numpy.ma.MaskedArray(numpy.tile(norris_labels, (2, 1)), False)
#                                                mask=numpy.random.binomial(1, 0.5, size=(5, 24140)))

In [42]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features'][upsampled_train, :]
    res = passive_crowd.train(x, simulated_norris_labels.astype(bool)[:, upsampled_train], lr_init=True)

In [43]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    pred = passive_crowd.predict(res[0], res[1], x[upsampled_test, :])
    cm = sklearn.metrics.confusion_matrix(norris_labels[upsampled_test], pred)
    
    tp = cm[1, 1]
    n, p = cm.sum(axis=1)
    tn = cm[0, 0]

    ba = (tp / p + tn / n) / 2
    print(ba)
    print(cm)


0.250440865004
[[  19 1587]
 [  70   67]]

In [44]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    score = res[0].dot(x[upsampled_test, :].T) + res[1]
    pos_score = score[norris_labels[upsampled_test] == 1]
    neg_score = score[norris_labels[upsampled_test] == 0]
    assert pos_score.shape[0] + neg_score.shape[0] == score.shape[0]
    plt.figure(figsize=(10, 5))
    seaborn.distplot(pos_score, rug=True, hist=False, color='green', rug_kws={'alpha': 0.1})
    seaborn.distplot(neg_score, rug=True, hist=False, color='red', rug_kws={'alpha': 0.1})
    plt.show()



In [45]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    score = res[0].dot(x[upsampled_train, :].T) + res[1]
    pos_score = score[norris_labels[upsampled_train] == 1]
    neg_score = score[norris_labels[upsampled_train] == 0]
    assert pos_score.shape[0] + neg_score.shape[0] == score.shape[0]
    plt.figure(figsize=(10, 5))
    seaborn.distplot(pos_score, rug=True, hist=False, color='green', rug_kws={'alpha': 0.1})
    seaborn.distplot(neg_score, rug=True, hist=False, color='red', rug_kws={'alpha': 0.1})
    plt.xlim((-10, 25))
    plt.show()


Norris Baseline

Just a quick comparison with the Norris labels, fully observed and one annotator.


In [26]:
import sklearn.linear_model

lr = sklearn.linear_model.LogisticRegression(C=100)
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    lr.fit(x[downsampled_train, :], norris_labels[downsampled_train])

In [27]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    pred = lr.predict(x[downsampled_test, :])
    cm = sklearn.metrics.confusion_matrix(norris_labels[downsampled_test], pred)
    
    tp = cm[1, 1]
    n, p = cm.sum(axis=1)
    tn = cm[0, 0]

    ba = (tp / p + tn / n) / 2
    print(ba)
    print(cm)


0.698217910746
[[1580   12]
 [  90   61]]

In [28]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    score = lr.decision_function(x[downsampled_test, :])
    pos_score = score[norris_labels[downsampled_test] == 1]
    neg_score = score[norris_labels[downsampled_test] == 0]
    assert pos_score.shape[0] + neg_score.shape[0] == score.shape[0]
    plt.figure(figsize=(10, 5))
    seaborn.distplot(pos_score, rug=True, hist=False, color='green', rug_kws={'alpha': 0.1})
    seaborn.distplot(neg_score, rug=True, hist=False, color='red', rug_kws={'alpha': 0.1})
    plt.show()



In [29]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    score = lr.decision_function(x[downsampled_train, :])
    pos_score = score[norris_labels[downsampled_train] == 1]
    neg_score = score[norris_labels[downsampled_train] == 0]
    assert pos_score.shape[0] + neg_score.shape[0] == score.shape[0]
    plt.figure(figsize=(10, 5))
    seaborn.distplot(pos_score, rug=True, hist=False, color='green', rug_kws={'alpha': 0.1})
    seaborn.distplot(neg_score, rug=True, hist=False, color='red', rug_kws={'alpha': 0.1})
    plt.show()


Top 10 annotators with scalar $\eta_t$


In [88]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features'][downsampled_train, :]
    res = active_crowd_scalar.train(x, top_labels.astype(bool)[:, downsampled_train], lr_init=True)


K:\Languages\Python35_64\lib\site-packages\numpy\ma\core.py:4139: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

In [89]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    pred = passive_crowd.predict(res[0], res[1], x[downsampled_test, :])
    cm = sklearn.metrics.confusion_matrix(norris_labels[downsampled_test], pred)
    
    tp = cm[1, 1]
    n, p = cm.sum(axis=1)
    tn = cm[0, 0]

    ba = (tp / p + tn / n) / 2
    print(ba)
    print(cm)


0.718941748346
[[1440  162]
 [  65   76]]

In [98]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    score = res[0].dot(x[downsampled_test, :].T) + res[1]
    pos_score = score[norris_labels[downsampled_test] == 1]
    neg_score = score[norris_labels[downsampled_test] == 0]
    assert pos_score.shape[0] + neg_score.shape[0] == score.shape[0]
    plt.figure(figsize=(10, 5))
    seaborn.distplot(pos_score, rug=True, hist=False, color='green', rug_kws={'alpha': 0.1})
    seaborn.distplot(neg_score, rug=True, hist=False, color='red', rug_kws={'alpha': 0.1})
    plt.xlim((-20, 20))
    plt.show()



In [97]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    score = res[0].dot(x[downsampled_train, :].T) + res[1]
    pos_score = score[norris_labels[downsampled_train] == 1]
    neg_score = score[norris_labels[downsampled_train] == 0]
    assert pos_score.shape[0] + neg_score.shape[0] == score.shape[0]
    plt.figure(figsize=(10, 5))
    seaborn.distplot(pos_score, rug=True, hist=False, color='green', rug_kws={'alpha': 0.1})
    seaborn.distplot(neg_score, rug=True, hist=False, color='red', rug_kws={'alpha': 0.1})
    plt.xlim((-20, 20))
    plt.show()



In [92]:
res[2]


Out[92]:
array([ 0.13664592,  0.37203094,  0.2565089 ,  0.74780762,  0.02890832,
        0.81651655,  0.12864316,  0.02816478,  0.16995135,  0.3093587 ])

Clustering annotators

First, let's make a covariance matrix.


In [17]:
cov = numpy.ma.cov(labels)
print(cov, cov.shape)


[[0.06666666666666668 0.06666666666666667 -- ..., -- -- --]
 [0.06666666666666667 0.05500511951244607 0.03833333333333335 ..., --
  -9.912705577010326e-19 --]
 [-- 0.03833333333333335 0.07666666666666666 ..., -- -- --]
 ..., 
 [-- -- -- ..., 0.06495589414595031 -- --]
 [-- -9.912705577010326e-19 -- ..., -- 0.125 --]
 [-- -- -- ..., -- -- 0.06666666666666668]] (1193, 1193)

In [18]:
plt.imshow(cov, interpolation='None')


Out[18]:
<matplotlib.image.AxesImage at 0x24180002f60>

As expected, lots of unknowns. We'll press on nevertheless!


In [34]:
import sklearn.cluster, collections

In [29]:
kmc = sklearn.cluster.KMeans(5)
kmc.fit(cov)


Out[29]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=5, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [33]:
clusters = kmc.predict(cov)

Now, we'll do a majority vote over these clusters.


In [37]:
cluster_labels = numpy.ma.MaskedArray(numpy.zeros((5, labels.shape[1])), mask=numpy.zeros((5, labels.shape[1])))
for c in range(5):
    for i in range(labels.shape[1]):
        this_cluster_labels = labels[clusters == c, i]
        # Compute the majority vote.
        counter = collections.Counter(this_cluster_labels[~this_cluster_labels.mask])
        if counter:
            cluster_labels[c, i] = max(counter, key=counter.get)
        else:
            cluster_labels.mask[c, i] = True

Now let's try a basic logistic regression on each of them.


In [59]:
def balanced_accuracy(y_true, y_pred):
    try:
        cm = sklearn.metrics.confusion_matrix(y_true[~y_pred.mask], y_pred[~y_pred.mask])
    except AttributeError:
        cm = sklearn.metrics.confusion_matrix(y_true, y_pred)
    
    tp = cm[1, 1]
    n, p = cm.sum(axis=1)
    tn = cm[0, 0]

    ba = (tp / p + tn / n) / 2
    return ba

In [52]:
i_tr, i_te = sklearn.cross_validation.train_test_split(numpy.arange(labels.shape[1]))

with h5py.File(TRAINING_H5_PATH) as f_h5:
    features = f_h5['features'].value
    for c in range(5):
        labels_ = cluster_labels[c]
        lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
        lr.fit(features[i_tr], labels_[i_tr])
        print(balanced_accuracy(norris_labels[i_te], lr.predict(features[i_te])))


0.727235774647
0.715165678675
0.754294782614
0.723188656146
0.5

Next, we'll put it into the scalar $\eta_t$ implementation.


In [57]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features'][downsampled_train, :]
    res = active_crowd_scalar.train(x, cluster_labels.astype(bool)[:, downsampled_train], lr_init=True)


K:\Languages\Python35_64\lib\site-packages\numpy\ma\core.py:4139: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")
K:\Languages\Python35_64\lib\site-packages\numpy\ma\core.py:827: RuntimeWarning: invalid value encountered in less_equal
  return umath.less_equal(x, self.critical_value)

In [60]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    pred = passive_crowd.predict(res[0], res[1], x[downsampled_test, :])
    cm = sklearn.metrics.confusion_matrix(norris_labels[downsampled_test], pred)
    ba = balanced_accuracy(norris_labels[downsampled_test], pred)
    print(cm)
    print(ba)


[[1568   21]
 [ 121   33]]
0.600534927627

In [63]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    score = res[0].dot(x[downsampled_test, :].T) + res[1]
    pos_score = score[norris_labels[downsampled_test] == 1]
    neg_score = score[norris_labels[downsampled_test] == 0]
    assert pos_score.shape[0] + neg_score.shape[0] == score.shape[0]
    plt.figure(figsize=(10, 5))
    seaborn.distplot(pos_score, rug=True, hist=False, color='green', rug_kws={'alpha': 0.1})
    seaborn.distplot(neg_score, rug=True, hist=False, color='red', rug_kws={'alpha': 0.1})
    plt.xlim((-20, 20))
    plt.show()


Now let's try the full algorithm.


In [67]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features'][downsampled_train, :]
    res = active_crowd.train(x, cluster_labels.astype(bool)[:, downsampled_train], lr_init=True)


K:\Languages\Python35_64\lib\site-packages\numpy\ma\core.py:4139: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

In [68]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    pred = passive_crowd.predict(res[0], res[1], x[downsampled_test, :])
    cm = sklearn.metrics.confusion_matrix(norris_labels[downsampled_test], pred)
    ba = balanced_accuracy(norris_labels[downsampled_test], pred)
    print(cm)
    print(ba)


[[   0 1589]
 [   1  153]]
0.496753246753

In [78]:
plt.plot(res[2].T)
plt.xscale('log')
plt.legend(range(5))


Out[78]:
<matplotlib.legend.Legend at 0x2419867d4a8>

In [79]:
print(res[3])


[ -206.06684877  -995.36668545 -1651.16336611 -1076.5866288    -48.6273634 ]

PCA on $X$?


In [94]:
import sklearn.decomposition

pca = sklearn.decomposition.PCA(n_components=10)

In [95]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features'][downsampled_train, :]
    pca.fit(x)

In [96]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features'][downsampled_train, :]
    res = active_crowd.train(pca.transform(x), labels.astype(bool)[:, downsampled_train], lr_init=True)

In [108]:
seaborn.distplot(res[3])


Out[108]:
<matplotlib.axes._subplots.AxesSubplot at 0x24198158ba8>

In [109]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    pred = passive_crowd.predict(res[0], res[1], pca.transform(x[downsampled_test, :]))
    cm = sklearn.metrics.confusion_matrix(norris_labels[downsampled_test], pred)
    ba = balanced_accuracy(norris_labels[downsampled_test], pred)
    print(cm)
    print(ba)


[[1396  193]
 [  47  107]]
0.786672578523

In [112]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    score = res[0].dot(pca.transform(x[downsampled_test, :]).T) + res[1]
    pos_score = score[norris_labels[downsampled_test] == 1]
    neg_score = score[norris_labels[downsampled_test] == 0]
    assert pos_score.shape[0] + neg_score.shape[0] == score.shape[0]
    plt.figure(figsize=(10, 5))
    seaborn.distplot(pos_score, rug=True, hist=False, color='green', rug_kws={'alpha': 0.1})
    seaborn.distplot(neg_score, rug=True, hist=False, color='red', rug_kws={'alpha': 0.1})
    plt.xlim((-100, 100))
    plt.show()



In [ ]: