Similarity engine trainer

Trains a gensim Word2Vec model on a text corpus, then sweeps the
`min_count` vocabulary threshold and measures how often pairwise
word/document similarities come out negative.


In [7]:
import gensim
import codecs
class Corpus_Iterator(object):
    """Re-iterable, streaming corpus reader for gensim.

    Each iteration re-opens *filename* and yields one whitespace-tokenised
    sentence (a list of unicode strings) per line, so the whole corpus is
    never held in memory and gensim can make multiple training passes.
    """

    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        # Open inside __iter__ so every pass restarts from the top of the
        # file, and use a with-block so the handle is closed deterministically
        # instead of leaking until garbage collection (the original never
        # closed it explicitly).
        with codecs.open(self.filename, 'r', encoding='utf8') as lines:
            for line in lines:
                yield line.split()
    
# NOTE(review): defined but never read — the later cells hard-code
# 'corpus.txt' again instead of using this variable.
corpus_file = 'corpus.txt'

In [64]:
# Train the similarity model: stream sentences from corpus.txt and fit a
# 100-dimensional Word2Vec, keeping every word that occurs (min_count=1),
# then persist it under the name 'SimEngine'.
sentences = Corpus_Iterator('corpus.txt')
model = gensim.models.Word2Vec(sentences, min_count=1, size=100)
model.save('SimEngine')

In [66]:
import gensim
import pymongo
from pymongo import MongoClient
import time

# Connection settings for a local MongoDB holding one document per author;
# each document carries a whitespace-separated 'corpus' text field
# (see the rec['corpus'].split() accesses in the loop below).
mongo_url = 'mongodb://localhost:27017/'
db = 'CamSim'      # database name
coll = 'CamAuthors'  # collection name
client = MongoClient(mongo_url)
ca = client[db][coll]  # handle to CamSim.CamAuthors
# Experiment: sweep Word2Vec's min_count vocabulary threshold and, for each
# setting, count how often pairwise word-word similarities come out negative
# when comparing the first author's corpus against the first 1000 authors.
for count_size in [0,1,2,3,4,5,6,7,8,9,10]:
    print("*"*50 + ' '+ str(count_size))
    # Retrain from scratch at this min_count; words rarer than count_size
    # are dropped from the vocabulary.
    corp = Corpus_Iterator('corpus.txt')
    model = gensim.models.Word2Vec(corp,min_count=count_size,size=100)
    model.save('SimEngine')

    ind = 0  # NOTE(review): never used after this assignment
    cur1=ca.find()
    cur2=ca.find()
    ind1=0  # outer-record counter (only one record is taken below)
    ind2=0  # inner-record counter
    for rec1 in cur1[0:1]:
        corp1 = rec1['corpus'].split()
        big_negs = 0    # similarities below -0.25
        negs = 0        # similarities below 0
        singletons =0   # lookups that raised (word pruned from vocab)
        multiplons = 0  # successful similarity computations
        for rec2 in cur2[:1000]:
            corp2 = rec2['corpus'].split()
            # All word pairs between the two corpora.
            for i in corp1:
                for j in corp2:
                    try:
                        s = model.similarity(i,j)
                        multiplons+=1
                        if (s<0.): negs+=1
                        if (s<-0.25): big_negs+=1
                    except:
                        # presumably a KeyError for an out-of-vocabulary
                        # word, but the bare except counts *any* failure
                        # here — TODO confirm / narrow
                        singletons+=1
            ind2+=1
                #s = model.n_similarity(corp1,corp2)
        print(str(negs)+' negatives '+str(ind1)+' '+str(ind2))
        print(str(singletons)+' singletons ')
        print(str(multiplons)+' multiplons')
        # NOTE(review): divides by negs — raises ZeroDivisionError if no
        # negative similarity was seen at this min_count.
        print(str(big_negs)+' large negatives, percentage: '+str(100*float(big_negs)/negs))
        # NOTE(review): negs is already a subset of multiplons, so this
        # denominator double-counts the negatives — verify the intent.
        print(str(100*float(negs)/(negs+singletons+multiplons))+' percent negative')
        ind1+=1


************************************************** 0
78427 negatives 0 1000
0 singletons 
163631 multiplons
38894 large negatives, percentage: 49.5926147883
32.4000859298 percent negative
************************************************** 1
78427 negatives 0 1000
0 singletons 
163631 multiplons
38894 large negatives, percentage: 49.5926147883
32.4000859298 percent negative
************************************************** 2
45186 negatives 0 1000
47341 singletons 
116290 multiplons
25841 large negatives, percentage: 57.1880671004
21.6390427982 percent negative
************************************************** 3
18369 negatives 0 1000
86477 singletons 
77154 multiplons
8870 large negatives, percentage: 48.2878763134
10.0928571429 percent negative
************************************************** 4
20130 negatives 0 1000
90005 singletons 
73626 multiplons
9446 large negatives, percentage: 46.9249875807
10.9544462644 percent negative
************************************************** 5
20776 negatives 0 1000
92357 singletons 
71274 multiplons
9378 large negatives, percentage: 45.1386214863
11.2663835972 percent negative
************************************************** 6
18007 negatives 0 1000
94520 singletons 
69111 multiplons
9445 large negatives, percentage: 52.4518242906
9.91367445138 percent negative
************************************************** 7
19526 negatives 0 1000
96865 singletons 
66766 multiplons
9734 large negatives, percentage: 49.8514800778
10.6607992051 percent negative
************************************************** 8
17887 negatives 0 1000
98384 singletons 
65247 multiplons
8582 large negatives, percentage: 47.9789791469
9.85411915072 percent negative
************************************************** 9
15198 negatives 0 1000
108521 singletons 
55110 multiplons
6601 large negatives, percentage: 43.433346493
8.49862158822 percent negative
************************************************** 10
13387 negatives 0 1000
109829 singletons 
53802 multiplons
6145 large negatives, percentage: 45.9027414656
7.56250776757 percent negative

In [80]:
def pcorp(li):
    """Filter *li* down to the words present in the trained model's vocabulary.

    Relies on the module-level ``model`` (a trained gensim Word2Vec).
    ``model[w]`` raises ``KeyError`` for out-of-vocabulary words; only words
    that have a vector are kept, preserving order and duplicates.

    :param li: iterable of word strings
    :return: list of the in-vocabulary words from *li*
    """
    kept = []
    for w in li:
        try:
            model[w]  # lookup only — raises KeyError when w has no vector
            kept.append(w)
        # Narrowed from a bare ``except``, which silently swallowed *every*
        # error (NameError, interrupts, ...) rather than just OOV lookups.
        except KeyError:
            pass
    return kept

In [99]:
import gensim
import pymongo
from pymongo import MongoClient
import time

# Re-establish the MongoDB handle (duplicate of the earlier cell) for the
# second experiment; documents again expose a 'corpus' text field.
mongo_url = 'mongodb://localhost:27017/'
db = 'CamSim'      # database name
coll = 'CamAuthors'  # collection name
client = MongoClient(mongo_url)
ca = client[db][coll]  # handle to CamSim.CamAuthors
# Experiment 2: same min_count sweep over a wider range, but now scoring
# whole author-vs-author pairs two ways — an averaged pairwise similarity
# ("p_*") and gensim's n_similarity ("n_*") — after pre-filtering each
# corpus to in-vocabulary words with pcorp().
for count_size in [0,1,2,3,4,5,6,7,8,9,10,15,20,30,40,50,60,70,80,100]:
    print("*"*50 + ' '+ str(count_size))
    corp = Corpus_Iterator('corpus.txt')
    model = gensim.models.Word2Vec(corp,min_count=count_size,size=100)
    model.save('SimEngine')

    ind = 0  # NOTE(review): never used after this assignment
    cur1=ca.find()
    cur2=ca.find()
    ind1=0  # NOTE(review): never incremented/printed in this version
    ind2=0  # inner-record counter
    for rec1 in cur1[0:1]:
        corp1 = rec1['corpus'].split()
        n_big_negs = 0  # n_similarity scores below -0.25
        n_negs = 0      # n_similarity scores below 0
        p_big_negs = 0  # pair_ave scores below -0.25
        p_negs = 0      # pair_ave scores below 0
        fails = 0       # author pairs where scoring raised
        pcorp1 = pcorp(corp1)  # keep only in-vocabulary words
        for rec2 in cur2[:1000]:
            corp2 = rec2['corpus'].split()
            pcorp2 = pcorp(corp2)
            cum = 0
            try:
                # Sum similarity over every word pair (len(pcorp1)*len(pcorp2) terms).
                for i in pcorp1:
                    for j in pcorp2:
                        cum+=model.similarity(i,j)
                # NOTE(review): the sum has len1*len2 terms but is divided by
                # len1+len2, so this is not a true mean — confirm the intent.
                # Also ZeroDivisionError when both filtered corpora are empty
                # (swallowed below and counted as a fail).
                pair_ave = cum/(len(pcorp1)+len(pcorp2))
                if (pair_ave<0.): p_negs+=1
                if (pair_ave<-0.25): p_big_negs+=1
                nsim_score = model.n_similarity(pcorp1,pcorp2)
                if (nsim_score<0.): n_negs+=1
                if (nsim_score<-0.25): n_big_negs+=1
            except:
                # bare except: any failure (empty corpus, OOV, division by
                # zero) is lumped into this single counter — TODO narrow
                fails+=1
            ind2+=1
        print('n_negatives ' + str(n_negs))
        print('n_big negatives '+ str(n_big_negs))
        print('p_negatives ' + str(p_negs))
        print('p_big negatives '+ str(p_big_negs))
        print('fails '+ str(fails))


************************************************** 0
n_negatives 158
n_big negatives 81
p_negatives 127
p_big negatives 20
fails 0
************************************************** 1
n_negatives 158
n_big negatives 81
p_negatives 127
p_big negatives 20
fails 0
************************************************** 2
n_negatives 162
n_big negatives 87
p_negatives 78
p_big negatives 27
fails 0
************************************************** 3
n_negatives 158
n_big negatives 65
p_negatives 63
p_big negatives 25
fails 0
************************************************** 4
n_negatives 164
n_big negatives 63
p_negatives 69
p_big negatives 32
fails 1
************************************************** 5
n_negatives 181
n_big negatives 64
p_negatives 59
p_big negatives 23
fails 1
************************************************** 6
n_negatives 173
n_big negatives 70
p_negatives 58
p_big negatives 27
fails 1
************************************************** 7
n_negatives 206
n_big negatives 102
p_negatives 94
p_big negatives 45
fails 1
************************************************** 8
n_negatives 185
n_big negatives 84
p_negatives 64
p_big negatives 32
fails 1
************************************************** 9
n_negatives 182
n_big negatives 72
p_negatives 65
p_big negatives 29
fails 4
************************************************** 10
n_negatives 204
n_big negatives 80
p_negatives 67
p_big negatives 35
fails 4
************************************************** 15
n_negatives 285
n_big negatives 188
p_negatives 220
p_big negatives 77
fails 6
************************************************** 20
n_negatives 212
n_big negatives 94
p_negatives 57
p_big negatives 10
fails 9
************************************************** 30
n_negatives 278
n_big negatives 154
p_negatives 246
p_big negatives 28
fails 14
************************************************** 40
n_negatives 231
n_big negatives 117
p_negatives 175
p_big negatives 23
fails 18
************************************************** 50
n_negatives 232
n_big negatives 112
p_negatives 168
p_big negatives 12
fails 29
************************************************** 60
n_negatives 230
n_big negatives 103
p_negatives 168
p_big negatives 22
fails 54
************************************************** 70
n_negatives 405
n_big negatives 215
p_negatives 383
p_big negatives 39
fails 58
************************************************** 80
n_negatives 469
n_big negatives 203
p_negatives 440
p_big negatives 39
fails 65
************************************************** 100
n_negatives 269
n_big negatives 141
p_negatives 222
p_big negatives 26
fails 76

In [ ]: