In [1]:
import data
import topicmodeling as tp
import gensim as g


Using gpu device 0: GeForce GTX 970 (CNMeM is disabled, cuDNN not available)

In [119]:
l_interested = list(set(data.get_entity_list(data.libinfo.interested_libs)))
print(len(l_interested))
#l_code = list(map(data.get_code, l_interested))
l_doc = list(map(data.get_doc, l_interested))
l_doc = list(map(lambda doc: data.pdoc.extract(doc, stage=data.pdoc.ADVANCED), l_doc))


3979
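
Note: `data` and `data.pdoc` are project-local modules, so their exact contract is an assumption here, inferred from later cells (`l_interested[i].__name__`, `inspect.getmodule(l_interested[i])`): `l_interested` holds 3,979 callables drawn from the libraries of interest, and `l_doc` their extracted docstrings. A hypothetical illustration of those shapes:

    import scipy.cluster.hierarchy
    entity = scipy.cluster.hierarchy.linkage   # l_interested holds objects like this
    doc = entity.__doc__ or ''                 # l_doc holds processed docstrings like this
    print(entity.__name__, '|', doc.strip().splitlines()[0])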

In [120]:
documents = l_doc
texts = tp.simple_process(documents=documents, stoplist=tp.read_stoplist("../SmartStoplist.txt"))
texts = tp.remove_infrequent(texts, n_times=1)
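
Note: `tp.simple_process` and `tp.remove_infrequent` are project helpers, not gensim API. A minimal stand-in, assuming they lowercase and tokenize, drop stoplist words, and discard tokens seen at most `n_times` across the corpus:

    from collections import Counter

    def simple_process(documents, stoplist):
        # assumed behaviour: lowercase, whitespace-tokenize, drop stopwords
        return [[w for w in doc.lower().split() if w not in stoplist]
                for doc in documents]

    def remove_infrequent(texts, n_times=1):
        # assumed behaviour: drop tokens occurring n_times or fewer corpus-wide
        freq = Counter(w for text in texts for w in text)
        return [[w for w in text if freq[w] > n_times] for text in texts]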

In [121]:
id2word = g.corpora.Dictionary(texts)
#id2word.save('/tmp/deerwester.dict') # store the dictionary for future reference
print(*list(id2word)[:10])


734 157 1094 316 401 651 51 1056 782 301
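
The printed values are opaque integer token ids. The tokens behind them come from the dictionary's reverse mapping (standard gensim `Dictionary` API):

    print(*(id2word[i] for i in list(id2word)[:10]))   # tokens behind the first 10 ids
    # optional pruning before building the corpus, e.g.:
    # id2word.filter_extremes(no_below=5, no_above=0.5)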

In [122]:
corpus = [id2word.doc2bow(text) for text in texts]
#g.corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus) # store to disk, for later use
print(*list(corpus)[:10])
tfidf = g.models.tfidfmodel.TfidfModel(corpus)
corpus = [tfidf[bag] for bag in corpus]
print(*list(corpus)[:10])


[(0, 1), (1, 1), (2, 1), (3, 1)] [(4, 1), (5, 1), (6, 1)] [(7, 1), (8, 1), (9, 1)] [(10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1)] [(17, 1), (18, 1), (19, 1), (20, 1)] [(21, 1), (22, 1), (23, 1)] [(24, 1), (25, 1), (26, 1), (27, 1), (28, 1)] [(2, 1), (29, 1), (30, 1), (31, 1), (32, 1)] [(2, 1), (3, 1)] [(7, 1), (33, 1), (34, 1), (35, 1)]
[(0, 0.7628244667984156), (1, 0.493458034466283), (2, 0.21387308022274398), (3, 0.35896560647285025)] [(4, 0.6812763365292757), (5, 0.5535053508848635), (6, 0.4790557168295493)] [(7, 0.37854881721187744), (8, 0.7318543350988053), (9, 0.5666480611319289)] [(10, 0.45884116429793026), (11, 0.2767154094081887), (12, 0.43717178069978774), (13, 0.2961800287792112), (14, 0.4109359040475713), (15, 0.3211391957713851), (16, 0.40255793689581393)] [(17, 0.5075897644187175), (18, 0.2754831511310989), (19, 0.4756394287066911), (20, 0.663497398909599)] [(21, 0.6011099968417766), (22, 0.4852135728524555), (23, 0.6350075278424929)] [(24, 0.6387338917345905), (25, 0.20564924505688248), (26, 0.347054140605519), (27, 0.4042951325649071), (28, 0.5155834295533627)] [(2, 0.21764436588670363), (29, 0.5207128600086892), (30, 0.4583839310919786), (31, 0.5611660382718072), (32, 0.39555770328276346)] [(2, 0.5118426727982279), (3, 0.8590792037424524)] [(7, 0.3396474512936074), (33, 0.4861926004470257), (34, 0.6354176929006794), (35, 0.4944701403008004)]
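
The second printout shows each bag-of-words vector reweighted by tf-idf; with gensim's defaults the vectors are L2-normalized, which the first document's weights confirm:

    import math
    doc0 = [(0, 0.7628244667984156), (1, 0.493458034466283),
            (2, 0.21387308022274398), (3, 0.35896560647285025)]
    print(math.sqrt(sum(w * w for _, w in doc0)))   # ~1.0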

In [ ]:
# load the id->word mapping (id2word) saved earlier
#id2word = g.corpora.Dictionary.load_from_text('wiki_en_wordids.txt')
# load corpus iterator
#corpus = g.corpora.MmCorpus('/tmp/deerwester.mm')
#corpus = g.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TF-IDF output (recommended)
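
For completeness, a consistent save/load round trip matching the commented-out lines in In [121] and In [122] (standard gensim API; paths illustrative):

    id2word.save('/tmp/deerwester.dict')
    g.corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)
    # ...and later:
    id2word = g.corpora.Dictionary.load('/tmp/deerwester.dict')
    corpus = g.corpora.MmCorpus('/tmp/deerwester.mm')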

In [123]:
n_topics = 40
## extract n_topics (40) LDA topics over 5 passes, updating once per chunk of 10,000 documents
lda = g.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=n_topics, 
                                 update_every=1, chunksize=10000, passes=5)
## print the most contributing words for each of the n_topics topics
l = list(lda.print_topics(n_topics))
for i, s in l:
    print(i, *s.split(' + '))


0 0.051*seri 0.051*chebyshev 0.042*mask 0.039*multipli 0.038*divid 0.036*fill 0.029*arrai 0.020*polynomi 0.017*repres 0.015*inform
1 0.057*equal 0.051*varianc 0.043*sum 0.037*window 0.032*mask 0.032*arrai 0.027*axi 0.022*neg 0.018*fill_valu 0.018*element
2 0.047*scale 0.025*map 0.025*center 0.025*companion 0.022*count 0.022*numpi 0.018*appli 0.018*featur 0.015*paramet 0.015*partit
3 0.031*statist 0.022*option 0.020*ax 0.019*descript 0.019*ensembl 0.016*user 0.015*flatten 0.015*insert 0.014*subclust 0.014*compon
4 0.043*algorithm 0.036*minim 0.035*order 0.031*log 0.030*probabl 0.029*scalar 0.027*variabl 0.025*function 0.025*quadrat 0.019*section
5 0.041*trim 0.025*dictionari 0.025*sequenc 0.022*digit 0.022*stage 0.020*period 0.020*transform 0.020*error 0.019*stream 0.018*analog
6 0.111*fit 0.088*model 0.059*data 0.045*transform 0.022*estim 0.020*imag 0.018*linear 0.018*filter 0.018*squar 0.017*lowpass
7 0.046*boolean 0.041*filter 0.032*behavior 0.031*dissimilar 0.031*execut 0.030*calcul 0.029*arrai 0.023*dimension 0.019*call 0.018*maskedarrai
8 0.031*featur 0.025*reduc 0.025*updat 0.023*cluster 0.020*arr 0.018*step 0.017*rang 0.016*end 0.016*select 0.015*tree
9 0.044*weight 0.039*distanc 0.034*perform 0.032*cluster 0.025*document 0.025*pair 0.017*comput 0.016*point 0.016*term 0.014*sampl
10 0.032*system 0.032*nan 0.029*time 0.025*standard 0.024*likelihood 0.024*respons 0.023*deviat 0.022*axi 0.020*continu 0.018*reduct
11 0.048*differenti 0.030*find 0.028*reconstruct 0.021*point 0.020*id 0.020*comput 0.020*kei 0.019*dirichlet 0.019*entropi 0.019*interpol
12 0.043*arrai 0.042*diagon 0.033*determin 0.033*shape 0.032*valu 0.030*column 0.025*matrix 0.022*argument 0.022*nonzero 0.019*indic
13 0.038*initi 0.033*integ 0.031*bit 0.027*rais 0.027*func 0.026*bound 0.022*wishart 0.017*hyperrectangl 0.017*read 0.016*power
14 0.057*paramet 0.052*call 0.048*found 0.043*estim 0.032*row 0.031*dtype 0.028*csr 0.024*csc 0.017*matrix 0.016*predict_log_proba
15 0.094*evalu 0.050*point 0.030*posit 0.027*kernel 0.026*spline 0.025*seri 0.023*median 0.022*polynomi 0.021*deriv 0.017*hermite
16 0.024*make 0.022*limit 0.022*append 0.021*logarithm 0.019*common 0.018*multidimension 0.018*equat 0.018*arrai 0.018*frozen 0.016*valu
17 0.044*spars 0.039*fourier 0.035*transform 0.033*matrix 0.032*discret 0.028*invers 0.025*format 0.020*convert 0.019*dimension 0.019*comput
18 0.041*interpol 0.032*function 0.030*helper 0.028*multivari 0.026*transpos 0.019*field 0.017*normal 0.016*densiti 0.014*underli 0.014*linear
19 0.050*read 0.050*header 0.047*file 0.035*object 0.032*write 0.030*cartesian 0.029*valid 0.027*product 0.020*node 0.019*arrai
20 0.054*root 0.054*vector 0.048*matrix 0.043*multipl 0.036*svd 0.035*rank 0.023*arrai 0.021*ndarrai 0.021*comput 0.018*test
21 0.034*comput 0.030*decomposit 0.029*index 0.027*solv 0.023*correl 0.021*equat 0.021*choleski 0.018*laguerr 0.018*cross 0.017*curv
22 0.036*set 0.036*construct 0.031*attribut 0.027*decis 0.025*print 0.024*comput 0.023*integr 0.022*instanc 0.019*np 0.019*function
23 0.037*arrai 0.029*wishart 0.024*creat 0.022*group 0.021*transfer 0.021*exp 0.020*eigenvector 0.018*distribut 0.017*eigenvalu 0.017*comput
24 0.063*legendr 0.032*run 0.031*case 0.029*kind 0.024*learn 0.023*function 0.017*nearest 0.017*charact 0.015*helper 0.015*string
25 0.059*train 0.043*fit 0.038*gauss 0.038*data 0.037*quadratur 0.037*model 0.030*float 0.024*histogram 0.019*charact 0.019*bin
26 0.044*covari 0.031*embed 0.029*gradient 0.027*privat 0.025*function 0.025*conjug 0.020*decision_funct 0.018*str 0.018*job 0.017*pole
27 0.048*represent 0.033*bessel 0.030*function 0.025*power 0.023*comput 0.021*deriv 0.020*space 0.020*matrix 0.018*system 0.018*fraction
28 0.068*predict 0.068*class 0.038*sampl 0.038*probabl 0.035*rel 0.028*precis 0.021*random 0.021*estim 0.021*comput 0.020*complex
29 0.062*hermit 0.042*seri 0.032*add 0.030*subtract 0.028*remov 0.022*current 0.021*intern 0.021*docstr 0.017*mask 0.016*trail
30 0.031*coordin 0.026*approxim 0.026*shift 0.025*repeat 0.024*descent 0.020*part 0.020*euclidean 0.019*gradient 0.018*function 0.017*arrai
31 0.052*copi 0.046*place 0.032*sort 0.031*averag 0.026*featur 0.025*classifi 0.023*linearoper 0.019*decim 0.018*select 0.016*matrix
32 0.032*matrix 0.031*norm 0.025*file 0.024*random 0.023*cumul 0.020*main 0.019*spectral 0.018*densiti 0.018*rv 0.017*power
33 0.041*vandermond 0.040*degre 0.037*multi 0.037*binari 0.034*pseudo 0.031*matrix 0.029*target 0.029*structur 0.026*true 0.022*regress
34 0.047*minimum 0.025*check 0.024*cosin 0.024*window 0.023*match 0.023*distribut 0.021*concaten 0.019*element 0.019*wise 0.019*calcul
35 0.048*score 0.026*string 0.025*precis 0.025*nth 0.021*type 0.021*permut 0.020*modifi 0.019*full 0.017*function 0.017*comput
36 0.041*dimens 0.027*final 0.024*estim 0.024*valid 0.023*input 0.022*number 0.020*arrai 0.019*matlab 0.018*file 0.018*partial
37 0.052*iter 0.046*gener 0.043*coeffici 0.033*random 0.033*variat 0.031*spline 0.027*project 0.019*cubic 0.018*matrix 0.014*similar
38 0.048*maximum 0.037*axi 0.033*record 0.025*back 0.021*origin 0.020*slice 0.020*data 0.019*length 0.019*ridg 0.018*width
39 0.126*wise 0.103*element 0.040*product 0.039*matric 0.037*string 0.030*arrai 0.022*represent 0.021*spars 0.018*axi 0.015*kroneck
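
Eyeballing 40 stemmed topics only goes so far. If the installed gensim is recent enough to ship `CoherenceModel`, per-model topic coherence gives a rough quality signal (a sketch, not run in this session):

    cm = g.models.CoherenceModel(model=lda, texts=texts,
                                 dictionary=id2word, coherence='c_v')
    print(cm.get_coherence())   # mean coherence across the 40 topics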

In [ ]:
#n_topics = 20
## extract n_topics LSI topics; use the default one-pass algorithm
#lsi = g.models.lsimodel.LsiModel(corpus=corpus, id2word=id2word, num_topics=n_topics)
## print the most contributing words (both positively and negatively) for each of the n_topics topics
#print(*lsi.print_topics(n_topics), sep='\n')

In [124]:
lll = lda  # the model behind the similarity index; swap in lsi to compare
#index = g.similarities.SparseMatrixSimilarity(lll[corpus], num_features=n_topics)
index = g.similarities.Similarity(None, lll[corpus], num_features=n_topics)  # None prefix: index shards go to temp files
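
With only 3,979 documents projected into 40 topics, an in-memory index is a reasonable alternative to the sharded `Similarity` (standard gensim API):

    index = g.similarities.MatrixSimilarity(lll[corpus], num_features=n_topics)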

In [160]:
i_doc = 3301
show = l_doc  # which text to display for each hit
text = texts[i_doc]
sims = index[lll[tfidf[id2word.doc2bow(text)]]]
print(i_doc, '', l_interested[i_doc], sep=' | ')
for i, score in sorted(enumerate(sims), key=lambda t: t[1], reverse=True)[:10]:
    print(i, 
          inspect.getmodule(l_interested[i]),
          l_interested[i].__name__,
          "%.3f"%score, 
          show[i].strip(), 
          sep=' | ')


3301 |  | <function linkage at 0x000000001809B6A8>
3301 | <module 'scipy.cluster.hierarchy' from 'C:\\Anaconda3\\envs\\cnn\\lib\\site-packages\\scipy\\cluster\\hierarchy.py'> | linkage | 1.000 | Performs hierarchical/agglomerative clustering on the condensed
distance matrix y.
2618 | <module 'sklearn.decomposition.online_lda' from 'C:\\Anaconda3\\envs\\cnn\\lib\\site-packages\\sklearn\\decomposition\\online_lda.py'> | _update_doc_distribution | 1.000 | E-step: update document-topic distribution.
1359 | <module 'sklearn.cluster.k_means_' from 'C:\\Anaconda3\\envs\\cnn\\lib\\site-packages\\sklearn\\cluster\\k_means_.py'> | _labels_inertia_minibatch | 0.999 | Compute labels and inertia using mini batches.
2677 | <module 'sklearn.neural_network.rbm' from 'C:\\Anaconda3\\envs\\cnn\\lib\\site-packages\\sklearn\\neural_network\\rbm.py'> | gibbs | 0.983 | Perform one Gibbs sampling step.
3 | <module 'scipy.cluster.hierarchy' from 'C:\\Anaconda3\\envs\\cnn\\lib\\site-packages\\scipy\\cluster\\hierarchy.py'> | cophenet | 0.982 | Calculates the cophenetic distances between each observation in
the hierarchical clustering defined by the linkage ``Z``.
3030 | <module 'scipy.io.matlab.mio5_params' from 'C:\\Anaconda3\\envs\\cnn\\lib\\site-packages\\scipy\\io\\matlab\\mio5_params.py'> | _convert_codecs | 0.907 | Convert codec template mapping to byte order
2822 | <module 'numpy._import_tools' from 'C:\\Anaconda3\\envs\\cnn\\lib\\site-packages\\numpy\\_import_tools.py'> | get_pkgdocs | 0.723 | Return documentation summary of subpackages.
1300 | <module 'sklearn.cluster.k_means_' from 'C:\\Anaconda3\\envs\\cnn\\lib\\site-packages\\sklearn\\cluster\\k_means_.py'> | fit | 0.722 | Compute k-means clustering.
445 | <module 'scipy.io.netcdf' from 'C:\\Anaconda3\\envs\\cnn\\lib\\site-packages\\scipy\\io\\netcdf.py'> | itemsize | 0.722 | Return the itemsize of the variable.
393 | <module 'scipy.stats.stats' from 'C:\\Anaconda3\\envs\\cnn\\lib\\site-packages\\scipy\\stats\\stats.py'> | f_oneway | 0.722 | Performs a 1-way ANOVA.
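
Note: cosine similarity over sparse 40-dimensional topic vectors saturates quickly, which is why the top hits all score near 1.0. If the installed gensim provides `gensim.matutils.hellinger`, Hellinger distance on the topic distributions discriminates more finely (a sketch; smaller means more similar):

    from gensim.matutils import hellinger
    query = lll[tfidf[id2word.doc2bow(texts[i_doc])]]
    dists = sorted((hellinger(query, lll[bow]), i) for i, bow in enumerate(corpus))
    print(dists[:10])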

In [155]:
print([i_doc for i_doc, doc in enumerate(l_doc) if 'cluster' in doc and 'project' in l_interested[i_doc].__name__])


[1593]

In [137]:
l_interested[1593]


Out[137]:
<function sklearn.cluster.bicluster.SpectralBiclustering._project_and_cluster>

In [139]:
tfidf[id2word.doc2bow(texts[1593])]


Out[139]:
[(15, 0.4276524485396137),
 (77, 0.5741410809762693),
 (105, 0.2620051940743123),
 (129, 0.5169114161390884),
 (578, 0.3893985985293709)]

In [140]:
lda[corpus[1593]]


Out[140]:
[(8, 0.51086741256130108), (37, 0.18945832436536456)]
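
Only topics above gensim's default probability cutoff are listed. The full 40-topic distribution can be requested explicitly (standard `LdaModel` API in reasonably recent gensim):

    print(lda.get_document_topics(corpus[1593], minimum_probability=0))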

In [138]:
corpus[1593]


Out[138]:
[(15, 0.4276524485396137),
 (77, 0.5741410809762693),
 (105, 0.2620051940743123),
 (129, 0.5169114161390884),
 (578, 0.3893985985293709)]
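
Out[138] matches Out[139] exactly because In [122] rebound `corpus` to its tf-idf transform; a quick consistency check:

    assert corpus[1593] == tfidf[id2word.doc2bow(texts[1593])]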

In [154]:
import inspect
inspect.getmodule(l_interested[i_doc])


Out[154]:
<module 'scipy.cluster.hierarchy' from 'C:\\Anaconda3\\envs\\cnn\\lib\\site-packages\\scipy\\cluster\\hierarchy.py'>
