notebook.community

Edit and run



In [4]:

    
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Toy corpus: six short documents (the first and the last are identical).
d = ['the red apple', 'the orange', 'the green apple', 'the lemon', 'the plum', 'the red apple']

# Learn the vocabulary from `d` and build the document-term count matrix.
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(d)

# print() with one parenthesised argument runs on both Python 2 and Python 3;
# the bare `print x.toarray()` statement is a SyntaxError on Python 3.
print(x.toarray())









    



[[1 0 0 0 0 1 1]
 [0 0 0 1 0 0 1]
 [1 1 0 0 0 0 1]
 [0 0 1 0 0 0 1]
 [0 0 0 0 1 0 1]
 [1 0 0 0 0 1 1]]



In [12]:

    
# Re-weight the raw term counts held in `x` with TF-IDF.
transformer = TfidfTransformer()

# Bind the result to a new name instead of overwriting `x`: with `x = f(x)`
# every re-run of this cell would feed already-weighted values back in, so the
# cell would not be idempotent. `x` keeps the raw counts.
x_tfidf = transformer.fit_transform(x)

# print() form runs on both Python 2 and Python 3.
print(x_tfidf.toarray())









    



[[ 0.58873218  0.          0.          0.          0.          0.72971837
   0.34771471]
 [ 0.          0.          0.          0.90275015  0.          0.
   0.43016528]
 [ 0.58873218  0.72971837  0.          0.          0.          0.
   0.34771471]
 [ 0.          0.          0.90275015  0.          0.          0.
   0.43016528]
 [ 0.          0.          0.          0.          0.90275015  0.
   0.43016528]]



In [55]:

    
# Corpus as a list of raw document strings; it currently holds one document.
# A second sample ('this is my dissertation text') is intentionally left out.
document = ['This is my dissertation apresentation']



In [56]:

    
from gensim import corpora, models, similarities

# Tokenize: lower-case each raw document, then split it on whitespace.
texts = [raw_doc.lower().split() for raw_doc in document]



In [57]:

    
# Display the tokenized documents (one list of lower-cased tokens per document).
texts









    Out[57]:





[['this', 'is', 'my', 'dissertation', 'apresentation']]



In [58]:

    
# Build a gensim Dictionary from the tokenized documents in `texts`.
dictionary = corpora.Dictionary(texts)



In [59]:

    
# Convert every tokenized document to its bag-of-words form via the dictionary.
corpus = [dictionary.doc2bow(tokens) for tokens in texts]



In [60]:

    
# Inspect the dictionary's `dfs` mapping (keyed by integer token id;
# presumably the number of documents containing each token — confirm in the gensim docs).
dictionary.dfs









    Out[60]:





{0: 1, 1: 1, 2: 1, 3: 1, 4: 1}



In [76]:

    
# Train an LDA topic model on the bag-of-words corpus.
# - random_state pins the model's random number generator so that
#   Restart & Run All reproduces the same model; without it every run differs.
# - num_topics=100 spells out the value gensim uses by default, so the topic
#   count is visible and tunable here.
# NOTE(review): 100 topics is far more than a single five-token document can
# support; the output shows every topic as the same uniform 0.200 mixture.
# Lower num_topics or enlarge the corpus before interpreting the topics.
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=100, random_state=42)

# Last expression of the cell: show the topics as (topic id, weighted words).
lda.print_topics()









    Out[76]:





[(37,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (78,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (81,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (75,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (4,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (70,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (38,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (72,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (62,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (99,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my')]



In [80]:

    
# Display a selection of the trained model's topics as
# (topic id, weighted-word string) pairs.
lda.show_topics()









    Out[80]:





[(2,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (1,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (35,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (60,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (67,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (70,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (28,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (16,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (42,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (37,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my')]



In [ ]: