In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Toy corpus of six short documents; the first and the last are identical.
d = [
    'the red apple',
    'the orange',
    'the green apple',
    'the lemon',
    'the plum',
    'the red apple',
]

# Learn the vocabulary from the corpus and count term occurrences per document.
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(d)

# Densify only for display. The call form of print works on Python 2 and 3,
# whereas the statement form is a syntax error on Python 3.
print(x.toarray())


[[1 0 0 0 0 1 1]
 [0 0 0 1 0 0 1]
 [1 1 0 0 0 0 1]
 [0 0 1 0 0 0 1]
 [0 0 0 0 1 0 1]
 [1 0 0 0 0 1 1]]

In [12]:
# Re-derive the raw counts from the fitted vectorizer rather than reading `x`:
# this cell rebinds `x`, so feeding `x` back in would apply TF-IDF on top of an
# earlier TF-IDF result whenever the cell is executed more than once.
counts = vectorizer.transform(d)

# Re-weight the term counts with TF-IDF; `x` ends up bound to the weighted
# matrix, exactly as before.
transformer = TfidfTransformer()
x = transformer.fit_transform(counts)

# Call form of print works on Python 2 and 3.
print(x.toarray())


[[ 0.58873218  0.          0.          0.          0.          0.72971837
   0.34771471]
 [ 0.          0.          0.          0.90275015  0.          0.
   0.43016528]
 [ 0.58873218  0.72971837  0.          0.          0.          0.
   0.34771471]
 [ 0.          0.          0.90275015  0.          0.          0.
   0.43016528]
 [ 0.          0.          0.          0.          0.90275015  0.
   0.43016528]]

In [55]:
# Single-document corpus used by the gensim cells.
# A second sample sentence ('this is my dissertation text') is currently disabled.
# NOTE(review): 'apresentation' looks like a misspelling of 'presentation';
# left untouched because the recorded outputs contain this exact token.
document = ['This is my dissertation apresentation']

In [56]:
from gensim import corpora, models, similarities

# Tokenise: lowercase each document and split it on whitespace.
texts = [sentence.lower().split() for sentence in document]

In [57]:
# Show the tokenised corpus: one list of lowercase tokens per entry of `document`.
texts


Out[57]:
[['this', 'is', 'my', 'dissertation', 'apresentation']]

In [58]:
# Build the gensim Dictionary (token <-> integer id mapping) from the tokenised corpus.
dictionary = corpora.Dictionary(texts)

In [59]:
# Convert every token list into gensim's bag-of-words representation.
corpus = [dictionary.doc2bow(tokens) for tokens in texts]

In [60]:
# Document frequencies: token id -> number of documents that contain the token.
dictionary.dfs


Out[60]:
{0: 1, 1: 1, 2: 1, 3: 1, 4: 1}

In [76]:
# gensim's LdaModel defaults to 100 topics; spell the value out so the choice
# is visible and tunable.
# NOTE(review): 100 topics for a one-document, five-token corpus gives
# identical, uniform topics (every weight is 0.200 in the recorded output);
# consider a much smaller value.
NUM_TOPICS = 100

# Fixed seed so that model training is reproducible from one run to the next.
SEED = 42

lda = models.LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    random_state=SEED,
)
lda.print_topics()


Out[76]:
[(37,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (78,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (81,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (75,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (4,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (70,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (38,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (72,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (62,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (99,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my')]

In [80]:
# Display topics as (topic_id, formatted "weight*word + ..." string) pairs.
lda.show_topics()


Out[80]:
[(2,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (1,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (35,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (60,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (67,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (70,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (28,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (16,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (42,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my'),
 (37,
  u'0.200*this + 0.200*dissertation + 0.200*is + 0.200*apresentation + 0.200*my')]

In [ ]: