In [1]:
from gensim.corpora.dictionary import Dictionary
from nltk.tokenize import word_tokenize
from gensim.models.tfidfmodel import TfidfModel


C:\anaconda\lib\site-packages\gensim\utils.py:865: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
  warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")

In [2]:
# Toy corpus: six short movie-review sentences.
my_documents = [
    'The movie was about a spaceship and aliens.',
    'I really liked the movie!',
    'Awesome action scenes, but boring characters.',
    'The movie was awful! I hate alien films.',
    'Space is cool! I liked the movie.',
    'More space films, please!',
]

# Lower-case each sentence before tokenizing so that e.g. "The" and "the"
# end up as the same token.
tokenized_docs = [
    word_tokenize(lowered) for lowered in map(str.lower, my_documents)
]

# Build the token <-> integer id mapping over every tokenized document.
dictionary = Dictionary(tokenized_docs)

In [3]:
# Inspect the tokenizer output: one list of lower-cased tokens per document.
tokenized_docs


Out[3]:
[['the', 'movie', 'was', 'about', 'a', 'spaceship', 'and', 'aliens', '.'],
 ['i', 'really', 'liked', 'the', 'movie', '!'],
 ['awesome', 'action', 'scenes', ',', 'but', 'boring', 'characters', '.'],
 ['the', 'movie', 'was', 'awful', '!', 'i', 'hate', 'alien', 'films', '.'],
 ['space', 'is', 'cool', '!', 'i', 'liked', 'the', 'movie', '.'],
 ['more', 'space', 'films', ',', 'please', '!']]

In [5]:
# Convert every tokenized document into its bag-of-words form, i.e. a list of
# (token_id, count) pairs looked up through `dictionary`.
corpus = list(map(dictionary.doc2bow, tokenized_docs))

# Leave the corpus as the last expression so the notebook displays it.
corpus


Out[5]:
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
 [(0, 1), (1, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(8, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)],
 [(0, 1),
  (1, 1),
  (2, 1),
  (8, 1),
  (9, 1),
  (12, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1)],
 [(0, 1), (1, 1), (8, 1), (9, 1), (11, 1), (12, 1), (24, 1), (25, 1), (26, 1)],
 [(12, 1), (16, 1), (23, 1), (24, 1), (27, 1), (28, 1)]]

In [7]:
# Pick the first document's bag-of-words vector as the one to inspect.
doc = corpus[0]

# Fit the tf-idf model on the whole bag-of-words corpus.
tfidf = TfidfModel(corpus=corpus)

In [8]:
# Apply the fitted model to the selected document; the result is a sequence of
# (term_id, weight) pairs.
tfidf_weights = tfidf[doc]

# Peek at the first five pairs in their original (term id) order.
print(tfidf_weights[:5])

# Rank the pairs by weight, highest first. The sort is stable, so terms with
# equal weight keep the order they had in `tfidf_weights`.
sorted_tfidf_weights = list(tfidf_weights)
sorted_tfidf_weights.sort(key=lambda pair: pair[1], reverse=True)


[(0, 0.09623384247925978), (1, 0.09623384247925978), (2, 0.26074668280769164), (3, 0.42525952313612353), (4, 0.42525952313612353)]

In [9]:
# Print the five highest-weighted terms of the document, translating each
# term id back to its token through the dictionary.
for term_id, weight in sorted_tfidf_weights[:5]:
    term = dictionary.get(term_id)
    print(term, weight)


about 0.42525952313612353
a 0.42525952313612353
spaceship 0.42525952313612353
and 0.42525952313612353
aliens 0.42525952313612353