In [1]:
from gensim.corpora.dictionary import Dictionary
from nltk.tokenize import word_tokenize
from gensim.models.tfidfmodel import TfidfModel
In [2]:
# Small in-memory sample corpus: six short texts.
my_documents = [
    'The movie was about a spaceship and aliens.',
    'I really liked the movie!',
    'Awesome action scenes, but boring characters.',
    'The movie was awful! I hate alien films.',
    'Space is cool! I liked the movie.',
    'More space films, please!',
]

# Lower-case every text, then split it into tokens with NLTK's word_tokenize.
# NOTE(review): word_tokenize presumably needs NLTK's tokenizer data to be
# installed in the environment - confirm before a fresh run.
tokenized_docs = [
    word_tokenize(lowered_text)
    for lowered_text in map(str.lower, my_documents)
]

# Build the gensim Dictionary over all tokenized documents.
dictionary = Dictionary(documents=tokenized_docs)
In [3]:
# Bare expression: lets the notebook display the tokenized documents for inspection.
tokenized_docs
Out[3]:
In [5]:
# Convert every tokenized document with dictionary.doc2bow (gensim's
# bag-of-words encoding), keeping the original document order.
corpus = list(map(dictionary.doc2bow, tokenized_docs))
# Bare expression so the notebook displays the result.
corpus
Out[5]:
In [7]:
# Fit a TF-IDF model on the whole bag-of-words corpus.
tfidf = TfidfModel(corpus=corpus)
# The first document of the corpus serves as the example for the next cells.
doc = corpus[0]
In [8]:
# Apply the fitted model to the example document and peek at the first entries.
tfidf_weights = tfidf[doc]
print(tfidf_weights[:5])
# Order a copy of the entries by their second element, largest first.
sorted_tfidf_weights = list(tfidf_weights)
sorted_tfidf_weights.sort(key=lambda entry: entry[1], reverse=True)
In [9]:
# Print the first five entries of the weight-sorted list as "<token> <weight>".
for term_id, weight in sorted_tfidf_weights[:5]:
    # dictionary.get looks up the entry stored under the integer term id.
    print(dictionary.get(term_id), weight)