In [43]:
documents_long = [
"the reticulated python is a species of python found in southeast asia and the longest snake in the world",
"the burmese python is a large snake native to tropical southeast asia",
"python is an interpreted high level programming language for general purpose programming",
"the green anaconda also known as the common anaconda is a non venomous snake species found in south america",
"the yellow anaconda is a snake species endemic to south america",
"anaconda is an open source distribution of the python and r programming languages for data science applications"
]
first = 'egg bacon sausage and spam'
second = 'spam bacon lobster sausage and spam'
third = 'spam egg spam and spam'
documents = [first, second, third]
In [46]:
def build_vocabulary(documents):
    # Collect every distinct term across the corpus and return it sorted.
    vocabulary = set()
    for document in documents:
        vocabulary.update(document.split())
    return sorted(vocabulary)

def build_matrix(documents, vocabulary):
    # Binary term-document matrix: 1 if the term occurs in the document, 0 otherwise.
    # Splitting first avoids false substring matches (e.g. 'am' inside 'spam').
    matrix = []
    for document in documents:
        terms = set(document.split())
        matrix.append([int(term in terms) for term in vocabulary])
    return matrix
vocabulary = build_vocabulary(documents)
term_document_matrix = build_matrix(documents, vocabulary)
print(vocabulary)
print(term_document_matrix)
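The raw nested list is hard to read on its own; a minimal sketch of the same matrix shown as a labelled table, assuming pandas is available in the environment (the row labels are just the variable names used above):
In [ ]:
# Sketch: label the binary matrix with the vocabulary and the documents (assumes pandas).
import pandas as pd
pd.DataFrame(term_document_matrix, columns=vocabulary, index=['first', 'second', 'third'])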
In [49]:
from sklearn.feature_extraction.text import CountVectorizer
documents = [first, second, third]
vectorizer = CountVectorizer(binary=True)
term_document_matrix = vectorizer.fit_transform(documents)
print(type(term_document_matrix))
print(vectorizer.get_feature_names_out())
In [48]:
print(term_document_matrix.toarray())
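Dropping binary=True gives raw occurrence counts instead of 0/1 indicators; a quick illustrative sketch on the same corpus (not one of the original cells):
In [ ]:
# Sketch: the same corpus vectorized with raw term counts instead of binary indicators.
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(documents)
print(count_vectorizer.get_feature_names_out())
print(count_matrix.toarray())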
In [4]:
from collections import Counter
count = Counter(documents[0].split())
terms = count.keys()
total = sum(count.values())
tf = {t: round(count[t]/total, 2) for t in terms}
print(tf)
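The same term-frequency computation can be wrapped in a helper and applied to every document, not just documents[0]; a small sketch (the helper name term_frequencies is introduced here for illustration):
In [ ]:
# Sketch: term frequencies for every document in the corpus.
def term_frequencies(document):
    count = Counter(document.split())
    total = sum(count.values())
    return {t: round(count[t] / total, 2) for t in count}

for d in documents:
    print(term_frequencies(d))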
In [5]:
from math import log
idf = {t: log(len(documents)/sum(t in d.split() for d in documents)) for t in terms}
tf_idf = {t: round(tf[t]*idf[t], 3) for t in terms}
print(tf_idf)
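For comparison, scikit-learn ships a ready-made TfidfVectorizer. By default it uses a smoothed IDF (log((1 + n) / (1 + df)) + 1) and L2-normalizes each row, so its numbers will not match the hand-computed values above; a minimal sketch:
In [ ]:
# Sketch: TF-IDF with scikit-learn (smoothed IDF and L2 normalization by default,
# so the values differ from the manual computation above).
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
print(tfidf_vectorizer.get_feature_names_out())
print(tfidf_matrix.toarray().round(3))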
In [2]:
import pandas as pd
pd.DataFrame(['what', 'the', 'hell'])