In [43]:
documents_long = [
    "the reticulated python is a species of python found in southeast asia and the longest snake in the world",
    "the burmese python is a large snake native to tropical southeast asia",
    "python is an interpreted high level programming language for general purpose programming",
    "the green anaconda also known as the common anaconda is a non venomous snake species found in south america",
    "the yellow anaconda is a snake species endemic to south america",
    "anaconda is an open source distribution of the python and r programming languages for data science applications"
]

first = 'egg bacon sausage and spam'
second = 'spam bacon homar sausage and spam'
third = 'spam egg spam and spam'

documents = [first, second, third]

Binary sets by hand


In [46]:
def build_vocabulary(documents):
    vocabulary = set()
    for document in documents:
        vocabulary.update(document.split())
    return sorted(vocabulary)

def build_matrix(documents, vocabulary):
    matrix = []
    for document in documents:
        matrix.append([int(term in document) for term in vocabulary])
    return matrix

vocabulary = build_vocabulary(documents)
term_document_matrix = build_matrix(documents, vocabulary)

print(vocabulary)
print(term_document_matrix)


['and', 'bacon', 'egg', 'homar', 'sausage', 'spam']
[[1, 1, 1, 0, 1, 1], [1, 1, 0, 1, 1, 1], [1, 0, 1, 0, 0, 1]]

In [49]:
from sklearn.feature_extraction.text import CountVectorizer

documents = [first, second, third]
vectorizer = CountVectorizer(binary=True)
term_document_matrix = vectorizer.fit_transform(documents)

print(type(term_document_matrix))
print(vectorizer.get_feature_names())


<class 'scipy.sparse.csr.csr_matrix'>
['and', 'bacon', 'egg', 'homar', 'sausage', 'spam']

In [48]:
print(term_document_matrix.toarray())


[[1 1 1 0 1 1]
 [1 1 0 1 1 1]
 [1 0 1 0 0 1]]

tf-idf by hand


In [4]:
from collections import Counter

count = Counter(documents[0].split())
terms = count.keys() 
total = sum(count.values())
tf = {t: round(count[t]/total, 2) for t in terms}
print(tf)


{'reticulated': 0.05, 'world': 0.05, 'is': 0.05, 'and': 0.05, 'species': 0.05, 'python': 0.11, 'snake': 0.05, 'asia': 0.05, 'southeast': 0.05, 'a': 0.05, 'the': 0.16, 'longest': 0.05, 'found': 0.05, 'in': 0.11, 'of': 0.05}

In [5]:
from math import log 

idf = {t: log(len(documents)/sum(t in d.split() for d in documents)) for t in terms}
tf_idf = {t: round(tf[t]*idf[t], 3) for t in terms}
print(tf_idf)


{'reticulated': 0.09, 'world': 0.09, 'is': 0.0, 'and': 0.055, 'species': 0.035, 'python': 0.045, 'snake': 0.02, 'asia': 0.055, 'southeast': 0.055, 'a': 0.02, 'the': 0.029, 'longest': 0.09, 'found': 0.055, 'in': 0.121, 'of': 0.055}

In [2]:
import pandas as 🐼

🐼.dataFrame(['what', 'the', 'hell'])


  File "<ipython-input-2-c89f763203d8>", line 1
    import pandas as _🐼_
                       ^
SyntaxError: invalid character in identifier

In [ ]: