In [1]:
import pandas as pd
from collections import Counter
import numpy as np
d1 = "in the new york times in" # (repeated in)
d2 = "the new york post"
d3 = "the los angeles times"
docstrs = [d1,d2,d3]
docs = [s.split() for s in docstrs]
docs
Out[1]:
[['in', 'the', 'new', 'york', 'times', 'in'],
 ['the', 'new', 'york', 'post'],
 ['the', 'los', 'angeles', 'times']]
In [2]:
uniq = sorted(set(' '.join(docstrs).split()))
uniq
Out[2]:
['angeles', 'in', 'los', 'new', 'post', 'the', 'times', 'york']
In [25]:
tf = pd.DataFrame(data=uniq, columns=['word'])
tf = tf.set_index('word')
tf
Out[25]:
Empty DataFrame
Columns: []
Index: [angeles, in, los, new, post, the, times, york]
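So far tf is just an empty frame keyed by the sorted vocabulary; the next two cells count words per document and then merge one count column per document into it.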
In [26]:
Counter(d1.split()).items()
Out[26]:
dict_items([('in', 2), ('the', 1), ('new', 1), ('york', 1), ('times', 1)])
In [27]:
for i, d in enumerate(docs):
    # count this document's words and merge them in as a new column;
    # words absent from the document come through as NaN -> 0
    c = Counter(d)
    doc_items = pd.DataFrame.from_records(list(c.items()),
                                          columns=['word', f'd{i+1}'])
    doc_items = doc_items.set_index('word')
    tf = tf.merge(doc_items, on='word', how='left')
    tf = tf.fillna(0).astype('int')
print(tf)
print(tf.T)
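The merge loop works, but the same table can be built in one pass by handing a dict of Counters straight to the DataFrame constructor. A minimal sketch, assuming the docs and uniq objects defined above (tf_alt is a hypothetical name):

# one-pass alternative to the merge loop (sketch)
tf_alt = pd.DataFrame({f'd{i+1}': Counter(d) for i, d in enumerate(docs)},
                      index=uniq)         # align rows to the sorted vocabulary
tf_alt.index.name = 'word'
tf_alt = tf_alt.fillna(0).astype('int')   # absent words -> count 0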
In [6]:
df = pd.DataFrame(data=uniq, columns=['word'])
df = df.set_index('word')
df['doc count'] = [np.sum([w in d for d in docs]) for w in uniq]
df
Out[6]:
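Note that each d here is a list of tokens, so w in d is an exact-token membership test, not the substring match it would be on the raw strings; 'the' is found in all three documents, 'angeles' in only one.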
In [7]:
df['df'] = df['doc count'] / len(docs)
df['idf'] = 1 / df['df']
df['log idf'] = np.log10(df['idf'])
df
Out[7]:
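A quick check of the extremes: 'the' appears in all three documents, so df = 3/3 = 1, idf = 1, and log idf = 0, meaning it contributes nothing to any document's score. 'angeles' appears in just one document, giving idf = 3 and log idf = log10(3) ≈ 0.477. If preferred, the three columns collapse into the single expression np.log10(len(docs) / df['doc count']).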
In [8]:
tfidf = pd.concat([df, tf], axis=1)
tfidf
Out[8]:
In [9]:
tfidf['d1 tf'] = tfidf['d1'] / len(docs[0])
tfidf['d2 tf'] = tfidf['d2'] / len(docs[1])
tfidf['d3 tf'] = tfidf['d3'] / len(docs[2])
tfidf
Out[9]:
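These are length-normalized term frequencies: each raw count is divided by its document's token count, so the repeated 'in' in the six-token d1 gets tf = 2/6 ≈ 0.333, while every word in the four-token d2 and d3 gets tf = 0.25.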
In [10]:
tfidf['d1 tfidf'] = tfidf['d1 tf'] * tfidf['log idf']
tfidf['d2 tfidf'] = tfidf['d2 tf'] * tfidf['log idf']
tfidf['d3 tfidf'] = tfidf['d3 tf'] * tfidf['log idf']
tfidf
Out[10]:
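With the scores in place, the top-weighted term per document falls out of idxmax. A minimal sketch, assuming the tfidf frame above; the document-unique words win ('in' for d1, 'post' for d2, with 'angeles' and 'los' tied for d3), while the ubiquitous 'the' scores zero everywhere:

# highest-scoring term in each document (sketch)
for col in ['d1 tfidf', 'd2 tfidf', 'd3 tfidf']:
    print(col, '->', tfidf[col].idxmax())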
In [11]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docstrs)
X
Out[11]:
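X comes back as a 3x8 SciPy sparse matrix (three documents by eight vocabulary words) holding 13 nonzero counts. CountVectorizer lowercases and tokenizes with a regex, which for this corpus produces the same eight-word vocabulary as the manual split above.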
In [12]:
vectorizer.get_feature_names_out()
Out[12]:
array(['angeles', 'in', 'los', 'new', 'post', 'the', 'times', 'york'],
      dtype=object)
In [13]:
X.toarray()
Out[13]:
array([[0, 2, 0, 1, 0, 1, 1, 1],
       [0, 0, 0, 1, 1, 1, 0, 1],
       [1, 0, 1, 0, 0, 1, 1, 0]])
In [14]:
D = pd.DataFrame(data=X.toarray(), columns=vectorizer.get_feature_names_out())
D = D.T
D.columns = ['d1','d2','d3']
D
Out[14]:
         d1  d2  d3
angeles   0   0   1
in        2   0   0
los       0   0   1
new       1   1   0
post      0   1   0
the       1   1   1
times     1   0   1
york      1   1   0
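sklearn can also run the whole tf-idf pipeline in one object. A minimal sketch for comparison, assuming the same docstrs list; note that TfidfVectorizer's defaults differ from the hand computation above (raw counts for tf, a smoothed natural-log idf of ln((1+n)/(1+df)) + 1, and l2-normalized rows), so its numbers will not match the manual table exactly:

from sklearn.feature_extraction.text import TfidfVectorizer

# fit the full tf-idf pipeline and lay it out words-by-documents (sketch)
tfv = TfidfVectorizer()
T = tfv.fit_transform(docstrs)
pd.DataFrame(T.toarray().T, index=tfv.get_feature_names_out(),
             columns=['d1', 'd2', 'd3'])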