In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
plt.style.use('ggplot')
%matplotlib inline
df = pd.read_table('data/preprocessed.tsv')
In [ ]:
# Toy corpus: four short sentences that all start with "it was the",
# small enough to inspect every vectorizer output by eye.
corpus = [
    "it was the best of times",
    "it was the worst of times",
    "it was the age of wisdom",
    "it was the age of foolishness",
]
In [ ]:
# Binary bag-of-words: with binary=True every non-zero count is set to 1,
# so each cell marks whether a term is present in a document.
vect = CountVectorizer(binary=True)
X = vect.fit_transform(corpus).toarray()
# get_feature_names() was removed in scikit-learn 1.2; the replacement
# get_feature_names_out() (available since 1.0) supplies the column labels.
pd.DataFrame(X, columns=vect.get_feature_names_out())
In [ ]:
# Plain bag-of-words: each cell holds the number of times a term occurs
# in a document (default CountVectorizer settings).
vect = CountVectorizer()
X = vect.fit_transform(corpus).toarray()
# get_feature_names() was removed in scikit-learn 1.2; the replacement
# get_feature_names_out() (available since 1.0) supplies the column labels.
pd.DataFrame(X, columns=vect.get_feature_names_out())
In [ ]:
# TF-IDF weighting: term counts are rescaled by inverse document frequency,
# so terms shared by every sentence get lower weights than rarer ones.
vect = TfidfVectorizer()
X = vect.fit_transform(corpus).toarray()
# get_feature_names() was removed in scikit-learn 1.2; the replacement
# get_feature_names_out() (available since 1.0) supplies the column labels.
pd.DataFrame(X, columns=vect.get_feature_names_out())
In [ ]:
# Preview the first five rows of the loaded dataset.
df.head(n=5)
In [ ]:
# Bag-of-words over the dataset's `title` column, dropping the built-in
# English stop-word list.
corpus = df['title']
vect = CountVectorizer(stop_words='english')
# NOTE(review): .toarray() densifies the (documents x vocabulary) matrix;
# this may be memory-heavy if the dataset is large -- confirm its size.
X = vect.fit_transform(corpus).toarray()
# get_feature_names() was removed in scikit-learn 1.2; the replacement
# get_feature_names_out() (available since 1.0) supplies the column labels.
pd.DataFrame(X, columns=vect.get_feature_names_out())
In [ ]:
# Same title bag-of-words, but max_features=10 restricts the vocabulary to
# the 10 terms with the highest frequency across the corpus.
corpus = df['title']
vect = CountVectorizer(stop_words='english', max_features=10)
X = vect.fit_transform(corpus).toarray()
# get_feature_names() was removed in scikit-learn 1.2; the replacement
# get_feature_names_out() (available since 1.0) supplies the column labels.
pd.DataFrame(X, columns=vect.get_feature_names_out())
In [ ]:
# ENTER CODE HERE