In [ ]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, recall_score
In [ ]:
# Load only the columns this notebook reads from the preprocessed TSV.
columns = ['title', 'description', 'selected']
df = pd.read_table('data/preprocessed.tsv', usecols=columns)

# Replace missing values in every loaded column with the empty string.
df = df.fillna("")
In [ ]:
# Target vector: the 'selected' column cast to int, exposed as a NumPy array.
selected = df['selected']
y = selected.astype(int).values
In [ ]:
# The documents to vectorize are the values of the 'title' column.
corpus = df.loc[:, 'title']
In [ ]:
# Build the TF-IDF matrix for the corpus (one row per document, one column
# per vocabulary term). sublinear_tf=True applies 1 + log(tf) scaling and
# stop_words='english' drops scikit-learn's built-in English stop words.
vect = TfidfVectorizer(sublinear_tf=True, stop_words='english')
X = vect.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

# Preview only: densify just the first five rows rather than materializing
# the whole sparse matrix to display its head.
pd.DataFrame(X[:5].toarray(), columns=feature_names)
In [ ]:
# Project the TF-IDF matrix onto 250 components with truncated SVD.
# random_state is pinned because the default solver is randomized; without
# it the reduced features (and every downstream score) change between runs.
svd = TruncatedSVD(n_components=250, random_state=42)

# NOTE: this rebinds X from the TF-IDF matrix to the reduced dense array, so
# re-running this cell on its own would apply SVD to already-reduced data.
# Re-run the vectorizer cell first.
X = svd.fit_transform(X)
pd.DataFrame(X).head()
In [ ]:
# Fit a Gaussian naive Bayes classifier on the reduced features.
# fit() returns the estimator itself, so construction and training chain.
gnb = GaussianNB().fit(X, y)
gnb
In [ ]:
# Score the classifier on the same data it was trained on, so this is
# training accuracy, not an estimate of generalization.
predictions = gnb.predict(X)

# accuracy_score divides by the actual number of samples instead of a
# hard-coded 290, so the figure stays correct if the dataset size changes
# and does not depend on the interpreter's integer-division rules.
print(accuracy_score(y, predictions))
In [ ]:
# Retrieve the corpus from the dataset
In [ ]:
# Obtain the term-document (TD) matrix
In [ ]:
# Reduce the dimensionality of the term-document matrix to 250 components
In [ ]:
# Train the classifier
In [ ]:
# Test the classifier
In [ ]: