In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sn
In [8]:
# Load the SMS corpus. Each non-blank line is read as
# "<label><whitespace><message text>".
# NOTE(review): no encoding is passed to open(), so decoding falls back to the
# platform default -- confirm the file's encoding and pass it explicitly.
with open('SMSSpamCollection.txt') as fh:
    lines = list(fh)

# Tokenise every line exactly once. The first token is the label; the remaining
# tokens are re-joined with single spaces to form the message text.
# Blank / whitespace-only lines (e.g. a trailing empty line) produce no tokens
# and are skipped instead of raising IndexError on `[0]`.
data = [
    (tokens[0], ' '.join(tokens[1:]))
    for tokens in (line.split() for line in lines)
    if tokens
]

# Two string columns: the class label and the whitespace-normalised text.
data_df = pd.DataFrame.from_records(data, columns=['label', 'text'])
In [12]:
# Number of rows per class label, most frequent first.
counts = data_df['label'].value_counts()

# Bar chart of the class balance; rot=0 keeps the tick labels horizontal.
ax = counts.plot(kind='bar', rot=0)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(title='Class distribution', xlabel='label', ylabel='count');
In [14]:
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
In [25]:
# Bag-of-words features feeding a multinomial naive Bayes classifier.
pipeline = Pipeline([
    # Lower-case the text; keep terms that occur in at least 5 documents and
    # in at most 30% of all documents.
    ('counter', CountVectorizer(min_df=5, max_df=0.3, lowercase=True)),
    # fit_prior=False: use a uniform class prior instead of learning it from
    # the class frequencies.
    ('model', MultinomialNB(fit_prior=False))
])

# Stratified, shuffled 10-fold split; the fixed random_state makes the folds
# reproducible across runs.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1000)

# Out-of-fold predictions: every sample is predicted by a model that did not
# see it during fitting.
predictions = cross_val_predict(
    pipeline,
    data_df.text.values,
    data_df.label.values,
    cv=cv)

# classification_report expects (y_true, y_pred). Passing them the other way
# round swaps precision with recall and reports support for the predicted
# labels instead of the true ones.
report = classification_report(data_df.label.values, predictions)
print(report)
In [ ]: