In [1]:
import numpy as np
import pandas as pd
import os
import sys
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
In [2]:
# Class labels used throughout the notebook.
HAM = 'ham'
SPAM = 'spam'

# Separator used when stitching email lines back together.
NEWLINE = '\n'

# (directory, label) pairs: one corpus folder per class.
sources = [
    ('../data/enron3/ham/', HAM),
    ('../data/enron3/spam/', SPAM),
]

# File basenames to ignore while walking the corpus directories.
SKIP_FILES = {'cmds'}
In [3]:
def read_files(path):
    '''
    Generator of pairs (file_path, file_content) for every regular
    file below `path` whose basename is not in SKIP_FILES.

    Yields the full text of each file (headers included), decoded as
    latin-1 so arbitrary byte values in the corpus cannot raise
    UnicodeDecodeError.
    '''
    # os.walk already descends into every subdirectory, so no explicit
    # recursion is needed (the original recursive call produced a
    # generator that was never iterated, and shadowed `path`).
    for root, _dir_names, file_names in os.walk(path):
        for file_name in file_names:
            if file_name in SKIP_FILES:
                continue
            file_path = os.path.join(root, file_name)
            if os.path.isfile(file_path):
                # `with` guarantees the handle is closed even on error.
                # Reading the file whole also avoids the original bug of
                # joining lines (which already end in '\n') with another
                # '\n', which doubled every newline in the content.
                with open(file_path, encoding='latin-1') as f:
                    content = f.read()
                yield file_path, content
In [4]:
def build_data_frame(l, path, classification):
    """
    Collect every email below `path` into a labelled DataFrame.

    Parameters
    ----------
    l : int
        Unused; kept only so existing call sites keep working.
    path : str
        Directory scanned (recursively) for email files.
    classification : str
        Label ('ham' or 'spam') attached to every file found.

    Returns
    -------
    (pd.DataFrame, int)
        Frame indexed by file path with 'text' and 'class' columns,
        and the number of rows in it.
    """
    records = []
    file_paths = []
    for file_path, body in read_files(path):
        records.append({'text': body, 'class': classification})
        file_paths.append(file_path)
    frame = pd.DataFrame(records, index=file_paths)
    return frame, len(records)
def load_data():
    """
    Build one shuffled DataFrame containing every email from `sources`.

    Returns
    -------
    pd.DataFrame
        Indexed by file path, with 'text' and 'class' columns; rows are
        shuffled so ham and spam are interleaved before any split.
    """
    frames = []
    for path, classification in sources:
        data_frame, _nrows = build_data_frame(0, path, classification)
        frames.append(data_frame)
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # a single concat is also much faster than appending in a loop.
    if frames:
        data = pd.concat(frames)
    else:
        data = pd.DataFrame({'text': [], 'class': []})
    # NOTE(review): unseeded shuffle — set np.random.seed() earlier for
    # a reproducible row order.
    data = data.reindex(np.random.permutation(data.index))
    return data
In [5]:
# Let's load all the data
In [6]:
data = load_data()
In [7]:
data.shape
Out[7]:
In [8]:
data.describe()
Out[8]:
In [19]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegression

# Three candidate models, all shaped text -> vectorizer -> classifier.

# Unigram + bigram raw counts feeding a multinomial naive Bayes.
pipeline = Pipeline([
    ('counts', CountVectorizer(ngram_range=(1, 2))),
    ('nb', MultinomialNB()),
])

# TF-IDF features feeding logistic regression.
pipeline1 = Pipeline([
    ('tfidf', TfidfVectorizer()),  # step renamed: was the typo 'tfid'
    ('lr', LogisticRegression()),
])

# Same unigram + bigram counts feeding a complement naive Bayes
# (designed to be more robust on imbalanced classes).
pipeline2 = Pipeline([
    ('counts', CountVectorizer(ngram_range=(1, 2))),
    ('cnb', ComplementNB()),
])
In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% for evaluation. A fixed random_state makes the split
# reproducible across kernel restarts; stratifying on the label keeps
# the ham/spam ratio identical in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['class'],
    test_size=0.20, random_state=42, stratify=data['class'])
In [11]:
pipeline.fit(X_train, y_train)
Out[11]:
In [12]:
from sklearn.metrics import classification_report, confusion_matrix
# Per-class precision/recall/F1 on the held-out test split.
print(classification_report(y_test, pipeline.predict(X_test)))
In [13]:
confusion_matrix(y_test, pipeline.predict(X_test))
Out[13]:
In [14]:
# Fit and evaluate the TF-IDF + LogisticRegression pipeline on the same split.
pipeline1.fit(X_train, y_train)
print(classification_report(y_test, pipeline1.predict(X_test)))
In [15]:
confusion_matrix(y_test, pipeline1.predict(X_test))
Out[15]:
In [16]:
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score

def cross_validate(model, n_splits=6):
    """
    Run k-fold cross-validation of `model` over the global `data` frame.

    Parameters
    ----------
    model : sklearn estimator/pipeline
        Refit on each fold's training portion.
    n_splits : int
        Number of folds (default 6, matching the original notebook).

    Returns
    -------
    (float, np.ndarray)
        Mean F1 score across folds (SPAM as the positive label) and the
        element-wise sum of the per-fold 2x2 confusion matrices.
    """
    k_fold = KFold(n_splits=n_splits)
    scores = []
    confusion = np.array([[0, 0], [0, 0]])
    for train_indices, test_indices in k_fold.split(data):
        train_text = data.iloc[train_indices]['text'].values
        train_y = data.iloc[train_indices]['class'].values
        test_text = data.iloc[test_indices]['text'].values
        test_y = data.iloc[test_indices]['class'].values

        model.fit(train_text, train_y)
        predictions = model.predict(test_text)

        confusion += confusion_matrix(test_y, predictions)
        scores.append(f1_score(test_y, predictions, pos_label=SPAM))
    return sum(scores) / len(scores), confusion

def report_cv(model, name):
    """Cross-validate one pipeline and print its summary."""
    mean_f1, confusion = cross_validate(model)
    print('Model:', name)
    print('Total emails classified:', len(data))
    print('Score:', mean_f1)
    print('Confusion matrix:')
    print(confusion)

# The original notebook copy-pasted the identical CV loop three times,
# once per pipeline; a single function keeps the three runs consistent.
report_cv(pipeline, 'CountVectorizer + MultinomialNB')
report_cv(pipeline1, 'TfidfVectorizer + LogisticRegression')
report_cv(pipeline2, 'CountVectorizer + ComplementNB')
In [ ]: