Loading data


In [1]:
import numpy as np
import pandas as pd
import os
import sys

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
HAM = 'ham'
SPAM = 'spam'
NEWLINE = '\n'

sources = [('../data/enron3/ham/', HAM), ('../data/enron3/spam/', SPAM)]
SKIP_FILES = {'cmds'}

In [3]:
def read_files(path):
    '''
    Generator of pairs (filename, filecontent)
    for all files below path whose name is not in SKIP_FILES.
    Returns body only of the emails.
    '''
    for root, dir_names, file_names in os.walk(path):
        for path in dir_names:
            read_files(os.path.join(root, path))
        for file_name in file_names:
            if file_name not in SKIP_FILES:
                file_path = os.path.join(root, file_name)
                if os.path.isfile(file_path):
                    past_header, lines = False, []
                    f = open(file_path, encoding='latin-1')
                    for line in f:
                        lines.append(line)
                    f.close()
                    content = NEWLINE.join(lines)
                    yield file_path, content

In [4]:
def build_data_frame(l, path, classification):
    rows = []
    index = []
    
    for i, (file_name, text) in enumerate(read_files(path)):
        rows.append({'text': text, 'class': classification})
        index.append(file_name)
        
    data_frame = pd.DataFrame(rows, index=index)
    return data_frame, len(rows)

def load_data():
    data = pd.DataFrame({'text': [], 'class': []})
    l = 0
    
    for path, classification in sources:
        data_frame, nrows = build_data_frame(l, path, classification)
        data = data.append(data_frame)
        l += nrows
    data = data.reindex(np.random.permutation(data.index))
    
    return data

In [5]:
# Let's loadd all the data

In [6]:
data = load_data()


/home/ubuntu/miniconda3/lib/python3.7/site-packages/pandas/core/frame.py:6692: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  sort=sort)

In [7]:
data.shape


Out[7]:
(5512, 2)

In [8]:
data.describe()


Out[8]:
class text
count 5512 5512
unique 2 5274
top ham Subject: enron mentions\n\nusa : wrapup 1 - cr...
freq 4012 3

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
    ('counts', CountVectorizer(ngram_range=(1,2))),
    ('nb', MultinomialNB())
])

pipeline1 = Pipeline([
    ('tfid', TfidfVectorizer()),
    ('lr', LogisticRegression())
])

pipeline2 = Pipeline([
    ('counts', CountVectorizer(ngram_range=(1,2))),
    ('cnb', ComplementNB())
])

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data['text'], data['class'],test_size=.20)

In [11]:
pipeline.fit(X_train, y_train)


Out[11]:
Pipeline(memory=None,
         steps=[('counts',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('nb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [12]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_test, pipeline.predict(X_test)))


              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       776
        spam       1.00      0.91      0.95       327

    accuracy                           0.97      1103
   macro avg       0.98      0.96      0.97      1103
weighted avg       0.97      0.97      0.97      1103


In [13]:
confusion_matrix(y_test, pipeline.predict(X_test))


Out[13]:
array([[775,   1],
       [ 28, 299]])

In [14]:
pipeline1.fit(X_train, y_train)

print(classification_report(y_test, pipeline1.predict(X_test)))


/home/ubuntu/miniconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
              precision    recall  f1-score   support

         ham       0.98      0.99      0.99       776
        spam       0.99      0.96      0.98       327

    accuracy                           0.99      1103
   macro avg       0.99      0.98      0.98      1103
weighted avg       0.99      0.99      0.99      1103


In [15]:
confusion_matrix(y_test, pipeline1.predict(X_test))


Out[15]:
array([[772,   4],
       [ 12, 315]])

In [16]:
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score

k_fold = KFold(n_splits=6)
scores = []
confusion = np.array([[0, 0], [0, 0]])
for train_indices, test_indices in k_fold.split(data):
    train_text = data.iloc[train_indices]['text'].values
    train_y = data.iloc[train_indices]['class'].values

    test_text = data.iloc[test_indices]['text'].values
    test_y = data.iloc[test_indices]['class'].values

    pipeline.fit(train_text, train_y)
    predictions = pipeline.predict(test_text)

    confusion += confusion_matrix(test_y, predictions)
    score = f1_score(test_y, predictions, pos_label=SPAM)
    scores.append(score)

print('Total emails classified:', len(data))
print('Score:', sum(scores)/len(scores))
print('Confusion matrix:')
print(confusion)


Total emails classified: 5512
Score: 0.9516709009132277
Confusion matrix:
[[4007    5]
 [ 133 1367]]

In [17]:
k_fold = KFold(n_splits=6)
scores = []
confusion = np.array([[0, 0], [0, 0]])
for train_indices, test_indices in k_fold.split(data):
    train_text = data.iloc[train_indices]['text'].values
    train_y = data.iloc[train_indices]['class'].values

    test_text = data.iloc[test_indices]['text'].values
    test_y = data.iloc[test_indices]['class'].values

    pipeline1.fit(train_text, train_y)
    predictions = pipeline1.predict(test_text)

    confusion += confusion_matrix(test_y, predictions)
    score = f1_score(test_y, predictions, pos_label=SPAM)
    scores.append(score)

print('Total emails classified:', len(data))
print('Score:', sum(scores)/len(scores))
print('Confusion matrix:')
print(confusion)


/home/ubuntu/miniconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Total emails classified: 5512
Score: 0.9734716524062669
Confusion matrix:
[[3993   19]
 [  59 1441]]

In [20]:
k_fold = KFold(n_splits=6)
scores = []
confusion = np.array([[0, 0], [0, 0]])
for train_indices, test_indices in k_fold.split(data):
    train_text = data.iloc[train_indices]['text'].values
    train_y = data.iloc[train_indices]['class'].values

    test_text = data.iloc[test_indices]['text'].values
    test_y = data.iloc[test_indices]['class'].values

    pipeline2.fit(train_text, train_y)
    predictions = pipeline2.predict(test_text)

    confusion += confusion_matrix(test_y, predictions)
    score = f1_score(test_y, predictions, pos_label=SPAM)
    scores.append(score)

print('Total emails classified:', len(data))
print('Score:', sum(scores)/len(scores))
print('Confusion matrix:')
print(confusion)


Total emails classified: 5512
Score: 0.954718325061951
Confusion matrix:
[[4005    7]
 [ 123 1377]]

In [ ]: