UCI SMS Spam Collection Dataset

Dataset URL: http://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection

A set of SMS messages, each labeled as ham (legitimate) or spam.


In [1]:
import pandas as pd
import string
import re
from sklearn import model_selection

In [2]:
# Tab-separated file with no header row: column 1 is the label, column 2 the raw message.
DATASET_FILE = 'data/sms-spam/SMSSpamCollection'
dataset = pd.read_csv(DATASET_FILE, sep='\t', names=['class','sms'])
dataset.head()


Out[2]:
class sms
0 ham Go until jurong point, crazy.. Available only ...
1 ham Ok lar... Joking wif u oni...
2 spam Free entry in 2 a wkly comp to win FA Cup fina...
3 ham U dun say so early hor... U c already then say...
4 ham Nah I don't think he goes to usf, he lives aro...

In [3]:
# Report the overall size and the ham/spam class balance of the dataset.
print("Dataset Size: {}".format(len(dataset)))
value_counts = dataset['class'].value_counts()
print(value_counts)
# Label-based lookup instead of positional [0]/[1]: positional int indexing on a
# labeled Series is deprecated and depends on value_counts() sort order.
print("ham %: {}".format(round(value_counts['ham'] / len(dataset) * 100, 2)))
# BUG FIX: this line previously printed the spam share under a "ham %" label.
print("spam %: {}".format(round(value_counts['spam'] / len(dataset) * 100, 2)))


Dataset Size: 5572
ham     4825
spam     747
Name: class, dtype: int64
ham %: 86.59
ham %: 13.41

Create Training and Validation Datasets


In [4]:
# Characters stripped from every message before the dataset is written as TSV
# (tabs would break the field separator; quotes would confuse CSV parsing).
exclude = ['\t', '"']

def clean_text(text):
    """Remove excluded characters, lowercase, and trim surrounding whitespace."""
    cleaned = text
    for ch in exclude:
        cleaned = cleaned.replace(ch, '')
    return cleaned.lower().strip()

# Normalize every message in place so train and validation share preprocessing.
# (Plain comprehension: the original map(lambda text: clean_text(text), ...) was
# a redundant lambda wrapper.)
dataset['sms'] = [clean_text(text) for text in dataset['sms'].values]

# Stratified split preserves the ham/spam ratio in both partitions;
# fixed random_state keeps the split reproducible across runs.
splitter = model_selection.StratifiedShuffleSplit(n_splits=1,
                                                  test_size=0.25,
                                                  random_state=19850610)

# n_splits=1, so take the single (train, valid) index pair directly.
train_index, valid_index = next(splitter.split(X=dataset['sms'], y=dataset['class']))

# BUG FIX: StratifiedShuffleSplit returns POSITIONAL indices, so index with
# .iloc — .loc only worked here by accident because the frame still has its
# default RangeIndex; it would silently misselect rows after any reindexing.
train_df = dataset.iloc[train_index, :]
print(len(train_df))

valid_df = dataset.iloc[valid_index, :]
print(len(valid_df))


4179
1393

In [5]:
def print_class_distribution(name, df):
    """Print the ham/spam counts and percentages for one dataset split.

    Args:
        name: heading printed above the statistics (e.g. "Training Set").
        df: DataFrame with a 'class' column containing 'ham'/'spam' labels.
    """
    counts = df['class'].value_counts()
    print(name)
    print(counts)
    # Label-based lookup avoids relying on value_counts() ordering.
    print("ham %: {}".format(round(counts['ham'] / len(df) * 100, 2)))
    # BUG FIX: the spam share was previously printed under a "ham %" label.
    print("spam %: {}".format(round(counts['spam'] / len(df) * 100, 2)))

# Single helper replaces the copy-pasted train/validation print blocks.
print_class_distribution("Training Set", train_df)
print("")
print_class_distribution("Validation Set", valid_df)


Training Set
ham     3619
spam     560
Name: class, dtype: int64
ham %: 86.6
ham %: 13.4

Validation Set
ham     1206
spam     187
Name: class, dtype: int64
ham %: 86.58
ham %: 13.42

Save Training and Validation Datasets


In [6]:
# Persist both splits as headerless, index-free TSV (class<TAB>sms),
# matching the reload cells below.
for frame, path in ((train_df, "data/sms-spam/train-data.tsv"),
                    (valid_df, "data/sms-spam/valid-data.tsv")):
    frame.to_csv(path, header=False, index=False, sep='\t')

In [7]:
pd.read_csv("data/sms-spam/train-data.tsv", sep='\t', names=['class','sms']).tail()


Out[7]:
class sms
4174 ham just woke up. yeesh its late. but i didn't fal...
4175 ham what do u reckon as need 2 arrange transport i...
4176 spam free entry into our £250 weekly competition ju...
4177 spam -pls stop bootydelious (32/f) is inviting you ...
4178 ham tell my bad character which u dnt lik in me. ...

In [12]:
pd.read_csv("data/sms-spam/valid-data.tsv", sep='\t', names=['class','sms']).tail()


Out[12]:
class sms
1387 ham true dear..i sat to pray evening and felt so.s...
1388 ham what will we do in the shower, baby?
1389 ham where are you ? what are you doing ? are yuou ...
1390 spam ur cash-balance is currently 500 pounds - to m...
1391 spam not heard from u4 a while. call 4 rude chat pr...

Calculate Vocabulary


In [9]:
def get_vocab(texts=None):
    """Build the list of unique space-delimited tokens across messages.

    Args:
        texts: optional iterable of strings to tokenize. Defaults to the
            cleaned training messages (train_df['sms']), so existing
            zero-argument calls behave exactly as before.

    Returns:
        A list of unique tokens, with the empty token removed. Order is
        unspecified (it comes from a set, as in the original).
    """
    if texts is None:
        texts = train_df['sms'].values
    vocab = set()
    for text in texts:
        # Simple whitespace tokenization; consecutive spaces yield '' tokens,
        # which are dropped below.
        vocab.update(text.split(' '))
    # BUG FIX: discard() instead of remove() — remove() raises KeyError when
    # no message produces an empty token (e.g. no double/leading spaces).
    vocab.discard('')
    return list(vocab)

In [10]:
# Materialize the training vocabulary and inspect a small sample of it.
vocab = get_vocab()
print(len(vocab))
vocab[10:20]  # NOTE: set ordering is arbitrary, so this sample can vary per run


11330
Out[10]:
['child',
 'place..',
 'hi..i',
 'oso?',
 'home!',
 'lasting',
 'there..do',
 'clock',
 'advice',
 'free...']

Save Vocabulary


In [11]:
# Sentinel padding token (per its name); written as the first line of the
# vocabulary file, ahead of every real word.
PAD_WORD = '#=KS=#'

# One token per line: PAD_WORD followed by the full vocabulary.
with open('data/sms-spam/vocab_list.tsv', 'w') as vocab_file:
    vocab_file.write('\n'.join([PAD_WORD] + vocab) + '\n')

# NOTE(review): n_words excludes PAD_WORD (matches the original behavior) —
# confirm downstream readers expect the count without the pad token.
with open('data/sms-spam/n_words.tsv', 'w') as count_file:
    count_file.write(str(len(vocab)))

In [ ]: