In [1]:
    
%%bash
# List only entries whose name ends in ".csv". The dot is escaped and the
# pattern anchored, because a bare `.csv` is a regex in which `.` matches any
# character (so e.g. "mycsvfile.txt" would also be listed).
ls | grep '\.csv$'
    
    
In [2]:
    
# %%bash
# pip3 install bokeh
    
In [3]:
    
# built-in libs
import email
# processing libs
import pandas as pd
# display libs
from tqdm import tqdm_notebook
    
In [4]:
    
# Open 'emails.csv' as a chunked reader (10000 rows per chunk) instead of
# loading the whole file into memory at once.
emails_full_df = pd.read_csv('emails.csv', chunksize=10000)
# Pull only the first chunk; the rest of the notebook works on this subset.
emails_df = next(emails_full_df)
    
In [5]:
    
# Report the (rows, columns) shape of the loaded chunk, then preview its first rows.
print(emails_df.shape)
emails_df.head()
    
    
    Out[5]:
In [6]:
    
# Print pandas' summary of the frame (columns, dtypes, non-null counts, memory).
emails_df.info()
    
    
In [7]:
    
%time
messages_obj_lst = []
messages_str_lst = []
message_metadata = {}
for i in tqdm_notebook(range(emails_df.shape[0])):
    msg = email.message_from_string(emails_df.message[i])
    
    for msg_property in msg:
        if msg_property in message_metadata:
            message_metadata[msg_property][i] = msg[msg_property]
        else:
            message_metadata[msg_property] = ['N/A'] * emails_df.shape[0]
    
    payload = msg.get_payload() # decode=True
    
    messages_obj_lst.append(msg)
    messages_str_lst.append(payload) #.encode('utf-8').decode('unicode_escape')
    #except KeyboardInterrupt:
    #    break
print('messages_obj_lst size: %i' % len(messages_obj_lst))
    
    
    
 
 
    
In [8]:
    
# Attach the parsed Message objects and their payloads as new columns.
emails_df = emails_df.assign(message_obj = pd.Series(messages_obj_lst).values)
emails_df = emails_df.assign(payload     = pd.Series(messages_str_lst).values)
# Collapse each payload onto a single line.
# - regex=False with a real '\n' makes the replacement independent of the
#   pandas version (the default of `regex` differs between releases, and with
#   regex off the raw string r'\n' is a literal backslash + 'n', matching nothing).
# - Newlines become a space rather than '' so the last word of one line is not
#   glued to the first word of the next.
# NOTE(review): payloads that are not plain strings (get_payload() returns a
# list for multipart messages) become NaN under the .str accessor - confirm
# the data contains none, or handle them before building the corpus.
emails_df['payload'] = emails_df.payload.str.replace('\n', ' ', regex=False)
    
In [9]:
    
# Preview the frame again, now with the message_obj and payload columns.
emails_df.head()
    
    Out[9]:
In [10]:
    
# Drop the raw 'message' column now that it has been parsed into
# message_obj / payload. Guarded and rebinding (no inplace=True) so that
# re-running this cell is a no-op instead of raising KeyError.
if 'message' in emails_df.columns:
    emails_df = emails_df.drop('message', axis=1)
    
In [ ]:
    
    
In [11]:
    
# Join the payloads of (at most) the first 50000 rows into one newline-separated
# text, then turn every line into a list of lower-cased, space-separated tokens.
corpus_text = '\n'.join(emails_df[:50000].payload)
sentences = [line.lower().split(' ') for line in corpus_text.split('\n')]
    
In [12]:
    
def clean(s):
    """
    Return a new list holding every token of ``s`` with leading and trailing
    punctuation (``, . " ! ? : ; ( ) '``) stripped. Punctuation inside a
    token is left alone, and tokens that end up empty are kept.
    """
    punctuation = ',."!?:;()\''
    stripped = []
    for token in s:
        stripped.append(token.strip(punctuation))
    return stripped
# Clean every tokenised line, then drop empty tokens and any line left with no
# tokens at all. The old `len(s) > 0` guard never filtered anything, because
# str.split(' ') always returns at least [''], so empty-string "words" were
# being passed on as vocabulary.
cleaned_sentences = ([w for w in clean(s) if w] for s in sentences)
sentences = [tokens for tokens in cleaned_sentences if tokens]
    
In [ ]:
    
    
In [13]:
    
from gensim.models import Word2Vec
# Train a Word2Vec model directly on the tokenised sentences.
# NOTE(review): `size=` is the gensim < 4 spelling of the vector dimensionality
# (renamed `vector_size` in gensim 4) - confirm the installed version.
# NOTE(review): no seed is passed and workers=4, so results presumably differ
# between runs - confirm whether reproducibility matters here.
model = Word2Vec(sentences, size=100, window=5, min_count=3, workers=4)
    
In [14]:
    
# Shorthand for the model's word-vector container, used by the lookups below.
vectors = model.wv
# The full model is kept alive: later cells still reference `model` directly.
# del model
    
In [15]:
    
# Show the learned embedding vector for the word 'good'.
vectors['good']
    
    Out[15]:
In [16]:
    
# Compare the similarity score of two word pairs: ('you', 'your') against
# ('you', 'internet').
print(vectors.similarity('you', 'your'))
print(vectors.similarity('you', 'internet'))
    
    
    
In [17]:
    
# Terms most similar to 'kill' according to the trained vectors.
vectors.most_similar('kill')
    
    
    Out[17]:
In [18]:
    
# Number of distinct terms in the model's vocabulary.
# NOTE(review): `wv.vocab` is the gensim < 4 attribute - confirm the installed version.
len(model.wv.vocab)
    
    Out[18]:
In [19]:
    
# Make sure the L2-normalised vectors exist before reading `syn0norm`: that
# attribute is only filled in by init_sims(). Without this call the cell only
# works when an earlier most_similar() lookup happened to trigger the
# normalisation (hidden state). Calling it again is harmless.
model.wv.init_sims()
# build a list of the terms, integer indices,
# and term counts from the word2vec model vocabulary
ordered_vocab = [(term, voc.index, voc.count) for term, voc in model.wv.vocab.items()]
# sort by the term counts, so the most common terms appear first
ordered_vocab = sorted(ordered_vocab, key=lambda k: -k[2])
# unzip the terms, integer indices, and counts into separate lists
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)
# create a DataFrame with the normalised word vectors as data,
# and the terms as row labels (rows ordered from most to least frequent)
word_vectors = pd.DataFrame(model.wv.syn0norm[list(term_indices), :], index=ordered_terms)
word_vectors.head(3)
    
    
    Out[19]:
In [20]:
    
def get_related_terms(token, topn=10):
    """
    Look up the ``topn`` terms most similar to ``token`` and print them,
    one per line, as ``<term> <similarity rounded to 3 decimals>``.

    The lookup goes through ``model.wv`` (the same word-vector container
    the other cells query via ``vectors``) rather than the deprecated
    model-level ``most_similar``. Reads the notebook-level ``model``;
    returns nothing.
    """
    for word, similarity in model.wv.most_similar(positive=[token], topn=topn):
        print(word, round(similarity, 3))
    
In [21]:
    
# Print the terms most similar to 'illegal'.
get_related_terms(u'illegal')
    
    
    
In [22]:
    
# Print the terms most similar to 'killed'.
get_related_terms(u'killed')
    
    
    
In [23]:
    
# Print the terms most similar to 'contract'.
get_related_terms(u'contract')
    
    
    
In [24]:
    
# Print the terms most similar to 'fired'.
get_related_terms(u'fired')
    
    
    
In [25]:
    
def word_algebra(add=None, subtract=None, topn=1):
    """
    Combine the vectors of the words given in ``add`` (positive) and
    ``subtract`` (negative), look up the ``topn`` terms most similar to
    the combined vector, and print each term on its own line.

    ``add`` / ``subtract`` default to "no words"; ``None`` is used as the
    default instead of a shared mutable list. The lookup goes through
    ``model.wv`` rather than the deprecated model-level ``most_similar``.
    Reads the notebook-level ``model``; returns nothing.
    """
    positive = [] if add is None else add
    negative = [] if subtract is None else subtract
    answers = model.wv.most_similar(positive=positive, negative=negative, topn=topn)

    for term, _similarity in answers:
        print(term)
    
In [26]:
    
# Print the term closest to the combination 'i' + 'will'.
word_algebra(add=[u'i', u'will'])
    
    
    
In [27]:
    
# Print the term closest to the combination 'you' + 'will'.
word_algebra(add=[u'you', u'will'])
    
    
    
In [28]:
    
# Print the term closest to the combination 'i' + 'am'.
word_algebra(add=[u'i', u'am'])
    
    
    
In [29]:
    
# Print the term closest to the combination of the two given words.
word_algebra(add=[u'mother', u'fuck'])
    
    
    
In [ ]:
    
    
In [30]:
    
from sklearn.manifold import TSNE
    
In [31]:
    
# Only the first 5000 rows of word_vectors are fed to t-SNE.
tsne_input = word_vectors.head(5000)
    
In [32]:
    
# Peek at the first two rows of the t-SNE input.
tsne_input[:2]
    
    Out[32]:
In [33]:
    
%%time
tsne = TSNE()
tsne_vectors = tsne.fit_transform(tsne_input.values)
    
    
In [34]:
    
# Wrap the t-SNE output in a DataFrame: rows labelled with the terms from
# tsne_input, columns named as plot coordinates.
tsne_vectors = pd.DataFrame(
    tsne_vectors,
    columns=[u'x_coord', u'y_coord'],
    index=pd.Index(tsne_input.index),
)
tsne_vectors.head()
    
    Out[34]:
In [35]:
    
# Copy the index (the terms) into a regular 'word' column, which the plot's
# hover tooltip refers to as '@word'.
tsne_vectors[u'word'] = tsne_vectors.index
    
In [36]:
    
# Preview the coordinates frame, now including the 'word' column.
tsne_vectors.head()
    
    Out[36]:
In [37]:
    
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value
# Direct Bokeh output to the notebook so show() renders inline.
output_notebook()
    
    
    
In [38]:
    
# Expose the coordinates frame (x_coord, y_coord, word) to Bokeh.
plot_data = ColumnDataSource(tsne_vectors)

# 800x800 canvas with pan / zoom / select tools; wheel zoom is the active scroll tool.
tsne_plot = figure(
    title=u't-SNE Word Embeddings',
    plot_width=800,
    plot_height=800,
    tools=u'pan, wheel_zoom, box_zoom,box_select, reset',
    active_scroll=u'wheel_zoom',
)

# Hovering over a point shows the value of its 'word' column.
tsne_plot.add_tools(HoverTool(tooltips=u'@word'))

# One translucent blue circle per word, outlined in black while hovered.
tsne_plot.circle(
    u'x_coord',
    u'y_coord',
    source=plot_data,
    size=10,
    color=u'blue',
    fill_alpha=0.1,
    line_alpha=0.2,
    hover_line_color=u'black',
)

# Enlarge the title and remove axes, grid lines and the outline.
tsne_plot.title.text_font_size = value(u'16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# Render the figure; the trailing ';' suppresses the cell's return value.
show(tsne_plot);
    
    
    
In [ ]:
    
    
In [ ]:
    
    
In [ ]: