In [1]:
    
%%bash
# List CSV files in the working directory.
# Escape the dot and anchor the suffix: plain `grep .csv` treats `.` as
# "any character", so names like `mycsvx` would also match.
ls | grep '\.csv$'
    
    
In [2]:
    
# built-in libs
import email
# processing libs
import pandas as pd
# display libs
# tqdm.auto picks the notebook widget when available and falls back to a
# console bar; the old `from tqdm import tqdm_notebook` entry point is
# deprecated (and removed in newer tqdm releases).  The alias keeps the
# downstream cells that call `tqdm_notebook(...)` working unchanged.
from tqdm.auto import tqdm as tqdm_notebook
    
In [3]:
    
# Work on only the first 10,000 rows of the (large) Enron dump.
# `nrows` reads exactly the same rows as chunksize=10000 + next(), but
# without leaving an open TextFileReader (and its file handle) behind.
emails_df = pd.read_csv('emails.csv', nrows=10000)
    
In [4]:
    
# Row/column count of the 10k-row sample, then a rich preview of the head
# (last expression in the cell renders as HTML).
print(emails_df.shape)
emails_df.head()
    
    
    Out[4]:
In [5]:
    
# Column dtypes, non-null counts, and memory footprint of the sample.
emails_df.info()
    
    
In [6]:
    
%time
messages_obj_lst = []
messages_str_lst = []
message_metadata = {}
for i in tqdm_notebook(range(emails_df.shape[0])):
    msg = email.message_from_string(emails_df.message[i])
    
    for msg_property in msg:
        if msg_property in message_metadata:
            message_metadata[msg_property][i] = msg[msg_property]
        else:
            message_metadata[msg_property] = ['N/A'] * emails_df.shape[0]
    
    payload = msg.get_payload() # decode=True
    
    messages_obj_lst.append(msg)
    messages_str_lst.append(payload) #.encode('utf-8').decode('unicode_escape')
    #except KeyboardInterrupt:
    #    break
print('messages_obj_lst size: %i' % len(messages_obj_lst))
    
    
    
 
 
    
In [7]:
    
# update dataframe object
# emails_df.rename(columns = {'message':'message_obj'}, inplace = True)
# Attach the parsed Message objects and their body text as new columns.
emails_df = emails_df.assign(message_obj = pd.Series(messages_obj_lst).values)
emails_df = emails_df.assign(payload     = pd.Series(messages_str_lst).values)
# print(emails_df.payload.str.contains(r'\\'))
# Strip newline characters from the body.  regex=False makes the intent
# explicit: pandas changed str.replace's default from regex=True to
# regex=False, and under the new default the old r'\n' pattern would match
# a literal backslash + 'n' instead of a newline.
emails_df['payload'] = emails_df.payload.str.replace('\n', '', regex=False)
    
In [8]:
    
# Re-inspect the frame now that message_obj/payload columns are attached.
emails_df.head()
    
    Out[8]:
In [9]:
    
# Spot-check the Subject header of the first 50 parsed messages.
for parsed_msg in emails_df.message_obj[:50]:
    print(parsed_msg['Subject'])
    
    
In [10]:
    
# Free the large intermediate lists and drop the raw message text; the
# parsed message_obj/payload columns supersede it.
del messages_obj_lst
del messages_str_lst
# Avoid inplace=True: assigning the result is the idiomatic pandas form
# and avoids hidden-state mutation of a frame earlier cells displayed.
emails_df = emails_df.drop(columns='message')
    
In [ ]:
    
    
In [11]:
    
# NOTE(review): these sklearn imports are currently unused -- the
# CountVectorizer/LogisticRegression cells below are commented out.
# Drop them or re-enable the vectorizer experiment.
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
    
In [12]:
    
# Positional 7,000 / 3,000 split of the 10k-row sample.
# .iloc makes the position-based (not label-based) slicing explicit, so the
# split stays correct even if the frame's index is ever changed upstream.
train = emails_df.iloc[:7000]
test = emails_df.iloc[7000:]
    
In [13]:
    
# Subject line of every training message; messages without a Subject header
# yield None and are filtered out below.
# (Bug fix: the original indexed emails_df while looping over train's
# length -- equivalent only because train happens to be the leading slice.
# Index train directly.)
trainheadlines = [train.message_obj[row]['Subject'] for row in range(len(train.index))]
trainheadlines = list(filter(None, trainheadlines))
# Preview the first ten subjects (the slice itself displays; no copy needed).
trainheadlines[:10]
    
    Out[13]:
In [14]:
    
# trainvect = CountVectorizer()
# Trainfeature = trainvect.fit_transform(trainheadlines)
    
In [15]:
    
# ####Detailed view of Document Count Matrix
# DTM_With_Colm = pd.DataFrame(Trainfeature.toarray(),columns= trainvect.get_feature_names())
    
In [16]:
    
# Trainfeature.shape
    
In [17]:
    
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
    
In [18]:
    
%time
tokenizer = RegexpTokenizer(r'\w+')
# create English stop words list
en_stop = get_stop_words('en')
# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
    
#Our Document
trainheadlines
# list for tokenized documents in loop
texts = []
# loop through document list
for i in trainheadlines:
    
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    # add tokens to list
    texts.append(stemmed_tokens)
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)
    
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]
    
    
In [19]:
    
%time
#generate LDA
# NOTE(review): %time is a line magic and times only the (empty) rest of its
# own line; use %%time as the first cell line to time the fit.
# passes=1 with update_every=1 is a quick single-sweep online fit over the
# training subjects -- topics will be rough; raise passes for a real run.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word = dictionary, passes=1,chunksize=10000,update_every=1)
    
    
In [20]:
    
%time
# NOTE(review): pyLDAvis.gensim is unused in this cell (it is imported again
# where it is actually used below), and %time times nothing here.
import pyLDAvis.gensim
# Top 3 words of each of the 10 topics.
print(ldamodel.print_topics(num_topics=10, num_words=3))
    
    
In [21]:
    
# Top words of 5 of the fitted topics (rich Out display, default word count).
ldamodel.print_topics(5)
    
    Out[21]:
In [22]:
    
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
# Build the interactive topic-model visualisation (inter-topic distance map
# plus per-topic term bars).
# NOTE(review): in pyLDAvis >= 3 this module was renamed to
# pyLDAvis.gensim_models -- confirm against the installed version.
news = pyLDAvis.gensim.prepare(ldamodel,corpus, dictionary)
    
    
In [23]:
    
# Render the pyLDAvis widget (bare last expression displays it inline).
news
    
    Out[23]:
In [ ]:
    
    
In [24]:
    
# %%bash
# nvidia-smi
    
In [ ]:
    
    
In [ ]: