Describing parameters:
In [ ]:
PATH_NEWS_ARTICLES = ""  # path to the news-articles CSV file (left blank here)
In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.snowball import SnowballStemmer
import re
import pickle
import pandas as pd
import gensim
from gensim import corpora, models
In [ ]:
# Load the news articles into a DataFrame
df = pd.read_csv(PATH_NEWS_ARTICLES)
df.head(5)
In [ ]:
stop_words = set(stopwords.words('english'))
tknzr = TweetTokenizer()
stemmer = SnowballStemmer("english")
In [ ]:
def clean_text(text):
    cleaned_text = re.sub(r'[^\w_\s-]', ' ', text)  # remove punctuation marks and other symbols
    return cleaned_text

def tokenize(text):
    words = tknzr.tokenize(text)  # tokenization
    filtered_sentence = [w for w in words if w.lower() not in stop_words]  # remove stop words
    stemmed_filtered_tokens = [stemmer.stem(w) for w in filtered_sentence]  # stemming
    tokens = [t for t in stemmed_filtered_tokens if t.isalpha() and len(t) > 1]  # drop non-alphabetic and single-character tokens
    return tokens
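A quick sanity check of the cleaning and tokenization pipeline (illustrative only; the sample sentence below is made up, not taken from the dataset):
In [ ]:
# Run a hypothetical sentence through clean_text and tokenize
sample_sentence = "India's economy grew rapidly, surprising many economists!"
tokenize(clean_text(sample_sentence))
# Expected: stemmed tokens with punctuation, stop words and single-character
# tokens removed, e.g. something like ['india', 'economi', 'grew', 'rapid', ...]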
In [ ]:
# Cleaning and tokenizing all articles
# Returns a list containing the list of tokens of each article
def text_processing():
    news_articles = df['Content'].tolist()
    cleaned_text = list(map(clean_text, news_articles))
    article_vocabulary = list(map(tokenize, cleaned_text))
    return article_vocabulary
In [ ]:
article_vocabulary = text_processing()
In [ ]:
# Parameters for LDA:
# NUMBER_OF_TOPICS is the number of latent topics to extract from the training corpus.
NUMBER_OF_TOPICS = 5
# PASSES is the number of passes over the training corpus during training.
PASSES = 1
# NUMBER_OF_WORDS is the number of top words to show per topic in the topic-word distribution.
NUMBER_OF_WORDS = 10
In [ ]:
# Mapping vocabulary to IDs
dictionary = corpora.Dictionary(article_vocabulary)
pickle.dump(dictionary, open("dictionary_of_vocabulary.p", "wb"))
list(zip(dictionary.keys(), dictionary.values()))  # inspect the ID-to-token mapping
In [ ]:
# Converting each tokenized article into a bag-of-words vector
corpus = [dictionary.doc2bow(text) for text in article_vocabulary]
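To see what this representation looks like, the first document can be inspected (an illustrative check, not part of the original pipeline):
In [ ]:
# Each document is a sparse bag-of-words: a list of (token_id, count) tuples.
# The dictionary maps the IDs back to the original tokens.
print(corpus[0][:10])
print([(dictionary[token_id], count) for token_id, count in corpus[0][:10]])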
In [ ]:
# Training the LDA model
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=NUMBER_OF_TOPICS, passes=PASSES)
lda.save('lda.model')
In [ ]:
# Topic-word distribution
topic_words = lda.show_topics(num_topics=NUMBER_OF_TOPICS, num_words=NUMBER_OF_WORDS)  # list of (topic_id, top-words string) tuples
topic_words
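The same distribution can also be retrieved as (word, probability) pairs rather than formatted strings, using show_topics with formatted=False (shown here only as an illustration):
In [ ]:
# Topic-word distribution as (word, probability) pairs per topic
for topic_id, word_probs in lda.show_topics(num_topics=NUMBER_OF_TOPICS,
                                            num_words=NUMBER_OF_WORDS,
                                            formatted=False):
    print(topic_id, word_probs)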
In [ ]:
# Article-topic distribution for every article
def get_article_topic_distribution(article):
    return lda.get_document_topics(article)

# Returns a list containing a list of tuples:
# each inner list corresponds to an article, and each tuple is (topic_id, probability)
list(map(get_article_topic_distribution, corpus))
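As an illustrative follow-up (not part of the original pipeline; the helper name dominant_topic is hypothetical), the single most probable topic of each article can be extracted from its distribution:
In [ ]:
# Hypothetical helper: pick the most probable topic for an article
def dominant_topic(article_bow):
    topic_probs = lda.get_document_topics(article_bow)
    return max(topic_probs, key=lambda pair: pair[1])  # (topic_id, probability)

[dominant_topic(article) for article in corpus[:5]]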
In [ ]:
new_article = """At the dawn of history India started on her unending quest, and trackless centuries are filled with her
striving and the grandeur of her success and her failures. Through good and ill fortune alike she has
never lost sight of that quest or forgotten the ideals which gave her strength. We end today a period of
ill fortune and India discovers herself again. The achievement we celebrate today is but a step, an opening
of opportunity, to the greater triumphs and achievements that await us.
Are we brave enough and wise enough to grasp this opportunity and accept the challenge of the future?"""
Describing parameters:
In [ ]:
DICTIONARY_PATH = "dictionary_of_vocabulary.p"
LDA_MODEL_PATH = "lda.model"
In [ ]:
# Cleaning and tokenizing the new article
cleaned_text = clean_text(new_article)
article_vocabulary = tokenize(cleaned_text)
In [ ]:
# Load the model dictionary
model_dictionary = pickle.load(open(DICTIONARY_PATH, "rb"))
# Generate the article's bag-of-words mapping using the IDs associated with the vocabulary
corpus = [model_dictionary.doc2bow(text) for text in [article_vocabulary]]
In [ ]:
#Load LDA Model
lda = models.LdaModel.load(LDA_MODEL_PATH)
In [ ]:
# Article-topic distribution for the new article
article_topic_distribution = lda.get_document_topics(corpus[0])
article_topic_distribution
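As a final illustrative step (an assumption, not part of the original notebook), the dominant topic of the new article can be reported together with its top words; note that lda.show_topic returns (word, probability) pairs in recent gensim versions:
In [ ]:
# Pick the most probable topic for the new article and show its top words
best_topic, best_prob = max(article_topic_distribution, key=lambda pair: pair[1])
print("Dominant topic:", best_topic, "probability:", best_prob)
print(lda.show_topic(best_topic, topn=NUMBER_OF_WORDS))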