Describing parameters:
In [1]:
PATH_ARTICLE_TOPIC_DISTRIBUTION = "/home/phoenix/Documents/HandsOn/Final/python/Topic Model/model/Article_Topic_Distribution.csv"
PATH_NEWS_ARTICLES = "/home/phoenix/Documents/HandsOn/Final/news_articles.csv"
NO_OF_TOPICS = 150
ARTICLES_READ = [7, 6, 76, 61, 761]
NUM_RECOMMENDED_ARTICLES = 5
In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
In [3]:
article_topic_distribution = pd.read_csv(PATH_ARTICLE_TOPIC_DISTRIBUTION)
article_topic_distribution.shape
Out[3]:
In [4]:
article_topic_distribution.head()
Out[4]:
Generate Article-Topic Distribution matrix
In [5]:
#Pivot the dataframe
article_topic_pivot = article_topic_distribution.pivot(index='Article_Id', columns='Topic_Id', values='Topic_Weight')
#Fill NaN with 0
article_topic_pivot.fillna(value=0, inplace=True)
#Get the values in dataframe as matrix
articles_topic_matrix = article_topic_pivot.values
articles_topic_matrix.shape
Out[5]:
In [6]:
article_topic_pivot.head()
Out[6]:
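As a quick, optional sanity check: each row of the pivot holds one article's topic distribution, so the weights in a row should sum to roughly 1 (slightly less wherever low-weight topics were dropped when the CSV was generated). A minimal check:
# Optional: verify each article's topic weights sum to ~1
row_sums = articles_topic_matrix.sum(axis=1)
print(row_sums.min(), row_sums.max())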
A user vector is represented as the average of the topic vectors of the articles the user has read
In [7]:
#Represent the user by the topic distributions of the read articles
row_idx = np.array(ARTICLES_READ)
read_articles_topic_matrix = articles_topic_matrix[row_idx[:, None]]  # shape: (n_read, 1, NO_OF_TOPICS)
#Averaging over the read articles keeps a 2-D (1, NO_OF_TOPICS) vector, as cosine_similarity expects
user_vector = np.mean(read_articles_topic_matrix, axis=0)
user_vector.shape
Out[7]:
In [8]:
user_vector
Out[8]:
In [9]:
def calculate_cosine_similarity(articles_topic_matrix, user_vector):
    articles_similarity_score = cosine_similarity(articles_topic_matrix, user_vector)
    recommended_articles_id = articles_similarity_score.flatten().argsort()[::-1]
    #Remove read articles from recommendations
    final_recommended_articles_id = [article_id for article_id in recommended_articles_id
                                     if article_id not in ARTICLES_READ][:NUM_RECOMMENDED_ARTICLES]
    return final_recommended_articles_id
In [10]:
recommended_articles_id = calculate_cosine_similarity(articles_topic_matrix, user_vector)
recommended_articles_id
Out[10]:
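If you also want to inspect how similar each recommendation is, a small variant of the function (a hypothetical helper, not part of the original flow) can return the scores alongside the IDs:
# Hypothetical variant returning (article_id, similarity_score) pairs
def recommend_with_scores(articles_topic_matrix, user_vector):
    scores = cosine_similarity(articles_topic_matrix, user_vector).flatten()
    ranked = scores.argsort()[::-1]
    return [(int(i), float(scores[i])) for i in ranked
            if i not in ARTICLES_READ][:NUM_RECOMMENDED_ARTICLES]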
In [11]:
#Recommended articles and their titles
news_articles = pd.read_csv(PATH_NEWS_ARTICLES)
print('Articles Read')
print(news_articles.loc[news_articles['Article_Id'].isin(ARTICLES_READ)]['Title'])
print('\n')
print('Recommended Articles')
print(news_articles.loc[news_articles['Article_Id'].isin(recommended_articles_id)]['Title'])
User Vector = (Alpha) * (Topic Vector) + (1 - Alpha) * (NER Vector)
where Alpha controls the relative weight of the topic vector and the named-entity (NER) vector (here Alpha = 0.5).
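For intuition, here is the blend on made-up 3-topic vectors (toy numbers, not from the model):
# Toy illustration of the Alpha blend (hypothetical 3-topic vectors)
topic_vec = np.array([0.6, 0.3, 0.1])
ner_vec = np.array([0.2, 0.2, 0.6])
0.5 * topic_vec + (1 - 0.5) * ner_vec   # -> array([0.4 , 0.25, 0.35])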
In [12]:
ALPHA = 0.5
DICTIONARY_PATH = "/home/phoenix/Documents/HandsOn/Final/python/Topic Model/model/dictionary_of_words.p"
LDA_MODEL_PATH = "/home/phoenix/Documents/HandsOn/Final/python/Topic Model/model/lda.model"
In [13]:
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import tree2conlltags
import re
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.snowball import SnowballStemmer
import pickle
import gensim
from gensim import corpora, models
In [14]:
#Select the read articles and average their topic vectors (same as before)
row_idx = np.array(ARTICLES_READ)
read_articles_topic_matrix = articles_topic_matrix[row_idx[:, None]]
#Calculate the average of the read articles' topic vectors
user_topic_vector = np.mean(read_articles_topic_matrix, axis=0)
user_topic_vector.shape
Out[14]:
In [15]:
# Get NERs of read articles
def get_ner(article):
    ne_tree = ne_chunk(pos_tag(word_tokenize(article)))
    iob_tagged = tree2conlltags(ne_tree)
    #Discard tokens with the 'O' (Other) tag, i.e. tokens outside any named entity
    ner_token = ' '.join([token for token, pos, ner_tag in iob_tagged if ner_tag != u'O'])
    return ner_token
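A quick check on a made-up sentence (requires NLTK's punkt, averaged_perceptron_tagger, maxent_ne_chunker and words data packages; the exact output depends on the chunker):
# Only tokens inside named-entity chunks survive, e.g. person and place names
get_ner("Narendra Modi addressed a rally in New Delhi on Monday")
# -> something like 'Narendra Modi New Delhi'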
In [16]:
articles = news_articles['Content'].tolist()
user_articles_ner = ' '.join([get_ner(articles[i]) for i in ARTICLES_READ])
print "NERs of Read Article =>", user_articles_ner
In [17]:
stop_words = set(stopwords.words('english'))
tknzr = TweetTokenizer()
stemmer = SnowballStemmer("english")
In [18]:
def clean_text(text):
    #Remove punctuation marks and other symbols
    cleaned_text = re.sub(r'[^\w_\s-]', ' ', text)
    return cleaned_text

def tokenize(text):
    words = tknzr.tokenize(text)                                           #tokenization
    filtered_sentence = [w for w in words if not w.lower() in stop_words]  #removing stop words
    stemmed_filtered_tokens = [stemmer.stem(w) for w in filtered_sentence] #stemming
    #Keep alphabetic tokens at least two characters long
    tokens = [i for i in stemmed_filtered_tokens if i.isalpha() and len(i) > 1]
    return tokens
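For example (stemmed forms are approximate and depend on the Snowball stemmer):
# Punctuation stripped, stop words removed, remaining tokens stemmed
tokenize(clean_text("India's economy is growing, say economists!"))
# -> something like ['india', 'economi', 'grow', 'say', 'economist']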
In [19]:
#Clean and tokenize the concatenated NER string
cleaned_text = clean_text(user_articles_ner)
article_vocabulary = tokenize(cleaned_text)
In [20]:
#Load model dictionary
model_dictionary = pickle.load(open(DICTIONARY_PATH,"rb"))
#Generate the article's bag-of-words mapping using IDs associated with the vocabulary
corpus = [model_dictionary.doc2bow(text) for text in [article_vocabulary]]
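doc2bow maps each token to its dictionary ID and counts occurrences; tokens missing from the model dictionary are silently dropped. To peek at the result:
# Inspect the bag-of-words: a list of (token_id, count) pairs
print(corpus[0][:5])
# Map the IDs back to tokens for readability
print([(model_dictionary[token_id], count) for token_id, count in corpus[0][:5]])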
In [21]:
#Load LDA Model
lda = models.LdaModel.load(LDA_MODEL_PATH)
In [22]:
# Get topic distribution for the concatenated NERs
article_topic_distribution=lda.get_document_topics(corpus[0])
article_topic_distribution
Out[22]:
In [23]:
ner_vector = [0] * NO_OF_TOPICS
for topic_id, topic_weight in article_topic_distribution:
    ner_vector[topic_id] = topic_weight
user_ner_vector = np.asarray(ner_vector).reshape(1, NO_OF_TOPICS)
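As an aside, gensim ships a helper that performs the same sparse-to-dense conversion; an equivalent one-liner (same NO_OF_TOPICS assumption):
# Equivalent dense conversion using gensim's helper
from gensim import matutils
user_ner_vector = matutils.sparse2full(article_topic_distribution, NO_OF_TOPICS).reshape(1, NO_OF_TOPICS)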
In [24]:
alpha_topic_vector = ALPHA*user_topic_vector
alpha_ner_vector = (1-ALPHA) * user_ner_vector
user_vector = np.add(alpha_topic_vector,alpha_ner_vector)
user_vector
Out[24]:
In [25]:
recommended_articles_id = calculate_cosine_similarity(articles_topic_matrix, user_vector)
recommended_articles_id
# Cosine similarity scores of the five recommendations: [0.7581, 0.7464, 0.7444, 0.7421, 0.7397]
Out[25]:
In [26]:
#Recommended articles and their titles
news_articles = pd.read_csv(PATH_NEWS_ARTICLES)
print('Articles Read')
print(news_articles.loc[news_articles['Article_Id'].isin(ARTICLES_READ)]['Title'])
print('\n')
print('Recommended Articles')
print(news_articles.loc[news_articles['Article_Id'].isin(recommended_articles_id)]['Title'])