In [53]:
import nltk
import numpy as np
import pprint
import utils as utl
from time import time
from gensim import corpora, models, utils
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import EnglishStemmer
from tqdm import tqdm_notebook as tqdm  # notebook-friendly progress bars
In [39]:
authorID_to_titles = utl.load_pickle("../pmi_data/authorID_to_publications_clean.p")
For the topic extraction part we will use the dictionary of author -> list_of_publications collected in the previous step. We first need to do some preprocessing.
In [4]:
#Uncomment this cell if you don't have the data on your computer
#nltk.download("stopwords")
#nltk.download("wordnet")
For the stop words we use the list provided by NLTK. Since this set is rather small, we also include other common English stop words found online, plus a few words specific to our titles.
In [73]:
english_stop_words = ["a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also","although","always","am","among", "amongst", "amoungst", "amount", "an", "and", "another", "any","anyhow","anyone","anything","anyway", "anywhere", "are", "around", "as", "at", "back","be","became", "because","become","becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom","but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven","else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the",'like', 'think', 'know', 'want', 'sure', 'thing', 'send', 'sent', 'speech', 'print', 'time','want', 'said', 'maybe', 'today', 'tomorrow', 'thank', 'thanks']
specific_stop_words = ['base', 'use', 'model', 'process', 'network']
sw = stopwords.words('english') + english_stop_words + specific_stop_words
We decide to use a stemmer rather than a lemmatizer (both from NLTK). The reason is that we want to group together words with the same meaning: if one publication contains "algorithm" and another one contains "Algorithmic", it helps to map those two words to the same token. Even though our model should be able to capture the similarity between such words, merging them reduces the vocabulary and speeds up the training. Let's compare the output of a stemmer and a lemmatizer.
In [74]:
lemmatizer = WordNetLemmatizer()
stemmer = EnglishStemmer()
print("Stemmer",stemmer.stem("Algorithm"), stemmer.stem("Algorithmic"))
print("Lemmatizer",lemmatizer.lemmatize("algorithm"), lemmatizer.lemmatize("Algorithmic"))
Indeed, the lemmatizer keeps two different words, so let's use the stemmer.
In [75]:
def pre_processing(titles):
    """Tokenize, stem and remove stop words from a list of titles."""
    list_of_tokens = []
    for title in titles:
        tokens = utils.simple_preprocess(title)                # lowercase + tokenize
        tokens = [stemmer.stem(x) for x in tokens]             # stem every token
        tokens = list(filter(lambda t: t not in sw, tokens))   # drop stop words
        list_of_tokens.append(tokens)
    return list_of_tokens
authorID_to_titles_stem = {id_: pre_processing(titles) for id_, titles in tqdm(authorID_to_titles.items())}
utl.pickle_data(authorID_to_titles_stem, "../pmi_data/authorID_to_titles_stem.p")
We want to extract the k main topics among all the publications. Then, for each author, we will compute a score in each of those topics.
We use Latent Dirichlet Allocation (LDA) and the implementation provided by Gensim.
The principle behind LDA is that each document in a collection is a mixture of topics: a document contains words that belong to different categories. The goal of LDA is to recover those sets of words (the topics) from which the documents were generated.
We have a dictionary authorID -> list(list(tokens)), with each inner list representing one title.
The LDA implementation of Gensim takes as parameters:
- a dictionary token -> id
- a list of documents, each given as a list of (token, token_count) pairs
We use two functions provided by Gensim for this: corpora.Dictionary and its doc2bow method, illustrated below.
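To make the input format concrete, here is a small toy sketch (the words and the choice of two topics are arbitrary and independent of our data): corpora.Dictionary builds the token -> id mapping, doc2bow converts a document into (token_id, token_count) pairs, and the trained model returns a mixture of topics for each document.
In [ ]:
# Toy illustration of the Gensim input format and of the per-document topic mixture (arbitrary words)
toy_docs = [["graph", "algorithm", "graph", "complexity"],
            ["neural", "gradient", "learning"],
            ["graph", "learning", "algorithm"]]
toy_dict = corpora.Dictionary(toy_docs)               # token -> id
toy_corpus = [toy_dict.doc2bow(d) for d in toy_docs]  # each document as (token_id, token_count) pairs
print(toy_corpus[0])                                  # e.g. [(0, 1), (1, 1), (2, 2)]
toy_lda = models.LdaModel(toy_corpus, num_topics=2, id2word=toy_dict)
print(toy_lda[toy_corpus[0]])                         # e.g. [(0, 0.8), (1, 0.2)] -> a mixture of 2 topics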
Since we are dealing with titles, most of the time every word occurs exactly once in a title. All words then carry the same weight, which makes it hard for the algorithm to infer the probability p(topic | title).
Since we want to find the set of topics that represents an author, we have implicitly assumed that all the publications of one author fall into a subset of topics. So let's put all the publications of one author together, as if they were one big document.
In [76]:
authorID_to_titles_stem = utl.load_pickle("../pmi_data/authorID_to_titles_stem.p")
In [77]:
authorID_to_document = dict()
for author, titles in tqdm(authorID_to_titles_stem.items()):
    # Concatenate all the stemmed titles of an author into one big document
    authorID_to_document[author] = []
    for t in titles:
        authorID_to_document[author].extend(t)
Now we have a mapping author -> document. We can build the dictionary and transform each document into a list of (token, token_count) pairs.
In [78]:
dictionary = corpora.Dictionary([doc for doc in tqdm(authorID_to_document.values())])
corpus = [dictionary.doc2bow(doc) for doc in tqdm(authorID_to_document.values())]
Set up the parameters; we select 20 topics.
In [94]:
# parameters
num_topics = 20  # number of topics LDA has to extract
passes = 1       # number of passes over the corpus during LDA training
num_words = 5    # number of most important words per topic to print
In [80]:
tmp = corpus  # keep a backup of the full bag-of-words corpus
In [81]:
corpus = tmp  # restore the full corpus from the backup
corpus = np.random.choice(corpus, int(len(corpus) / 1000))  # random sample of ~0.1% of the documents for quick tests
In [82]:
len(corpus)
Out[82]:
In [91]:
# Keep only the authors whose aggregated document contains more than 100 distinct tokens
c = [doc for doc in tqdm(tmp) if len(doc) > 100]
len(c)
Out[91]:
In [95]:
start = time()
pp = pprint.PrettyPrinter(depth=2)
# Train the LDA model on the filtered corpus
lda = models.LdaModel(c, num_topics=num_topics, id2word=dictionary, passes=passes)
print("Training time:", round((time() - start) / 60, 2), "[min]")
pp.pprint(lda.print_topics(lda.num_topics, num_words=num_words))
lda.save('lda.model')
utl.pickle_data(lda, "../pmi_data/lda_model__20_100.p")
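If we come back to this notebook later, we can reload the persisted model instead of retraining it (a small sketch; 'lda.model' is the file saved above):
In [ ]:
# Reload the trained LDA model from disk instead of retraining
lda = models.LdaModel.load('lda.model')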
In [96]:
def compute_score(titles):
    """Sum the LDA topic scores of each title to get one score vector per author."""
    total_score = np.zeros(num_topics)
    for title in titles:
        # lda[bow] returns [(topic_id, score), ...] for the topics whose score is not negligible
        for id_, value in lda[dictionary.doc2bow(title)]:
            total_score[id_] += value
    return total_score
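As a quick optional sanity check (the author picked here is arbitrary), we can score one author and inspect the words of their dominant topic:
In [ ]:
# Optional sanity check on one arbitrary author: find and print the dominant topic
some_author = next(iter(authorID_to_titles_stem))
scores = compute_score(authorID_to_titles_stem[some_author])
dominant = int(np.argmax(scores))
print("dominant topic:", dominant)
print("top words:", lda.print_topic(dominant, num_words))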
In [98]:
# Score each author from his single aggregated document
score_by_author_by_document = [compute_score([doc]) for _, doc in tqdm(authorID_to_document.items())]
utl.pickle_data(score_by_author_by_document, "../pmi_data/score_by_author_by_document.p")
In [ ]:
# Score each author title by title and sum the per-title topic scores
score_by_author_by_titles = [compute_score(titles) for _, titles in tqdm(authorID_to_titles_stem.items())]
utl.pickle_data(score_by_author_by_titles, "../pmi_data/score_by_author_by_titles.p")