In [53]:
import nltk
import numpy as np
import pprint
import utils as utl
from time import time
from gensim import corpora, models, utils
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import EnglishStemmer
from tqdm import tqdm_notebook as tqdm  # notebook-friendly progress bars
In [39]:
authorID_to_titles = utl.load_pickle("../pmi_data/authorID_to_publications_clean.p")
For the topic extraction part we will use the dictionary of author -> list_of_publications collected in the previous step. We first need to do some preprocessing.
In [4]:
#Uncomment this cell if you don't have the data on your computer
#nltk.download("stopwords")
#nltk.download("wordnet")
For the stop words we use the list provided by NLTK. Since this set is rather small, we also include other common English stop words found online, plus a few words specific to our titles.
In [73]:
english_stop_words = ["a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also","although","always","am","among", "amongst", "amoungst", "amount", "an", "and", "another", "any","anyhow","anyone","anything","anyway", "anywhere", "are", "around", "as", "at", "back","be","became", "because","become","becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom","but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven","else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the",'like', 'think', 'know', 'want', 'sure', 'thing', 'send', 'sent', 'speech', 'print', 'time','want', 'said', 'maybe', 'today', 'tomorrow', 'thank', 'thanks']
specific_stop_words = ['base', 'use', 'model', 'process', 'network']
sw = stopwords.words('english') + english_stop_words + specific_stop_words
We decide to use a stemmer rather than a lemmatizer (both from NLTK). The reason is that we want to group together words with the same meaning: if one publication contains "algorithm" and another one contains "Algorithmic", it helps to map those two words to the same token. Even though our model should be able to capture the similarity between such words, merging them reduces the vocabulary and speeds up the training. Let's compare the output of a stemmer and a lemmatizer.
In [74]:
lemmatizer = WordNetLemmatizer()
stemmer = EnglishStemmer()
print("Stemmer",stemmer.stem("Algorithm"), stemmer.stem("Algorithmic"))
print("Lemmatizer",lemmatizer.lemmatize("algorithm"), lemmatizer.lemmatize("Algorithmic"))
Indeed, the lemmatizer keeps two different words, so let's use the stemmer.
In [75]:
def pre_processing(titles):
    """Tokenize, stem and remove stop words from a list of titles."""
    list_of_tokens = []
    for title in titles:
        tokens = utils.simple_preprocess(title)                # lowercase + tokenize
        tokens = [stemmer.stem(x) for x in tokens]             # stem every token
        tokens = list(filter(lambda t: t not in sw, tokens))   # drop stop words
        list_of_tokens.append(tokens)
    return list_of_tokens
authorID_to_titles_stem = {id_: pre_processing(titles) for id_, titles in tqdm(authorID_to_titles.items())}
utl.pickle_data(authorID_to_titles_stem, "../pmi_data/authorID_to_titles_stem.p")
We want to extract the k main topics among all the publications. Then, for each author, we will compute a score in each of those topics.
We use Latent Dirichlet Allocation (LDA) and the implementation provided by Gensim.
The principle behind LDA is that each document in a collection is a mixture of topics: a document contains words that belong to different categories. The goal of LDA is to recover those sets of words (the topics) from which the documents were generated.
We have a dictionary authorID -> list(list(tokens)), with each inner list representing one title.
The LDA implementation of Gensim takes as parameters:
- a dictionary token -> id
- a list of documents, each given as a list of (token, token_count) pairs
We use two functions provided by Gensim for this: corpora.Dictionary and its doc2bow method, illustrated below.
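To make the input format concrete, here is a small toy sketch (the words and the choice of two topics are arbitrary and independent of our data): corpora.Dictionary builds the token -> id mapping, doc2bow converts a document into (token_id, token_count) pairs, and the trained model returns a mixture of topics for each document.
In [ ]:
# Toy illustration of the Gensim input format and of the per-document topic mixture (arbitrary words)
toy_docs = [["graph", "algorithm", "graph", "complexity"],
            ["neural", "gradient", "learning"],
            ["graph", "learning", "algorithm"]]
toy_dict = corpora.Dictionary(toy_docs)               # token -> id
toy_corpus = [toy_dict.doc2bow(d) for d in toy_docs]  # each document as (token_id, token_count) pairs
print(toy_corpus[0])                                  # e.g. [(0, 1), (1, 1), (2, 2)]
toy_lda = models.LdaModel(toy_corpus, num_topics=2, id2word=toy_dict)
print(toy_lda[toy_corpus[0]])                         # e.g. [(0, 0.8), (1, 0.2)] -> a mixture of 2 topics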
Since we are dealing with titles, most of the time every word occurs exactly once in a title. All words then carry the same weight, which makes it hard for the algorithm to infer the probability p(topic | title).
Since we want to find the set of topics that represents an author, we have implicitly assumed that all the publications of one author fall into a subset of topics. So let's put all the publications of one author together, as if they were one big document.
In [76]:
authorID_to_titles_stem = utl.load_pickle("../pmi_data/authorID_to_titles_stem.p")
In [77]:
authorID_to_document = dict()
for author, titles in tqdm(authorID_to_titles_stem.items()):
    # Concatenate all the stemmed titles of an author into one big document
    authorID_to_document[author] = []
    for t in titles:
        authorID_to_document[author].extend(t)
Now we have a mapping author -> document. We can build the dictionary and transform each document into a list of (token, token_count) pairs.
In [78]:
dictionary = corpora.Dictionary([doc for doc in tqdm(authorID_to_document.values())])
corpus = [dictionary.doc2bow(doc) for doc in tqdm(authorID_to_document.values())]
Set up the parameters; we select 20 topics.
In [94]:
# parameters
num_topics = 20  # number of topics LDA has to extract
passes = 1       # number of passes over the corpus during LDA training
num_words = 5    # number of most important words per topic to print
In [80]:
tmp = corpus  # keep a backup of the full bag-of-words corpus
In [81]:
corpus = tmp  # restore the full corpus from the backup
corpus = np.random.choice(corpus, int(len(corpus) / 1000))  # random sample of ~0.1% of the documents for quick tests
In [82]:
len(corpus)
Out[82]:
In [91]:
# Keep only the authors whose aggregated document contains more than 100 distinct tokens
c = [doc for doc in tqdm(tmp) if len(doc) > 100]
len(c)
Out[91]:
In [95]:
start = time()
pp = pprint.PrettyPrinter(depth=2)
# Train the LDA model on the filtered corpus
lda = models.LdaModel(c, num_topics=num_topics, id2word=dictionary, passes=passes)
print("Training time:", round((time() - start) / 60, 2), "[min]")
pp.pprint(lda.print_topics(lda.num_topics, num_words=num_words))
lda.save('lda.model')
utl.pickle_data(lda, "../pmi_data/lda_model__20_100.p")
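If we come back to this notebook later, we can reload the persisted model instead of retraining it (a small sketch; 'lda.model' is the file saved above):
In [ ]:
# Reload the trained LDA model from disk instead of retraining
lda = models.LdaModel.load('lda.model')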
In [96]:
def compute_score(titles):
    """Sum the LDA topic scores of each title to get one score vector per author."""
    total_score = np.zeros(num_topics)
    for title in titles:
        # lda[bow] returns [(topic_id, score), ...] for the topics whose score is not negligible
        for id_, value in lda[dictionary.doc2bow(title)]:
            total_score[id_] += value
    return total_score
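As a quick optional sanity check (the author picked here is arbitrary), we can score one author and inspect the words of their dominant topic:
In [ ]:
# Optional sanity check on one arbitrary author: find and print the dominant topic
some_author = next(iter(authorID_to_titles_stem))
scores = compute_score(authorID_to_titles_stem[some_author])
dominant = int(np.argmax(scores))
print("dominant topic:", dominant)
print("top words:", lda.print_topic(dominant, num_words))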
In [98]:
# Score each author from his single aggregated document
score_by_author_by_document = [compute_score([doc]) for _, doc in tqdm(authorID_to_document.items())]
utl.pickle_data(score_by_author_by_document, "../pmi_data/score_by_author_by_document.p")
In [ ]:
# Score each author title by title and sum the per-title topic scores
score_by_author_by_titles = [compute_score(titles) for _, titles in tqdm(authorID_to_titles_stem.items())]
utl.pickle_data(score_by_author_by_titles, "../pmi_data/score_by_author_by_titles.p")