We query arXiv to get papers, then run them against Crossref Event Data to find social media discussion and against Microsoft Academic Knowledge to find institutional affiliations.
Query arXiv -> Paper repository -> Analysis -> Topic model -> Classify
                      |                |
                      |                |----> Social network analysis of researchers
                      |                |----> Geocoding of institutions (via GRID?)
                      |
Extract author data from Google Scholar ----> Geocode institution via Google Places API?
                      |                                        |
Enrich paper data with MAK(?)                                  |----> Spatial and network analysis
                      |
Obtain Crossref Event data
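The Crossref Event Data lookup is not implemented in this notebook. As a rough illustration of that step, here is a minimal sketch that checks a single DOI against the public Event Data API; the endpoint, parameters and response fields are assumptions based on the v1 API and should be verified against the current documentation, and the DOI and email address are placeholders.
In [ ]:
#Hypothetical sketch: fetch Crossref Event Data records for one DOI (endpoint and fields assumed)
import requests

def get_crossref_events(doi,source='twitter',rows=100,mailto='you@example.com'):
    '''Returns a list of Event Data events for a DOI (empty if none are found)'''
    url = 'https://api.eventdata.crossref.org/v1/events'
    params = {'obj-id':doi,'source':source,'rows':rows,'mailto':mailto}
    response = requests.get(url,params=params)
    response.raise_for_status()
    #Events are expected under message -> events in the JSON payload
    return(response.json().get('message',{}).get('events',[]))

#Example call with a placeholder DOI
#get_crossref_events('10.1000/xyz123')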
In [2]:
%matplotlib inline
#Some imports
import time
#import xml.etree.ElementTree as etree
from lxml import etree
import feedparser
#Imports
#Key imports are loaded from my profile (see standard_imports.py in src folder).
#Paths
top = os.path.dirname(os.getcwd())
#External data (to download the GRID database)
ext_data = os.path.join(top,'data/external')
#Interim data (to place seed etc)
int_data = os.path.join(top,'data/interim')
#Figures
fig_path = os.path.join(top,'reports')
#Models
mod_path = os.path.join(top,'models')
#Get date for saving files
today = datetime.datetime.today()
today_str = "_".join([str(x) for x in [today.day,today.month,today.year]])
In [2]:
#Functions
In [3]:
class Arxiv_querier():
'''
This class takes as an input a query and the number of results, and returns all the parsed results.
Includes routines to deal with multiple pages of results.
'''
def __init__(self,base_url="http://export.arxiv.org/api/query?"):
'''
Initialise
'''
self.base_url = base_url
def query(self,query_string,max_results=100,wait_time=3):
'''
Query the base url
'''
#Attribute query string
#Load base URL
base_url = self.base_url
#Prepare query string
processed_query = re.sub(' ','+',query_string)
self.query_string="_".join(query_string.split(" "))
start=0
pages = 0
#Run the query and keep paging for as long as each page returns the full max_results entries
keep_running = True
result_store = []
while keep_running==True:
pages +=1
print(pages)
#Query url (NB the start arg, which will change as we go through different
#pages)
query_url = base_url+'search_query=all:{q}&start={s}&max_results={max_res}'.format(
q=processed_query,s=start,max_res=max_results)
#Download
source = requests.get(query_url)
#Parse the xml and get the entries (papers)
parsed = feedparser.parse(source.content)
#Extract entries
entries = parsed['entries']
#If the number of entries equals max_results there may be another page.
#We get it by offsetting the start with max_results
result_store.append(entries)
if len(entries)==max_results:
start+=max_results
#If we have less than max results this means we have run out of
#results and we toggle the keep_running switch off.
if len(entries)<max_results:
keep_running=False
time.sleep(wait_time)
#Save results in a flat list
self.entry_results = [x for el in result_store for x in el]
def extract_data(self):
'''
Here we extract data from the entries
'''
#Load entries
entries = self.entry_results
#Create df
output = pd.concat([pd.DataFrame({
'query':self.query_string,
'id':x['id'],
'link':x['link'],
'title':x['title'],
'authors':", ".join([el['name'] for el in x['authors']]),
'summary':x['summary'],
'updated':x['updated'],
'published':x['published'],
'category':x['arxiv_primary_category']['term'],
'pdf':str([el['href'] for el in x['links'] if el['type']=='application/pdf'][0]
)},index=[0]) for x in entries]).reset_index(drop=True)
output['year_published'] = [x.split("-")[0] for x in output['published']]
self.output_df = output
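Before running the full collection below, a quick smoke test of the querier on a small, arbitrary query (the query term and result cap are just for illustration):
In [ ]:
#Quick smoke test of the querier on a small query
test_query = Arxiv_querier()
test_query.query('reinforcement learning',max_results=50,wait_time=3)
test_query.extract_data()
test_query.output_df.head()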
In [216]:
query_terms = ['artificial intelligence','machine learning','deep learning']
#There are some inconsistencies in the number of results so we run the query three times for each
#term and remove duplicated results
def extract_arxiv_data(term,max_results=1000,wait_time=10, tests=3):
'''
This function initialises the Arxiv_querier class, extracts the data and outputs it
'''
print(term)
collected = []
#We collect the data thrice
for i in np.arange(tests):
print('run'+ ' ' +str(i))
initialised = Arxiv_querier()
initialised.query(term,max_results,wait_time)
initialised.extract_data()
out = initialised.output_df
collected.append(out)
#We concatenate the dfs and remove the duplicates.
output = pd.concat(collected)
output_no_dupes = output.drop_duplicates('id')
#Return both
return([output,output_no_dupes])
arxiv_ai_results_three = [extract_arxiv_data(term=q) for q in query_terms]
In [373]:
all_papers = pd.concat([x[1] for x in arxiv_ai_results_three]).drop_duplicates('id').reset_index(drop=True)
print(all_papers.shape)
all_papers.head()
Out[373]:
In [374]:
all_papers.to_csv(int_data+'/{today}_ai_papers.csv'.format(today=today_str),index=False)
In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer, PunktSentenceTokenizer
from nltk.stem import WordNetLemmatizer, SnowballStemmer, PorterStemmer
import scipy
import ast
import string as st
from bs4 import BeautifulSoup
import gensim
from gensim.models.coherencemodel import CoherenceModel
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import product
stopwords_c = stopwords.words('english')
stemmer = PorterStemmer()
lemmatizer= WordNetLemmatizer()
In [7]:
#Read papers
all_papers = pd.read_csv(int_data+'/19_8_2017_ai_papers.csv')
In [8]:
#Let's begin by looking at years
#When were they published?
#Year distribution
year_pubs = all_papers['year_published'].value_counts()
year_pubs.index = [int(x) for x in year_pubs.index]
fig,ax = plt.subplots(figsize=(10,5))
year_pubs_sorted = year_pubs[sorted(year_pubs.index)]
year_pubs_subset = year_pubs_sorted[year_pubs_sorted.index>1991]
ax.plot(np.arange(1993,2018),year_pubs_subset.cumsum(),color='red')
ax.bar(np.arange(1993,2018),year_pubs_subset)
ax.hlines(xmin=1993,xmax=2017,y=[10000,20000,30000,40000],colors='green',linestyles='dashed',alpha=0.7)
ax.set_title("Papers on AI, ML and DL, total per year (bar) and cumulative (red)",size=14)
Out[8]:
In [9]:
#What are the categories of the papers? Are we capturing what we think we are capturing?
#Top 20
all_papers['category'].value_counts()[:20]
Out[9]:
See here for abbreviations of categories.
In a nutshell, AI is 'Artificial Intelligence', LG is 'Learning', CV is 'Computer Vision', CL is 'Computation and Language' and NE is 'Neural and Evolutionary Computing'. stat.ML is self-explanatory. We seem to be picking up the main things.
In [10]:
#NB do we want to remove hyphens?
punct = re.sub('-','',st.punctuation)
In [11]:
def comp_sentence(sentence):
'''
Takes a sentence and pre-processes it.
The output is the sentence as a bag of words
'''
#Remove line breaks and hyphens
sentence = re.sub('\n',' ',sentence)
sentence = re.sub('-',' ',sentence)
#Lowercase and tokenise
text_lowered = [x.lower() for x in sentence.split(" ")]
#Remove signs and digits
text_no_signs_digits = ["".join([x for x in el if x not in punct+st.digits]) for
el in text_lowered]
#Remove stop words, single letters
text_stopped = [w for w in text_no_signs_digits if w not in stopwords_c and
len(w)>1]
#Lemmatise
text_lemmatised = [lemmatizer.lemmatize(w) for w in text_stopped]
#Output
return(text_lemmatised)
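A quick check of the pre-processing on a made-up sentence; the exact output depends on the stop word list and the lemmatiser, so treat the expected result as approximate:
In [ ]:
#Sanity check of the pre-processing on a toy sentence
comp_sentence("Deep neural networks achieve state-of-the-art results in 2017-era benchmarks.")
#Expected (approximately): ['deep', 'neural', 'network', 'achieve', 'state', 'art', 'result', 'era', 'benchmark']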
In [90]:
#Process text
clean_corpus = [comp_sentence(x) for x in all_papers['summary']]
In [91]:
#We remove rare words
word_freqs = pd.Series([x for el in clean_corpus for x in el]).value_counts()
word_freqs[:30]
Out[91]:
In [92]:
rare_words = word_freqs.index[word_freqs<=2]
rare_words[:10]
Out[92]:
Lots of the rare words seem to be typos and so forth. We remove them.
In [93]:
#Removing rare words
clean_corpus_no_rare = [[x for x in el if x not in rare_words] for el in clean_corpus]
In [94]:
#Identify 2-grams (frequent in science!)
bigram_transformer = gensim.models.Phrases(clean_corpus_no_rare)
#Train the model on the corpus
#Let's do a bit of grid search
#The model needs to be trained for the similarity queries below to run
model = gensim.models.Word2Vec(bigram_transformer[clean_corpus], size=360, window=15, min_count=2, iter=20)
In [472]:
model.most_similar('ai_safety')
Out[472]:
In [470]:
model.most_similar('complexity')
Out[470]:
In [475]:
model.most_similar('github')
Out[475]:
In [96]:
#Create different dictionaries and bags of words depending on the minimum word frequency threshold
def remove_words_below_threshold(corpus,threshold):
'''
Takes a corpus (a list of token lists) and removes any tokens at or below a threshold of occurrences
'''
#Produce token frequencies
token_frequencies = pd.Series([x for el in corpus for x in el]).value_counts()
#Identify tokens to drop (below a threshold)
tokens_to_drop = token_frequencies.index[token_frequencies<=threshold]
#Processed corpus
processed_corpus = [[x for x in el if x not in tokens_to_drop] for el in corpus]
#Dictionary
dictionary = gensim.corpora.Dictionary(processed_corpus)
corpus_bow = [dictionary.doc2bow(x) for x in processed_corpus]
return([dictionary,corpus_bow,processed_corpus])
In [97]:
#Initial model run to see what comes out.
#Transform corpus to bigrams
transformed_corpus = bigram_transformer[clean_corpus]
corpora_to_process = {str(x):remove_words_below_threshold(transformed_corpus,x) for x in [1,2,5,10]}
In [534]:
#Need to turn this into a function.
#Topic modelling
#Parameters for grid search: number of topics and minimum word-count threshold
lda_params = list(product([100,200,300],[2,5]))
#Model container
lda_models = []
for x in lda_params:
#Print stage
print('{x}_{y}'.format(x=x[0],y=x[1]))
#Load corpus and dict
dictionary = corpora_to_process[str(x[1])][0]
corpus_bow = corpora_to_process[str(x[1])][1]
corpus = corpora_to_process[str(x[1])][2]
print('training')
#Train model
mod = gensim.models.LdaModel(corpus_bow,num_topics=x[0],id2word=dictionary,
passes=10,iterations=50)
print('coherence')
#Extract coherence
cm = CoherenceModel(mod,texts=corpus,
dictionary=dictionary,coherence='u_mass')
#Get value
try:
coherence_value = cm.get_coherence()
except:
print('coherence_error')
coherence_value='error'
lda_models.append([x,mod,[coherence_value,cm]])
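Before saving, it can be handy to pull out the best-scoring run. A small sketch, assuming we rank by u_mass coherence (under the usual convention that higher, i.e. closer to zero, is better) and skip runs where the coherence computation failed:
In [ ]:
#Pick the run with the best u_mass coherence, ignoring failed coherence computations
valid_models = [m for m in lda_models if m[2][0] != 'error']
best_run = max(valid_models,key=lambda m: m[2][0])
print('Best parameters (topics, min word count):',best_run[0])
print('Coherence:',best_run[2][0])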
In [535]:
with open(mod_path+'/{t}_ai_topic_models.p'.format(t=today_str),'wb') as outfile:
pickle.dump(lda_models,outfile)
In [536]:
#Visualise model performance
model_eval = pd.DataFrame([[x[0][0],x[0][1],x[2][0]] for x in lda_models],columns=['topics','word_lim','coherence'])
fig,ax = plt.subplots(figsize=(10,5))
cols = ['red','green','blue']
legs = []
for num,x in enumerate(set(model_eval['word_lim'])):
subset = model_eval.loc[[z == x for z in model_eval['word_lim']],:]
ax.plot(subset.loc[:,'topics'],subset.loc[:,'coherence'],color=cols[num-1])
legs.append([cols[num-1],x])
ax.legend(labels=[x[1] for x in legs],title='Min word count')
ax.set_title('Model performance with different parameters')
Out[536]:
In [15]:
with open(mod_path+'/19_8_2017_ai_topic_models.p','rb') as infile:
lda_models = pickle.load(infile)
In [100]:
check_model= lda_models[1][1]
#Explore topics via LDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(
#Insert best model/corpus/topics here
check_model,
corpora_to_process[str(5)][1],
corpora_to_process[str(5)][0])
Out[100]:
In [17]:
#Can we extract the relevant terms for the topics as in Sievert and Shirley in order to name them?
#First - create a matrix with the top terms per topic (topn=1000, despite the variable name)
top_30_kws = [check_model.get_topic_terms(topicid=n,topn=1000) for n in np.arange(0,100)]
#Keyword df where the columns are tokens and the rows are topics
top_30_kws_df = pd.concat([pd.DataFrame([x[1] for x in el],
index=[x[0] for x in el]) for el in top_30_kws],
axis=1).fillna(0).T.reset_index(drop=True)
In [98]:
#This is the dictionary
selected_dictionary = corpora_to_process[str(5)][0]
#Total number of term appearances across the corpus (summed document frequencies), used to normalise
total_terms = np.sum([vals for vals in selected_dictionary.dfs.values()])
#Appearances of different terms
document_freqs = pd.Series([v for v in selected_dictionary.dfs.values()],
index=[k for k in selected_dictionary.dfs.keys()])[top_30_kws_df.columns]/total_terms
#Normalise the terms (divide the vector of probabilities of each keyword in each topic by its overall corpus frequency)
top_30_kws_normalised = top_30_kws_df.apply(lambda x: x/document_freqs,axis=0)
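For reference, Sievert and Shirley (2014) define the relevance of a term w to a topic t as relevance(w, t | lambda) = lambda * log p(w|t) + (1 - lambda) * log[p(w|t)/p(w)], i.e. a weighted combination of the term's probability within the topic and its lift over the corpus. The function below works with the same two ingredients, the raw topic probabilities and the corpus-normalised probabilities computed above, combined linearly with a weight lambda.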
In [ ]:
#Now we want to extract, for each topic, the relevance score.
def relevance_score(prob_in_topic,prob_in_corpus,id2word_lookup,lambda_par = 0.6):
'''
Combines the probabilities following Sievert and Shirley and returns the top 5
terms for each topic, joined into a topic name
'''
#Create dataframe
combined = pd.concat([prob_in_topic,prob_in_corpus],axis=1)
combined.columns=['prob_in_topic','prob_in_corpus']
#Create relevance metric
combined['relevance'] = lambda_par*combined['prob_in_topic'] + (1-lambda_par)*combined['prob_in_corpus']
#Top words
top_ids = list(combined.sort_values('relevance',ascending=False).index[:5])
#Top words
top_words = "_".join([id2word_lookup[this_id] for this_id in top_ids])
return(top_words)
In [ ]:
relevance_scores = [relevance_score(top_30_kws_df.iloc[n,:],
top_30_kws_normalised.iloc[n,:],
dictionary.id2token,lambda_par=0.6) for n in np.arange(len(top_30_kws_df))]
In [601]:
%%time
#Create a df with the topic predictions.
paper_preds = check_model[corpora_to_process[str(5)][1]]
paper_topics_df = pd.concat([pd.DataFrame([x[1] for x in el],index=[x[0] for x in el]) for el in paper_preds],
axis=1).T
#Replace NAs with zeros and drop pointless index
paper_topics_df.fillna(value=0,inplace=True)
paper_topics_df.reset_index(drop=True,inplace=True)
In [21]:
paper_topics_df.columns = relevance_scores
paper_topics_df.to_csv(int_data+'/{t}_paper_topic_mix.csv'.format(t=today_str),index=False)
In [19]:
#paper_topics_df = pd.read_csv(int_data+'/{t}_paper_topic_mix.csv')
In [86]:
#Quick test of Deep learning papers
#These are papers with a topic that seems to capture deep learning
dl_papers = [x>0.05 for x in paper_topics_df['network_training_model_deep_deep_learning']]
dl_papers_metadata = pd.concat([pd.Series(dl_papers),all_papers],axis=1)
paper_frequencies = pd.crosstab(dl_papers_metadata.year_published,dl_papers_metadata[0])
paper_frequencies.columns=['no_dl','dl']
fig,ax = plt.subplots(figsize=(10,5))
paper_frequencies.plot.bar(stacked=True,ax=ax)
ax.set_title('Number of papers in the DL \'topic\'')
ax.legend(labels=['Not ANN/DL related','NN/DL topic >0.05'])
Out[86]:
Some of this is interesting, but the model doesn't seem to be picking up the policy-related terms (safety, discrimination).
Next stages: focus on policy-related terms. Can we look for papers using keyword dictionaries identified through the word embeddings?
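As a first pass at the keyword-dictionary idea, here is a hypothetical sketch that expands a couple of policy-related seed terms with the word2vec model trained above and flags papers whose processed abstracts contain any of the expanded terms. The seed list and the neighbourhood size are arbitrary choices.
In [ ]:
#Hypothetical sketch: expand policy-related seed terms with the embeddings and flag matching papers
seed_terms = ['ai_safety','discrimination']
expanded_terms = set(seed_terms)
for term in seed_terms:
    try:
        expanded_terms.update([w for w,sim in model.most_similar(term,topn=10)])
    except KeyError:
        #Skip seed terms that are not in the embedding vocabulary
        pass
#Flag papers whose (bigram-transformed) abstracts contain any of the expanded terms
policy_flags = [len(expanded_terms & set(doc))>0 for doc in transformed_corpus]
print(np.sum(policy_flags))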
In [102]:
#How many authors are there in the data? Can we collect all their institutions from Google Scholar?
paper_authors = pd.Series([x for el in all_papers['authors'] for x in el.split(", ")])
paper_authors_unique = paper_authors.drop_duplicates()
len(paper_authors_unique)
Out[102]:
We have around 68,000 unique authors. It will take a while to get their data from Google Scholar: at the rate limit used below (30 calls per minute), a full pass would take roughly a day and a half.
In [103]:
#Top authors and frequencies
authors_freq = paper_authors.value_counts()
fig,ax=plt.subplots(figsize=(10,3))
ax.hist(authors_freq,bins=30)
ax.set_title('Distribution of publications')
Out[103]:
In [104]:
#Pretty skewed distribution!
print(authors_freq.describe())
np.sum(authors_freq>2)
Out[104]:
Fewer than 10,000 authors have 3+ papers in the data.
In [186]:
%%time
#Test run
import scholarly
@ratelim.patient(max_calls=30,time_interval=60)
def get_scholar_data(scholarly_object):
''''''
try:
scholarly_object = next(scholarly_object)
metadata = {}
metadata['name']=scholarly_object.name
metadata['affiliation'] = scholarly_object.affiliation
metadata['interests'] = scholarly_object.interests
return(metadata)
except:
return('nothing')
#Extract information from each query (scholarly.search_author returns a generator)
#Get data. The results list is initialised once; when resuming a partial run, comment out the
#initialisation and adjust the slice in the loop below.
ml_author_gscholar=[]
for num,x in enumerate(paper_authors_unique[1484:]):
if num % 100 == 0:
print(str(num)+":"+x)
result = get_scholar_data(scholarly.search_author(x))
ml_author_gscholar.append(result)
In [182]:
len(ml_author_gscholar)
Out[182]:
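The workflow at the top flags geocoding of the Google Scholar affiliations via the Google Places API as a possible next step. A minimal sketch of that lookup is below; the API key is a placeholder and the text search endpoint and response fields should be checked against the Places API documentation.
In [ ]:
#Hypothetical sketch: geocode an affiliation string with the Google Places text search API
def geocode_affiliation(affiliation,api_key):
    '''Returns (lat, lng, formatted address) for the top match, or None if nothing is found'''
    url = 'https://maps.googleapis.com/maps/api/place/textsearch/json'
    response = requests.get(url,params={'query':affiliation,'key':api_key})
    results = response.json().get('results',[])
    if len(results)==0:
        return(None)
    top_match = results[0]
    location = top_match['geometry']['location']
    return((location['lat'],location['lng'],top_match.get('formatted_address')))

#Example call with a placeholder key
#geocode_affiliation('University of Cambridge','MY_API_KEY')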