In [248]:
import pymongo
from pymongo import MongoClient
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from wordcloud import WordCloud
%matplotlib inline
In [3]:
client = MongoClient('localhost:27017')
db = client.arXivDB
db.arXivfeeds.count()
Out[3]:
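As a side note, newer pymongo releases (3.7+) deprecate Collection.count(); count_documents gives the same total. A minimal sketch, assuming the same arXivDB.arXivfeeds collection:
In [ ]:
# count_documents replaces the deprecated Collection.count() in pymongo 3.7+
# (same arXivDB / arXivfeeds collection as above)
db.arXivfeeds.count_documents({})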
Retrieving the available fields as a reference:
In [13]:
print(db.arXivfeeds.find_one().keys())
In [58]:
for item in db.arXivfeeds.find({'published_parsed': 2016}).sort('_id', pymongo.DESCENDING).limit(5):
print(item['title'])
In [44]:
#db.arXivfeeds.delete_many({})
In [ ]:
In [400]:
def cleaner(doc, stem=False):
    '''Clean a text document and prep it for further analysis'''
    doc = doc.lower()                                    # turn text to lowercase
    stops = set(stopwords.words("english"))              # creating a set of stopwords
    p_stemmer = PorterStemmer()                          # creating the stemmer model
    doc = re.sub(r"quantum", '', doc)                    # removing the word quantum (duh)
    doc = re.sub(r"physics", '', doc)                    # removing the word physics (duh)
    doc = re.sub(r"state", '', doc)                      # removing the word state (duh)
    doc = re.sub(r'\$.*?\$', 'latexinlineformula', doc)  # replacing latex inline formulas with a placeholder
    doc = re.sub(r'\\n', ' ', doc)                       # removing escaped newline characters
    doc = re.sub(r'\\\\\"', '', doc)                     # removing LaTeX umlaut escapes (e.g. \"o for German letters)
    doc = re.sub(r"</?\w+[^>]*>", '', doc)               # removing html tags
    doc = re.sub("[^a-zA-Z]", ' ', doc)                  # replacing anything that is not a letter with a space
    doc = doc.split()                                    # splits the text into individual words
    doc = [w for w in doc if w not in stops and len(w) > 3]  # removes stopwords and words shorter than 4 characters
    if stem:
        doc = [p_stemmer.stem(i) for i in doc]           # stemming (reducing words to their root)
    if not len(doc):  # dealing with documents that end up empty (e.g. only stopwords or other languages)
        doc = ['emptystring']
    # print('text cleaning done!')
    return ' '.join(doc)
In [401]:
cleaner(db.arXivfeeds.find_one({'published_parsed': 2016})['summary'])
Out[401]:
In [167]:
def plot_abstract_and_title_wordcloud(arXivfeed_query_result):
    text = cleaner(' '.join([' '.join(list(d.values())) for d in arXivfeed_query_result]))
    # Generate a word cloud image
    wordcloud_img = WordCloud().generate(text)
    # Display the generated image:
    plt.imshow(wordcloud_img)
    plt.axis("off")
In [173]:
plot_abstract_and_title_wordcloud(list(db.arXivfeeds.find({'published_parsed': 1995}, {'_id':0,'title':1})))
In [174]:
plot_abstract_and_title_wordcloud(list(db.arXivfeeds.find({'published_parsed': 2002}, {'_id':0,'title':1})))
In [175]:
plot_abstract_and_title_wordcloud(list(db.arXivfeeds.find({'published_parsed': 2015}, {'_id':0,'title':1})))
In [244]:
years = range(1994,2016,1)
In [117]:
num_publications_per_year = [db.arXivfeeds.find({'published_parsed': y}).count() for y in years]
In [245]:
plt.plot(years, num_publications_per_year)
Out[245]:
This shows that the database is missing entries; judging from the total number of downloaded items, roughly 30,000 papers must be missing.
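A rough way to quantify the gap is to compare the per-year sum against the full collection count; the difference corresponds to documents whose published_parsed value falls outside 1994-2015 or is missing. A sketch (the 30,000 figure itself comes from the download logs, not from the database):
In [ ]:
# sanity check: per-year sum vs. total number of stored documents
total_in_db = db.arXivfeeds.count()
print('sum over 1994-2015:         ', sum(num_publications_per_year))
print('total documents in the db:  ', total_in_db)
print('outside the range / unparsed:', total_in_db - sum(num_publications_per_year))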
In [288]:
pattern1 = r'[Pp]hoton\w*'
pattern2 = r'[Oo]ptic\w*'
set(re.findall(pattern2,
' '.join([' '.join(list(d.values())) for d in db.arXivfeeds.find({}, {'_id':0,'summary':1})])))
Out[288]:
In [289]:
num_ph_papers = np.zeros(len(years))
for i, y in enumerate(years):
num_ph_papers[i] = db.arXivfeeds.find({'$and':[{'published_parsed': y},
{'$or':[
{'summary': {'$regex': pattern1}},
{'title': {'$regex': pattern1}},
{'summary': {'$regex': pattern2}},
{'title': {'$regex': pattern2}}
]}
]}).count()
In [290]:
plt.plot(years, num_ph_papers/num_publications_per_year)
Out[290]:
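As an aside, the same per-year photonics counts could be computed server-side with a single aggregation query instead of one find per year; a sketch, reusing the field names from above:
In [ ]:
# single-pass alternative: match photonics-related papers with one combined regex
# and group the matches by publication year on the server
photonics_regex = r'[Pp]hoton\w*|[Oo]ptic\w*'
pipeline = [
    {'$match': {'$or': [{'summary': {'$regex': photonics_regex}},
                        {'title': {'$regex': photonics_regex}}]}},
    {'$group': {'_id': '$published_parsed', 'n': {'$sum': 1}}},
    {'$sort': {'_id': 1}}
]
ph_counts_by_year = {d['_id']: d['n'] for d in db.arXivfeeds.aggregate(pipeline)}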
There is no affiliation field in the parsed data, as inspecting a full record shows:
In [309]:
list(db.arXivfeeds.find({'published_parsed': 2016}).limit(1))[0]
Out[309]:
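To make sure this is not a quirk of a single record, an exists-query over the whole collection can confirm that no stored document carries an affiliation-like field. The field name arxiv_affiliation is an assumption about how feedparser would expose it; a count of zero means the field is absent everywhere:
In [ ]:
# 'arxiv_affiliation' is a guessed field name, not one observed in the data
db.arXivfeeds.find({'arxiv_affiliation': {'$exists': True}}).count()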
In [410]:
import nltk
import gensim
import pyLDAvis
import pyLDAvis.gensim
In [409]:
documents = [cleaner(d['summary']) for d in db.arXivfeeds.find({'published_parsed': 2010}, {'_id':0, 'summary':1})]
# documents = [cleaner(d['summary']) for d in db.arXivfeeds.find({}, {'_id':0, 'summary':1})]
In [411]:
train_set = [nltk.word_tokenize(doc) for doc in documents]
In [412]:
dic = gensim.corpora.Dictionary(train_set)
print(len(dic))
dic.filter_extremes(no_below=20, no_above=0.1)
print(len(dic))
In [413]:
corpus = [dic.doc2bow(text) for text in train_set] # transform every token into BOW
In [414]:
tfidf = gensim.models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lda = gensim.models.LdaModel(corpus_tfidf, id2word=dic, num_topics=10, iterations=20, passes=10)
corpus_lda = lda[corpus_tfidf]
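Before the interactive visualization, the topics can be sanity-checked directly from the model; print_topics lists the highest-weighted words for each topic:
In [ ]:
# quick look at the fitted topics: top 10 words per topic
for topic_id, topic_words in lda.print_topics(num_topics=10, num_words=10):
    print(topic_id, topic_words)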
In [415]:
vis_data = pyLDAvis.gensim.prepare(lda, corpus_tfidf, dic)  # use the tf-idf corpus the model was trained on
pyLDAvis.display(vis_data)
Out[415]:
In [ ]: