In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
import seaborn as sns
import numpy as np
from wordcloud import WordCloud, STOPWORDS
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [2]:
# Parse the accepted-paper listing into one record per paper.
#
# Layout the parser relies on: a section heading line ('Long Papers',
# 'Short Papers' or 'Demo Papers') switches the current paper type; every
# other non-blank line is a paper title, and the line immediately after it
# holds that paper's authors.
section_types = {
    'Long Papers': 'long',
    'Short Papers': 'short',
    'Demo Papers': 'demo',
}

papers = []
paper_type = 'long'  # titles that appear before any heading count as long papers
# Explicit encoding so non-ASCII author names decode the same on every
# platform -- assumes the listing is UTF-8; adjust if the file says otherwise.
with open('emnlp_papers.txt', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        if line in section_types:
            paper_type = section_types[line]
            continue
        # The author line directly follows the title. Default to '' so a
        # title on the very last line does not abort with StopIteration.
        authors = next(f, '').strip()
        papers.append({'title': line, 'authors': authors, 'type': paper_type})

# Naming the columns keeps them present even when no paper was parsed.
papers = pd.DataFrame(papers, columns=['title', 'authors', 'type'])
print("{} papers accepted".format(len(papers)))
In [3]:
# Number of papers in each value of the 'type' column.
papers['type'].value_counts()
Out[3]:
In [4]:
def authors2list(authors):
    """Split an author string such as "A, B and C" into a list of names.

    Parameters
    ----------
    authors : str
        Author names separated by ", " and/or " and ".

    Returns
    -------
    list of str
        The individual names, whitespace-stripped. A string without " and "
        is returned as a single-element list (it is not split on commas).
    """
    if ' and ' not in authors:
        return [authors.strip()]
    names = []
    # Treat every " and " as a separator, so "A and B and C" no longer
    # raises ValueError from unpacking more than two pieces.
    for chunk in authors.split(' and '):
        # Drop an Oxford comma ("A, B, and C") before splitting on ", ".
        chunk = chunk.strip().rstrip(',')
        names.extend(name.strip() for name in chunk.split(', '))
    names = [name for name in names if name]
    # Never return an empty list: callers index element 0 for the first author.
    return names or [authors.strip()]
# Parsed list of author names for every paper.
papers['author_list'] = papers['authors'].apply(authors2list)


def _first(names):
    """Return the first entry of a list of names."""
    return names[0]


# First listed author of every paper.
papers['first_author'] = papers['author_list'].apply(_first)
In [5]:
# Papers per first author, keeping only people who are first author more than once.
p = papers['first_author'].value_counts()
p[p.gt(1)]
Out[5]:
In [6]:
# Tally how many papers each person appears on, in any author position.
author_cnt = defaultdict(int)
for author in (name for names in papers['author_list'] for name in names):
    author_cnt[author] += 1
In [7]:
# The 39 (author, count) pairs with the highest counts, largest first.
# sorted() is stable even with reverse=True, so ties keep insertion order.
sorted(author_cnt.items(), key=lambda item: item[1], reverse=True)[:39]
Out[7]:
In [8]:
# Number of authors on each paper.
papers['n_authors'] = papers.author_list.apply(len)
# Pass the column by keyword: since seaborn 0.12 the only positional
# argument of countplot is `data`, so a positional Series is no longer
# interpreted as the x variable.
ax = sns.countplot(x='n_authors', data=papers, palette='Set2')
_ = ax.set_xlabel("Number of authors")
In [9]:
# Count how often each lower-cased, whitespace-separated token occurs across
# all titles. Counting in one vectorised pass replaces growing a DataFrame
# row by row, which searched the index for every word (quadratic).
title_words = [word for title in papers.title for word in title.lower().split()]
word_cnt = (
    pd.Series(title_words, dtype=object)
    .value_counts(sort=False)
    .rename('count')
    .to_frame()
)

# 80 most frequent words. A stable sort keeps tied words in the order
# value_counts produced them, so the table is reproducible between runs.
top_words = (
    word_cnt
    .sort_values('count', ascending=False, kind='mergesort')
    .head(80)
    .rename_axis('word')
    .reset_index()
)

# Lay the ranking out as four side-by-side (word, count) column pairs of 20
# rows each. Plain iloc slices avoid np.split on a DataFrame, which depends
# on the deprecated DataFrame.swapaxes.
splits = [
    top_words.iloc[start:start + 20].reset_index(drop=True)
    for start in range(0, 80, 20)
]
pd.concat(splits, axis=1)
Out[9]:
In [10]:
# Join every title into one lower-cased string and render it as a word cloud.
title_text = " ".join(papers.title).lower()
cloud_options = dict(
    width=1000,
    height=600,
    background_color='white',
    stopwords=set(STOPWORDS),
    min_font_size=10,
)
wordcloud = WordCloud(**cloud_options).generate(title_text)

# Show the rendered cloud without axes or padding.
plt.figure(figsize=(16, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
In [ ]: