In [1]:
    
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
%load_ext autoreload
%autoreload 2
%matplotlib inline
    
In [2]:
    
# Parse emnlp_papers.txt: section headers ('Long Papers' etc.) switch the
# track for all following entries; every other non-blank line is a title,
# immediately followed by its authors line.
papers = []
paper_type = 'long'  # file starts with long papers before the first header
with open('emnlp_papers.txt') as f:
    for line in f:
        text = line.strip()
        if not text:
            continue
        if text == 'Long Papers':
            paper_type = 'long'
        elif text == 'Short Papers':
            paper_type = 'short'
        elif text == 'Demo Papers':
            paper_type = 'demo'
        else:
            # Consume the paired authors line; default to '' so a file
            # truncated after a title no longer raises StopIteration.
            authors = next(f, '').strip()
            papers.append({'title': text, 'authors': authors, 'type': paper_type})
papers = pd.DataFrame(papers)
print("{} papers accepted".format(len(papers)))
    
    
In [3]:
    
papers.type.value_counts()
    
    Out[3]:
In [4]:
    
def authors2list(authors):
    """Split an author string like 'A One, B Two and C Three' into a list of names.

    Handles single authors, 'X and Y' pairs, comma-separated lists, an Oxford
    comma before the final 'and', and repeated ' and ' separators (the previous
    `first, last = s.split(' and ')` raised ValueError on more than one ' and ').
    Empty fragments are dropped, so the result never contains '' entries.
    """
    names = []
    for chunk in authors.split(', '):
        for name in chunk.split(' and '):
            name = name.strip()
            # Oxford comma leaves a leading 'and ' on the final chunk.
            if name.startswith('and '):
                name = name[4:].strip()
            if name:
                names.append(name)
    return names
# Derive per-paper author lists and the first (lead) author.
papers['author_list'] = papers['authors'].map(authors2list)
papers['first_author'] = papers['author_list'].str[0]
    
In [5]:
    
# Authors who appear as first author on more than one paper.
first_author_counts = papers.first_author.value_counts()
first_author_counts[first_author_counts > 1]
    
    Out[5]:
In [6]:
    
# Total papers per author, regardless of author position. Counter replaces the
# hand-rolled defaultdict(int) loop; it is a dict subclass, so the downstream
# `author_cnt.items()` usage is unchanged.
author_cnt = Counter(
    author
    for author_list in papers.author_list
    for author in author_list
)
    
In [7]:
    
sorted(author_cnt.items(), key=lambda l: -l[1])[:39]
    
    Out[7]:
In [8]:
    
# Distribution of author-list sizes across all accepted papers.
papers['n_authors'] = papers['author_list'].str.len()
ax = sns.countplot(papers['n_authors'], palette='Set2')
ax.set_xlabel("Number of authors");
    
    
In [9]:
    
# Count title-word frequencies with a Counter and build the DataFrame once.
# The previous version grew the frame cell-by-cell with `.loc` inside the loop,
# which is quadratic and leaves an object-dtype column. Counter preserves
# first-occurrence insertion order, matching the original index order.
word_cnt = pd.DataFrame({'count': pd.Series(Counter(
    word for title in papers.title for word in title.lower().split()
))})
# Display the top 80 words as four side-by-side 20-row columns.
splits = [d.reset_index(drop=True) for d in np.split(word_cnt.sort_values('count', ascending=False).head(80).reset_index(), [20, 40, 60])]
pd.concat(splits, axis=1).rename(columns={'index': 'word'})
    
    Out[9]:
In [10]:
    
# Word cloud over all title text; WordCloud removes its built-in stopwords.
cloud = WordCloud(width=1000, height=600,
                  background_color='white',
                  stopwords=set(STOPWORDS),
                  min_font_size=10).generate(" ".join(papers.title).lower())
plt.figure(figsize=(16, 8), facecolor=None)
plt.imshow(cloud)
plt.axis("off")
plt.tight_layout(pad=0)
    
    
In [ ]: