In [1]:
    
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
%load_ext autoreload
%autoreload 2
%matplotlib inline
    
In [2]:
    
# Parse emnlp_papers.txt: section headers ('Long Papers' etc.) switch the
# track for all following entries; every other non-blank line is a title,
# immediately followed by its authors line.
papers = []
paper_type = 'long'  # file starts with long papers before the first header
with open('emnlp_papers.txt') as f:
    for line in f:
        text = line.strip()
        if not text:
            continue
        if text == 'Long Papers':
            paper_type = 'long'
        elif text == 'Short Papers':
            paper_type = 'short'
        elif text == 'Demo Papers':
            paper_type = 'demo'
        else:
            # Consume the paired authors line; default to '' so a file
            # truncated after a title no longer raises StopIteration.
            authors = next(f, '').strip()
            papers.append({'title': text, 'authors': authors, 'type': paper_type})
papers = pd.DataFrame(papers)
print("{} papers accepted".format(len(papers)))
    
    
In [3]:
    
papers.type.value_counts()
    
    Out[3]:
In [4]:
    
def authors2list(authors):
    """Split an author string like 'A One, B Two and C Three' into a list of names.

    Handles single authors, 'X and Y' pairs, comma-separated lists, an Oxford
    comma before the final 'and', and repeated ' and ' separators (the previous
    `first, last = s.split(' and ')` raised ValueError on more than one ' and ').
    Empty fragments are dropped, so the result never contains '' entries.
    """
    names = []
    for chunk in authors.split(', '):
        for name in chunk.split(' and '):
            name = name.strip()
            # Oxford comma leaves a leading 'and ' on the final chunk.
            if name.startswith('and '):
                name = name[4:].strip()
            if name:
                names.append(name)
    return names
# Derive per-paper author lists and the first (lead) author.
papers['author_list'] = papers['authors'].map(authors2list)
papers['first_author'] = papers['author_list'].str[0]
    
In [5]:
    
# Authors who appear as first author on more than one paper.
first_author_counts = papers.first_author.value_counts()
first_author_counts[first_author_counts > 1]
    
    Out[5]:
In [6]:
    
# Total papers per author, regardless of author position. Counter replaces the
# hand-rolled defaultdict(int) loop; it is a dict subclass, so the downstream
# `author_cnt.items()` usage is unchanged.
author_cnt = Counter(
    author
    for author_list in papers.author_list
    for author in author_list
)
    
In [7]:
    
sorted(author_cnt.items(), key=lambda l: -l[1])[:39]
    
    Out[7]:
In [8]:
    
# Distribution of author-list sizes across all accepted papers.
papers['n_authors'] = papers['author_list'].str.len()
ax = sns.countplot(papers['n_authors'], palette='Set2')
ax.set_xlabel("Number of authors");
    
    
In [9]:
    
# Count title-word frequencies with a Counter and build the DataFrame once.
# The previous version grew the frame cell-by-cell with `.loc` inside the loop,
# which is quadratic and leaves an object-dtype column. Counter preserves
# first-occurrence insertion order, matching the original index order.
word_cnt = pd.DataFrame({'count': pd.Series(Counter(
    word for title in papers.title for word in title.lower().split()
))})
# Display the top 80 words as four side-by-side 20-row columns.
splits = [d.reset_index(drop=True) for d in np.split(word_cnt.sort_values('count', ascending=False).head(80).reset_index(), [20, 40, 60])]
pd.concat(splits, axis=1).rename(columns={'index': 'word'})
    
    Out[9]:
In [10]:
    
# Word cloud over all title text; WordCloud removes its built-in stopwords.
cloud = WordCloud(width=1000, height=600,
                  background_color='white',
                  stopwords=set(STOPWORDS),
                  min_font_size=10).generate(" ".join(papers.title).lower())
plt.figure(figsize=(16, 8), facecolor=None)
plt.imshow(cloud)
plt.axis("off")
plt.tight_layout(pad=0)
    
    
In [ ]: