In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sqlite3 import connect
%matplotlib inline

In [2]:
con = connect('../data/nips-papers/database.sqlite')
papers_df = pd.read_sql_query('select * from papers;', con, index_col='id')
authors_df = pd.read_sql_query('select * from authors;', con, index_col='id')

Basic papers statistics


In [3]:
papers_df.head()


Out[3]:
year title event_type pdf_name abstract paper_text
id
1 1987 Self-Organization of Associative Database and ... 1-self-organization-of-associative-database-an... Abstract Missing 767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
2 1987 The Capacity of the Kanerva Associative Memory... 2-the-capacity-of-the-kanerva-associative-memo... Abstract Missing 184\n\nTHE CAPACITY OF THE KANERVA ASSOCIATIVE...
3 1987 Supervised Learning of Probability Distributio... 3-supervised-learning-of-probability-distribut... Abstract Missing 52\n\nSupervised Learning of Probability Distr...
4 1987 Constrained Differential Optimization 4-constrained-differential-optimization.pdf Abstract Missing 612\n\nConstrained Differential Optimization\n...
5 1987 Towards an Organizing Principle for a Layered ... 5-towards-an-organizing-principle-for-a-layere... Abstract Missing 485\n\nTOWARDS AN ORGANIZING PRINCIPLE FOR\nA ...

In [4]:
papers_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 6560 entries, 1 to 6603
Data columns (total 6 columns):
year          6560 non-null int64
title         6560 non-null object
event_type    6560 non-null object
pdf_name      6560 non-null object
abstract      6560 non-null object
paper_text    6560 non-null object
dtypes: int64(1), object(5)
memory usage: 358.8+ KB

event_type statistics


In [5]:
print(papers_df.event_type.unique())
papers_df.event_type.value_counts()


['' 'Oral' 'Spotlight' 'Poster']
Out[5]:
             4819
Poster       1505
Spotlight     181
Oral           55
Name: event_type, dtype: int64

abstract statistics


In [6]:
papers_df.abstract.describe()


Out[6]:
count                 6560
unique                3244
top       Abstract Missing
freq                  3317
Name: abstract, dtype: object

In [7]:
sum(papers_df.abstract == 'Abstract Missing')


Out[7]:
3317

title statistics


In [8]:
papers_df.title.describe()


Out[8]:
count                                    6560
unique                                   6560
top       Classification by Pairwise Coupling
freq                                        1
Name: title, dtype: object

year statistics


In [9]:
print(papers_df.year.min())
print(papers_df.year.max())


1987
2016

In [17]:
_, ax = plt.subplots(figsize=(10, 8))
sb.countplot(papers_df.year, ax=ax)
plt.xticks(rotation=90)


Out[17]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
 <a list of 30 Text xticklabel objects>)

Basic authors statistics


In [84]:
authors_df.head()


Out[84]:
name
id
1 Hisashi Suzuki
2 Suguru Arimoto
3 Philip A. Chou
4 John C. Platt
5 Alan H. Barr

In [104]:
authors_df.name.describe()


Out[104]:
count             8653
unique            8595
top       Stella X. Yu
freq                 3
Name: name, dtype: object

Non-uniqe author entries


In [108]:
pd.read_sql_query('''select name, count(name) name_count from authors
group by name having count(name) > 1 order by count(name) desc;''', con)


Out[108]:
name name_count
0 Daniel D. Lee 3
1 Stella X. Yu 3
2 Alex J. Smola 2
3 Alexander T. Ihler 2
4 Andrew Y. Ng 2
5 Angela J. Yu 2
6 Arian Maleki 2
7 Brendan Frey 2
8 C. A. Mead 2
9 Carl Edward Rasmussen 2
10 Chris M. Bishop 2
11 Christopher J. C. Burges 2
12 Christopher K. I. Williams 2
13 Chuong B. Do 2
14 David B. Dunson 2
15 David E. Carlson 2
16 David Hsu 2
17 Dean P. Foster 2
18 Eric P. Xing 2
19 F. Fallside 2
20 Inderjit S. Dhillon 2
21 Jan R. Peters 2
22 Jeff Schneider 2
23 Jianbo Shi 2
24 Jo?o F. Henriques 2
25 Joachim M. Buhmann 2
26 Joaquin Qui?onero Candela 2
27 Jonathan W. Pillow 2
28 Joseph L. Austerweil 2
29 Jun Zhu 2
30 Kinh Tieu 2
31 Lenka Zdeborov? 2
32 Martin J. Wainwright 2
33 Matthias W. Seeger 2
34 Matus J. Telgarsky 2
35 Michael I. Jordan 2
36 Miguel L?zaro-Gredilla 2
37 Nando de Freitas 2
38 Peter L. Bartlett 2
39 Pradeep K. Ravikumar 2
40 Quoc V. Le 2
41 Rajesh P. N. Rao 2
42 Rodney Goodman 2
43 Roman Garnett 2
44 Sham M. Kakade 2
45 Shih-Chii Liu 2
46 Stuart J. Russell 2
47 Sung Ju Hwang 2
48 T.J. Sejnowski 2
49 Tomaso A. Poggio 2
50 V. P. Roychowdhury 2
51 Vikash K. Mansinghka 2
52 Wei Zhang 2
53 Xiuwen Liu 2
54 Xuejun Liao 2
55 Yee-Chun Lee 2

Basic paper_authors statistics


In [113]:
pa_df = pd.read_sql_query('select * from paper_authors;', con)

In [114]:
pa_df.head()


Out[114]:
id paper_id author_id
0 1 63 94
1 2 80 124
2 3 80 125
3 4 80 126
4 5 80 127

Top 20 authors by article count


In [133]:
pd.read_sql_query('''select author_id, authors.name, count(paper_id)
from paper_authors join authors on author_id = authors.id
group by author_id order by count(paper_id) desc limit 20;
''', con)


Out[133]:
author_id name count(paper_id)
0 330 Michael I. Jordan 79
1 1472 Prof. Bernhard Sch?lkopf 59
2 121 Geoffrey E. Hinton 57
3 178 Yoshua Bengio 56
4 1020 Zoubin Ghahramani 50
5 54 Terrence J. Sejnowski 48
6 632 Peter Dayan 47
7 1410 Alex J. Smola 40
8 191 Michael C. Mozer 38
9 1853 Andrew Y. Ng 38
10 41 Christof Koch 35
11 1282 Klaus-Robert M?ller 35
12 2816 Ruslan R. Salakhutdinov 33
13 1188 Joshua B. Tenenbaum 31
14 1901 Tong Zhang 31
15 3001 Lawrence Carin 31
16 2044 Thomas L. Griffiths 30
17 976 Yoram Singer 29
18 709 Manfred Opper 28
19 937 Alan L. Yuille 28

In [ ]: