In [1]:

    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sqlite3 import connect
%matplotlib inline



In [2]:

    
con = connect('../data/nips-papers/database.sqlite')
papers_df = pd.read_sql_query('select * from papers;', con, index_col='id')
authors_df = pd.read_sql_query('select * from authors;', con, index_col='id')

Basic papers statistics



In [3]:

    
papers_df.head()









    Out[3]:







  
    
      
      year
      title
      event_type
      pdf_name
      abstract
      paper_text
    
    
      id
      
      
      
      
      
      
    
  
  
    
      1
      1987
      Self-Organization of Associative Database and ...
      
      1-self-organization-of-associative-database-an...
      Abstract Missing
      767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
    
    
      2
      1987
      The Capacity of the Kanerva Associative Memory...
      
      2-the-capacity-of-the-kanerva-associative-memo...
      Abstract Missing
      184\n\nTHE CAPACITY OF THE KANERVA ASSOCIATIVE...
    
    
      3
      1987
      Supervised Learning of Probability Distributio...
      
      3-supervised-learning-of-probability-distribut...
      Abstract Missing
      52\n\nSupervised Learning of Probability Distr...
    
    
      4
      1987
      Constrained Differential Optimization
      
      4-constrained-differential-optimization.pdf
      Abstract Missing
      612\n\nConstrained Differential Optimization\n...
    
    
      5
      1987
      Towards an Organizing Principle for a Layered ...
      
      5-towards-an-organizing-principle-for-a-layere...
      Abstract Missing
      485\n\nTOWARDS AN ORGANIZING PRINCIPLE FOR\nA ...



In [4]:

    
papers_df.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 6560 entries, 1 to 6603
Data columns (total 6 columns):
year          6560 non-null int64
title         6560 non-null object
event_type    6560 non-null object
pdf_name      6560 non-null object
abstract      6560 non-null object
paper_text    6560 non-null object
dtypes: int64(1), object(5)
memory usage: 358.8+ KB

event_type statistics



In [5]:

    
print(papers_df.event_type.unique())
papers_df.event_type.value_counts()









    



['' 'Oral' 'Spotlight' 'Poster']






    Out[5]:





             4819
Poster       1505
Spotlight     181
Oral           55
Name: event_type, dtype: int64

abstract statistics



In [6]:

    
papers_df.abstract.describe()









    Out[6]:





count                 6560
unique                3244
top       Abstract Missing
freq                  3317
Name: abstract, dtype: object



In [7]:

    
sum(papers_df.abstract == 'Abstract Missing')









    Out[7]:





3317

title statistics



In [8]:

    
papers_df.title.describe()









    Out[8]:





count                                    6560
unique                                   6560
top       Classification by Pairwise Coupling
freq                                        1
Name: title, dtype: object

year statistics



In [9]:

    
print(papers_df.year.min())
print(papers_df.year.max())



In [17]:

    
_, ax = plt.subplots(figsize=(10, 8))
sb.countplot(papers_df.year, ax=ax)
plt.xticks(rotation=90)









    Out[17]:





(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
 <a list of 30 Text xticklabel objects>)

Basic authors statistics



In [84]:

    
authors_df.head()









    Out[84]:







  
    
      
      name
    
    
      id
      
    
  
  
    
      1
      Hisashi Suzuki
    
    
      2
      Suguru Arimoto
    
    
      3
      Philip A. Chou
    
    
      4
      John C. Platt
    
    
      5
      Alan H. Barr



In [104]:

    
authors_df.name.describe()









    Out[104]:





count             8653
unique            8595
top       Stella X. Yu
freq                 3
Name: name, dtype: object

Non-uniqe author entries



In [108]:

    
pd.read_sql_query('''select name, count(name) name_count from authors
group by name having count(name) > 1 order by count(name) desc;''', con)









    Out[108]:







  
    
      
      name
      name_count
    
  
  
    
      0
      Daniel D. Lee
      3
    
    
      1
      Stella X. Yu
      3
    
    
      2
      Alex J. Smola
      2
    
    
      3
      Alexander T. Ihler
      2
    
    
      4
      Andrew Y. Ng
      2
    
    
      5
      Angela J. Yu
      2
    
    
      6
      Arian Maleki
      2
    
    
      7
      Brendan Frey
      2
    
    
      8
      C. A. Mead
      2
    
    
      9
      Carl Edward Rasmussen
      2
    
    
      10
      Chris M. Bishop
      2
    
    
      11
      Christopher J. C. Burges
      2
    
    
      12
      Christopher K. I. Williams
      2
    
    
      13
      Chuong B. Do
      2
    
    
      14
      David B. Dunson
      2
    
    
      15
      David E. Carlson
      2
    
    
      16
      David Hsu
      2
    
    
      17
      Dean P. Foster
      2
    
    
      18
      Eric P. Xing
      2
    
    
      19
      F. Fallside
      2
    
    
      20
      Inderjit S. Dhillon
      2
    
    
      21
      Jan R. Peters
      2
    
    
      22
      Jeff Schneider
      2
    
    
      23
      Jianbo Shi
      2
    
    
      24
      Jo?o F. Henriques
      2
    
    
      25
      Joachim M. Buhmann
      2
    
    
      26
      Joaquin Qui?onero Candela
      2
    
    
      27
      Jonathan W. Pillow
      2
    
    
      28
      Joseph L. Austerweil
      2
    
    
      29
      Jun Zhu
      2
    
    
      30
      Kinh Tieu
      2
    
    
      31
      Lenka Zdeborov?
      2
    
    
      32
      Martin J. Wainwright
      2
    
    
      33
      Matthias W. Seeger
      2
    
    
      34
      Matus J. Telgarsky
      2
    
    
      35
      Michael I. Jordan
      2
    
    
      36
      Miguel L?zaro-Gredilla
      2
    
    
      37
      Nando de Freitas
      2
    
    
      38
      Peter L. Bartlett
      2
    
    
      39
      Pradeep K. Ravikumar
      2
    
    
      40
      Quoc V. Le
      2
    
    
      41
      Rajesh P. N. Rao
      2
    
    
      42
      Rodney Goodman
      2
    
    
      43
      Roman Garnett
      2
    
    
      44
      Sham M. Kakade
      2
    
    
      45
      Shih-Chii Liu
      2
    
    
      46
      Stuart J. Russell
      2
    
    
      47
      Sung Ju Hwang
      2
    
    
      48
      T.J. Sejnowski
      2
    
    
      49
      Tomaso A. Poggio
      2
    
    
      50
      V. P. Roychowdhury
      2
    
    
      51
      Vikash K. Mansinghka
      2
    
    
      52
      Wei Zhang
      2
    
    
      53
      Xiuwen Liu
      2
    
    
      54
      Xuejun Liao
      2
    
    
      55
      Yee-Chun Lee
      2

Basic paper_authors statistics



In [113]:

    
pa_df = pd.read_sql_query('select * from paper_authors;', con)



In [114]:

    
pa_df.head()

	id	paper_id	author_id
0	1	63	94
1	2	80	124
2	3	80	125
3	4	80	126
4	5	80	127

	author_id	name	count(paper_id)
0	330	Michael I. Jordan	79
1	1472	Prof. Bernhard Sch?lkopf	59
2	121	Geoffrey E. Hinton	57
3	178	Yoshua Bengio	56
4	1020	Zoubin Ghahramani	50
5	54	Terrence J. Sejnowski	48
6	632	Peter Dayan	47
7	1410	Alex J. Smola	40
8	191	Michael C. Mozer	38
9	1853	Andrew Y. Ng	38
10	41	Christof Koch	35
11	1282	Klaus-Robert M?ller	35
12	2816	Ruslan R. Salakhutdinov	33
13	1188	Joshua B. Tenenbaum	31
14	1901	Tong Zhang	31
15	3001	Lawrence Carin	31
16	2044	Thomas L. Griffiths	30
17	976	Yoram Singer	29
18	709	Manfred Opper	28
19	937	Alan L. Yuille	28

	year	title	pdf_name	abstract	paper_text
id
1	1987	Self-Organization of Associative Database and ...	1-self-organization-of-associative-database-an...	Abstract Missing	767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
2	1987	The Capacity of the Kanerva Associative Memory...	2-the-capacity-of-the-kanerva-associative-memo...	Abstract Missing	184\n\nTHE CAPACITY OF THE KANERVA ASSOCIATIVE...
3	1987	Supervised Learning of Probability Distributio...	3-supervised-learning-of-probability-distribut...	Abstract Missing	52\n\nSupervised Learning of Probability Distr...
4	1987	Constrained Differential Optimization	4-constrained-differential-optimization.pdf	Abstract Missing	612\n\nConstrained Differential Optimization\n...
5	1987	Towards an Organizing Principle for a Layered ...	5-towards-an-organizing-principle-for-a-layere...	Abstract Missing	485\n\nTOWARDS AN ORGANIZING PRINCIPLE FOR\nA ...

	name
id
1	Hisashi Suzuki
2	Suguru Arimoto
3	Philip A. Chou
4	John C. Platt
5	Alan H. Barr

	name	name_count
0	Daniel D. Lee	3
1	Stella X. Yu	3
2	Alex J. Smola	2
3	Alexander T. Ihler	2
4	Andrew Y. Ng	2
5	Angela J. Yu	2
6	Arian Maleki	2
7	Brendan Frey	2
8	C. A. Mead	2
9	Carl Edward Rasmussen	2
10	Chris M. Bishop	2
11	Christopher J. C. Burges	2
12	Christopher K. I. Williams	2
13	Chuong B. Do	2
14	David B. Dunson	2
15	David E. Carlson	2
16	David Hsu	2
17	Dean P. Foster	2
18	Eric P. Xing	2
19	F. Fallside	2
20	Inderjit S. Dhillon	2
21	Jan R. Peters	2
22	Jeff Schneider	2
23	Jianbo Shi	2
24	Jo?o F. Henriques	2
25	Joachim M. Buhmann	2
26	Joaquin Qui?onero Candela	2
27	Jonathan W. Pillow	2
28	Joseph L. Austerweil	2
29	Jun Zhu	2
30	Kinh Tieu	2
31	Lenka Zdeborov?	2
32	Martin J. Wainwright	2
33	Matthias W. Seeger	2
34	Matus J. Telgarsky	2
35	Michael I. Jordan	2
36	Miguel L?zaro-Gredilla	2
37	Nando de Freitas	2
38	Peter L. Bartlett	2
39	Pradeep K. Ravikumar	2
40	Quoc V. Le	2
41	Rajesh P. N. Rao	2
42	Rodney Goodman	2
43	Roman Garnett	2
44	Sham M. Kakade	2
45	Shih-Chii Liu	2
46	Stuart J. Russell	2
47	Sung Ju Hwang	2
48	T.J. Sejnowski	2
49	Tomaso A. Poggio	2
50	V. P. Roychowdhury	2
51	Vikash K. Mansinghka	2
52	Wei Zhang	2
53	Xiuwen Liu	2
54	Xuejun Liao	2
55	Yee-Chun Lee	2

Basic papers statistics

event_type statistics

abstract statistics

title statistics

year statistics

Basic authors statistics

Non-uniqe author entries

Basic paper_authors statistics

Top 20 authors by article count