In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sqlite3 import connect
%matplotlib inline
In [2]:
# Open the SQLite database and load the `papers` and `authors` tables,
# each indexed by its `id` column. `con` stays open because later cells
# run further queries through it.
con = connect('../data/nips-papers/database.sqlite')

papers_sql = 'select * from papers;'
authors_sql = 'select * from authors;'

papers_df = pd.read_sql_query(sql=papers_sql, con=con, index_col='id')
authors_df = pd.read_sql_query(sql=authors_sql, con=con, index_col='id')
In [3]:
# Preview the first rows of the papers table.
papers_df.head()
Out[3]:
In [4]:
# Print a structural summary of the papers frame (columns, dtypes, non-null counts).
papers_df.info()
In [5]:
# Inspect the `event_type` column: print its distinct values first, then
# display how often each one occurs. value_counts() leaves out missing
# values by default, so the two views can differ when nulls are present.
event_types = papers_df['event_type']
print(event_types.unique())
event_types.value_counts()
Out[5]:
In [6]:
# Summary statistics for the `abstract` column.
papers_df.abstract.describe()
Out[6]:
In [7]:
# Count the papers whose abstract is the literal placeholder
# 'Abstract Missing'. Uses the vectorized Series.sum() on the boolean mask
# instead of the Python builtin sum(), which would iterate row by row.
papers_df.abstract.eq('Abstract Missing').sum()
Out[7]:
In [8]:
# Summary statistics for the `title` column.
papers_df.title.describe()
Out[8]:
In [9]:
# Print the range of the `year` column: smallest value on the first line,
# largest on the second.
years = papers_df['year']
print(years.min(), years.max(), sep='\n')
In [17]:
# Bar chart of the number of papers per year.
_, ax = plt.subplots(figsize=(10, 8))
# Pass the column as `x=` explicitly. Supplying it positionally was
# deprecated in seaborn 0.11, and from 0.12 on the only positional
# parameter is `data`, so the Series would no longer be treated as `x`.
sb.countplot(x=papers_df.year, ax=ax)
ax.set_title('Number of papers per year')
# Rotate the year labels so they do not overlap.
plt.xticks(rotation=90)
Out[17]:
In [84]:
# Preview the first rows of the authors table.
authors_df.head()
Out[84]:
In [104]:
# Summary statistics for the author `name` column.
authors_df.name.describe()
Out[104]:
In [108]:
# Author names that appear on more than one row of the `authors` table,
# most frequent first. Each returned row is a name plus its row count.
duplicate_names_sql = '''select name, count(name) name_count from authors
group by name having count(name) > 1 order by count(name) desc;'''
pd.read_sql_query(sql=duplicate_names_sql, con=con)
Out[108]:
In [113]:
# Load the whole `paper_authors` table; the frame keeps pandas' default
# integer index because no index column is specified.
paper_authors_sql = 'select * from paper_authors;'
pa_df = pd.read_sql_query(sql=paper_authors_sql, con=con)
In [114]:
# Preview the first rows of the paper_authors table.
pa_df.head()
Out[114]:
In [133]:
# The 20 authors with the most papers: join paper_authors to authors to get
# each author's name, count paper rows per author_id, and sort descending.
# The count is aliased as `paper_count` so the result column has a readable
# label, matching the `name_count` alias used in the duplicate-names query.
pd.read_sql_query('''select author_id, authors.name, count(paper_id) paper_count
from paper_authors join authors on author_id = authors.id
group by author_id order by count(paper_id) desc limit 20;
''', con)
Out[133]:
In [ ]: