In [1]:
import numpy as np
import nltk
import pandas as pd
from sqlite3 import connect
import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline

Vector space ranking demonstration

Fetch data


In [2]:
# Open a read connection to the local NIPS-papers SQLite database.
# NOTE(review): hardcoded relative path — assumes the notebook is launched
# from a directory sibling to ../data; consider a configurable DATA_DIR.
con = connect('../data/nips-papers/database.sqlite')

In [15]:
# Load full paper text, title, and publication year for every paper.
# fetchall() returns a list of 1-tuples, so unpack the single column for
# all three queries. Previously `years` was left as raw tuples, which made
# downstream prints show "(1989,)" instead of "1989".
texts = [x[0] for x in con.execute('select paper_text from papers;').fetchall()]
titles = [x[0] for x in con.execute('select title from papers;').fetchall()]
years = [x[0] for x in con.execute('select year from papers;').fetchall()]



Preprocess and tokenize texts


In [4]:
from preprocessing import Preprocessor

In [5]:
# Tokenize/normalize every paper body with the project's Preprocessor.
# This is the slow step (~9 minutes for 6560 papers per the tqdm output
# below) — consider caching `tokenized` to disk so that
# Restart-Kernel-and-Run-All stays feasible.
prepr = Preprocessor()
tokenized = [prepr.process(text) for text in tqdm(texts)]


100%|██████████████████████████████████████████████████████████████████████████████| 6560/6560 [08:44<00:00, 10.15it/s]

Build ranker


In [105]:
# Import the project's ranking module; reload() picks up live edits to
# ranking.py during iterative development without restarting the kernel
# (alternative: `%load_ext autoreload` / `%autoreload 2` in the config cell).
from importlib import reload
import ranking
reload(ranking)
from ranking import BasicVSRanker

In [106]:
# Build a vector-space ranker over the tokenized paper bodies.
ranker = BasicVSRanker.from_tokenized(tokenized)

Example query processing


In [107]:
# Show the top-5 body-text matches for a simple query.
# NOTE(review): `years[i]` is a raw 1-tuple from fetchall(), so it prints
# as "(1989,)" — unpack the column at fetch time to clean this up.
for i in ranker.get_best_matches('neural networks', 5):
    print("{} {}".format(years[i], titles[i]))


(1989,) A Large-Scale Neural Network Which Recognizes Handwritten Kanji Characters
(1989,) Analog Neural Networks of Limited Precision I: Computing with Multilinear Threshold Functions
(1989,) Designing Application-Specific Neural Networks Using the Genetic Algorithm
(1991,) Refining PID Controllers using Neural Networks
(1994,) An experimental comparison of recurrent neural networks

In [108]:
# Top-5 body-text matches for a longer, multi-term query.
# NOTE(review): `years[i]` prints as a 1-tuple — see the fetch cell.
for i in ranker.get_best_matches('neural networks deep learning', 5):
    print("{} {}".format(years[i], titles[i]))


(2010,) Layer-wise analysis of deep networks with Gaussian kernels
(2014,) Factoring Variations in Natural Images with Deep Gaussian Mixture Models
(2016,) Deep Learning without Poor Local Minima
(2014,) Searching for Higgs Boson Decay Modes with Deep Learning
(2015,) Training Very Deep Networks

In [109]:
# Top-5 body-text matches for a short acronym query; note the ranker also
# surfaces papers using the expanded term "Principal Component Analysis"
# in the title, presumably because "pca" appears in their body text.
for i in ranker.get_best_matches('pca', 5):
    print("{} {}".format(years[i], titles[i]))


(1998,) Bayesian PCA
(2013,) Faster Ridge Regression via the Subsampled Randomized Hadamard Transform
(2013,) Robust Transfer Principal Component Analysis with Rank Constraints
(2006,) Nonnegative Sparse PCA
(2012,) Semiparametric Principal Component Analysis

In [ ]:

Example: ranking based on both the document body and the title


In [47]:
# Tokenize the paper titles with the same preprocessor (fast: ~2 s total).
tokenized_titles = [prepr.process(title) for title in tqdm(titles)]



  0%|                                                                                         | 0/6560 [00:00<?, ?it/s]

  6%|████▍                                                                        | 374/6560 [00:00<00:01, 3730.09it/s]

 12%|█████████▏                                                                   | 784/6560 [00:00<00:01, 3830.98it/s]

 18%|█████████████▉                                                              | 1205/6560 [00:00<00:01, 3934.39it/s]

 25%|██████████████████▉                                                         | 1637/6560 [00:00<00:01, 4039.39it/s]

 32%|████████████████████████▍                                                   | 2105/6560 [00:00<00:01, 4209.56it/s]

 39%|█████████████████████████████▋                                              | 2563/6560 [00:00<00:00, 4311.01it/s]

 46%|██████████████████████████████████▉                                         | 3011/6560 [00:00<00:00, 4356.97it/s]

 53%|████████████████████████████████████████                                    | 3454/6560 [00:00<00:00, 4378.09it/s]

 59%|████████████████████████████████████████████▉                               | 3877/6560 [00:00<00:00, 4329.14it/s]

 66%|█████████████████████████████████████████████████▉                          | 4305/6560 [00:01<00:00, 4310.80it/s]

 72%|██████████████████████████████████████████████████████▊                     | 4727/6560 [00:01<00:00, 4202.76it/s]

 78%|███████████████████████████████████████████████████████████▌                | 5142/6560 [00:01<00:00, 4157.82it/s]

 85%|████████████████████████████████████████████████████████████████▎           | 5556/6560 [00:01<00:00, 4149.13it/s]

 91%|█████████████████████████████████████████████████████████████████████▎      | 5979/6560 [00:01<00:00, 4170.11it/s]

 97%|██████████████████████████████████████████████████████████████████████████  | 6394/6560 [00:01<00:00, 4135.83it/s]

100%|████████████████████████████████████████████████████████████████████████████| 6560/6560 [00:01<00:00, 4201.23it/s]

In [110]:
# Second vector-space ranker built over titles only, to be blended with
# the body-text ranker below.
title_ranker = BasicVSRanker.from_tokenized(tokenized_titles)

In [111]:
def joint_best_matches(query, body_ranker, title_ranker, alpha=0.5, n=10):
    """Rank documents by a convex combination of body and title scores.

    Parameters
    ----------
    query : str
        Free-text query passed to both rankers' ``get_scores``.
    body_ranker, title_ranker : object
        Objects exposing ``get_scores(query)`` that return per-document
        score arrays of equal length.
    alpha : float, default 0.5
        Weight of the body score; the title score gets ``1 - alpha``.
    n : int, default 10
        Number of top document indices to return.

    Returns
    -------
    numpy.ndarray
        Indices of the ``n`` highest-scoring documents, best first.
    """
    scores = (body_ranker.get_scores(query) * alpha +
              title_ranker.get_scores(query) * (1 - alpha))
    # Flatten via asarray+ravel so a (1, m) matrix / 2-D score array is
    # handled correctly — the old `.flatten()[:n]` sliced *rows* of a
    # np.matrix result, silently ignoring `n` for 2-D inputs.
    flat = np.asarray(scores).ravel()
    # Negate so argsort's ascending order yields highest scores first.
    return np.argsort(-flat)[:n]

In [ ]:


In [116]:
# Blended query: 30% body score, 70% title score — note the results skew
# toward papers whose *titles* contain the query terms.
# NOTE(review): `years[i]` prints as a 1-tuple — see the fetch cell.
for i in joint_best_matches("neural networks and deep learning", ranker, title_ranker, n=5, alpha=0.3):
    print("{} {}".format(years[i], titles[i]))


(2011,) Shallow vs. Deep Sum-Product Networks
(2015,) Path-SGD: Path-Normalized Optimization in Deep Neural Networks
(2014,) Deep Symmetry Networks
(2015,) Training Very Deep Networks
(2016,) Deep Neural Networks with Inexact Matching for Person Re-Identification

In [ ]: