In [1]:
import numpy as np
import nltk
import pandas as pd
from sqlite3 import connect
import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline

Vector space ranking demonstration

Fetch data


In [2]:
# Open a read connection to the local NIPS-papers SQLite database.
# NOTE(review): hardcoded relative path — assumes the notebook is launched
# from a directory sibling to ../data; consider a configurable DATA_DIR.
con = connect('../data/nips-papers/database.sqlite')

In [15]:
# Load full paper text, title, and publication year for every paper.
# fetchall() returns a list of 1-tuples, so unpack the single column for
# all three queries. Previously `years` was left as raw tuples, which made
# downstream prints show "(1989,)" instead of "1989".
texts = [x[0] for x in con.execute('select paper_text from papers;').fetchall()]
titles = [x[0] for x in con.execute('select title from papers;').fetchall()]
years = [x[0] for x in con.execute('select year from papers;').fetchall()]



Preprocess and tokenize texts


In [4]:
from preprocessing import Preprocessor

In [5]:
# Tokenize/normalize every paper body with the project's Preprocessor.
# This is the slow step (~9 minutes for 6560 papers per the tqdm output
# below) — consider caching `tokenized` to disk so that
# Restart-Kernel-and-Run-All stays feasible.
prepr = Preprocessor()
tokenized = [prepr.process(text) for text in tqdm(texts)]


100%|██████████████████████████████████████████████████████████████████████████████| 6560/6560 [08:44<00:00, 10.15it/s]

Build ranker


In [105]:
# Import the project's ranking module; reload() picks up live edits to
# ranking.py during iterative development without restarting the kernel
# (alternative: `%load_ext autoreload` / `%autoreload 2` in the config cell).
from importlib import reload
import ranking
reload(ranking)
from ranking import BasicVSRanker

In [106]:
# Build a vector-space ranker over the tokenized paper bodies.
ranker = BasicVSRanker.from_tokenized(tokenized)

Example query processing


In [107]:
# Show the top-5 body-text matches for a simple query.
# NOTE(review): `years[i]` is a raw 1-tuple from fetchall(), so it prints
# as "(1989,)" — unpack the column at fetch time to clean this up.
for i in ranker.get_best_matches('neural networks', 5):
    print("{} {}".format(years[i], titles[i]))


(1989,) A Large-Scale Neural Network Which Recognizes Handwritten Kanji Characters
(1989,) Analog Neural Networks of Limited Precision I: Computing with Multilinear Threshold Functions
(1989,) Designing Application-Specific Neural Networks Using the Genetic Algorithm
(1991,) Refining PID Controllers using Neural Networks
(1994,) An experimental comparison of recurrent neural networks

In [108]:
# Top-5 body-text matches for a longer, multi-term query.
# NOTE(review): `years[i]` prints as a 1-tuple — see the fetch cell.
for i in ranker.get_best_matches('neural networks deep learning', 5):
    print("{} {}".format(years[i], titles[i]))


(2010,) Layer-wise analysis of deep networks with Gaussian kernels
(2014,) Factoring Variations in Natural Images with Deep Gaussian Mixture Models
(2016,) Deep Learning without Poor Local Minima
(2014,) Searching for Higgs Boson Decay Modes with Deep Learning
(2015,) Training Very Deep Networks

In [109]:
# Top-5 body-text matches for a short acronym query; note the ranker also
# surfaces papers using the expanded term "Principal Component Analysis"
# in the title, presumably because "pca" appears in their body text.
for i in ranker.get_best_matches('pca', 5):
    print("{} {}".format(years[i], titles[i]))


(1998,) Bayesian PCA
(2013,) Faster Ridge Regression via the Subsampled Randomized Hadamard Transform
(2013,) Robust Transfer Principal Component Analysis with Rank Constraints
(2006,) Nonnegative Sparse PCA
(2012,) Semiparametric Principal Component Analysis

In [ ]:

Example: ranking based on both the document body and the title


In [47]:
# Tokenize the paper titles with the same preprocessor (fast: ~2 s total).
tokenized_titles = [prepr.process(title) for title in tqdm(titles)]



  0%|                                                                                         | 0/6560 [00:00<?, ?it/s]

  6%|████▍                                                                        | 374/6560 [00:00<00:01, 3730.09it/s]

 12%|█████████▏                                                                   | 784/6560 [00:00<00:01, 3830.98it/s]

 18%|█████████████▉                                                              | 1205/6560 [00:00<00:01, 3934.39it/s]

 25%|██████████████████▉                                                         | 1637/6560 [00:00<00:01, 4039.39it/s]

 32%|████████████████████████▍                                                   | 2105/6560 [00:00<00:01, 4209.56it/s]

 39%|█████████████████████████████▋                                              | 2563/6560 [00:00<00:00, 4311.01it/s]

 46%|██████████████████████████████████▉                                         | 3011/6560 [00:00<00:00, 4356.97it/s]

 53%|████████████████████████████████████████                                    | 3454/6560 [00:00<00:00, 4378.09it/s]

 59%|████████████████████████████████████████████▉                               | 3877/6560 [00:00<00:00, 4329.14it/s]

 66%|█████████████████████████████████████████████████▉                          | 4305/6560 [00:01<00:00, 4310.80it/s]

 72%|██████████████████████████████████████████████████████▊                     | 4727/6560 [00:01<00:00, 4202.76it/s]

 78%|███████████████████████████████████████████████████████████▌                | 5142/6560 [00:01<00:00, 4157.82it/s]

 85%|████████████████████████████████████████████████████████████████▎           | 5556/6560 [00:01<00:00, 4149.13it/s]

 91%|█████████████████████████████████████████████████████████████████████▎      | 5979/6560 [00:01<00:00, 4170.11it/s]

 97%|██████████████████████████████████████████████████████████████████████████  | 6394/6560 [00:01<00:00, 4135.83it/s]

100%|████████████████████████████████████████████████████████████████████████████| 6560/6560 [00:01<00:00, 4201.23it/s]

In [110]:
# Second vector-space ranker built over titles only, to be blended with
# the body-text ranker below.
title_ranker = BasicVSRanker.from_tokenized(tokenized_titles)

In [111]:
def joint_best_matches(query, body_ranker, title_ranker, alpha=0.5, n=10):
    """Rank documents by a convex combination of body and title scores.

    Parameters
    ----------
    query : str
        Free-text query passed to both rankers' ``get_scores``.
    body_ranker, title_ranker : object
        Objects exposing ``get_scores(query)`` that return per-document
        score arrays of equal length.
    alpha : float, default 0.5
        Weight of the body score; the title score gets ``1 - alpha``.
    n : int, default 10
        Number of top document indices to return.

    Returns
    -------
    numpy.ndarray
        Indices of the ``n`` highest-scoring documents, best first.
    """
    scores = (body_ranker.get_scores(query) * alpha +
              title_ranker.get_scores(query) * (1 - alpha))
    # Flatten via asarray+ravel so a (1, m) matrix / 2-D score array is
    # handled correctly — the old `.flatten()[:n]` sliced *rows* of a
    # np.matrix result, silently ignoring `n` for 2-D inputs.
    flat = np.asarray(scores).ravel()
    # Negate so argsort's ascending order yields highest scores first.
    return np.argsort(-flat)[:n]

In [ ]:


In [116]:
# Blended query: 30% body score, 70% title score — note the results skew
# toward papers whose *titles* contain the query terms.
# NOTE(review): `years[i]` prints as a 1-tuple — see the fetch cell.
for i in joint_best_matches("neural networks and deep learning", ranker, title_ranker, n=5, alpha=0.3):
    print("{} {}".format(years[i], titles[i]))


(2011,) Shallow vs. Deep Sum-Product Networks
(2015,) Path-SGD: Path-Normalized Optimization in Deep Neural Networks
(2014,) Deep Symmetry Networks
(2015,) Training Very Deep Networks
(2016,) Deep Neural Networks with Inexact Matching for Person Re-Identification

In [ ]: