In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sqlite3 import connect
import nltk
from tqdm import tqdm
import os
%matplotlib inline

In [2]:
con = connect(os.path.join('..', '..', 'nips-data', 'database.sqlite'))

In [3]:
titles = [x[0] for x in con.execute('select title from papers;').fetchall()]
texts = [x[0] for x in con.execute('select paper_text from papers;').fetchall()]

In [4]:
from collections import Counter

In [5]:
def tokenize(texts):
    """Tokenize each text with nltk.word_tokenize, lowercase the tokens,
    and accumulate global token frequencies in a Counter."""
    texts_tokenized = []
    counter = Counter()
    for text in tqdm(texts):
        tokens = [x.lower() for x in nltk.word_tokenize(text)]
        texts_tokenized.append(tokens)
        counter.update(tokens)
    return texts_tokenized, counter
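
Note: nltk.word_tokenize and the stopword list used later rely on NLTK data packages ('punkt' and 'stopwords'). If they are not already installed, a one-time download along these lines should work (hedged setup sketch, not part of the original run):

In [ ]:
# One-time setup (assumption: the NLTK data packages are not yet installed).
import nltk
nltk.download('punkt')      # Punkt sentence splitter used by word_tokenize
nltk.download('stopwords')  # English stopword list used below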

Text vocabulary analysis


In [6]:
texts_tokenized, counter = tokenize(texts)


 76%|███████▌  | 4964/6560 [03:23<01:05, 24.39it/s]
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-6-188f2713c61f> in <module>()
----> 1 texts_tokenized, counter = tokenize(texts)

<ipython-input-5-56ba90ca983f> in tokenize(texts)
      4     for text in tqdm(texts):
----> 5         tokens = [x.lower() for x in nltk.word_tokenize(text)]
      6         texts_tokenized.append(tokens)

... (NLTK Punkt sentence-tokenizer frames) ...

KeyboardInterrupt: 
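
The interrupted run above spent most of its time inside NLTK's Punkt sentence splitter, as the traceback shows. If sentence boundaries are not needed, a faster pass could skip it, e.g. with word_tokenize(text, preserve_line=True) or the purely regex-based wordpunct_tokenize. The cell below is an untested sketch along those lines, not part of the original run:

In [ ]:
# Hedged sketch (assumption): a faster tokenization pass that avoids the
# Punkt sentence splitter by using the regex-based wordpunct_tokenize.
from nltk.tokenize import wordpunct_tokenize

def tokenize_fast(texts):
    texts_tokenized = []
    counter = Counter()
    for text in tqdm(texts):
        tokens = [x.lower() for x in wordpunct_tokenize(text)]
        texts_tokenized.append(tokens)
        counter.update(tokens)
    return texts_tokenized, counter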

Top 10 tokens without filtering


In [23]:
counter.most_common(10)


Out[23]:
[(',', 1960823),
 ('the', 1738830),
 ('.', 1621878),
 ('?', 1582064),
 (')', 1047467),
 ('(', 1043377),
 ('of', 886950),
 ('and', 647244),
 ('a', 642349),
 ('in', 543638)]

Top 10 tokens with length greater than 2


In [42]:
words = counter.keys()
long_words = [w for w in words if len(w) > 2]
long_counter = Counter({k: counter[k] for k in long_words})

In [43]:
long_counter.most_common(10)


Out[43]:
[('the', 1738830),
 ('and', 647244),
 ('for', 354940),
 ('that', 264697),
 ('with', 201241),
 ('this', 168342),
 ('are', 164026),
 ('can', 116736),
 ('from', 111621),
 ('which', 95596)]

Top 10 tokens with length greater than 2 and without stopwords


In [37]:
stop_words = set(nltk.corpus.stopwords.words('english'))
good_words = [w for w in words if len(w) > 2 and w not in stop_words]
good_counter = Counter({k: counter[k] for k in good_words})

In [38]:
good_counter.most_common(10)


Out[38]:
[('model', 91029),
 ('learning', 88581),
 ('data', 81327),
 ('algorithm', 71782),
 ('set', 65855),
 ('function', 60905),
 ('using', 59934),
 ('one', 51899),
 ('figure', 47961),
 ('number', 47635)]
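
The length filter already drops single-character punctuation, but numbers and multi-character symbols survive it. A stricter, purely alphabetic filter is sketched below (hedged; alpha_words and alpha_counter are illustrative names, not part of the original run):

In [ ]:
# Hedged variant: additionally require tokens to be purely alphabetic.
alpha_words = [w for w in counter if len(w) > 2 and w.isalpha() and w not in stop_words]
alpha_counter = Counter({k: counter[k] for k in alpha_words})
alpha_counter.most_common(10)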

Text length distribution analysis


In [70]:
def print_stats(array, name):
    print('Min {}: '.format(name), np.min(array))
    print('Max {}: '.format(name), np.max(array))
    print('Mean {}: '.format(name), np.mean(array))
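
The histograms below clip the x-range by hand; percentiles are one way to pick such bounds. A hedged helper along these lines (not used in the original run):

In [ ]:
# Hedged helper (assumption): report a few percentiles alongside min/max/mean,
# which makes it easier to choose histogram ranges.
def print_percentiles(array, name, qs=(5, 25, 50, 75, 95)):
    for q in qs:
        print('{}th percentile {}: '.format(q, name), np.percentile(array, q))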

Distribution of text lengths in characters


In [71]:
char_lengths = [len(text) for text in texts]

In [73]:
print_stats(char_lengths, 'length in chars')


Min length in chars:  9
Max length in chars:  123727
Mean length in chars:  27389.4207317
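
A minimum of 9 characters suggests at least one paper has essentially no extracted text. A quick, hedged way to inspect the shortest documents (variable names are illustrative, not part of the original run):

In [ ]:
# Hedged sketch: titles and lengths of the five shortest extracted texts.
shortest = np.argsort(char_lengths)[:5]
[(titles[i], char_lengths[i]) for i in shortest]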

In [59]:
plt.subplots(figsize=(10, 8))
plt.hist(char_lengths, bins=40, range=(0, 60000));
plt.xlabel('Text length (chars)')


Out[59]:
<matplotlib.text.Text at 0x15fc11054a8>

Distribution of text lengths in unfiltered tokens


In [31]:
token_lengths = [len(tokenized_text) for tokenized_text in texts_tokenized]

In [74]:
print_stats(token_lengths, 'length in unfiltered tokens')


Min length in unfiltered tokens:  0
Max length in unfiltered tokens:  27072
Mean length in unfiltered tokens:  5897.45045732

In [56]:
plt.subplots(figsize=(10, 8))
plt.hist(token_lengths, bins=40, range=(0, 15000));
plt.xlabel('Text length (tokens)')


Out[56]:
<matplotlib.text.Text at 0x15fc0fca940>

Distribution of text lengths in filtered tokens


In [39]:
texts_tokenized_filtered = []
for tokenized_text in texts_tokenized:
    texts_tokenized_filtered.append([t for t in tokenized_text if len(t) > 2 and t not in stop_words])

In [40]:
token_lengths_filtered = [len(tokenized_text) for tokenized_text in texts_tokenized_filtered]

In [75]:
print_stats(token_lengths_filtered, 'length in filtered tokens')


Min length in filtered tokens:  0
Max length in filtered tokens:  10900
Mean length in filtered tokens:  2298.52088415

In [55]:
plt.subplots(figsize=(10, 8))
plt.hist(token_lengths_filtered, bins=40, range=(0, 6000));
plt.xlabel('Text length (filtered tokens)')


Out[55]:
<matplotlib.text.Text at 0x15fc0f462b0>

Title vocabulary analysis


In [48]:
titles_tokenized, title_counter = tokenize(titles)


100%|████████████████████████████████████████████████████████████████████████████| 6560/6560 [00:00<00:00, 7321.42it/s]

Top 10 tokens without filtering


In [49]:
title_counter.most_common(10)


Out[49]:
[('for', 1846),
 ('of', 1714),
 ('learning', 1373),
 ('and', 1190),
 ('a', 1086),
 ('in', 987),
 ('the', 897),
 ('with', 884),
 (':', 753),
 ('neural', 580)]

Top 10 tokens after filtering


In [50]:
good_title_words = [w for w in title_counter.keys() if len(w) > 2 and w not in stop_words]
filtered_title_counter = Counter({k: title_counter[k] for k in good_title_words})

In [54]:
filtered_title_counter.most_common(10)


Out[54]:
[('learning', 1373),
 ('neural', 580),
 ('networks', 534),
 ('models', 448),
 ('using', 433),
 ('model', 360),
 ('bayesian', 267),
 ('network', 243),
 ('inference', 238),
 ('data', 237)]

Title length distribution analysis

Distribution of title lengths in characters


In [62]:
title_lengths_chars = [len(title) for title in titles]

In [76]:
print_stats(title_lengths_chars, 'length in chars')


Min length in chars:  5
Max length in chars:  156
Mean length in chars:  60.2858231707

In [77]:
plt.subplots(figsize=(10, 8))
plt.hist(title_lengths_chars, bins=20);
plt.xlabel('Title length (chars)')


Out[77]:
<matplotlib.text.Text at 0x15fc0f5d0b8>

Distribution of title lengths in unfiltered tokens


In [63]:
title_lengths_tokens = [len(title) for title in titles_tokenized]

In [78]:
print_stats(title_lengths_tokens, 'length in unfiltered tokens')


Min length in unfiltered tokens:  1
Max length in unfiltered tokens:  23
Mean length in unfiltered tokens:  7.83582317073

In [80]:
plt.subplots(figsize=(10, 8))
plt.hist(title_lengths_tokens, bins=20);
plt.xlabel('Title length (unfiltered tokens)')


Out[80]:
<matplotlib.text.Text at 0x15f02741c88>

Distribution of title lengths in filtered tokens


In [83]:
titles_tokenized_filtered = []
for tokenized_title in titles_tokenized:
    titles_tokenized_filtered.append([t for t in tokenized_title if len(t) > 2 and t not in stop_words])

In [84]:
title_lengths_tokens_filtered = [len(title) for title in titles_tokenized_filtered]

In [85]:
print_stats(title_lengths_tokens_filtered, 'length in filtered tokens')


Min length in filtered tokens:  1
Max length in filtered tokens:  15
Mean length in filtered tokens:  5.94451219512

In [88]:
plt.subplots(figsize=(10, 8))
plt.hist(title_lengths_tokens_filtered, bins=15);
plt.xlabel('Title length (filtered tokens)')


Out[88]:
<matplotlib.text.Text at 0x15f03a0ecc0>

In [ ]: