In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sqlite3 import connect
import nltk
from tqdm import tqdm
import os
%matplotlib inline

In [2]:
con = connect(os.path.join('..', '..', 'nips-data', 'database.sqlite'))

In [3]:
titles = [x[0] for x in con.execute('select title from papers;').fetchall()]
texts = [x[0] for x in con.execute('select paper_text from papers;').fetchall()]

In [4]:
from collections import Counter

In [5]:
def tokenize(texts):
    """Tokenize each text with nltk.word_tokenize, lowercase the tokens,
    and accumulate global token frequencies in a Counter."""
    texts_tokenized = []
    counter = Counter()
    for text in tqdm(texts):
        tokens = [x.lower() for x in nltk.word_tokenize(text)]
        texts_tokenized.append(tokens)
        counter.update(tokens)
    return texts_tokenized, counter
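
Note: nltk.word_tokenize and the stopword list used later rely on NLTK data packages ('punkt' and 'stopwords'). If they are not already installed, a one-time download along these lines should work (hedged setup sketch, not part of the original run):

In [ ]:
# One-time setup (assumption: the NLTK data packages are not yet installed).
import nltk
nltk.download('punkt')      # Punkt sentence splitter used by word_tokenize
nltk.download('stopwords')  # English stopword list used below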

Text vocabulary analysis


In [6]:
texts_tokenized, counter = tokenize(texts)


 76%|███████▌  | 4964/6560 [03:23<01:05, 24.39it/s]
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-6-188f2713c61f> in <module>()
----> 1 texts_tokenized, counter = tokenize(texts)

<ipython-input-5-56ba90ca983f> in tokenize(texts)
      4     for text in tqdm(texts):
----> 5         tokens = [x.lower() for x in nltk.word_tokenize(text)]
      6         texts_tokenized.append(tokens)

... (NLTK Punkt sentence-tokenizer frames) ...

KeyboardInterrupt: 
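
The interrupted run above spent most of its time inside NLTK's Punkt sentence splitter, as the traceback shows. If sentence boundaries are not needed, a faster pass could skip it, e.g. with word_tokenize(text, preserve_line=True) or the purely regex-based wordpunct_tokenize. The cell below is an untested sketch along those lines, not part of the original run:

In [ ]:
# Hedged sketch (assumption): a faster tokenization pass that avoids the
# Punkt sentence splitter by using the regex-based wordpunct_tokenize.
from nltk.tokenize import wordpunct_tokenize

def tokenize_fast(texts):
    texts_tokenized = []
    counter = Counter()
    for text in tqdm(texts):
        tokens = [x.lower() for x in wordpunct_tokenize(text)]
        texts_tokenized.append(tokens)
        counter.update(tokens)
    return texts_tokenized, counter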

Top 10 tokens without filtering


In [23]:
counter.most_common(10)


Out[23]:
[(',', 1960823),
 ('the', 1738830),
 ('.', 1621878),
 ('?', 1582064),
 (')', 1047467),
 ('(', 1043377),
 ('of', 886950),
 ('and', 647244),
 ('a', 642349),
 ('in', 543638)]

Top 10 tokens with length greater than 2


In [42]:
words = counter.keys()
long_words = [w for w in words if len(w) > 2]
long_counter = Counter({k: counter[k] for k in long_words})

In [43]:
long_counter.most_common(10)


Out[43]:
[('the', 1738830),
 ('and', 647244),
 ('for', 354940),
 ('that', 264697),
 ('with', 201241),
 ('this', 168342),
 ('are', 164026),
 ('can', 116736),
 ('from', 111621),
 ('which', 95596)]

Top 10 tokens with length greater than 2 and without stopwords


In [37]:
stop_words = set(nltk.corpus.stopwords.words('english'))
good_words = [w for w in words if len(w) > 2 and w not in stop_words]
good_counter = Counter({k: counter[k] for k in good_words})

In [38]:
good_counter.most_common(10)


Out[38]:
[('model', 91029),
 ('learning', 88581),
 ('data', 81327),
 ('algorithm', 71782),
 ('set', 65855),
 ('function', 60905),
 ('using', 59934),
 ('one', 51899),
 ('figure', 47961),
 ('number', 47635)]
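
The length filter already drops single-character punctuation, but numbers and multi-character symbols survive it. A stricter, purely alphabetic filter is sketched below (hedged; alpha_words and alpha_counter are illustrative names, not part of the original run):

In [ ]:
# Hedged variant: additionally require tokens to be purely alphabetic.
alpha_words = [w for w in counter if len(w) > 2 and w.isalpha() and w not in stop_words]
alpha_counter = Counter({k: counter[k] for k in alpha_words})
alpha_counter.most_common(10)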

Text length distribution analysis


In [70]:
def print_stats(array, name):
    print('Min {}: '.format(name), np.min(array))
    print('Max {}: '.format(name), np.max(array))
    print('Mean {}: '.format(name), np.mean(array))
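
The histograms below clip the x-range by hand; percentiles are one way to pick such bounds. A hedged helper along these lines (not used in the original run):

In [ ]:
# Hedged helper (assumption): report a few percentiles alongside min/max/mean,
# which makes it easier to choose histogram ranges.
def print_percentiles(array, name, qs=(5, 25, 50, 75, 95)):
    for q in qs:
        print('{}th percentile {}: '.format(q, name), np.percentile(array, q))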

Distribution of text lengths in characters


In [71]:
char_lengths = [len(text) for text in texts]

In [73]:
print_stats(char_lengths, 'length in chars')


Min length in chars:  9
Max length in chars:  123727
Mean length in chars:  27389.4207317
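
A minimum of 9 characters suggests at least one paper has essentially no extracted text. A quick, hedged way to inspect the shortest documents (variable names are illustrative, not part of the original run):

In [ ]:
# Hedged sketch: titles and lengths of the five shortest extracted texts.
shortest = np.argsort(char_lengths)[:5]
[(titles[i], char_lengths[i]) for i in shortest]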

In [59]:
plt.subplots(figsize=(10, 8))
plt.hist(char_lengths, bins=40, range=(0, 60000));
plt.xlabel('Text length (chars)')


Out[59]:
<matplotlib.text.Text at 0x15fc11054a8>

Distribution of text lengths in unfiltered tokens


In [31]:
token_lengths = [len(tokenized_text) for tokenized_text in texts_tokenized]

In [74]:
print_stats(token_lengths, 'length in unfiltered tokens')


Min length in unfiltered tokens:  0
Max length in unfiltered tokens:  27072
Mean length in unfiltered tokens:  5897.45045732

In [56]:
plt.subplots(figsize=(10, 8))
plt.hist(token_lengths, bins=40, range=(0, 15000));
plt.xlabel('Text length (tokens)')


Out[56]:
<matplotlib.text.Text at 0x15fc0fca940>

Distribution of text lengths in filtered tokens


In [39]:
texts_tokenized_filtered = []
for tokenized_text in texts_tokenized:
    texts_tokenized_filtered.append([t for t in tokenized_text if len(t) > 2 and t not in stop_words])

In [40]:
token_lengths_filtered = [len(tokenized_text) for tokenized_text in texts_tokenized_filtered]

In [75]:
print_stats(token_lengths_filtered, 'length in filtered tokens')


Min length in filtered tokens:  0
Max length in filtered tokens:  10900
Mean length in filtered tokens:  2298.52088415

In [55]:
plt.subplots(figsize=(10, 8))
plt.hist(token_lengths_filtered, bins=40, range=(0, 6000));
plt.xlabel('Text length (filtered tokens)')


Out[55]:
<matplotlib.text.Text at 0x15fc0f462b0>

Title vocabulary analysis


In [48]:
titles_tokenized, title_counter = tokenize(titles)


100%|████████████████████████████████████████████████████████████████████████████| 6560/6560 [00:00<00:00, 7321.42it/s]

Top 10 tokens without filtering


In [49]:
title_counter.most_common(10)


Out[49]:
[('for', 1846),
 ('of', 1714),
 ('learning', 1373),
 ('and', 1190),
 ('a', 1086),
 ('in', 987),
 ('the', 897),
 ('with', 884),
 (':', 753),
 ('neural', 580)]

Top 10 tokens after filtering


In [50]:
good_title_words = [w for w in title_counter.keys() if len(w) > 2 and w not in stop_words]
filtered_title_counter = Counter({k: title_counter[k] for k in good_title_words})

In [54]:
filtered_title_counter.most_common(10)


Out[54]:
[('learning', 1373),
 ('neural', 580),
 ('networks', 534),
 ('models', 448),
 ('using', 433),
 ('model', 360),
 ('bayesian', 267),
 ('network', 243),
 ('inference', 238),
 ('data', 237)]

Title length distribution analysis

Distribution of title lengths in characters


In [62]:
title_lengths_chars = [len(title) for title in titles]

In [76]:
print_stats(title_lengths_chars, 'length in chars')


Min length in chars:  5
Max length in chars:  156
Mean length in chars:  60.2858231707

In [77]:
plt.subplots(figsize=(10, 8))
plt.hist(title_lengths_chars, bins=20);
plt.xlabel('Title length (chars)')


Out[77]:
<matplotlib.text.Text at 0x15fc0f5d0b8>

Distribution of title lengths in unfiltered tokens


In [63]:
title_lengths_tokens = [len(title) for title in titles_tokenized]

In [78]:
print_stats(title_lengths_tokens, 'length in unfiltered tokens')


Min length in unfiltered tokens:  1
Max length in unfiltered tokens:  23
Mean length in unfiltered tokens:  7.83582317073

In [80]:
plt.subplots(figsize=(10, 8))
plt.hist(title_lengths_tokens, bins=20);
plt.xlabel('Title length (unfiltered tokens)')


Out[80]:
<matplotlib.text.Text at 0x15f02741c88>

Distribution of title lengths in filtered tokens


In [83]:
titles_tokenized_filtered = []
for tokenized_title in titles_tokenized:
    titles_tokenized_filtered.append([t for t in tokenized_title if len(t) > 2 and t not in stop_words])

In [84]:
title_lengths_tokens_filtered = [len(title) for title in titles_tokenized_filtered]

In [85]:
print_stats(title_lengths_tokens_filtered, 'length in filtered tokens')


Min length in filtered tokens:  1
Max length in filtered tokens:  15
Mean length in filtered tokens:  5.94451219512

In [88]:
plt.subplots(figsize=(10, 8))
plt.hist(title_lengths_tokens_filtered, bins=15);
plt.xlabel('Title length (filtered tokens)')


Out[88]:
<matplotlib.text.Text at 0x15f03a0ecc0>

In [ ]: