Best Practices for Preprocessing Natural Language Data

In this notebook, we improve the quality of our Project Gutenberg word vectors by adopting best-practices for preprocessing natural language data.

N.B.: Some, all or none of these preprocessing steps may be helpful to a given downstream application.

Load dependencies


In [1]:
# the initial block is copied from creating_word_vectors_with_word2vec.ipynb
import nltk
from nltk import word_tokenize, sent_tokenize
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook, output_file
from bokeh.plotting import show, figure
%matplotlib inline


Using TensorFlow backend.

In [2]:
nltk.download('punkt')


[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Out[2]:
True

In [3]:
# new!
import string
from nltk.corpus import stopwords
from nltk.stem.porter import *
from gensim.models.phrases import Phraser, Phrases
from keras.preprocessing.text import one_hot

In [4]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Out[4]:
True

Load data


In [5]:
nltk.download('gutenberg')


[nltk_data] Downloading package gutenberg to /home/jovyan/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
Out[5]:
True

In [6]:
from nltk.corpus import gutenberg

In [7]:
gberg_sents = gutenberg.sents()

Iteratively preprocess a sentence

a tokenized sentence:

In [8]:
gberg_sents[4]


Out[8]:
['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'",
 's',
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']
to lowercase:

In [9]:
[w.lower() for w in gberg_sents[4]]


Out[9]:
['she',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'",
 's',
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']
remove stopwords and punctuation:

In [10]:
stpwrds = stopwords.words('english') + list(string.punctuation)

In [11]:
stpwrds


Out[11]:
['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 'not',
 'only',
 'own',
 'same',
 'so',
 'than',
 'too',
 'very',
 's',
 't',
 'can',
 'will',
 'just',
 'don',
 'should',
 'now',
 'd',
 'll',
 'm',
 'o',
 're',
 've',
 'y',
 'ain',
 'aren',
 'couldn',
 'didn',
 'doesn',
 'hadn',
 'hasn',
 'haven',
 'isn',
 'ma',
 'mightn',
 'mustn',
 'needn',
 'shan',
 'shouldn',
 'wasn',
 'weren',
 'won',
 'wouldn',
 '!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~']

In [12]:
# CODE HERE
[w.lower() for w in gberg_sents[4] if w not in stpwrds]


Out[12]:
['she',
 'youngest',
 'two',
 'daughters',
 'affectionate',
 'indulgent',
 'father',
 'consequence',
 'sister',
 'marriage',
 'mistress',
 'house',
 'early',
 'period']
stem words:

In [13]:
stemmer = PorterStemmer()

In [14]:
# CODE HERE
[stemmer.stem(w.lower()) for w in gberg_sents[4] if w not in stpwrds]


Out[14]:
['she',
 'youngest',
 'two',
 'daughter',
 'affection',
 'indulg',
 'father',
 'consequ',
 'sister',
 'marriag',
 'mistress',
 'hous',
 'earli',
 'period']
handle bigram collocations:

In [15]:
phrases = Phrases(gberg_sents) # train detector

In [16]:
bigram = Phraser(phrases) # create a more efficient Phraser object for transforming sentences

In [17]:
bigram.phrasegrams # output count and score of each bigram


Out[17]:
{(b'two', b'daughters'): (19, 11.966813731181546),
 (b'her', b'sister'): (195, 17.7960829227865),
 (b"'", b's'): (9781, 31.066242737744524),
 (b'very', b'early'): (24, 11.01214147275924),
 (b'Her', b'mother'): (14, 13.529425062715127),
 (b'long', b'ago'): (38, 63.22343628984788),
 (b'more', b'than'): (541, 29.023584433996874),
 (b'had', b'been'): (1256, 22.306024648925288),
 (b'an', b'excellent'): (54, 39.063874851750626),
 (b'Miss', b'Taylor'): (48, 453.75918026073305),
 (b'very', b'fond'): (28, 24.134280468850747),
 (b'passed', b'away'): (25, 12.35053642325912),
 (b'too', b'much'): (173, 31.376002029426687),
 (b'did', b'not'): (935, 11.728416217142811),
 (b'any', b'means'): (27, 14.096964108090186),
 (b'wedding', b'-'): (15, 17.4695197740113),
 (b'Her', b'father'): (18, 13.129571562488772),
 (b'after', b'dinner'): (21, 21.5285481168817),
 (b'self', b'-'): (124, 47.79018053120332),
 (b'sixteen', b'years'): (12, 107.0461671612265),
 (b'five', b'years'): (42, 40.128755673408115),
 (b'years', b'old'): (176, 54.735425236061104),
 (b'seven', b'years'): (51, 52.59411150244507),
 (b'each', b'other'): (236, 79.4168405322873),
 (b'a', b'mile'): (48, 12.783091600264584),
 (b'must', b'be'): (601, 10.229989650632808),
 (b'difference', b'between'): (44, 220.5253730524468),
 (b'could', b'not'): (1049, 10.870983286982371),
 (b'having', b'been'): (49, 11.538018331569376),
 (b'miles', b'off'): (16, 34.7868137375225),
 (b'at', b'Hartfield'): (66, 27.282227059708397),
 (b'her', b'husband'): (158, 27.544395195049724),
 (b'in', b'spite'): (96, 13.441914963088442),
 (b'Emma', b'could'): (61, 11.335111257744053),
 (b'every', b'body'): (127, 36.972582976825784),
 (b'no', b'means'): (80, 32.57361823161739),
 (b'his', b'own'): (773, 10.402387689390995),
 (b'obliged', b'to'): (179, 10.43662879984823),
 (b'able', b'to'): (348, 11.446828804673226),
 (b'very', b'much'): (234, 16.21027499400206),
 (b'have', b'been'): (986, 17.981191047801964),
 (b'great', b'deal'): (181, 118.04013764577145),
 (b'"', b'Poor'): (30, 10.125586409383336),
 (b'agree', b'with'): (25, 13.611743912451109),
 (b'-', b'humoured'): (22, 33.94078127522195),
 (b'for', b'ever'): (555, 12.4761138144463),
 (b'This', b'is'): (353, 11.381028160111825),
 (b'three', b'times'): (36, 35.42578086743132),
 (b'my', b'dear'): (253, 24.47894249615343),
 (b'How', b'often'): (12, 12.377968718725876),
 (b'My', b'dear'): (85, 84.80698289930197),
 (b'so', b'far'): (98, 10.161632478973766),
 (b'"', b'No'): (351, 15.063706270011586),
 (b'We', b'must'): (68, 18.765646900386386),
 (b'last', b'night'): (63, 23.592598950882213),
 (b'doubt', b'whether'): (12, 22.924130736398396),
 (b'anywhere', b'else'): (6, 16.100101533414907),
 (b'I', b'am'): (2428, 16.951297329049222),
 (b'very', b'glad'): (46, 18.28434074650975),
 (b'am', b'sure'): (282, 65.1446020744797),
 (b'very', b'pretty'): (39, 20.068178868120455),
 (b'be', b'able'): (121, 11.347612277348256),
 (b'immediately', b'afterwards'): (10, 41.06053966483414),
 (b'sensible', b'man'): (17, 14.541388094211703),
 (b'intimate', b'friend'): (6, 21.898760623229464),
 (b'connected', b'with'): (31, 18.375854281808998),
 (b'than', b'usual'): (30, 28.951968704471486),
 (b'Brunswick', b'Square'): (11, 10881.307917888562),
 (b'some', b'time'): (146, 12.926553753557874),
 (b'poor', b'Isabella'): (10, 41.30241100647832),
 (b'It', b'is'): (777, 11.705870174225897),
 (b'am', b'afraid'): (65, 25.627391867391864),
 (b'moonlight', b'night'): (6, 14.745374344301382),
 (b'Look', b'at'): (33, 13.630464729284446),
 (b'"', b'Well'): (311, 21.191330893669),
 (b'vast', b'deal'): (11, 61.90400400400401),
 (b'an', b'hour'): (150, 41.75757187695002),
 (b'pretty', b'well'): (20, 17.716415202444615),
 (b'tolerably', b'well'): (7, 18.357580705009276),
 (b'"', b'Ah'): (83, 17.279509006244886),
 (b'Ah', b'!'): (68, 37.53295698121932),
 (b"'", b'Tis'): (64, 23.239343942788935),
 (b'Miss', b'Woodhouse'): (173, 294.52709701744294),
 (b'you', b'please'): (93, 13.035980721875688),
 (b'any', b'rate'): (47, 83.92034351736973),
 (b',"', b'said'): (2583, 36.03254133384804),
 (b'My', b'dearest'): (7, 26.665272507761294),
 (b'so', b'much'): (483, 20.56443776042493),
 (b'much', b'less'): (38, 19.104435569934505),
 (b'any', b'body'): (93, 21.716438732250484),
 (b'has', b'been'): (263, 29.2606767165493),
 (b'been', b'used'): (29, 14.09410182779525),
 (b'Well', b',"'): (60, 12.493546273257474),
 (b'tell', b'you'): (296, 11.612165547868624),
 (b'Every', b'body'): (21, 72.2001079929367),
 (b'"', b'Dear'): (39, 20.048661090579007),
 (b'every', b'thing'): (240, 27.277168506679985),
 (b'very', b'sorry'): (32, 20.255731941654293),
 (b'turned', b'away'): (50, 19.3446247292484),
 (b'divided', b'between'): (10, 35.82806127178346),
 (b'knows', b'how'): (13, 14.800957338598696),
 (b'how', b'much'): (110, 15.417663894373641),
 (b'four', b'years'): (21, 16.257604884476734),
 (b'years', b'ago'): (56, 163.33147420261938),
 (b'any', b'thing'): (383, 35.72804044966351),
 (b'need', b'not'): (107, 13.478832663938945),
 (b'his', b'wife'): (263, 10.870850757012368),
 (b'Ever', b'since'): (8, 99.63818474758324),
 (b'leave', b'off'): (18, 10.507247077643713),
 (b'you', b'mean'): (142, 10.573995913362895),
 (b'young', b'lady'): (73, 113.30511794581632),
 (b'depend', b'upon'): (28, 66.33685452578166),
 (b'quarrel', b'with'): (21, 10.691406127597961),
 (b'-', b'hearted'): (45, 49.037248488452775),
 (b'their', b'own'): (279, 10.164510720981893),
 (b'You', b'are'): (231, 12.60019671587268),
 (b'more', b'likely'): (16, 11.176882987148272),
 (b'have', b'done'): (272, 12.664105519891397),
 (b',"', b'rejoined'): (6, 11.956633540853023),
 (b'any', b'longer'): (32, 16.396201569570135),
 (b'very', b'well'): (171, 13.844437162007257),
 (b'young', b'man'): (260, 25.863812524396362),
 (b'dine', b'with'): (22, 13.88397879070013),
 (b'much', b'better'): (38, 10.763384154831522),
 (b'I', b'dare'): (138, 13.676468274997303),
 (b'dare', b'say'): (114, 128.21086697682205),
 (b'Depend', b'upon'): (17, 92.29475412282665),
 (b'take', b'care'): (59, 72.93974751004718),
 (b'CHAPTER', b'II'): (11, 335.5512750949539),
 (b'entering', b'into'): (14, 16.437457915441044),
 (b'never', b'seen'): (42, 14.015206798866856),
 (b'refrain', b'from'): (9, 12.438010669696954),
 (b'at', b'once'): (263, 21.418172245725966),
 (b'three', b'years'): (77, 37.3552600780725),
 (b'any', b'other'): (138, 10.208401968552408),
 (b'twenty', b'years'): (69, 85.29044131115464),
 (b'an', b'easy'): (18, 10.427467282325322),
 (b'according', b'to'): (747, 12.093327441482012),
 (b'had', b'begun'): (25, 12.032976708302648),
 (b'passed', b'through'): (43, 31.462199837199837),
 (b'its', b'being'): (58, 16.06447342496047),
 (b'deal', b'better'): (14, 19.992919953446272),
 (b'fine', b'young'): (13, 10.403137320870965),
 (b'belonging', b'to'): (35, 10.512393800210106),
 (b'Frank', b'Churchill'): (151, 1750.6779772753712),
 (b'Miss', b'Bates'): (113, 400.4260773639656),
 (b'a', b'few'): (404, 11.554600796586188),
 (b'few', b'days'): (53, 35.91529644067919),
 (b'I', b'suppose'): (210, 12.338158409520457),
 (b'very', b'handsome'): (21, 19.759437654764756),
 (b'an', b'irresistible'): (7, 11.369078040261053),
 (b'good', b'sense'): (28, 17.373370904818092),
 (b'had', b'already'): (64, 11.990720435581183),
 (b'She', b'felt'): (26, 13.338332744482605),
 (b'most', b'fortunate'): (6, 11.471572464709046),
 (b'long', b'enough'): (38, 15.189529976554649),
 (b'know', b'how'): (120, 12.782869530582152),
 (b'dear', b'Emma'): (31, 28.38977406756079),
 (b'at', b'Randalls'): (39, 27.033755046414154),
 (b'few', b'weeks'): (19, 134.46858012611438),
 (b'no', b'longer'): (113, 44.45340841020726),
 (b'CHAPTER', b'III'): (10, 354.1930126002291),
 (b'Donwell', b'Abbey'): (9, 753.4827901309776),
 (b'card', b'-'): (18, 15.66232807325151),
 (b'drawing', b'-'): (53, 20.08471734497107),
 (b'-', b'room'): (116, 10.86339885106585),
 (b'thrown', b'away'): (11, 14.820643707910945),
 (b'After', b'these'): (18, 11.091988134657838),
 (b'an', b'invitation'): (11, 10.459551797040168),
 (b'old', b'lady'): (16, 10.886003613395488),
 (b'those', b'who'): (150, 15.975643084740117),
 (b'as', b'possible'): (81, 11.70949877112537),
 (b'young', b'ladies'): (44, 113.63480411788262),
 (b'-', b'fashioned'): (31, 34.9390395480226),
 (b'Goddard', b"'"): (34, 15.29483969345862),
 (b'found', b'herself'): (27, 11.22605649143189),
 (b's', b'sake'): (142, 28.092000957580005),
 (b'much', b'pleased'): (18, 13.276184200965723),
 (b'be', b'allowed'): (32, 10.133274789777253),
 (b'Miss', b'Smith'): (58, 165.24316871017183),
 (b'Harriet', b'Smith'): (31, 180.54871092346391),
 (b'several', b'years'): (10, 17.577367349955093),
 (b'pretty', b'girl'): (10, 40.4556337659619),
 (b'blue', b'eyes'): (28, 35.59533676681832),
 (b'They', b'were'): (188, 10.653371553453791),
 (b'due', b'time'): (18, 21.04160963161683),
 (b'its', b'own'): (54, 10.834370434674414),
 (b'better', b'than'): (170, 42.51368036077655),
 (b'body', b'else'): (31, 39.469837225343845),
 (b'apple', b'-'): (26, 28.21999348109518),
 (b'You', b'need'): (16, 14.652632145780382),
 (b'half', b'-'): (179, 14.007817698976801),
 (b'much', b'more'): (159, 10.555945043417811),
 (b'little', b'girl'): (50, 35.05677573772557),
 (b'at', b'last'): (420, 22.75660921756972),
 (b'CHAPTER', b'IV'): (8, 335.5512750949539),
 (b'every', b'respect'): (14, 12.224980231945178),
 (b'guided', b'by'): (14, 23.954538020555372),
 (b'different', b'sort'): (8, 14.490845895493244),
 (b'-', b'Mill'): (7, 12.705105290190035),
 (b'good', b'deal'): (62, 39.176849314155014),
 (b'very', b'happy'): (43, 11.360756752157577),
 (b'drink', b'tea'): (7, 32.50399453379586),
 (b'large', b'enough'): (11, 10.829067985816224),
 (b'had', b'taken'): (121, 10.962547054002629),
 (b'doing', b'something'): (9, 10.716846747710356),
 (b'three', b'miles'): (9, 16.651749532156657),
 (b'thing', b'else'): (26, 12.21019907747794),
 (b'very', b'obliging'): (14, 25.349278570257418),
 (b'on', b'purpose'): (35, 10.833361556419508),
 (b'very', b'clever'): (15, 21.69532850607617),
 (b'"', b'You'): (493, 11.717061811021022),
 (b'know', b'what'): (219, 10.687860772182088),
 (b'Miss', b'Nash'): (13, 337.68125042659204),
 (b'does', b'not'): (211, 13.23025142257054),
 (b'"', b'Oh'): (496, 20.296685763864517),
 (b'Oh', b'yes'): (11, 23.468003288849534),
 (b'very', b'entertaining'): (7, 16.054543094496367),
 (b'soon', b'as'): (271, 12.01164260504438),
 (b'Oh', b'!'): (285, 31.12698837846826),
 (b'have', b'seen'): (204, 13.438796505596505),
 (b'on', b'horseback'): (21, 54.889031885858834),
 (b'their', b'families'): (95, 35.26311487873489),
 (b'no', b'doubt'): (117, 40.18959620751882),
 (b'very', b'respectable'): (9, 10.88443599626872),
 (b'respectable', b'young'): (8, 27.70496528037034),
 (b'very', b'odd'): (16, 18.206182890665982),
 (b'perfectly', b'right'): (12, 16.998927982407917),
 (b'years', b'hence'): (9, 17.990952464071682),
 (b'young', b'woman'): (57, 30.4001550358284),
 (b'very', b'desirable'): (9, 14.595039176814876),
 (b'Dear', b'Miss'): (9, 32.278354820188945),
 (b'thirty', b'years'): (35, 72.53269372866845),
 (b'can', b'afford'): (11, 26.391592873146273),
 (b'good', b'luck'): (21, 51.57752734020704),
 (b'acquainted', b'with'): (88, 27.73083464345721),
 (b'your', b'own'): (181, 10.134692346685467),
 (b'"', b'Yes'): (349, 27.046036922192854),
 (b'next', b'day'): (100, 33.66839067944251),
 (b'an', b'opportunity'): (34, 39.49570340028189),
 (b'few', b'yards'): (15, 127.20000822740548),
 (b'Robert', b'Martin'): (31, 1963.7208109428432),
 (b'few', b'minutes'): (86, 316.36383789006993),
 (b'Only', b'think'): (9, 11.416616668358916),
 (b'been', b'able'): (40, 15.91768079235648),
 (b'-', b'morrow'): (134, 31.191253298926746),
 (b'should', b'happen'): (13, 20.434212265397832),
 (b'Do', b'you'): (187, 17.542972188763283),
 (b'compared', b'with'): (25, 15.313211901507499),
 (b'"', b'Certainly'): (39, 25.246462114062453),
 (b'You', b'must'): (84, 12.864839977038008),
 (b'an', b'old'): (158, 10.467761806300013),
 (b'old', b'man'): (201, 11.807109991026596),
 (b'more', b'valuable'): (10, 17.665941085058734),
 (b',"', b'replied'): (256, 68.6325679964557),
 (b'very', b'bad'): (36, 15.601593602802112),
 (b'deal', b'too'): (15, 12.720000822740548),
 (b'no', b'more'): (553, 17.350760547573774),
 (b'very', b'agreeable'): (21, 21.406057459328487),
 (b'fixed', b'on'): (31, 10.722857321964225),
 (b'same', b'time'): (104, 18.367425129939775),
 (b'pleasing', b'young'): (8, 23.351327879169286),
 (b'CHAPTER', b'V'): (7, 236.1286750668194),
 (b'very', b'differently'): (14, 48.16362928348909),
 (b'"', b'Perhaps'): (40, 10.879118421244423),
 (b'ever', b'since'): (60, 42.92801610440095),
 (b'twelve', b'years'): (22, 39.389282288763),
 (b'very', b'neatly'): (7, 22.935061563566236),
 (b'ten', b'years'): (32, 36.45848645037043),
 (b'being', b'able'): (20, 14.36891827839066),
 (b'her', b'mother'): (239, 11.54362134822892),
 (b'have', b'spoken'): (82, 11.59021954484605),
 (b'Yes', b',"'): (107, 26.304593789876648),
 (b'"', b'Thank'): (43, 24.575778111032328),
 (b'Thank', b'you'): (46, 27.918884665527383),
 (b'"', b'Why'): (191, 10.548941903388105),
 (b'could', b'possibly'): (21, 30.820114126236575),
 (b'How', b'can'): (53, 16.088661540487763),
 (b'much', b'mistaken'): (9, 10.098977725520935),
 (b'Very', b'well'): (40, 84.54149008885851),
 (b'oh', b'!'): (27, 22.810780631748376),
 (b'look', b'at'): (154, 10.281997140291956),
 (b'any', b'harm'): (10, 10.323534321581198),
 (b'"', b'Very'): (70, 19.596435652445642),
 (b'an', b'angel'): (52, 25.819271767903782),
 (b'an', b'end'): (127, 18.27126745760167),
 (b'many', b'years'): (54, 19.719030792857517),
 (b',"', b'cried'): (297, 34.723966485732284),
 (b'much', b'obliged'): (40, 42.98889166944722),
 (b'John', b'Knightley'): (58, 175.90370362419566),
 (b'ill', b'-'): (100, 22.276568839343266),
 (b'cared', b'for'): (14, 11.003932384341637),
 (b'I', b'assure'): (105, 13.117491742454519),
 (b'assure', b'you'): (125, 32.47600092425325),
 (b'soon', b'afterwards'): (36, 80.80970084767553),
 (b'CHAPTER', b'VI'): (6, 151.7970054000982),
 (b'most', b'agreeable'): (13, 28.296545412948984),
 (b'no', b'scruple'): (10, 27.380722571504467),
 (b'infinitely', b'superior'): (7, 278.568018018018),
 (b'am', b'glad'): (34, 17.031537511870848),
 (b'Exactly', b'so'): (9, 26.019606605658986),
 (b'Did', b'you'): (73, 15.109886745810456),
 (b'very', b'interesting'): (15, 17.838381216107074),
 (b'No', b'sooner'): (14, 65.68697776518907),
 (b'Don', b"'"): (134, 25.405723462879433),
 (b"'", b't'): (2200, 30.669962908216693),
 (b't', b'pretend'): (9, 22.215392905253704),
 (b'why', b'should'): (57, 22.803134218289085),
 (b'cannot', b'imagine'): (13, 50.34105640180307),
 (b'back', b'again'): (74, 19.215480532814567),
 (b'almost', b'every'): (37, 10.072259575009541),
 (b'higher', b'than'): (34, 46.27167976056865),
 (b'ten', b'times'): (16, 32.71643894251348),
 (b'dear', b'Isabella'): (6, 10.167718917496957),
 (b'must', b'allow'): (12, 16.27068909786588),
 (b'sitting', b'down'): (24, 17.45874757620558),
 (b'fore', b'-'): (11, 15.528462021343378),
 (b'must', b'confess'): (10, 12.590414182872408),
 (b'depended', b'on'): (14, 20.58338695719706),
 (b'no', b'sooner'): (26, 28.749758700079695),
 (b'after', b'breakfast'): (11, 12.17388137561763),
 (b'sooner', b'than'): (12, 16.389331849226902),
 (b'at', b'home'): (154, 15.749528854479703),
 (b'at', b'least'): (301, 41.37059021349224),
 (b'Upon', b'my'): (40, 19.147177205335975),
 (b'Will', b'you'): (82, 14.912145378521789),
 (b"'", b'd'): (2523, 30.546910997373118),
 (b'She', b'paused'): (9, 28.95364951542675),
 (b'replied', b'Emma'): (16, 16.331043870304843),
 (b'can', b'hardly'): (33, 26.420626528672287),
 (b'am', b'persuaded'): (12, 14.509626277861573),
 (b'Are', b'you'): (83, 16.465332468596394),
 (b'I', b'beg'): (52, 10.231643559114522),
 (b'beg', b'your'): (38, 44.837927443381055),
 (b'your', b'pardon'): (41, 42.182804133556154),
 (b'dear', b'Miss'): (28, 19.29791549647382),
 (b'little', b'while'): (54, 10.254870302373984),
 (b'`', b'No'): (8, 11.44545824696476),
 (b'entered', b'into'): (98, 58.47423553525749),
 (b'older', b'than'): (15, 59.834068655907735),
 (b'advise', b'you'): (16, 10.800158446902824),
 (b'run', b'away'): (46, 41.70122298206315),
 (b'At', b'last'): (91, 78.82820279093205),
 (b'"', b'Indeed'): (53, 16.73627360604856),
 (b'Dear', b'me'): (17, 11.639624322425217),
 (b'have', b'borne'): (26, 11.406974967061926),
 (b'good', b'opinion'): (19, 14.834508731529459),
 (b'good', b'natured'): (17, 34.331291635825316),
 (b'thank', b'you'): (59, 16.88752048061169),
 (b'merely', b'because'): (10, 11.301099005522438),
 (b'Emma', b'felt'): (19, 16.55226511478519),
 (b'no', b'difficulty'): (15, 13.258034087254796),
 (b'protest', b'against'): (6, 10.157197996222385),
 (b'Let', b'us'): (117, 32.264040693882876),
 (b'cried', b'Emma'): (27, 14.204759677426091),
 (b'"', b'Has'): (17, 11.456377766045145),
 (b'next', b'morning'): (62, 76.77797550074492),
 (b'dear', b'sir'): (21, 30.138577189712098),
 (b'am', b'going'): (42, 10.080878050929599),
 (b'sat', b'down'): (150, 58.92285841199917),
 (b'depends', b'upon'): (8, 18.878472434214544),
 (b'has', b'happened'): (14, 16.886495752427184),
 (b'presently', b'added'): (6, 24.986707070707073),
 (b'could', b'afford'): (11, 16.180559916274202),
 (b'Certainly', b',"'): (12, 17.049273752697825),
 (b'stood', b'up'): (80, 10.28632971468491),
 (b'"', b'Nonsense'): (8, 10.024330545289503),
 (b'are', b'mistaken'): (17, 10.34529425541169),
 (b'does', b'seem'): (9, 14.17908976269632),
 (b'few', b'moments'): (43, 388.789590364635),
 (b'nobody', b'knows'): (7, 38.60305867665418),
 (b'very', b'likely'): (29, 25.18359701097469),
 (b'all', b'probability'): (16, 13.453639483026294),
 (b'no', b'harm'): (25, 27.989183073093457),
 (b'cannot', b'help'): (16, 20.51549678061432),
 (b'very', b'different'): (29, 17.75617669437386),
 (b'common', b'sense'): (26, 145.78524280999528),
 (b'.--', b'She'): (78, 30.440671188909267),
 (b'-', b'natured'): (60, 48.041179378531076),
 (b'an', b'hundred'): (183, 30.38185737390911),
 (b'exactly', b'what'): (25, 14.387504885629737),
 (b'every', b'man'): (307, 12.828795338958603),
 (b'be', b'satisfied'): (66, 12.272908603731745),
 (b'less', b'than'): (85, 36.58643688514103),
 (b'large', b'fortune'): (9, 38.26270688321733),
 (b'no', b'use'): (40, 14.694321113374066),
 (b'these', b'words'): (111, 26.137410685805424),
 (b'well', b'acquainted'): (9, 11.682096812278632),
 (b'twenty', b'thousand'): (48, 77.04104378157882),
 (b'thousand', b'pounds'): (47, 448.5645551257253),
 (b'Good', b'morning'): (10, 19.263291344272915),
 (b'walked', b'off'): (15, 10.916907922609802),
 (b'cast', b'down'): (44, 15.322709019072688),
 (b'its', b'effects'): (7, 29.72249056785139),
 (b'deal', b'more'): (28, 10.737496923771433),
 (b'longer', b'than'): (31, 18.302185706512955),
 (b'perfectly', b'satisfied'): (12, 93.75697392359005),
 (b'three', b'hundred'): (77, 49.30309130408786),
 (b'looking', b'at'): (106, 12.747008682016011),
 (b'next', b'moment'): (21, 24.667278275263754),
 (b'ready', b'wit'): (8, 70.12901152901154),
 (b'very', b'pleasant'): (19, 11.64578255559322),
 (b'an', b'idea'): (37, 14.230682717061454),
 (b'Give', b'me'): (67, 25.719421458776054),
 (b'arrive', b'at'): (10, 11.92665663812389),
 (b'very', b'superior'): (13, 10.703028729664243),
 (b'pre', b'-'): (17, 49.325702891326024),
 (b'have', b'chosen'): (38, 12.418092369477911),
 (b'without', b'exception'): (8, 55.117736185383244),
 (b'her', b'cheeks'): (13, 10.316064715093864),
 (b'sit', b'down'): (54, 36.049581123554866),
 (b'reason', b'why'): (20, 38.722303389547506),
 (b'could', b'hardly'): (46, 23.719027600038913),
 (b'It', b'seemed'): (50, 10.045411819743178),
 (b'an', b'offering'): (70, 11.10900109162763),
 (b'let', b'us'): (282, 32.253455378694596),
 (b'Have', b'you'): (81, 15.500622663363387),
 (b'"', b'Aye'): (59, 19.50680538542822),
 (b'Very', b'true'): (18, 128.88521410135147),
 (b'can', b'easily'): (12, 15.161836814749437),
 (b'Nobody', b'could'): (9, 15.935399917542776),
 (b'dear', b'mother'): (21, 13.460467748430139),
 (b'those', b'things'): (85, 16.269842914134045),
 (b'next', b'week'): (12, 51.500749500333114),
 (b'Why', b'should'): (43, 13.316921218220724),
 (b'.--', b'Poor'): (10, 33.94933025911286),
 (b'taken', b'away'): (75, 33.882342535970594),
 (b'stay', b'longer'): (9, 32.079105716360615),
 (b'three', b'days'): (97, 38.359912674236874),
 (b'cannot', b'bear'): (17, 21.2053080048691),
 (b'We', b'are'): (133, 13.053427143739613),
 (b'four', b'o'): (8, 13.636425778378854),
 (b'o', b"'"): (216, 29.05179422396496),
 (b"'", b'clock'): (67, 18.373899375590025),
 (b'ask', b'whether'): (9, 11.352902840883013),
 (b're', b'-'): (54, 17.206160179428213),
 (b'Of', b'course'): (52, 64.52969306765831),
 (b'ran', b'away'): (24, 15.685508857974716),
 (b'who', b'lived'): (27, 14.51537608023045),
 (b'A', b'few'): (48, 27.798793006840953),
 (b'.--', b'Emma'): (18, 10.090713174013455),
 (b'thus', b'began'): (15, 10.569835629596586),
 (b'Never', b'mind'): (10, 58.81881301122313),
 (b'good', b'fortune'): (18, 19.83585738958796),
 (b'Those', b'who'): (17, 18.714038582776446),
 (b'Jane', b'Fairfax'): (111, 897.6983416183942),
 (b'nothing', b'else'): (45, 34.14808689686209),
 (b'present', b'instance'): (6, 15.747924624395214),
 (b'These', b'are'): (118, 23.590054389582264),
 (b'once', b'more'): (124, 21.461454181291764),
 (b'still', b'greater'): (11, 12.724188305008026),
 (b'here', b'comes'): (14, 13.200741413619147),
 (b'turned', b'back'): (42, 28.862454905522338),
 (b'will', b'bring'): (144, 12.921944022284055),
 (b'each', b'side'): (37, 22.901592275974444),
 (b'waiting', b'for'): (50, 11.22850243300167),
 (b'still', b'remained'): (10, 13.210905890445488),
 (b'she', b'hoped'): (24, 14.771734566364692),
 (b'ten', b'minutes'): (39, 192.59628296373643),
 (b'most', b'favourable'): (6, 12.48377003512455),
 (b'ten', b'days'): (21, 17.36302410708332),
 (b'many', b'months'): (10, 10.650678561587213),
 (b'little', b'ones'): (53, 64.8622484431334),
 (b'-', b'tempered'): (21, 31.944264729620663),
 (b'passed', b'over'): (52, 23.11431354773038),
 (b'sir', b',"'): (108, 26.985788449773853),
 (b'cannot', b'deny'): (9, 34.78623560349313),
 (b'talking', b'about'): (24, 19.191191650605955),
 (b'never', b'forget'): (18, 21.690200998246326),
 (b'cannot', b'tell'): (30, 18.28907708359457),
 (b'two', b'years'): (53, 12.9458705561574),
 (b'indeed', b'!--'): (19, 15.372462749108678),
 (b'most', b'amiable'): (8, 17.207358697063572),
 (b',"', b'observed'): (18, 11.710949290013575),
 (b'our', b'lives'): (16, 19.654875413170608),
 (b'think', b'differently'): (6, 12.463139862958483),
 (b'shake', b'hands'): (13, 44.83992241738721),
 (b'How', b'long'): (48, 18.446626247771526),
 (b'South', b'End'): (8, 1381.4318689501117),
 (b'perfectly', b'convinced'): (8, 75.68177368034593),
 (b'tells', b'me'): (15, 12.932915913805795),
 (b'bad', b'cold'): (7, 14.307186896320372),
 (b'far', b'off'): (67, 27.665131723254174),
 (b'am', b'sorry'): (32, 26.675240833932428),
 (b'Ah', b'!"'): (14, 17.41235304055111),
 (b'an', b'interval'): (9, 11.127182762808689),
 (b'perfectly', b'well'): (16, 14.848043217286914),
 (b'He', b'paused'): (14, 28.88198206613114),
 (b'can', b'tell'): (51, 12.430822730105128),
 (b'morrow', b'morning'): (14, 22.316089764922395),
 (b'own', b'feelings'): (16, 10.723815634320593),
 (b'sore', b'throat'): (7, 129.16060985797827),
 (b'&', b'c'): (17, 4365.324705882353),
 (b'well', b'satisfied'): (17, 19.87160797964922),
 (b'looked', b'at'): (184, 13.030752013575848),
 (b'well', b'pleased'): (21, 19.251395495889877),
 (b'set', b'forward'): (22, 30.104250841148257),
 (b'eldest', b'daughter'): (10, 86.17906911928651),
 (b'short', b'time'): (23, 10.749383663832438),
 (b'Ha', b'!'): (35, 53.03251532865493),
 (b'"', b'Quite'): (25, 17.43361833963392),
 (b',"', b'continued'): (103, 41.17971551769827),
 (b'dining', b'-'): (20, 27.583452274754684),
 (b'such', b'circumstances'): (10, 10.212955347912391),
 (b'enter', b'into'): (108, 69.91980623618285),
 (b'gone', b'through'): (24, 11.358845786496218),
 (b'turn', b'away'): (49, 25.69282323847976),
 (b',"', b'repeated'): (29, 16.023102816168517),
 (b'several', b'times'): (18, 100.66202138582126),
 (b'great', b'curiosity'): (13, 12.762131764948666),
 (b'upper', b'end'): (11, 50.599001800032724),
 (b'an', b'odd'): (25, 26.95760772433033),
 (b'In', b'short'): (23, 17.859158861411398),
 (b'dearest', b'Emma'): (8, 41.198769763723575),
 (b'continued', b'Mrs'): (17, 12.808985040466856),
 (b'go', b'home'): (35, 10.887636156851283),
 (b'covered', b'with'): (53, 10.080468634592364),
 (b'hardly', b'knew'): (16, 30.25731454547072),
 (b'knew', b'how'): (29, 10.771586203292852),
 (b'set', b'off'): (42, 12.87623069823977),
 (b'can', b'get'): (31, 10.915759166817718),
 (b'got', b'home'): (12, 13.02303200594452),
 (b'most', b'extraordinary'): (16, 46.22702963501566),
 (b'an', b'inch'): (27, 63.919483204134366),
 (b'at', b'ease'): (36, 17.60601694199241),
 (b'tete', b'-'): (7, 10.750473707083877),
 (b'well', b'known'): (33, 14.650186556114896),
 (b'Smith', b'!--'): (9, 18.891159579667644),
 (b'extremely', b'sorry'): (8, 73.46994297481388),
 (b'Every', b'thing'): (18, 22.46842755413457),
 (b'many', b'weeks'): (10, 20.757955155746508),
 (b'Am', b'I'): (43, 15.552098209854075),
 (b'madam', b',"'): (13, 15.249039878189363),
 (b'extremely', b'well'): (16, 29.947748184019368),
 (b'!--', b'Such'): (10, 19.885431136492258),
 (b'poor', b'Harriet'): (13, 12.145847347359634),
 (b'-', b'headed'): (29, 37.268308851224106),
 (b'an', b'instant'): (95, 43.261013866434524),
 (b'thirty', b'thousand'): (18, 42.25608127995964),
 (b'so', b'easily'): (19, 10.348707172705279),
 (b'worth', b'having'): (9, 21.664774916798038),
 (b'poor', b'girl'): (11, 16.40337746734157),
 (b'laugh', b'at'): (34, 11.791126449054302),
 (b'knowing', b'what'): (21, 15.75053166426834),
 (b'many', b'days'): (49, 14.230254790394108),
 (b'whole', b'party'): (15, 21.733756466486728),
 (b'six', b'weeks'): (6, 18.23820337383508),
 (b'too', b'late'): (56, 87.81454226528567),
 (b'-', b'minded'): (19, 20.81474696477942),
 (b'her', b'companions'): (36, 11.854581263120794),
 (b'drew', b'near'): (34, 135.39134997206745),
 (b'three', b'months'): (35, 82.38692831354994),
 (b'other', b'side'): (133, 28.113809270630338),
 (b'an', b'unnatural'): (10, 19.2271172739709),
 (b'get', b'rid'): (18, 302.7023984336759),
 (b'watering', b'-'): (10, 20.552376204719177),
 (b'while', b'ago'): (10, 15.477085481465169),
 (b'at', b'Weymouth'): (16, 43.7310743397876),
 (b'present', b'occasion'): (9, 32.735843313703434),
 (b'No', b',"'): (86, 11.56716665027149),
 (b'their', b'hearts'): (49, 18.284578085269942),
 (b'break', b'through'): (12, 11.454968114101241),
 (b'burst', b'forth'): (18, 49.66500488033903),
 (b'young', b'men'): (141, 28.058139771512256),
 (b'-', b'bred'): (21, 27.95123163841808),
 (b'nobody', b'else'): (20, 96.32925917464537),
 (b'something', b'else'): (35, 38.92533060859677),
 (b'walking', b'together'): (9, 11.84464959491295),
 (b'burst', b'out'): (19, 11.10216993785889),
 (b'-', b'sized'): (12, 27.17480853735091),
 (b'how', b'long'): (57, 10.246816619029866),
 (b'Miss', b'Fairfax'): (125, 273.22756777255375),
 (b'extremely', b'happy'): (7, 19.519016507275197),
 (b'don', b"'"): (693, 30.892577640288216),
 (b'ma', b"'"): (213, 29.826517196587346),
 (b's', b'handwriting'): (7, 11.482861705288176),
 (b'Ma', b"'"): (15, 17.287270917893842),
 (b'without', b'seeming'): (8, 24.251803921568627),
 (b'Colonel', b'Campbell'): (28, 896.7708845596117),
 (b'those', b'days'): (84, 24.94261977039944),
 (b'Miss', b'Campbell'): (12, 75.31616124710753),
 (b'most', b'charming'): (7, 10.48020200479592),
 (b'caught', b'hold'): (10, 25.41177679158448),
 (b'four', b'months'): (9, 21.51366300812301),
 (b'may', b'guess'): (11, 13.366929644439642),
 (b'Bless', b'me'): (14, 16.96059544124817),
 (b'running', b'away'): (12, 11.650452540443363),
 (b'My', b'father'): (32, 11.188894979347538),
 (b'five', b'minutes'): (37, 145.59218386745533),
 (b'nine', b'years'): (9, 22.042968784808547),
 (b'hundred', b'pounds'): (11, 62.90946390424197),
 (b'more', b'honourable'): (10, 10.647964489624442),
 (b'rather', b'than'): (75, 19.03811275415246),
 (b'few', b'months'): (17, 59.138014296301606),
 (b'she', b'wished'): (31, 11.819087018586881),
 (b'without', b'feeling'): (12, 13.668488522623221),
 (b'twelve', b'thousand'): (24, 59.18560102353321),
 (b'passed', b'between'): (17, 19.790738607270864),
 (b",'", b'said'): (250, 30.379793675936877),
 (b'Miss', b'Hawkins'): (18, 356.6758207630878),
 (b'dear', b'Jane'): (14, 28.087065128531204),
 (b'three', b'minutes'): (10, 10.882367432840969),
 (b'have', b'suffered'): (29, 12.705105290190035),
 (b'hour', b'ago'): (10, 35.65865950134926),
 (b'looked', b'round'): (26, 11.609345695809427),
 (b'help', b'thinking'): (10, 32.395023572551075),
 (b'a', b'series'): (16, 10.464298240216587),
 (b'laughed', b'at'): (28, 12.563958901229746),
 (b'weeks', b'ago'): (7, 66.07767923923495),
 (b'She', b'wished'): (10, 10.481853991240559),
 (b'twenty', b'miles'): (8, 32.07910571636062),
 (b'elder', b'sister'): (6, 20.892601351351352),
 (b'alas', b'!'): (23, 57.08794297143443),
 (b'no', b'fault'): (13, 10.495943652410048),
 (b'driven', b'away'): (10, 15.108423197384944),
 (b'setting', b'off'): (8, 17.077163107511044),
 (b'little', b'farther'): (16, 13.377838741396264),
 (b'spot', b'where'): (18, 40.488884971796935),
 (b'front', b'door'): (15, 45.13461833203179),
 (b'they', b'parted'): (20, 10.443101887710071),
 (b'without', b'delay'): (8, 17.832208765859285),
 (b'six', b'months'): (21, 149.7251460218503),
 (b'months', b'ago'): (7, 33.90373070913626),
 (b'leaned', b'back'): (6, 12.550400811770675),
 (b'at', b'Oxford'): (10, 14.908320797654866),
 (b'turned', b'round'): (20, 11.258866238141938),
 (b'pass', b'through'): (62, 22.187894661308842),
 (b'clock', b'struck'): (16, 287.94205291005295),
 (b'four', b'hours'): (15, 47.339972748289114),
 (b'faster', b'than'): (10, 39.88937910393849),
 (b'musical', b'society'): (6, 113.40931597285898),
 (b'worth', b'while'): (16, 39.414977692797954),
 (b'mixed', b'with'): (23, 10.669850873308452),
 (b'extremely', b'glad'): (10, 72.78966572504707),
 (b'knew', b'nothing'): (25, 12.448864562804845),
 (b'make', b'amends'): (9, 67.17951224811253),
 (b'amends', b'for'): (12, 15.103436605959109),
 (b'oftener', b'than'): (9, 50.68297580265126),
 (b'old', b'woman'): (61, 19.444449684793035),
 (b'post', b'-'): (19, 12.22866384180791),
 (b'just', b'going'): (25, 13.260137742937028),
 (b'At', b'least'): (17, 28.035284695357834),
 (b'their', b'lives'): (30, 14.905906047774408),
 (b'six', b'days'): (15, 14.20782138820221),
 (b'may', b'prove'): (9, 10.693543715551712),
 (b'stronger', b'than'): (29, 58.08610709966773),
 (b'particular', b'friend'): (10, 19.81788291694974),
 (b'Hum', b'!'): (7, 26.95819529206626),
 (b'good', b'tidings'): (14, 31.69042304845414),
 (b'among', b'themselves'): (30, 14.912692334403358),
 (b'next', b'summer'): (8, 20.513743918620076),
 (b'breaking', b'up'): (12, 10.278262005104766),
 (b'perfectly', b'safe'): (6, 16.44561748750133),
 (b'two', b'ladies'): (12, 10.206988182478378),
 (b'same', b'moment'): (29, 15.560860129375872),
 (b'well', b'worth'): (11, 11.682096812278632),
 (b',"', b'added'): (51, 22.00020571516956),
 (b'little', b'girls'): (15, 24.596639156806205),
 (b'be', b'ashamed'): (88, 15.57521865836133),
 (b'been', b'staying'): (9, 13.958581617912605),
 (b'shut', b'up'): (64, 33.02080970890481),
 (b'too', b'large'): (19, 13.758850077869242),
 (b'At', b'first'): (26, 13.622350914881029),
 (b'worse', b'than'): (55, 50.56400168104879),
 (b'opposite', b'side'): (9, 21.484700834657843),
 (b'short', b'pause'): (11, 86.61965123608095),
 (b'large', b'party'): (8, 14.159718830137988),
 (b'six', b'years'): (19, 24.75055888120844),
 (b'who', b'knows'): (23, 17.154535367545076),
 (b'extremely', b'fond'): (6, 34.25396034119863),
 (b'or', b'twice'): (16, 10.479935604134893),
 (b'somebody', b'else'): (14, 146.97092685503037),
 (b'five', b'couple'): (7, 31.78235173193545),
 (b'"', b'Don'): (54, 12.435245486561662),
 (b'bad', b'news'): (7, 32.31039707419018),
 (b'baked', b'apples'): (6, 613.5128968253968),
 (b'will', b'send'): (73, 16.24789061081296),
 (b'William', b'Larkins'): (13, 5074.223589743589),
 (b'low', b'voice'): (39, 55.283668345011634),
 (b'one', b'leg'): (17, 10.745803649000868),
 (b'an', b'immediate'): (12, 14.761464229693788),
 (b'Tell', b'me'): (40, 25.092668376242766),
 (b',"', b'resumed'): (18, 28.97963722613529),
 (b'many', b'times'): (18, 11.52315244811375),
 (b'Nothing', b'can'): (12, 15.5144376709064),
 (b'few', b'words'): (18, 11.709704106675987),
 (b'no', b'objection'): (17, 30.84522216218463),
 (b'It', b'seems'): (24, 18.227904675031336),
 (b'astonished', b'at'): (22, 13.982976748145253),
 (b'four', b'times'): (13, 17.90461714401523),
 (b'other', b'end'): (40, 10.241020603889318),
 (b'few', b'hours'): (23, 78.07853039580834),
 (b'an', b'extraordinary'): (13, 10.355991878257592),
 (b'look', b'forward'): (11, 11.731590179742955),
 (b'Alas', b'!'): (16, 24.207359037773784),
 (b'immediately', b'followed'): (7, 12.604630780832807),
 (b'wait', b'till'): (17, 29.399153804709538),
 (b'-', b'bye'): (37, 39.93033091202583),
 (b'contrast', b'between'): (12, 166.24220430107528),
 (b'dared', b'not'): (22, 11.783382014386875),
 (b'three', b'weeks'): (9, 21.40939225562999),
 (b'-', b'sighted'): (11, 32.25142112125163),
 (b'Maple', b'Grove'): (31, 16731.47346514048),
 (b'My', b'brother'): (17, 11.063251359603091),
 (b'at', b'Maple'): (10, 11.541925778829572),
 (b'almost', b'fancy'): (9, 16.725382014874913),
 (b'left', b'behind'): (27, 29.19139358595579),
 (b'barouche', b'-'): (7, 13.97561581920904),
 (b'-', b'landau'): (7, 19.965165456012915),
 (b'whose', b'name'): (60, 31.460744776298874),
 (b'most', b'serious'): (8, 10.186756348661634),
 (b'We', b'cannot'): (19, 11.86846921192564),
 (b'waited', b'for'): (38, 10.711792586527258),
 (b'E', b'.,'): (6, 566.3195970695971),
 (b'person', b'who'): (30, 10.409305441471528),
 (b'greater', b'part'): (10, 16.18360863375623),
 (b'drew', b'back'): (11, 15.195552552368083),
 (b'Her', b'manners'): (6, 10.694144704987204),
 (b'third', b'time'): (23, 13.583626301884719),
 (b'very', b'extraordinary'): (12, 11.12691105559154),
 (b'better', b'acquainted'): (7, 13.44978251413658),
 (b'According', b'to'): (45, 10.652559050879573),
 (b'have', b'committed'): (34, 16.773483913206135),
 (b'hardly', b'less'): (8, 12.999957957579198),
 (b'will', b'shew'): (48, 12.349694416837725),
 (b'little', b'boys'): (17, 17.749466634776372),
 (b'easily', b'believe'): (8, 21.82456945228684),
 (b'my', b'lord'): (180, 36.09212316660775),
 (b'"', b'Excuse'): (11, 17.184566649067715),
 (b'Excuse', b'me'): (13, 37.6902120916626),
 (b'put', b'forth'): (36, 10.696077873443938),
 (b'drawing', b'near'): (8, 18.702625052924454),
 (b'great', b'joy'): (24, 10.008039650578848),
 (b'eight', b'o'): (9, 65.15181205225453),
 (b'spread', b'abroad'): (14, 194.30906996229578),
 (b'few', b'lines'): (10, 43.57778059642595),
 (b'good', b'news'): (18, 24.794821736984947),
 (b'most', b'likely'): (12, 19.419197832415968),
 (b'talk', b'about'): (35, 21.82456945228684),
 (b'tells', b'us'): (12, 30.445234478296342),
 (b'dear', b'madam'): (10, 68.52158400921863),
 (b'eleven', b'years'): (6, 16.99145510495659),
 (b'your', b'sister'): (79, 15.96501961999174),
 (b'two', b'hours'): (17, 15.078657986492088),
 (b'two', b'months'): (19, 19.986458535324154),
 (b'door', b'opened'): (19, 32.959977767541375),
 (b'Who', b'can'): (30, 10.979605329969967),
 (b'began', b'talking'): (9, 13.574813692886854),
 (b'mean', b'?"'): (59, 26.417439208554274),
 (b'In', b'spite'): (9, 11.906105907607598),
 (b'many', b'hours'): (12, 13.124384550084889),
 (b'few', b'steps'): (8, 22.23496206809765),
 (b'most', b'excellent'): (10, 12.940493329092522),
 (b'later', b'than'): (8, 11.966813731181546),
 (b'whole', b'story'): (18, 36.394832862523536),
 (b'whole', b'history'): (8, 19.243869803335823),
 (b'lined', b'with'): (12, 13.540103155017157),
 (b'-', b'plaister'): (9, 17.4695197740113),
 (b'Lord', b'bless'): (12, 16.701184413580247),
 (b'these', b'things'): (325, 42.22024236217784),
 (b'laid', b'down'): (26, 12.408517237129121),
 (b'forty', b'years'): (63, 161.26435572340617),
 (b'faint', b'smile'): (6, 24.270839874411305),
 (b'turned', b'towards'): (11, 11.117214558790042),
 (b'totally', b'different'): (7, 158.3259088581669),
 (b'Box', b'Hill'): (18, 8589.180555555555),
 (b'some', b'surprise'): (19, 22.397283733443707),
 (b'may', b'depend'): (9, 21.164305270362764),
 (b',"', b'interrupted'): (25, 30.235165275720284),
 (b'whatever', b'else'): (8, 18.739462440532105),
 (b'mid', b'-'): (22, 25.824507492016703),
 (b'larger', b'than'): (11, 19.006115925994223),
 (b'were', b'assembled'): (17, 18.151482242442032),
 (b'insisted', b'on'): (9, 11.611141360470139),
 (b'clothed', b'with'): (36, 12.658921838579532),
 (b'twenty', b'minutes'): (7, 11.181099087860133),
 (b'quite', b'alone'): (14, 11.34125512474631),
 (b'etc', b'.,'): (11, 3964.237179487179),
 (b'As', b'soon'): (46, 20.834318136064653),
 (b'without', b'knowing'): (18, 34.569457344341245),
 (b',"', b'whispered'): (18, 26.71560306784347),
 (b'shan', b"'"): (18, 22.473452193261995),
 (b'looking', b'round'): (23, 17.251680754316958),
 (b'Pardon', b'me'): (7, 10.147364793909164),
 (b',"', b'answered'): (143, 22.574837953998617),
 (b'An', b'old'): (15, 19.494650501535183),
 (b'Shall', b'we'): (25, 13.344431737263449),
 (b'old', b'age'): (36, 48.2305151350481),
 (b'an', b'infant'): (9, 23.77170862963675),
 (b'be', b'forgiven'): (36, 18.33315259385065),
 (b'lie', b'down'): (41, 31.29163861453632),
 (b'four', b'miles'): (7, 16.305990613299585),
 (b'great', b'hurry'): (16, 19.653682918020948),
 (b'without', b'waiting'): (12, 19.24746342981637),
 (b'comes', b'back'): (13, 15.420287686817213),
 (b'heightened', b'by'): (7, 11.874899189677024),
 (b'In', b'fact'): (28, 39.68262953497974),
 (b'cut', b'off'): (213, 155.49937138546645),
 (b'never', b'mind'): (29, 11.197235791371655),
 (b'trembling', b'voice'): (7, 11.950048791799114),
 (b'More', b'than'): (14, 24.232797805642633),
 (b'time', b'past'): (23, 13.32652296494053),
 (b'second', b'time'): (44, 20.945309358703252),
 (b'five', b'hundred'): (65, 85.88703405366368),
 (b'turning', b'away'): (13, 10.732190271245857),
 (b'an', b'arrow'): (10, 14.85731789352297),
 (b'--', b'oh'): (17, 12.810820328683882),
 (b'presented', b'themselves'): (6, 11.894541467918142),
 (b'at', b'random'): (11, 18.66781039010696),
 (b'far', b'distant'): (10, 26.635412180205016),
 (b'few', b'seconds'): (9, 96.54154470592826),
 (b'passing', b'through'): (10, 15.455115709501674),
 (b'will', b'heal'): (10, 10.57744672117128),
 (b'rose', b'early'): (12, 50.250699385933345),
 (b'east', b'wind'): (22, 148.3861256175018),
 (b'gone', b'mad'): (10, 20.604417938295462),
 (b'freed', b'from'): (10, 25.542343339556247),
 (b'sinned', b'against'): (43, 81.41629018847007),
 (b'locked', b'up'): (11, 13.13762812682564),
 (b'deep', b'sigh'): (7, 47.7618937287612),
 (b'ten', b'thousand'): (75, 127.07678714010615),
 (b'happier', b'than'): (10, 23.413331213181287),
 (b'contend', b'with'): (14, 10.669850873308452),
 (b'had', b'formerly'): (10, 11.685871610947764),
 (b'little', b'boy'): (63, 26.45163569321534),
 (b'fancying', b'herself'): (6, 29.039303155522166),
 (b'right', b'hand'): (196, 45.67275064863757),
 (b'surrounded', b'by'): (18, 30.406938834172983),
 (b'infinitely', b'more'): (8, 12.604887693122986),
 (b'such', b'cases'): (8, 14.9147687533664),
 (b'No', b'wonder'): (13, 19.253530718908465),
 (b'poor', b'fellow'): (31, 72.63216713721062),
 (b'Poor', b'fellow'): (7, 45.430376492194675),
 (b'days', b'ago'): (11, 15.442637278485753),
 (b'help', b'laughing'): (7, 20.696820615796522),
 (b'draw', b'near'): (18, 83.0335357666646),
 (b'at', b'intervals'): (30, 31.38593852137866),
 (b'into', b'temptation'): (8, 11.801251836726903),
 (b'stood', b'before'): (56, 10.3924315864046),
 (b'Sir', b'Walter'): (136, 1001.3120125576278),
 (b'Walter', b'Elliot'): (16, 158.52514448173008),
 (b'Kellynch', b'Hall'): (24, 4945.285774410774),
 (b'arising', b'from'): (7, 10.216937335822498),
 (b'Charles', b'Musgrove'): (14, 248.91721824686942),
 (b'first', b'year'): (71, 36.52536994480359),
 (b'Lady', b'Elliot'): (12, 34.95596737726098),
 (b'seventeen', b'years'): (7, 50.974365314869765),
 (b'an', b'awful'): (13, 15.611271338865924),
 (b'Lady', b'Russell'): (147, 1370.6224754175123),
 (b'Anne', b'Elliot'): (23, 69.51674910071942),
 (b'Miss', b'Elliot'): (48, 81.92874088041015),
 (b'everybody', b'else'): (20, 116.64359274208759),
 (b'her', b'mistress'): (30, 10.11840289117327),
 (b'Mr', b'Elliot'): (174, 154.42250147754135),
 (b'Mr', b'Shepherd'): (26, 153.50875886524824),
 (b'anybody', b'else'): (21, 167.79555359595722),
 (b'reference', b'to'): (30, 10.087650616363232),
 (b'an', b'honest'): (28, 22.111184865066537),
 (b'descend', b'into'): (11, 19.585056239674437),
 (b'Mrs', b'Clay'): (66, 287.0445438704621),
 (b'Miss', b'Anne'): (19, 13.816993610080877),
 (b'their', b'fathers'): (151, 21.03847250792006),
 (b'an', b'example'): (14, 17.82878147222756),
 (b'Admiral', b'Croft'): (14, 1020.8710564930301),
 (b'Mrs', b'Croft'): (41, 207.37305091376518),
 (b'walked', b'along'): (8, 11.422274896105895),
 (b'Frederick', b'Wentworth'): (6, 23.252406376898783),
 (b'either', b'side'): (18, 19.363312306866),
 (b'Captain', b'Wentworth'): (196, 976.2658980080998),
 (b'eldest', b'son'): (15, 39.758333601208655),
 (b'removed', b'from'): (36, 14.303712270151498),
 (b'good', b'humour'): (23, 57.21881939304219),
 (b'The', b'Crofts'): (8, 10.249618801378945),
 (b'startled', b'by'): (14, 14.177175563185834),
 (b'most', b'important'): (14, 33.80560735175321),
 (b'replied', b'Anne'): (11, 13.874444726962713),
 (b'at', b'Uppercross'): (20, 13.940248018586365),
 (b'Great', b'House'): (13, 1177.944761904762),
 (b'left', b'alone'): (16, 14.13574836462775),
 (b'Mr', b'Musgrove'): (21, 32.388661211129296),
 (b'Miss', b'Musgroves'): (22, 227.52303763499037),
 (b'Mrs', b'Musgrove'): (66, 156.77048165232932),
 (b'flower', b'-'): (23, 13.52478950246036),
 (b'grown', b'up'): (19, 12.21891287320147),
 (b'their', b'faces'): (62, 22.656977192617102),
 (b'surprised', b'at'): (27, 14.056416752074586),
 (b'ere', b'long'): (20, 33.88236905544598),
 (b'anything', b'else'): (30, 74.81068120893052),
 (b'quite', b'different'): (12, 19.349238133975785),
 (b'their', b'sakes'): (13, 16.878072078710716),
 (b'twentieth', b'year'): (13, 185.54485448544855),
 (b'on', b'board'): (69, 34.25698443940126),
 (b'eight', b'years'): (22, 61.89744359662757),
 (b'-', b'bone'): (22, 13.499174370826912),
 (b'their', b'heads'): (77, 21.62117992564767),
 (b'Your', b'sister'): (11, 24.014484311898105),
 (b'dressing', b'-'): (19, 29.645245677110086),
 (b'up', b'stairs'): (15, 14.51249618660972),
 (b'waited', b'till'): (7, 12.054520291606565),
 (b'third', b'part'): (39, 80.67867146551659),
 (b'Phoo', b'!'): (7, 23.96284025961445),
 (b'dear', b'fellow'): (11, 20.63122602168474),
 (b'good', b'cheer'): (14, 58.85364280427197),
 (b'Mrs', b'Harville'): (24, 84.63892670628489),
 (b'"', b'Ay'): (34, 18.457497511961623),
 (b'fifteen', b'years'): (9, 52.05892627901593),
 (b'Charles', b'Hayter'): (33, 2649.294369645043),
 (b'came', b'near'): (42, 11.627278418555207),
 (b'Her', b'husband'): (9, 21.943829394649068),
 (b'two', b'hundred'): (102, 34.52901130958137),
 (b'Dr', b'Shirley'): (9, 1086.3785682916116),
 (b'went', b'up'): (206, 10.892879248012838),
 (b'within', b'reach'): (7, 18.479083248670293),
 (b'-', b'yard'): (19, 16.037591923682502),
 (b'turn', b'back'): (15, 10.596023199431151),
 (b'walking', b'along'): (8, 19.12445108751675),
 (b'leaning', b'against'): (12, 23.700128657852233),
 (b'trodden', b'under'): (9, 72.24544392523364),
 (b'under', b'foot'): (15, 22.22936736161035),
 (b'Louisa', b'Musgrove'): (15, 189.52528348145879),
 (b'provoke', b'me'): (18, 16.48946779010239),
 (b'Very', b'good'): (11, 10.325200491977538),
 (b'good', b'humoured'): (9, 26.15717457967643),
 (b'Captain', b'Harville'): (37, 475.42275075075077),
 (b'at', b'Lyme'): (24, 20.293117264867515),
 (b'earnest', b'desire'): (6, 32.79008483563097),
 (b'Captain', b'Benwick'): (56, 811.8267953667954),
 (b'an', b'officer'): (9, 10.895366455250176),
 (b'place', b'where'): (114, 29.88335727268697),
 (b'-', b'coat'): (34, 13.073963185711682),
 (b'an', b'introduction'): (7, 10.895366455250176),
 (b'preceding', b'evening'): (6, 48.9411997467553),
 (b'an', b'agony'): (10, 15.202836914302573),
 (b'catching', b'hold'): (10, 135.5294762217839),
 (b'raised', b'up'): (35, 20.917452604163728),
 (b'could', b'scarcely'): (17, 17.384072637319388),
 (b'passed', b'along'): (11, 16.544168004280362),
 (b'leaning', b'over'): (16, 33.66056062742769),
 (b't', b'talk'): (19, 11.57051713815297),
 (b'Camden', b'Place'): (29, 11505.506976744186),
 (b'straight', b'forward'): (6, 11.997691337666117),
 (b'same', b'hour'): (17, 12.921683411398027),
 (b'-', b'glasses'): (16, 16.35444404375526),
 (b'poring', b'over'): (6, 41.31068804275217),
 (b'thirty', b'feet'): (8, 12.120752621435336),
 (b'Colonel', b'Wallis'): (23, 967.3744677153037),
 (b'Mrs', b'Wallis'): (11, 54.17854483332603),
 (b'-', b'haired'): (38, 60.68359500446031),
 (b'at', b'length'): (74, 17.88998495718584),
 (b'carried', b'away'): (73, 68.93771731236751),
 (b'greater', b'than'): (56, 48.182171075546755),
 (b'Miss', b'Carteret'): (12, 320.09368530020697),
 (b'contact', b'with'): (11, 11.025512569085398),
 (b'Lady', b'Dalrymple'): (25, 1027.2774086378738),
 (b'Laura', b'Place'): (7, 777.3991200502828),
 (b'be', b'established'): (41, 13.256107901092253),
 (b'Mrs', b'Smith'): (64, 111.99977591965032),
 (b'Westgate', b'Buildings'): (7, 8589.180555555555),
 (b'buried', b'him'): (40, 10.212015743068294),
 (b'at', b'liberty'): (25, 14.03136075073399),
 (b'human', b'nature'): (9, 43.51094068810244),
 (b'five', b'thousand'): (31, 37.910597744077265),
 (b'whose', b'names'): (9, 20.29106718070708),
 (b'her', b'ladyship'): (21, 34.12236790377201),
 (b'-', b'maker'): (21, 26.620220608017217),
 (b'old', b'gentleman'): (31, 27.054312400201237),
 (b'almost', b'entirely'): (13, 36.10558657179347),
 (b'lower', b'part'): (8, 15.927465187754331),
 (b'staring', b'at'): (33, 25.04597894006017),
 (b'an', b'oath'): (37, 44.98731955716202),
 (b'wiser', b'than'): (8, 29.373088249263795),
 (b'prejudice', b'against'): (7, 39.177763699714916),
 (b'both', b'sides'): (30, 99.67073029216844),
 (b'my', b'soul'): (234, 16.443509373671976),
 (b'rejoice', b'over'): (13, 10.116903194143386),
 (b'same', b'instant'): (19, 25.162444780283654),
 (b'every', b'one'): (375, 14.671392435905247),
 (b'their', b'seats'): (11, 12.658554059033037),
 (b'their', b'mouths'): (24, 26.497142818484406),
 (b'short', b'silence'): (9, 19.427041120849434),
 (b'-', b'blooded'): (7, 17.4695197740113),
 (b'general', b'character'): (6, 10.833526031812768),
 (b'fifty', b'pounds'): (6, 35.380799816923165),
 (b'be', b'saved'): (61, 13.4097233300756),
 (b'threw', b'himself'): (8, 10.02991247371238),
 (b'some', b'moments'): (13, 21.006148097826088),
 (b'exclaimed', b'Mrs'): (11, 12.149128235351897),
 (b'compassion', b'on'): (20, 13.01248600742343),
 (b'an', b'explanation'): (12, 16.343049682875264),
 (b'our', b'hearts'): (17, 14.94420279348509),
 (b'minutes', b'afterwards'): (7, 22.216989096657148),
 (b'make', b'haste'): (26, 50.3846341860844),
 (b"'", b'n'): (26, 22.533063472289214),
 (b'n', b"'"): (20, 16.09504533734944),
 (b'rising', b'sun'): (7, 13.06573846170098),
 (b'-', b'faced'): (30, 42.60858481466171),
 (b'an', b'atonement'): (66, 89.61132859823739),
 (b'atonement', b'for'): (64, 24.315805643301744),
 (b'"', b'Look'): (42, 10.09252327008739),
 (b'Look', b'here'): (14, 26.311681865240885),
 ...}

In [18]:
"Jon lives in New York City".split()


Out[18]:
['Jon', 'lives', 'in', 'New', 'York', 'City']

In [19]:
# CODE HERE
bigram["Jon lives in New York City".split()]


Out[19]:
['Jon', 'lives', 'in', 'New_York', 'City']

Preprocess the corpus


In [20]:
lower_sents = []
for s in gberg_sents:
    lower_sents.append([w.lower() for w in s if w not in list(string.punctuation)])

In [21]:
lower_sents[0:5]


Out[21]:
[['emma', 'by', 'jane', 'austen', '1816'],
 ['volume', 'i'],
 ['chapter', 'i'],
 ['emma',
  'woodhouse',
  'handsome',
  'clever',
  'and',
  'rich',
  'with',
  'a',
  'comfortable',
  'home',
  'and',
  'happy',
  'disposition',
  'seemed',
  'to',
  'unite',
  'some',
  'of',
  'the',
  'best',
  'blessings',
  'of',
  'existence',
  'and',
  'had',
  'lived',
  'nearly',
  'twenty',
  'one',
  'years',
  'in',
  'the',
  'world',
  'with',
  'very',
  'little',
  'to',
  'distress',
  'or',
  'vex',
  'her'],
 ['she',
  'was',
  'the',
  'youngest',
  'of',
  'the',
  'two',
  'daughters',
  'of',
  'a',
  'most',
  'affectionate',
  'indulgent',
  'father',
  'and',
  'had',
  'in',
  'consequence',
  'of',
  'her',
  'sister',
  's',
  'marriage',
  'been',
  'mistress',
  'of',
  'his',
  'house',
  'from',
  'a',
  'very',
  'early',
  'period']]

In [22]:
lower_bigram = Phraser(Phrases(lower_sents))

In [23]:
lower_bigram.phrasegrams # miss taylor, mr woodhouse, mr weston


Out[23]:
{(b'two', b'daughters'): (19, 11.080802900992637),
 (b'her', b'sister'): (201, 16.93971298099339),
 (b'very', b'early'): (25, 10.516998773665177),
 (b'her', b'mother'): (253, 10.70812618607742),
 (b'long', b'ago'): (38, 59.226442015336005),
 (b'more', b'than'): (562, 28.529926612065935),
 (b'had', b'been'): (1260, 21.583193129694834),
 (b'an', b'excellent'): (58, 37.41859680854167),
 (b'sixteen', b'years'): (15, 131.42913000977515),
 (b'miss', b'taylor'): (48, 420.4340982546865),
 (b'mr', b'woodhouse'): (132, 104.19907841850323),
 (b'very', b'fond'): (30, 24.185726346489627),
 (b'passed', b'away'): (25, 11.751473221742694),
 (b'too', b'much'): (177, 30.36309017383541),
 (b'did', b'not'): (977, 10.846196223896685),
 (b'any', b'means'): (28, 14.294148100212627),
 (b'after', b'dinner'): (22, 18.60737125272944),
 (b'mr', b'weston'): (162, 91.63290824201266),
 (b'five', b'years'): (42, 37.66428596665674),
 (b'years', b'old'): (176, 48.599094446190286),
 (b'seven', b'years'): (53, 50.3345604292756),
 (b'each', b'other'): (239, 71.31277029783762),
 (b'well', b'informed'): (8, 14.185028016786625),
 (b'a', b'mile'): (49, 11.700110753652233),
 (b'difference', b'between'): (44, 207.86784241868986),
 (b'mrs', b'weston'): (249, 180.6778969011602),
 (b'could', b'not'): (1059, 10.213333164207079),
 (b'having', b'been'): (49, 10.723750443105281),
 (b'sixteen', b'miles'): (6, 105.040625),
 (b'miles', b'off'): (16, 32.99182066941624),
 (b'at', b'hartfield'): (67, 25.555992478744276),
 (b'her', b'husband'): (168, 26.67842743680748),
 (b'in', b'spite'): (105, 13.346436370855669),
 (b'emma', b'could'): (61, 10.88608805283633),
 (b'every', b'body'): (148, 39.26110856993498),
 (b'no', b'means'): (80, 26.766046928639682),
 (b'able', b'to'): (349, 10.854471217684639),
 (b'very', b'much'): (241, 15.43191856451234),
 (b'have', b'been'): (986, 17.20622462716941),
 (b'great', b'deal'): (182, 110.16914388000741),
 (b'agree', b'with'): (26, 13.12648342622773),
 (b'good', b'humoured'): (30, 149.07455772926625),
 (b'for', b'ever'): (565, 10.477844500702485),
 (b'three', b'times'): (41, 38.14441525690869),
 (b'my', b'dear'): (340, 26.343257103019322),
 (b'last', b'night'): (70, 23.230537949447992),
 (b'doubt', b'whether'): (12, 19.564378663784144),
 (b'anywhere', b'else'): (6, 15.30646630236794),
 (b'i', b'am'): (2445, 16.330282814272387),
 (b'very', b'glad'): (46, 16.952537612290158),
 (b'am', b'sure'): (282, 60.9206736247954),
 (b'very', b'pretty'): (40, 18.02785881936292),
 (b'be', b'able'): (121, 10.915272046078586),
 (b'immediately', b'afterwards'): (10, 37.530798337572115),
 (b'mr', b'knightley'): (277, 179.5658713081734),
 (b'sensible', b'man'): (17, 13.469145146927872),
 (b'intimate', b'friend'): (6, 20.19472630173565),
 (b'connected', b'with'): (31, 16.865113476644794),
 (b'elder', b'brother'): (6, 14.417929653989134),
 (b'than', b'usual'): (30, 27.96484669654346),
 (b'brunswick', b'square'): (11, 2374.2341399607585),
 (b'some', b'time'): (149, 11.678170286892607),
 (b'poor', b'isabella'): (11, 43.036881601877866),
 (b'am', b'afraid'): (65, 24.40430621044607),
 (b'moonlight', b'night'): (6, 13.233464566929133),
 (b'look', b'at'): (188, 10.167669372199262),
 (b'vast', b'deal'): (11, 58.66522301228184),
 (b'an', b'hour'): (155, 40.46461919025373),
 (b'pretty', b'well'): (22, 13.991375757171793),
 (b'tolerably', b'well'): (7, 13.779741502021295),
 (b'miss', b'woodhouse'): (173, 272.89637286224394),
 (b'you', b'please'): (94, 10.458014920259812),
 (b'any', b'rate'): (47, 81.39444780766237),
 (b'very', b'true'): (50, 13.110718065819091),
 (b',"', b'said'): (2585, 35.208800203215695),
 (b'my', b'dearest'): (20, 15.989439634668441),
 (b'so', b'much'): (501, 16.689081989684137),
 (b'much', b'less'): (40, 18.956871177506912),
 (b'any', b'body'): (93, 20.81402827327549),
 (b'has', b'been'): (266, 28.01515665997839),
 (b'been', b'used'): (29, 13.60480451219398),
 (b'dear', b'emma'): (33, 26.724453867413775),
 (b'every', b'thing'): (258, 26.813583851480026),
 (b'very', b'sorry'): (34, 20.451631026452166),
 (b'turned', b'away'): (50, 18.47583754119931),
 (b'divided', b'between'): (10, 32.86799217731421),
 (b'how', b'much'): (142, 13.223145529540588),
 (b'four', b'years'): (23, 17.08789651812486),
 (b'years', b'ago'): (56, 157.92138920022722),
 (b'any', b'thing'): (384, 34.60353767883287),
 (b'oh', b'dear'): (22, 13.071010293556995),
 (b'need', b'not'): (108, 12.811033130226706),
 (b'ever', b'since'): (68, 42.54639154544724),
 (b'leave', b'off'): (19, 10.456551581109641),
 (b'match', b'making'): (6, 19.514707779641338),
 (b'young', b'lady'): (73, 46.68694722651542),
 (b'depend', b'upon'): (45, 79.94054343302217),
 (b'more', b'likely'): (16, 10.63926988816206),
 (b'have', b'done'): (272, 12.079550287766299),
 (b',"', b'rejoined'): (6, 10.722990216928967),
 (b'mr', b'elton'): (214, 139.40875576036868),
 (b'any', b'longer'): (32, 15.647026370007673),
 (b'very', b'well'): (211, 12.391112026636758),
 (b'young', b'man'): (266, 24.28633425456424),
 (b'dine', b'with'): (23, 13.166381552143315),
 (b'much', b'better'): (40, 10.49785543468346),
 (b'i', b'dare'): (138, 13.033485583230483),
 (b'dare', b'say'): (115, 119.19759719744675),
 (b'take', b'care'): (71, 74.57127010537992),
 (b'chapter', b'ii'): (11, 279.33240997229916),
 (b'entering', b'into'): (14, 14.764128843338215),
 (b'never', b'seen'): (42, 13.026399306383945),
 (b'mrs', b'churchill'): (59, 72.70198534025822),
 (b'refrain', b'from'): (10, 13.332613486117234),
 (b'at', b'once'): (270, 18.772289267502124),
 (b'three', b'years'): (80, 36.12160535692111),
 (b'mother', b's'): (212, 10.43349082727472),
 (b'twenty', b'years'): (71, 82.70010934937123),
 (b'according', b'to'): (792, 11.428224398751397),
 (b'had', b'begun'): (25, 11.498896738107863),
 (b'passed', b'through'): (45, 29.981417521047756),
 (b'its', b'being'): (58, 15.035551583602627),
 (b'deal', b'better'): (14, 18.324491856239064),
 (b'belonging', b'to'): (36, 10.007375856554956),
 (b'mr', b'frank'): (50, 51.13385894796941),
 (b'frank', b'churchill'): (151, 1615.1350106048417),
 (b'mrs', b'perry'): (11, 23.552718142359424),
 (b'miss', b'bates'): (113, 368.52784388923357),
 (b'a', b'few'): (452, 11.993654742645493),
 (b'few', b'days'): (53, 34.43889574044208),
 (b'i', b'suppose'): (210, 11.320989637575268),
 (b'very', b'handsome'): (21, 18.293203927526704),
 (b'an', b'irresistible'): (7, 10.743649616890554),
 (b'good', b'sense'): (28, 15.484518577039912),
 (b'had', b'already'): (64, 11.161348479021472),
 (b'long', b'enough'): (39, 14.424561747959498),
 (b'at', b'randalls'): (39, 24.914802703291915),
 (b'few', b'weeks'): (19, 130.17082616179002),
 (b'no', b'longer'): (117, 37.27164862774777),
 (b'mr', b'perry'): (36, 95.9153455928979),
 (b'chapter', b'iii'): (10, 294.85087719298247),
 (b'donwell', b'abbey'): (9, 737.1720986902222),
 (b'card', b'table'): (7, 52.05489116407124),
 (b'drawing', b'room'): (49, 219.77986922924512),
 (b'thrown', b'away'): (11, 13.991597804637394),
 (b'mrs', b'goddard'): (58, 292.6791161249692),
 (b'an', b'invitation'): (13, 13.178876863385744),
 (b'mrs', b'bates'): (30, 54.6665437867962),
 (b'those', b'who'): (174, 14.383248820933867),
 (b'as', b'possible'): (81, 10.515284953239568),
 (b'young', b'ladies'): (47, 111.39207280187263),
 (b'old', b'fashioned'): (38, 181.0280072171398),
 (b'coming', b'back'): (15, 10.421002047913248),
 (b'goddard', b's'): (34, 30.051494868668044),
 (b'found', b'herself'): (27, 10.853751672615102),
 (b's', b'sake'): (143, 27.30490695465855),
 (b'much', b'pleased'): (18, 12.52905813538587),
 (b'miss', b'smith'): (58, 148.87785876781652),
 (b'harriet', b'smith'): (31, 171.76079313432186),
 (b'several', b'years'): (10, 16.071496876195333),
 (b'pretty', b'girl'): (10, 36.100742261151815),
 (b'blue', b'eyes'): (28, 33.95451351272216),
 (b'due', b'time'): (18, 19.500628497333533),
 (b'its', b'own'): (54, 10.339536241447126),
 (b'an', b'egg'): (7, 16.473596079232184),
 (b'better', b'than'): (175, 40.46903672973475),
 (b'body', b'else'): (31, 37.603287294006286),
 (b'much', b'more'): (163, 10.196735265430883),
 (b'little', b'girl'): (54, 33.82044741555345),
 (b'at', b'last'): (512, 25.084119133118794),
 (b'chapter', b'iv'): (8, 252.72932330827066),
 (b'every', b'respect'): (14, 10.88373348044036),
 (b'guided', b'by'): (14, 22.059348262898627),
 (b'different', b'sort'): (8, 13.93659398034398),
 (b'abbey', b'mill'): (11, 1868.3499742665979),
 (b'good', b'deal'): (62, 34.988675608221904),
 (b'very', b'happy'): (45, 10.609415283099846),
 (b'mrs', b'martin'): (8, 10.982447223684451),
 (b'drink', b'tea'): (7, 29.38199300699301),
 (b'large', b'enough'): (11, 10.328337316490268),
 (b'had', b'taken'): (121, 10.546895054673817),
 (b'mr', b'martin'): (37, 92.3345987295168),
 (b'three', b'miles'): (9, 15.396834283387623),
 (b'thing', b'else'): (26, 11.724101848622253),
 (b'very', b'obliging'): (14, 23.82930511612031),
 (b'on', b'purpose'): (36, 10.243903158363866),
 (b'very', b'clever'): (15, 19.348581077191703),
 (b'miss', b'nash'): (13, 312.88118939883645),
 (b'does', b'not'): (218, 11.755881647050003),
 (b'oh', b'yes'): (33, 23.31216733177377),
 (b'very', b'entertaining'): (7, 15.091893240209531),
 (b'soon', b'as'): (277, 10.47893531442171),
 (b'have', b'seen'): (204, 12.7295316573734),
 (b'on', b'horseback'): (21, 51.189475020093916),
 (b'their', b'families'): (95, 33.21807400900406),
 (b'no', b'doubt'): (125, 34.5945564760134),
 (b'very', b'respectable'): (9, 10.061262160139687),
 (b'respectable', b'young'): (8, 26.124093264248707),
 (b'very', b'odd'): (20, 23.337979237437413),
 (b'perfectly', b'right'): (12, 15.182023486901535),
 (b'six', b'years'): (23, 29.166464467922708),
 (b'years', b'hence'): (10, 17.92215409224207),
 (b'young', b'woman'): (57, 28.12531780001931),
 (b'very', b'desirable'): (9, 13.719902945645027),
 (b'dear', b'miss'): (39, 23.615690866510537),
 (b'thirty', b'years'): (36, 70.51678321678321),
 (b'can', b'afford'): (11, 24.00055535354203),
 (b'good', b'luck'): (24, 50.83824661023695),
 (b'acquainted', b'with'): (88, 25.94043153278337),
 (b'harriet', b's'): (91, 10.391255619709803),
 (b'next', b'day'): (103, 31.552600854623535),
 (b'an', b'opportunity'): (36, 39.48568132393281),
 (b'few', b'yards'): (15, 121.49277108433735),
 (b'robert', b'martin'): (31, 1822.180470288428),
 (b'few', b'minutes'): (86, 306.2525246213068),
 (b'been', b'able'): (40, 15.563975922210217),
 (b'should', b'happen'): (13, 19.56851425106125),
 (b'compared', b'with'): (28, 15.206045360968936),
 (b'well', b'bred'): (15, 56.08034332217969),
 (b'an', b'old'): (175, 10.093145123027886),
 (b'old', b'man'): (225, 11.391492291491135),
 (b'more', b'valuable'): (10, 16.92611118571237),
 (b',"', b'replied'): (256, 67.14687428979221),
 (b'very', b'bad'): (37, 14.502720230831981),
 (b'deal', b'too'): (15, 11.983244206773618),
 (b'no', b'more'): (597, 15.083521866761401),
 (b'good', b'humour'): (28, 64.86757782273477),
 (b'very', b'agreeable'): (21, 20.122524320279375),
 (b'fixed', b'on'): (32, 10.217254086671971),
 (b'same', b'time'): (104, 17.450483557641775),
 (b'pleasing', b'young'): (8, 22.392079940784605),
 (b'chapter', b'v'): (7, 186.22160664819944),
 (b'very', b'differently'): (14, 45.27567972062859),
 (b'twelve', b'years'): (25, 44.13664813761106),
 (b'very', b'neatly'): (7, 21.559847486013613),
 (b'ten', b'years'): (32, 33.477231228905),
 (b'being', b'able'): (20, 13.768546669336779),
 (b'have', b'spoken'): (82, 11.097204476268063),
 (b'yes', b',"'): (117, 21.3190811413499),
 (b'thank', b'you'): (105, 18.729348571046483),
 (b'could', b'possibly'): (21, 29.277491471292734),
 (b'grown', b'up'): (21, 13.110499863891405),
 (b'any', b'harm'): (11, 11.88333382435259),
 (b'an', b'angel'): (58, 20.854313507945204),
 (b'excuse', b'me'): (31, 17.305246066110463),
 (b'an', b'end'): (129, 17.098710216167333),
 (b'many', b'years'): (55, 18.58677200012534),
 (b',"', b'cried'): (297, 33.94160588989982),
 (b'much', b'obliged'): (41, 42.51023467141318),
 (b'mrs', b'john'): (39, 22.74667029354493),
 (b'john', b'knightley'): (58, 169.26886715265277),
 (b'be', b'satisfied'): (68, 11.946063802781145),
 (b'ill', b'humour'): (6, 26.63236200369751),
 (b'i', b'assure'): (105, 12.733255394756087),
 (b'assure', b'you'): (126, 28.436400120257016),
 (b'soon', b'afterwards'): (38, 78.59594937705434),
 (b'chapter', b'vi'): (6, 126.36466165413533),
 (b'most', b'agreeable'): (13, 26.484307288246878),
 (b'no', b'scruple'): (10, 22.498995969001474),
 (b'infinitely', b'superior'): (7, 270.2854590127318),
 (b'am', b'glad'): (34, 16.09346987631334),
 (b'very', b'interesting'): (15, 16.768770266899477),
 (b'no', b'sooner'): (40, 38.53551437243869),
 (b'don', b't'): (830, 258.7786954087346),
 (b't', b'pretend'): (9, 21.455106382978723),
 (b'why', b'should'): (100, 17.72050499445061),
 (b'cannot', b'imagine'): (13, 46.70634553033812),
 (b'back', b'again'): (74, 17.518775026164434),
 (b'an', b'artist'): (10, 15.443996324280171),
 (b'higher', b'than'): (34, 44.04157170252713),
 (b'ten', b'times'): (17, 32.73329275715155),
 (b'mr', b'john'): (33, 14.765003852281561),
 (b'must', b'allow'): (12, 15.128011144449205),
 (b'sitting', b'down'): (24, 16.50978251971512),
 (b'must', b'confess'): (10, 11.475103270125821),
 (b'depended', b'on'): (14, 19.196053132535216),
 (b'after', b'breakfast'): (11, 10.176820020576768),
 (b'sooner', b'than'): (12, 15.493715063361526),
 (b'at', b'home'): (158, 14.817173413852018),
 (b'at', b'least'): (318, 40.005129511612694),
 (b'yes', b'indeed'): (18, 13.32791435368755),
 (b'replied', b'emma'): (16, 15.97752469115056),
 (b'can', b'hardly'): (33, 23.71390376951493),
 (b'am', b'persuaded'): (12, 13.9008842237933),
 (b'beg', b'your'): (40, 41.769363228062694),
 (b'your', b'pardon'): (42, 35.83165749517416),
 (b'tell', b'me'): (198, 10.695563383577781),
 (b'entered', b'into'): (99, 56.41577688104846),
 (b'older', b'than'): (15, 56.23201606007659),
 (b'run', b'away'): (47, 37.8745366554535),
 (b'have', b'borne'): (26, 10.466681494661922),
 (b'good', b'opinion'): (19, 13.341832563897801),
 (b'good', b'natured'): (66, 159.13709037599173),
 (b'emma', b'felt'): (19, 16.14051174170535),
 (b'no', b'difficulty'): (16, 11.983675747699731),
 (b'let', b'us'): (399, 31.76961003819016),
 (b'cried', b'emma'): (27, 13.884714320584314),
 (b'bond', b'street'): (8, 172.37435897435898),
 (b'some', b'weeks'): (10, 11.200682740440154),
 (b'next', b'morning'): (69, 78.14689256415173),
 (b'without', b'ceremony'): (6, 10.070807949665435),
 (b'dear', b'sir'): (24, 14.956604215456675),
 (b'sat', b'down'): (150, 55.2769805500231),
 (b'depends', b'upon'): (8, 17.98662227242999),
 (b'has', b'happened'): (14, 15.991242753687029),
 (b'presently', b'added'): (6, 18.897863568215893),
 (b'could', b'afford'): (11, 15.53959162707076),
 (b'does', b'seem'): (9, 12.372250907417822),
 (b'few', b'moments'): (43, 372.31655654877574),
 (b'nobody', b'knows'): (7, 29.65198853194148),
 (b'very', b'likely'): (33, 27.439805891290053),
 (b'good', b'tempered'): (10, 28.20329470553686),
 (b'all', b'probability'): (16, 12.433560319461959),
 (b'no', b'harm'): (25, 22.746237682946543),
 (b'cannot', b'help'): (16, 18.923655657158456),
 (b'very', b'different'): (29, 16.46388353477403),
 (b'common', b'sense'): (30, 161.56643879512924),
 (b'an', b'hundred'): (186, 29.194395140414738),
 (b'every', b'man'): (327, 11.99741282796895),
 (b'less', b'than'): (90, 36.5391922539134),
 (b'large', b'fortune'): (9, 31.79160591133005),
 (b'no', b'use'): (43, 12.936922682175846),
 (b'these', b'words'): (121, 23.297961459941245),
 (b'twenty', b'thousand'): (49, 74.70393983612077),
 (b'thousand', b'pounds'): (48, 447.518052808119),
 (b'walked', b'off'): (15, 10.460621861989791),
 (b'cast', b'down'): (44, 13.997424310193253),
 (b'its', b'effects'): (8, 41.35193876156195),
 (b'deal', b'more'): (29, 10.514102007124858),
 (b'longer', b'than'): (32, 18.0629530688156),
 (b'perfectly', b'satisfied'): (12, 88.02506546950993),
 (b'three', b'hundred'): (78, 46.95431447257635),
 (b'well', b'known'): (41, 14.092917445249052),
 (b'destin', b'd'): (7, 66.58969843715606),
 (b'looking', b'at'): (107, 11.578307624593487),
 (b'next', b'moment'): (22, 24.112003750805837),
 (b'ready', b'wit'): (8, 64.00444303395747),
 (b'very', b'pleasant'): (20, 11.433252454704187),
 (b'an', b'idea'): (37, 13.222953374634528),
 (b'arrive', b'at'): (10, 10.99182472204055),
 (b'nobody', b'could'): (24, 14.343345036944983),
 (b'have', b'chosen'): (38, 11.818665160781638),
 (b'without', b'exception'): (8, 49.91617853312434),
 (b'her', b'cheeks'): (14, 10.713032768823975),
 (b'sit', b'down'): (61, 35.66504947879112),
 (b'reason', b'why'): (21, 17.06388019290972),
 (b'could', b'hardly'): (47, 23.030990847938746),
 (b'an', b'offering'): (71, 10.631590689986872),
 (b'can', b'easily'): (12, 13.582403838136347),
 (b'dear', b'mother'): (25, 13.628181044152825),
 (b'those', b'things'): (86, 14.861480769860593),
 (b'next', b'week'): (13, 53.87957922858575),
 (b'taken', b'away'): (75, 32.39398354768761),
 (b'stay', b'longer'): (10, 35.24360408220327),
 (b'three', b'days'): (100, 36.85541434763652),
 (b'cannot', b'bear'): (17, 14.75548726953468),
 (b'o', b'clock'): (67, 157.91912099014903),
 (b'ask', b'whether'): (11, 13.90355382441143),
 (b'ran', b'away'): (24, 14.924370991613223),
 (b'who', b'lived'): (27, 11.87499040954371),
 (b'never', b'mind'): (39, 14.649399814275704),
 (b'good', b'fortune'): (20, 17.88894692751195),
 (b'jane', b'fairfax'): (111, 878.265806663654),
 (b'nothing', b'else'): (45, 30.0085928868752),
 (b'present', b'instance'): (6, 15.177453341360627),
 (b'once', b'more'): (141, 21.758506838947973),
 (b'still', b'greater'): (11, 11.400499332652899),
 (b'here', b'comes'): (20, 16.697963238946844),
 (b'turned', b'back'): (42, 27.419992871324965),
 (b'will', b'bring'): (145, 11.542472822887643),
 (b'each', b'side'): (37, 20.499637888317135),
 (b'still', b'remained'): (10, 12.13368361944489),
 (b'she', b'hoped'): (30, 15.572051565559544),
 (b'ten', b'minutes'): (42, 194.73592734674708),
 (b'most', b'favourable'): (6, 11.684253215403034),
 (b'ten', b'days'): (21, 15.98003268459367),
 (b'little', b'ones'): (53, 58.130349583162364),
 (b'mr', b'wingfield'): (9, 102.72224108658745),
 (b'passed', b'over'): (52, 21.874777302895755),
 (b'yes', b'sir'): (25, 17.0480135249366),
 (b'sir', b',"'): (121, 14.215621316157259),
 (b'cannot', b'deny'): (9, 32.95392156862745),
 (b'talking', b'about'): (25, 19.032959773504775),
 (b'never', b'forget'): (18, 19.811797243380486),
 (b'cannot', b'tell'): (30, 16.07682643742666),
 (b'two', b'years'): (53, 11.95816852658443),
 (b'indeed', b'!--'): (21, 14.904334331005433),
 (b'dear', b'madam'): (15, 86.09887295081967),
 (b'madam', b",'"): (7, 23.782783018867924),
 (b'most', b'amiable'): (9, 20.908663648615956),
 (b',"', b'observed'): (18, 11.457441601650128),
 (b'five', b'times'): (10, 11.19749042251957),
 (b'our', b'lives'): (16, 17.148517657729155),
 (b'think', b'differently'): (6, 11.912463083284111),
 (b'grow', b'up'): (20, 10.504597456351945),
 (b'shake', b'hands'): (15, 50.53995355597508),
 (b'how', b'long'): (106, 12.747548678067647),
 (b'perfectly', b'convinced'): (8, 72.51995685005393),
 (b'tells', b'me'): (15, 12.055904371165493),
 (b'bad', b'cold'): (7, 12.931807252091948),
 (b'far', b'off'): (83, 31.653885985340363),
 (b'am', b'sorry'): (32, 25.55609823252787),
 (b'mrs', b'campbell'): (9, 25.553929487788654),
 (b'ah', b'!"'): (15, 17.216148693638065),
 (b'an', b'interval'): (11, 15.772591990754218),
 (b'perfectly', b'well'): (16, 10.91605036682757),
 (b'ill', b'judged'): (6, 16.56130074179509),
 (b'can', b'tell'): (52, 10.59868410018371),
 (b'morrow', b'morning'): (14, 21.336820698491813),
 (b'own', b'feelings'): (16, 10.440997949569127),
 (b'sore', b'throat'): (9, 237.47777450691785),
 (b'well', b'satisfied'): (17, 14.614877350628644),
 (b'looked', b'at'): (184, 11.997174544178405),
 (b'well', b'pleased'): (26, 18.61784927203245),
 (b'set', b'forward'): (22, 28.094228510556935),
 (b'eldest', b'daughter'): (10, 81.36551909628834),
 (b'short', b'time'): (23, 10.057824159586259),
 (b',"', b'continued'): (103, 40.15998883792505),
 (b'dining', b'room'): (18, 272.7268377253814),
 (b'enter', b'into'): (110, 32.02961835848166),
 (b'half', b'hour'): (17, 17.306134513071832),
 (b'gone', b'through'): (24, 10.232695965811631),
 (b'turn', b'away'): (53, 24.3249510219842),
 (b'own', b'sake'): (17, 10.579228284294224),
 (b'an', b'effort'): (8, 12.355197059424137),
 (b',"', b'repeated'): (29, 15.364284489928071),
 (b'several', b'times'): (19, 99.00042075736326),
 (b'great', b'curiosity'): (13, 12.09281966721631),
 (b'upper', b'end'): (11, 46.569735221674875),
 (b'an', b'odd'): (25, 25.47463311221472),
 (b'dearest', b'emma'): (8, 38.814087759815244),
 (b'continued', b'mrs'): (17, 12.451436979718673),
 (b'go', b'home'): (37, 10.007173127288121),
 (b'judge', b'between'): (11, 14.60799652325076),
 (b'hardly', b'knew'): (16, 28.982276298141407),
 (b'set', b'off'): (42, 12.027500784416544),
 (b'got', b'home'): (12, 12.614899339653833),
 (b'most', b'extraordinary'): (16, 42.018372140007074),
 (b'an', b'inch'): (28, 63.14878497039003),
 (b'at', b'ease'): (36, 15.972495299215172),
 (b'three', b'quarters'): (8, 46.190502850162865),
 (b'smith', b'!--'): (9, 17.971662805204062),
 (b'extremely', b'sorry'): (8, 71.27550459436111),
 (b'many', b'weeks'): (10, 19.402314036865533),
 (b'madam', b',"'): (14, 12.063363994045087),
 (b'extremely', b'well'): (16, 22.29075831209327),
 (b'without', b'knowing'): (19, 31.149243193151232),
 (b'poor', b'harriet'): (15, 13.183234482788675),
 (b'an', b'instant'): (99, 42.54170416065454),
 (b'thirty', b'thousand'): (18, 40.06847682119205),
 (b'somebody', b'else'): (17, 161.2778888444622),
 (b'worth', b'having'): (9, 20.026280947967695),
 (b'poor', b'girl'): (16, 25.65660249342719),
 (b'laugh', b'at'): (34, 10.684790511927686),
 (b'knowing', b'what'): (22, 10.324563551972247),
 (b'many', b'days'): (50, 13.4746779374137),
 (b'whole', b'party'): (15, 20.8072769791595),
 (b'six', b'weeks'): (6, 16.914565278166062),
 (b'too', b'late'): (56, 82.78464324520101),
 (b'her', b'companions'): (37, 11.248284965327484),
 (b'drew', b'near'): (34, 127.06472006778334),
 (b'three', b'months'): (36, 79.13813268974364),
 (b'other', b'side'): (133, 27.098664528007962),
 (b'an', b'unnatural'): (10, 18.169407440329614),
 (b'get', b'rid'): (18, 208.69330573907504),
 (b'watering', b'place'): (9, 80.21132175526978),
 (b'while', b'ago'): (10, 12.961199989717313),
 (b'at', b'weymouth'): (16, 40.30335731414868),
 (b'present', b'occasion'): (9, 30.82190524707081),
 (b'their', b'hearts'): (50, 16.688506081078597),
 (b'break', b'through'): (12, 10.134403108825648),
 (b'burst', b'forth'): (18, 46.35182980729566),
 (b'young', b'men'): (142, 26.25825955394037),
 (b'nobody', b'else'): (21, 79.07196941851062),
 (b'something', b'else'): (35, 35.09762973791375),
 (b'walking', b'together'): (9, 11.011028608866564),
 (b'burst', b'out'): (19, 10.336378346321213),
 (b'mrs', b'cole'): (30, 133.52975449561694),
 (b'mr', b'cole'): (23, 75.77870244092516),
 (b'miss', b'fairfax'): (125, 253.16112835175565),
 (b'extremely', b'happy'): (7, 17.871069693255155),
 (b'ma', b'am'): (216, 180.33443656689536),
 (b's', b'handwriting'): (7, 11.116226942015139),
 (b'without', b'seeming'): (8, 22.511217769840382),
 (b'colonel', b'campbell'): (28, 852.6827205882353),
 (b'those', b'days'): (84, 22.563533110519458),
 (b'mrs', b'dixon'): (14, 66.64348656190336),
 (b'mr', b'dixon'): (22, 99.22034650409013),
 (b'miss', b'campbell'): (12, 69.78477508650519),
 (b'caught', b'hold'): (10, 22.46057555238774),
 (b'four', b'months'): (12, 35.22349653606567),
 (b'may', b'guess'): (11, 12.384068773773176),
 (b'running', b'away'): (12, 10.939329522648434),
 (b'five', b'minutes'): (37, 138.27274480256534),
 (b'nine', b'years'): (9, 20.394175346344422),
 (b'hundred', b'pounds'): (11, 61.54765860771082),
 (b'rather', b'than'): (78, 18.774125510292563),
 (b'few', b'months'): (17, 56.654660091141764),
 (b'she', b'wished'): (41, 13.111156859133413),
 (b'without', b'feeling'): (12, 12.941231471550754),
 (b'ill', b'health'): (7, 24.48192283569709),
 (b'twelve', b'thousand'): (24, 56.813511910645445),
 (b'mr', b'churchill'): (19, 14.856522471200662),
 (b'passed', b'between'): (17, 18.633822336745062),
 (b",'", b'said'): (252, 29.904409319301955),
 (b'miss', b'hawkins'): (18, 330.480756302521),
 (b'dear', b'jane'): (15, 27.27885083590326),
 (b'three', b'minutes'): (10, 10.22196466946896),
 (b'have', b'suffered'): (30, 12.671527233246879),
 (b'hour', b'ago'): (10, 34.58206634811038),
 (b'ford', b's'): (10, 13.291140908931144),
 (b'looked', b'round'): (26, 11.112143696738173),
 (b'help', b'thinking'): (10, 29.995538104586824),
 (b'can', b't'): (299, 33.88376276641461),
 (b'human', b'nature'): (10, 30.58786058786059),
 (b'brown', b's'): (105, 12.276957466080978),
 (b'laughed', b'at'): (28, 11.57917413466867),
 (b'weeks', b'ago'): (7, 64.64729137728389),
 (b'twenty', b'miles'): (8, 30.043399099576273),
 (b'elder', b'sister'): (6, 19.20011424219345),
 (b'driven', b'away'): (10, 13.8188620292715),
 (b'setting', b'off'): (8, 15.77691050691885),
 (b'little', b'farther'): (16, 12.063393148450245),
 (b'spot', b'where'): (18, 30.983262494042016),
 (b'front', b'door'): (16, 47.78138820286026),
 (b'they', b'parted'): (23, 10.79338518624233),
 (b'without', b'delay'): (8, 16.401030089455137),
 (b'six', b'months'): (21, 137.41988785577402),
 (b'months', b'ago'): (7, 32.82608577706644),
 (b'leaned', b'back'): (6, 11.972573463935888),
 (b'ill', b'disposed'): (6, 23.461842717543043),
 (b'at', b'oxford'): (10, 13.739780902550686),
 (b'turned', b'round'): (21, 11.459329950969135),
 (b'pass', b'through'): (64, 20.610161604147784),
 (b'clock', b'struck'): (16, 271.6146594294576),
 (b'"\'', b'tis'): (7, 65.71456500488759),
 (b'four', b'hours'): (15, 44.143732671822555),
 (b'parlour', b'door'): (6, 13.546649351812462),
 (b'faster', b'than'): (10, 38.52934433745988),
 (b'musical', b'society'): (6, 97.72799224680989),
 (b'worth', b'while'): (16, 32.809013467947075),
 (b'kind', b'hearted'): (6, 22.299646174259177),
 (b'mixed', b'with'): (29, 12.31482453633305),
 (b'extremely', b'glad'): (10, 69.64820996891908),
 (b'knew', b'nothing'): (25, 11.012511694448186),
 (b'make', b'amends'): (9, 63.131237772270765),
 (b'amends', b'for'): (12, 12.75831653909067),
 (b'oftener', b'than'): (9, 48.954931628772556),
 (b'old', b'woman'): (61, 16.854605854208888),
 (b'just', b'going'): (25, 11.441554905030975),
 (b'their', b'lives'): (30, 13.550496390619374),
 (b'six', b'days'): (24, 24.79915003775213),
 (b'may', b'prove'): (10, 12.110271334897238),
 (b'stronger', b'than'): (29, 55.48225584594223),
 (b'particular', b'friend'): (10, 18.111862154023),
 (b'good', b'tidings'): (14, 28.459688293769013),
 (b'among', b'themselves'): (31, 14.391206579513122),
 (b'next', b'summer'): (9, 24.61739395788831),
 (b'on', b'tuesday'): (9, 11.26168450442066),
 (b'breaking', b'up'): (13, 11.204903953442072),
 (b'after', b'tea'): (9, 10.09972289920876),
 (b'perfectly', b'safe'): (6, 15.182023486901535),
 (b'two', b'ladies'): (13, 10.349582404988046),
 (b'same', b'moment'): (29, 15.167877661019073),
 (b'mr', b'cox'): (13, 82.17779286926995),
 (b',"', b'added'): (51, 21.445980433857933),
 (b'little', b'girls'): (15, 21.933442088091358),
 (b'be', b'ashamed'): (88, 14.839124152298211),
 (b'been', b'staying'): (9, 13.648409654861268),
 (b'shut', b'up'): (64, 31.307093975641568),
 (b'too', b'large'): (19, 13.11269940787182),
 (b'an', b'elderly'): (7, 12.355197059424137),
 (b'worse', b'than'): (59, 50.38186909777266),
 (b'opposite', b'side'): (9, 20.550379566258513),
 (b'short', b'pause'): (11, 83.26155963302752),
 (b'large', b'party'): (8, 13.46015572858732),
 (b'who', b'knows'): (35, 23.27214961035764),
 (b'extremely', b'fond'): (6, 32.5917905623788),
 (b'five', b'couple'): (8, 45.27680909975305),
 (b'mr', b'william'): (9, 12.681758158837956),
 (b'bow', b'window'): (8, 25.073932863655198),
 (b'bad', b'news'): (9, 59.57111209570226),
 (b'baked', b'apples'): (6, 587.9825072886297),
 (b'mrs', b'wallis'): (14, 79.25171374929049),
 (b'will', b'send'): (73, 14.065286062406157),
 (b'william', b'larkins'): (13, 4596.649572649572),
 (b'low', b'voice'): (39, 51.584448802114416),
 (b'one', b'leg'): (18, 10.201187499351517),
 (b'an', b'immediate'): (12, 13.305596833225994),
 (b',"', b'resumed'): (18, 27.87977456401531),
 (b'many', b'times'): (21, 13.08508748808824),
 (b'few', b'words'): (18, 11.206759868233577),
 (b'no', b'objection'): (18, 27.457958345842616),
 (b'astonished', b'at'): (22, 12.457401351645954),
 (b'four', b'times'): (13, 16.708165484388754),
 (b'c', b'.,'): (6, 484.8028846153846),
 (b'few', b'hours'): (23, 74.55238225629792),
 (b'an', b'extraordinary'): (14, 10.691997455270888),
 (b'immediately', b'followed'): (7, 11.60491790701243),
 (b'wait', b'till'): (20, 30.635873858181593),
 (b'good', b'bye'): (45, 141.49449547184594),
 (b'contrast', b'between'): (12, 157.76636245110822),
 (b'dared', b'not'): (22, 11.170638889884433),
 (b'three', b'weeks'): (11, 30.16522635112677),
 (b'self', b'command'): (15, 129.45780554604588),
 (b'mrs', b'elton'): (142, 115.93850995531123),
 (b'maple', b'grove'): (31, 6513.823602484473),
 (b'mr', b'suckling'): (10, 67.58042176749174),
 (b'almost', b'fancy'): (9, 14.733568732497261),
 (b'left', b'behind'): (27, 27.450387913434056),
 (b'barouche', b'landau'): (7, 17286.685714285715),
 (b'whose', b'name'): (60, 27.214399799798326),
 (b'mr', b'e'): (10, 15.658878214418817),
 (b'e', b'.,'): (6, 189.19136960600375),
 (b'good', b'breeding'): (8, 36.8301848507599),
 (b'greater', b'part'): (10, 15.073845233942896),
 (b'drew', b'back'): (11, 14.116134651802135),
 (b'third', b'time'): (23, 12.45550979515326),
 (b'very', b'extraordinary'): (13, 11.609148646315022),
 (b'better', b'acquainted'): (7, 12.586519658830872),
 (b'have', b'committed'): (34, 16.059987478581785),
 (b'drawing', b'rooms'): (8, 127.1079831932773),
 (b'hardly', b'less'): (8, 12.215670175249683),
 (b'will', b'shew'): (48, 11.000022831745259),
 (b'little', b'boys'): (17, 15.38189445138874),
 (b'post', b'office'): (12, 378.61952440550687),
 (b'easily', b'believe'): (8, 20.680209638828753),
 (b'put', b'forth'): (41, 11.449099051744076),
 (b'mrs', b'bragge'): (6, 46.544657281329336),
 (b'drawing', b'near'): (8, 17.91737739872068),
 (b'great', b'joy'): (26, 10.054550288918538),
 (b'spread', b'abroad'): (14, 187.66563275434243),
 (b'few', b'lines'): (10, 41.41799014238773),
 (b'good', b'news'): (18, 22.238991398956113),
 (b'most', b'likely'): (12, 18.05748224198651),
 (b'talk', b'about'): (37, 21.439425951764),
 (b'tells', b'us'): (12, 28.49862729793018),
 (b'sixty', b'five'): (6, 24.217828123123724),
 (b'eleven', b'years'): (6, 15.563975922210217),
 (b'your', b'sister'): (93, 17.156751175996334),
 (b'two', b'hours'): (18, 15.059818488167268),
 (b'two', b'months'): (20, 19.80765285410601),
 (b'twenty', b'four'): (17, 24.690562341866855),
 (b'door', b'opened'): (19, 31.97009247027741),
 (b'began', b'talking'): (9, 12.980428544611012),
 (b'mean', b'?"'): (59, 24.672830374194895),
 (b'pretty', b'soon'): (13, 15.281868324424673),
 (b'many', b'hours'): (12, 12.099988572081596),
 (b'few', b'steps'): (8, 21.356151167168672),
 (b'most', b'excellent'): (11, 13.620500891098393),
 (b'surrounded', b'by'): (19, 28.432048872180452),
 (b'later', b'than'): (9, 15.131524321620608),
 (b'whole', b'story'): (18, 33.811825091134196),
 (b'another', b'minute'): (9, 10.47206452506231),
 (b'whole', b'history'): (9, 21.865274113693037),
 (b'lined', b'with'): (12, 12.665905060395177),
 (b'court', b'plaister'): (9, 660.5174672489082),
 (b'these', b'things'): (366, 38.76461305007588),
 (b'laid', b'down'): (26, 11.76806952449668),
 (b'forty', b'years'): (68, 158.55386535221814),
 (b'faint', b'smile'): (6, 23.553176580504516),
 (b'turned', b'towards'): (11, 10.359757315632365),
 (b'totally', b'different'): (7, 152.78636363636363),
 (b'box', b'hill'): (18, 162.97043283674864),
 (b'some', b'surprise'): (19, 20.08802185605737),
 (b'may', b'depend'): (9, 14.385534434180961),
 (b',"', b'interrupted'): (25, 29.580662667390254),
 (b'whatever', b'else'): (9, 19.41965768758576),
 (b'larger', b'than'): (11, 18.09203994976377),
 (b'were', b'assembled'): (17, 17.322526847214835),
 (b'insisted', b'on'): (9, 10.828542792712174),
 (b'clothed', b'with'): (37, 11.957873100497311),
 (b'twenty', b'minutes'): (8, 15.956660102679514),
 (b'quite', b'alone'): (14, 10.13378072065835),
 (b'etc', b'.,'): (11, 3723.286153846154),
 (b',"', b'whispered'): (18, 25.73517652062952),
 (b'shan', b't'): (20, 201.14162234042553),
 (b'looking', b'round'): (23, 16.13132441247656),
 (b',"', b'answered'): (143, 22.031354589620804),
 (b'yes', b'yes'): (31, 34.423873463814296),
 (b'old', b'age'): (51, 59.17756524475554),
 (b'an', b'infant'): (10, 23.75999434504642),
 (b'be', b'forgiven'): (36, 17.634665613629313),
 (b'lie', b'down'): (41, 29.047286538446148),
 (b'mrs', b'smallridge'): (7, 65.16252019386107),
 (b'four', b'miles'): (7, 15.174408105939005),
 (b'great', b'hurry'): (16, 18.475141158247137),
 (b'without', b'waiting'): (12, 17.062642768222968),
 (b'comes', b'back'): (13, 14.131562121366947),
 (b'heightened', b'by'): (7, 10.935403412377099),
 (b'cut', b'off'): (217, 148.51792396902442),
 (b'trembling', b'voice'): (7, 11.589135556534565),
 (b'time', b'past'): (23, 12.30314881906576),
 (b'second', b'time'): (44, 18.990808144462072),
 (b'five', b'hundred'): (67, 84.28815897682486),
 (b'an', b'arrow'): (10, 13.727996732693487),
 (b'presented', b'themselves'): (6, 11.270705264334413),
 (b'at', b'random'): (13, 22.939460289475928),
 (b'far', b'distant'): (12, 33.91663463386507),
 (b'few', b'seconds'): (9, 91.11957831325302),
 (b'passing', b'through'): (12, 17.888241729000978),
 (b'domestic', b'happiness'): (6, 55.01309328968903),
 (b'western', b'sun'): (7, 33.125321653435535),
 (b'rose', b'early'): (12, 38.936482399124785),
 (b'east', b'wind'): (22, 116.45940284649961),
 (b'gone', b'mad'): (10, 19.017256011315414),
 (b'freed', b'from'): (11, 28.569886041679784),
 (b'sinned', b'against'): (43, 77.5224293267156),
 (b'locked', b'up'): (11, 12.67923342100024),
 (b'deep', b'sigh'): (7, 42.620033812341504),
 (b'ten', b'thousand'): (82, 129.36153942271648),
 (b'happier', b'than'): (11, 25.476546051708166),
 (b'nay', b'nay'): (7, 32.48591908507611),
 (b'had', b'formerly'): (10, 11.277764108528865),
 (b'little', b'boy'): (67, 21.623428772358615),
 (b'fancying', b'herself'): (6, 25.962667353244075),
 (b'right', b'hand'): (199, 41.80066296107419),
 (b'infinitely', b'more'): (8, 12.077009062238014),
 (b'such', b'cases'): (8, 12.891349431818181),
 (b'poor', b'fellow'): (38, 75.64730219711522),
 (b'days', b'ago'): (11, 14.965593436946037),
 (b'help', b'laughing'): (7, 15.772934643760266),
 (b'draw', b'near'): (18, 72.06547373629091),
 (b'at', b'intervals'): (34, 32.97547416612164),
 (b'into', b'temptation'): (8, 11.357022187183242),
 (b'sir', b'walter'): (136, 503.2346285714286),
 (b'walter', b'elliot'): (16, 153.5265051903114),
 (b'kellynch', b'hall'): (25, 1284.9824784963364),
 (b'charles', b'musgrove'): (14, 242.12120942641175),
 (b'first', b'year'): (71, 32.984816933402165),
 (b'lady', b'elliot'): (12, 19.257296673017198),
 (b'seventeen', b'years'): (7, 49.28592375366569),
 (b'an', b'awful'): (13, 14.535525952263692),
 (b'thirteen', b'years'): (7, 35.84430818448414),
 (b'lady', b'russell'): (147, 757.699847427881),
 (b'anne', b'elliot'): (23, 67.77658011998705),
 (b'miss', b'elliot'): (48, 75.64904190049722),
 (b'everybody', b'else'): (22, 107.67307329941586),
 (b'russell', b's'): (30, 10.25826311763142),
 (b'mr', b'elliot'): (174, 150.17351854354047),
 (b'mr', b'shepherd'): (26, 56.76755428469306),
 (b'ill', b'used'): (8, 18.88940691636053),
 (b'anybody', b'else'): (21, 154.67587000287605),
 (b'an', b'honest'): (29, 20.449981339736503),
 (b'descend', b'into'): (11, 16.714108124533826),
 (b'mrs', b'clay'): (66, 167.01318200947586),
 (b'therefore', b'thus'): (66, 16.596041635939144),
 (b'miss', b'anne'): (19, 12.80224291155311),
 (b'their', b'fathers'): (151, 19.01897570711605),
 (b'an', b'example'): (14, 16.847995990123824),
 (b'admiral', b'croft'): (14, 929.5503584841242),
 (b'mrs', b'croft'): (41, 202.22851094646535),
 (b'walked', b'along'): (8, 10.917841095692655),
 (b'frederick', b'wentworth'): (6, 22.56410830163348),
 (b'either', b'side'): (18, 17.75174234901147),
 (b'captain', b'wentworth'): (196, 617.1112879281435),
 (b'eldest', b'son'): (15, 33.798893916540976),
 (b'removed', b'from'): (36, 13.162771276103),
 (b'startled', b'by'): (14, 12.543550973020787),
 (b'most', b'important'): (14, 31.64054410542769),
 (b'replied', b'anne'): (11, 13.57410371079436),
 (b'at', b'uppercross'): (20, 12.847587337449994),
 (b'left', b'alone'): (17, 14.25135144684309),
 (b'mr', b'musgrove'): (21, 31.606843411257675),
 (b'miss', b'musgroves'): (22, 210.81324041811845),
 (b'mrs', b'musgrove'): (66, 152.88129737790481),
 (b'piano', b'forte'): (7, 11524.457142857142),
 (b'their', b'faces'): (63, 20.621617809871022),
 (b'surprised', b'at'): (28, 13.305893084575402),
 (b'ere', b'long'): (23, 32.47536007156702),
 (b'anything', b'else'): (31, 72.72513938587002),
 (b'quite', b'different'): (12, 17.841595753643947),
 (b'their', b'sakes'): (13, 15.501767870868562),
 (b'twentieth', b'year'): (13, 176.0098909090909),
 (b'on', b'board'): (70, 31.77124534667287),
 (b'eight', b'years'): (24, 64.58155526342401),
 (b'their', b'heads'): (79, 20.781355479062924),
 (b'dressing', b'room'): (14, 228.86168200731305),
 (b'up', b'stairs'): (15, 14.006129941802593),
 (b'waited', b'till'): (7, 10.336277440847356),
 (b'third', b'part'): (39, 74.5758658942438),
 (b'dear', b'fellow'): (11, 17.042251140780802),
 (b'good', b'cheer'): (15, 59.6298230917065),
 (b'mrs', b'harville'): (24, 82.53919224555735),
 (b'fifteen', b'years'): (10, 60.35011071877431),
 (b'charles', b'hayter'): (33, 2576.962579860055),
 (b'came', b'near'): (42, 11.125204066571548),
 (b'mansion', b'house'): (8, 28.44987460815047),
 (b'two', b'hundred'): (105, 33.2713403032416),
 (b'dr', b'shirley'): (9, 1057.7517482517483),
 (b'went', b'up'): (207, 10.546791909362577),
 (b'within', b'reach'): (7, 16.56674242216807),
 (b'turn', b'back'): (16, 10.061840177671076),
 (b'walking', b'along'): (8, 17.573893342628093),
 (b'leaning', b'against'): (13, 24.29099376700023),
 (b'trodden', b'under'): (9, 65.1309543032456),
 (b'under', b'foot'): (15, 20.587370613094873),
 (b'louisa', b'musgrove'): (15, 183.23258631132646),
 (b'provoke', b'me'): (18, 15.67267568251514),
 (b'captain', b'harville'): (37, 300.52135040745054),
 (b'at', b'lyme'): (26, 20.671192760852374),
 (b'earnest', b'desire'): (6, 31.26467548573791),
 (b'sea', b'shore'): (14, 26.870495928941523),
 (b'captain', b'benwick'): (56, 513.167038084151),
 (b'an', b'officer'): (9, 10.085875150550317),
 (b'place', b'where'): (125, 24.673747113061946),
 (b'breakfast', b'table'): (9, 46.889138605804625),
 (b'great', b'coat'): (15, 13.653183495244221),
 (b'mean', b'while'): (30, 21.601999982862186),
 (b'preceding', b'evening'): (6, 46.916408188585606),
 (b'dark', b'blue'): (7, 10.384888690547708),
 (b'an', b'agony'): (10, 14.366508208632718),
 (b'catching', b'hold'): (10, 119.13870510396976),
 (b'raised', b'up'): (35, 19.639030027092765),
 (b'every', b'one'): (395, 13.121859438999877),
 (b'could', b'scarcely'): (18, 16.706049522741466),
 (b'passed', b'along'): (11, 15.774990874485061),
 (b'leaning', b'over'): (17, 32.99075765424357),
 (b't', b'talk'): (19, 10.789205796038152),
 (b'camden', b'place'): (29, 304.8030226700252),
 (b'straight', b'forward'): (6, 11.08933284457478),
 (b'same', b'hour'): (17, 12.501907897455677),
 (b'looking', b'glasses'): (6, 21.695137693631672),
 (b'poring', b'over'): (6, 39.405627198124265),
 (b'thirty', b'feet'): (8, 11.482173582995951),
 (b'colonel', b'wallis'): (23, 919.8152027027028),
 (b'at', b'length'): (101, 22.85664635341284),
 (b'carried', b'away'): (73, 65.89981996296747),
 (b'greater', b'than'): (58, 46.92382276332348),
 (b'miss', b'carteret'): (12, 296.58529411764704),
 (b'lady', b'dalrymple'): (25, 567.8937488267317),
 (b'laura', b'place'): (7, 20.594798829055758),
 (b'be', b'established'): (41, 12.514923983865962),
 (b'mrs', b'smith'): (79, 133.20515177750607),
 (b'westgate', b'buildings'): (7, 3878.4230769230767),
 (b'at', b'liberty'): (25, 11.17812683597344),
 (b'five', b'thousand'): (31, 35.86163157834745),
 (b'whose', b'names'): (9, 17.589220303506018),
 (b'her', b'ladyship'): (22, 32.97674284395198),
 (b'ladyship', b's'): (10, 11.322082996496901),
 (b'old', b'gentleman'): (32, 24.07985483618954),
 (b'their', b'minds'): (18, 15.50176787086856),
 (b'almost', b'entirely'): (13, 33.520817751184246),
 (b'lower', b'part'): (8, 14.360893094499653),
 (b'staring', b'at'): (33, 23.08283191628515),
 (b'ay', b'ay'): (6, 102.04655085174566),
 (b'an', b'oath'): (39, 43.30687629076502),
 (b'wiser', b'than'): (8, 28.37160810303864),
 (b'prejudice', b'against'): (7, 37.30402614217892),
 (b'both', b'sides'): (31, 93.4670833729442),
 (b'my', b'soul'): (259, 16.066751210923982),
 (b'same', b'instant'): (19, 24.46954622664402),
 (b'their', b'seats'): (12, 12.91813989239047),
 (b'their', b'mouths'): (24, 24.960473690381583),
 (b'short', b'silence'): (9, 17.742932551319647),
 (b'fifty', b'pounds'): (6, 33.29484921857803),
 (b'be', b'saved'): (61, 12.820166032252938),
 (b'hard', b'hearted'): (6, 30.946447752033144),
 (b'some', b'moments'): (14, 21.24516597218971),
 (b'exclaimed', b'mrs'): (11, 11.847730944338375),
 (b'compassion', b'on'): (20, 12.135435888384333),
 (b'an', b'explanation'): (12, 15.443996324280173),
 (b'our', b'hearts'): (21, 17.06646733400796),
 (b'minutes', b'afterwards'): (7, 21.366081045290017),
 (b'make', b'haste'): (38, 69.71776454772422),
 (b'n', b't'): (19, 110.43069461827284),
 (b'rising', b'sun'): (7, 11.4529037631559),
 (b'an', b'atonement'): (66, 83.74078006943026),
 (b'atonement', b'for'): (65, 20.65632201567061),
 (b'next', b'instant'): (8, 11.76765535349606),
 (b'she', b'doted'): (7, 14.810751266798853),
 (b'god', b'forbid'): (30, 46.144377261328756),
 (b'i', b'll'): (384, 11.17502796097598),
 (b'll', b'answer'): (12, 13.013882743362831),
 (b'market', b'place'): (13, 39.58480813896431),
 (b'poured', b'out'): (53, 41.589583712234585),
 (b'at', b'norland'): (19, 17.421005219837852),
 (b'many', b'generations'): (11, 16.534145874894104),
 (b'seven', b'thousand'): (27, 31.259095392419333),
 (b'mr', b'dashwood'): (15, 10.190698520494786),
 (b'john', b'dashwood'): (37, 157.761220299208),
 (b'four', b'thousand'): (45, 51.45229768371371),
 (b'three', b'thousand'): (45, 26.103242228789615),
 (b'mrs', b'dashwood'): (121, 149.97722901761674),
 (b'miss', b'dashwoods'): (23, 254.21596638655464),
 (b'edward', b'ferrars'): (13, 135.88635597978663),
 (b'younger', b'brother'): (8, 31.67883135242683),
 (b'few', b'miles'): (7, 14.237434111445785),
 (b'replied', b'elinor'): (26, 38.56234426453405),
 (b'mrs', b'ferrars'): (73, 170.42505281471358),
 (b'barton', b'park'): (12, 511.6875679594056),
 (b'from', b'whence'): (44, 15.950059078483806),
 (b'barton', b'cottage'): (7, 104.5866897147796),
 (b'sir', b'john'): (113, 127.78658844235845),
 (b'at', b'barton'): (35, 22.230656741205607),
 (b'lady', b'middleton'): (95, 500.3819045606168),
 (b'be', b'fulfilled'): (39, 13.72604565972396),
 (b'their', b'arrival'): (15, 11.398358728579826),
 (b'present', b'case'): (12, 20.32458969190901),
 (b'mrs', b'jennings'): (229, 317.3131418135843),
 (b'colonel', b'brandon'): (132, 1667.519921875),
 (b'now', b'therefore'): (145, 11.124642048311067),
 (b'ill', b'natured'): (11, 147.80960912052115),
 (b'blue', b'sky'): (11, 51.83932141429143),
 (b'rose', b'up'): (112, 34.00644003284101),
 (b'at', b'allenham'): (8, 10.991824722040551),
 (b'miss', b'dashwood'): (70, 131.14315726290516),
 (b'cried', b'marianne'): (34, 25.0791653802614),
 (b'mr', b'willoughby'): (36, 36.686514673781225),
 (b'miss', b'marianne'): (31, 20.916503563450693),
 (b'aye', b'aye'): (36, 468.90135),
 (b'have', b'erred'): (10, 17.591061335566256),
 (b'pronounce', b'him'): (19, 17.59389669411188),
 (b'by', b'reason'): (70, 10.359210631680098),
 (b'an', b'everlasting'): (37, 33.79199195739935),
 (b'seven', b'days'): (103, 82.77954431228012),
 (b'by', b'accident'): (20, 16.659403636043233),
 (b'went', b'out'): (262, 11.679230644951586),
 (b'won', b't'): (219, 217.3954908123791),
 (b'miss', b'williams'): (6, 72.63313325330132),
 (b'laughed', b'heartily'): (6, 104.9677307425399),
 (b'considerable', b'time'): (10, 11.403428752365373),
 (b'two', b'sides'): (13, 12.986418686513664),
 (b'at', b'delaford'): (11, 13.19018966644866),
 (b'two', b'thousand'): (76, 23.96682269843838),
 (b'seven', b'hundred'): (50, 63.02087384034221),
 (b'can', b'possibly'): (10, 14.130761757067319),
 (b'burst', b'into'): (18, 13.516455983337801),
 (b'turning', b'round'): (15, 21.605116375400833),
 (b'mr', b'ferrars'): (26, 41.483981977275704),
 (b'combe', b'magna'): (11, 18907.3125),
 (b'mrs', b'palmer'): (37, 135.40263936386714),
 (b'mr', b'palmer'): (35, 100.05413092849427),
 (b'without', b'ceasing'): (8, 82.0051504472757),
 (b'stared', b'at'): (14, 13.489966704322493),
 (b't', b'think'): (70, 10.296676867200363),
 (b'miss', b'steeles'): (29, 406.7455462184874),
 (b'most', b'beautiful'): (16, 16.30563694985349),
 (b'human', b'beings'): (7, 191.52706552706556),
 (b'sugar', b'plums'): (24, 5662.879802955666),
 (b'two', b'boys'): (13, 13.239400868718477),
 (b'miss', b'steele'): (27, 243.16309828279137),
 (b'lucy', b'steele'): (10, 285.9328922495274),
 (b'i', b'm'): (438, 16.271042011107397),
 (b'm', b'sure'): (88, 102.49120766084593),
 (b'robert', b'ferrars'): (7, 95.96098334655036),
 (b'mr', b'pratt'): (8, 85.6018675721562),
 (b'at', b'longstaple'): (7, 16.48773708306082),
 (b'poor', b'edward'): (10, 12.172840599071293),
 (b'their', b'names'): (36, 14.302226309432303),
 (b'latter', b'end'): (12, 42.203822544642854),
 (b'i', b've'): (218, 13.517866465271059),
 (b'lifted', b'up'): (151, 72.07416702839072),
 (b'third', b'day'): (65, 33.7914061089195),
 (b'starting', b'up'): (9, 11.47168738090498),
 (b't', b'know'): (147, 12.457577307748522),
 (b'returned', b'home'): (11, 12.90725432262056),
 (b'berkeley', b'street'): (16, 1449.9725490196079),
 (b'conduit', b'street'): (6, 203.71515151515152),
 (b'lit', b'up'): (23, 36.13581524985069),
 (b'as', b'follows'): (17, 14.278650094398992),
 (b'having', b'received'): (10, 11.384803552611771),
 (b'who', b'cares'): (8, 12.52589229028073),
 (b'miss', b'grey'): (10, 10.25064380590946),
 (b'fifty', b'thousand'): (11, 20.37380177348749),
 (b'why', b'don'): (28, 12.795415425355841),
 (b'thousand', b'times'): (12, 12.06362743003632),
 (b'walked', b'across'): (7, 13.91218569999655),
 (b'fourteen', b'years'): (6, 15.563975922210217),
 (b'your', b'sakes'): (16, 32.49059753954306),
 (b'bartlett', b's'): (6, 10.18987469684721),
 (b'dressing', b'gown'): (6, 509.2878787878788),
 (b'wild', b'beasts'): (16, 105.2399430740038),
 (b'miss', b'morton'): (15, 267.5957540911101),
 (b'six', b'hundred'): (66, 132.00427053900353),
 (b'harley', b'street'): (16, 1540.5958333333335),
 (b'most', b'high'): (60, 22.2500544936901),
 (b'filled', b'with'): (114, 14.087588281459942),
 (b'two', b'thirds'): (6, 42.47641112047178),
 (b'public', b'school'): (6, 43.038412291933426),
 (b",'", b'says'): (13, 42.67564803385647),
 (b'fell', b'upon'): (62, 13.94880910923142),
 (b's', b'office'): (29, 12.330604675176458),
 (b'yes', b'ma'): (9, 15.105834768931166),
 (b'come', b'near'): (47, 11.571024613341859),
 (b'give', b'ear'): (30, 35.78134979774324),
 (b'reminds', b'me'): (7, 14.199176259372688),
 (b'ten', b'guineas'): (6, 26.425314465408807),
 (b'south', b'east'): (7, 17.11383597097883),
 (b'mr', b'harris'): (10, 85.6018675721562),
 (b'quicker', b'than'): (6, 12.238732907193139),
 (b'bent', b'over'): (10, 11.258750628035504),
 (b'justified', b'by'): (17, 11.526506299532617),
 (b'from', b'thence'): (103, 27.334646896223614),
 (b'latter', b'days'): (12, 29.77529527559055),
 (b'sprung', b'up'): (13, 24.090543499900463),
 (b'or', b'later'): (13, 13.913825368307583),
 (b'living', b'creature'): (16, 82.51963993453354),
 (b'first', b'month'): (33, 25.952445658418213),
 (b'have', b'transgressed'): (20, 23.003695592663565),
 ...}

In [24]:
lower_bigram["jon lives in new york city".split()]


Out[24]:
['jon', 'lives', 'in', 'new_york', 'city']

In [25]:
lower_bigram = Phraser(Phrases(lower_sents, min_count=32, threshold=64))
lower_bigram.phrasegrams


Out[25]:
{(b'afar', b'off'): (52, 108.14220347465505),
 (b'burnt', b'offering'): (184, 297.524653753951),
 (b'burnt', b'offerings'): (86, 299.15702343127646),
 (b'buster', b'bear'): (142, 479.87410772225826),
 (b'captain', b'benwick'): (56, 241.49037086312987),
 (b'captain', b'wentworth'): (196, 529.8756608388247),
 (b'charles', b'hayter'): (33, 92.03437785214481),
 (b'chief', b'priests'): (65, 116.31947753846512),
 (b'colonel', b'brandon'): (132, 1313.0078125),
 (b'couldn', b't'): (89, 171.76138536935215),
 (b'cut', b'off'): (217, 129.60290535032792),
 (b'dare', b'say'): (115, 89.94000515807346),
 (b'de', b'grey'): (77, 603.2109624246722),
 (b'didn', b't'): (180, 220.51081560283686),
 (b'doesn', b't'): (53, 106.2634985949418),
 (b'don', b't'): (830, 250.30957446808512),
 (b'dr', b'bull'): (65, 680.7870294599019),
 (b'dr', b'middleton'): (40, 162.73103819257668),
 (b'drawing', b'room'): (49, 84.91494947493561),
 (b'farmer', b'brown'): (100, 386.05179596892236),
 (b'father', b'brown'): (207, 91.68277248710235),
 (b'few', b'minutes'): (86, 204.16834974753786),
 (b'few', b'moments'): (43, 107.77584531675087),
 (b'fig', b'tree'): (37, 121.73722334004026),
 (b'fine', b'flour'): (36, 86.07682458386685),
 (b'fir', b'tree'): (36, 72.6789393074867),
 (b'forty', b'years'): (68, 90.60220877269607),
 (b'frank', b'churchill'): (151, 1316.4456593286038),
 (b'gathered', b'together'): (84, 103.28066074898891),
 (b'good', b'natured'): (66, 88.69936184891343),
 (b'great', b'deal'): (182, 93.36368125424357),
 (b'green', b'forest'): (66, 336.37733627667404),
 (b'guinea', b'hen'): (51, 905.8822695035461),
 (b'high', b'places'): (99, 129.8123390846559),
 (b'holy', b'ghost'): (90, 313.0305073859987),
 (b'isn', b't'): (63, 131.96593211752787),
 (b'jane', b'fairfax'): (111, 654.5565917587609),
 (b'jesus', b'christ'): (199, 172.16816954974848),
 (b'joe', b'otter'): (47, 1271.6141235813368),
 (b'john', b'knightley'): (58, 83.03755747111268),
 (b'lady', b'middleton'): (95, 350.26733319243175),
 (b'lady', b'russell'): (147, 613.6301581282135),
 (b'little', b'jackal'): (61, 69.81254128038833),
 (b'little', b'joe'): (111, 133.28784038147822),
 (b'm', b'sure'): (88, 69.15069432539002),
 (b'ma', b'am'): (216, 157.25846601094193),
 (b'mast', b'heads'): (37, 77.7358926919519),
 (b'meat', b'offering'): (122, 210.66724956379437),
 (b'mercy', b'endureth'): (41, 269.07674062361025),
 (b'miss', b'bates'): (113, 276.39588291692513),
 (b'miss', b'dashwood'): (70, 76.66830732292917),
 (b'miss', b'fairfax'): (125, 196.19987447261062),
 (b'miss', b'smith'): (58, 73.03442128232508),
 (b'miss', b'somers'): (49, 160.06190476190477),
 (b'miss', b'taylor'): (48, 156.44059469941823),
 (b'miss', b'woodhouse'): (173, 229.03802722366902),
 (b'moby', b'dick'): (84, 4115.877551020409),
 (b'mock', b'turtle'): (56, 2528.877742946708),
 (b'mr', b'elliot'): (174, 126.18129960463163),
 (b'mr', b'elton'): (214, 121.3990121932397),
 (b'mr', b'gresham'): (49, 87.31390492359931),
 (b'mr', b'knightley'): (277, 161.74131790625913),
 (b'mr', b'weston'): (162, 75.87438262077481),
 (b'mr', b'woodhouse'): (132, 82.04651843976633),
 (b'mrs', b'clay'): (66, 93.08931456265867),
 (b'mrs', b'dashwood'): (121, 115.06873605661974),
 (b'mrs', b'elton'): (142, 93.08931456265867),
 (b'mrs', b'ferrars'): (73, 102.75628184416554),
 (b'mrs', b'goddard'): (58, 143.57843432545658),
 (b'mrs', b'jennings'): (229, 279.0655756128398),
 (b'mrs', b'musgrove'): (66, 85.21252640735679),
 (b'mrs', b'smith'): (79, 84.60327207490248),
 (b'mrs', b'theresa'): (67, 170.20061244665206),
 (b'mrs', b'weston'): (249, 160.68485093258923),
 (b'o', b'clock'): (67, 89.14789088153573),
 (b'o', b'er'): (82, 108.14993564993564),
 (b'peace', b'offerings'): (83, 176.2577199456205),
 (b'sent', b'messengers'): (43, 79.21555418015616),
 (b'sin', b'offering'): (118, 129.96079665512747),
 (b'sir', b'arthur'): (71, 131.41924812030075),
 (b'sir', b'john'): (113, 95.83994133176884),
 (b'sir', b'walter'): (136, 399.5145142857143),
 (b'six', b'hundred'): (66, 73.57615079223149),
 (b'sperm', b'whale'): (183, 297.3672297627184),
 (b'sweet', b'savour'): (43, 286.17879256965944),
 (b'take', b'heed'): (58, 86.38454061712328),
 (b'ten', b'thousand'): (82, 84.00099962514057),
 (b'thou', b'shalt'): (1282, 66.88233182614454),
 (b'thousand', b'pounds'): (48, 166.51834523092802),
 (b'thus', b'saith'): (444, 144.0289127889979),
 (b'unleavened', b'bread'): (43, 237.70041787206688),
 (b'wasn', b't'): (58, 120.2225788701394),
 (b'wee', b'l'): (35, 450.39751861042186),
 (b'without', b'blemish'): (46, 83.71359108159393),
 (b'won', b't'): (219, 189.96708776595744),
 (b'wouldn', b't'): (58, 120.2225788701394),
 (b'years', b'ago'): (56, 74.31594785893046)}

In [26]:
# as in Maas et al. (2001):
# - leave in stop words ("indicative of sentiment")
# - no stemming ("model learns similar representations of words of the same stem when data suggests it")
clean_sents = []
for s in lower_sents:
    clean_sents.append(lower_bigram[s])

In [27]:
clean_sents[0:9]


Out[27]:
[['emma', 'by', 'jane', 'austen', '1816'],
 ['volume', 'i'],
 ['chapter', 'i'],
 ['emma',
  'woodhouse',
  'handsome',
  'clever',
  'and',
  'rich',
  'with',
  'a',
  'comfortable',
  'home',
  'and',
  'happy',
  'disposition',
  'seemed',
  'to',
  'unite',
  'some',
  'of',
  'the',
  'best',
  'blessings',
  'of',
  'existence',
  'and',
  'had',
  'lived',
  'nearly',
  'twenty',
  'one',
  'years',
  'in',
  'the',
  'world',
  'with',
  'very',
  'little',
  'to',
  'distress',
  'or',
  'vex',
  'her'],
 ['she',
  'was',
  'the',
  'youngest',
  'of',
  'the',
  'two',
  'daughters',
  'of',
  'a',
  'most',
  'affectionate',
  'indulgent',
  'father',
  'and',
  'had',
  'in',
  'consequence',
  'of',
  'her',
  'sister',
  's',
  'marriage',
  'been',
  'mistress',
  'of',
  'his',
  'house',
  'from',
  'a',
  'very',
  'early',
  'period'],
 ['her',
  'mother',
  'had',
  'died',
  'too',
  'long',
  'ago',
  'for',
  'her',
  'to',
  'have',
  'more',
  'than',
  'an',
  'indistinct',
  'remembrance',
  'of',
  'her',
  'caresses',
  'and',
  'her',
  'place',
  'had',
  'been',
  'supplied',
  'by',
  'an',
  'excellent',
  'woman',
  'as',
  'governess',
  'who',
  'had',
  'fallen',
  'little',
  'short',
  'of',
  'a',
  'mother',
  'in',
  'affection'],
 ['sixteen',
  'years',
  'had',
  'miss_taylor',
  'been',
  'in',
  'mr_woodhouse',
  's',
  'family',
  'less',
  'as',
  'a',
  'governess',
  'than',
  'a',
  'friend',
  'very',
  'fond',
  'of',
  'both',
  'daughters',
  'but',
  'particularly',
  'of',
  'emma'],
 ['between',
  '_them_',
  'it',
  'was',
  'more',
  'the',
  'intimacy',
  'of',
  'sisters'],
 ['even',
  'before',
  'miss_taylor',
  'had',
  'ceased',
  'to',
  'hold',
  'the',
  'nominal',
  'office',
  'of',
  'governess',
  'the',
  'mildness',
  'of',
  'her',
  'temper',
  'had',
  'hardly',
  'allowed',
  'her',
  'to',
  'impose',
  'any',
  'restraint',
  'and',
  'the',
  'shadow',
  'of',
  'authority',
  'being',
  'now',
  'long',
  'passed',
  'away',
  'they',
  'had',
  'been',
  'living',
  'together',
  'as',
  'friend',
  'and',
  'friend',
  'very',
  'mutually',
  'attached',
  'and',
  'emma',
  'doing',
  'just',
  'what',
  'she',
  'liked',
  'highly',
  'esteeming',
  'miss_taylor',
  's',
  'judgment',
  'but',
  'directed',
  'chiefly',
  'by',
  'her',
  'own']]

In [28]:
clean_sents[6] # could consider removing stop words or common words


Out[28]:
['sixteen',
 'years',
 'had',
 'miss_taylor',
 'been',
 'in',
 'mr_woodhouse',
 's',
 'family',
 'less',
 'as',
 'a',
 'governess',
 'than',
 'a',
 'friend',
 'very',
 'fond',
 'of',
 'both',
 'daughters',
 'but',
 'particularly',
 'of',
 'emma']

Run word2vec


In [ ]:
# max_vocab_size can be used instead of min_count (which has increased here)
# model = Word2Vec(sentences=clean_sents, size=64, sg=1, window=10, min_count=10, seed=42, workers=8)
# model.save('../clean_gutenberg_model.w2v')

Explore model


In [29]:
# skip re-training the model with the next line:  
model = gensim.models.Word2Vec.load('./clean_gutenberg_model.w2v')

In [30]:
len(model.wv.vocab) # down from 17k in previous notebook


Out[30]:
10329

In [31]:
model['ma_am']


Out[31]:
array([-0.27275795,  0.22294798, -0.27785164, -0.21537074, -0.04375846,
        0.35075492, -0.21310569,  0.28521448,  0.18605071,  0.22906332,
        0.3929922 ,  0.3332729 , -0.06804646, -0.36645287, -0.26969737,
       -0.51488483,  0.02132919,  0.07797143,  0.12759572,  0.20545809,
       -0.46009699,  0.24983054,  0.25449356, -0.64410228, -0.20215428,
        0.1824095 , -0.23551014,  0.82762975, -0.2430227 , -0.02013004,
       -0.88646883, -0.06863049, -0.63370681,  0.32553154,  0.02467243,
        0.02537833,  0.13675088, -0.08868799,  0.31126702,  0.48400268,
       -0.60338777, -0.20596088, -0.41460729, -0.47632793, -0.02355143,
        0.56830853,  0.15640558,  0.12009691,  0.16354683,  0.66328555,
       -0.24750067, -0.16379237,  0.00516235,  0.83655453,  0.10755659,
        0.13685858, -0.00376199, -0.17476274, -0.35152084, -0.01025227,
       -0.03357989,  0.33727771,  0.05614873, -0.19056796], dtype=float32)

In [32]:
model.most_similar('ma_am')


Out[32]:
[('madam', 0.8394622802734375),
 ('mamma', 0.8382871747016907),
 ('betty', 0.8321777582168579),
 ('nancy', 0.8240376710891724),
 ('shouldn', 0.8222858905792236),
 ('m_sure', 0.8149751424789429),
 ('madman', 0.8145883083343506),
 ('bunger', 0.8138953447341919),
 ('frederick', 0.8121578693389893),
 ('indignantly', 0.8070249557495117)]

In [33]:
model.most_similar(positive=['father', 'woman'], negative=['man'])


Out[33]:
[('mother', 0.7831815481185913),
 ('husband', 0.7511569857597351),
 ('daughter', 0.7453563213348389),
 ('wife', 0.7420299053192139),
 ('sister', 0.739268958568573),
 ('womb', 0.6849961280822754),
 ('loved', 0.6742969751358032),
 ('sarah', 0.6667004823684692),
 ('child', 0.6640843152999878),
 ('isaac', 0.6623573303222656)]

Reduce word vector dimensionality with t-SNE


In [ ]:
# tsne = TSNE(n_components=2, n_iter=1000)

In [ ]:
# X_2d = tsne.fit_transform(model[model.wv.vocab])

In [ ]:
# coords_df = pd.DataFrame(X_2d, columns=['x','y'])
# coords_df['token'] = model.wv.vocab.keys()

In [ ]:
# coords_df.head()

In [ ]:
# coords_df.to_csv('../clean_gutenberg_tsne.csv', index=False)

Visualise


In [34]:
coords_df = pd.read_csv('./clean_gutenberg_tsne.csv')

In [35]:
_ = coords_df.plot.scatter('x', 'y', figsize=(12,12), marker='.', s=10, alpha=0.2)



In [36]:
output_notebook()


Loading BokehJS ...

In [37]:
subset_df = coords_df.sample(n=5000)

In [38]:
p = figure(plot_width=800, plot_height=800)
_ = p.text(x=subset_df.x, y=subset_df.y, text=subset_df.token)

In [39]:
show(p)



In [ ]:
# output_file() here