In this notebook, we improve the quality of our Project Gutenberg word vectors by adopting best-practices for preprocessing natural language data.
N.B.: Some, all or none of these preprocessing steps may be helpful to a given downstream application.
In [1]:
# the initial block is copied from creating_word_vectors_with_word2vec.ipynb
import nltk
from nltk import word_tokenize, sent_tokenize
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook, output_file
from bokeh.plotting import show, figure
%matplotlib inline
Using TensorFlow backend.
In [2]:
nltk.download('punkt')
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data] Package punkt is already up-to-date!
Out[2]:
True
In [3]:
# new!
import string
from nltk.corpus import stopwords
from nltk.stem.porter import *
from gensim.models.phrases import Phraser, Phrases
from keras.preprocessing.text import one_hot
In [4]:
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
Out[4]:
True
In [5]:
nltk.download('gutenberg')
[nltk_data] Downloading package gutenberg to /home/jovyan/nltk_data...
[nltk_data] Package gutenberg is already up-to-date!
Out[5]:
True
In [6]:
from nltk.corpus import gutenberg
In [7]:
gberg_sents = gutenberg.sents()
In [8]:
gberg_sents[4]
Out[8]:
['She',
'was',
'the',
'youngest',
'of',
'the',
'two',
'daughters',
'of',
'a',
'most',
'affectionate',
',',
'indulgent',
'father',
';',
'and',
'had',
',',
'in',
'consequence',
'of',
'her',
'sister',
"'",
's',
'marriage',
',',
'been',
'mistress',
'of',
'his',
'house',
'from',
'a',
'very',
'early',
'period',
'.']
In [9]:
[w.lower() for w in gberg_sents[4]]
Out[9]:
['she',
'was',
'the',
'youngest',
'of',
'the',
'two',
'daughters',
'of',
'a',
'most',
'affectionate',
',',
'indulgent',
'father',
';',
'and',
'had',
',',
'in',
'consequence',
'of',
'her',
'sister',
"'",
's',
'marriage',
',',
'been',
'mistress',
'of',
'his',
'house',
'from',
'a',
'very',
'early',
'period',
'.']
In [10]:
stpwrds = stopwords.words('english') + list(string.punctuation)
In [11]:
stpwrds
Out[11]:
['i',
'me',
'my',
'myself',
'we',
'our',
'ours',
'ourselves',
'you',
'your',
'yours',
'yourself',
'yourselves',
'he',
'him',
'his',
'himself',
'she',
'her',
'hers',
'herself',
'it',
'its',
'itself',
'they',
'them',
'their',
'theirs',
'themselves',
'what',
'which',
'who',
'whom',
'this',
'that',
'these',
'those',
'am',
'is',
'are',
'was',
'were',
'be',
'been',
'being',
'have',
'has',
'had',
'having',
'do',
'does',
'did',
'doing',
'a',
'an',
'the',
'and',
'but',
'if',
'or',
'because',
'as',
'until',
'while',
'of',
'at',
'by',
'for',
'with',
'about',
'against',
'between',
'into',
'through',
'during',
'before',
'after',
'above',
'below',
'to',
'from',
'up',
'down',
'in',
'out',
'on',
'off',
'over',
'under',
'again',
'further',
'then',
'once',
'here',
'there',
'when',
'where',
'why',
'how',
'all',
'any',
'both',
'each',
'few',
'more',
'most',
'other',
'some',
'such',
'no',
'nor',
'not',
'only',
'own',
'same',
'so',
'than',
'too',
'very',
's',
't',
'can',
'will',
'just',
'don',
'should',
'now',
'd',
'll',
'm',
'o',
're',
've',
'y',
'ain',
'aren',
'couldn',
'didn',
'doesn',
'hadn',
'hasn',
'haven',
'isn',
'ma',
'mightn',
'mustn',
'needn',
'shan',
'shouldn',
'wasn',
'weren',
'won',
'wouldn',
'!',
'"',
'#',
'$',
'%',
'&',
"'",
'(',
')',
'*',
'+',
',',
'-',
'.',
'/',
':',
';',
'<',
'=',
'>',
'?',
'@',
'[',
'\\',
']',
'^',
'_',
'`',
'{',
'|',
'}',
'~']
In [12]:
# CODE HERE
[w.lower() for w in gberg_sents[4] if w not in stpwrds]
Out[12]:
['she',
'youngest',
'two',
'daughters',
'affectionate',
'indulgent',
'father',
'consequence',
'sister',
'marriage',
'mistress',
'house',
'early',
'period']
In [13]:
stemmer = PorterStemmer()
In [14]:
# CODE HERE
[stemmer.stem(w.lower()) for w in gberg_sents[4] if w not in stpwrds]
Out[14]:
['she',
'youngest',
'two',
'daughter',
'affection',
'indulg',
'father',
'consequ',
'sister',
'marriag',
'mistress',
'hous',
'earli',
'period']
In [15]:
phrases = Phrases(gberg_sents) # train detector
In [16]:
bigram = Phraser(phrases) # create a more efficient Phraser object for transforming sentences
In [17]:
bigram.phrasegrams # output count and score of each bigram
Out[17]:
{(b'two', b'daughters'): (19, 11.966813731181546),
(b'her', b'sister'): (195, 17.7960829227865),
(b"'", b's'): (9781, 31.066242737744524),
(b'very', b'early'): (24, 11.01214147275924),
(b'Her', b'mother'): (14, 13.529425062715127),
(b'long', b'ago'): (38, 63.22343628984788),
(b'more', b'than'): (541, 29.023584433996874),
(b'had', b'been'): (1256, 22.306024648925288),
(b'an', b'excellent'): (54, 39.063874851750626),
(b'Miss', b'Taylor'): (48, 453.75918026073305),
(b'very', b'fond'): (28, 24.134280468850747),
(b'passed', b'away'): (25, 12.35053642325912),
(b'too', b'much'): (173, 31.376002029426687),
(b'did', b'not'): (935, 11.728416217142811),
(b'any', b'means'): (27, 14.096964108090186),
(b'wedding', b'-'): (15, 17.4695197740113),
(b'Her', b'father'): (18, 13.129571562488772),
(b'after', b'dinner'): (21, 21.5285481168817),
(b'self', b'-'): (124, 47.79018053120332),
(b'sixteen', b'years'): (12, 107.0461671612265),
(b'five', b'years'): (42, 40.128755673408115),
(b'years', b'old'): (176, 54.735425236061104),
(b'seven', b'years'): (51, 52.59411150244507),
(b'each', b'other'): (236, 79.4168405322873),
(b'a', b'mile'): (48, 12.783091600264584),
(b'must', b'be'): (601, 10.229989650632808),
(b'difference', b'between'): (44, 220.5253730524468),
(b'could', b'not'): (1049, 10.870983286982371),
(b'having', b'been'): (49, 11.538018331569376),
(b'miles', b'off'): (16, 34.7868137375225),
(b'at', b'Hartfield'): (66, 27.282227059708397),
(b'her', b'husband'): (158, 27.544395195049724),
(b'in', b'spite'): (96, 13.441914963088442),
(b'Emma', b'could'): (61, 11.335111257744053),
(b'every', b'body'): (127, 36.972582976825784),
(b'no', b'means'): (80, 32.57361823161739),
(b'his', b'own'): (773, 10.402387689390995),
(b'obliged', b'to'): (179, 10.43662879984823),
(b'able', b'to'): (348, 11.446828804673226),
(b'very', b'much'): (234, 16.21027499400206),
(b'have', b'been'): (986, 17.981191047801964),
(b'great', b'deal'): (181, 118.04013764577145),
(b'"', b'Poor'): (30, 10.125586409383336),
(b'agree', b'with'): (25, 13.611743912451109),
(b'-', b'humoured'): (22, 33.94078127522195),
(b'for', b'ever'): (555, 12.4761138144463),
(b'This', b'is'): (353, 11.381028160111825),
(b'three', b'times'): (36, 35.42578086743132),
(b'my', b'dear'): (253, 24.47894249615343),
(b'How', b'often'): (12, 12.377968718725876),
(b'My', b'dear'): (85, 84.80698289930197),
(b'so', b'far'): (98, 10.161632478973766),
(b'"', b'No'): (351, 15.063706270011586),
(b'We', b'must'): (68, 18.765646900386386),
(b'last', b'night'): (63, 23.592598950882213),
(b'doubt', b'whether'): (12, 22.924130736398396),
(b'anywhere', b'else'): (6, 16.100101533414907),
(b'I', b'am'): (2428, 16.951297329049222),
(b'very', b'glad'): (46, 18.28434074650975),
(b'am', b'sure'): (282, 65.1446020744797),
(b'very', b'pretty'): (39, 20.068178868120455),
(b'be', b'able'): (121, 11.347612277348256),
(b'immediately', b'afterwards'): (10, 41.06053966483414),
(b'sensible', b'man'): (17, 14.541388094211703),
(b'intimate', b'friend'): (6, 21.898760623229464),
(b'connected', b'with'): (31, 18.375854281808998),
(b'than', b'usual'): (30, 28.951968704471486),
(b'Brunswick', b'Square'): (11, 10881.307917888562),
(b'some', b'time'): (146, 12.926553753557874),
(b'poor', b'Isabella'): (10, 41.30241100647832),
(b'It', b'is'): (777, 11.705870174225897),
(b'am', b'afraid'): (65, 25.627391867391864),
(b'moonlight', b'night'): (6, 14.745374344301382),
(b'Look', b'at'): (33, 13.630464729284446),
(b'"', b'Well'): (311, 21.191330893669),
(b'vast', b'deal'): (11, 61.90400400400401),
(b'an', b'hour'): (150, 41.75757187695002),
(b'pretty', b'well'): (20, 17.716415202444615),
(b'tolerably', b'well'): (7, 18.357580705009276),
(b'"', b'Ah'): (83, 17.279509006244886),
(b'Ah', b'!'): (68, 37.53295698121932),
(b"'", b'Tis'): (64, 23.239343942788935),
(b'Miss', b'Woodhouse'): (173, 294.52709701744294),
(b'you', b'please'): (93, 13.035980721875688),
(b'any', b'rate'): (47, 83.92034351736973),
(b',"', b'said'): (2583, 36.03254133384804),
(b'My', b'dearest'): (7, 26.665272507761294),
(b'so', b'much'): (483, 20.56443776042493),
(b'much', b'less'): (38, 19.104435569934505),
(b'any', b'body'): (93, 21.716438732250484),
(b'has', b'been'): (263, 29.2606767165493),
(b'been', b'used'): (29, 14.09410182779525),
(b'Well', b',"'): (60, 12.493546273257474),
(b'tell', b'you'): (296, 11.612165547868624),
(b'Every', b'body'): (21, 72.2001079929367),
(b'"', b'Dear'): (39, 20.048661090579007),
(b'every', b'thing'): (240, 27.277168506679985),
(b'very', b'sorry'): (32, 20.255731941654293),
(b'turned', b'away'): (50, 19.3446247292484),
(b'divided', b'between'): (10, 35.82806127178346),
(b'knows', b'how'): (13, 14.800957338598696),
(b'how', b'much'): (110, 15.417663894373641),
(b'four', b'years'): (21, 16.257604884476734),
(b'years', b'ago'): (56, 163.33147420261938),
(b'any', b'thing'): (383, 35.72804044966351),
(b'need', b'not'): (107, 13.478832663938945),
(b'his', b'wife'): (263, 10.870850757012368),
(b'Ever', b'since'): (8, 99.63818474758324),
(b'leave', b'off'): (18, 10.507247077643713),
(b'you', b'mean'): (142, 10.573995913362895),
(b'young', b'lady'): (73, 113.30511794581632),
(b'depend', b'upon'): (28, 66.33685452578166),
(b'quarrel', b'with'): (21, 10.691406127597961),
(b'-', b'hearted'): (45, 49.037248488452775),
(b'their', b'own'): (279, 10.164510720981893),
(b'You', b'are'): (231, 12.60019671587268),
(b'more', b'likely'): (16, 11.176882987148272),
(b'have', b'done'): (272, 12.664105519891397),
(b',"', b'rejoined'): (6, 11.956633540853023),
(b'any', b'longer'): (32, 16.396201569570135),
(b'very', b'well'): (171, 13.844437162007257),
(b'young', b'man'): (260, 25.863812524396362),
(b'dine', b'with'): (22, 13.88397879070013),
(b'much', b'better'): (38, 10.763384154831522),
(b'I', b'dare'): (138, 13.676468274997303),
(b'dare', b'say'): (114, 128.21086697682205),
(b'Depend', b'upon'): (17, 92.29475412282665),
(b'take', b'care'): (59, 72.93974751004718),
(b'CHAPTER', b'II'): (11, 335.5512750949539),
(b'entering', b'into'): (14, 16.437457915441044),
(b'never', b'seen'): (42, 14.015206798866856),
(b'refrain', b'from'): (9, 12.438010669696954),
(b'at', b'once'): (263, 21.418172245725966),
(b'three', b'years'): (77, 37.3552600780725),
(b'any', b'other'): (138, 10.208401968552408),
(b'twenty', b'years'): (69, 85.29044131115464),
(b'an', b'easy'): (18, 10.427467282325322),
(b'according', b'to'): (747, 12.093327441482012),
(b'had', b'begun'): (25, 12.032976708302648),
(b'passed', b'through'): (43, 31.462199837199837),
(b'its', b'being'): (58, 16.06447342496047),
(b'deal', b'better'): (14, 19.992919953446272),
(b'fine', b'young'): (13, 10.403137320870965),
(b'belonging', b'to'): (35, 10.512393800210106),
(b'Frank', b'Churchill'): (151, 1750.6779772753712),
(b'Miss', b'Bates'): (113, 400.4260773639656),
(b'a', b'few'): (404, 11.554600796586188),
(b'few', b'days'): (53, 35.91529644067919),
(b'I', b'suppose'): (210, 12.338158409520457),
(b'very', b'handsome'): (21, 19.759437654764756),
(b'an', b'irresistible'): (7, 11.369078040261053),
(b'good', b'sense'): (28, 17.373370904818092),
(b'had', b'already'): (64, 11.990720435581183),
(b'She', b'felt'): (26, 13.338332744482605),
(b'most', b'fortunate'): (6, 11.471572464709046),
(b'long', b'enough'): (38, 15.189529976554649),
(b'know', b'how'): (120, 12.782869530582152),
(b'dear', b'Emma'): (31, 28.38977406756079),
(b'at', b'Randalls'): (39, 27.033755046414154),
(b'few', b'weeks'): (19, 134.46858012611438),
(b'no', b'longer'): (113, 44.45340841020726),
(b'CHAPTER', b'III'): (10, 354.1930126002291),
(b'Donwell', b'Abbey'): (9, 753.4827901309776),
(b'card', b'-'): (18, 15.66232807325151),
(b'drawing', b'-'): (53, 20.08471734497107),
(b'-', b'room'): (116, 10.86339885106585),
(b'thrown', b'away'): (11, 14.820643707910945),
(b'After', b'these'): (18, 11.091988134657838),
(b'an', b'invitation'): (11, 10.459551797040168),
(b'old', b'lady'): (16, 10.886003613395488),
(b'those', b'who'): (150, 15.975643084740117),
(b'as', b'possible'): (81, 11.70949877112537),
(b'young', b'ladies'): (44, 113.63480411788262),
(b'-', b'fashioned'): (31, 34.9390395480226),
(b'Goddard', b"'"): (34, 15.29483969345862),
(b'found', b'herself'): (27, 11.22605649143189),
(b's', b'sake'): (142, 28.092000957580005),
(b'much', b'pleased'): (18, 13.276184200965723),
(b'be', b'allowed'): (32, 10.133274789777253),
(b'Miss', b'Smith'): (58, 165.24316871017183),
(b'Harriet', b'Smith'): (31, 180.54871092346391),
(b'several', b'years'): (10, 17.577367349955093),
(b'pretty', b'girl'): (10, 40.4556337659619),
(b'blue', b'eyes'): (28, 35.59533676681832),
(b'They', b'were'): (188, 10.653371553453791),
(b'due', b'time'): (18, 21.04160963161683),
(b'its', b'own'): (54, 10.834370434674414),
(b'better', b'than'): (170, 42.51368036077655),
(b'body', b'else'): (31, 39.469837225343845),
(b'apple', b'-'): (26, 28.21999348109518),
(b'You', b'need'): (16, 14.652632145780382),
(b'half', b'-'): (179, 14.007817698976801),
(b'much', b'more'): (159, 10.555945043417811),
(b'little', b'girl'): (50, 35.05677573772557),
(b'at', b'last'): (420, 22.75660921756972),
(b'CHAPTER', b'IV'): (8, 335.5512750949539),
(b'every', b'respect'): (14, 12.224980231945178),
(b'guided', b'by'): (14, 23.954538020555372),
(b'different', b'sort'): (8, 14.490845895493244),
(b'-', b'Mill'): (7, 12.705105290190035),
(b'good', b'deal'): (62, 39.176849314155014),
(b'very', b'happy'): (43, 11.360756752157577),
(b'drink', b'tea'): (7, 32.50399453379586),
(b'large', b'enough'): (11, 10.829067985816224),
(b'had', b'taken'): (121, 10.962547054002629),
(b'doing', b'something'): (9, 10.716846747710356),
(b'three', b'miles'): (9, 16.651749532156657),
(b'thing', b'else'): (26, 12.21019907747794),
(b'very', b'obliging'): (14, 25.349278570257418),
(b'on', b'purpose'): (35, 10.833361556419508),
(b'very', b'clever'): (15, 21.69532850607617),
(b'"', b'You'): (493, 11.717061811021022),
(b'know', b'what'): (219, 10.687860772182088),
(b'Miss', b'Nash'): (13, 337.68125042659204),
(b'does', b'not'): (211, 13.23025142257054),
(b'"', b'Oh'): (496, 20.296685763864517),
(b'Oh', b'yes'): (11, 23.468003288849534),
(b'very', b'entertaining'): (7, 16.054543094496367),
(b'soon', b'as'): (271, 12.01164260504438),
(b'Oh', b'!'): (285, 31.12698837846826),
(b'have', b'seen'): (204, 13.438796505596505),
(b'on', b'horseback'): (21, 54.889031885858834),
(b'their', b'families'): (95, 35.26311487873489),
(b'no', b'doubt'): (117, 40.18959620751882),
(b'very', b'respectable'): (9, 10.88443599626872),
(b'respectable', b'young'): (8, 27.70496528037034),
(b'very', b'odd'): (16, 18.206182890665982),
(b'perfectly', b'right'): (12, 16.998927982407917),
(b'years', b'hence'): (9, 17.990952464071682),
(b'young', b'woman'): (57, 30.4001550358284),
(b'very', b'desirable'): (9, 14.595039176814876),
(b'Dear', b'Miss'): (9, 32.278354820188945),
(b'thirty', b'years'): (35, 72.53269372866845),
(b'can', b'afford'): (11, 26.391592873146273),
(b'good', b'luck'): (21, 51.57752734020704),
(b'acquainted', b'with'): (88, 27.73083464345721),
(b'your', b'own'): (181, 10.134692346685467),
(b'"', b'Yes'): (349, 27.046036922192854),
(b'next', b'day'): (100, 33.66839067944251),
(b'an', b'opportunity'): (34, 39.49570340028189),
(b'few', b'yards'): (15, 127.20000822740548),
(b'Robert', b'Martin'): (31, 1963.7208109428432),
(b'few', b'minutes'): (86, 316.36383789006993),
(b'Only', b'think'): (9, 11.416616668358916),
(b'been', b'able'): (40, 15.91768079235648),
(b'-', b'morrow'): (134, 31.191253298926746),
(b'should', b'happen'): (13, 20.434212265397832),
(b'Do', b'you'): (187, 17.542972188763283),
(b'compared', b'with'): (25, 15.313211901507499),
(b'"', b'Certainly'): (39, 25.246462114062453),
(b'You', b'must'): (84, 12.864839977038008),
(b'an', b'old'): (158, 10.467761806300013),
(b'old', b'man'): (201, 11.807109991026596),
(b'more', b'valuable'): (10, 17.665941085058734),
(b',"', b'replied'): (256, 68.6325679964557),
(b'very', b'bad'): (36, 15.601593602802112),
(b'deal', b'too'): (15, 12.720000822740548),
(b'no', b'more'): (553, 17.350760547573774),
(b'very', b'agreeable'): (21, 21.406057459328487),
(b'fixed', b'on'): (31, 10.722857321964225),
(b'same', b'time'): (104, 18.367425129939775),
(b'pleasing', b'young'): (8, 23.351327879169286),
(b'CHAPTER', b'V'): (7, 236.1286750668194),
(b'very', b'differently'): (14, 48.16362928348909),
(b'"', b'Perhaps'): (40, 10.879118421244423),
(b'ever', b'since'): (60, 42.92801610440095),
(b'twelve', b'years'): (22, 39.389282288763),
(b'very', b'neatly'): (7, 22.935061563566236),
(b'ten', b'years'): (32, 36.45848645037043),
(b'being', b'able'): (20, 14.36891827839066),
(b'her', b'mother'): (239, 11.54362134822892),
(b'have', b'spoken'): (82, 11.59021954484605),
(b'Yes', b',"'): (107, 26.304593789876648),
(b'"', b'Thank'): (43, 24.575778111032328),
(b'Thank', b'you'): (46, 27.918884665527383),
(b'"', b'Why'): (191, 10.548941903388105),
(b'could', b'possibly'): (21, 30.820114126236575),
(b'How', b'can'): (53, 16.088661540487763),
(b'much', b'mistaken'): (9, 10.098977725520935),
(b'Very', b'well'): (40, 84.54149008885851),
(b'oh', b'!'): (27, 22.810780631748376),
(b'look', b'at'): (154, 10.281997140291956),
(b'any', b'harm'): (10, 10.323534321581198),
(b'"', b'Very'): (70, 19.596435652445642),
(b'an', b'angel'): (52, 25.819271767903782),
(b'an', b'end'): (127, 18.27126745760167),
(b'many', b'years'): (54, 19.719030792857517),
(b',"', b'cried'): (297, 34.723966485732284),
(b'much', b'obliged'): (40, 42.98889166944722),
(b'John', b'Knightley'): (58, 175.90370362419566),
(b'ill', b'-'): (100, 22.276568839343266),
(b'cared', b'for'): (14, 11.003932384341637),
(b'I', b'assure'): (105, 13.117491742454519),
(b'assure', b'you'): (125, 32.47600092425325),
(b'soon', b'afterwards'): (36, 80.80970084767553),
(b'CHAPTER', b'VI'): (6, 151.7970054000982),
(b'most', b'agreeable'): (13, 28.296545412948984),
(b'no', b'scruple'): (10, 27.380722571504467),
(b'infinitely', b'superior'): (7, 278.568018018018),
(b'am', b'glad'): (34, 17.031537511870848),
(b'Exactly', b'so'): (9, 26.019606605658986),
(b'Did', b'you'): (73, 15.109886745810456),
(b'very', b'interesting'): (15, 17.838381216107074),
(b'No', b'sooner'): (14, 65.68697776518907),
(b'Don', b"'"): (134, 25.405723462879433),
(b"'", b't'): (2200, 30.669962908216693),
(b't', b'pretend'): (9, 22.215392905253704),
(b'why', b'should'): (57, 22.803134218289085),
(b'cannot', b'imagine'): (13, 50.34105640180307),
(b'back', b'again'): (74, 19.215480532814567),
(b'almost', b'every'): (37, 10.072259575009541),
(b'higher', b'than'): (34, 46.27167976056865),
(b'ten', b'times'): (16, 32.71643894251348),
(b'dear', b'Isabella'): (6, 10.167718917496957),
(b'must', b'allow'): (12, 16.27068909786588),
(b'sitting', b'down'): (24, 17.45874757620558),
(b'fore', b'-'): (11, 15.528462021343378),
(b'must', b'confess'): (10, 12.590414182872408),
(b'depended', b'on'): (14, 20.58338695719706),
(b'no', b'sooner'): (26, 28.749758700079695),
(b'after', b'breakfast'): (11, 12.17388137561763),
(b'sooner', b'than'): (12, 16.389331849226902),
(b'at', b'home'): (154, 15.749528854479703),
(b'at', b'least'): (301, 41.37059021349224),
(b'Upon', b'my'): (40, 19.147177205335975),
(b'Will', b'you'): (82, 14.912145378521789),
(b"'", b'd'): (2523, 30.546910997373118),
(b'She', b'paused'): (9, 28.95364951542675),
(b'replied', b'Emma'): (16, 16.331043870304843),
(b'can', b'hardly'): (33, 26.420626528672287),
(b'am', b'persuaded'): (12, 14.509626277861573),
(b'Are', b'you'): (83, 16.465332468596394),
(b'I', b'beg'): (52, 10.231643559114522),
(b'beg', b'your'): (38, 44.837927443381055),
(b'your', b'pardon'): (41, 42.182804133556154),
(b'dear', b'Miss'): (28, 19.29791549647382),
(b'little', b'while'): (54, 10.254870302373984),
(b'`', b'No'): (8, 11.44545824696476),
(b'entered', b'into'): (98, 58.47423553525749),
(b'older', b'than'): (15, 59.834068655907735),
(b'advise', b'you'): (16, 10.800158446902824),
(b'run', b'away'): (46, 41.70122298206315),
(b'At', b'last'): (91, 78.82820279093205),
(b'"', b'Indeed'): (53, 16.73627360604856),
(b'Dear', b'me'): (17, 11.639624322425217),
(b'have', b'borne'): (26, 11.406974967061926),
(b'good', b'opinion'): (19, 14.834508731529459),
(b'good', b'natured'): (17, 34.331291635825316),
(b'thank', b'you'): (59, 16.88752048061169),
(b'merely', b'because'): (10, 11.301099005522438),
(b'Emma', b'felt'): (19, 16.55226511478519),
(b'no', b'difficulty'): (15, 13.258034087254796),
(b'protest', b'against'): (6, 10.157197996222385),
(b'Let', b'us'): (117, 32.264040693882876),
(b'cried', b'Emma'): (27, 14.204759677426091),
(b'"', b'Has'): (17, 11.456377766045145),
(b'next', b'morning'): (62, 76.77797550074492),
(b'dear', b'sir'): (21, 30.138577189712098),
(b'am', b'going'): (42, 10.080878050929599),
(b'sat', b'down'): (150, 58.92285841199917),
(b'depends', b'upon'): (8, 18.878472434214544),
(b'has', b'happened'): (14, 16.886495752427184),
(b'presently', b'added'): (6, 24.986707070707073),
(b'could', b'afford'): (11, 16.180559916274202),
(b'Certainly', b',"'): (12, 17.049273752697825),
(b'stood', b'up'): (80, 10.28632971468491),
(b'"', b'Nonsense'): (8, 10.024330545289503),
(b'are', b'mistaken'): (17, 10.34529425541169),
(b'does', b'seem'): (9, 14.17908976269632),
(b'few', b'moments'): (43, 388.789590364635),
(b'nobody', b'knows'): (7, 38.60305867665418),
(b'very', b'likely'): (29, 25.18359701097469),
(b'all', b'probability'): (16, 13.453639483026294),
(b'no', b'harm'): (25, 27.989183073093457),
(b'cannot', b'help'): (16, 20.51549678061432),
(b'very', b'different'): (29, 17.75617669437386),
(b'common', b'sense'): (26, 145.78524280999528),
(b'.--', b'She'): (78, 30.440671188909267),
(b'-', b'natured'): (60, 48.041179378531076),
(b'an', b'hundred'): (183, 30.38185737390911),
(b'exactly', b'what'): (25, 14.387504885629737),
(b'every', b'man'): (307, 12.828795338958603),
(b'be', b'satisfied'): (66, 12.272908603731745),
(b'less', b'than'): (85, 36.58643688514103),
(b'large', b'fortune'): (9, 38.26270688321733),
(b'no', b'use'): (40, 14.694321113374066),
(b'these', b'words'): (111, 26.137410685805424),
(b'well', b'acquainted'): (9, 11.682096812278632),
(b'twenty', b'thousand'): (48, 77.04104378157882),
(b'thousand', b'pounds'): (47, 448.5645551257253),
(b'Good', b'morning'): (10, 19.263291344272915),
(b'walked', b'off'): (15, 10.916907922609802),
(b'cast', b'down'): (44, 15.322709019072688),
(b'its', b'effects'): (7, 29.72249056785139),
(b'deal', b'more'): (28, 10.737496923771433),
(b'longer', b'than'): (31, 18.302185706512955),
(b'perfectly', b'satisfied'): (12, 93.75697392359005),
(b'three', b'hundred'): (77, 49.30309130408786),
(b'looking', b'at'): (106, 12.747008682016011),
(b'next', b'moment'): (21, 24.667278275263754),
(b'ready', b'wit'): (8, 70.12901152901154),
(b'very', b'pleasant'): (19, 11.64578255559322),
(b'an', b'idea'): (37, 14.230682717061454),
(b'Give', b'me'): (67, 25.719421458776054),
(b'arrive', b'at'): (10, 11.92665663812389),
(b'very', b'superior'): (13, 10.703028729664243),
(b'pre', b'-'): (17, 49.325702891326024),
(b'have', b'chosen'): (38, 12.418092369477911),
(b'without', b'exception'): (8, 55.117736185383244),
(b'her', b'cheeks'): (13, 10.316064715093864),
(b'sit', b'down'): (54, 36.049581123554866),
(b'reason', b'why'): (20, 38.722303389547506),
(b'could', b'hardly'): (46, 23.719027600038913),
(b'It', b'seemed'): (50, 10.045411819743178),
(b'an', b'offering'): (70, 11.10900109162763),
(b'let', b'us'): (282, 32.253455378694596),
(b'Have', b'you'): (81, 15.500622663363387),
(b'"', b'Aye'): (59, 19.50680538542822),
(b'Very', b'true'): (18, 128.88521410135147),
(b'can', b'easily'): (12, 15.161836814749437),
(b'Nobody', b'could'): (9, 15.935399917542776),
(b'dear', b'mother'): (21, 13.460467748430139),
(b'those', b'things'): (85, 16.269842914134045),
(b'next', b'week'): (12, 51.500749500333114),
(b'Why', b'should'): (43, 13.316921218220724),
(b'.--', b'Poor'): (10, 33.94933025911286),
(b'taken', b'away'): (75, 33.882342535970594),
(b'stay', b'longer'): (9, 32.079105716360615),
(b'three', b'days'): (97, 38.359912674236874),
(b'cannot', b'bear'): (17, 21.2053080048691),
(b'We', b'are'): (133, 13.053427143739613),
(b'four', b'o'): (8, 13.636425778378854),
(b'o', b"'"): (216, 29.05179422396496),
(b"'", b'clock'): (67, 18.373899375590025),
(b'ask', b'whether'): (9, 11.352902840883013),
(b're', b'-'): (54, 17.206160179428213),
(b'Of', b'course'): (52, 64.52969306765831),
(b'ran', b'away'): (24, 15.685508857974716),
(b'who', b'lived'): (27, 14.51537608023045),
(b'A', b'few'): (48, 27.798793006840953),
(b'.--', b'Emma'): (18, 10.090713174013455),
(b'thus', b'began'): (15, 10.569835629596586),
(b'Never', b'mind'): (10, 58.81881301122313),
(b'good', b'fortune'): (18, 19.83585738958796),
(b'Those', b'who'): (17, 18.714038582776446),
(b'Jane', b'Fairfax'): (111, 897.6983416183942),
(b'nothing', b'else'): (45, 34.14808689686209),
(b'present', b'instance'): (6, 15.747924624395214),
(b'These', b'are'): (118, 23.590054389582264),
(b'once', b'more'): (124, 21.461454181291764),
(b'still', b'greater'): (11, 12.724188305008026),
(b'here', b'comes'): (14, 13.200741413619147),
(b'turned', b'back'): (42, 28.862454905522338),
(b'will', b'bring'): (144, 12.921944022284055),
(b'each', b'side'): (37, 22.901592275974444),
(b'waiting', b'for'): (50, 11.22850243300167),
(b'still', b'remained'): (10, 13.210905890445488),
(b'she', b'hoped'): (24, 14.771734566364692),
(b'ten', b'minutes'): (39, 192.59628296373643),
(b'most', b'favourable'): (6, 12.48377003512455),
(b'ten', b'days'): (21, 17.36302410708332),
(b'many', b'months'): (10, 10.650678561587213),
(b'little', b'ones'): (53, 64.8622484431334),
(b'-', b'tempered'): (21, 31.944264729620663),
(b'passed', b'over'): (52, 23.11431354773038),
(b'sir', b',"'): (108, 26.985788449773853),
(b'cannot', b'deny'): (9, 34.78623560349313),
(b'talking', b'about'): (24, 19.191191650605955),
(b'never', b'forget'): (18, 21.690200998246326),
(b'cannot', b'tell'): (30, 18.28907708359457),
(b'two', b'years'): (53, 12.9458705561574),
(b'indeed', b'!--'): (19, 15.372462749108678),
(b'most', b'amiable'): (8, 17.207358697063572),
(b',"', b'observed'): (18, 11.710949290013575),
(b'our', b'lives'): (16, 19.654875413170608),
(b'think', b'differently'): (6, 12.463139862958483),
(b'shake', b'hands'): (13, 44.83992241738721),
(b'How', b'long'): (48, 18.446626247771526),
(b'South', b'End'): (8, 1381.4318689501117),
(b'perfectly', b'convinced'): (8, 75.68177368034593),
(b'tells', b'me'): (15, 12.932915913805795),
(b'bad', b'cold'): (7, 14.307186896320372),
(b'far', b'off'): (67, 27.665131723254174),
(b'am', b'sorry'): (32, 26.675240833932428),
(b'Ah', b'!"'): (14, 17.41235304055111),
(b'an', b'interval'): (9, 11.127182762808689),
(b'perfectly', b'well'): (16, 14.848043217286914),
(b'He', b'paused'): (14, 28.88198206613114),
(b'can', b'tell'): (51, 12.430822730105128),
(b'morrow', b'morning'): (14, 22.316089764922395),
(b'own', b'feelings'): (16, 10.723815634320593),
(b'sore', b'throat'): (7, 129.16060985797827),
(b'&', b'c'): (17, 4365.324705882353),
(b'well', b'satisfied'): (17, 19.87160797964922),
(b'looked', b'at'): (184, 13.030752013575848),
(b'well', b'pleased'): (21, 19.251395495889877),
(b'set', b'forward'): (22, 30.104250841148257),
(b'eldest', b'daughter'): (10, 86.17906911928651),
(b'short', b'time'): (23, 10.749383663832438),
(b'Ha', b'!'): (35, 53.03251532865493),
(b'"', b'Quite'): (25, 17.43361833963392),
(b',"', b'continued'): (103, 41.17971551769827),
(b'dining', b'-'): (20, 27.583452274754684),
(b'such', b'circumstances'): (10, 10.212955347912391),
(b'enter', b'into'): (108, 69.91980623618285),
(b'gone', b'through'): (24, 11.358845786496218),
(b'turn', b'away'): (49, 25.69282323847976),
(b',"', b'repeated'): (29, 16.023102816168517),
(b'several', b'times'): (18, 100.66202138582126),
(b'great', b'curiosity'): (13, 12.762131764948666),
(b'upper', b'end'): (11, 50.599001800032724),
(b'an', b'odd'): (25, 26.95760772433033),
(b'In', b'short'): (23, 17.859158861411398),
(b'dearest', b'Emma'): (8, 41.198769763723575),
(b'continued', b'Mrs'): (17, 12.808985040466856),
(b'go', b'home'): (35, 10.887636156851283),
(b'covered', b'with'): (53, 10.080468634592364),
(b'hardly', b'knew'): (16, 30.25731454547072),
(b'knew', b'how'): (29, 10.771586203292852),
(b'set', b'off'): (42, 12.87623069823977),
(b'can', b'get'): (31, 10.915759166817718),
(b'got', b'home'): (12, 13.02303200594452),
(b'most', b'extraordinary'): (16, 46.22702963501566),
(b'an', b'inch'): (27, 63.919483204134366),
(b'at', b'ease'): (36, 17.60601694199241),
(b'tete', b'-'): (7, 10.750473707083877),
(b'well', b'known'): (33, 14.650186556114896),
(b'Smith', b'!--'): (9, 18.891159579667644),
(b'extremely', b'sorry'): (8, 73.46994297481388),
(b'Every', b'thing'): (18, 22.46842755413457),
(b'many', b'weeks'): (10, 20.757955155746508),
(b'Am', b'I'): (43, 15.552098209854075),
(b'madam', b',"'): (13, 15.249039878189363),
(b'extremely', b'well'): (16, 29.947748184019368),
(b'!--', b'Such'): (10, 19.885431136492258),
(b'poor', b'Harriet'): (13, 12.145847347359634),
(b'-', b'headed'): (29, 37.268308851224106),
(b'an', b'instant'): (95, 43.261013866434524),
(b'thirty', b'thousand'): (18, 42.25608127995964),
(b'so', b'easily'): (19, 10.348707172705279),
(b'worth', b'having'): (9, 21.664774916798038),
(b'poor', b'girl'): (11, 16.40337746734157),
(b'laugh', b'at'): (34, 11.791126449054302),
(b'knowing', b'what'): (21, 15.75053166426834),
(b'many', b'days'): (49, 14.230254790394108),
(b'whole', b'party'): (15, 21.733756466486728),
(b'six', b'weeks'): (6, 18.23820337383508),
(b'too', b'late'): (56, 87.81454226528567),
(b'-', b'minded'): (19, 20.81474696477942),
(b'her', b'companions'): (36, 11.854581263120794),
(b'drew', b'near'): (34, 135.39134997206745),
(b'three', b'months'): (35, 82.38692831354994),
(b'other', b'side'): (133, 28.113809270630338),
(b'an', b'unnatural'): (10, 19.2271172739709),
(b'get', b'rid'): (18, 302.7023984336759),
(b'watering', b'-'): (10, 20.552376204719177),
(b'while', b'ago'): (10, 15.477085481465169),
(b'at', b'Weymouth'): (16, 43.7310743397876),
(b'present', b'occasion'): (9, 32.735843313703434),
(b'No', b',"'): (86, 11.56716665027149),
(b'their', b'hearts'): (49, 18.284578085269942),
(b'break', b'through'): (12, 11.454968114101241),
(b'burst', b'forth'): (18, 49.66500488033903),
(b'young', b'men'): (141, 28.058139771512256),
(b'-', b'bred'): (21, 27.95123163841808),
(b'nobody', b'else'): (20, 96.32925917464537),
(b'something', b'else'): (35, 38.92533060859677),
(b'walking', b'together'): (9, 11.84464959491295),
(b'burst', b'out'): (19, 11.10216993785889),
(b'-', b'sized'): (12, 27.17480853735091),
(b'how', b'long'): (57, 10.246816619029866),
(b'Miss', b'Fairfax'): (125, 273.22756777255375),
(b'extremely', b'happy'): (7, 19.519016507275197),
(b'don', b"'"): (693, 30.892577640288216),
(b'ma', b"'"): (213, 29.826517196587346),
(b's', b'handwriting'): (7, 11.482861705288176),
(b'Ma', b"'"): (15, 17.287270917893842),
(b'without', b'seeming'): (8, 24.251803921568627),
(b'Colonel', b'Campbell'): (28, 896.7708845596117),
(b'those', b'days'): (84, 24.94261977039944),
(b'Miss', b'Campbell'): (12, 75.31616124710753),
(b'most', b'charming'): (7, 10.48020200479592),
(b'caught', b'hold'): (10, 25.41177679158448),
(b'four', b'months'): (9, 21.51366300812301),
(b'may', b'guess'): (11, 13.366929644439642),
(b'Bless', b'me'): (14, 16.96059544124817),
(b'running', b'away'): (12, 11.650452540443363),
(b'My', b'father'): (32, 11.188894979347538),
(b'five', b'minutes'): (37, 145.59218386745533),
(b'nine', b'years'): (9, 22.042968784808547),
(b'hundred', b'pounds'): (11, 62.90946390424197),
(b'more', b'honourable'): (10, 10.647964489624442),
(b'rather', b'than'): (75, 19.03811275415246),
(b'few', b'months'): (17, 59.138014296301606),
(b'she', b'wished'): (31, 11.819087018586881),
(b'without', b'feeling'): (12, 13.668488522623221),
(b'twelve', b'thousand'): (24, 59.18560102353321),
(b'passed', b'between'): (17, 19.790738607270864),
(b",'", b'said'): (250, 30.379793675936877),
(b'Miss', b'Hawkins'): (18, 356.6758207630878),
(b'dear', b'Jane'): (14, 28.087065128531204),
(b'three', b'minutes'): (10, 10.882367432840969),
(b'have', b'suffered'): (29, 12.705105290190035),
(b'hour', b'ago'): (10, 35.65865950134926),
(b'looked', b'round'): (26, 11.609345695809427),
(b'help', b'thinking'): (10, 32.395023572551075),
(b'a', b'series'): (16, 10.464298240216587),
(b'laughed', b'at'): (28, 12.563958901229746),
(b'weeks', b'ago'): (7, 66.07767923923495),
(b'She', b'wished'): (10, 10.481853991240559),
(b'twenty', b'miles'): (8, 32.07910571636062),
(b'elder', b'sister'): (6, 20.892601351351352),
(b'alas', b'!'): (23, 57.08794297143443),
(b'no', b'fault'): (13, 10.495943652410048),
(b'driven', b'away'): (10, 15.108423197384944),
(b'setting', b'off'): (8, 17.077163107511044),
(b'little', b'farther'): (16, 13.377838741396264),
(b'spot', b'where'): (18, 40.488884971796935),
(b'front', b'door'): (15, 45.13461833203179),
(b'they', b'parted'): (20, 10.443101887710071),
(b'without', b'delay'): (8, 17.832208765859285),
(b'six', b'months'): (21, 149.7251460218503),
(b'months', b'ago'): (7, 33.90373070913626),
(b'leaned', b'back'): (6, 12.550400811770675),
(b'at', b'Oxford'): (10, 14.908320797654866),
(b'turned', b'round'): (20, 11.258866238141938),
(b'pass', b'through'): (62, 22.187894661308842),
(b'clock', b'struck'): (16, 287.94205291005295),
(b'four', b'hours'): (15, 47.339972748289114),
(b'faster', b'than'): (10, 39.88937910393849),
(b'musical', b'society'): (6, 113.40931597285898),
(b'worth', b'while'): (16, 39.414977692797954),
(b'mixed', b'with'): (23, 10.669850873308452),
(b'extremely', b'glad'): (10, 72.78966572504707),
(b'knew', b'nothing'): (25, 12.448864562804845),
(b'make', b'amends'): (9, 67.17951224811253),
(b'amends', b'for'): (12, 15.103436605959109),
(b'oftener', b'than'): (9, 50.68297580265126),
(b'old', b'woman'): (61, 19.444449684793035),
(b'post', b'-'): (19, 12.22866384180791),
(b'just', b'going'): (25, 13.260137742937028),
(b'At', b'least'): (17, 28.035284695357834),
(b'their', b'lives'): (30, 14.905906047774408),
(b'six', b'days'): (15, 14.20782138820221),
(b'may', b'prove'): (9, 10.693543715551712),
(b'stronger', b'than'): (29, 58.08610709966773),
(b'particular', b'friend'): (10, 19.81788291694974),
(b'Hum', b'!'): (7, 26.95819529206626),
(b'good', b'tidings'): (14, 31.69042304845414),
(b'among', b'themselves'): (30, 14.912692334403358),
(b'next', b'summer'): (8, 20.513743918620076),
(b'breaking', b'up'): (12, 10.278262005104766),
(b'perfectly', b'safe'): (6, 16.44561748750133),
(b'two', b'ladies'): (12, 10.206988182478378),
(b'same', b'moment'): (29, 15.560860129375872),
(b'well', b'worth'): (11, 11.682096812278632),
(b',"', b'added'): (51, 22.00020571516956),
(b'little', b'girls'): (15, 24.596639156806205),
(b'be', b'ashamed'): (88, 15.57521865836133),
(b'been', b'staying'): (9, 13.958581617912605),
(b'shut', b'up'): (64, 33.02080970890481),
(b'too', b'large'): (19, 13.758850077869242),
(b'At', b'first'): (26, 13.622350914881029),
(b'worse', b'than'): (55, 50.56400168104879),
(b'opposite', b'side'): (9, 21.484700834657843),
(b'short', b'pause'): (11, 86.61965123608095),
(b'large', b'party'): (8, 14.159718830137988),
(b'six', b'years'): (19, 24.75055888120844),
(b'who', b'knows'): (23, 17.154535367545076),
(b'extremely', b'fond'): (6, 34.25396034119863),
(b'or', b'twice'): (16, 10.479935604134893),
(b'somebody', b'else'): (14, 146.97092685503037),
(b'five', b'couple'): (7, 31.78235173193545),
(b'"', b'Don'): (54, 12.435245486561662),
(b'bad', b'news'): (7, 32.31039707419018),
(b'baked', b'apples'): (6, 613.5128968253968),
(b'will', b'send'): (73, 16.24789061081296),
(b'William', b'Larkins'): (13, 5074.223589743589),
(b'low', b'voice'): (39, 55.283668345011634),
(b'one', b'leg'): (17, 10.745803649000868),
(b'an', b'immediate'): (12, 14.761464229693788),
(b'Tell', b'me'): (40, 25.092668376242766),
(b',"', b'resumed'): (18, 28.97963722613529),
(b'many', b'times'): (18, 11.52315244811375),
(b'Nothing', b'can'): (12, 15.5144376709064),
(b'few', b'words'): (18, 11.709704106675987),
(b'no', b'objection'): (17, 30.84522216218463),
(b'It', b'seems'): (24, 18.227904675031336),
(b'astonished', b'at'): (22, 13.982976748145253),
(b'four', b'times'): (13, 17.90461714401523),
(b'other', b'end'): (40, 10.241020603889318),
(b'few', b'hours'): (23, 78.07853039580834),
(b'an', b'extraordinary'): (13, 10.355991878257592),
(b'look', b'forward'): (11, 11.731590179742955),
(b'Alas', b'!'): (16, 24.207359037773784),
(b'immediately', b'followed'): (7, 12.604630780832807),
(b'wait', b'till'): (17, 29.399153804709538),
(b'-', b'bye'): (37, 39.93033091202583),
(b'contrast', b'between'): (12, 166.24220430107528),
(b'dared', b'not'): (22, 11.783382014386875),
(b'three', b'weeks'): (9, 21.40939225562999),
(b'-', b'sighted'): (11, 32.25142112125163),
(b'Maple', b'Grove'): (31, 16731.47346514048),
(b'My', b'brother'): (17, 11.063251359603091),
(b'at', b'Maple'): (10, 11.541925778829572),
(b'almost', b'fancy'): (9, 16.725382014874913),
(b'left', b'behind'): (27, 29.19139358595579),
(b'barouche', b'-'): (7, 13.97561581920904),
(b'-', b'landau'): (7, 19.965165456012915),
(b'whose', b'name'): (60, 31.460744776298874),
(b'most', b'serious'): (8, 10.186756348661634),
(b'We', b'cannot'): (19, 11.86846921192564),
(b'waited', b'for'): (38, 10.711792586527258),
(b'E', b'.,'): (6, 566.3195970695971),
(b'person', b'who'): (30, 10.409305441471528),
(b'greater', b'part'): (10, 16.18360863375623),
(b'drew', b'back'): (11, 15.195552552368083),
(b'Her', b'manners'): (6, 10.694144704987204),
(b'third', b'time'): (23, 13.583626301884719),
(b'very', b'extraordinary'): (12, 11.12691105559154),
(b'better', b'acquainted'): (7, 13.44978251413658),
(b'According', b'to'): (45, 10.652559050879573),
(b'have', b'committed'): (34, 16.773483913206135),
(b'hardly', b'less'): (8, 12.999957957579198),
(b'will', b'shew'): (48, 12.349694416837725),
(b'little', b'boys'): (17, 17.749466634776372),
(b'easily', b'believe'): (8, 21.82456945228684),
(b'my', b'lord'): (180, 36.09212316660775),
(b'"', b'Excuse'): (11, 17.184566649067715),
(b'Excuse', b'me'): (13, 37.6902120916626),
(b'put', b'forth'): (36, 10.696077873443938),
(b'drawing', b'near'): (8, 18.702625052924454),
(b'great', b'joy'): (24, 10.008039650578848),
(b'eight', b'o'): (9, 65.15181205225453),
(b'spread', b'abroad'): (14, 194.30906996229578),
(b'few', b'lines'): (10, 43.57778059642595),
(b'good', b'news'): (18, 24.794821736984947),
(b'most', b'likely'): (12, 19.419197832415968),
(b'talk', b'about'): (35, 21.82456945228684),
(b'tells', b'us'): (12, 30.445234478296342),
(b'dear', b'madam'): (10, 68.52158400921863),
(b'eleven', b'years'): (6, 16.99145510495659),
(b'your', b'sister'): (79, 15.96501961999174),
(b'two', b'hours'): (17, 15.078657986492088),
(b'two', b'months'): (19, 19.986458535324154),
(b'door', b'opened'): (19, 32.959977767541375),
(b'Who', b'can'): (30, 10.979605329969967),
(b'began', b'talking'): (9, 13.574813692886854),
(b'mean', b'?"'): (59, 26.417439208554274),
(b'In', b'spite'): (9, 11.906105907607598),
(b'many', b'hours'): (12, 13.124384550084889),
(b'few', b'steps'): (8, 22.23496206809765),
(b'most', b'excellent'): (10, 12.940493329092522),
(b'later', b'than'): (8, 11.966813731181546),
(b'whole', b'story'): (18, 36.394832862523536),
(b'whole', b'history'): (8, 19.243869803335823),
(b'lined', b'with'): (12, 13.540103155017157),
(b'-', b'plaister'): (9, 17.4695197740113),
(b'Lord', b'bless'): (12, 16.701184413580247),
(b'these', b'things'): (325, 42.22024236217784),
(b'laid', b'down'): (26, 12.408517237129121),
(b'forty', b'years'): (63, 161.26435572340617),
(b'faint', b'smile'): (6, 24.270839874411305),
(b'turned', b'towards'): (11, 11.117214558790042),
(b'totally', b'different'): (7, 158.3259088581669),
(b'Box', b'Hill'): (18, 8589.180555555555),
(b'some', b'surprise'): (19, 22.397283733443707),
(b'may', b'depend'): (9, 21.164305270362764),
(b',"', b'interrupted'): (25, 30.235165275720284),
(b'whatever', b'else'): (8, 18.739462440532105),
(b'mid', b'-'): (22, 25.824507492016703),
(b'larger', b'than'): (11, 19.006115925994223),
(b'were', b'assembled'): (17, 18.151482242442032),
(b'insisted', b'on'): (9, 11.611141360470139),
(b'clothed', b'with'): (36, 12.658921838579532),
(b'twenty', b'minutes'): (7, 11.181099087860133),
(b'quite', b'alone'): (14, 11.34125512474631),
(b'etc', b'.,'): (11, 3964.237179487179),
(b'As', b'soon'): (46, 20.834318136064653),
(b'without', b'knowing'): (18, 34.569457344341245),
(b',"', b'whispered'): (18, 26.71560306784347),
(b'shan', b"'"): (18, 22.473452193261995),
(b'looking', b'round'): (23, 17.251680754316958),
(b'Pardon', b'me'): (7, 10.147364793909164),
(b',"', b'answered'): (143, 22.574837953998617),
(b'An', b'old'): (15, 19.494650501535183),
(b'Shall', b'we'): (25, 13.344431737263449),
(b'old', b'age'): (36, 48.2305151350481),
(b'an', b'infant'): (9, 23.77170862963675),
(b'be', b'forgiven'): (36, 18.33315259385065),
(b'lie', b'down'): (41, 31.29163861453632),
(b'four', b'miles'): (7, 16.305990613299585),
(b'great', b'hurry'): (16, 19.653682918020948),
(b'without', b'waiting'): (12, 19.24746342981637),
(b'comes', b'back'): (13, 15.420287686817213),
(b'heightened', b'by'): (7, 11.874899189677024),
(b'In', b'fact'): (28, 39.68262953497974),
(b'cut', b'off'): (213, 155.49937138546645),
(b'never', b'mind'): (29, 11.197235791371655),
(b'trembling', b'voice'): (7, 11.950048791799114),
(b'More', b'than'): (14, 24.232797805642633),
(b'time', b'past'): (23, 13.32652296494053),
(b'second', b'time'): (44, 20.945309358703252),
(b'five', b'hundred'): (65, 85.88703405366368),
(b'turning', b'away'): (13, 10.732190271245857),
(b'an', b'arrow'): (10, 14.85731789352297),
(b'--', b'oh'): (17, 12.810820328683882),
(b'presented', b'themselves'): (6, 11.894541467918142),
(b'at', b'random'): (11, 18.66781039010696),
(b'far', b'distant'): (10, 26.635412180205016),
(b'few', b'seconds'): (9, 96.54154470592826),
(b'passing', b'through'): (10, 15.455115709501674),
(b'will', b'heal'): (10, 10.57744672117128),
(b'rose', b'early'): (12, 50.250699385933345),
(b'east', b'wind'): (22, 148.3861256175018),
(b'gone', b'mad'): (10, 20.604417938295462),
(b'freed', b'from'): (10, 25.542343339556247),
(b'sinned', b'against'): (43, 81.41629018847007),
(b'locked', b'up'): (11, 13.13762812682564),
(b'deep', b'sigh'): (7, 47.7618937287612),
(b'ten', b'thousand'): (75, 127.07678714010615),
(b'happier', b'than'): (10, 23.413331213181287),
(b'contend', b'with'): (14, 10.669850873308452),
(b'had', b'formerly'): (10, 11.685871610947764),
(b'little', b'boy'): (63, 26.45163569321534),
(b'fancying', b'herself'): (6, 29.039303155522166),
(b'right', b'hand'): (196, 45.67275064863757),
(b'surrounded', b'by'): (18, 30.406938834172983),
(b'infinitely', b'more'): (8, 12.604887693122986),
(b'such', b'cases'): (8, 14.9147687533664),
(b'No', b'wonder'): (13, 19.253530718908465),
(b'poor', b'fellow'): (31, 72.63216713721062),
(b'Poor', b'fellow'): (7, 45.430376492194675),
(b'days', b'ago'): (11, 15.442637278485753),
(b'help', b'laughing'): (7, 20.696820615796522),
(b'draw', b'near'): (18, 83.0335357666646),
(b'at', b'intervals'): (30, 31.38593852137866),
(b'into', b'temptation'): (8, 11.801251836726903),
(b'stood', b'before'): (56, 10.3924315864046),
(b'Sir', b'Walter'): (136, 1001.3120125576278),
(b'Walter', b'Elliot'): (16, 158.52514448173008),
(b'Kellynch', b'Hall'): (24, 4945.285774410774),
(b'arising', b'from'): (7, 10.216937335822498),
(b'Charles', b'Musgrove'): (14, 248.91721824686942),
(b'first', b'year'): (71, 36.52536994480359),
(b'Lady', b'Elliot'): (12, 34.95596737726098),
(b'seventeen', b'years'): (7, 50.974365314869765),
(b'an', b'awful'): (13, 15.611271338865924),
(b'Lady', b'Russell'): (147, 1370.6224754175123),
(b'Anne', b'Elliot'): (23, 69.51674910071942),
(b'Miss', b'Elliot'): (48, 81.92874088041015),
(b'everybody', b'else'): (20, 116.64359274208759),
(b'her', b'mistress'): (30, 10.11840289117327),
(b'Mr', b'Elliot'): (174, 154.42250147754135),
(b'Mr', b'Shepherd'): (26, 153.50875886524824),
(b'anybody', b'else'): (21, 167.79555359595722),
(b'reference', b'to'): (30, 10.087650616363232),
(b'an', b'honest'): (28, 22.111184865066537),
(b'descend', b'into'): (11, 19.585056239674437),
(b'Mrs', b'Clay'): (66, 287.0445438704621),
(b'Miss', b'Anne'): (19, 13.816993610080877),
(b'their', b'fathers'): (151, 21.03847250792006),
(b'an', b'example'): (14, 17.82878147222756),
(b'Admiral', b'Croft'): (14, 1020.8710564930301),
(b'Mrs', b'Croft'): (41, 207.37305091376518),
(b'walked', b'along'): (8, 11.422274896105895),
(b'Frederick', b'Wentworth'): (6, 23.252406376898783),
(b'either', b'side'): (18, 19.363312306866),
(b'Captain', b'Wentworth'): (196, 976.2658980080998),
(b'eldest', b'son'): (15, 39.758333601208655),
(b'removed', b'from'): (36, 14.303712270151498),
(b'good', b'humour'): (23, 57.21881939304219),
(b'The', b'Crofts'): (8, 10.249618801378945),
(b'startled', b'by'): (14, 14.177175563185834),
(b'most', b'important'): (14, 33.80560735175321),
(b'replied', b'Anne'): (11, 13.874444726962713),
(b'at', b'Uppercross'): (20, 13.940248018586365),
(b'Great', b'House'): (13, 1177.944761904762),
(b'left', b'alone'): (16, 14.13574836462775),
(b'Mr', b'Musgrove'): (21, 32.388661211129296),
(b'Miss', b'Musgroves'): (22, 227.52303763499037),
(b'Mrs', b'Musgrove'): (66, 156.77048165232932),
(b'flower', b'-'): (23, 13.52478950246036),
(b'grown', b'up'): (19, 12.21891287320147),
(b'their', b'faces'): (62, 22.656977192617102),
(b'surprised', b'at'): (27, 14.056416752074586),
(b'ere', b'long'): (20, 33.88236905544598),
(b'anything', b'else'): (30, 74.81068120893052),
(b'quite', b'different'): (12, 19.349238133975785),
(b'their', b'sakes'): (13, 16.878072078710716),
(b'twentieth', b'year'): (13, 185.54485448544855),
(b'on', b'board'): (69, 34.25698443940126),
(b'eight', b'years'): (22, 61.89744359662757),
(b'-', b'bone'): (22, 13.499174370826912),
(b'their', b'heads'): (77, 21.62117992564767),
(b'Your', b'sister'): (11, 24.014484311898105),
(b'dressing', b'-'): (19, 29.645245677110086),
(b'up', b'stairs'): (15, 14.51249618660972),
(b'waited', b'till'): (7, 12.054520291606565),
(b'third', b'part'): (39, 80.67867146551659),
(b'Phoo', b'!'): (7, 23.96284025961445),
(b'dear', b'fellow'): (11, 20.63122602168474),
(b'good', b'cheer'): (14, 58.85364280427197),
(b'Mrs', b'Harville'): (24, 84.63892670628489),
(b'"', b'Ay'): (34, 18.457497511961623),
(b'fifteen', b'years'): (9, 52.05892627901593),
(b'Charles', b'Hayter'): (33, 2649.294369645043),
(b'came', b'near'): (42, 11.627278418555207),
(b'Her', b'husband'): (9, 21.943829394649068),
(b'two', b'hundred'): (102, 34.52901130958137),
(b'Dr', b'Shirley'): (9, 1086.3785682916116),
(b'went', b'up'): (206, 10.892879248012838),
(b'within', b'reach'): (7, 18.479083248670293),
(b'-', b'yard'): (19, 16.037591923682502),
(b'turn', b'back'): (15, 10.596023199431151),
(b'walking', b'along'): (8, 19.12445108751675),
(b'leaning', b'against'): (12, 23.700128657852233),
(b'trodden', b'under'): (9, 72.24544392523364),
(b'under', b'foot'): (15, 22.22936736161035),
(b'Louisa', b'Musgrove'): (15, 189.52528348145879),
(b'provoke', b'me'): (18, 16.48946779010239),
(b'Very', b'good'): (11, 10.325200491977538),
(b'good', b'humoured'): (9, 26.15717457967643),
(b'Captain', b'Harville'): (37, 475.42275075075077),
(b'at', b'Lyme'): (24, 20.293117264867515),
(b'earnest', b'desire'): (6, 32.79008483563097),
(b'Captain', b'Benwick'): (56, 811.8267953667954),
(b'an', b'officer'): (9, 10.895366455250176),
(b'place', b'where'): (114, 29.88335727268697),
(b'-', b'coat'): (34, 13.073963185711682),
(b'an', b'introduction'): (7, 10.895366455250176),
(b'preceding', b'evening'): (6, 48.9411997467553),
(b'an', b'agony'): (10, 15.202836914302573),
(b'catching', b'hold'): (10, 135.5294762217839),
(b'raised', b'up'): (35, 20.917452604163728),
(b'could', b'scarcely'): (17, 17.384072637319388),
(b'passed', b'along'): (11, 16.544168004280362),
(b'leaning', b'over'): (16, 33.66056062742769),
(b't', b'talk'): (19, 11.57051713815297),
(b'Camden', b'Place'): (29, 11505.506976744186),
(b'straight', b'forward'): (6, 11.997691337666117),
(b'same', b'hour'): (17, 12.921683411398027),
(b'-', b'glasses'): (16, 16.35444404375526),
(b'poring', b'over'): (6, 41.31068804275217),
(b'thirty', b'feet'): (8, 12.120752621435336),
(b'Colonel', b'Wallis'): (23, 967.3744677153037),
(b'Mrs', b'Wallis'): (11, 54.17854483332603),
(b'-', b'haired'): (38, 60.68359500446031),
(b'at', b'length'): (74, 17.88998495718584),
(b'carried', b'away'): (73, 68.93771731236751),
(b'greater', b'than'): (56, 48.182171075546755),
(b'Miss', b'Carteret'): (12, 320.09368530020697),
(b'contact', b'with'): (11, 11.025512569085398),
(b'Lady', b'Dalrymple'): (25, 1027.2774086378738),
(b'Laura', b'Place'): (7, 777.3991200502828),
(b'be', b'established'): (41, 13.256107901092253),
(b'Mrs', b'Smith'): (64, 111.99977591965032),
(b'Westgate', b'Buildings'): (7, 8589.180555555555),
(b'buried', b'him'): (40, 10.212015743068294),
(b'at', b'liberty'): (25, 14.03136075073399),
(b'human', b'nature'): (9, 43.51094068810244),
(b'five', b'thousand'): (31, 37.910597744077265),
(b'whose', b'names'): (9, 20.29106718070708),
(b'her', b'ladyship'): (21, 34.12236790377201),
(b'-', b'maker'): (21, 26.620220608017217),
(b'old', b'gentleman'): (31, 27.054312400201237),
(b'almost', b'entirely'): (13, 36.10558657179347),
(b'lower', b'part'): (8, 15.927465187754331),
(b'staring', b'at'): (33, 25.04597894006017),
(b'an', b'oath'): (37, 44.98731955716202),
(b'wiser', b'than'): (8, 29.373088249263795),
(b'prejudice', b'against'): (7, 39.177763699714916),
(b'both', b'sides'): (30, 99.67073029216844),
(b'my', b'soul'): (234, 16.443509373671976),
(b'rejoice', b'over'): (13, 10.116903194143386),
(b'same', b'instant'): (19, 25.162444780283654),
(b'every', b'one'): (375, 14.671392435905247),
(b'their', b'seats'): (11, 12.658554059033037),
(b'their', b'mouths'): (24, 26.497142818484406),
(b'short', b'silence'): (9, 19.427041120849434),
(b'-', b'blooded'): (7, 17.4695197740113),
(b'general', b'character'): (6, 10.833526031812768),
(b'fifty', b'pounds'): (6, 35.380799816923165),
(b'be', b'saved'): (61, 13.4097233300756),
(b'threw', b'himself'): (8, 10.02991247371238),
(b'some', b'moments'): (13, 21.006148097826088),
(b'exclaimed', b'Mrs'): (11, 12.149128235351897),
(b'compassion', b'on'): (20, 13.01248600742343),
(b'an', b'explanation'): (12, 16.343049682875264),
(b'our', b'hearts'): (17, 14.94420279348509),
(b'minutes', b'afterwards'): (7, 22.216989096657148),
(b'make', b'haste'): (26, 50.3846341860844),
(b"'", b'n'): (26, 22.533063472289214),
(b'n', b"'"): (20, 16.09504533734944),
(b'rising', b'sun'): (7, 13.06573846170098),
(b'-', b'faced'): (30, 42.60858481466171),
(b'an', b'atonement'): (66, 89.61132859823739),
(b'atonement', b'for'): (64, 24.315805643301744),
(b'"', b'Look'): (42, 10.09252327008739),
(b'Look', b'here'): (14, 26.311681865240885),
...}
In [18]:
"Jon lives in New York City".split()
Out[18]:
['Jon', 'lives', 'in', 'New', 'York', 'City']
In [19]:
# CODE HERE
bigram["Jon lives in New York City".split()]
Out[19]:
['Jon', 'lives', 'in', 'New_York', 'City']
In [20]:
lower_sents = []
for s in gberg_sents:
lower_sents.append([w.lower() for w in s if w not in list(string.punctuation)])
In [21]:
lower_sents[0:5]
Out[21]:
[['emma', 'by', 'jane', 'austen', '1816'],
['volume', 'i'],
['chapter', 'i'],
['emma',
'woodhouse',
'handsome',
'clever',
'and',
'rich',
'with',
'a',
'comfortable',
'home',
'and',
'happy',
'disposition',
'seemed',
'to',
'unite',
'some',
'of',
'the',
'best',
'blessings',
'of',
'existence',
'and',
'had',
'lived',
'nearly',
'twenty',
'one',
'years',
'in',
'the',
'world',
'with',
'very',
'little',
'to',
'distress',
'or',
'vex',
'her'],
['she',
'was',
'the',
'youngest',
'of',
'the',
'two',
'daughters',
'of',
'a',
'most',
'affectionate',
'indulgent',
'father',
'and',
'had',
'in',
'consequence',
'of',
'her',
'sister',
's',
'marriage',
'been',
'mistress',
'of',
'his',
'house',
'from',
'a',
'very',
'early',
'period']]
In [22]:
lower_bigram = Phraser(Phrases(lower_sents))
In [23]:
lower_bigram.phrasegrams # miss taylor, mr woodhouse, mr weston
Out[23]:
{(b'two', b'daughters'): (19, 11.080802900992637),
(b'her', b'sister'): (201, 16.93971298099339),
(b'very', b'early'): (25, 10.516998773665177),
(b'her', b'mother'): (253, 10.70812618607742),
(b'long', b'ago'): (38, 59.226442015336005),
(b'more', b'than'): (562, 28.529926612065935),
(b'had', b'been'): (1260, 21.583193129694834),
(b'an', b'excellent'): (58, 37.41859680854167),
(b'sixteen', b'years'): (15, 131.42913000977515),
(b'miss', b'taylor'): (48, 420.4340982546865),
(b'mr', b'woodhouse'): (132, 104.19907841850323),
(b'very', b'fond'): (30, 24.185726346489627),
(b'passed', b'away'): (25, 11.751473221742694),
(b'too', b'much'): (177, 30.36309017383541),
(b'did', b'not'): (977, 10.846196223896685),
(b'any', b'means'): (28, 14.294148100212627),
(b'after', b'dinner'): (22, 18.60737125272944),
(b'mr', b'weston'): (162, 91.63290824201266),
(b'five', b'years'): (42, 37.66428596665674),
(b'years', b'old'): (176, 48.599094446190286),
(b'seven', b'years'): (53, 50.3345604292756),
(b'each', b'other'): (239, 71.31277029783762),
(b'well', b'informed'): (8, 14.185028016786625),
(b'a', b'mile'): (49, 11.700110753652233),
(b'difference', b'between'): (44, 207.86784241868986),
(b'mrs', b'weston'): (249, 180.6778969011602),
(b'could', b'not'): (1059, 10.213333164207079),
(b'having', b'been'): (49, 10.723750443105281),
(b'sixteen', b'miles'): (6, 105.040625),
(b'miles', b'off'): (16, 32.99182066941624),
(b'at', b'hartfield'): (67, 25.555992478744276),
(b'her', b'husband'): (168, 26.67842743680748),
(b'in', b'spite'): (105, 13.346436370855669),
(b'emma', b'could'): (61, 10.88608805283633),
(b'every', b'body'): (148, 39.26110856993498),
(b'no', b'means'): (80, 26.766046928639682),
(b'able', b'to'): (349, 10.854471217684639),
(b'very', b'much'): (241, 15.43191856451234),
(b'have', b'been'): (986, 17.20622462716941),
(b'great', b'deal'): (182, 110.16914388000741),
(b'agree', b'with'): (26, 13.12648342622773),
(b'good', b'humoured'): (30, 149.07455772926625),
(b'for', b'ever'): (565, 10.477844500702485),
(b'three', b'times'): (41, 38.14441525690869),
(b'my', b'dear'): (340, 26.343257103019322),
(b'last', b'night'): (70, 23.230537949447992),
(b'doubt', b'whether'): (12, 19.564378663784144),
(b'anywhere', b'else'): (6, 15.30646630236794),
(b'i', b'am'): (2445, 16.330282814272387),
(b'very', b'glad'): (46, 16.952537612290158),
(b'am', b'sure'): (282, 60.9206736247954),
(b'very', b'pretty'): (40, 18.02785881936292),
(b'be', b'able'): (121, 10.915272046078586),
(b'immediately', b'afterwards'): (10, 37.530798337572115),
(b'mr', b'knightley'): (277, 179.5658713081734),
(b'sensible', b'man'): (17, 13.469145146927872),
(b'intimate', b'friend'): (6, 20.19472630173565),
(b'connected', b'with'): (31, 16.865113476644794),
(b'elder', b'brother'): (6, 14.417929653989134),
(b'than', b'usual'): (30, 27.96484669654346),
(b'brunswick', b'square'): (11, 2374.2341399607585),
(b'some', b'time'): (149, 11.678170286892607),
(b'poor', b'isabella'): (11, 43.036881601877866),
(b'am', b'afraid'): (65, 24.40430621044607),
(b'moonlight', b'night'): (6, 13.233464566929133),
(b'look', b'at'): (188, 10.167669372199262),
(b'vast', b'deal'): (11, 58.66522301228184),
(b'an', b'hour'): (155, 40.46461919025373),
(b'pretty', b'well'): (22, 13.991375757171793),
(b'tolerably', b'well'): (7, 13.779741502021295),
(b'miss', b'woodhouse'): (173, 272.89637286224394),
(b'you', b'please'): (94, 10.458014920259812),
(b'any', b'rate'): (47, 81.39444780766237),
(b'very', b'true'): (50, 13.110718065819091),
(b',"', b'said'): (2585, 35.208800203215695),
(b'my', b'dearest'): (20, 15.989439634668441),
(b'so', b'much'): (501, 16.689081989684137),
(b'much', b'less'): (40, 18.956871177506912),
(b'any', b'body'): (93, 20.81402827327549),
(b'has', b'been'): (266, 28.01515665997839),
(b'been', b'used'): (29, 13.60480451219398),
(b'dear', b'emma'): (33, 26.724453867413775),
(b'every', b'thing'): (258, 26.813583851480026),
(b'very', b'sorry'): (34, 20.451631026452166),
(b'turned', b'away'): (50, 18.47583754119931),
(b'divided', b'between'): (10, 32.86799217731421),
(b'how', b'much'): (142, 13.223145529540588),
(b'four', b'years'): (23, 17.08789651812486),
(b'years', b'ago'): (56, 157.92138920022722),
(b'any', b'thing'): (384, 34.60353767883287),
(b'oh', b'dear'): (22, 13.071010293556995),
(b'need', b'not'): (108, 12.811033130226706),
(b'ever', b'since'): (68, 42.54639154544724),
(b'leave', b'off'): (19, 10.456551581109641),
(b'match', b'making'): (6, 19.514707779641338),
(b'young', b'lady'): (73, 46.68694722651542),
(b'depend', b'upon'): (45, 79.94054343302217),
(b'more', b'likely'): (16, 10.63926988816206),
(b'have', b'done'): (272, 12.079550287766299),
(b',"', b'rejoined'): (6, 10.722990216928967),
(b'mr', b'elton'): (214, 139.40875576036868),
(b'any', b'longer'): (32, 15.647026370007673),
(b'very', b'well'): (211, 12.391112026636758),
(b'young', b'man'): (266, 24.28633425456424),
(b'dine', b'with'): (23, 13.166381552143315),
(b'much', b'better'): (40, 10.49785543468346),
(b'i', b'dare'): (138, 13.033485583230483),
(b'dare', b'say'): (115, 119.19759719744675),
(b'take', b'care'): (71, 74.57127010537992),
(b'chapter', b'ii'): (11, 279.33240997229916),
(b'entering', b'into'): (14, 14.764128843338215),
(b'never', b'seen'): (42, 13.026399306383945),
(b'mrs', b'churchill'): (59, 72.70198534025822),
(b'refrain', b'from'): (10, 13.332613486117234),
(b'at', b'once'): (270, 18.772289267502124),
(b'three', b'years'): (80, 36.12160535692111),
(b'mother', b's'): (212, 10.43349082727472),
(b'twenty', b'years'): (71, 82.70010934937123),
(b'according', b'to'): (792, 11.428224398751397),
(b'had', b'begun'): (25, 11.498896738107863),
(b'passed', b'through'): (45, 29.981417521047756),
(b'its', b'being'): (58, 15.035551583602627),
(b'deal', b'better'): (14, 18.324491856239064),
(b'belonging', b'to'): (36, 10.007375856554956),
(b'mr', b'frank'): (50, 51.13385894796941),
(b'frank', b'churchill'): (151, 1615.1350106048417),
(b'mrs', b'perry'): (11, 23.552718142359424),
(b'miss', b'bates'): (113, 368.52784388923357),
(b'a', b'few'): (452, 11.993654742645493),
(b'few', b'days'): (53, 34.43889574044208),
(b'i', b'suppose'): (210, 11.320989637575268),
(b'very', b'handsome'): (21, 18.293203927526704),
(b'an', b'irresistible'): (7, 10.743649616890554),
(b'good', b'sense'): (28, 15.484518577039912),
(b'had', b'already'): (64, 11.161348479021472),
(b'long', b'enough'): (39, 14.424561747959498),
(b'at', b'randalls'): (39, 24.914802703291915),
(b'few', b'weeks'): (19, 130.17082616179002),
(b'no', b'longer'): (117, 37.27164862774777),
(b'mr', b'perry'): (36, 95.9153455928979),
(b'chapter', b'iii'): (10, 294.85087719298247),
(b'donwell', b'abbey'): (9, 737.1720986902222),
(b'card', b'table'): (7, 52.05489116407124),
(b'drawing', b'room'): (49, 219.77986922924512),
(b'thrown', b'away'): (11, 13.991597804637394),
(b'mrs', b'goddard'): (58, 292.6791161249692),
(b'an', b'invitation'): (13, 13.178876863385744),
(b'mrs', b'bates'): (30, 54.6665437867962),
(b'those', b'who'): (174, 14.383248820933867),
(b'as', b'possible'): (81, 10.515284953239568),
(b'young', b'ladies'): (47, 111.39207280187263),
(b'old', b'fashioned'): (38, 181.0280072171398),
(b'coming', b'back'): (15, 10.421002047913248),
(b'goddard', b's'): (34, 30.051494868668044),
(b'found', b'herself'): (27, 10.853751672615102),
(b's', b'sake'): (143, 27.30490695465855),
(b'much', b'pleased'): (18, 12.52905813538587),
(b'miss', b'smith'): (58, 148.87785876781652),
(b'harriet', b'smith'): (31, 171.76079313432186),
(b'several', b'years'): (10, 16.071496876195333),
(b'pretty', b'girl'): (10, 36.100742261151815),
(b'blue', b'eyes'): (28, 33.95451351272216),
(b'due', b'time'): (18, 19.500628497333533),
(b'its', b'own'): (54, 10.339536241447126),
(b'an', b'egg'): (7, 16.473596079232184),
(b'better', b'than'): (175, 40.46903672973475),
(b'body', b'else'): (31, 37.603287294006286),
(b'much', b'more'): (163, 10.196735265430883),
(b'little', b'girl'): (54, 33.82044741555345),
(b'at', b'last'): (512, 25.084119133118794),
(b'chapter', b'iv'): (8, 252.72932330827066),
(b'every', b'respect'): (14, 10.88373348044036),
(b'guided', b'by'): (14, 22.059348262898627),
(b'different', b'sort'): (8, 13.93659398034398),
(b'abbey', b'mill'): (11, 1868.3499742665979),
(b'good', b'deal'): (62, 34.988675608221904),
(b'very', b'happy'): (45, 10.609415283099846),
(b'mrs', b'martin'): (8, 10.982447223684451),
(b'drink', b'tea'): (7, 29.38199300699301),
(b'large', b'enough'): (11, 10.328337316490268),
(b'had', b'taken'): (121, 10.546895054673817),
(b'mr', b'martin'): (37, 92.3345987295168),
(b'three', b'miles'): (9, 15.396834283387623),
(b'thing', b'else'): (26, 11.724101848622253),
(b'very', b'obliging'): (14, 23.82930511612031),
(b'on', b'purpose'): (36, 10.243903158363866),
(b'very', b'clever'): (15, 19.348581077191703),
(b'miss', b'nash'): (13, 312.88118939883645),
(b'does', b'not'): (218, 11.755881647050003),
(b'oh', b'yes'): (33, 23.31216733177377),
(b'very', b'entertaining'): (7, 15.091893240209531),
(b'soon', b'as'): (277, 10.47893531442171),
(b'have', b'seen'): (204, 12.7295316573734),
(b'on', b'horseback'): (21, 51.189475020093916),
(b'their', b'families'): (95, 33.21807400900406),
(b'no', b'doubt'): (125, 34.5945564760134),
(b'very', b'respectable'): (9, 10.061262160139687),
(b'respectable', b'young'): (8, 26.124093264248707),
(b'very', b'odd'): (20, 23.337979237437413),
(b'perfectly', b'right'): (12, 15.182023486901535),
(b'six', b'years'): (23, 29.166464467922708),
(b'years', b'hence'): (10, 17.92215409224207),
(b'young', b'woman'): (57, 28.12531780001931),
(b'very', b'desirable'): (9, 13.719902945645027),
(b'dear', b'miss'): (39, 23.615690866510537),
(b'thirty', b'years'): (36, 70.51678321678321),
(b'can', b'afford'): (11, 24.00055535354203),
(b'good', b'luck'): (24, 50.83824661023695),
(b'acquainted', b'with'): (88, 25.94043153278337),
(b'harriet', b's'): (91, 10.391255619709803),
(b'next', b'day'): (103, 31.552600854623535),
(b'an', b'opportunity'): (36, 39.48568132393281),
(b'few', b'yards'): (15, 121.49277108433735),
(b'robert', b'martin'): (31, 1822.180470288428),
(b'few', b'minutes'): (86, 306.2525246213068),
(b'been', b'able'): (40, 15.563975922210217),
(b'should', b'happen'): (13, 19.56851425106125),
(b'compared', b'with'): (28, 15.206045360968936),
(b'well', b'bred'): (15, 56.08034332217969),
(b'an', b'old'): (175, 10.093145123027886),
(b'old', b'man'): (225, 11.391492291491135),
(b'more', b'valuable'): (10, 16.92611118571237),
(b',"', b'replied'): (256, 67.14687428979221),
(b'very', b'bad'): (37, 14.502720230831981),
(b'deal', b'too'): (15, 11.983244206773618),
(b'no', b'more'): (597, 15.083521866761401),
(b'good', b'humour'): (28, 64.86757782273477),
(b'very', b'agreeable'): (21, 20.122524320279375),
(b'fixed', b'on'): (32, 10.217254086671971),
(b'same', b'time'): (104, 17.450483557641775),
(b'pleasing', b'young'): (8, 22.392079940784605),
(b'chapter', b'v'): (7, 186.22160664819944),
(b'very', b'differently'): (14, 45.27567972062859),
(b'twelve', b'years'): (25, 44.13664813761106),
(b'very', b'neatly'): (7, 21.559847486013613),
(b'ten', b'years'): (32, 33.477231228905),
(b'being', b'able'): (20, 13.768546669336779),
(b'have', b'spoken'): (82, 11.097204476268063),
(b'yes', b',"'): (117, 21.3190811413499),
(b'thank', b'you'): (105, 18.729348571046483),
(b'could', b'possibly'): (21, 29.277491471292734),
(b'grown', b'up'): (21, 13.110499863891405),
(b'any', b'harm'): (11, 11.88333382435259),
(b'an', b'angel'): (58, 20.854313507945204),
(b'excuse', b'me'): (31, 17.305246066110463),
(b'an', b'end'): (129, 17.098710216167333),
(b'many', b'years'): (55, 18.58677200012534),
(b',"', b'cried'): (297, 33.94160588989982),
(b'much', b'obliged'): (41, 42.51023467141318),
(b'mrs', b'john'): (39, 22.74667029354493),
(b'john', b'knightley'): (58, 169.26886715265277),
(b'be', b'satisfied'): (68, 11.946063802781145),
(b'ill', b'humour'): (6, 26.63236200369751),
(b'i', b'assure'): (105, 12.733255394756087),
(b'assure', b'you'): (126, 28.436400120257016),
(b'soon', b'afterwards'): (38, 78.59594937705434),
(b'chapter', b'vi'): (6, 126.36466165413533),
(b'most', b'agreeable'): (13, 26.484307288246878),
(b'no', b'scruple'): (10, 22.498995969001474),
(b'infinitely', b'superior'): (7, 270.2854590127318),
(b'am', b'glad'): (34, 16.09346987631334),
(b'very', b'interesting'): (15, 16.768770266899477),
(b'no', b'sooner'): (40, 38.53551437243869),
(b'don', b't'): (830, 258.7786954087346),
(b't', b'pretend'): (9, 21.455106382978723),
(b'why', b'should'): (100, 17.72050499445061),
(b'cannot', b'imagine'): (13, 46.70634553033812),
(b'back', b'again'): (74, 17.518775026164434),
(b'an', b'artist'): (10, 15.443996324280171),
(b'higher', b'than'): (34, 44.04157170252713),
(b'ten', b'times'): (17, 32.73329275715155),
(b'mr', b'john'): (33, 14.765003852281561),
(b'must', b'allow'): (12, 15.128011144449205),
(b'sitting', b'down'): (24, 16.50978251971512),
(b'must', b'confess'): (10, 11.475103270125821),
(b'depended', b'on'): (14, 19.196053132535216),
(b'after', b'breakfast'): (11, 10.176820020576768),
(b'sooner', b'than'): (12, 15.493715063361526),
(b'at', b'home'): (158, 14.817173413852018),
(b'at', b'least'): (318, 40.005129511612694),
(b'yes', b'indeed'): (18, 13.32791435368755),
(b'replied', b'emma'): (16, 15.97752469115056),
(b'can', b'hardly'): (33, 23.71390376951493),
(b'am', b'persuaded'): (12, 13.9008842237933),
(b'beg', b'your'): (40, 41.769363228062694),
(b'your', b'pardon'): (42, 35.83165749517416),
(b'tell', b'me'): (198, 10.695563383577781),
(b'entered', b'into'): (99, 56.41577688104846),
(b'older', b'than'): (15, 56.23201606007659),
(b'run', b'away'): (47, 37.8745366554535),
(b'have', b'borne'): (26, 10.466681494661922),
(b'good', b'opinion'): (19, 13.341832563897801),
(b'good', b'natured'): (66, 159.13709037599173),
(b'emma', b'felt'): (19, 16.14051174170535),
(b'no', b'difficulty'): (16, 11.983675747699731),
(b'let', b'us'): (399, 31.76961003819016),
(b'cried', b'emma'): (27, 13.884714320584314),
(b'bond', b'street'): (8, 172.37435897435898),
(b'some', b'weeks'): (10, 11.200682740440154),
(b'next', b'morning'): (69, 78.14689256415173),
(b'without', b'ceremony'): (6, 10.070807949665435),
(b'dear', b'sir'): (24, 14.956604215456675),
(b'sat', b'down'): (150, 55.2769805500231),
(b'depends', b'upon'): (8, 17.98662227242999),
(b'has', b'happened'): (14, 15.991242753687029),
(b'presently', b'added'): (6, 18.897863568215893),
(b'could', b'afford'): (11, 15.53959162707076),
(b'does', b'seem'): (9, 12.372250907417822),
(b'few', b'moments'): (43, 372.31655654877574),
(b'nobody', b'knows'): (7, 29.65198853194148),
(b'very', b'likely'): (33, 27.439805891290053),
(b'good', b'tempered'): (10, 28.20329470553686),
(b'all', b'probability'): (16, 12.433560319461959),
(b'no', b'harm'): (25, 22.746237682946543),
(b'cannot', b'help'): (16, 18.923655657158456),
(b'very', b'different'): (29, 16.46388353477403),
(b'common', b'sense'): (30, 161.56643879512924),
(b'an', b'hundred'): (186, 29.194395140414738),
(b'every', b'man'): (327, 11.99741282796895),
(b'less', b'than'): (90, 36.5391922539134),
(b'large', b'fortune'): (9, 31.79160591133005),
(b'no', b'use'): (43, 12.936922682175846),
(b'these', b'words'): (121, 23.297961459941245),
(b'twenty', b'thousand'): (49, 74.70393983612077),
(b'thousand', b'pounds'): (48, 447.518052808119),
(b'walked', b'off'): (15, 10.460621861989791),
(b'cast', b'down'): (44, 13.997424310193253),
(b'its', b'effects'): (8, 41.35193876156195),
(b'deal', b'more'): (29, 10.514102007124858),
(b'longer', b'than'): (32, 18.0629530688156),
(b'perfectly', b'satisfied'): (12, 88.02506546950993),
(b'three', b'hundred'): (78, 46.95431447257635),
(b'well', b'known'): (41, 14.092917445249052),
(b'destin', b'd'): (7, 66.58969843715606),
(b'looking', b'at'): (107, 11.578307624593487),
(b'next', b'moment'): (22, 24.112003750805837),
(b'ready', b'wit'): (8, 64.00444303395747),
(b'very', b'pleasant'): (20, 11.433252454704187),
(b'an', b'idea'): (37, 13.222953374634528),
(b'arrive', b'at'): (10, 10.99182472204055),
(b'nobody', b'could'): (24, 14.343345036944983),
(b'have', b'chosen'): (38, 11.818665160781638),
(b'without', b'exception'): (8, 49.91617853312434),
(b'her', b'cheeks'): (14, 10.713032768823975),
(b'sit', b'down'): (61, 35.66504947879112),
(b'reason', b'why'): (21, 17.06388019290972),
(b'could', b'hardly'): (47, 23.030990847938746),
(b'an', b'offering'): (71, 10.631590689986872),
(b'can', b'easily'): (12, 13.582403838136347),
(b'dear', b'mother'): (25, 13.628181044152825),
(b'those', b'things'): (86, 14.861480769860593),
(b'next', b'week'): (13, 53.87957922858575),
(b'taken', b'away'): (75, 32.39398354768761),
(b'stay', b'longer'): (10, 35.24360408220327),
(b'three', b'days'): (100, 36.85541434763652),
(b'cannot', b'bear'): (17, 14.75548726953468),
(b'o', b'clock'): (67, 157.91912099014903),
(b'ask', b'whether'): (11, 13.90355382441143),
(b'ran', b'away'): (24, 14.924370991613223),
(b'who', b'lived'): (27, 11.87499040954371),
(b'never', b'mind'): (39, 14.649399814275704),
(b'good', b'fortune'): (20, 17.88894692751195),
(b'jane', b'fairfax'): (111, 878.265806663654),
(b'nothing', b'else'): (45, 30.0085928868752),
(b'present', b'instance'): (6, 15.177453341360627),
(b'once', b'more'): (141, 21.758506838947973),
(b'still', b'greater'): (11, 11.400499332652899),
(b'here', b'comes'): (20, 16.697963238946844),
(b'turned', b'back'): (42, 27.419992871324965),
(b'will', b'bring'): (145, 11.542472822887643),
(b'each', b'side'): (37, 20.499637888317135),
(b'still', b'remained'): (10, 12.13368361944489),
(b'she', b'hoped'): (30, 15.572051565559544),
(b'ten', b'minutes'): (42, 194.73592734674708),
(b'most', b'favourable'): (6, 11.684253215403034),
(b'ten', b'days'): (21, 15.98003268459367),
(b'little', b'ones'): (53, 58.130349583162364),
(b'mr', b'wingfield'): (9, 102.72224108658745),
(b'passed', b'over'): (52, 21.874777302895755),
(b'yes', b'sir'): (25, 17.0480135249366),
(b'sir', b',"'): (121, 14.215621316157259),
(b'cannot', b'deny'): (9, 32.95392156862745),
(b'talking', b'about'): (25, 19.032959773504775),
(b'never', b'forget'): (18, 19.811797243380486),
(b'cannot', b'tell'): (30, 16.07682643742666),
(b'two', b'years'): (53, 11.95816852658443),
(b'indeed', b'!--'): (21, 14.904334331005433),
(b'dear', b'madam'): (15, 86.09887295081967),
(b'madam', b",'"): (7, 23.782783018867924),
(b'most', b'amiable'): (9, 20.908663648615956),
(b',"', b'observed'): (18, 11.457441601650128),
(b'five', b'times'): (10, 11.19749042251957),
(b'our', b'lives'): (16, 17.148517657729155),
(b'think', b'differently'): (6, 11.912463083284111),
(b'grow', b'up'): (20, 10.504597456351945),
(b'shake', b'hands'): (15, 50.53995355597508),
(b'how', b'long'): (106, 12.747548678067647),
(b'perfectly', b'convinced'): (8, 72.51995685005393),
(b'tells', b'me'): (15, 12.055904371165493),
(b'bad', b'cold'): (7, 12.931807252091948),
(b'far', b'off'): (83, 31.653885985340363),
(b'am', b'sorry'): (32, 25.55609823252787),
(b'mrs', b'campbell'): (9, 25.553929487788654),
(b'ah', b'!"'): (15, 17.216148693638065),
(b'an', b'interval'): (11, 15.772591990754218),
(b'perfectly', b'well'): (16, 10.91605036682757),
(b'ill', b'judged'): (6, 16.56130074179509),
(b'can', b'tell'): (52, 10.59868410018371),
(b'morrow', b'morning'): (14, 21.336820698491813),
(b'own', b'feelings'): (16, 10.440997949569127),
(b'sore', b'throat'): (9, 237.47777450691785),
(b'well', b'satisfied'): (17, 14.614877350628644),
(b'looked', b'at'): (184, 11.997174544178405),
(b'well', b'pleased'): (26, 18.61784927203245),
(b'set', b'forward'): (22, 28.094228510556935),
(b'eldest', b'daughter'): (10, 81.36551909628834),
(b'short', b'time'): (23, 10.057824159586259),
(b',"', b'continued'): (103, 40.15998883792505),
(b'dining', b'room'): (18, 272.7268377253814),
(b'enter', b'into'): (110, 32.02961835848166),
(b'half', b'hour'): (17, 17.306134513071832),
(b'gone', b'through'): (24, 10.232695965811631),
(b'turn', b'away'): (53, 24.3249510219842),
(b'own', b'sake'): (17, 10.579228284294224),
(b'an', b'effort'): (8, 12.355197059424137),
(b',"', b'repeated'): (29, 15.364284489928071),
(b'several', b'times'): (19, 99.00042075736326),
(b'great', b'curiosity'): (13, 12.09281966721631),
(b'upper', b'end'): (11, 46.569735221674875),
(b'an', b'odd'): (25, 25.47463311221472),
(b'dearest', b'emma'): (8, 38.814087759815244),
(b'continued', b'mrs'): (17, 12.451436979718673),
(b'go', b'home'): (37, 10.007173127288121),
(b'judge', b'between'): (11, 14.60799652325076),
(b'hardly', b'knew'): (16, 28.982276298141407),
(b'set', b'off'): (42, 12.027500784416544),
(b'got', b'home'): (12, 12.614899339653833),
(b'most', b'extraordinary'): (16, 42.018372140007074),
(b'an', b'inch'): (28, 63.14878497039003),
(b'at', b'ease'): (36, 15.972495299215172),
(b'three', b'quarters'): (8, 46.190502850162865),
(b'smith', b'!--'): (9, 17.971662805204062),
(b'extremely', b'sorry'): (8, 71.27550459436111),
(b'many', b'weeks'): (10, 19.402314036865533),
(b'madam', b',"'): (14, 12.063363994045087),
(b'extremely', b'well'): (16, 22.29075831209327),
(b'without', b'knowing'): (19, 31.149243193151232),
(b'poor', b'harriet'): (15, 13.183234482788675),
(b'an', b'instant'): (99, 42.54170416065454),
(b'thirty', b'thousand'): (18, 40.06847682119205),
(b'somebody', b'else'): (17, 161.2778888444622),
(b'worth', b'having'): (9, 20.026280947967695),
(b'poor', b'girl'): (16, 25.65660249342719),
(b'laugh', b'at'): (34, 10.684790511927686),
(b'knowing', b'what'): (22, 10.324563551972247),
(b'many', b'days'): (50, 13.4746779374137),
(b'whole', b'party'): (15, 20.8072769791595),
(b'six', b'weeks'): (6, 16.914565278166062),
(b'too', b'late'): (56, 82.78464324520101),
(b'her', b'companions'): (37, 11.248284965327484),
(b'drew', b'near'): (34, 127.06472006778334),
(b'three', b'months'): (36, 79.13813268974364),
(b'other', b'side'): (133, 27.098664528007962),
(b'an', b'unnatural'): (10, 18.169407440329614),
(b'get', b'rid'): (18, 208.69330573907504),
(b'watering', b'place'): (9, 80.21132175526978),
(b'while', b'ago'): (10, 12.961199989717313),
(b'at', b'weymouth'): (16, 40.30335731414868),
(b'present', b'occasion'): (9, 30.82190524707081),
(b'their', b'hearts'): (50, 16.688506081078597),
(b'break', b'through'): (12, 10.134403108825648),
(b'burst', b'forth'): (18, 46.35182980729566),
(b'young', b'men'): (142, 26.25825955394037),
(b'nobody', b'else'): (21, 79.07196941851062),
(b'something', b'else'): (35, 35.09762973791375),
(b'walking', b'together'): (9, 11.011028608866564),
(b'burst', b'out'): (19, 10.336378346321213),
(b'mrs', b'cole'): (30, 133.52975449561694),
(b'mr', b'cole'): (23, 75.77870244092516),
(b'miss', b'fairfax'): (125, 253.16112835175565),
(b'extremely', b'happy'): (7, 17.871069693255155),
(b'ma', b'am'): (216, 180.33443656689536),
(b's', b'handwriting'): (7, 11.116226942015139),
(b'without', b'seeming'): (8, 22.511217769840382),
(b'colonel', b'campbell'): (28, 852.6827205882353),
(b'those', b'days'): (84, 22.563533110519458),
(b'mrs', b'dixon'): (14, 66.64348656190336),
(b'mr', b'dixon'): (22, 99.22034650409013),
(b'miss', b'campbell'): (12, 69.78477508650519),
(b'caught', b'hold'): (10, 22.46057555238774),
(b'four', b'months'): (12, 35.22349653606567),
(b'may', b'guess'): (11, 12.384068773773176),
(b'running', b'away'): (12, 10.939329522648434),
(b'five', b'minutes'): (37, 138.27274480256534),
(b'nine', b'years'): (9, 20.394175346344422),
(b'hundred', b'pounds'): (11, 61.54765860771082),
(b'rather', b'than'): (78, 18.774125510292563),
(b'few', b'months'): (17, 56.654660091141764),
(b'she', b'wished'): (41, 13.111156859133413),
(b'without', b'feeling'): (12, 12.941231471550754),
(b'ill', b'health'): (7, 24.48192283569709),
(b'twelve', b'thousand'): (24, 56.813511910645445),
(b'mr', b'churchill'): (19, 14.856522471200662),
(b'passed', b'between'): (17, 18.633822336745062),
(b",'", b'said'): (252, 29.904409319301955),
(b'miss', b'hawkins'): (18, 330.480756302521),
(b'dear', b'jane'): (15, 27.27885083590326),
(b'three', b'minutes'): (10, 10.22196466946896),
(b'have', b'suffered'): (30, 12.671527233246879),
(b'hour', b'ago'): (10, 34.58206634811038),
(b'ford', b's'): (10, 13.291140908931144),
(b'looked', b'round'): (26, 11.112143696738173),
(b'help', b'thinking'): (10, 29.995538104586824),
(b'can', b't'): (299, 33.88376276641461),
(b'human', b'nature'): (10, 30.58786058786059),
(b'brown', b's'): (105, 12.276957466080978),
(b'laughed', b'at'): (28, 11.57917413466867),
(b'weeks', b'ago'): (7, 64.64729137728389),
(b'twenty', b'miles'): (8, 30.043399099576273),
(b'elder', b'sister'): (6, 19.20011424219345),
(b'driven', b'away'): (10, 13.8188620292715),
(b'setting', b'off'): (8, 15.77691050691885),
(b'little', b'farther'): (16, 12.063393148450245),
(b'spot', b'where'): (18, 30.983262494042016),
(b'front', b'door'): (16, 47.78138820286026),
(b'they', b'parted'): (23, 10.79338518624233),
(b'without', b'delay'): (8, 16.401030089455137),
(b'six', b'months'): (21, 137.41988785577402),
(b'months', b'ago'): (7, 32.82608577706644),
(b'leaned', b'back'): (6, 11.972573463935888),
(b'ill', b'disposed'): (6, 23.461842717543043),
(b'at', b'oxford'): (10, 13.739780902550686),
(b'turned', b'round'): (21, 11.459329950969135),
(b'pass', b'through'): (64, 20.610161604147784),
(b'clock', b'struck'): (16, 271.6146594294576),
(b'"\'', b'tis'): (7, 65.71456500488759),
(b'four', b'hours'): (15, 44.143732671822555),
(b'parlour', b'door'): (6, 13.546649351812462),
(b'faster', b'than'): (10, 38.52934433745988),
(b'musical', b'society'): (6, 97.72799224680989),
(b'worth', b'while'): (16, 32.809013467947075),
(b'kind', b'hearted'): (6, 22.299646174259177),
(b'mixed', b'with'): (29, 12.31482453633305),
(b'extremely', b'glad'): (10, 69.64820996891908),
(b'knew', b'nothing'): (25, 11.012511694448186),
(b'make', b'amends'): (9, 63.131237772270765),
(b'amends', b'for'): (12, 12.75831653909067),
(b'oftener', b'than'): (9, 48.954931628772556),
(b'old', b'woman'): (61, 16.854605854208888),
(b'just', b'going'): (25, 11.441554905030975),
(b'their', b'lives'): (30, 13.550496390619374),
(b'six', b'days'): (24, 24.79915003775213),
(b'may', b'prove'): (10, 12.110271334897238),
(b'stronger', b'than'): (29, 55.48225584594223),
(b'particular', b'friend'): (10, 18.111862154023),
(b'good', b'tidings'): (14, 28.459688293769013),
(b'among', b'themselves'): (31, 14.391206579513122),
(b'next', b'summer'): (9, 24.61739395788831),
(b'on', b'tuesday'): (9, 11.26168450442066),
(b'breaking', b'up'): (13, 11.204903953442072),
(b'after', b'tea'): (9, 10.09972289920876),
(b'perfectly', b'safe'): (6, 15.182023486901535),
(b'two', b'ladies'): (13, 10.349582404988046),
(b'same', b'moment'): (29, 15.167877661019073),
(b'mr', b'cox'): (13, 82.17779286926995),
(b',"', b'added'): (51, 21.445980433857933),
(b'little', b'girls'): (15, 21.933442088091358),
(b'be', b'ashamed'): (88, 14.839124152298211),
(b'been', b'staying'): (9, 13.648409654861268),
(b'shut', b'up'): (64, 31.307093975641568),
(b'too', b'large'): (19, 13.11269940787182),
(b'an', b'elderly'): (7, 12.355197059424137),
(b'worse', b'than'): (59, 50.38186909777266),
(b'opposite', b'side'): (9, 20.550379566258513),
(b'short', b'pause'): (11, 83.26155963302752),
(b'large', b'party'): (8, 13.46015572858732),
(b'who', b'knows'): (35, 23.27214961035764),
(b'extremely', b'fond'): (6, 32.5917905623788),
(b'five', b'couple'): (8, 45.27680909975305),
(b'mr', b'william'): (9, 12.681758158837956),
(b'bow', b'window'): (8, 25.073932863655198),
(b'bad', b'news'): (9, 59.57111209570226),
(b'baked', b'apples'): (6, 587.9825072886297),
(b'mrs', b'wallis'): (14, 79.25171374929049),
(b'will', b'send'): (73, 14.065286062406157),
(b'william', b'larkins'): (13, 4596.649572649572),
(b'low', b'voice'): (39, 51.584448802114416),
(b'one', b'leg'): (18, 10.201187499351517),
(b'an', b'immediate'): (12, 13.305596833225994),
(b',"', b'resumed'): (18, 27.87977456401531),
(b'many', b'times'): (21, 13.08508748808824),
(b'few', b'words'): (18, 11.206759868233577),
(b'no', b'objection'): (18, 27.457958345842616),
(b'astonished', b'at'): (22, 12.457401351645954),
(b'four', b'times'): (13, 16.708165484388754),
(b'c', b'.,'): (6, 484.8028846153846),
(b'few', b'hours'): (23, 74.55238225629792),
(b'an', b'extraordinary'): (14, 10.691997455270888),
(b'immediately', b'followed'): (7, 11.60491790701243),
(b'wait', b'till'): (20, 30.635873858181593),
(b'good', b'bye'): (45, 141.49449547184594),
(b'contrast', b'between'): (12, 157.76636245110822),
(b'dared', b'not'): (22, 11.170638889884433),
(b'three', b'weeks'): (11, 30.16522635112677),
(b'self', b'command'): (15, 129.45780554604588),
(b'mrs', b'elton'): (142, 115.93850995531123),
(b'maple', b'grove'): (31, 6513.823602484473),
(b'mr', b'suckling'): (10, 67.58042176749174),
(b'almost', b'fancy'): (9, 14.733568732497261),
(b'left', b'behind'): (27, 27.450387913434056),
(b'barouche', b'landau'): (7, 17286.685714285715),
(b'whose', b'name'): (60, 27.214399799798326),
(b'mr', b'e'): (10, 15.658878214418817),
(b'e', b'.,'): (6, 189.19136960600375),
(b'good', b'breeding'): (8, 36.8301848507599),
(b'greater', b'part'): (10, 15.073845233942896),
(b'drew', b'back'): (11, 14.116134651802135),
(b'third', b'time'): (23, 12.45550979515326),
(b'very', b'extraordinary'): (13, 11.609148646315022),
(b'better', b'acquainted'): (7, 12.586519658830872),
(b'have', b'committed'): (34, 16.059987478581785),
(b'drawing', b'rooms'): (8, 127.1079831932773),
(b'hardly', b'less'): (8, 12.215670175249683),
(b'will', b'shew'): (48, 11.000022831745259),
(b'little', b'boys'): (17, 15.38189445138874),
(b'post', b'office'): (12, 378.61952440550687),
(b'easily', b'believe'): (8, 20.680209638828753),
(b'put', b'forth'): (41, 11.449099051744076),
(b'mrs', b'bragge'): (6, 46.544657281329336),
(b'drawing', b'near'): (8, 17.91737739872068),
(b'great', b'joy'): (26, 10.054550288918538),
(b'spread', b'abroad'): (14, 187.66563275434243),
(b'few', b'lines'): (10, 41.41799014238773),
(b'good', b'news'): (18, 22.238991398956113),
(b'most', b'likely'): (12, 18.05748224198651),
(b'talk', b'about'): (37, 21.439425951764),
(b'tells', b'us'): (12, 28.49862729793018),
(b'sixty', b'five'): (6, 24.217828123123724),
(b'eleven', b'years'): (6, 15.563975922210217),
(b'your', b'sister'): (93, 17.156751175996334),
(b'two', b'hours'): (18, 15.059818488167268),
(b'two', b'months'): (20, 19.80765285410601),
(b'twenty', b'four'): (17, 24.690562341866855),
(b'door', b'opened'): (19, 31.97009247027741),
(b'began', b'talking'): (9, 12.980428544611012),
(b'mean', b'?"'): (59, 24.672830374194895),
(b'pretty', b'soon'): (13, 15.281868324424673),
(b'many', b'hours'): (12, 12.099988572081596),
(b'few', b'steps'): (8, 21.356151167168672),
(b'most', b'excellent'): (11, 13.620500891098393),
(b'surrounded', b'by'): (19, 28.432048872180452),
(b'later', b'than'): (9, 15.131524321620608),
(b'whole', b'story'): (18, 33.811825091134196),
(b'another', b'minute'): (9, 10.47206452506231),
(b'whole', b'history'): (9, 21.865274113693037),
(b'lined', b'with'): (12, 12.665905060395177),
(b'court', b'plaister'): (9, 660.5174672489082),
(b'these', b'things'): (366, 38.76461305007588),
(b'laid', b'down'): (26, 11.76806952449668),
(b'forty', b'years'): (68, 158.55386535221814),
(b'faint', b'smile'): (6, 23.553176580504516),
(b'turned', b'towards'): (11, 10.359757315632365),
(b'totally', b'different'): (7, 152.78636363636363),
(b'box', b'hill'): (18, 162.97043283674864),
(b'some', b'surprise'): (19, 20.08802185605737),
(b'may', b'depend'): (9, 14.385534434180961),
(b',"', b'interrupted'): (25, 29.580662667390254),
(b'whatever', b'else'): (9, 19.41965768758576),
(b'larger', b'than'): (11, 18.09203994976377),
(b'were', b'assembled'): (17, 17.322526847214835),
(b'insisted', b'on'): (9, 10.828542792712174),
(b'clothed', b'with'): (37, 11.957873100497311),
(b'twenty', b'minutes'): (8, 15.956660102679514),
(b'quite', b'alone'): (14, 10.13378072065835),
(b'etc', b'.,'): (11, 3723.286153846154),
(b',"', b'whispered'): (18, 25.73517652062952),
(b'shan', b't'): (20, 201.14162234042553),
(b'looking', b'round'): (23, 16.13132441247656),
(b',"', b'answered'): (143, 22.031354589620804),
(b'yes', b'yes'): (31, 34.423873463814296),
(b'old', b'age'): (51, 59.17756524475554),
(b'an', b'infant'): (10, 23.75999434504642),
(b'be', b'forgiven'): (36, 17.634665613629313),
(b'lie', b'down'): (41, 29.047286538446148),
(b'mrs', b'smallridge'): (7, 65.16252019386107),
(b'four', b'miles'): (7, 15.174408105939005),
(b'great', b'hurry'): (16, 18.475141158247137),
(b'without', b'waiting'): (12, 17.062642768222968),
(b'comes', b'back'): (13, 14.131562121366947),
(b'heightened', b'by'): (7, 10.935403412377099),
(b'cut', b'off'): (217, 148.51792396902442),
(b'trembling', b'voice'): (7, 11.589135556534565),
(b'time', b'past'): (23, 12.30314881906576),
(b'second', b'time'): (44, 18.990808144462072),
(b'five', b'hundred'): (67, 84.28815897682486),
(b'an', b'arrow'): (10, 13.727996732693487),
(b'presented', b'themselves'): (6, 11.270705264334413),
(b'at', b'random'): (13, 22.939460289475928),
(b'far', b'distant'): (12, 33.91663463386507),
(b'few', b'seconds'): (9, 91.11957831325302),
(b'passing', b'through'): (12, 17.888241729000978),
(b'domestic', b'happiness'): (6, 55.01309328968903),
(b'western', b'sun'): (7, 33.125321653435535),
(b'rose', b'early'): (12, 38.936482399124785),
(b'east', b'wind'): (22, 116.45940284649961),
(b'gone', b'mad'): (10, 19.017256011315414),
(b'freed', b'from'): (11, 28.569886041679784),
(b'sinned', b'against'): (43, 77.5224293267156),
(b'locked', b'up'): (11, 12.67923342100024),
(b'deep', b'sigh'): (7, 42.620033812341504),
(b'ten', b'thousand'): (82, 129.36153942271648),
(b'happier', b'than'): (11, 25.476546051708166),
(b'nay', b'nay'): (7, 32.48591908507611),
(b'had', b'formerly'): (10, 11.277764108528865),
(b'little', b'boy'): (67, 21.623428772358615),
(b'fancying', b'herself'): (6, 25.962667353244075),
(b'right', b'hand'): (199, 41.80066296107419),
(b'infinitely', b'more'): (8, 12.077009062238014),
(b'such', b'cases'): (8, 12.891349431818181),
(b'poor', b'fellow'): (38, 75.64730219711522),
(b'days', b'ago'): (11, 14.965593436946037),
(b'help', b'laughing'): (7, 15.772934643760266),
(b'draw', b'near'): (18, 72.06547373629091),
(b'at', b'intervals'): (34, 32.97547416612164),
(b'into', b'temptation'): (8, 11.357022187183242),
(b'sir', b'walter'): (136, 503.2346285714286),
(b'walter', b'elliot'): (16, 153.5265051903114),
(b'kellynch', b'hall'): (25, 1284.9824784963364),
(b'charles', b'musgrove'): (14, 242.12120942641175),
(b'first', b'year'): (71, 32.984816933402165),
(b'lady', b'elliot'): (12, 19.257296673017198),
(b'seventeen', b'years'): (7, 49.28592375366569),
(b'an', b'awful'): (13, 14.535525952263692),
(b'thirteen', b'years'): (7, 35.84430818448414),
(b'lady', b'russell'): (147, 757.699847427881),
(b'anne', b'elliot'): (23, 67.77658011998705),
(b'miss', b'elliot'): (48, 75.64904190049722),
(b'everybody', b'else'): (22, 107.67307329941586),
(b'russell', b's'): (30, 10.25826311763142),
(b'mr', b'elliot'): (174, 150.17351854354047),
(b'mr', b'shepherd'): (26, 56.76755428469306),
(b'ill', b'used'): (8, 18.88940691636053),
(b'anybody', b'else'): (21, 154.67587000287605),
(b'an', b'honest'): (29, 20.449981339736503),
(b'descend', b'into'): (11, 16.714108124533826),
(b'mrs', b'clay'): (66, 167.01318200947586),
(b'therefore', b'thus'): (66, 16.596041635939144),
(b'miss', b'anne'): (19, 12.80224291155311),
(b'their', b'fathers'): (151, 19.01897570711605),
(b'an', b'example'): (14, 16.847995990123824),
(b'admiral', b'croft'): (14, 929.5503584841242),
(b'mrs', b'croft'): (41, 202.22851094646535),
(b'walked', b'along'): (8, 10.917841095692655),
(b'frederick', b'wentworth'): (6, 22.56410830163348),
(b'either', b'side'): (18, 17.75174234901147),
(b'captain', b'wentworth'): (196, 617.1112879281435),
(b'eldest', b'son'): (15, 33.798893916540976),
(b'removed', b'from'): (36, 13.162771276103),
(b'startled', b'by'): (14, 12.543550973020787),
(b'most', b'important'): (14, 31.64054410542769),
(b'replied', b'anne'): (11, 13.57410371079436),
(b'at', b'uppercross'): (20, 12.847587337449994),
(b'left', b'alone'): (17, 14.25135144684309),
(b'mr', b'musgrove'): (21, 31.606843411257675),
(b'miss', b'musgroves'): (22, 210.81324041811845),
(b'mrs', b'musgrove'): (66, 152.88129737790481),
(b'piano', b'forte'): (7, 11524.457142857142),
(b'their', b'faces'): (63, 20.621617809871022),
(b'surprised', b'at'): (28, 13.305893084575402),
(b'ere', b'long'): (23, 32.47536007156702),
(b'anything', b'else'): (31, 72.72513938587002),
(b'quite', b'different'): (12, 17.841595753643947),
(b'their', b'sakes'): (13, 15.501767870868562),
(b'twentieth', b'year'): (13, 176.0098909090909),
(b'on', b'board'): (70, 31.77124534667287),
(b'eight', b'years'): (24, 64.58155526342401),
(b'their', b'heads'): (79, 20.781355479062924),
(b'dressing', b'room'): (14, 228.86168200731305),
(b'up', b'stairs'): (15, 14.006129941802593),
(b'waited', b'till'): (7, 10.336277440847356),
(b'third', b'part'): (39, 74.5758658942438),
(b'dear', b'fellow'): (11, 17.042251140780802),
(b'good', b'cheer'): (15, 59.6298230917065),
(b'mrs', b'harville'): (24, 82.53919224555735),
(b'fifteen', b'years'): (10, 60.35011071877431),
(b'charles', b'hayter'): (33, 2576.962579860055),
(b'came', b'near'): (42, 11.125204066571548),
(b'mansion', b'house'): (8, 28.44987460815047),
(b'two', b'hundred'): (105, 33.2713403032416),
(b'dr', b'shirley'): (9, 1057.7517482517483),
(b'went', b'up'): (207, 10.546791909362577),
(b'within', b'reach'): (7, 16.56674242216807),
(b'turn', b'back'): (16, 10.061840177671076),
(b'walking', b'along'): (8, 17.573893342628093),
(b'leaning', b'against'): (13, 24.29099376700023),
(b'trodden', b'under'): (9, 65.1309543032456),
(b'under', b'foot'): (15, 20.587370613094873),
(b'louisa', b'musgrove'): (15, 183.23258631132646),
(b'provoke', b'me'): (18, 15.67267568251514),
(b'captain', b'harville'): (37, 300.52135040745054),
(b'at', b'lyme'): (26, 20.671192760852374),
(b'earnest', b'desire'): (6, 31.26467548573791),
(b'sea', b'shore'): (14, 26.870495928941523),
(b'captain', b'benwick'): (56, 513.167038084151),
(b'an', b'officer'): (9, 10.085875150550317),
(b'place', b'where'): (125, 24.673747113061946),
(b'breakfast', b'table'): (9, 46.889138605804625),
(b'great', b'coat'): (15, 13.653183495244221),
(b'mean', b'while'): (30, 21.601999982862186),
(b'preceding', b'evening'): (6, 46.916408188585606),
(b'dark', b'blue'): (7, 10.384888690547708),
(b'an', b'agony'): (10, 14.366508208632718),
(b'catching', b'hold'): (10, 119.13870510396976),
(b'raised', b'up'): (35, 19.639030027092765),
(b'every', b'one'): (395, 13.121859438999877),
(b'could', b'scarcely'): (18, 16.706049522741466),
(b'passed', b'along'): (11, 15.774990874485061),
(b'leaning', b'over'): (17, 32.99075765424357),
(b't', b'talk'): (19, 10.789205796038152),
(b'camden', b'place'): (29, 304.8030226700252),
(b'straight', b'forward'): (6, 11.08933284457478),
(b'same', b'hour'): (17, 12.501907897455677),
(b'looking', b'glasses'): (6, 21.695137693631672),
(b'poring', b'over'): (6, 39.405627198124265),
(b'thirty', b'feet'): (8, 11.482173582995951),
(b'colonel', b'wallis'): (23, 919.8152027027028),
(b'at', b'length'): (101, 22.85664635341284),
(b'carried', b'away'): (73, 65.89981996296747),
(b'greater', b'than'): (58, 46.92382276332348),
(b'miss', b'carteret'): (12, 296.58529411764704),
(b'lady', b'dalrymple'): (25, 567.8937488267317),
(b'laura', b'place'): (7, 20.594798829055758),
(b'be', b'established'): (41, 12.514923983865962),
(b'mrs', b'smith'): (79, 133.20515177750607),
(b'westgate', b'buildings'): (7, 3878.4230769230767),
(b'at', b'liberty'): (25, 11.17812683597344),
(b'five', b'thousand'): (31, 35.86163157834745),
(b'whose', b'names'): (9, 17.589220303506018),
(b'her', b'ladyship'): (22, 32.97674284395198),
(b'ladyship', b's'): (10, 11.322082996496901),
(b'old', b'gentleman'): (32, 24.07985483618954),
(b'their', b'minds'): (18, 15.50176787086856),
(b'almost', b'entirely'): (13, 33.520817751184246),
(b'lower', b'part'): (8, 14.360893094499653),
(b'staring', b'at'): (33, 23.08283191628515),
(b'ay', b'ay'): (6, 102.04655085174566),
(b'an', b'oath'): (39, 43.30687629076502),
(b'wiser', b'than'): (8, 28.37160810303864),
(b'prejudice', b'against'): (7, 37.30402614217892),
(b'both', b'sides'): (31, 93.4670833729442),
(b'my', b'soul'): (259, 16.066751210923982),
(b'same', b'instant'): (19, 24.46954622664402),
(b'their', b'seats'): (12, 12.91813989239047),
(b'their', b'mouths'): (24, 24.960473690381583),
(b'short', b'silence'): (9, 17.742932551319647),
(b'fifty', b'pounds'): (6, 33.29484921857803),
(b'be', b'saved'): (61, 12.820166032252938),
(b'hard', b'hearted'): (6, 30.946447752033144),
(b'some', b'moments'): (14, 21.24516597218971),
(b'exclaimed', b'mrs'): (11, 11.847730944338375),
(b'compassion', b'on'): (20, 12.135435888384333),
(b'an', b'explanation'): (12, 15.443996324280173),
(b'our', b'hearts'): (21, 17.06646733400796),
(b'minutes', b'afterwards'): (7, 21.366081045290017),
(b'make', b'haste'): (38, 69.71776454772422),
(b'n', b't'): (19, 110.43069461827284),
(b'rising', b'sun'): (7, 11.4529037631559),
(b'an', b'atonement'): (66, 83.74078006943026),
(b'atonement', b'for'): (65, 20.65632201567061),
(b'next', b'instant'): (8, 11.76765535349606),
(b'she', b'doted'): (7, 14.810751266798853),
(b'god', b'forbid'): (30, 46.144377261328756),
(b'i', b'll'): (384, 11.17502796097598),
(b'll', b'answer'): (12, 13.013882743362831),
(b'market', b'place'): (13, 39.58480813896431),
(b'poured', b'out'): (53, 41.589583712234585),
(b'at', b'norland'): (19, 17.421005219837852),
(b'many', b'generations'): (11, 16.534145874894104),
(b'seven', b'thousand'): (27, 31.259095392419333),
(b'mr', b'dashwood'): (15, 10.190698520494786),
(b'john', b'dashwood'): (37, 157.761220299208),
(b'four', b'thousand'): (45, 51.45229768371371),
(b'three', b'thousand'): (45, 26.103242228789615),
(b'mrs', b'dashwood'): (121, 149.97722901761674),
(b'miss', b'dashwoods'): (23, 254.21596638655464),
(b'edward', b'ferrars'): (13, 135.88635597978663),
(b'younger', b'brother'): (8, 31.67883135242683),
(b'few', b'miles'): (7, 14.237434111445785),
(b'replied', b'elinor'): (26, 38.56234426453405),
(b'mrs', b'ferrars'): (73, 170.42505281471358),
(b'barton', b'park'): (12, 511.6875679594056),
(b'from', b'whence'): (44, 15.950059078483806),
(b'barton', b'cottage'): (7, 104.5866897147796),
(b'sir', b'john'): (113, 127.78658844235845),
(b'at', b'barton'): (35, 22.230656741205607),
(b'lady', b'middleton'): (95, 500.3819045606168),
(b'be', b'fulfilled'): (39, 13.72604565972396),
(b'their', b'arrival'): (15, 11.398358728579826),
(b'present', b'case'): (12, 20.32458969190901),
(b'mrs', b'jennings'): (229, 317.3131418135843),
(b'colonel', b'brandon'): (132, 1667.519921875),
(b'now', b'therefore'): (145, 11.124642048311067),
(b'ill', b'natured'): (11, 147.80960912052115),
(b'blue', b'sky'): (11, 51.83932141429143),
(b'rose', b'up'): (112, 34.00644003284101),
(b'at', b'allenham'): (8, 10.991824722040551),
(b'miss', b'dashwood'): (70, 131.14315726290516),
(b'cried', b'marianne'): (34, 25.0791653802614),
(b'mr', b'willoughby'): (36, 36.686514673781225),
(b'miss', b'marianne'): (31, 20.916503563450693),
(b'aye', b'aye'): (36, 468.90135),
(b'have', b'erred'): (10, 17.591061335566256),
(b'pronounce', b'him'): (19, 17.59389669411188),
(b'by', b'reason'): (70, 10.359210631680098),
(b'an', b'everlasting'): (37, 33.79199195739935),
(b'seven', b'days'): (103, 82.77954431228012),
(b'by', b'accident'): (20, 16.659403636043233),
(b'went', b'out'): (262, 11.679230644951586),
(b'won', b't'): (219, 217.3954908123791),
(b'miss', b'williams'): (6, 72.63313325330132),
(b'laughed', b'heartily'): (6, 104.9677307425399),
(b'considerable', b'time'): (10, 11.403428752365373),
(b'two', b'sides'): (13, 12.986418686513664),
(b'at', b'delaford'): (11, 13.19018966644866),
(b'two', b'thousand'): (76, 23.96682269843838),
(b'seven', b'hundred'): (50, 63.02087384034221),
(b'can', b'possibly'): (10, 14.130761757067319),
(b'burst', b'into'): (18, 13.516455983337801),
(b'turning', b'round'): (15, 21.605116375400833),
(b'mr', b'ferrars'): (26, 41.483981977275704),
(b'combe', b'magna'): (11, 18907.3125),
(b'mrs', b'palmer'): (37, 135.40263936386714),
(b'mr', b'palmer'): (35, 100.05413092849427),
(b'without', b'ceasing'): (8, 82.0051504472757),
(b'stared', b'at'): (14, 13.489966704322493),
(b't', b'think'): (70, 10.296676867200363),
(b'miss', b'steeles'): (29, 406.7455462184874),
(b'most', b'beautiful'): (16, 16.30563694985349),
(b'human', b'beings'): (7, 191.52706552706556),
(b'sugar', b'plums'): (24, 5662.879802955666),
(b'two', b'boys'): (13, 13.239400868718477),
(b'miss', b'steele'): (27, 243.16309828279137),
(b'lucy', b'steele'): (10, 285.9328922495274),
(b'i', b'm'): (438, 16.271042011107397),
(b'm', b'sure'): (88, 102.49120766084593),
(b'robert', b'ferrars'): (7, 95.96098334655036),
(b'mr', b'pratt'): (8, 85.6018675721562),
(b'at', b'longstaple'): (7, 16.48773708306082),
(b'poor', b'edward'): (10, 12.172840599071293),
(b'their', b'names'): (36, 14.302226309432303),
(b'latter', b'end'): (12, 42.203822544642854),
(b'i', b've'): (218, 13.517866465271059),
(b'lifted', b'up'): (151, 72.07416702839072),
(b'third', b'day'): (65, 33.7914061089195),
(b'starting', b'up'): (9, 11.47168738090498),
(b't', b'know'): (147, 12.457577307748522),
(b'returned', b'home'): (11, 12.90725432262056),
(b'berkeley', b'street'): (16, 1449.9725490196079),
(b'conduit', b'street'): (6, 203.71515151515152),
(b'lit', b'up'): (23, 36.13581524985069),
(b'as', b'follows'): (17, 14.278650094398992),
(b'having', b'received'): (10, 11.384803552611771),
(b'who', b'cares'): (8, 12.52589229028073),
(b'miss', b'grey'): (10, 10.25064380590946),
(b'fifty', b'thousand'): (11, 20.37380177348749),
(b'why', b'don'): (28, 12.795415425355841),
(b'thousand', b'times'): (12, 12.06362743003632),
(b'walked', b'across'): (7, 13.91218569999655),
(b'fourteen', b'years'): (6, 15.563975922210217),
(b'your', b'sakes'): (16, 32.49059753954306),
(b'bartlett', b's'): (6, 10.18987469684721),
(b'dressing', b'gown'): (6, 509.2878787878788),
(b'wild', b'beasts'): (16, 105.2399430740038),
(b'miss', b'morton'): (15, 267.5957540911101),
(b'six', b'hundred'): (66, 132.00427053900353),
(b'harley', b'street'): (16, 1540.5958333333335),
(b'most', b'high'): (60, 22.2500544936901),
(b'filled', b'with'): (114, 14.087588281459942),
(b'two', b'thirds'): (6, 42.47641112047178),
(b'public', b'school'): (6, 43.038412291933426),
(b",'", b'says'): (13, 42.67564803385647),
(b'fell', b'upon'): (62, 13.94880910923142),
(b's', b'office'): (29, 12.330604675176458),
(b'yes', b'ma'): (9, 15.105834768931166),
(b'come', b'near'): (47, 11.571024613341859),
(b'give', b'ear'): (30, 35.78134979774324),
(b'reminds', b'me'): (7, 14.199176259372688),
(b'ten', b'guineas'): (6, 26.425314465408807),
(b'south', b'east'): (7, 17.11383597097883),
(b'mr', b'harris'): (10, 85.6018675721562),
(b'quicker', b'than'): (6, 12.238732907193139),
(b'bent', b'over'): (10, 11.258750628035504),
(b'justified', b'by'): (17, 11.526506299532617),
(b'from', b'thence'): (103, 27.334646896223614),
(b'latter', b'days'): (12, 29.77529527559055),
(b'sprung', b'up'): (13, 24.090543499900463),
(b'or', b'later'): (13, 13.913825368307583),
(b'living', b'creature'): (16, 82.51963993453354),
(b'first', b'month'): (33, 25.952445658418213),
(b'have', b'transgressed'): (20, 23.003695592663565),
...}
In [24]:
lower_bigram["jon lives in new york city".split()]
Out[24]:
['jon', 'lives', 'in', 'new_york', 'city']
In [25]:
lower_bigram = Phraser(Phrases(lower_sents, min_count=32, threshold=64))
lower_bigram.phrasegrams
Out[25]:
{(b'afar', b'off'): (52, 108.14220347465505),
(b'burnt', b'offering'): (184, 297.524653753951),
(b'burnt', b'offerings'): (86, 299.15702343127646),
(b'buster', b'bear'): (142, 479.87410772225826),
(b'captain', b'benwick'): (56, 241.49037086312987),
(b'captain', b'wentworth'): (196, 529.8756608388247),
(b'charles', b'hayter'): (33, 92.03437785214481),
(b'chief', b'priests'): (65, 116.31947753846512),
(b'colonel', b'brandon'): (132, 1313.0078125),
(b'couldn', b't'): (89, 171.76138536935215),
(b'cut', b'off'): (217, 129.60290535032792),
(b'dare', b'say'): (115, 89.94000515807346),
(b'de', b'grey'): (77, 603.2109624246722),
(b'didn', b't'): (180, 220.51081560283686),
(b'doesn', b't'): (53, 106.2634985949418),
(b'don', b't'): (830, 250.30957446808512),
(b'dr', b'bull'): (65, 680.7870294599019),
(b'dr', b'middleton'): (40, 162.73103819257668),
(b'drawing', b'room'): (49, 84.91494947493561),
(b'farmer', b'brown'): (100, 386.05179596892236),
(b'father', b'brown'): (207, 91.68277248710235),
(b'few', b'minutes'): (86, 204.16834974753786),
(b'few', b'moments'): (43, 107.77584531675087),
(b'fig', b'tree'): (37, 121.73722334004026),
(b'fine', b'flour'): (36, 86.07682458386685),
(b'fir', b'tree'): (36, 72.6789393074867),
(b'forty', b'years'): (68, 90.60220877269607),
(b'frank', b'churchill'): (151, 1316.4456593286038),
(b'gathered', b'together'): (84, 103.28066074898891),
(b'good', b'natured'): (66, 88.69936184891343),
(b'great', b'deal'): (182, 93.36368125424357),
(b'green', b'forest'): (66, 336.37733627667404),
(b'guinea', b'hen'): (51, 905.8822695035461),
(b'high', b'places'): (99, 129.8123390846559),
(b'holy', b'ghost'): (90, 313.0305073859987),
(b'isn', b't'): (63, 131.96593211752787),
(b'jane', b'fairfax'): (111, 654.5565917587609),
(b'jesus', b'christ'): (199, 172.16816954974848),
(b'joe', b'otter'): (47, 1271.6141235813368),
(b'john', b'knightley'): (58, 83.03755747111268),
(b'lady', b'middleton'): (95, 350.26733319243175),
(b'lady', b'russell'): (147, 613.6301581282135),
(b'little', b'jackal'): (61, 69.81254128038833),
(b'little', b'joe'): (111, 133.28784038147822),
(b'm', b'sure'): (88, 69.15069432539002),
(b'ma', b'am'): (216, 157.25846601094193),
(b'mast', b'heads'): (37, 77.7358926919519),
(b'meat', b'offering'): (122, 210.66724956379437),
(b'mercy', b'endureth'): (41, 269.07674062361025),
(b'miss', b'bates'): (113, 276.39588291692513),
(b'miss', b'dashwood'): (70, 76.66830732292917),
(b'miss', b'fairfax'): (125, 196.19987447261062),
(b'miss', b'smith'): (58, 73.03442128232508),
(b'miss', b'somers'): (49, 160.06190476190477),
(b'miss', b'taylor'): (48, 156.44059469941823),
(b'miss', b'woodhouse'): (173, 229.03802722366902),
(b'moby', b'dick'): (84, 4115.877551020409),
(b'mock', b'turtle'): (56, 2528.877742946708),
(b'mr', b'elliot'): (174, 126.18129960463163),
(b'mr', b'elton'): (214, 121.3990121932397),
(b'mr', b'gresham'): (49, 87.31390492359931),
(b'mr', b'knightley'): (277, 161.74131790625913),
(b'mr', b'weston'): (162, 75.87438262077481),
(b'mr', b'woodhouse'): (132, 82.04651843976633),
(b'mrs', b'clay'): (66, 93.08931456265867),
(b'mrs', b'dashwood'): (121, 115.06873605661974),
(b'mrs', b'elton'): (142, 93.08931456265867),
(b'mrs', b'ferrars'): (73, 102.75628184416554),
(b'mrs', b'goddard'): (58, 143.57843432545658),
(b'mrs', b'jennings'): (229, 279.0655756128398),
(b'mrs', b'musgrove'): (66, 85.21252640735679),
(b'mrs', b'smith'): (79, 84.60327207490248),
(b'mrs', b'theresa'): (67, 170.20061244665206),
(b'mrs', b'weston'): (249, 160.68485093258923),
(b'o', b'clock'): (67, 89.14789088153573),
(b'o', b'er'): (82, 108.14993564993564),
(b'peace', b'offerings'): (83, 176.2577199456205),
(b'sent', b'messengers'): (43, 79.21555418015616),
(b'sin', b'offering'): (118, 129.96079665512747),
(b'sir', b'arthur'): (71, 131.41924812030075),
(b'sir', b'john'): (113, 95.83994133176884),
(b'sir', b'walter'): (136, 399.5145142857143),
(b'six', b'hundred'): (66, 73.57615079223149),
(b'sperm', b'whale'): (183, 297.3672297627184),
(b'sweet', b'savour'): (43, 286.17879256965944),
(b'take', b'heed'): (58, 86.38454061712328),
(b'ten', b'thousand'): (82, 84.00099962514057),
(b'thou', b'shalt'): (1282, 66.88233182614454),
(b'thousand', b'pounds'): (48, 166.51834523092802),
(b'thus', b'saith'): (444, 144.0289127889979),
(b'unleavened', b'bread'): (43, 237.70041787206688),
(b'wasn', b't'): (58, 120.2225788701394),
(b'wee', b'l'): (35, 450.39751861042186),
(b'without', b'blemish'): (46, 83.71359108159393),
(b'won', b't'): (219, 189.96708776595744),
(b'wouldn', b't'): (58, 120.2225788701394),
(b'years', b'ago'): (56, 74.31594785893046)}
In [26]:
# as in Maas et al. (2001):
# - leave in stop words ("indicative of sentiment")
# - no stemming ("model learns similar representations of words of the same stem when data suggests it")
clean_sents = []
for s in lower_sents:
clean_sents.append(lower_bigram[s])
In [27]:
clean_sents[0:9]
Out[27]:
[['emma', 'by', 'jane', 'austen', '1816'],
['volume', 'i'],
['chapter', 'i'],
['emma',
'woodhouse',
'handsome',
'clever',
'and',
'rich',
'with',
'a',
'comfortable',
'home',
'and',
'happy',
'disposition',
'seemed',
'to',
'unite',
'some',
'of',
'the',
'best',
'blessings',
'of',
'existence',
'and',
'had',
'lived',
'nearly',
'twenty',
'one',
'years',
'in',
'the',
'world',
'with',
'very',
'little',
'to',
'distress',
'or',
'vex',
'her'],
['she',
'was',
'the',
'youngest',
'of',
'the',
'two',
'daughters',
'of',
'a',
'most',
'affectionate',
'indulgent',
'father',
'and',
'had',
'in',
'consequence',
'of',
'her',
'sister',
's',
'marriage',
'been',
'mistress',
'of',
'his',
'house',
'from',
'a',
'very',
'early',
'period'],
['her',
'mother',
'had',
'died',
'too',
'long',
'ago',
'for',
'her',
'to',
'have',
'more',
'than',
'an',
'indistinct',
'remembrance',
'of',
'her',
'caresses',
'and',
'her',
'place',
'had',
'been',
'supplied',
'by',
'an',
'excellent',
'woman',
'as',
'governess',
'who',
'had',
'fallen',
'little',
'short',
'of',
'a',
'mother',
'in',
'affection'],
['sixteen',
'years',
'had',
'miss_taylor',
'been',
'in',
'mr_woodhouse',
's',
'family',
'less',
'as',
'a',
'governess',
'than',
'a',
'friend',
'very',
'fond',
'of',
'both',
'daughters',
'but',
'particularly',
'of',
'emma'],
['between',
'_them_',
'it',
'was',
'more',
'the',
'intimacy',
'of',
'sisters'],
['even',
'before',
'miss_taylor',
'had',
'ceased',
'to',
'hold',
'the',
'nominal',
'office',
'of',
'governess',
'the',
'mildness',
'of',
'her',
'temper',
'had',
'hardly',
'allowed',
'her',
'to',
'impose',
'any',
'restraint',
'and',
'the',
'shadow',
'of',
'authority',
'being',
'now',
'long',
'passed',
'away',
'they',
'had',
'been',
'living',
'together',
'as',
'friend',
'and',
'friend',
'very',
'mutually',
'attached',
'and',
'emma',
'doing',
'just',
'what',
'she',
'liked',
'highly',
'esteeming',
'miss_taylor',
's',
'judgment',
'but',
'directed',
'chiefly',
'by',
'her',
'own']]
In [28]:
clean_sents[6] # could consider removing stop words or common words
Out[28]:
['sixteen',
'years',
'had',
'miss_taylor',
'been',
'in',
'mr_woodhouse',
's',
'family',
'less',
'as',
'a',
'governess',
'than',
'a',
'friend',
'very',
'fond',
'of',
'both',
'daughters',
'but',
'particularly',
'of',
'emma']
In [ ]:
# max_vocab_size can be used instead of min_count (which has increased here)
# model = Word2Vec(sentences=clean_sents, size=64, sg=1, window=10, min_count=10, seed=42, workers=8)
# model.save('../clean_gutenberg_model.w2v')
In [29]:
# skip re-training the model with the next line:
model = gensim.models.Word2Vec.load('./clean_gutenberg_model.w2v')
In [30]:
len(model.wv.vocab) # down from 17k in previous notebook
Out[30]:
10329
In [31]:
model['ma_am']
Out[31]:
array([-0.27275795, 0.22294798, -0.27785164, -0.21537074, -0.04375846,
0.35075492, -0.21310569, 0.28521448, 0.18605071, 0.22906332,
0.3929922 , 0.3332729 , -0.06804646, -0.36645287, -0.26969737,
-0.51488483, 0.02132919, 0.07797143, 0.12759572, 0.20545809,
-0.46009699, 0.24983054, 0.25449356, -0.64410228, -0.20215428,
0.1824095 , -0.23551014, 0.82762975, -0.2430227 , -0.02013004,
-0.88646883, -0.06863049, -0.63370681, 0.32553154, 0.02467243,
0.02537833, 0.13675088, -0.08868799, 0.31126702, 0.48400268,
-0.60338777, -0.20596088, -0.41460729, -0.47632793, -0.02355143,
0.56830853, 0.15640558, 0.12009691, 0.16354683, 0.66328555,
-0.24750067, -0.16379237, 0.00516235, 0.83655453, 0.10755659,
0.13685858, -0.00376199, -0.17476274, -0.35152084, -0.01025227,
-0.03357989, 0.33727771, 0.05614873, -0.19056796], dtype=float32)
In [32]:
model.most_similar('ma_am')
Out[32]:
[('madam', 0.8394622802734375),
('mamma', 0.8382871747016907),
('betty', 0.8321777582168579),
('nancy', 0.8240376710891724),
('shouldn', 0.8222858905792236),
('m_sure', 0.8149751424789429),
('madman', 0.8145883083343506),
('bunger', 0.8138953447341919),
('frederick', 0.8121578693389893),
('indignantly', 0.8070249557495117)]
In [33]:
model.most_similar(positive=['father', 'woman'], negative=['man'])
Out[33]:
[('mother', 0.7831815481185913),
('husband', 0.7511569857597351),
('daughter', 0.7453563213348389),
('wife', 0.7420299053192139),
('sister', 0.739268958568573),
('womb', 0.6849961280822754),
('loved', 0.6742969751358032),
('sarah', 0.6667004823684692),
('child', 0.6640843152999878),
('isaac', 0.6623573303222656)]
In [ ]:
# tsne = TSNE(n_components=2, n_iter=1000)
In [ ]:
# X_2d = tsne.fit_transform(model[model.wv.vocab])
In [ ]:
# coords_df = pd.DataFrame(X_2d, columns=['x','y'])
# coords_df['token'] = model.wv.vocab.keys()
In [ ]:
# coords_df.head()
In [ ]:
# coords_df.to_csv('../clean_gutenberg_tsne.csv', index=False)
In [34]:
coords_df = pd.read_csv('./clean_gutenberg_tsne.csv')
In [35]:
_ = coords_df.plot.scatter('x', 'y', figsize=(12,12), marker='.', s=10, alpha=0.2)
In [37]:
subset_df = coords_df.sample(n=5000)
In [38]:
p = figure(plot_width=800, plot_height=800)
_ = p.text(x=subset_df.x, y=subset_df.y, text=subset_df.token)
In [39]:
show(p)
In [ ]:
# output_file() here
Content source: the-deep-learners/nyc-ds-academy
Similar notebooks: