In [1]:
import nltk
import pandas as pd
In [14]:
df=pd.read_csv('data/pov_seedwords.txt',delimiter='\t')
In [17]:
df.head(10)
Out[17]:
id
corpus
intro
section
length
body
subject
country
0
1
news
1 of 1792 DOCUMENTS //p //p The New York Time...
Section A; Column 0; Business/Financial Desk; ...
1715 words
MIAMI -- For the Ingram clan, working for the...
PUBLIC TRANSPORTATION (90%); MIDDLE INCOME PER...
UNITED STATES (96%)
1
2
news
2 of 1792 DOCUMENTS //p //p The New York Time...
Section A; Column 0; Business/Financial Desk; ...
1832 words
WAXAHACHIE, Tex. -- Most Americans suffered s...
SENIOR CITIZENS (91%); MIDDLE INCOME PERSONS (...
UNITED STATES (94%)
2
3
news
3 of 1792 DOCUMENTS //p //p The New York Time...
Section A; Column 0; Business/Financial Desk; ...
1759 words
When the California Labor Commissioner's Offic...
LABOR FORCE (90%); FREELANCE EMPLOYMENT (90%);...
UNITED STATES (95%)
3
4
news
4 of 1792 DOCUMENTS //p //p The New York Time...
Section A; Column 0; National Desk; Pg. 1
1506 words
SAN BERNARDINO, Calif. -- A heavily armed man ...
SHOOTINGS (92%); GUNSHOT WOUNDS (89%); FIREARM...
UNITED STATES (94%)
4
5
news
5 of 1792 DOCUMENTS //p //p The New York Time...
Section A; Column 0; National Desk; Pg. 1
1075 words
More than one a day. //p That is how often, on...
SHOOTINGS (92%); WOUNDS & INJURIES (90%); GUNS...
UNITED STATES (94%)
5
6
news
6 of 1792 DOCUMENTS //p //p The New York Time...
Section A; Column 0; National Desk; Pg. 1
1424 words
SAN BERNARDINO, Calif. -- Syed Rizwan Farook a...
ONLINE DATING SERVICES (90%); COURTSHIP & DATI...
UNITED STATES (98%); PAKISTAN (79%); SAUDI ARA...
6
7
news
7 of 1792 DOCUMENTS //p //p The New York Time...
Section A; Column 0; National Desk; Pg. 1
1766 words
SAN BERNARDINO, Calif. -- The couple who the p...
SPECIAL INVESTIGATIVE FORCES (90%); TERRORISM ...
UNITED STATES (93%)
7
8
news
8 of 1792 DOCUMENTS //p //p The New York Time...
Section A; Column 0; National Desk; Pg. 1
1522 words
The killings are happening too often. Bunched ...
SHOOTINGS (90%); COOKING & ENTERTAINING (90%);...
UNITED STATES (97%)
8
9
news
9 of 1792 DOCUMENTS //p //p The New York Time...
Section A; Column 0; National Desk; THE CAMPAI...
1180 words
The Republican candidates for president angril...
MUSLIMS & ISLAM (90%); US REPUBLICAN PARTY (90...
UNITED STATES (94%)
9
10
news
10 of 1792 DOCUMENTS //p //p The New York Tim...
Section A; Column 0; National Desk; Pg. 1
1885 words
WASHINGTON -- On the day she and her husband ...
TERRORISM (92%); TERRORIST ATTACKS (91%); TERR...
UNITED STATES (97%); SYRIA (86%); PAKISTAN (79...
In [9]:
df.columns
Out[9]:
Index(['id', 'corpus', 'intro', 'section', 'length', 'body', 'subject',
'country'],
dtype='object')
In [18]:
df.corpus
Out[18]:
0 news
1 news
2 news
3 news
4 news
5 news
6 news
7 news
8 news
9 news
10 news
11 news
12 news
13 news
14 news
15 news
16 news
17 news
18 news
19 news
20 news
21 news
22 news
23 news
24 news
25 news
26 news
27 news
28 news
29 news
...
1470 poverty
1471 poverty
1472 poverty
1473 poverty
1474 poverty
1475 poverty
1476 poverty
1477 poverty
1478 poverty
1479 poverty
1480 poverty
1481 poverty
1482 poverty
1483 poverty
1484 poverty
1485 poverty
1486 poverty
1487 poverty
1488 poverty
1489 poverty
1490 poverty
1491 poverty
1492 poverty
1493 poverty
1494 poverty
1495 poverty
1496 poverty
1497 poverty
1498 poverty
1499 poverty
Name: corpus, dtype: object
In [13]:
df.corpus.value_counts()
Out[13]:
news 500
poverty 500
comment 500
Name: corpus, dtype: int64
In [21]:
news = df[df.corpus=='news']
poverty = df[df.corpus=='poverty']
comment = df[df.corpus=='comment']
In [24]:
example_text = 'This is an example. One more sentence! And, another one! Then the last one.'
In [23]:
word_tok = nltk.tokenize.WordPunctTokenizer()
In [25]:
word_tok.tokenize(example_text)
Out[25]:
['This',
'is',
'an',
'example',
'.',
'One',
'more',
'sentence',
'!',
'And',
',',
'another',
'one',
'!',
'Then',
'the',
'last',
'one',
'.']
In [26]:
word_tok2 = nltk.tokenize.WhitespaceTokenizer()
In [27]:
word_tok2.tokenize(example_text)
Out[27]:
['This',
'is',
'an',
'example.',
'One',
'more',
'sentence!',
'And,',
'another',
'one!',
'Then',
'the',
'last',
'one.']
In [50]:
word_tok3=nltk.tokenize.RegexpTokenizer(r'\b(\w+(?:[\'\-]\w+)?)\b')
In [32]:
word_tok.tokenize('@lori here #python with @matt')
Out[32]:
['@', 'lori', 'here', '#', 'python', 'with', '@', 'matt']
In [46]:
word_tok3.tokenize("it's this_as_one_word some-compound-word like i'm and they've dog's bowl")
Out[46]:
["it's",
'this_as_one_word',
'some-compound',
'word',
'like',
"i'm",
'and',
"they've",
"dog's",
'bowl']
In [55]:
news_toks=[]
for art in news.body:
news_toks.append(word_tok3.tokenize(art.lower()))
In [56]:
news_freq = nltk.FreqDist()
In [57]:
for art in news_toks:
news_freq.update(art)
In [59]:
news_freq.most_common(10)
Out[59]:
[('the', 43956),
('to', 20442),
('a', 20428),
('of', 19719),
('and', 17536),
('p', 16606),
('in', 16587),
('that', 10297),
('said', 7641),
('mr', 7366)]
In [60]:
news_freq['policy']
Out[60]:
398
In [72]:
%matplotlib inline
import matplotlib.pyplot as plt
In [63]:
news_freq.plot(100)
In [164]:
comments_freq = nltk.FreqDist()
for art in comment.body:
comments_freq.update(word_tok3.tokenize(art.lower()))
In [65]:
poverty_freq = nltk.FreqDist()
for art in poverty.body:
poverty_freq.update(word_tok3.tokenize(art.lower()))
In [66]:
poverty_freq.most_common(50)
Out[66]:
[('the', 27902),
('of', 13996),
('to', 13112),
('and', 12731),
('a', 11619),
('p', 10546),
('in', 10440),
('that', 6790),
('for', 5222),
('is', 4713),
('on', 3940),
('as', 3176),
('with', 2929),
('it', 2822),
('he', 2821),
('are', 2736),
('have', 2372),
('said', 2358),
('was', 2329),
('but', 2328),
('at', 2327),
('mr', 2313),
('by', 2309),
('has', 2185),
('not', 2143),
('from', 2049),
('his', 2038),
('who', 2013),
('be', 1916),
('more', 1910),
('this', 1875),
('an', 1863),
('they', 1778),
('we', 1620),
('their', 1575),
('i', 1517),
('about', 1515),
('people', 1505),
('or', 1419),
('new', 1395),
('than', 1301),
('she', 1151),
('would', 1144),
('her', 1121),
('one', 1109),
('will', 1098),
('had', 1075),
('which', 1021),
('what', 979),
('you', 965)]
In [68]:
poverty_freq['poverty']
Out[68]:
684
In [73]:
plt.figure(figsize=(12,4))
poverty_freq.plot(100)
In [75]:
news_tokens=sum(news_freq.values())
news_tokens
Out[75]:
757678
In [77]:
poverty_tokens=sum(poverty_freq.values())
poverty_tokens
Out[77]:
494377
In [174]:
comments_tokens=sum(comments_freq.values())
comments_tokens
Out[174]:
449848
In [85]:
news_freq['poverty']
Out[85]:
58
In [86]:
poverty_freq['poverty']
Out[86]:
684
In [83]:
1000000*(news_freq['poverty'] / news_tokens)
Out[83]:
76.5496688566911
In [84]:
1000000*(poverty_freq['poverty'] / poverty_tokens)
Out[84]:
1383.5595102522973
In [107]:
import math
ln = math.log
def calculate_keyness(a, c, b, d):
'''
calculate loglikehood following
http://ucrel.lancs.ac.uk/llwizard.html
a = freq of word in corpus A
c = size of corpus A
b = freq of word in corpus B
d = size of corpus B
'''
E1 = c*(a+b) / (c+d)
E2 = d*(a+b) / (c+d)
direction = 1 if a/c > b/d else -1
G2 = 2*((a*ln (a/E1)) + (b*ln (b/E2)))
return direction*G2
In [110]:
calculate_keyness(58, 757678, 684, 494377)
Out[110]:
-922.4529260281342
In [114]:
calculate_keyness(news_freq['obama'], news_tokens,
poverty_freq['obama'], poverty_tokens)
Out[114]:
85.50690363793245
In [143]:
news_vocab = set(news_freq.keys())
poverty_vocab = set(poverty_freq.keys())
total_vocab = list(news_vocab.union(poverty_vocab))
In [ ]:
In [144]:
len(total_vocab)
Out[144]:
41681
In [154]:
keyness_data = []
for item in total_vocab:
nf=news_freq[item]
pf=poverty_freq[item]
if nf<5 or pf<5:
continue
keyness=calculate_keyness(pf, poverty_tokens, nf, news_tokens)
keyness_data.append({'word':item, 'news': nf,
'poverty': pf, 'LL': keyness}
)
In [156]:
keyness_df=pd.DataFrame(keyness_data)
In [159]:
keyness_df.sort_values('LL', ascending=False)
Out[159]:
LL
news
poverty
word
2003
1534.664171
25
937
inequality
4717
922.452926
58
684
poverty
1609
836.115408
88
699
income
5907
647.186611
61
526
sanders
2303
600.979665
205
746
economic
1144
537.511004
116
560
poor
5899
523.328147
90
505
tax
5763
434.581098
22
309
welfare
5930
345.292406
498
893
our
543
344.930299
313
684
children
382
339.044525
215
558
social
1715
327.690960
45
295
francis
584
285.377190
26
230
wealth
5323
281.031488
102
358
families
2034
261.470472
79
309
wage
1346
249.328473
42
239
rich
1163
247.283029
72
288
housing
3198
244.572343
636
921
percent
2666
231.127074
5321
4713
is
1841
225.310541
18
176
homeless
174
218.886271
1454
1620
we
6024
218.249295
243
485
women
1872
218.195124
114
327
de
4030
216.110143
83
282
class
4809
202.873808
23
173
pope
4691
201.218120
39
202
wages
196
195.900886
2881
2736
are
550
189.165437
68
240
society
228
187.769215
66
236
blasio
933
187.105740
14
144
bernie
...
...
...
...
...
1650
-107.172285
709
217
house
2858
-107.570512
222
25
records
1070
-110.211403
369
73
military
5508
-114.336661
227
24
investigation
3907
-117.815276
455
101
bush
4222
-119.382362
335
56
official
1185
-119.984588
200
15
apple
4322
-123.103184
282
37
attack
2902
-126.774036
541
129
department
792
-128.553557
218
17
shooting
4241
-141.298741
20428
11619
a
3635
-142.016899
376
59
f
2556
-147.402155
248
19
intelligence
2196
-149.237408
242
17
israel
4177
-156.625600
2265
919
were
4379
-161.444039
4365
2038
his
1528
-175.185686
327
31
iran
5690
-177.223028
1175
360
him
896
-194.240363
494
74
company
2370
-207.544005
381
35
officer
1826
-234.421376
489
56
officers
4400
-264.446737
978
210
police
949
-266.630674
508
50
water
5541
-281.971342
469
35
islamic
1530
-340.382999
1025
184
officials
1030
-341.490604
6494
2821
he
4161
-404.028209
5762
2329
was
3534
-437.665580
3308
1075
had
2360
-1053.523904
7366
2313
mr
1762
-1134.934456
7641
2358
said
6140 rows × 4 columns
In [160]:
keyness_df.to_csv('data/poverty_keyness_list.csv', index=False)
In [169]:
poverty_vocab = set(poverty_freq.keys())
comments_vocab = set(poverty_freq.keys())
total_vocab = list(poverty_vocab.union(comments_vocab))
In [176]:
keyness_data = []
for item in total_vocab:
cf=comments_freq[item]
pf=poverty_freq[item]
if cf<5 or pf<5:
continue
keyness=calculate_keyness(pf, poverty_tokens, cf, comments_tokens)
keyness_data.append({'word':item, 'comments': cf,
'poverty': pf, 'LL': keyness}
)
In [172]:
keyness_data
Out[172]:
[]
In [177]:
keyness_df=pd.DataFrame(keyness_data)
keyness_df.sort_values('LL', ascending=False)
Out[177]:
LL
comments
poverty
word
1574
996.011084
578
2358
said
1787
792.925547
74
937
inequality
2113
698.351529
748
2313
mr
4258
553.474462
61
684
poverty
1435
475.119286
91
699
income
659
318.406244
71
495
ms
1030
316.188296
98
560
poor
3804
312.797925
58
455
mrs
2060
292.622485
196
746
economic
1671
258.230964
31
327
de
5509
214.627560
113
484
workers
215
214.051584
15
236
blasio
488
212.183191
217
684
children
1530
207.480200
36
295
francis
1809
199.622016
44
309
wage
5215
189.203363
48
309
welfare
2206
166.204829
147
493
city
1998
141.779716
41
244
mayor
1642
131.371767
19
176
homeless
1049
130.019548
66
288
housing
3021
128.317547
38
223
minimum
525
119.273335
45
230
wealth
350
116.544382
225
558
social
4236
110.280853
37
202
wages
3406
102.987777
17
144
gap
1983
98.729622
275
607
work
3725
93.057311
324
671
clinton
3201
92.730460
7
104
editor
3628
90.894083
87
282
class
4222
86.125705
51
206
development
...
...
...
...
...
3711
-63.440715
811
580
it's
1833
-63.553141
600
396
should
3512
-64.544994
230
101
bush
2064
-65.197908
93
17
patients
1951
-65.661438
75
9
solar
1970
-71.133539
146
42
jeb
1122
-71.633332
143
40
trump's
4621
-72.662587
2695
2328
but
2687
-74.802479
120
26
invite
1241
-76.246898
650
416
my
144
-77.592441
352
176
sign
674
-78.990674
80
7
football
5447
-80.645948
378
192
facebook
2187
-86.099225
579
342
republican
2828
-87.552576
2325
1916
be
1672
-92.343865
93
8
syrian
3554
-100.064650
537
290
me
3686
-105.061674
205
56
rubio
4494
-107.778457
431
202
twitter
4826
-116.443897
1990
1517
i
1731
-121.662207
172
31
refugees
1656
-131.717072
231
56
donald
3507
-132.451989
287
87
section
5097
-143.988049
438
174
follow
2159
-145.478224
290
81
he's
650
-145.758603
164
19
guns
3248
-201.684678
269
44
gun
3987
-261.387116
321
45
cruz
2164
-283.091128
586
170
opinion
4433
-671.729292
888
143
trump
5551 rows × 4 columns
In [199]:
pd.Series([i[0] for i in comment.section.str.split(';') if type(i) is list]).value_counts()
Out[199]:
Section A 439
Section SR 48
Section 13
dtype: int64
In [226]:
news_bigram = nltk.FreqDist()
news_trigram = nltk.FreqDist()
for art in news.body:
toks = word_tok3.tokenize(art.lower())
news_bigram.update(nltk.ngrams(toks,2))
news_trigram.update(nltk.ngrams(toks,3))
In [224]:
pov_bigram = nltk.FreqDist()
pov_trigram = nltk.FreqDist()
for art in poverty.body:
toks = word_tok3.tokenize(art.lower())
pov_bigram.update(nltk.ngrams(toks,2))
pov_trigram.update(nltk.ngrams(toks,3))
In [228]:
pov_trigram.most_common()
Out[228]:
[(('the', 'united', 'states'), 357),
(('the', 'new', 'york'), 186),
(('one', 'of', 'the'), 184),
(('mr', 'de', 'blasio'), 144),
(('new', 'york', 'times'), 141),
(('in', 'the', 'united'), 139),
(('sign', 'up', 'for'), 127),
(('on', 'facebook', 'and'), 121),
(('a', 'lot', 'of'), 116),
(('percent', 'of', 'the'), 111),
(('some', 'of', 'the'), 107),
(('facebook', 'and', 'twitter'), 107),
(('the', 'minimum', 'wage'), 104),
(('new', 'york', 'city'), 103),
(('according', 'to', 'the'), 100),
(('the', 'number', 'of'), 100),
(('in', 'new', 'york'), 100),
(('of', 'new', 'york'), 89),
(('up', 'for', 'the'), 88),
(('the', 'university', 'of'), 88),
(('as', 'well', 'as'), 87),
(('p', 'this', 'is'), 87),
(('p', 'follow', 'the'), 85),
(('follow', 'the', 'new'), 85),
(('p', 'in', 'the'), 83),
(('and', 'sign', 'up'), 82),
(('twitter', 'and', 'sign'), 82),
(('part', 'of', 'the'), 80),
(('he', 'said', 'p'), 79),
(('to', 'the', 'editor'), 78),
(('and', 'twitter', 'and'), 78),
(('today', 'newsletter', 'p'), 72),
(('more', 'likely', 'to'), 71),
(('the', 'middle', 'class'), 70),
(('york', 'times', 'opinion'), 67),
(('times', 'opinion', 'section'), 67),
(('section', 'on', 'facebook'), 67),
(('opinion', 'section', 'on'), 67),
(('for', 'the', 'opinion'), 66),
(('opinion', 'today', 'newsletter'), 66),
(('we', 'need', 'to'), 66),
(('at', 'the', 'university'), 66),
(('the', 'opinion', 'today'), 66),
(('the', 'end', 'of'), 64),
(('this', 'is', 'a'), 62),
(('p', 'to', 'the'), 62),
(('at', 'the', 'same'), 59),
(('mr', 'sanders', 'said'), 58),
(('senator', 'bernie', 'sanders'), 57),
(('the', 'editor', 'p'), 57),
(('p', 'but', 'the'), 57),
(('the', 'united', 'nations'), 56),
(('the', 'fact', 'that'), 56),
(('the', 'same', 'time'), 56),
(('there', 'is', 'a'), 56),
(('president', 'of', 'the'), 54),
(('in', 'recent', 'years'), 54),
(('many', 'of', 'the'), 52),
(('p', 'it', 'is'), 51),
(('out', 'of', 'the'), 51),
(('the', 'white', 'house'), 51),
(('end', 'of', 'the'), 50),
(('around', 'the', 'world'), 50),
(('the', 'rest', 'of'), 49),
(('p', 'mr', 'sanders'), 49),
(('according', 'to', 'a'), 49),
(('the', 'cost', 'of'), 48),
(('in', 'an', 'interview'), 48),
(('in', 'the', 'last'), 48),
(('she', 'said', 'p'), 47),
(('director', 'of', 'the'), 46),
(('more', 'than', 'a'), 46),
(('to', 'be', 'a'), 46),
(('of', 'the', 'united'), 46),
(('bill', 'de', 'blasio'), 46),
(('in', 'the', 'world'), 46),
(('the', 'first', 'draft'), 45),
(('it', 'is', 'a'), 45),
(('p', 'at', 'the'), 44),
(('in', 'this', 'country'), 44),
(('in', 'the', 'past'), 44),
(('hillary', 'rodham', 'clinton'), 43),
(('p', 'there', 'are'), 43),
(('of', 'the', 'most'), 42),
(('likely', 'to', 'be'), 42),
(('much', 'of', 'the'), 42),
(('in', 'the', 'country'), 42),
(('most', 'of', 'the'), 42),
(('p', 'mrs', 'clinton'), 42),
(('out', 'of', 'poverty'), 41),
(('p', 'in', 'a'), 41),
(('we', 'have', 'to'), 40),
(('of', 'the', 'new'), 40),
(('the', '2016', 'presidential'), 40),
(('it', 'is', 'not'), 40),
(('mayor', 'bill', 'de'), 40),
(('version', 'of', 'the'), 39),
(('the', 'federal', 'government'), 39),
(('at', 'the', 'top'), 39),
(('said', 'in', 'a'), 39),
(('of', 'the', 'poor'), 38),
(('you', 'need', 'to'), 38),
(('for', 'the', 'poor'), 38),
(('to', 'know', 'about'), 37),
(('need', 'to', 'know'), 37),
(('find', 'out', 'what'), 37),
(('of', 'the', 'world'), 37),
(('at', 'a', 'time'), 37),
(('to', 'the', 'united'), 37),
(('the', 'one', 'that'), 36),
(('there', 'is', 'no'), 36),
(('the', 'first', 'time'), 36),
(('2016', 'presidential', 'race'), 36),
(('half', 'of', 'the'), 36),
(('over', 'the', 'last'), 36),
(('and', 'the', 'first'), 36),
(('what', 'you', 'need'), 36),
(('is', 'not', 'a'), 36),
(('p', 'mr', 'de'), 35),
(('twitter', 'and', 'the'), 35),
(('as', 'much', 'as'), 35),
(('p', 'i', 'think'), 35),
(('draft', 'newsletter', 'p'), 35),
(('of', 'the', 'population'), 35),
(('presidential', 'race', 'today'), 35),
(('the', 'poor', 'p'), 35),
(('about', 'the', '2016'), 35),
(('p', 'find', 'out'), 35),
(('know', 'about', 'the'), 35),
(('out', 'what', 'you'), 35),
(('politics', 'news', 'updates'), 35),
(('race', 'today', 'and'), 35),
(('get', 'politics', 'news'), 35),
(('people', 'who', 'are'), 35),
(('and', 'get', 'politics'), 35),
(('first', 'draft', 'newsletter'), 35),
(('today', 'and', 'get'), 35),
(('in', 'the', 'city'), 35),
(('be', 'able', 'to'), 35),
(('new', 'york', 'p'), 35),
(('said', 'in', 'an'), 34),
(('mr', 'de', "blasio's"), 34),
(('over', 'the', 'past'), 34),
(('members', 'of', 'the'), 34),
(('new', 'york', 'state'), 34),
(('for', 'the', 'first'), 34),
(('news', 'updates', 'via'), 34),
(('mrs', 'clinton', 'said'), 34),
(('via', 'facebook', 'twitter'), 34),
(('updates', 'via', 'facebook'), 34),
(('are', 'going', 'to'), 34),
(('facebook', 'twitter', 'and'), 34),
(('in', 'the', 'state'), 33),
(('p', 'new', 'york'), 33),
(('of', 'income', 'inequality'), 33),
(('sanders', 'of', 'vermont'), 33),
(('as', 'part', 'of'), 33),
(('this', 'is', 'the'), 33),
(('the', 'top', '1'), 32),
(('is', 'one', 'of'), 32),
(('that', 'he', 'would'), 32),
(('in', 'the', 'first'), 32),
(('is', 'that', 'the'), 32),
(('of', 'the', 'american'), 32),
(('are', 'more', 'likely'), 32),
(('the', 'democratic', 'party'), 32),
(('up', 'for', 'our'), 32),
(('bernie', 'sanders', 'of'), 32),
(('the', 'affordable', 'care'), 32),
(('the', 'poor', 'and'), 31),
(('follow', 'us', 'on'), 31),
(('us', 'on', 'facebook'), 31),
(('of', 'the', 'story'), 31),
(('a', 'professor', 'of'), 31),
(('in', 'terms', 'of'), 31),
(('has', 'been', 'a'), 31),
(('a', 'group', 'of'), 31),
(('across', 'the', 'country'), 31),
(('top', '1', 'percent'), 31),
(('affordable', 'care', 'act'), 31),
(('a', 'number', 'of'), 31),
(('it', 'was', 'a'), 31),
(('in', 'other', 'words'), 31),
(('the', 'center', 'for'), 30),
(('than', 'the', 'one'), 30),
(('because', 'of', 'the'), 30),
(('the', 'kind', 'of'), 30),
(('is', 'a', 'more'), 30),
(('that', 'he', 'was'), 30),
(('it', 'comes', 'to'), 30),
(('is', 'expected', 'to'), 30),
(('rich', 'and', 'poor'), 30),
(('the', 'world', 'p'), 30),
(('men', 'and', 'women'), 30),
(('at', 'the', 'end'), 30),
(('all', 'of', 'the'), 30),
(('united', 'states', 'p'), 30),
(('of', 'the', 'country'), 30),
(('when', 'it', 'comes'), 30),
(('you', 'have', 'to'), 30),
(('as', 'a', 'result'), 30),
(('chairman', 'of', 'the'), 29),
(('more', 'than', 'half'), 29),
(('the', 'rich', 'and'), 29),
(('the', 'supreme', 'court'), 29),
(('of', 'the', "world's"), 29),
(('said', 'p', 'the'), 29),
(('000', 'a', 'year'), 29),
(('going', 'to', 'be'), 29),
(('have', 'to', 'be'), 29),
(('u', 'b', 'i'), 28),
(('increase', 'in', 'the'), 28),
(('he', 'did', 'not'), 28),
(('g', 'o', 'p'), 28),
(('there', 'was', 'a'), 28),
(('to', 'raise', 'the'), 28),
(('but', 'it', 'is'), 28),
(('need', 'to', 'be'), 28),
(('de', 'blasio', 'said'), 28),
(('this', 'is', 'not'), 28),
(('p', 'it', 'was'), 27),
(('the', 'age', 'of'), 27),
(('less', 'likely', 'to'), 27),
(('the', 'lives', 'of'), 27),
(('more', 'complete', 'version'), 27),
(('part', 'of', 'a'), 27),
(('in', 'the', 'middle'), 27),
(('appeared', 'in', 'print'), 27),
(('a', 'more', 'complete'), 27),
(('that', 'appeared', 'in'), 27),
(('p', 'applause', 'p'), 27),
(('over', 'the', 'next'), 27),
(('complete', 'version', 'of'), 27),
(('a', 'lack', 'of'), 27),
(('the', 'tax', 'code'), 27),
(('the', 'story', 'than'), 27),
(('a', 'time', 'when'), 27),
(('one', 'that', 'appeared'), 27),
(('the', 'gap', 'between'), 27),
(('in', 'print', 'p'), 27),
(('story', 'than', 'the'), 27),
(('in', 'which', 'the'), 27),
(('the', 'department', 'of'), 27),
(('at', 'the', 'time'), 27),
(('he', 'was', 'a'), 26),
(('said', 'he', 'was'), 26),
(('provides', 'news', 'analysis'), 26),
(('policy', 'and', 'everyday'), 26),
(('about', 'politics', 'policy'), 26),
(('a', 'series', 'of'), 26),
(('everyday', 'life', 'follow'), 26),
(('and', 'graphics', 'about'), 26),
(('and', 'twitter', 'sign'), 26),
(('united', 'states', 'and'), 26),
(('in', 'favor', 'of'), 26),
(('to', 'have', 'a'), 26),
(('p', 'the', 'upshot'), 26),
(('number', 'of', 'people'), 26),
(('the', 'center', 'of'), 26),
(('analysis', 'and', 'graphics'), 26),
(('p', 'there', 'is'), 26),
(('as', 'long', 'as'), 26),
(('and', 'everyday', 'life'), 26),
(('the', 'upshot', 'provides'), 26),
(('twitter', 'sign', 'up'), 26),
(('in', 'the', '1970s'), 26),
(('life', 'follow', 'us'), 26),
(('news', 'analysis', 'and'), 26),
(('upshot', 'provides', 'news'), 26),
(('graphics', 'about', 'politics'), 26),
(('inequality', 'in', 'the'), 26),
(('politics', 'policy', 'and'), 26),
(('the', 'state', 'of'), 26),
(('of', 'the', 'state'), 26),
(('that', 'he', 'had'), 26),
(('to', 'get', 'the'), 26),
(('at', 'the', 'center'), 25),
(('the', 'people', 'who'), 25),
(('in', 'the', 'same'), 25),
(('it', 'would', 'be'), 25),
(('raising', 'the', 'minimum'), 25),
(('those', 'who', 'are'), 25),
(('said', 'it', 'was'), 25),
(('a', 'couple', 'of'), 25),
(('the', 'idea', 'that'), 25),
(('said', 'that', 'the'), 25),
(('p', 'we', 'are'), 25),
(('than', 'half', 'of'), 25),
(('on', 'the', 'other'), 25),
(('in', 'a', 'statement'), 25),
(('people', 'in', 'the'), 25),
(('of', 'the', 'people'), 25),
(('he', 'said', 'he'), 25),
(('the', 'importance', 'of'), 24),
(('climate', 'change', 'and'), 24),
(('the', 'president', 'of'), 24),
(('of', 'poverty', 'and'), 24),
(('it', 'has', 'been'), 24),
(('15', 'an', 'hour'), 24),
(('the', 'problem', 'is'), 24),
(('income', 'inequality', 'and'), 24),
(('of', 'people', 'who'), 24),
(('p', 'but', 'mr'), 24),
(('a', 'result', 'of'), 24),
(('of', 'the', "nation's"), 24),
(('1', 'percent', 'of'), 24),
(('a', 'way', 'to'), 24),
(('social', 'and', 'economic'), 24),
(('to', 'ensure', 'that'), 24),
(('earned-income', 'tax', 'credit'), 24),
(('not', 'going', 'to'), 24),
(('p', 'the', 'writer'), 23),
(('the', 'rise', 'of'), 23),
(('attention', 'to', 'the'), 23),
(('said', 'p', 'mr'), 23),
(('p', 'g', 'g'), 23),
(('a', 'majority', 'of'), 23),
(('of', 'the', "country's"), 23),
(('an', 'increase', 'in'), 23),
(('criminal', 'justice', 'system'), 23),
(('to', 'the', 'left'), 23),
(('university', 'of', 'california'), 23),
(('it', 'is', 'the'), 23),
(('back', 'to', 'the'), 23),
(('in', 'addition', 'to'), 23),
(('rest', 'of', 'the'), 23),
(('is', 'going', 'to'), 23),
(('to', 'change', 'the'), 23),
(('the', 'poverty', 'line'), 23),
(('people', 'who', 'have'), 23),
(('an', 'effort', 'to'), 23),
(('for', 'our', 'newsletter'), 22),
(('de', 'blasio', 'and'), 22),
(('at', 'the', 'bottom'), 22),
(('the', 'u', 's'), 22),
(('that', 'it', 'would'), 22),
(('to', 'make', 'sure'), 22),
(('of', 'the', 'income'), 22),
(('would', 'be', 'a'), 22),
(('i', 'think', 'the'), 22),
(('the', 'republican', 'party'), 22),
(('those', 'at', 'the'), 22),
(('p', 'i', 'was'), 22),
(('for', 'more', 'than'), 22),
(('it', 'was', 'the'), 22),
(('p', "it's", 'not'), 21),
(('middle', 'class', 'and'), 21),
(('to', 'focus', 'on'), 21),
(('the', 'issue', 'of'), 21),
(('to', 'pay', 'for'), 21),
(('parts', 'of', 'the'), 21),
(('of', 'the', 'federal'), 21),
(('that', 'there', 'is'), 21),
(('that', 'mrs', 'clinton'), 21),
(('was', 'one', 'of'), 21),
(('member', 'of', 'the'), 21),
(('a', 'few', 'years'), 21),
(('focus', 'on', 'the'), 21),
(('p', 'but', 'in'), 21),
(('the', 'need', 'to'), 21),
(('the', 'american', 'dream'), 21),
(('the', 'g', 'o'), 21),
(('of', 'more', 'than'), 21),
(('to', 'believe', 'that'), 21),
(('it', 'was', 'not'), 21),
(('is', 'not', 'the'), 21),
(('p', 'the', 'study'), 21),
(('i', 'want', 'to'), 21),
(('that', 'it', 'was'), 21),
(('i', "don't", 'think'), 21),
(('the', 'islamic', 'state'), 21),
(('minimum', 'wage', 'to'), 21),
(('the', 'most', 'important'), 21),
(('the', 'need', 'for'), 21),
(('for', 'the', 'democratic'), 21),
(('to', 'do', 'with'), 21),
(('to', 'make', 'the'), 21),
(('said', 'he', 'had'), 20),
(('the', 'author', 'of'), 20),
(('as', 'well', 'p'), 20),
(('to', 'those', 'who'), 20),
(('p', 'the', 'report'), 20),
(('vast', 'majority', 'of'), 20),
(('the', 'american', 'economy'), 20),
(("we're", 'going', 'to'), 20),
(('to', 'new', 'york'), 20),
(('we', 'have', 'a'), 20),
(('for', 'those', 'who'), 20),
(('in', 'the', 'new'), 20),
(('chief', 'executive', 'of'), 20),
(('in', 'south', 'carolina'), 20),
(('and', 'the', 'poor'), 20),
(('many', 'of', 'them'), 20),
(('the', 'justice', 'department'), 20),
(('in', 'the', 'early'), 20),
(('the', 'earned-income', 'tax'), 20),
(('used', 'to', 'be'), 20),
(('executive', 'director', 'of'), 20),
(('mrs', 'clinton', 'has'), 20),
(('the', 'obama', 'administration'), 20),
(('to', 'make', 'a'), 20),
(('the', 'country', 'p'), 20),
(('the', 'lack', 'of'), 20),
(('new', 'york', 'and'), 20),
(('federal', 'minimum', 'wage'), 20),
(('of', 'the', 'city'), 20),
(('he', 'said', 'the'), 20),
(('the', 'future', 'of'), 19),
(('at', 'the', 'very'), 19),
(('in', 'response', 'to'), 19),
(('the', 'role', 'of'), 19),
(('is', 'likely', 'to'), 19),
(('the', 'effects', 'of'), 19),
(('to', 'live', 'in'), 19),
(('invite', 'you', 'to'), 19),
(('that', 'we', 'are'), 19),
(('on', 'the', 'streets'), 19),
(('united', 'states', 'is'), 19),
(('that', 'it', 'is'), 19),
(('p', 'in', 'his'), 19),
(('sign', 'up', 'here'), 19),
(('are', 'in', 'the'), 19),
(('newsletter', 'p', 'this'), 19),
(('who', 'want', 'to'), 19),
(('the', 'face', 'of'), 19),
(('has', 'not', 'been'), 19),
(('and', 'it', 'is'), 19),
(('the', 'country', 'and'), 19),
(('health', 'care', 'and'), 19),
(('gross', 'domestic', 'product'), 19),
(('de', 'blasio', 'has'), 19),
(('mr', 'obama', 'said'), 19),
(('there', 'will', 'be'), 19),
(('said', 'he', 'would'), 19),
(('the', 'federal', 'minimum'), 19),
(('of', 'the', 'national'), 19),
(('that', 'is', 'the'), 19),
(('of', 'the', 'middle'), 19),
(('in', 'the', 'bottom'), 19),
(('i', "don't", 'know'), 19),
(('and', 'that', 'the'), 19),
(('60', 'percent', 'of'), 18),
(('secretary', 'of', 'state'), 18),
(('the', 'criminal', 'justice'), 18),
(('by', 'the', 'end'), 18),
(('he', 'said', 'it'), 18),
(('as', 'a', 'whole'), 18),
(('for', 'the', 'new'), 18),
(('a', 'third', 'of'), 18),
(('to', 'your', 'inbox'), 18),
(('the', 'mayor', 'said'), 18),
(('taxes', 'on', 'the'), 18),
(('to', 'go', 'to'), 18),
(('mr', 'sanders', 'has'), 18),
(('at', 'least', 'one'), 18),
(('the', 'right', 'to'), 18),
(('of', 'millions', 'of'), 18),
(('our', 'newsletter', 'p'), 18),
(('a', 'chance', 'to'), 18),
(('the', 'distribution', 'of'), 18),
(('the', 'value', 'of'), 18),
(('e', 'c', 'd'), 18),
(('in', 'order', 'to'), 18),
(('george', 'w', 'bush'), 18),
(('you', 'want', 'to'), 18),
(('gap', 'between', 'the'), 18),
(('of', 'thousands', 'of'), 18),
(('the', 'european', 'union'), 18),
(('the', 'director', 'of'), 18),
(('the', 'work', 'force'), 18),
(('on', 'the', 'street'), 18),
(('andrew', 'm', 'cuomo'), 18),
(('when', 'he', 'was'), 18),
(('to', 'address', 'the'), 18),
(('of', 'income', 'and'), 18),
(('to', 'say', 'that'), 18),
(('of', 'the', 'house'), 18),
(('in', 'ways', 'that'), 18),
(('the', 'best', 'way'), 18),
(('are', 'likely', 'to'), 18),
(('o', 'e', 'c'), 18),
(('to', 'be', 'the'), 18),
(('a', 'sense', 'of'), 18),
(('the', 'children', 'of'), 17),
(('it', 'will', 'be'), 17),
(('the', 'mayor', 'of'), 17),
(('is', 'part', 'of'), 17),
(('p', 'we', 'have'), 17),
(('income', 'inequality', 'is'), 17),
(('up', 'in', 'the'), 17),
(('that', 'they', 'are'), 17),
(('30', 'percent', 'of'), 17),
(('people', 'out', 'of'), 17),
(('delivered', 'to', 'your'), 17),
(('take', 'advantage', 'of'), 17),
(('of', 'climate', 'change'), 17),
(('around', 'the', 'country'), 17),
(('she', 'said', 'she'), 17),
(('the', 'writer', 'is'), 17),
(('people', 'who', 'were'), 17),
(('the', 'first', 'place'), 17),
(('up', 'here', 'p'), 17),
(('in', 'recent', 'decades'), 17),
(('it', 'is', 'also'), 17),
(('he', 'said', 'that'), 17),
(('p', 'and', 'yet'), 17),
(('a', 'college', 'degree'), 17),
(('grew', 'up', 'in'), 17),
(('gov', 'andrew', 'm'), 17),
(('p', 'the', 'new'), 17),
(('the', 'top', '10'), 17),
(('p', 'one', 'of'), 17),
(('would', 'have', 'to'), 17),
(('mrs', 'clinton', 'and'), 17),
(('he', 'is', 'a'), 17),
(('percent', 'of', 'americans'), 17),
(('p', 'in', 'an'), 17),
(('mr', 'sanders', 'who'), 17),
(('of', 'the', 'center'), 16),
(('the', 'effect', 'of'), 16),
(('of', 'the', 'economic'), 16),
(('the', 'middle', 'east'), 16),
(('income', 'inequality', 'p'), 16),
(('to', 'help', 'the'), 16),
(('the', 'size', 'of'), 16),
(('the', 'top', 'of'), 16),
(('income', 'inequality', 'in'), 16),
(('over', 'the', 'years'), 16),
(('p', 'but', 'he'), 16),
(('p', 'in', 'addition'), 16),
(('to', 'be', 'in'), 16),
(('p', 'i', "don't"), 16),
(('he', 'said', 'in'), 16),
(('who', 'grew', 'up'), 16),
(('p', 'while', 'the'), 16),
(('look', 'at', 'the'), 16),
(('at', 'odds', 'with'), 16),
(('in', 'the', 'top'), 16),
(('side', 'of', 'the'), 16),
(('a', 'high', 'school'), 16),
(('they', 'do', 'not'), 16),
(('tend', 'to', 'be'), 16),
(('of', 'the', 'poorest'), 16),
(('he', 'has', 'been'), 16),
(('10', 'percent', 'of'), 16),
(('of', 'the', 'democratic'), 16),
(('a', 'variety', 'of'), 16),
(('to', 'more', 'than'), 16),
(('and', 'those', 'who'), 16),
(('in', 'a', 'speech'), 16),
(('all', 'of', 'us'), 16),
(('middle', 'class', 'p'), 16),
(('an', 'economist', 'at'), 16),
(('bottom', '90', 'percent'), 16),
(('to', 'reduce', 'the'), 16),
(('raise', 'the', 'minimum'), 16),
(('in', 'an', 'email'), 16),
(('on', 'the', 'issue'), 16),
(('may', 'not', 'be'), 16),
(('pew', 'research', 'center'), 16),
(('the', 'share', 'of'), 16),
(('of', 'poverty', 'p'), 16),
(('leader', 'of', 'the'), 16),
(('we', 'are', 'going'), 16),
(('on', 'climate', 'change'), 16),
(('the', 'great', 'recession'), 16),
(('that', 'we', 'have'), 16),
(('of', 'the', "city's"), 16),
(('the', 'impact', 'of'), 16),
(('the', 'bottom', '90'), 16),
(('for', 'the', 'homeless'), 16),
(('they', 'want', 'to'), 16),
(('of', 'all', 'the'), 16),
(('to', 'see', 'the'), 16),
(('in', 'the', 'face'), 16),
(('percent', 'of', 'all'), 16),
(('the', 'power', 'of'), 16),
(('to', 'the', 'poor'), 16),
(('in', 'recent', 'months'), 16),
(('to', 'talk', 'about'), 16),
(('a', 'member', 'of'), 16),
(('the', 'middle', 'of'), 16),
(('the', 'financial', 'crisis'), 16),
(('to', 'deal', 'with'), 15),
(('p', "it's", 'a'), 15),
(('professor', 'at', 'the'), 15),
(('those', 'in', 'the'), 15),
(('to', 'build', 'a'), 15),
(("don't", 'want', 'to'), 15),
(('this', 'kind', 'of'), 15),
(('that', 'has', 'been'), 15),
(('according', 'to', 'an'), 15),
(('the', 'los', 'angeles'), 15),
(('between', 'rich', 'and'), 15),
(('by', 'the', 'time'), 15),
(('even', 'if', 'they'), 15),
(('black', 'lives', 'matter'), 15),
(('income', 'and', 'wealth'), 15),
(('and', 'the', 'united'), 15),
(('at', 'the', 'new'), 15),
(('on', 'the', 'right'), 15),
(('p', 'according', 'to'), 15),
(('the', 'last', 'two'), 15),
(('on', 'behalf', 'of'), 15),
(('the', 'developing', 'world'), 15),
(('social', 'security', 'and'), 15),
(('mr', 'passos', 'coelho'), 15),
(('a', 'lot', 'more'), 15),
(('going', 'to', 'do'), 15),
(('and', 'we', 'have'), 15),
(('and', 'health', 'care'), 15),
(('to', 'try', 'to'), 15),
(('along', 'with', 'the'), 15),
(('on', 'the', 'left'), 15),
(('that', 'there', 'are'), 15),
(('they', 'are', 'not'), 15),
(('on', 'income', 'inequality'), 15),
(('for', 'all', 'the'), 15),
(('and', 'in', 'the'), 15),
(('that', 'mr', 'sanders'), 15),
(('mrs', 'clinton', 'is'), 15),
(('found', 'that', 'the'), 15),
(('a', 'professor', 'at'), 15),
(('president', 'bill', 'clinton'), 15),
(('p', 'among', 'the'), 15),
(('of', 'homeless', 'services'), 15),
(('p', 'as', 'a'), 15),
(('the', 'rate', 'of'), 15),
(('in', 'new', 'hampshire'), 15),
(('went', 'on', 'to'), 15),
(('top', 'of', 'the'), 15),
(('p', 'on', 'the'), 15),
(('united', 'states', 'has'), 15),
(('make', 'sure', 'that'), 15),
(('who', 'had', 'been'), 15),
(('mr', 'clinton', 'said'), 15),
(('in', 'the', 'house'), 15),
(('at', 'the', 'state'), 15),
(('the', 'american', 'people'), 15),
(('to', 'do', 'so'), 15),
(('seem', 'to', 'be'), 15),
(('by', 'the', 'united'), 15),
(('war', 'on', 'poverty'), 15),
(('your', 'inbox', 'every'), 15),
(('less', 'than', 'a'), 15),
(('poverty', 'and', 'inequality'), 15),
(('the', 'amount', 'of'), 15),
(('m', 'i', 't'), 15),
(('based', 'on', 'the'), 14),
(('united', 'states', 'in'), 14),
(('in', 'the', '1990s'), 14),
(('the', 'other', 'hand'), 14),
(('not', 'just', 'a'), 14),
(('best', 'way', 'to'), 14),
(('who', 'have', 'been'), 14),
(('in', 'the', 'next'), 14),
(('the', 'house', 'of'), 14),
(('and', 'income', 'inequality'), 14),
(('for', 'american', 'progress'), 14),
(('a', 'part', 'of'), 14),
(('the', '1', 'percent'), 14),
(('there', 'was', 'no'), 14),
(('share', 'of', 'the'), 14),
(('mr', 'obama', 'has'), 14),
(('p', 'i', 'invite'), 14),
(('department', 'of', 'homeless'), 14),
(('top', '10', 'percent'), 14),
(('a', 'range', 'of'), 14),
(('and', 'around', 'the'), 14),
(('as', 'a', 'way'), 14),
(('p', 'he', 'was'), 14),
(('do', 'more', 'to'), 14),
(('a', 'matter', 'of'), 14),
(('for', 'example', 'the'), 14),
(('for', 'them', 'to'), 14),
(('seems', 'to', 'be'), 14),
(('needs', 'to', 'be'), 14),
(('the', 'benefits', 'of'), 14),
(('the', 'work', 'of'), 14),
(('the', 'percentage', 'of'), 14),
(('the', 'price', 'of'), 14),
(('sustainable', 'development', 'goals'), 14),
(('40', 'percent', 'of'), 14),
(('the', 'world', 'and'), 14),
(('was', 'the', 'first'), 14),
(('live', 'in', 'poverty'), 14),
(('a', 'quarter', 'of'), 14),
(('to', 'create', 'a'), 14),
(('for', 'a', 'new'), 14),
(('can', 'be', 'done'), 14),
(('who', 'has', 'been'), 14),
(('this', 'is', 'an'), 14),
(('lot', 'of', 'people'), 14),
(('the', 'bottom', 'of'), 14),
(('are', 'less', 'likely'), 14),
(('donald', 'j', 'trump'), 14),
(('p', 'in', 'other'), 14),
(('the', 'vast', 'majority'), 14),
(('mrs', 'clinton', 'will'), 14),
(('there', 'has', 'been'), 14),
(('of', 'the', 'civil'), 14),
(('p', 'as', 'the'), 14),
(('than', 'any', 'other'), 14),
(('going', 'to', 'have'), 14),
(('been', 'able', 'to'), 14),
(('the', 'case', 'that'), 14),
(('to', 'help', 'them'), 14),
(('rich', 'and', 'the'), 14),
(('of', 'the', 'global'), 14),
(('in', 'the', 'american'), 14),
(('i', 'invite', 'you'), 14),
(('20', 'percent', 'of'), 14),
(("it's", 'not', 'just'), 14),
(('the', '20th', 'century'), 14),
(('will', 'be', 'the'), 14),
(('has', 'long', 'been'), 14),
(('senator', 'elizabeth', 'warren'), 14),
(('up', 'with', 'the'), 14),
(('there', 'are', 'many'), 14),
(('more', 'than', 'the'), 14),
(('to', 'do', 'something'), 14),
(('in', 'san', 'francisco'), 14),
(('poverty', 'in', 'the'), 14),
(('of', 'people', 'in'), 14),
(('center', 'for', 'american'), 14),
(('associated', 'with', 'the'), 14),
(('but', 'there', 'is'), 14),
(('is', 'not', 'just'), 14),
(('p', 'he', 'added'), 14),
(('for', 'mrs', 'clinton'), 14),
(('p', 'if', 'you'), 14),
(('she', 'said', 'i'), 14),
(('p', 'he', 'has'), 14),
(('and', 'for', 'the'), 13),
(('p', 'the', 'mayor'), 13),
(('is', 'the', 'same'), 13),
(('who', 'are', 'in'), 13),
(('to', 'do', 'more'), 13),
(('the', 'washington', 'post'), 13),
(('result', 'of', 'the'), 13),
(('a', 'kind', 'of'), 13),
(('the', 'economic', 'ladder'), 13),
(('as', 'it', 'is'), 13),
(('in', 'the', 'future'), 13),
(('head', 'of', 'the'), 13),
(('economic', 'and', 'political'), 13),
(('believe', 'that', 'the'), 13),
(('in', 'the', 'way'), 13),
(('and', 'his', 'wife'), 13),
(('in', 'some', 'cases'), 13),
(('center', 'of', 'the'), 13),
(('have', 'to', 'do'), 13),
(('he', 'said', 'i'), 13),
(('of', 'those', 'who'), 13),
(('the', 'civil', 'rights'), 13),
(('state', 'of', 'the'), 13),
(('the', 'story', 'of'), 13),
(('michael', 'r', 'bloomberg'), 13),
(('that', 'would', 'be'), 13),
(('that', 'we', 'can'), 13),
(('economist', 'at', 'the'), 13),
(('women', 'in', 'the'), 13),
(('to', 'work', 'with'), 13),
(('of', 'buenos', 'aires'), 13),
(('but', 'there', 'are'), 13),
(('minimum', 'wage', 'and'), 13),
(('the', 'wealthy', 'and'), 13),
(('to', 'have', 'the'), 13),
(('the', 'increase', 'in'), 13),
(('i', 'had', 'to'), 13),
(('in', 'which', 'he'), 13),
(('in', 'a', 'recent'), 13),
(('but', 'in', 'the'), 13),
(('the', 'democratic', 'presidential'), 13),
(('the', 'level', 'of'), 13),
(('just', 'a', 'few'), 13),
(('those', 'who', 'have'), 13),
(('p', 'of', 'course'), 13),
(('me', 'on', 'twitter'), 13),
(('in', 'the', 'democratic'), 13),
(('p', 'that', 'is'), 13),
(('response', 'to', 'the'), 13),
(('mr', 'sanders', 'was'), 13),
(('in', 'the', 'region'), 13),
(('more', 'and', 'more'), 13),
(('mr', 'sanders', 'is'), 13),
(('the', 'wall', 'street'), 13),
(('of', 'the', 'first'), 13),
(('p', 'after', 'the'), 13),
(('was', 'in', 'the'), 13),
(('of', 'the', 'same'), 13),
(('academy', 'of', 'sciences'), 13),
(('a', 'spokesman', 'for'), 13),
(('of', 'housing', 'and'), 13),
(('the', 'result', 'of'), 13),
(('thousands', 'of', 'people'), 13),
(('on', 'the', 'poor'), 13),
(('to', 'be', 'more'), 13),
(('mr', 'sanders', 'and'), 13),
(('in', 'their', 'own'), 13),
(('g', 'd', 'p'), 13),
(('the', 'most', 'recent'), 13),
(('even', 'if', 'the'), 13),
(('on', 'the', 'rich'), 13),
(('that', 'of', 'the'), 13),
(('say', 'they', 'are'), 13),
(('the', 'federal', 'reserve'), 13),
(('the', 'trans-pacific', 'partnership'), 13),
(('earlier', 'this', 'year'), 13),
(('to', 'find', 'a'), 13),
(('of', 'health', 'and'), 13),
(('follow', 'me', 'on'), 13),
(('the', 'working', 'poor'), 13),
(('about', 'how', 'to'), 13),
(('p', 'mr', 'obama'), 13),
(('said', 'she', 'was'), 13),
(('every', 'day', 'with'), 13),
(('said', 'p', 'in'), 13),
(('of', 'the', 'few'), 13),
(('when', 'she', 'was'), 13),
(('the', 'heart', 'of'), 13),
(('to', 'support', 'the'), 13),
(('on', 'how', 'to'), 13),
(('p', 'for', 'example'), 12),
(('that', 'she', 'was'), 12),
(('the', 'o', 'e'), 12),
(('to', 'a', 'new'), 12),
(('it', 'to', 'the'), 12),
(('and', 'of', 'course'), 12),
(('income', 'inequality', 'has'), 12),
(('they', 'have', 'to'), 12),
(('to', 'the', 'right'), 12),
(('want', 'to', 'do'), 12),
(('a', 'bit', 'of'), 12),
(('to', 'take', 'a'), 12),
(('for', 'the', 'next'), 12),
(('c', 'e', 'o'), 12),
(('p', 'many', 'of'), 12),
(('of', 'dollars', 'in'), 12),
(('fellow', 'at', 'the'), 12),
(('of', 'a', 'new'), 12),
(('of', 'people', 'living'), 12),
(('the', 'head', 'of'), 12),
(('in', 'the', 'nation'), 12),
(('p', 'e', 'a'), 12),
(('would', 'not', 'be'), 12),
(('of', 'the', 'nation'), 12),
(('people', 'at', 'the'), 12),
(('the', 'history', 'of'), 12),
(('the', 'consequences', 'of'), 12),
(('the', 'catholic', 'church'), 12),
(('that', 'have', 'been'), 12),
(('on', 'the', 'ground'), 12),
(('the', 'same', 'as'), 12),
(('the', 'death', 'of'), 12),
(('a', 'year', 'in'), 12),
(('is', 'trying', 'to'), 12),
(('p', 'i', 'am'), 12),
(('are', 'on', 'the'), 12),
(('said', 'she', 'had'), 12),
(('gap', 'between', 'rich'), 12),
(('that', 'this', 'is'), 12),
(('occupy', 'wall', 'street'), 12),
(('no', 'matter', 'how'), 12),
(('has', 'become', 'a'), 12),
(('minimum', 'wage', 'p'), 12),
(('of', 'our', 'time'), 12),
(('inequality', 'p', 'the'), 12),
(('in', 'the', 'senate'), 12),
(('is', 'more', 'than'), 12),
(('below', 'the', 'poverty'), 12),
(('out', 'to', 'be'), 12),
(('social', 'safety', 'net'), 12),
(('i', 'believe', 'that'), 12),
(('because', 'of', 'a'), 12),
(('i', "don't", 'want'), 12),
(('by', 'the', 'new'), 12),
(('and', 'they', 'are'), 12),
(('in', 'the', 'south'), 12),
(('p', 'during', 'the'), 12),
(('more', 'likely', 'than'), 12),
(('p', 'on', 'monday'), 12),
(('climate', 'change', 'p'), 12),
(('the', 'institute', 'for'), 12),
(('one', 'of', 'his'), 12),
(('we', 'have', 'been'), 12),
(('new', 'york', 'today'), 12),
(('of', 'the', 'work'), 12),
(('be', 'in', 'the'), 12),
(('more', 'than', '100'), 12),
(('the', 'general', 'election'), 12),
(('to', 'be', 'done'), 12),
(('do', 'not', 'have'), 12),
(('of', 'homeless', 'people'), 12),
(('they', 'have', 'been'), 12),
(('american', 'enterprise', 'institute'), 12),
(('changes', 'in', 'the'), 12),
(('in', 'latin', 'america'), 12),
(('if', 'you', 'want'), 12),
(('day', 'with', 'the'), 12),
(('the', 'war', 'on'), 12),
(('not', 'just', 'the'), 12),
(('he', 'has', 'a'), 12),
(('millions', 'of', 'dollars'), 12),
(('new', 'york', 'university'), 12),
(('but', 'they', 'are'), 12),
(('he', 'would', 'be'), 12),
(('between', 'the', 'rich'), 12),
(('the', 'republican', 'presidential'), 12),
(('to', 'reduce', 'inequality'), 12),
(('in', 'the', 'right'), 12),
(('the', '21st', 'century'), 12),
(('senator', 'ted', 'cruz'), 12),
(('p', 'mr', 'clinton'), 12),
(('to', 'the', 'new'), 12),
(('the', 'subject', 'of'), 12),
(('and', 'economic', 'inequality'), 12),
(('in', 'one', 'of'), 12),
(('the', 'era', 'of'), 12),
(('p', 'some', 'of'), 12),
(('early', 'childhood', 'education'), 12),
(('of', 'the', 'law'), 12),
(('that', 'he', 'has'), 12),
(('to', 'help', 'people'), 12),
(('few', 'years', 'ago'), 12),
(('the', 'working', 'class'), 12),
(('p', 'for', 'the'), 12),
(('such', 'as', 'the'), 12),
(('to', 'respond', 'to'), 12),
(('the', 'ability', 'to'), 12),
(('de', 'blasio', 'is'), 12),
(('was', 'born', 'in'), 12),
(('in', 'the', 'bronx'), 12),
(('p', 'if', 'we'), 12),
(('have', 'been', 'a'), 12),
(('mrs', 'clinton', 'was'), 12),
(('of', 'economic', 'growth'), 12),
(('the', 'fair', 'housing'), 12),
(('the', 'quality', 'of'), 12),
(('n', 'y', 'p'), 12),
(('wall', 'street', 'journal'), 12),
(('that', 'they', 'have'), 12),
(('at', 'one', 'point'), 12),
(('at', 'the', 'federal'), 11),
(('to', 'mrs', 'clinton'), 11),
(('workers', 'in', 'the'), 11),
(('role', 'in', 'the'), 11),
(('grow', 'up', 'in'), 11),
(('about', 'half', 'of'), 11),
(('the', "today's", 'headlines'), 11),
(('his', 'or', 'her'), 11),
(('the', 'federal', 'poverty'), 11),
(('to', 'do', 'p'), 11),
(('the', 'chance', 'to'), 11),
(('the', 'poverty', 'rate'), 11),
(('years', 'p', 'the'), 11),
(('the', 'chief', 'executive'), 11),
(('p', 'n', 'f'), 11),
(('the', 'world', 'has'), 11),
(('in', 'front', 'of'), 11),
(('p', 'but', 'as'), 11),
(('p', 'the', 'most'), 11),
(('mr', 'sanders', 'also'), 11),
(('the', 'idea', 'of'), 11),
(('p', 'over', 'the'), 11),
(('in', 'a', 'way'), 11),
(('they', 'have', 'a'), 11),
(('in', 'exchange', 'for'), 11),
(('p', 'and', 'the'), 11),
(('with', 'the', "today's"), 11),
(('the', 'wages', 'of'), 11),
(('years', 'ago', 'the'), 11),
(('the', 'government', 'should'), 11),
(('0', '1', 'percent'), 11),
(('get', 'news', 'and'), 11),
(('housing', 'and', 'urban'), 11),
(('there', 'have', 'been'), 11),
(('women', 'who', 'are'), 11),
(('of', 'it', 'p'), 11),
(('hundreds', 'of', 'millions'), 11),
(('the', 'executive', 'director'), 11),
(('a', 'right', 'to'), 11),
(('more', 'than', 'three'), 11),
(('the', 'health', 'care'), 11),
(('p', 'get', 'news'), 11),
(('has', 'been', 'the'), 11),
(('the', 'bloomberg', 'administration'), 11),
(('is', 'not', 'enough'), 11),
(('not', 'the', 'only'), 11),
(('the', 'decline', 'of'), 11),
(('on', 'twitter', 'p'), 11),
(('an', 'average', 'of'), 11),
(('organization', 'for', 'economic'), 11),
(('capital', 'in', 'the'), 11),
(('of', 'the', 'church'), 11),
(('fight', 'for', '15'), 11),
(('we', 'can', 'do'), 11),
(('a', 'general', 'election'), 11),
(('go', 'back', 'to'), 11),
(('say', 'that', 'the'), 11),
...]
In [237]:
poverty_vocab = set(pov_trigram.keys())
news_vocab = set(news_trigram.keys())
total_vocab = list(poverty_vocab.union(news_vocab))
In [246]:
keyness_data = []
for item in total_vocab:
nf=news_trigram[item]
pf=pov_trigram[item]
if nf<5 or pf<5:
continue
keyness=calculate_keyness(pf, poverty_tokens, nf, news_tokens)
keyness_data.append({'word':'_'.join(item), 'news': nf,
'poverty': pf, 'LL': keyness}
)
In [242]:
len(total_vocab)
Out[242]:
951606
In [247]:
keyness_df=pd.DataFrame(keyness_data, columns = ('word','poverty','news','LL'))
keyness_df.sort_values('LL', ascending=False)
Out[247]:
word
poverty
news
LL
1093
mr_de_blasio
144
42
111.106498
2178
the_minimum_wage
104
19
106.493954
1011
p_to_the
62
9
70.281401
411
senator_bernie_sanders
57
7
68.779227
1257
more_likely_to
71
17
62.648658
2509
sign_up_for
127
63
57.904751
1423
the_middle_class
70
19
56.881528
817
mr_sanders_said
58
12
55.706806
2163
on_facebook_and
121
61
54.004843
1449
p_mr_sanders
49
10
47.412156
671
new_york_city
103
54
43.573906
1051
facebook_and_twitter
107
59
42.082258
187
of_the_poor
38
6
41.598845
856
for_the_poor
38
6
41.598845
2329
up_for_our
32
5
35.187939
1999
need_to_know
37
8
34.679883
1089
today_and_get
35
7
34.231837
1697
presidential_race_today
35
7
34.231837
402
of_the_population
35
7
34.231837
1763
draft_newsletter_p
35
7
34.231837
1064
and_get_politics
35
7
34.231837
2042
get_politics_news
35
7
34.231837
1896
p_find_out
35
7
34.231837
651
out_what_you
35
7
34.231837
1398
twitter_and_the
35
7
34.231837
661
politics_news_updates
35
7
34.231837
1954
race_today_and
35
7
34.231837
2307
first_draft_newsletter
35
7
34.231837
1526
in_this_country
44
13
33.622277
1734
the_cost_of
48
16
33.301494
...
...
...
...
...
394
politics_newsletter_p
10
57
-19.375858
438
for_the_first
34
117
-19.645455
990
that_it_was
21
87
-20.023060
2094
said_they_were
7
49
-20.034878
606
p_he_said
5
43
-20.411060
2454
that_they_were
8
53
-20.705282
39
said_in_an
34
121
-21.653055
455
of_the_islamic
6
48
-21.696218
1234
had_not_been
5
45
-21.989602
1898
the_united_states
357
739
-22.453452
1122
said_in_a
39
135
-22.928122
64
the_middle_east
16
79
-22.955481
2023
a_news_conference
8
57
-23.636714
2518
at_the_time
27
110
-24.687482
967
he_said_he
25
106
-25.237329
711
he_had_been
7
56
-25.312254
1672
said_he_was
26
110
-26.107470
630
in_an_interview
48
162
-26.178372
2302
the_death_penalty
5
52
-27.645743
300
said_that_the
25
112
-28.786309
779
in_a_statement
25
112
-28.786309
2470
that_he_had
26
118
-30.855787
1012
the_obama_administration
20
106
-33.389055
2475
he_said_p
79
246
-33.453147
1238
the_condition_of
5
61
-35.157520
1326
said_he_had
20
121
-43.582521
1984
the_justice_department
20
128
-48.528374
2397
the_white_house
51
240
-65.757386
2097
f_b_i
5
208
-170.841911
2280
the_islamic_state
21
296
-181.798753
2521 rows × 4 columns
In [ ]:
Content source: mbod/intro_python_for_comm
Similar notebooks: