In [1]:
import nltk
import pandas as pd

In [14]:
df=pd.read_csv('data/pov_seedwords.txt',delimiter='\t')

In [17]:
df.head(10)


Out[17]:
id corpus intro section length body subject country
0 1 news 1 of 1792 DOCUMENTS //p //p The New York Time... Section A; Column 0; Business/Financial Desk; ... 1715 words MIAMI -- For the Ingram clan, working for the... PUBLIC TRANSPORTATION (90%); MIDDLE INCOME PER... UNITED STATES (96%)
1 2 news 2 of 1792 DOCUMENTS //p //p The New York Time... Section A; Column 0; Business/Financial Desk; ... 1832 words WAXAHACHIE, Tex. -- Most Americans suffered s... SENIOR CITIZENS (91%); MIDDLE INCOME PERSONS (... UNITED STATES (94%)
2 3 news 3 of 1792 DOCUMENTS //p //p The New York Time... Section A; Column 0; Business/Financial Desk; ... 1759 words When the California Labor Commissioner's Offic... LABOR FORCE (90%); FREELANCE EMPLOYMENT (90%);... UNITED STATES (95%)
3 4 news 4 of 1792 DOCUMENTS //p //p The New York Time... Section A; Column 0; National Desk; Pg. 1 1506 words SAN BERNARDINO, Calif. -- A heavily armed man ... SHOOTINGS (92%); GUNSHOT WOUNDS (89%); FIREARM... UNITED STATES (94%)
4 5 news 5 of 1792 DOCUMENTS //p //p The New York Time... Section A; Column 0; National Desk; Pg. 1 1075 words More than one a day. //p That is how often, on... SHOOTINGS (92%); WOUNDS & INJURIES (90%); GUNS... UNITED STATES (94%)
5 6 news 6 of 1792 DOCUMENTS //p //p The New York Time... Section A; Column 0; National Desk; Pg. 1 1424 words SAN BERNARDINO, Calif. -- Syed Rizwan Farook a... ONLINE DATING SERVICES (90%); COURTSHIP & DATI... UNITED STATES (98%); PAKISTAN (79%); SAUDI ARA...
6 7 news 7 of 1792 DOCUMENTS //p //p The New York Time... Section A; Column 0; National Desk; Pg. 1 1766 words SAN BERNARDINO, Calif. -- The couple who the p... SPECIAL INVESTIGATIVE FORCES (90%); TERRORISM ... UNITED STATES (93%)
7 8 news 8 of 1792 DOCUMENTS //p //p The New York Time... Section A; Column 0; National Desk; Pg. 1 1522 words The killings are happening too often. Bunched ... SHOOTINGS (90%); COOKING & ENTERTAINING (90%);... UNITED STATES (97%)
8 9 news 9 of 1792 DOCUMENTS //p //p The New York Time... Section A; Column 0; National Desk; THE CAMPAI... 1180 words The Republican candidates for president angril... MUSLIMS & ISLAM (90%); US REPUBLICAN PARTY (90... UNITED STATES (94%)
9 10 news 10 of 1792 DOCUMENTS //p //p The New York Tim... Section A; Column 0; National Desk; Pg. 1 1885 words WASHINGTON -- On the day she and her husband ... TERRORISM (92%); TERRORIST ATTACKS (91%); TERR... UNITED STATES (97%); SYRIA (86%); PAKISTAN (79...

In [9]:
df.columns


Out[9]:
Index(['id', 'corpus', 'intro', 'section', 'length', 'body', 'subject',
       'country'],
      dtype='object')

In [18]:
df.corpus


Out[18]:
0          news
1          news
2          news
3          news
4          news
5          news
6          news
7          news
8          news
9          news
10         news
11         news
12         news
13         news
14         news
15         news
16         news
17         news
18         news
19         news
20         news
21         news
22         news
23         news
24         news
25         news
26         news
27         news
28         news
29         news
         ...   
1470    poverty
1471    poverty
1472    poverty
1473    poverty
1474    poverty
1475    poverty
1476    poverty
1477    poverty
1478    poverty
1479    poverty
1480    poverty
1481    poverty
1482    poverty
1483    poverty
1484    poverty
1485    poverty
1486    poverty
1487    poverty
1488    poverty
1489    poverty
1490    poverty
1491    poverty
1492    poverty
1493    poverty
1494    poverty
1495    poverty
1496    poverty
1497    poverty
1498    poverty
1499    poverty
Name: corpus, dtype: object

In [13]:
df.corpus.value_counts()


Out[13]:
news       500
poverty    500
comment    500
Name: corpus, dtype: int64

In [21]:
# One sub-frame per corpus label, selected with a boolean mask on `corpus`.
news = df.loc[df['corpus'] == 'news']
poverty = df.loc[df['corpus'] == 'poverty']
comment = df.loc[df['corpus'] == 'comment']

In [24]:
example_text = 'This is an example. One more sentence! And, another one! Then the last one.'

In [23]:
word_tok = nltk.tokenize.WordPunctTokenizer()

In [25]:
word_tok.tokenize(example_text)


Out[25]:
['This',
 'is',
 'an',
 'example',
 '.',
 'One',
 'more',
 'sentence',
 '!',
 'And',
 ',',
 'another',
 'one',
 '!',
 'Then',
 'the',
 'last',
 'one',
 '.']

In [26]:
word_tok2 = nltk.tokenize.WhitespaceTokenizer()

In [27]:
word_tok2.tokenize(example_text)


Out[27]:
['This',
 'is',
 'an',
 'example.',
 'One',
 'more',
 'sentence!',
 'And,',
 'another',
 'one!',
 'Then',
 'the',
 'last',
 'one.']

In [50]:
# A token is a run of \w characters, optionally followed by ONE apostrophe- or
# hyphen-joined continuation (e.g. "it's"). Because the joined group ends in
# `?`, a second joiner starts a new token: "some-compound-word" is split into
# "some-compound" and "word".
word_tok3=nltk.tokenize.RegexpTokenizer(r'\b(\w+(?:[\'\-]\w+)?)\b')

In [32]:
word_tok.tokenize('@lori here #python with @matt')


Out[32]:
['@', 'lori', 'here', '#', 'python', 'with', '@', 'matt']

In [46]:
word_tok3.tokenize("it's this_as_one_word some-compound-word like i'm and they've dog's bowl")


Out[46]:
["it's",
 'this_as_one_word',
 'some-compound',
 'word',
 'like',
 "i'm",
 'and',
 "they've",
 "dog's",
 'bowl']

In [55]:
# One token list per news article: lower-case the body, then tokenize it.
news_toks = [word_tok3.tokenize(body.lower()) for body in news.body]

In [56]:
news_freq = nltk.FreqDist()

In [57]:
# Fold every article's tokens into the shared frequency distribution.
# update() adds to the existing counts, so re-running this cell without
# re-creating news_freq first counts every article twice.
for article_tokens in news_toks:
    news_freq.update(article_tokens)

In [59]:
news_freq.most_common(10)


Out[59]:
[('the', 43956),
 ('to', 20442),
 ('a', 20428),
 ('of', 19719),
 ('and', 17536),
 ('p', 16606),
 ('in', 16587),
 ('that', 10297),
 ('said', 7641),
 ('mr', 7366)]

In [60]:
news_freq['policy']


Out[60]:
398

In [72]:
%matplotlib inline
import matplotlib.pyplot as plt

In [63]:
news_freq.plot(100)



In [164]:
# Word frequencies over all comment bodies (lower-cased, then tokenized),
# counted in a single pass over the token stream.
comments_freq = nltk.FreqDist(
    token
    for body in comment.body
    for token in word_tok3.tokenize(body.lower())
)

In [65]:
# Word frequencies over all poverty-corpus bodies (lower-cased, then tokenized).
poverty_freq = nltk.FreqDist()
for body in poverty.body:
    tokens = word_tok3.tokenize(body.lower())
    poverty_freq.update(tokens)

In [66]:
poverty_freq.most_common(50)


Out[66]:
[('the', 27902),
 ('of', 13996),
 ('to', 13112),
 ('and', 12731),
 ('a', 11619),
 ('p', 10546),
 ('in', 10440),
 ('that', 6790),
 ('for', 5222),
 ('is', 4713),
 ('on', 3940),
 ('as', 3176),
 ('with', 2929),
 ('it', 2822),
 ('he', 2821),
 ('are', 2736),
 ('have', 2372),
 ('said', 2358),
 ('was', 2329),
 ('but', 2328),
 ('at', 2327),
 ('mr', 2313),
 ('by', 2309),
 ('has', 2185),
 ('not', 2143),
 ('from', 2049),
 ('his', 2038),
 ('who', 2013),
 ('be', 1916),
 ('more', 1910),
 ('this', 1875),
 ('an', 1863),
 ('they', 1778),
 ('we', 1620),
 ('their', 1575),
 ('i', 1517),
 ('about', 1515),
 ('people', 1505),
 ('or', 1419),
 ('new', 1395),
 ('than', 1301),
 ('she', 1151),
 ('would', 1144),
 ('her', 1121),
 ('one', 1109),
 ('will', 1098),
 ('had', 1075),
 ('which', 1021),
 ('what', 979),
 ('you', 965)]

In [68]:
poverty_freq['poverty']


Out[68]:
684

In [73]:
plt.figure(figsize=(12,4))
poverty_freq.plot(100)



In [75]:
# Corpus size in tokens: the total of all word counts in the news distribution.
news_tokens = news_freq.N()
news_tokens


Out[75]:
757678

In [77]:
# Corpus size in tokens: the total of all word counts in the poverty distribution.
poverty_tokens = poverty_freq.N()
poverty_tokens


Out[77]:
494377

In [174]:
# Corpus size in tokens: the total of all word counts in the comments distribution.
comments_tokens = comments_freq.N()
comments_tokens


Out[174]:
449848

In [85]:
news_freq['poverty']


Out[85]:
58

In [86]:
poverty_freq['poverty']


Out[86]:
684

In [83]:
1000000*(news_freq['poverty'] / news_tokens)


Out[83]:
76.5496688566911

In [84]:
1000000*(poverty_freq['poverty'] / poverty_tokens)


Out[84]:
1383.5595102522973

In [107]:
import math
ln = math.log

def calculate_keyness(a, c, b, d):
    '''
    Signed log-likelihood (G2) keyness of a word between two corpora,
    following http://ucrel.lancs.ac.uk/llwizard.html

    a = freq of word in corpus A
    c = size of corpus A
    b = freq of word in corpus B
    d = size of corpus B

    Returns G2 with a positive sign when the word is relatively more
    frequent in corpus A (a/c > b/d) and a negative sign otherwise.

    A frequency of zero contributes nothing to G2 (the term 0 * ln(0) is
    taken as 0), so a word that is absent from one corpus, or from both,
    yields a finite score instead of raising an error.
    '''

    # Expected frequencies if the word were spread proportionally over both corpora.
    E1 = c*(a+b) / (c+d)
    E2 = d*(a+b) / (c+d)

    direction = 1 if a/c > b/d else -1

    # ln(0) is undefined, so zero-frequency terms are skipped rather than evaluated.
    term_a = a*ln(a/E1) if a else 0.0
    term_b = b*ln(b/E2) if b else 0.0
    G2 = 2*(term_a + term_b)

    return direction*G2

In [110]:
calculate_keyness(58, 757678, 684, 494377)


Out[110]:
-922.4529260281342

In [114]:
calculate_keyness(news_freq['obama'], news_tokens,
                  poverty_freq['obama'], poverty_tokens)


Out[114]:
85.50690363793245

In [143]:
# Every word type that occurs in the news corpus, the poverty corpus, or both.
news_vocab = set(news_freq)
poverty_vocab = set(poverty_freq)
total_vocab = list(news_vocab | poverty_vocab)

In [ ]:


In [144]:
len(total_vocab)


Out[144]:
41681

In [154]:
# Score every word that is attested at least 5 times in BOTH corpora.
# Corpus A is poverty, so a positive LL marks a word over-represented there.
keyness_data = []
for word in total_vocab:
    news_count = news_freq[word]
    pov_count = poverty_freq[word]

    if min(news_count, pov_count) >= 5:
        score = calculate_keyness(pov_count, poverty_tokens,
                                  news_count, news_tokens)
        keyness_data.append({
            'word': word,
            'news': news_count,
            'poverty': pov_count,
            'LL': score,
        })

In [156]:
keyness_df=pd.DataFrame(keyness_data)

In [159]:
keyness_df.sort_values('LL', ascending=False)


Out[159]:
LL news poverty word
2003 1534.664171 25 937 inequality
4717 922.452926 58 684 poverty
1609 836.115408 88 699 income
5907 647.186611 61 526 sanders
2303 600.979665 205 746 economic
1144 537.511004 116 560 poor
5899 523.328147 90 505 tax
5763 434.581098 22 309 welfare
5930 345.292406 498 893 our
543 344.930299 313 684 children
382 339.044525 215 558 social
1715 327.690960 45 295 francis
584 285.377190 26 230 wealth
5323 281.031488 102 358 families
2034 261.470472 79 309 wage
1346 249.328473 42 239 rich
1163 247.283029 72 288 housing
3198 244.572343 636 921 percent
2666 231.127074 5321 4713 is
1841 225.310541 18 176 homeless
174 218.886271 1454 1620 we
6024 218.249295 243 485 women
1872 218.195124 114 327 de
4030 216.110143 83 282 class
4809 202.873808 23 173 pope
4691 201.218120 39 202 wages
196 195.900886 2881 2736 are
550 189.165437 68 240 society
228 187.769215 66 236 blasio
933 187.105740 14 144 bernie
... ... ... ... ...
1650 -107.172285 709 217 house
2858 -107.570512 222 25 records
1070 -110.211403 369 73 military
5508 -114.336661 227 24 investigation
3907 -117.815276 455 101 bush
4222 -119.382362 335 56 official
1185 -119.984588 200 15 apple
4322 -123.103184 282 37 attack
2902 -126.774036 541 129 department
792 -128.553557 218 17 shooting
4241 -141.298741 20428 11619 a
3635 -142.016899 376 59 f
2556 -147.402155 248 19 intelligence
2196 -149.237408 242 17 israel
4177 -156.625600 2265 919 were
4379 -161.444039 4365 2038 his
1528 -175.185686 327 31 iran
5690 -177.223028 1175 360 him
896 -194.240363 494 74 company
2370 -207.544005 381 35 officer
1826 -234.421376 489 56 officers
4400 -264.446737 978 210 police
949 -266.630674 508 50 water
5541 -281.971342 469 35 islamic
1530 -340.382999 1025 184 officials
1030 -341.490604 6494 2821 he
4161 -404.028209 5762 2329 was
3534 -437.665580 3308 1075 had
2360 -1053.523904 7366 2313 mr
1762 -1134.934456 7641 2358 said

6140 rows × 4 columns


In [160]:
keyness_df.to_csv('data/poverty_keyness_list.csv', index=False)

In [169]:
# Vocabulary for the poverty-vs-comments comparison: word types from either corpus.
# comments_vocab must come from comments_freq; building it from poverty_freq
# made the union identical to the poverty vocabulary alone.
poverty_vocab = set(poverty_freq.keys())
comments_vocab = set(comments_freq.keys())
total_vocab = list(poverty_vocab.union(comments_vocab))

In [176]:
# Score every word that is attested at least 5 times in BOTH corpora.
# Corpus A is poverty, so a positive LL marks a word over-represented there.
keyness_data = []
for word in total_vocab:
    comment_count = comments_freq[word]
    pov_count = poverty_freq[word]

    if min(comment_count, pov_count) >= 5:
        score = calculate_keyness(pov_count, poverty_tokens,
                                  comment_count, comments_tokens)
        keyness_data.append({
            'word': word,
            'comments': comment_count,
            'poverty': pov_count,
            'LL': score,
        })

In [172]:
keyness_data


Out[172]:
[]

In [177]:
keyness_df=pd.DataFrame(keyness_data)
keyness_df.sort_values('LL', ascending=False)


Out[177]:
LL comments poverty word
1574 996.011084 578 2358 said
1787 792.925547 74 937 inequality
2113 698.351529 748 2313 mr
4258 553.474462 61 684 poverty
1435 475.119286 91 699 income
659 318.406244 71 495 ms
1030 316.188296 98 560 poor
3804 312.797925 58 455 mrs
2060 292.622485 196 746 economic
1671 258.230964 31 327 de
5509 214.627560 113 484 workers
215 214.051584 15 236 blasio
488 212.183191 217 684 children
1530 207.480200 36 295 francis
1809 199.622016 44 309 wage
5215 189.203363 48 309 welfare
2206 166.204829 147 493 city
1998 141.779716 41 244 mayor
1642 131.371767 19 176 homeless
1049 130.019548 66 288 housing
3021 128.317547 38 223 minimum
525 119.273335 45 230 wealth
350 116.544382 225 558 social
4236 110.280853 37 202 wages
3406 102.987777 17 144 gap
1983 98.729622 275 607 work
3725 93.057311 324 671 clinton
3201 92.730460 7 104 editor
3628 90.894083 87 282 class
4222 86.125705 51 206 development
... ... ... ... ...
3711 -63.440715 811 580 it's
1833 -63.553141 600 396 should
3512 -64.544994 230 101 bush
2064 -65.197908 93 17 patients
1951 -65.661438 75 9 solar
1970 -71.133539 146 42 jeb
1122 -71.633332 143 40 trump's
4621 -72.662587 2695 2328 but
2687 -74.802479 120 26 invite
1241 -76.246898 650 416 my
144 -77.592441 352 176 sign
674 -78.990674 80 7 football
5447 -80.645948 378 192 facebook
2187 -86.099225 579 342 republican
2828 -87.552576 2325 1916 be
1672 -92.343865 93 8 syrian
3554 -100.064650 537 290 me
3686 -105.061674 205 56 rubio
4494 -107.778457 431 202 twitter
4826 -116.443897 1990 1517 i
1731 -121.662207 172 31 refugees
1656 -131.717072 231 56 donald
3507 -132.451989 287 87 section
5097 -143.988049 438 174 follow
2159 -145.478224 290 81 he's
650 -145.758603 164 19 guns
3248 -201.684678 269 44 gun
3987 -261.387116 321 45 cruz
2164 -283.091128 586 170 opinion
4433 -671.729292 888 143 trump

5551 rows × 4 columns


In [199]:
# Count the first ';'-separated field of each comment's section string.
# Rows whose split result is not a list are skipped (presumably missing values).
section_parts = comment.section.str.split(';')
first_fields = [parts[0] for parts in section_parts if type(parts) is list]
pd.Series(first_fields).value_counts()


Out[199]:
Section A     439
Section SR     48
Section        13
dtype: int64

In [226]:
# Bigram and trigram frequencies for the news corpus. Each article is
# tokenized once and its tokens feed both n-gram counters.
news_bigram, news_trigram = nltk.FreqDist(), nltk.FreqDist()
for body in news.body:
    tokens = word_tok3.tokenize(body.lower())
    for size, counter in ((2, news_bigram), (3, news_trigram)):
        counter.update(nltk.ngrams(tokens, size))

In [224]:
# Bigram and trigram frequencies for the poverty corpus, built per article
# from the lower-cased, tokenized body.
pov_bigram = nltk.FreqDist()
pov_trigram = nltk.FreqDist()

for body in poverty.body:
    tokens = word_tok3.tokenize(body.lower())
    pov_bigram.update(nltk.bigrams(tokens))
    pov_trigram.update(nltk.trigrams(tokens))

In [228]:
pov_trigram.most_common()


Out[228]:
[(('the', 'united', 'states'), 357),
 (('the', 'new', 'york'), 186),
 (('one', 'of', 'the'), 184),
 (('mr', 'de', 'blasio'), 144),
 (('new', 'york', 'times'), 141),
 (('in', 'the', 'united'), 139),
 (('sign', 'up', 'for'), 127),
 (('on', 'facebook', 'and'), 121),
 (('a', 'lot', 'of'), 116),
 (('percent', 'of', 'the'), 111),
 (('some', 'of', 'the'), 107),
 (('facebook', 'and', 'twitter'), 107),
 (('the', 'minimum', 'wage'), 104),
 (('new', 'york', 'city'), 103),
 (('according', 'to', 'the'), 100),
 (('the', 'number', 'of'), 100),
 (('in', 'new', 'york'), 100),
 (('of', 'new', 'york'), 89),
 (('up', 'for', 'the'), 88),
 (('the', 'university', 'of'), 88),
 (('as', 'well', 'as'), 87),
 (('p', 'this', 'is'), 87),
 (('p', 'follow', 'the'), 85),
 (('follow', 'the', 'new'), 85),
 (('p', 'in', 'the'), 83),
 (('and', 'sign', 'up'), 82),
 (('twitter', 'and', 'sign'), 82),
 (('part', 'of', 'the'), 80),
 (('he', 'said', 'p'), 79),
 (('to', 'the', 'editor'), 78),
 (('and', 'twitter', 'and'), 78),
 (('today', 'newsletter', 'p'), 72),
 (('more', 'likely', 'to'), 71),
 (('the', 'middle', 'class'), 70),
 (('york', 'times', 'opinion'), 67),
 (('times', 'opinion', 'section'), 67),
 (('section', 'on', 'facebook'), 67),
 (('opinion', 'section', 'on'), 67),
 (('for', 'the', 'opinion'), 66),
 (('opinion', 'today', 'newsletter'), 66),
 (('we', 'need', 'to'), 66),
 (('at', 'the', 'university'), 66),
 (('the', 'opinion', 'today'), 66),
 (('the', 'end', 'of'), 64),
 (('this', 'is', 'a'), 62),
 (('p', 'to', 'the'), 62),
 (('at', 'the', 'same'), 59),
 (('mr', 'sanders', 'said'), 58),
 (('senator', 'bernie', 'sanders'), 57),
 (('the', 'editor', 'p'), 57),
 (('p', 'but', 'the'), 57),
 (('the', 'united', 'nations'), 56),
 (('the', 'fact', 'that'), 56),
 (('the', 'same', 'time'), 56),
 (('there', 'is', 'a'), 56),
 (('president', 'of', 'the'), 54),
 (('in', 'recent', 'years'), 54),
 (('many', 'of', 'the'), 52),
 (('p', 'it', 'is'), 51),
 (('out', 'of', 'the'), 51),
 (('the', 'white', 'house'), 51),
 (('end', 'of', 'the'), 50),
 (('around', 'the', 'world'), 50),
 (('the', 'rest', 'of'), 49),
 (('p', 'mr', 'sanders'), 49),
 (('according', 'to', 'a'), 49),
 (('the', 'cost', 'of'), 48),
 (('in', 'an', 'interview'), 48),
 (('in', 'the', 'last'), 48),
 (('she', 'said', 'p'), 47),
 (('director', 'of', 'the'), 46),
 (('more', 'than', 'a'), 46),
 (('to', 'be', 'a'), 46),
 (('of', 'the', 'united'), 46),
 (('bill', 'de', 'blasio'), 46),
 (('in', 'the', 'world'), 46),
 (('the', 'first', 'draft'), 45),
 (('it', 'is', 'a'), 45),
 (('p', 'at', 'the'), 44),
 (('in', 'this', 'country'), 44),
 (('in', 'the', 'past'), 44),
 (('hillary', 'rodham', 'clinton'), 43),
 (('p', 'there', 'are'), 43),
 (('of', 'the', 'most'), 42),
 (('likely', 'to', 'be'), 42),
 (('much', 'of', 'the'), 42),
 (('in', 'the', 'country'), 42),
 (('most', 'of', 'the'), 42),
 (('p', 'mrs', 'clinton'), 42),
 (('out', 'of', 'poverty'), 41),
 (('p', 'in', 'a'), 41),
 (('we', 'have', 'to'), 40),
 (('of', 'the', 'new'), 40),
 (('the', '2016', 'presidential'), 40),
 (('it', 'is', 'not'), 40),
 (('mayor', 'bill', 'de'), 40),
 (('version', 'of', 'the'), 39),
 (('the', 'federal', 'government'), 39),
 (('at', 'the', 'top'), 39),
 (('said', 'in', 'a'), 39),
 (('of', 'the', 'poor'), 38),
 (('you', 'need', 'to'), 38),
 (('for', 'the', 'poor'), 38),
 (('to', 'know', 'about'), 37),
 (('need', 'to', 'know'), 37),
 (('find', 'out', 'what'), 37),
 (('of', 'the', 'world'), 37),
 (('at', 'a', 'time'), 37),
 (('to', 'the', 'united'), 37),
 (('the', 'one', 'that'), 36),
 (('there', 'is', 'no'), 36),
 (('the', 'first', 'time'), 36),
 (('2016', 'presidential', 'race'), 36),
 (('half', 'of', 'the'), 36),
 (('over', 'the', 'last'), 36),
 (('and', 'the', 'first'), 36),
 (('what', 'you', 'need'), 36),
 (('is', 'not', 'a'), 36),
 (('p', 'mr', 'de'), 35),
 (('twitter', 'and', 'the'), 35),
 (('as', 'much', 'as'), 35),
 (('p', 'i', 'think'), 35),
 (('draft', 'newsletter', 'p'), 35),
 (('of', 'the', 'population'), 35),
 (('presidential', 'race', 'today'), 35),
 (('the', 'poor', 'p'), 35),
 (('about', 'the', '2016'), 35),
 (('p', 'find', 'out'), 35),
 (('know', 'about', 'the'), 35),
 (('out', 'what', 'you'), 35),
 (('politics', 'news', 'updates'), 35),
 (('race', 'today', 'and'), 35),
 (('get', 'politics', 'news'), 35),
 (('people', 'who', 'are'), 35),
 (('and', 'get', 'politics'), 35),
 (('first', 'draft', 'newsletter'), 35),
 (('today', 'and', 'get'), 35),
 (('in', 'the', 'city'), 35),
 (('be', 'able', 'to'), 35),
 (('new', 'york', 'p'), 35),
 (('said', 'in', 'an'), 34),
 (('mr', 'de', "blasio's"), 34),
 (('over', 'the', 'past'), 34),
 (('members', 'of', 'the'), 34),
 (('new', 'york', 'state'), 34),
 (('for', 'the', 'first'), 34),
 (('news', 'updates', 'via'), 34),
 (('mrs', 'clinton', 'said'), 34),
 (('via', 'facebook', 'twitter'), 34),
 (('updates', 'via', 'facebook'), 34),
 (('are', 'going', 'to'), 34),
 (('facebook', 'twitter', 'and'), 34),
 (('in', 'the', 'state'), 33),
 (('p', 'new', 'york'), 33),
 (('of', 'income', 'inequality'), 33),
 (('sanders', 'of', 'vermont'), 33),
 (('as', 'part', 'of'), 33),
 (('this', 'is', 'the'), 33),
 (('the', 'top', '1'), 32),
 (('is', 'one', 'of'), 32),
 (('that', 'he', 'would'), 32),
 (('in', 'the', 'first'), 32),
 (('is', 'that', 'the'), 32),
 (('of', 'the', 'american'), 32),
 (('are', 'more', 'likely'), 32),
 (('the', 'democratic', 'party'), 32),
 (('up', 'for', 'our'), 32),
 (('bernie', 'sanders', 'of'), 32),
 (('the', 'affordable', 'care'), 32),
 (('the', 'poor', 'and'), 31),
 (('follow', 'us', 'on'), 31),
 (('us', 'on', 'facebook'), 31),
 (('of', 'the', 'story'), 31),
 (('a', 'professor', 'of'), 31),
 (('in', 'terms', 'of'), 31),
 (('has', 'been', 'a'), 31),
 (('a', 'group', 'of'), 31),
 (('across', 'the', 'country'), 31),
 (('top', '1', 'percent'), 31),
 (('affordable', 'care', 'act'), 31),
 (('a', 'number', 'of'), 31),
 (('it', 'was', 'a'), 31),
 (('in', 'other', 'words'), 31),
 (('the', 'center', 'for'), 30),
 (('than', 'the', 'one'), 30),
 (('because', 'of', 'the'), 30),
 (('the', 'kind', 'of'), 30),
 (('is', 'a', 'more'), 30),
 (('that', 'he', 'was'), 30),
 (('it', 'comes', 'to'), 30),
 (('is', 'expected', 'to'), 30),
 (('rich', 'and', 'poor'), 30),
 (('the', 'world', 'p'), 30),
 (('men', 'and', 'women'), 30),
 (('at', 'the', 'end'), 30),
 (('all', 'of', 'the'), 30),
 (('united', 'states', 'p'), 30),
 (('of', 'the', 'country'), 30),
 (('when', 'it', 'comes'), 30),
 (('you', 'have', 'to'), 30),
 (('as', 'a', 'result'), 30),
 (('chairman', 'of', 'the'), 29),
 (('more', 'than', 'half'), 29),
 (('the', 'rich', 'and'), 29),
 (('the', 'supreme', 'court'), 29),
 (('of', 'the', "world's"), 29),
 (('said', 'p', 'the'), 29),
 (('000', 'a', 'year'), 29),
 (('going', 'to', 'be'), 29),
 (('have', 'to', 'be'), 29),
 (('u', 'b', 'i'), 28),
 (('increase', 'in', 'the'), 28),
 (('he', 'did', 'not'), 28),
 (('g', 'o', 'p'), 28),
 (('there', 'was', 'a'), 28),
 (('to', 'raise', 'the'), 28),
 (('but', 'it', 'is'), 28),
 (('need', 'to', 'be'), 28),
 (('de', 'blasio', 'said'), 28),
 (('this', 'is', 'not'), 28),
 (('p', 'it', 'was'), 27),
 (('the', 'age', 'of'), 27),
 (('less', 'likely', 'to'), 27),
 (('the', 'lives', 'of'), 27),
 (('more', 'complete', 'version'), 27),
 (('part', 'of', 'a'), 27),
 (('in', 'the', 'middle'), 27),
 (('appeared', 'in', 'print'), 27),
 (('a', 'more', 'complete'), 27),
 (('that', 'appeared', 'in'), 27),
 (('p', 'applause', 'p'), 27),
 (('over', 'the', 'next'), 27),
 (('complete', 'version', 'of'), 27),
 (('a', 'lack', 'of'), 27),
 (('the', 'tax', 'code'), 27),
 (('the', 'story', 'than'), 27),
 (('a', 'time', 'when'), 27),
 (('one', 'that', 'appeared'), 27),
 (('the', 'gap', 'between'), 27),
 (('in', 'print', 'p'), 27),
 (('story', 'than', 'the'), 27),
 (('in', 'which', 'the'), 27),
 (('the', 'department', 'of'), 27),
 (('at', 'the', 'time'), 27),
 (('he', 'was', 'a'), 26),
 (('said', 'he', 'was'), 26),
 (('provides', 'news', 'analysis'), 26),
 (('policy', 'and', 'everyday'), 26),
 (('about', 'politics', 'policy'), 26),
 (('a', 'series', 'of'), 26),
 (('everyday', 'life', 'follow'), 26),
 (('and', 'graphics', 'about'), 26),
 (('and', 'twitter', 'sign'), 26),
 (('united', 'states', 'and'), 26),
 (('in', 'favor', 'of'), 26),
 (('to', 'have', 'a'), 26),
 (('p', 'the', 'upshot'), 26),
 (('number', 'of', 'people'), 26),
 (('the', 'center', 'of'), 26),
 (('analysis', 'and', 'graphics'), 26),
 (('p', 'there', 'is'), 26),
 (('as', 'long', 'as'), 26),
 (('and', 'everyday', 'life'), 26),
 (('the', 'upshot', 'provides'), 26),
 (('twitter', 'sign', 'up'), 26),
 (('in', 'the', '1970s'), 26),
 (('life', 'follow', 'us'), 26),
 (('news', 'analysis', 'and'), 26),
 (('upshot', 'provides', 'news'), 26),
 (('graphics', 'about', 'politics'), 26),
 (('inequality', 'in', 'the'), 26),
 (('politics', 'policy', 'and'), 26),
 (('the', 'state', 'of'), 26),
 (('of', 'the', 'state'), 26),
 (('that', 'he', 'had'), 26),
 (('to', 'get', 'the'), 26),
 (('at', 'the', 'center'), 25),
 (('the', 'people', 'who'), 25),
 (('in', 'the', 'same'), 25),
 (('it', 'would', 'be'), 25),
 (('raising', 'the', 'minimum'), 25),
 (('those', 'who', 'are'), 25),
 (('said', 'it', 'was'), 25),
 (('a', 'couple', 'of'), 25),
 (('the', 'idea', 'that'), 25),
 (('said', 'that', 'the'), 25),
 (('p', 'we', 'are'), 25),
 (('than', 'half', 'of'), 25),
 (('on', 'the', 'other'), 25),
 (('in', 'a', 'statement'), 25),
 (('people', 'in', 'the'), 25),
 (('of', 'the', 'people'), 25),
 (('he', 'said', 'he'), 25),
 (('the', 'importance', 'of'), 24),
 (('climate', 'change', 'and'), 24),
 (('the', 'president', 'of'), 24),
 (('of', 'poverty', 'and'), 24),
 (('it', 'has', 'been'), 24),
 (('15', 'an', 'hour'), 24),
 (('the', 'problem', 'is'), 24),
 (('income', 'inequality', 'and'), 24),
 (('of', 'people', 'who'), 24),
 (('p', 'but', 'mr'), 24),
 (('a', 'result', 'of'), 24),
 (('of', 'the', "nation's"), 24),
 (('1', 'percent', 'of'), 24),
 (('a', 'way', 'to'), 24),
 (('social', 'and', 'economic'), 24),
 (('to', 'ensure', 'that'), 24),
 (('earned-income', 'tax', 'credit'), 24),
 (('not', 'going', 'to'), 24),
 (('p', 'the', 'writer'), 23),
 (('the', 'rise', 'of'), 23),
 (('attention', 'to', 'the'), 23),
 (('said', 'p', 'mr'), 23),
 (('p', 'g', 'g'), 23),
 (('a', 'majority', 'of'), 23),
 (('of', 'the', "country's"), 23),
 (('an', 'increase', 'in'), 23),
 (('criminal', 'justice', 'system'), 23),
 (('to', 'the', 'left'), 23),
 (('university', 'of', 'california'), 23),
 (('it', 'is', 'the'), 23),
 (('back', 'to', 'the'), 23),
 (('in', 'addition', 'to'), 23),
 (('rest', 'of', 'the'), 23),
 (('is', 'going', 'to'), 23),
 (('to', 'change', 'the'), 23),
 (('the', 'poverty', 'line'), 23),
 (('people', 'who', 'have'), 23),
 (('an', 'effort', 'to'), 23),
 (('for', 'our', 'newsletter'), 22),
 (('de', 'blasio', 'and'), 22),
 (('at', 'the', 'bottom'), 22),
 (('the', 'u', 's'), 22),
 (('that', 'it', 'would'), 22),
 (('to', 'make', 'sure'), 22),
 (('of', 'the', 'income'), 22),
 (('would', 'be', 'a'), 22),
 (('i', 'think', 'the'), 22),
 (('the', 'republican', 'party'), 22),
 (('those', 'at', 'the'), 22),
 (('p', 'i', 'was'), 22),
 (('for', 'more', 'than'), 22),
 (('it', 'was', 'the'), 22),
 (('p', "it's", 'not'), 21),
 (('middle', 'class', 'and'), 21),
 (('to', 'focus', 'on'), 21),
 (('the', 'issue', 'of'), 21),
 (('to', 'pay', 'for'), 21),
 (('parts', 'of', 'the'), 21),
 (('of', 'the', 'federal'), 21),
 (('that', 'there', 'is'), 21),
 (('that', 'mrs', 'clinton'), 21),
 (('was', 'one', 'of'), 21),
 (('member', 'of', 'the'), 21),
 (('a', 'few', 'years'), 21),
 (('focus', 'on', 'the'), 21),
 (('p', 'but', 'in'), 21),
 (('the', 'need', 'to'), 21),
 (('the', 'american', 'dream'), 21),
 (('the', 'g', 'o'), 21),
 (('of', 'more', 'than'), 21),
 (('to', 'believe', 'that'), 21),
 (('it', 'was', 'not'), 21),
 (('is', 'not', 'the'), 21),
 (('p', 'the', 'study'), 21),
 (('i', 'want', 'to'), 21),
 (('that', 'it', 'was'), 21),
 (('i', "don't", 'think'), 21),
 (('the', 'islamic', 'state'), 21),
 (('minimum', 'wage', 'to'), 21),
 (('the', 'most', 'important'), 21),
 (('the', 'need', 'for'), 21),
 (('for', 'the', 'democratic'), 21),
 (('to', 'do', 'with'), 21),
 (('to', 'make', 'the'), 21),
 (('said', 'he', 'had'), 20),
 (('the', 'author', 'of'), 20),
 (('as', 'well', 'p'), 20),
 (('to', 'those', 'who'), 20),
 (('p', 'the', 'report'), 20),
 (('vast', 'majority', 'of'), 20),
 (('the', 'american', 'economy'), 20),
 (("we're", 'going', 'to'), 20),
 (('to', 'new', 'york'), 20),
 (('we', 'have', 'a'), 20),
 (('for', 'those', 'who'), 20),
 (('in', 'the', 'new'), 20),
 (('chief', 'executive', 'of'), 20),
 (('in', 'south', 'carolina'), 20),
 (('and', 'the', 'poor'), 20),
 (('many', 'of', 'them'), 20),
 (('the', 'justice', 'department'), 20),
 (('in', 'the', 'early'), 20),
 (('the', 'earned-income', 'tax'), 20),
 (('used', 'to', 'be'), 20),
 (('executive', 'director', 'of'), 20),
 (('mrs', 'clinton', 'has'), 20),
 (('the', 'obama', 'administration'), 20),
 (('to', 'make', 'a'), 20),
 (('the', 'country', 'p'), 20),
 (('the', 'lack', 'of'), 20),
 (('new', 'york', 'and'), 20),
 (('federal', 'minimum', 'wage'), 20),
 (('of', 'the', 'city'), 20),
 (('he', 'said', 'the'), 20),
 (('the', 'future', 'of'), 19),
 (('at', 'the', 'very'), 19),
 (('in', 'response', 'to'), 19),
 (('the', 'role', 'of'), 19),
 (('is', 'likely', 'to'), 19),
 (('the', 'effects', 'of'), 19),
 (('to', 'live', 'in'), 19),
 (('invite', 'you', 'to'), 19),
 (('that', 'we', 'are'), 19),
 (('on', 'the', 'streets'), 19),
 (('united', 'states', 'is'), 19),
 (('that', 'it', 'is'), 19),
 (('p', 'in', 'his'), 19),
 (('sign', 'up', 'here'), 19),
 (('are', 'in', 'the'), 19),
 (('newsletter', 'p', 'this'), 19),
 (('who', 'want', 'to'), 19),
 (('the', 'face', 'of'), 19),
 (('has', 'not', 'been'), 19),
 (('and', 'it', 'is'), 19),
 (('the', 'country', 'and'), 19),
 (('health', 'care', 'and'), 19),
 (('gross', 'domestic', 'product'), 19),
 (('de', 'blasio', 'has'), 19),
 (('mr', 'obama', 'said'), 19),
 (('there', 'will', 'be'), 19),
 (('said', 'he', 'would'), 19),
 (('the', 'federal', 'minimum'), 19),
 (('of', 'the', 'national'), 19),
 (('that', 'is', 'the'), 19),
 (('of', 'the', 'middle'), 19),
 (('in', 'the', 'bottom'), 19),
 (('i', "don't", 'know'), 19),
 (('and', 'that', 'the'), 19),
 (('60', 'percent', 'of'), 18),
 (('secretary', 'of', 'state'), 18),
 (('the', 'criminal', 'justice'), 18),
 (('by', 'the', 'end'), 18),
 (('he', 'said', 'it'), 18),
 (('as', 'a', 'whole'), 18),
 (('for', 'the', 'new'), 18),
 (('a', 'third', 'of'), 18),
 (('to', 'your', 'inbox'), 18),
 (('the', 'mayor', 'said'), 18),
 (('taxes', 'on', 'the'), 18),
 (('to', 'go', 'to'), 18),
 (('mr', 'sanders', 'has'), 18),
 (('at', 'least', 'one'), 18),
 (('the', 'right', 'to'), 18),
 (('of', 'millions', 'of'), 18),
 (('our', 'newsletter', 'p'), 18),
 (('a', 'chance', 'to'), 18),
 (('the', 'distribution', 'of'), 18),
 (('the', 'value', 'of'), 18),
 (('e', 'c', 'd'), 18),
 (('in', 'order', 'to'), 18),
 (('george', 'w', 'bush'), 18),
 (('you', 'want', 'to'), 18),
 (('gap', 'between', 'the'), 18),
 (('of', 'thousands', 'of'), 18),
 (('the', 'european', 'union'), 18),
 (('the', 'director', 'of'), 18),
 (('the', 'work', 'force'), 18),
 (('on', 'the', 'street'), 18),
 (('andrew', 'm', 'cuomo'), 18),
 (('when', 'he', 'was'), 18),
 (('to', 'address', 'the'), 18),
 (('of', 'income', 'and'), 18),
 (('to', 'say', 'that'), 18),
 (('of', 'the', 'house'), 18),
 (('in', 'ways', 'that'), 18),
 (('the', 'best', 'way'), 18),
 (('are', 'likely', 'to'), 18),
 (('o', 'e', 'c'), 18),
 (('to', 'be', 'the'), 18),
 (('a', 'sense', 'of'), 18),
 (('the', 'children', 'of'), 17),
 (('it', 'will', 'be'), 17),
 (('the', 'mayor', 'of'), 17),
 (('is', 'part', 'of'), 17),
 (('p', 'we', 'have'), 17),
 (('income', 'inequality', 'is'), 17),
 (('up', 'in', 'the'), 17),
 (('that', 'they', 'are'), 17),
 (('30', 'percent', 'of'), 17),
 (('people', 'out', 'of'), 17),
 (('delivered', 'to', 'your'), 17),
 (('take', 'advantage', 'of'), 17),
 (('of', 'climate', 'change'), 17),
 (('around', 'the', 'country'), 17),
 (('she', 'said', 'she'), 17),
 (('the', 'writer', 'is'), 17),
 (('people', 'who', 'were'), 17),
 (('the', 'first', 'place'), 17),
 (('up', 'here', 'p'), 17),
 (('in', 'recent', 'decades'), 17),
 (('it', 'is', 'also'), 17),
 (('he', 'said', 'that'), 17),
 (('p', 'and', 'yet'), 17),
 (('a', 'college', 'degree'), 17),
 (('grew', 'up', 'in'), 17),
 (('gov', 'andrew', 'm'), 17),
 (('p', 'the', 'new'), 17),
 (('the', 'top', '10'), 17),
 (('p', 'one', 'of'), 17),
 (('would', 'have', 'to'), 17),
 (('mrs', 'clinton', 'and'), 17),
 (('he', 'is', 'a'), 17),
 (('percent', 'of', 'americans'), 17),
 (('p', 'in', 'an'), 17),
 (('mr', 'sanders', 'who'), 17),
 (('of', 'the', 'center'), 16),
 (('the', 'effect', 'of'), 16),
 (('of', 'the', 'economic'), 16),
 (('the', 'middle', 'east'), 16),
 (('income', 'inequality', 'p'), 16),
 (('to', 'help', 'the'), 16),
 (('the', 'size', 'of'), 16),
 (('the', 'top', 'of'), 16),
 (('income', 'inequality', 'in'), 16),
 (('over', 'the', 'years'), 16),
 (('p', 'but', 'he'), 16),
 (('p', 'in', 'addition'), 16),
 (('to', 'be', 'in'), 16),
 (('p', 'i', "don't"), 16),
 (('he', 'said', 'in'), 16),
 (('who', 'grew', 'up'), 16),
 (('p', 'while', 'the'), 16),
 (('look', 'at', 'the'), 16),
 (('at', 'odds', 'with'), 16),
 (('in', 'the', 'top'), 16),
 (('side', 'of', 'the'), 16),
 (('a', 'high', 'school'), 16),
 (('they', 'do', 'not'), 16),
 (('tend', 'to', 'be'), 16),
 (('of', 'the', 'poorest'), 16),
 (('he', 'has', 'been'), 16),
 (('10', 'percent', 'of'), 16),
 (('of', 'the', 'democratic'), 16),
 (('a', 'variety', 'of'), 16),
 (('to', 'more', 'than'), 16),
 (('and', 'those', 'who'), 16),
 (('in', 'a', 'speech'), 16),
 (('all', 'of', 'us'), 16),
 (('middle', 'class', 'p'), 16),
 (('an', 'economist', 'at'), 16),
 (('bottom', '90', 'percent'), 16),
 (('to', 'reduce', 'the'), 16),
 (('raise', 'the', 'minimum'), 16),
 (('in', 'an', 'email'), 16),
 (('on', 'the', 'issue'), 16),
 (('may', 'not', 'be'), 16),
 (('pew', 'research', 'center'), 16),
 (('the', 'share', 'of'), 16),
 (('of', 'poverty', 'p'), 16),
 (('leader', 'of', 'the'), 16),
 (('we', 'are', 'going'), 16),
 (('on', 'climate', 'change'), 16),
 (('the', 'great', 'recession'), 16),
 (('that', 'we', 'have'), 16),
 (('of', 'the', "city's"), 16),
 (('the', 'impact', 'of'), 16),
 (('the', 'bottom', '90'), 16),
 (('for', 'the', 'homeless'), 16),
 (('they', 'want', 'to'), 16),
 (('of', 'all', 'the'), 16),
 (('to', 'see', 'the'), 16),
 (('in', 'the', 'face'), 16),
 (('percent', 'of', 'all'), 16),
 (('the', 'power', 'of'), 16),
 (('to', 'the', 'poor'), 16),
 (('in', 'recent', 'months'), 16),
 (('to', 'talk', 'about'), 16),
 (('a', 'member', 'of'), 16),
 (('the', 'middle', 'of'), 16),
 (('the', 'financial', 'crisis'), 16),
 (('to', 'deal', 'with'), 15),
 (('p', "it's", 'a'), 15),
 (('professor', 'at', 'the'), 15),
 (('those', 'in', 'the'), 15),
 (('to', 'build', 'a'), 15),
 (("don't", 'want', 'to'), 15),
 (('this', 'kind', 'of'), 15),
 (('that', 'has', 'been'), 15),
 (('according', 'to', 'an'), 15),
 (('the', 'los', 'angeles'), 15),
 (('between', 'rich', 'and'), 15),
 (('by', 'the', 'time'), 15),
 (('even', 'if', 'they'), 15),
 (('black', 'lives', 'matter'), 15),
 (('income', 'and', 'wealth'), 15),
 (('and', 'the', 'united'), 15),
 (('at', 'the', 'new'), 15),
 (('on', 'the', 'right'), 15),
 (('p', 'according', 'to'), 15),
 (('the', 'last', 'two'), 15),
 (('on', 'behalf', 'of'), 15),
 (('the', 'developing', 'world'), 15),
 (('social', 'security', 'and'), 15),
 (('mr', 'passos', 'coelho'), 15),
 (('a', 'lot', 'more'), 15),
 (('going', 'to', 'do'), 15),
 (('and', 'we', 'have'), 15),
 (('and', 'health', 'care'), 15),
 (('to', 'try', 'to'), 15),
 (('along', 'with', 'the'), 15),
 (('on', 'the', 'left'), 15),
 (('that', 'there', 'are'), 15),
 (('they', 'are', 'not'), 15),
 (('on', 'income', 'inequality'), 15),
 (('for', 'all', 'the'), 15),
 (('and', 'in', 'the'), 15),
 (('that', 'mr', 'sanders'), 15),
 (('mrs', 'clinton', 'is'), 15),
 (('found', 'that', 'the'), 15),
 (('a', 'professor', 'at'), 15),
 (('president', 'bill', 'clinton'), 15),
 (('p', 'among', 'the'), 15),
 (('of', 'homeless', 'services'), 15),
 (('p', 'as', 'a'), 15),
 (('the', 'rate', 'of'), 15),
 (('in', 'new', 'hampshire'), 15),
 (('went', 'on', 'to'), 15),
 (('top', 'of', 'the'), 15),
 (('p', 'on', 'the'), 15),
 (('united', 'states', 'has'), 15),
 (('make', 'sure', 'that'), 15),
 (('who', 'had', 'been'), 15),
 (('mr', 'clinton', 'said'), 15),
 (('in', 'the', 'house'), 15),
 (('at', 'the', 'state'), 15),
 (('the', 'american', 'people'), 15),
 (('to', 'do', 'so'), 15),
 (('seem', 'to', 'be'), 15),
 (('by', 'the', 'united'), 15),
 (('war', 'on', 'poverty'), 15),
 (('your', 'inbox', 'every'), 15),
 (('less', 'than', 'a'), 15),
 (('poverty', 'and', 'inequality'), 15),
 (('the', 'amount', 'of'), 15),
 (('m', 'i', 't'), 15),
 (('based', 'on', 'the'), 14),
 (('united', 'states', 'in'), 14),
 (('in', 'the', '1990s'), 14),
 (('the', 'other', 'hand'), 14),
 (('not', 'just', 'a'), 14),
 (('best', 'way', 'to'), 14),
 (('who', 'have', 'been'), 14),
 (('in', 'the', 'next'), 14),
 (('the', 'house', 'of'), 14),
 (('and', 'income', 'inequality'), 14),
 (('for', 'american', 'progress'), 14),
 (('a', 'part', 'of'), 14),
 (('the', '1', 'percent'), 14),
 (('there', 'was', 'no'), 14),
 (('share', 'of', 'the'), 14),
 (('mr', 'obama', 'has'), 14),
 (('p', 'i', 'invite'), 14),
 (('department', 'of', 'homeless'), 14),
 (('top', '10', 'percent'), 14),
 (('a', 'range', 'of'), 14),
 (('and', 'around', 'the'), 14),
 (('as', 'a', 'way'), 14),
 (('p', 'he', 'was'), 14),
 (('do', 'more', 'to'), 14),
 (('a', 'matter', 'of'), 14),
 (('for', 'example', 'the'), 14),
 (('for', 'them', 'to'), 14),
 (('seems', 'to', 'be'), 14),
 (('needs', 'to', 'be'), 14),
 (('the', 'benefits', 'of'), 14),
 (('the', 'work', 'of'), 14),
 (('the', 'percentage', 'of'), 14),
 (('the', 'price', 'of'), 14),
 (('sustainable', 'development', 'goals'), 14),
 (('40', 'percent', 'of'), 14),
 (('the', 'world', 'and'), 14),
 (('was', 'the', 'first'), 14),
 (('live', 'in', 'poverty'), 14),
 (('a', 'quarter', 'of'), 14),
 (('to', 'create', 'a'), 14),
 (('for', 'a', 'new'), 14),
 (('can', 'be', 'done'), 14),
 (('who', 'has', 'been'), 14),
 (('this', 'is', 'an'), 14),
 (('lot', 'of', 'people'), 14),
 (('the', 'bottom', 'of'), 14),
 (('are', 'less', 'likely'), 14),
 (('donald', 'j', 'trump'), 14),
 (('p', 'in', 'other'), 14),
 (('the', 'vast', 'majority'), 14),
 (('mrs', 'clinton', 'will'), 14),
 (('there', 'has', 'been'), 14),
 (('of', 'the', 'civil'), 14),
 (('p', 'as', 'the'), 14),
 (('than', 'any', 'other'), 14),
 (('going', 'to', 'have'), 14),
 (('been', 'able', 'to'), 14),
 (('the', 'case', 'that'), 14),
 (('to', 'help', 'them'), 14),
 (('rich', 'and', 'the'), 14),
 (('of', 'the', 'global'), 14),
 (('in', 'the', 'american'), 14),
 (('i', 'invite', 'you'), 14),
 (('20', 'percent', 'of'), 14),
 (("it's", 'not', 'just'), 14),
 (('the', '20th', 'century'), 14),
 (('will', 'be', 'the'), 14),
 (('has', 'long', 'been'), 14),
 (('senator', 'elizabeth', 'warren'), 14),
 (('up', 'with', 'the'), 14),
 (('there', 'are', 'many'), 14),
 (('more', 'than', 'the'), 14),
 (('to', 'do', 'something'), 14),
 (('in', 'san', 'francisco'), 14),
 (('poverty', 'in', 'the'), 14),
 (('of', 'people', 'in'), 14),
 (('center', 'for', 'american'), 14),
 (('associated', 'with', 'the'), 14),
 (('but', 'there', 'is'), 14),
 (('is', 'not', 'just'), 14),
 (('p', 'he', 'added'), 14),
 (('for', 'mrs', 'clinton'), 14),
 (('p', 'if', 'you'), 14),
 (('she', 'said', 'i'), 14),
 (('p', 'he', 'has'), 14),
 (('and', 'for', 'the'), 13),
 (('p', 'the', 'mayor'), 13),
 (('is', 'the', 'same'), 13),
 (('who', 'are', 'in'), 13),
 (('to', 'do', 'more'), 13),
 (('the', 'washington', 'post'), 13),
 (('result', 'of', 'the'), 13),
 (('a', 'kind', 'of'), 13),
 (('the', 'economic', 'ladder'), 13),
 (('as', 'it', 'is'), 13),
 (('in', 'the', 'future'), 13),
 (('head', 'of', 'the'), 13),
 (('economic', 'and', 'political'), 13),
 (('believe', 'that', 'the'), 13),
 (('in', 'the', 'way'), 13),
 (('and', 'his', 'wife'), 13),
 (('in', 'some', 'cases'), 13),
 (('center', 'of', 'the'), 13),
 (('have', 'to', 'do'), 13),
 (('he', 'said', 'i'), 13),
 (('of', 'those', 'who'), 13),
 (('the', 'civil', 'rights'), 13),
 (('state', 'of', 'the'), 13),
 (('the', 'story', 'of'), 13),
 (('michael', 'r', 'bloomberg'), 13),
 (('that', 'would', 'be'), 13),
 (('that', 'we', 'can'), 13),
 (('economist', 'at', 'the'), 13),
 (('women', 'in', 'the'), 13),
 (('to', 'work', 'with'), 13),
 (('of', 'buenos', 'aires'), 13),
 (('but', 'there', 'are'), 13),
 (('minimum', 'wage', 'and'), 13),
 (('the', 'wealthy', 'and'), 13),
 (('to', 'have', 'the'), 13),
 (('the', 'increase', 'in'), 13),
 (('i', 'had', 'to'), 13),
 (('in', 'which', 'he'), 13),
 (('in', 'a', 'recent'), 13),
 (('but', 'in', 'the'), 13),
 (('the', 'democratic', 'presidential'), 13),
 (('the', 'level', 'of'), 13),
 (('just', 'a', 'few'), 13),
 (('those', 'who', 'have'), 13),
 (('p', 'of', 'course'), 13),
 (('me', 'on', 'twitter'), 13),
 (('in', 'the', 'democratic'), 13),
 (('p', 'that', 'is'), 13),
 (('response', 'to', 'the'), 13),
 (('mr', 'sanders', 'was'), 13),
 (('in', 'the', 'region'), 13),
 (('more', 'and', 'more'), 13),
 (('mr', 'sanders', 'is'), 13),
 (('the', 'wall', 'street'), 13),
 (('of', 'the', 'first'), 13),
 (('p', 'after', 'the'), 13),
 (('was', 'in', 'the'), 13),
 (('of', 'the', 'same'), 13),
 (('academy', 'of', 'sciences'), 13),
 (('a', 'spokesman', 'for'), 13),
 (('of', 'housing', 'and'), 13),
 (('the', 'result', 'of'), 13),
 (('thousands', 'of', 'people'), 13),
 (('on', 'the', 'poor'), 13),
 (('to', 'be', 'more'), 13),
 (('mr', 'sanders', 'and'), 13),
 (('in', 'their', 'own'), 13),
 (('g', 'd', 'p'), 13),
 (('the', 'most', 'recent'), 13),
 (('even', 'if', 'the'), 13),
 (('on', 'the', 'rich'), 13),
 (('that', 'of', 'the'), 13),
 (('say', 'they', 'are'), 13),
 (('the', 'federal', 'reserve'), 13),
 (('the', 'trans-pacific', 'partnership'), 13),
 (('earlier', 'this', 'year'), 13),
 (('to', 'find', 'a'), 13),
 (('of', 'health', 'and'), 13),
 (('follow', 'me', 'on'), 13),
 (('the', 'working', 'poor'), 13),
 (('about', 'how', 'to'), 13),
 (('p', 'mr', 'obama'), 13),
 (('said', 'she', 'was'), 13),
 (('every', 'day', 'with'), 13),
 (('said', 'p', 'in'), 13),
 (('of', 'the', 'few'), 13),
 (('when', 'she', 'was'), 13),
 (('the', 'heart', 'of'), 13),
 (('to', 'support', 'the'), 13),
 (('on', 'how', 'to'), 13),
 (('p', 'for', 'example'), 12),
 (('that', 'she', 'was'), 12),
 (('the', 'o', 'e'), 12),
 (('to', 'a', 'new'), 12),
 (('it', 'to', 'the'), 12),
 (('and', 'of', 'course'), 12),
 (('income', 'inequality', 'has'), 12),
 (('they', 'have', 'to'), 12),
 (('to', 'the', 'right'), 12),
 (('want', 'to', 'do'), 12),
 (('a', 'bit', 'of'), 12),
 (('to', 'take', 'a'), 12),
 (('for', 'the', 'next'), 12),
 (('c', 'e', 'o'), 12),
 (('p', 'many', 'of'), 12),
 (('of', 'dollars', 'in'), 12),
 (('fellow', 'at', 'the'), 12),
 (('of', 'a', 'new'), 12),
 (('of', 'people', 'living'), 12),
 (('the', 'head', 'of'), 12),
 (('in', 'the', 'nation'), 12),
 (('p', 'e', 'a'), 12),
 (('would', 'not', 'be'), 12),
 (('of', 'the', 'nation'), 12),
 (('people', 'at', 'the'), 12),
 (('the', 'history', 'of'), 12),
 (('the', 'consequences', 'of'), 12),
 (('the', 'catholic', 'church'), 12),
 (('that', 'have', 'been'), 12),
 (('on', 'the', 'ground'), 12),
 (('the', 'same', 'as'), 12),
 (('the', 'death', 'of'), 12),
 (('a', 'year', 'in'), 12),
 (('is', 'trying', 'to'), 12),
 (('p', 'i', 'am'), 12),
 (('are', 'on', 'the'), 12),
 (('said', 'she', 'had'), 12),
 (('gap', 'between', 'rich'), 12),
 (('that', 'this', 'is'), 12),
 (('occupy', 'wall', 'street'), 12),
 (('no', 'matter', 'how'), 12),
 (('has', 'become', 'a'), 12),
 (('minimum', 'wage', 'p'), 12),
 (('of', 'our', 'time'), 12),
 (('inequality', 'p', 'the'), 12),
 (('in', 'the', 'senate'), 12),
 (('is', 'more', 'than'), 12),
 (('below', 'the', 'poverty'), 12),
 (('out', 'to', 'be'), 12),
 (('social', 'safety', 'net'), 12),
 (('i', 'believe', 'that'), 12),
 (('because', 'of', 'a'), 12),
 (('i', "don't", 'want'), 12),
 (('by', 'the', 'new'), 12),
 (('and', 'they', 'are'), 12),
 (('in', 'the', 'south'), 12),
 (('p', 'during', 'the'), 12),
 (('more', 'likely', 'than'), 12),
 (('p', 'on', 'monday'), 12),
 (('climate', 'change', 'p'), 12),
 (('the', 'institute', 'for'), 12),
 (('one', 'of', 'his'), 12),
 (('we', 'have', 'been'), 12),
 (('new', 'york', 'today'), 12),
 (('of', 'the', 'work'), 12),
 (('be', 'in', 'the'), 12),
 (('more', 'than', '100'), 12),
 (('the', 'general', 'election'), 12),
 (('to', 'be', 'done'), 12),
 (('do', 'not', 'have'), 12),
 (('of', 'homeless', 'people'), 12),
 (('they', 'have', 'been'), 12),
 (('american', 'enterprise', 'institute'), 12),
 (('changes', 'in', 'the'), 12),
 (('in', 'latin', 'america'), 12),
 (('if', 'you', 'want'), 12),
 (('day', 'with', 'the'), 12),
 (('the', 'war', 'on'), 12),
 (('not', 'just', 'the'), 12),
 (('he', 'has', 'a'), 12),
 (('millions', 'of', 'dollars'), 12),
 (('new', 'york', 'university'), 12),
 (('but', 'they', 'are'), 12),
 (('he', 'would', 'be'), 12),
 (('between', 'the', 'rich'), 12),
 (('the', 'republican', 'presidential'), 12),
 (('to', 'reduce', 'inequality'), 12),
 (('in', 'the', 'right'), 12),
 (('the', '21st', 'century'), 12),
 (('senator', 'ted', 'cruz'), 12),
 (('p', 'mr', 'clinton'), 12),
 (('to', 'the', 'new'), 12),
 (('the', 'subject', 'of'), 12),
 (('and', 'economic', 'inequality'), 12),
 (('in', 'one', 'of'), 12),
 (('the', 'era', 'of'), 12),
 (('p', 'some', 'of'), 12),
 (('early', 'childhood', 'education'), 12),
 (('of', 'the', 'law'), 12),
 (('that', 'he', 'has'), 12),
 (('to', 'help', 'people'), 12),
 (('few', 'years', 'ago'), 12),
 (('the', 'working', 'class'), 12),
 (('p', 'for', 'the'), 12),
 (('such', 'as', 'the'), 12),
 (('to', 'respond', 'to'), 12),
 (('the', 'ability', 'to'), 12),
 (('de', 'blasio', 'is'), 12),
 (('was', 'born', 'in'), 12),
 (('in', 'the', 'bronx'), 12),
 (('p', 'if', 'we'), 12),
 (('have', 'been', 'a'), 12),
 (('mrs', 'clinton', 'was'), 12),
 (('of', 'economic', 'growth'), 12),
 (('the', 'fair', 'housing'), 12),
 (('the', 'quality', 'of'), 12),
 (('n', 'y', 'p'), 12),
 (('wall', 'street', 'journal'), 12),
 (('that', 'they', 'have'), 12),
 (('at', 'one', 'point'), 12),
 (('at', 'the', 'federal'), 11),
 (('to', 'mrs', 'clinton'), 11),
 (('workers', 'in', 'the'), 11),
 (('role', 'in', 'the'), 11),
 (('grow', 'up', 'in'), 11),
 (('about', 'half', 'of'), 11),
 (('the', "today's", 'headlines'), 11),
 (('his', 'or', 'her'), 11),
 (('the', 'federal', 'poverty'), 11),
 (('to', 'do', 'p'), 11),
 (('the', 'chance', 'to'), 11),
 (('the', 'poverty', 'rate'), 11),
 (('years', 'p', 'the'), 11),
 (('the', 'chief', 'executive'), 11),
 (('p', 'n', 'f'), 11),
 (('the', 'world', 'has'), 11),
 (('in', 'front', 'of'), 11),
 (('p', 'but', 'as'), 11),
 (('p', 'the', 'most'), 11),
 (('mr', 'sanders', 'also'), 11),
 (('the', 'idea', 'of'), 11),
 (('p', 'over', 'the'), 11),
 (('in', 'a', 'way'), 11),
 (('they', 'have', 'a'), 11),
 (('in', 'exchange', 'for'), 11),
 (('p', 'and', 'the'), 11),
 (('with', 'the', "today's"), 11),
 (('the', 'wages', 'of'), 11),
 (('years', 'ago', 'the'), 11),
 (('the', 'government', 'should'), 11),
 (('0', '1', 'percent'), 11),
 (('get', 'news', 'and'), 11),
 (('housing', 'and', 'urban'), 11),
 (('there', 'have', 'been'), 11),
 (('women', 'who', 'are'), 11),
 (('of', 'it', 'p'), 11),
 (('hundreds', 'of', 'millions'), 11),
 (('the', 'executive', 'director'), 11),
 (('a', 'right', 'to'), 11),
 (('more', 'than', 'three'), 11),
 (('the', 'health', 'care'), 11),
 (('p', 'get', 'news'), 11),
 (('has', 'been', 'the'), 11),
 (('the', 'bloomberg', 'administration'), 11),
 (('is', 'not', 'enough'), 11),
 (('not', 'the', 'only'), 11),
 (('the', 'decline', 'of'), 11),
 (('on', 'twitter', 'p'), 11),
 (('an', 'average', 'of'), 11),
 (('organization', 'for', 'economic'), 11),
 (('capital', 'in', 'the'), 11),
 (('of', 'the', 'church'), 11),
 (('fight', 'for', '15'), 11),
 (('we', 'can', 'do'), 11),
 (('a', 'general', 'election'), 11),
 (('go', 'back', 'to'), 11),
 (('say', 'that', 'the'), 11),
 ...]

In [237]:
# Distinct trigrams observed in each corpus (iterating a counter yields its keys),
# then the merged vocabulary covering both corpora as a list.
poverty_vocab = set(pov_trigram)
news_vocab = set(news_trigram)
total_vocab = list(poverty_vocab | news_vocab)

In [246]:
# Build one record per trigram that occurs at least 5 times in BOTH corpora;
# anything rarer in either corpus is left out of the keyness table.
keyness_data = []
for trigram in total_vocab:
    news_count, poverty_count = news_trigram[trigram], pov_trigram[trigram]

    if news_count >= 5 and poverty_count >= 5:
        # Score is computed from the poverty count/size first, then the news count/size.
        ll_score = calculate_keyness(poverty_count, poverty_tokens,
                                     news_count, news_tokens)
        keyness_data.append(dict(word='_'.join(trigram),
                                 news=news_count,
                                 poverty=poverty_count,
                                 LL=ll_score))

In [242]:
# Number of distinct trigrams in the combined (poverty + news) vocabulary.
len(total_vocab)


Out[242]:
951606

In [247]:
# Tabulate the scored trigrams with a fixed column order, then display them
# ranked from the highest LL value down to the lowest (the sorted view is
# only shown, not stored back into keyness_df).
keyness_df = pd.DataFrame(
    keyness_data,
    columns=['word', 'poverty', 'news', 'LL'],
)
keyness_df.sort_values(by='LL', ascending=False)


Out[247]:
word poverty news LL
1093 mr_de_blasio 144 42 111.106498
2178 the_minimum_wage 104 19 106.493954
1011 p_to_the 62 9 70.281401
411 senator_bernie_sanders 57 7 68.779227
1257 more_likely_to 71 17 62.648658
2509 sign_up_for 127 63 57.904751
1423 the_middle_class 70 19 56.881528
817 mr_sanders_said 58 12 55.706806
2163 on_facebook_and 121 61 54.004843
1449 p_mr_sanders 49 10 47.412156
671 new_york_city 103 54 43.573906
1051 facebook_and_twitter 107 59 42.082258
187 of_the_poor 38 6 41.598845
856 for_the_poor 38 6 41.598845
2329 up_for_our 32 5 35.187939
1999 need_to_know 37 8 34.679883
1089 today_and_get 35 7 34.231837
1697 presidential_race_today 35 7 34.231837
402 of_the_population 35 7 34.231837
1763 draft_newsletter_p 35 7 34.231837
1064 and_get_politics 35 7 34.231837
2042 get_politics_news 35 7 34.231837
1896 p_find_out 35 7 34.231837
651 out_what_you 35 7 34.231837
1398 twitter_and_the 35 7 34.231837
661 politics_news_updates 35 7 34.231837
1954 race_today_and 35 7 34.231837
2307 first_draft_newsletter 35 7 34.231837
1526 in_this_country 44 13 33.622277
1734 the_cost_of 48 16 33.301494
... ... ... ... ...
394 politics_newsletter_p 10 57 -19.375858
438 for_the_first 34 117 -19.645455
990 that_it_was 21 87 -20.023060
2094 said_they_were 7 49 -20.034878
606 p_he_said 5 43 -20.411060
2454 that_they_were 8 53 -20.705282
39 said_in_an 34 121 -21.653055
455 of_the_islamic 6 48 -21.696218
1234 had_not_been 5 45 -21.989602
1898 the_united_states 357 739 -22.453452
1122 said_in_a 39 135 -22.928122
64 the_middle_east 16 79 -22.955481
2023 a_news_conference 8 57 -23.636714
2518 at_the_time 27 110 -24.687482
967 he_said_he 25 106 -25.237329
711 he_had_been 7 56 -25.312254
1672 said_he_was 26 110 -26.107470
630 in_an_interview 48 162 -26.178372
2302 the_death_penalty 5 52 -27.645743
300 said_that_the 25 112 -28.786309
779 in_a_statement 25 112 -28.786309
2470 that_he_had 26 118 -30.855787
1012 the_obama_administration 20 106 -33.389055
2475 he_said_p 79 246 -33.453147
1238 the_condition_of 5 61 -35.157520
1326 said_he_had 20 121 -43.582521
1984 the_justice_department 20 128 -48.528374
2397 the_white_house 51 240 -65.757386
2097 f_b_i 5 208 -170.841911
2280 the_islamic_state 21 296 -181.798753

2521 rows × 4 columns


In [ ]: