In [2]:
import nltk
import pandas as pd
In [3]:
# Load the tab-separated seed-word corpus into a DataFrame.
df = pd.read_csv('data/pov_seedwords.txt', sep='\t')
In [4]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()
Out[4]:
id
corpus
intro
section
length
body
subject
country
0
1
news
1 of 1792 DOCUMENTS //p //p The New York Time...
Section A; Column 0; Business/Financial Desk; ...
1715 words
MIAMI -- For the Ingram clan, working for the...
PUBLIC TRANSPORTATION (90%); MIDDLE INCOME PER...
UNITED STATES (96%)
1
2
news
2 of 1792 DOCUMENTS //p //p The New York Time...
Section A; Column 0; Business/Financial Desk; ...
1832 words
WAXAHACHIE, Tex. -- Most Americans suffered s...
SENIOR CITIZENS (91%); MIDDLE INCOME PERSONS (...
UNITED STATES (94%)
2
3
news
3 of 1792 DOCUMENTS //p //p The New York Time...
Section A; Column 0; Business/Financial Desk; ...
1759 words
When the California Labor Commissioner's Offic...
LABOR FORCE (90%); FREELANCE EMPLOYMENT (90%);...
UNITED STATES (95%)
3
4
news
4 of 1792 DOCUMENTS //p //p The New York Time...
Section A; Column 0; National Desk; Pg. 1
1506 words
SAN BERNARDINO, Calif. -- A heavily armed man ...
SHOOTINGS (92%); GUNSHOT WOUNDS (89%); FIREARM...
UNITED STATES (94%)
4
5
news
5 of 1792 DOCUMENTS //p //p The New York Time...
Section A; Column 0; National Desk; Pg. 1
1075 words
More than one a day. //p That is how often, on...
SHOOTINGS (92%); WOUNDS & INJURIES (90%); GUNS...
UNITED STATES (94%)
In [7]:
In [9]:
# Token pattern: a run of word characters, optionally followed by a single
# apostrophe- or hyphen-joined continuation (e.g. "it's", "so-called").
token_pattern = r'\b(\w+(?:[\'\-]\w+)?)\b'
word_tok = nltk.tokenize.RegexpTokenizer(token_pattern)
In [10]:
# Lower-case and tokenize every document body, flattening all tokens into one
# corpus-wide list.
# NOTE(review): because the list is flat, any window later slid over it can
# span the boundary between two documents, and the "//p" paragraph markers
# visible in the data survive as the token "p".
words_in_texts = [
    token
    for doc in df.body
    for token in word_tok.tokenize(doc.lower())
]
In [25]:
# Build the bigram finder over the whole token list, counting word pairs
# within a window of 4 tokens.
collocates = nltk.BigramCollocationFinder.from_words(
    words_in_texts,
    window_size=4,
)
In [36]:
import copy
# Work on a copy so the filters applied to `poverty` leave `collocates` intact.
# NOTE(review): copy.copy is shallow; this presumably relies on the finder's
# filter methods rebinding (not mutating) its frequency table - confirm against
# the installed NLTK version.
poverty = copy.copy(collocates)
In [34]:
# Drop every pair seen fewer than 50 times.
# NOTE: this filters `poverty` in place, so rarer pairs are gone for any later
# filtering of the same object unless it is re-copied from `collocates` first.
poverty.apply_freq_filter(50)
In [35]:
# Inspect the pair counts that survived the frequency filter.
poverty.ngram_fd
Out[35]:
FreqDist({('to', 'foreign'): 52,
('born', 'in'): 96,
('of', 'obama'): 88,
('rise', 'in'): 75,
('sanders', 'vermont'): 50,
('who', 'a'): 508,
('in', 'washington'): 248,
('sanders', 'the'): 89,
('of', 'technology'): 58,
('to', 'act'): 63,
('them', 'the'): 398,
('course', 'the'): 76,
('said', 'no'): 92,
('5', 'in'): 50,
('of', 'islamic'): 102,
('less', 'and'): 51,
('might', 'be'): 192,
('against', 'p'): 121,
('the', 'over'): 252,
('on', 'media'): 60,
('to', 'people'): 380,
('p', 'editor'): 83,
('an', 'era'): 50,
('not', "it's"): 50,
('name', 'the'): 60,
('has', 'on'): 193,
('mrs', 'the'): 80,
('the', 'things'): 84,
('bush', 'to'): 68,
('then', 'in'): 71,
('use', 'to'): 104,
('used', 'by'): 66,
('federal', 'court'): 56,
('the', 'how'): 108,
('both', 'sides'): 71,
('for', 'at'): 192,
('that', 'him'): 80,
('in', 'this'): 636,
('the', 'iowa'): 60,
('workers', 'p'): 58,
('the', 'about'): 312,
('mr', 'cuomo'): 76,
('percent', 'for'): 75,
('and', 'each'): 50,
('would', 'like'): 78,
('we', 'could'): 96,
('will', 'take'): 69,
('last', 'it'): 50,
('to', 'around'): 73,
('office', 'to'): 50,
('moved', 'to'): 132,
('and', 'several'): 63,
('said', 'has'): 79,
('of', 'history'): 70,
('republican', 'presidential'): 130,
('law', 'p'): 104,
('pay', 'p'): 57,
('so', 'p'): 129,
('people', 'he'): 70,
('students', 'to'): 71,
('like', 'are'): 63,
('programs', 'the'): 64,
('p', 'once'): 57,
('those', 'not'): 53,
('not', 'even'): 72,
("doesn't", 'the'): 59,
('new', 'for'): 116,
('at', 'any'): 65,
('to', 'offer'): 87,
('issue', 'p'): 61,
('he', 'added'): 267,
('director', 'the'): 236,
('problem', 'that'): 65,
('president', 'and'): 151,
('poor', 'and'): 117,
('spent', 'of'): 50,
('with', 'many'): 71,
('version', 'of'): 131,
('all', 'and'): 160,
('see', 'it'): 54,
('out', 'you'): 61,
('the', 'set'): 64,
('center', 'for'): 143,
('people', 'have'): 227,
('to', 'create'): 133,
('changes', 'to'): 61,
('the', 'senate'): 380,
('that', 'america'): 60,
('by', 'united'): 69,
('inequality', 'the'): 182,
('attorney', 'general'): 79,
('as', 'has'): 134,
('left', 'the'): 182,
('today', 'is'): 57,
('is', 'best'): 53,
('spoke', 'of'): 53,
('a', 'while'): 114,
('years', 'that'): 88,
('at', 'university'): 419,
('the', 'end'): 358,
('feel', 'the'): 55,
('a', 'letter'): 74,
('spent', 'the'): 61,
('to', 'without'): 72,
('speech', 'the'): 76,
('is', 'hard'): 54,
('seems', 'to'): 199,
('the', 'ladder'): 57,
('to', 'economy'): 53,
('an', 'with'): 151,
('and', 'very'): 74,
('a', 'shot'): 52,
('000', 'and'): 88,
('that', 'day'): 53,
('pope', 'francis'): 85,
('the', 'could'): 277,
('about', 'policy'): 51,
('justice', 'system'): 56,
('on', 'as'): 171,
('to', 'water'): 57,
('of', 'other'): 215,
('to', 'end'): 128,
("it's", 'like'): 59,
('working', 'with'): 74,
('the', 'without'): 84,
('is', 'an'): 456,
('into', 'and'): 177,
('what', 'have'): 86,
('in', 'october'): 72,
('gun', 'the'): 71,
('by', 'this'): 58,
('her', 'p'): 330,
('a', 'long'): 221,
('the', 'months'): 138,
('as', 'other'): 54,
('does', 'the'): 108,
('and', 'cruz'): 59,
('but', 'those'): 63,
('before', 'to'): 70,
('before', 'in'): 52,
('legislation', 'the'): 66,
('said', 'interview'): 135,
('to', 'stay'): 93,
('of', 'making'): 75,
('san', 'bernardino'): 173,
('it', 'at'): 102,
('of', 'one'): 204,
('i', 'and'): 165,
('on', 'their'): 223,
('a', 'response'): 65,
('at', 'mr'): 102,
('our', 'in'): 104,
('vote', 'in'): 60,
('tend', 'to'): 109,
('law', 'school'): 50,
('chief', 'the'): 71,
('not', 'going'): 105,
('from', 'that'): 165,
('official', 'said'): 79,
('and', 'into'): 147,
('the', 'victims'): 98,
('issues', 'that'): 56,
('in', 'long'): 73,
('between', 'and'): 570,
('in', 'los'): 52,
('was', 'after'): 66,
('york', 'state'): 83,
('one', 'day'): 55,
('the', 'investigation'): 113,
('p', 'still'): 230,
('and', 'become'): 50,
('for', 'families'): 70,
('are', 'with'): 228,
('in', 'most'): 153,
('about', 'he'): 94,
('after', 'and'): 68,
('the', 'government'): 947,
('that', 'his'): 361,
('at', 'her'): 63,
('those', 'in'): 166,
('in', 'short'): 51,
('arguing', 'that'): 59,
('they', 'get'): 85,
('monday', 'the'): 56,
('parents', 'and'): 60,
('sent', 'to'): 103,
('a', 'when'): 238,
('the', 'do'): 150,
('the', 'had'): 770,
('and', 'others'): 199,
('the', 'health'): 216,
('where', 'she'): 64,
('election', 'the'): 75,
('americans', 'in'): 75,
('high', 'the'): 81,
('grew', 'up'): 119,
('and', 'down'): 62,
('the', 'speaker'): 51,
('they', 'about'): 75,
('could', 'be'): 459,
('politics', 'p'): 116,
('reduce', 'the'): 98,
('the', 'deal'): 233,
('of', 'over'): 137,
('to', '15'): 63,
('is', 'good'): 86,
('he', 'mr'): 196,
('view', 'of'): 80,
('no', 'a'): 77,
('israel', 'the'): 55,
('republican', 'and'): 151,
('that', 'make'): 152,
('to', 'middle'): 55,
('hillary', 'clinton'): 274,
('to', 'was'): 143,
('so', 'are'): 64,
('first', 'to'): 185,
('advantage', 'of'): 67,
('p', 'it'): 966,
('both', 'the'): 205,
('federal', 'the'): 89,
('civil', 'liberties'): 63,
('has', 'more'): 167,
('the', 'f'): 388,
('of', 'refugees'): 60,
('be', 'as'): 224,
('offered', 'a'): 58,
('has', 'mr'): 59,
('they', 'mr'): 63,
('not', 'at'): 93,
('perhaps', 'the'): 73,
('i', "don't"): 301,
('so', 'we'): 91,
('p', 'where'): 61,
('you', 'get'): 95,
('his', 'from'): 81,
('at', 'news'): 80,
('in', 'speech'): 106,
('a', 'history'): 85,
('obama', 'and'): 117,
('in', 'years'): 366,
('trump', 'is'): 120,
('of', 'income'): 222,
('was', 'too'): 56,
('what', 'are'): 123,
('to', 'control'): 76,
('members', 'to'): 55,
('democrats', 'and'): 57,
('from', 'with'): 61,
('and', 'may'): 101,
('to', 'whether'): 105,
('away', 'the'): 117,
('the', 'judge'): 91,
('for', 'one'): 106,
('a', 'senator'): 82,
('a', 'period'): 66,
('vote', 'for'): 78,
('all', 'it'): 53,
('free', 'to'): 52,
('of', 'policies'): 71,
('saying', 'that'): 139,
('head', 'the'): 80,
('in', 'way'): 238,
('had', 'his'): 120,
('5', 'million'): 84,
('on', 'we'): 83,
('he', 'of'): 217,
('news', 'and'): 95,
('many', 'have'): 153,
('central', 'the'): 63,
('a', 'speech'): 137,
('he', 'one'): 66,
('with', 'it'): 125,
('oil', 'and'): 74,
('as', 'more'): 86,
('was', 'what'): 63,
('the', 'prospect'): 57,
('and', 'only'): 121,
('time', 'as'): 58,
('of', 'center'): 59,
('are', 'be'): 96,
('police', 'department'): 77,
('spending', 'on'): 58,
('from', 'the'): 2109,
('the', 'local'): 95,
('public', 'p'): 140,
('with', 'them'): 77,
('among', 'them'): 52,
('and', 'new'): 238,
('black', 'lives'): 56,
('more', 'is'): 73,
('who', 'were'): 239,
('to', 'comment'): 89,
('the', 'called'): 92,
('p', 'e'): 51,
('different', 'the'): 78,
('while', 'was'): 67,
('came', 'from'): 79,
('are', 'they'): 103,
('an', 'economic'): 53,
('that', 'about'): 104,
('told', 'the'): 201,
('to', 'sign'): 60,
('one', 'most'): 141,
('that', 'can'): 378,
('consider', 'the'): 61,
('she', 'of'): 74,
('had', 'and'): 151,
('attention', 'the'): 66,
('him', 'as'): 124,
('p', 'has'): 695,
('the', 'author'): 64,
('are', 'on'): 236,
('i', 'just'): 69,
('party', 'to'): 87,
('the', 'all'): 179,
('an', 'increase'): 61,
('she', 'was'): 464,
('twitter', 'and'): 514,
('of', 'climate'): 69,
('and', 'when'): 218,
('during', 'a'): 131,
('p', 'he'): 999,
('and', 'killed'): 61,
('clinton', 'p'): 78,
('a', 'handful'): 56,
('to', 'away'): 75,
('the', 'west'): 163,
('me', 'twitter'): 83,
('you', 'he'): 55,
('about', 'that'): 161,
('income', 'inequality'): 273,
('the', 'officers'): 107,
('of', 'including'): 156,
('officials', 'p'): 111,
('whether', 'a'): 57,
('and', 'first'): 100,
('one', 'and'): 143,
('the', 'team'): 95,
('the', 'years'): 528,
('this', 'i'): 80,
('united', 'has'): 81,
('work', 'force'): 56,
('we', 'will'): 169,
('it', 'unclear'): 51,
('data', 'the'): 118,
('washington', 'a'): 60,
('the', 'being'): 141,
('is', 'and'): 586,
('the', 'goal'): 76,
('obama', 'p'): 65,
('a', 'just'): 72,
('law', 'a'): 54,
('time', 'is'): 73,
('at', 'which'): 64,
('enforcement', 'the'): 58,
('the', 'biggest'): 160,
('p', 'since'): 134,
('but', 'in'): 432,
('to', 'time'): 64,
('a', 'moment'): 92,
('asked', 'if'): 55,
('it', 'be'): 520,
('his', 'wife'): 207,
('such', 'p'): 63,
('a', 'church'): 50,
('the', 'greatest'): 67,
('would', 'more'): 82,
('to', 'education'): 60,
('she', 'she'): 145,
('supporters', 'of'): 70,
('keep', 'the'): 118,
('p', "that's"): 143,
('conservative', 'and'): 53,
('done', 'the'): 70,
('programs', 'and'): 58,
('day', 'p'): 97,
('without', 'a'): 109,
('like', 'the'): 473,
('security', 'and'): 154,
('for', 'months'): 76,
('that', 'get'): 53,
('problem', 'p'): 68,
('are', 'often'): 114,
('large', 'of'): 119,
('p', 'to'): 1123,
('be', 'with'): 172,
('family', 'p'): 94,
('by', 'her'): 58,
('among', 'p'): 59,
('e', 'c'): 53,
('even', 'more'): 135,
('in', 'country'): 308,
('that', 'means'): 53,
('have', 'p'): 234,
('to', 'set'): 57,
('fight', 'the'): 97,
('me', 'to'): 95,
('americans', 'are'): 83,
('right', 'in'): 55,
('p', 'case'): 84,
('so', 'could'): 56,
('much', 'p'): 71,
('the', 'number'): 315,
('said', 'that'): 1187,
('than', 'an'): 51,
('forces', 'the'): 76,
('order', 'the'): 51,
('source', 'of'): 92,
('care', 'and'): 135,
('after', 'to'): 61,
('their', 'with'): 82,
('iran', 'and'): 52,
('a', 'now'): 73,
('was', 'or'): 64,
('can', 'get'): 57,
('not', 'in'): 404,
('a', 'they'): 188,
('to', 'engage'): 50,
('000', 'of'): 64,
('at', 'of'): 880,
('me', 'and'): 117,
('social', 'security'): 110,
('a', 'bad'): 77,
('time', 'on'): 58,
('other', 'that'): 130,
('the', 'saying'): 73,
('a', 'former'): 408,
('job', 'p'): 61,
('the', 'proposed'): 55,
('new', 'of'): 199,
('of', 'states'): 210,
('mr', 'had'): 299,
('child', 'and'): 54,
('he', 'left'): 53,
('having', 'a'): 87,
('of', 'them'): 438,
('a', 'where'): 189,
('so', 'the'): 348,
('were', 'p'): 205,
('in', 'were'): 103,
('and', 'sanders'): 57,
('more', 'they'): 62,
('my', 'and'): 140,
('the', 'comes'): 53,
('for', 'as'): 176,
('the', 'senator'): 140,
('go', 'p'): 70,
('program', 'p'): 51,
('his', 'the'): 677,
('ago', 'p'): 60,
('democratic', 'the'): 63,
('came', 'in'): 67,
('a', 'less'): 56,
('two', 'and'): 116,
('be', 'mr'): 97,
('is', 'how'): 105,
('the', 'itself'): 59,
('that', 'could'): 477,
('other', 'for'): 51,
('the', 'effects'): 92,
('of', 'some'): 167,
('a', 'from'): 514,
('debate', 'the'): 114,
('of', 'she'): 189,
('the', 'report'): 260,
('to', 'look'): 92,
('around', 'p'): 110,
('have', 'been'): 1204,
('in', 'chicago'): 50,
('the', 'chance'): 55,
('data', 'that'): 51,
('governor', 'of'): 74,
('commitment', 'to'): 66,
('we', 'all'): 111,
('how', 'much'): 157,
('to', 'better'): 106,
('the', 'video'): 104,
('the', 'bronx'): 63,
('case', 'for'): 56,
('well', 'as'): 301,
('for', 'his'): 387,
('the', 'interest'): 55,
('with', 'or'): 112,
('north', 'charleston'): 59,
('doing', 'the'): 73,
('one', 'point'): 57,
('clinton', 'in'): 91,
('that', 'say'): 56,
('country', 'to'): 95,
('and', 'was'): 585,
('was', 'on'): 327,
('the', 'banks'): 52,
('as', 'the'): 1896,
('e', 'a'): 125,
('people', 'they'): 84,
('question', 'whether'): 61,
('ways', 'that'): 59,
('to', 'work'): 397,
('likely', 'be'): 113,
('laws', 'and'): 51,
('of', 'change'): 92,
('since', 'and'): 50,
('p', 'every'): 64,
('the', 'federal'): 523,
('need', 'the'): 103,
('more', 'years'): 90,
('behind', 'p'): 52,
('two', 'in'): 107,
('several', 'the'): 82,
('in', 'that'): 1226,
('held', 'a'): 56,
('an', 'independent'): 55,
('and', 'get'): 147,
('have', 'become'): 105,
('higher', 'than'): 69,
('give', 'to'): 70,
('involved', 'the'): 95,
('70', 'percent'): 53,
('is', 'a'): 2450,
('who', 'made'): 50,
('women', 'who'): 119,
('the', 'schools'): 67,
('to', 'an'): 700,
('the', 'foundation'): 180,
('free', 'and'): 54,
('the', 'former'): 316,
('get', 'out'): 78,
('politics', 'newsletter'): 67,
('in', 'high'): 60,
('in', '2016'): 101,
('addition', 'to'): 81,
('wrote', 'a'): 80,
('campaign', 'that'): 82,
('years', 'to'): 156,
('and', 'said'): 462,
('that', 'after'): 58,
('to', 'develop'): 52,
('mr', "obama's"): 188,
('so', 'you'): 59,
('just', 'for'): 58,
('had', 'no'): 130,
('time', 'said'): 55,
('not', 'clear'): 60,
('is', 'he'): 180,
('american', 'in'): 152,
('and', 'would'): 289,
('from', 'said'): 53,
('be', 'and'): 340,
('and', 'among'): 94,
('funding', 'for'): 60,
('in', 'san'): 154,
('to', 'attention'): 59,
('believe', 'in'): 53,
('to', 'home'): 94,
('of', 'poor'): 110,
('issue', 'is'): 55,
('think', "it's"): 50,
('marriage', 'in'): 52,
('a', 'high'): 139,
('to', 'things'): 75,
('that', 'some'): 160,
('were', 'by'): 200,
('president', 'said'): 62,
('well', 'the'): 190,
('violence', 'and'): 51,
('on', 'island'): 54,
('win', 'the'): 83,
('making', 'the'): 114,
('in', 'school'): 107,
('had', 'in'): 304,
('public', 'in'): 113,
('used', 'be'): 50,
('spokesman', 'for'): 90,
('in', 'industry'): 62,
('a', 'small'): 216,
('should', 'to'): 175,
('because', 'not'): 79,
('that', 'time'): 108,
('p', 'mr'): 2122,
('city', 'and'): 95,
('been', 'by'): 335,
('the', 'right'): 447,
('given', 'the'): 158,
('at', 'is'): 68,
('what', 'about'): 72,
('university', 'of'): 366,
('a', 'broader'): 68,
('we', 'now'): 52,
('leaders', 'to'): 77,
('right', 'and'): 91,
('to', 'any'): 192,
('all', 'in'): 150,
('future', 'p'): 64,
('chairman', 'of'): 140,
('under', 'p'): 61,
('every', 'p'): 61,
('evidence', 'the'): 66,
('of', 'do'): 58,
('of', 'weapons'): 53,
('left', 'in'): 66,
('to', 'republican'): 87,
('who', 'do'): 66,
('was', 'to'): 1302,
('to', 'stop'): 149,
('the', 'environmental'): 65,
('around', 'a'): 56,
('get', 'a'): 172,
('system', 'that'): 98,
('asked', 'a'): 55,
('has', 'also'): 145,
('a', 'culture'): 64,
("it's", 'going'): 50,
('to', 'part'): 59,
('comes', 'to'): 138,
('of', 'where'): 119,
('p', 'states'): 94,
('department', 'a'): 61,
('over', 'past'): 114,
('government', 'is'): 76,
('campaign', 'and'): 84,
('scott', 'walker'): 52,
('the', 'police'): 667,
('p', 'senator'): 104,
('which', 'have'): 95,
('a', 'called'): 103,
('poverty', 'the'): 130,
('in', 'i'): 238,
('got', 'to'): 82,
('of', 'party'): 118,
('for', 'so'): 60,
('on', 'are'): 74,
('concerned', 'about'): 63,
('recent', 'years'): 167,
('a', 'local'): 81,
('that', 'she'): 352,
('the', 'houthis'): 54,
('p', 'other'): 201,
('the', 'institute'): 156,
('that', 'same'): 61,
('that', 'much'): 70,
('the', 'homeless'): 83,
('worked', 'with'): 59,
('at', 'end'): 101,
('national', 'the'): 94,
('of', "that's"): 50,
('part', 'of'): 722,
('and', "they're"): 54,
('the', 'benefits'): 99,
('on', 'these'): 51,
('i', 'my'): 111,
('there', 'a'): 490,
('be', 'for'): 306,
('man', 'the'): 71,
('to', 'obama'): 96,
('has', 'p'): 127,
('a', 'presidential'): 103,
('p', 'and'): 1173,
('p', 'well'): 50,
('follow', 'me'): 84,
('which', 'had'): 82,
('engaged', 'in'): 60,
('against', 'a'): 98,
('other', 'as'): 51,
('years', 'later'): 74,
('percent', 'said'): 56,
('would', 'to'): 621,
('of', 'security'): 108,
('gay', 'and'): 54,
('of', 'groups'): 81,
('n', 'y'): 60,
('this', 'of'): 340,
('national', 'in'): 68,
('said', 'if'): 98,
('says', 'the'): 86,
('with', 'in'): 420,
('era', 'of'): 64,
('concluded', 'that'): 72,
('which', 'is'): 395,
('do', 'i'): 58,
('our', 'the'): 152,
('on', 'whether'): 52,
('obama', 'administration'): 163,
('put', 'it'): 98,
('with', 'about'): 71,
('or', 'from'): 50,
('the', 'matter'): 78,
('and', 'up'): 598,
('the', 'my'): 91,
('and', 'white'): 62,
('in', 'right'): 56,
('even', 'of'): 105,
('d', 'a'): 95,
('to', 'sell'): 70,
('to', 'congress'): 66,
('concerns', 'about'): 61,
('for', 'state'): 83,
('the', 'have'): 784,
('in', 'of'): 2271,
('mr', 'and'): 612,
('to', 'every'): 83,
('is', 'the'): 2475,
('and', 'be'): 248,
('path', 'to'): 55,
('candidates', 'to'): 70,
('for', 'these'): 57,
('rather', 'a'): 63,
('even', 'with'): 50,
('to', 'one'): 289,
('one', 'the'): 1032,
('they', 'not'): 305,
('in', 'debate'): 68,
('the', 'trump'): 99,
('on', 'campaign'): 58,
('10', 'of'): 67,
('into', 'an'): 51,
('he', 'asked'): 53,
('cut', 'the'): 51,
('in', 'system'): 51,
('the', 'fight'): 134,
('officials', 'said'): 262,
('even', 'a'): 178,
('program', 'to'): 80,
('000', 'year'): 69,
('the', 'ones'): 108,
('and', 'over'): 149,
('that', 'when'): 139,
('only', 'in'): 137,
('all', 'those'): 55,
('to', 'nuclear'): 51,
('in', 'front'): 105,
('of', 'court'): 71,
('has', 'never'): 75,
('mr', "bush's"): 98,
('to', 'win'): 144,
('have', 'up'): 67,
('there', 'were'): 225,
('p', 'republican'): 90,
('spoke', 'the'): 91,
('to', 'themselves'): 99,
('next', 'the'): 90,
('in', 'all'): 160,
('of', 'florida'): 89,
('government', 'of'): 77,
('the', 'so-called'): 76,
('most', 'recent'): 63,
('the', 'very'): 185,
('and', 'about'): 290,
('of', '10'): 54,
('and', 'financial'): 56,
('is', 'all'): 104,
('so', 'and'): 95,
('and', 'according'): 71,
('on', 'they'): 120,
('happened', 'the'): 50,
('a', 'little'): 166,
('the', 'issue'): 329,
('at', 'they'): 58,
('tied', 'to'): 55,
('of', 'young'): 100,
('or', 'and'): 143,
('000', 'to'): 110,
('do', 'to'): 233,
('this', 'was'): 203,
('you', 'be'): 73,
('p', 'among'): 74,
('the', 'but'): 1003,
('that', 'any'): 79,
('that', 'might'): 161,
('because', 'he'): 146,
('all', 'is'): 68,
('part', 'a'): 163,
('they', 'say'): 95,
('with', 'from'): 104,
('government', 'and'): 181,
('in', 'has'): 252,
('had', 'not'): 207,
('only', 'percent'): 113,
('food', 'stamps'): 74,
('have', 'so'): 55,
('to', 'campaign'): 74,
('but', 'if'): 129,
('an', 'but'): 85,
('on', 'an'): 177,
('you', "can't"): 94,
('those', 'and'): 80,
('he', 'for'): 195,
('the', 'opinion'): 364,
('decision', 'the'): 89,
('hold', 'the'): 67,
('been', 'with'): 111,
('things', 'that'): 67,
('months', 'the'): 113,
('the', 'impact'): 108,
('that', 'or'): 80,
('long', 'of'): 78,
('government', 'p'): 150,
('showed', 'that'): 71,
('fact', 'the'): 93,
('states', 'and'): 273,
('than', 'as'): 53,
('here', 'that'): 50,
('conference', 'in'): 51,
('them', 'with'): 82,
('the', 'we'): 506,
('a', 'couple'): 100,
('in', 'what'): 130,
('past', 'years'): 100,
('himself', 'as'): 69,
('to', 'support'): 228,
('you', 'up'): 51,
('known', 'for'): 65,
('of', 'with'): 431,
('senator', 'sanders'): 89,
('immigration', 'and'): 61,
('great', 'of'): 60,
('them', 'of'): 66,
('in', '2011'): 191,
('today', 'newsletter'): 350,
('this', 'been'): 54,
('of', 'both'): 85,
('media', 'and'): 66,
('were', 'with'): 124,
('support', 'the'): 224,
('real', 'estate'): 123,
('the', 'early'): 142,
('to', 'them'): 437,
('i', 'want'): 154,
('these', 'p'): 65,
('at', 'top'): 85,
('more', 'a'): 335,
('some', 'said'): 54,
('of', 'inequality'): 207,
('one', 'that'): 259,
('officials', 'and'): 95,
('how', 'are'): 81,
('was', 'no'): 146,
('the', 'after'): 247,
('like', 'a'): 332,
('but', 'is'): 458,
('to', 'research'): 53,
('recent', 'the'): 122,
('the', 'here'): 132,
('time', 'in'): 190,
('and', 'bush'): 51,
('2008', 'the'): 54,
('president', 'for'): 55,
('to', 'hold'): 98,
('appears', 'to'): 93,
('executive', 'director'): 104,
('white', 'the'): 99,
('issue', 'of'): 88,
('hours', 'the'): 56,
('the', 'still'): 146,
('less', 'than'): 323,
('pay', 'for'): 160,
('economic', 'that'): 76,
('for', 'years'): 371,
('young', 'and'): 76,
('world', 'is'): 83,
('in', 'interview'): 325,
('parents', 'the'): 58,
('a', 'bill'): 119,
('the', 'instead'): 58,
('making', 'it'): 63,
('the', 'under'): 75,
('three', 'p'): 51,
('to', 'think'): 117,
('if', 'in'): 73,
('new', 'hampshire'): 147,
('national', 'and'): 103,
('of', 'country'): 143,
('ended', 'up'): 57,
('to', 'maintain'): 50,
('p', 'have'): 534,
('i', 'did'): 58,
('added', 'that'): 116,
('to', 'join'): 125,
('americans', 'have'): 69,
('the', 'outcome'): 54,
('we', 'a'): 375,
('be', 'or'): 83,
('are', 'out'): 71,
('w', 'bush'): 108,
('who', 'the'): 1029,
('that', 'such'): 56,
('said', 'will'): 67,
('companies', 'are'): 53,
('and', 'inequality'): 109,
('who', 'want'): 70,
('case', 'of'): 89,
('record', 'of'): 72,
('our', 'p'): 237,
('the', 'use'): 156,
('and', 'policy'): 101,
('the', 'massacre'): 50,
('the', 'i'): 823,
('also', 'been'): 51,
('like', 'this'): 103,
('in', 'november'): 99,
('say', 'they'): 190,
('way', 'of'): 151,
('just', 'of'): 159,
('also', 'that'): 222,
('up', 'with'): 205,
('out', 'he'): 52,
('her', 'was'): 128,
('the', 'situation'): 82,
('work', 'p'): 157,
('chief', 'of'): 182,
('that', "he's"): 58,
('to', 'violence'): 56,
('for', 'ms'): 55,
('is', 'not'): 980,
('here', 'to'): 61,
('make', 'to'): 85,
('2014', 'the'): 92,
('so', 'would'): 50,
('at', 'age'): 66,
('and', 'money'): 60,
('own', 'p'): 150,
('under', 'of'): 79,
('a', 'against'): 126,
('of', 'like'): 160,
('program', 'in'): 59,
('law', 'that'): 89,
('of', 'will'): 183,
('long', 'the'): 157,
('and', 'yet'): 115,
('most', 'is'): 56,
('cruz', 'and'): 56,
('not', 'the'): 1284,
('of', 'civil'): 68,
('ways', 'to'): 104,
('that', 'up'): 86,
('growth', 'the'): 74,
('you', 'do'): 112,
('than', 'they'): 100,
('the', 'line'): 167,
('general', 'the'): 74,
('the', 'vatican'): 75,
('and', 'students'): 55,
('even', 'for'): 75,
('lives', 'in'): 69,
('but', 'would'): 88,
('review', 'the'): 62,
('in', 'name'): 52,
('the', 'result'): 103,
('then', 'he'): 55,
('p', 'but'): 1648,
('which', 'been'): 67,
('and', 'p'): 2105,
('the', 'paris'): 53,
('of', 'say'): 67,
('by', 'to'): 276,
('p', 'course'): 68,
('about', 'said'): 59,
("that's", 'to'): 50,
('a', 'clinton'): 50,
('the', 'death'): 182,
('washington', 'on'): 86,
('they', 'can'): 172,
('had', 'taken'): 66,
('also', 'to'): 318,
('p', "here's"): 50,
('they', 'also'): 118,
('administration', 'the'): 109,
('head', 'of'): 105,
('americans', 'p'): 67,
('the', 'safety'): 113,
('now', 'he'): 54,
('his', 'had'): 116,
('the', 'he'): 1221,
('wrong', 'the'): 50,
('or', 'or'): 115,
('lives', 'the'): 74,
('s', 'to'): 50,
('in', 'north'): 92,
('and', 'found'): 88,
('c', 'c'): 61,
('said', 'when'): 90,
('and', 'president'): 122,
('tuesday', 'the'): 68,
('mr', 'also'): 124,
('need', 'to'): 600,
('he', 'can'): 95,
('i', 'can'): 102,
('said', 'were'): 177,
('p', 'washington'): 220,
('the', 'truth'): 113,
('very', 'different'): 56,
('referred', 'to'): 54,
('p', 'do'): 137,
('should', 'in'): 56,
('big', 'in'): 58,
('and', 'years'): 61,
('had', 'but'): 56,
('to', 'safety'): 69,
('far', 'than'): 74,
('can', 'make'): 51,
...})
In [37]:
# Start from a fresh copy of the full finder: `poverty` was cut down earlier
# with a 50-occurrence threshold, and reusing that object would silently drop
# every rarer pair when the notebook is run from top to bottom.
poverty = copy.copy(collocates)
# Keep only pairs in which one of the two words contains "poverty" (variants
# such as "anti-poverty" are kept as well). Each word is tested on its own so
# that a match can never come from the junction of two concatenated words.
poverty.apply_ngram_filter(lambda x, y: 'poverty' not in x and 'poverty' not in y)
In [40]:
# Discard pairs that occur fewer than 5 times.
poverty.apply_freq_filter(5)
In [41]:
# Show the remaining poverty-related pairs together with their counts.
poverty.ngram_fd
Out[41]:
FreqDist({('a', 'poverty'): 26,
('abject', 'poverty'): 5,
('about', 'poverty'): 22,
('above', 'poverty'): 5,
('address', 'poverty'): 8,
('against', 'poverty'): 7,
('americans', 'poverty'): 6,
('an', 'anti-poverty'): 7,
('an', 'poverty'): 5,
('and', 'poverty'): 65,
('anti-poverty', 'programs'): 8,
('antipoverty', 'programs'): 5,
('are', 'poverty'): 5,
('as', 'poverty'): 8,
('at', 'poverty'): 11,
('be', 'poverty'): 7,
('below', 'poverty'): 25,
('between', 'poverty'): 5,
('by', 'poverty'): 9,
('child', 'poverty'): 7,
('childhood', 'poverty'): 11,
('children', 'poverty'): 7,
('combating', 'poverty'): 6,
('concentrated', 'poverty'): 5,
('cycle', 'poverty'): 6,
('deep', 'poverty'): 5,
('dire', 'poverty'): 5,
('economic', 'poverty'): 5,
('end', 'poverty'): 9,
('escaping', 'poverty'): 8,
('evicted', 'poverty'): 7,
('extreme', 'poverty'): 46,
('federal', 'poverty'): 13,
('fight', 'poverty'): 7,
('fighting', 'poverty'): 10,
('for', 'poverty'): 12,
('from', 'poverty'): 21,
('global', 'poverty'): 14,
('has', 'poverty'): 6,
('have', 'poverty'): 7,
('high', 'poverty'): 5,
('in', 'poverty'): 100,
('income', 'poverty'): 6,
('inequality', 'poverty'): 16,
('into', 'poverty'): 19,
('is', 'poverty'): 8,
('issues', 'poverty'): 6,
('its', 'poverty'): 8,
('level', 'poverty'): 6,
('like', 'poverty'): 7,
('live', 'poverty'): 23,
('lives', 'poverty'): 8,
('living', 'poverty'): 24,
('more', 'poverty'): 10,
('not', 'poverty'): 6,
('odds', 'poverty'): 8,
('of', 'antipoverty'): 5,
('of', 'poverty'): 212,
('official', 'poverty'): 5,
('on', 'poverty'): 45,
('or', 'poverty'): 8,
('out', 'poverty'): 53,
('p', 'poverty'): 21,
('people', 'poverty'): 30,
('poverty', '2030'): 8,
('poverty', 'a'): 43,
('poverty', 'about'): 5,
('poverty', 'according'): 5,
('poverty', 'also'): 6,
('poverty', 'america'): 7,
('poverty', 'among'): 9,
('poverty', 'an'): 6,
('poverty', 'and'): 198,
('poverty', 'are'): 14,
('poverty', 'around'): 5,
('poverty', 'as'): 17,
('poverty', 'at'): 14,
('poverty', 'be'): 11,
('poverty', 'because'): 5,
('poverty', 'but'): 23,
('poverty', 'by'): 22,
('poverty', 'center'): 12,
('poverty', 'climate'): 5,
('poverty', 'could'): 5,
('poverty', 'crime'): 7,
('poverty', 'desmond'): 5,
('poverty', 'disease'): 5,
('poverty', 'for'): 17,
('poverty', 'francis'): 5,
('poverty', 'from'): 11,
('poverty', 'has'): 20,
('poverty', 'have'): 9,
('poverty', 'he'): 20,
('poverty', 'health'): 7,
('poverty', 'hunger'): 6,
('poverty', 'i'): 8,
('poverty', 'in'): 72,
('poverty', 'income'): 9,
('poverty', 'inequality'): 32,
('poverty', 'into'): 5,
('poverty', 'is'): 49,
('poverty', 'issues'): 6,
('poverty', 'it'): 17,
('poverty', "it's"): 6,
('poverty', 'just'): 7,
('poverty', 'law'): 12,
('poverty', 'level'): 8,
('poverty', 'levels'): 5,
('poverty', 'line'): 42,
('poverty', 'more'): 13,
('poverty', 'mr'): 17,
('poverty', 'not'): 15,
('poverty', 'now'): 7,
('poverty', 'of'): 23,
('poverty', 'on'): 10,
('poverty', 'one'): 8,
('poverty', 'or'): 27,
('poverty', 'other'): 5,
('poverty', 'over'): 5,
('poverty', 'p'): 81,
('poverty', 'percent'): 6,
('poverty', 'poverty'): 6,
('poverty', 'profit'): 7,
('poverty', 'project'): 8,
('poverty', 'rate'): 25,
('poverty', 'rates'): 10,
('poverty', 'reduction'): 9,
('poverty', 'said'): 27,
('poverty', 'seem'): 5,
('poverty', 'so'): 7,
('poverty', 'still'): 5,
('poverty', 'than'): 15,
('poverty', 'that'): 27,
('poverty', 'the'): 130,
('poverty', 'their'): 5,
('poverty', 'they'): 10,
('poverty', 'this'): 20,
('poverty', 'through'): 5,
('poverty', 'to'): 38,
('poverty', 'united'): 5,
('poverty', 'violence'): 5,
('poverty', 'was'): 13,
('poverty', 'we'): 10,
('poverty', 'welfare'): 5,
('poverty', 'who'): 10,
('poverty', 'will'): 6,
('poverty', 'with'): 8,
('poverty', 'worldwide'): 6,
('poverty', 'would'): 10,
('progress', 'poverty'): 6,
('reduce', 'poverty'): 10,
('reducing', 'poverty'): 7,
('rise', 'poverty'): 5,
('solution', 'poverty'): 5,
('southern', 'poverty'): 12,
('that', 'poverty'): 15,
('the', 'antipoverty'): 8,
('the', 'poverty'): 184,
('their', 'poverty'): 11,
('to', 'poverty'): 85,
('unemployment', 'poverty'): 7,
('united', 'poverty'): 5,
('urban', 'poverty'): 5,
('war', 'poverty'): 16,
('was', 'poverty'): 7,
('way', 'poverty'): 6,
('will', 'poverty'): 5,
('with', 'poverty'): 16})
In [49]:
# Take a separate copy of the finder and discard every pair in which neither
# word is exactly "class".
class_colls = copy.copy(collocates)
class_colls.apply_ngram_filter(lambda x, y: 'class' not in (x, y))
In [51]:
# Discard pairs that occur fewer than 5 times.
class_colls.apply_freq_filter(5)
In [54]:
# Show the pairs containing "class" together with their counts.
class_colls.ngram_fd
Out[54]:
FreqDist({('a', 'class'): 40,
('about', 'class'): 10,
('american', 'class'): 8,
('and', 'class'): 39,
('as', 'class'): 14,
('billionaire', 'class'): 7,
('by', 'class'): 8,
('class', 'a'): 29,
('class', 'about'): 6,
('class', 'and'): 71,
('class', 'are'): 12,
('class', 'as'): 12,
('class', 'at'): 7,
('class', 'be'): 7,
('class', 'but'): 21,
('class', 'by'): 6,
('class', 'even'): 6,
('class', 'from'): 7,
('class', 'had'): 5,
('class', 'has'): 9,
('class', 'have'): 5,
('class', 'he'): 11,
('class', 'i'): 10,
('class', 'if'): 5,
('class', 'in'): 35,
('class', 'inequality'): 6,
('class', 'is'): 24,
('class', 'it'): 10,
('class', "it's"): 9,
('class', 'like'): 8,
('class', 'more'): 9,
('class', 'mr'): 12,
('class', 'not'): 7,
('class', 'of'): 25,
('class', 'on'): 9,
('class', 'or'): 7,
('class', 'p'): 67,
('class', 'poor'): 6,
('class', 'said'): 11,
('class', 'than'): 8,
('class', 'that'): 26,
('class', 'the'): 86,
('class', 'their'): 5,
('class', 'they'): 5,
('class', 'this'): 7,
('class', 'to'): 22,
('class', 'war'): 5,
('class', 'warfare'): 6,
('class', 'was'): 8,
('class', 'well'): 5,
('class', 'when'): 7,
('class', 'who'): 9,
('class', 'with'): 10,
('first', 'class'): 6,
('for', 'class'): 19,
('from', 'class'): 5,
('his', 'class'): 5,
('in', 'class'): 23,
('into', 'class'): 11,
('is', 'class'): 5,
('lower', 'class'): 5,
('middle', 'class'): 203,
('new', 'class'): 7,
('of', 'class'): 60,
('on', 'class'): 15,
('or', 'class'): 13,
('p', 'class'): 9,
('political', 'class'): 9,
('poor', 'class'): 8,
('race', 'class'): 8,
('social', 'class'): 8,
('that', 'class'): 9,
('the', 'class'): 215,
('to', 'class'): 18,
('upper', 'class'): 6,
('white', 'class'): 6,
('with', 'class'): 6,
('working', 'class'): 41})
In [59]:
# Tabulate the filtered pairs: one row per pair, with the two words joined by
# ' - ' and the pair's count alongside.
pair_rows = [
    (' - '.join(pair), count)
    for pair, count in class_colls.ngram_fd.items()
]
class_colls_df = pd.DataFrame(pair_rows, columns=('collocate pair', 'freq'))
In [64]:
# Rank pairs from most to least frequent. DataFrame.sort is deprecated (and
# absent from later pandas releases); sort_values is the supported spelling.
class_colls_df.sort_values(by='freq', ascending=False)
/Users/Matt/anaconda3/lib/python3.4/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
if __name__ == '__main__':
Out[64]:
collocate pair
freq
4
the - class
215
51
middle - class
203
11
class - the
86
45
class - and
71
72
class - p
67
59
of - class
60
47
working - class
41
75
a - class
40
26
and - class
39
57
class - in
35
6
class - a
29
3
class - that
26
77
class - of
25
28
class - is
24
9
in - class
23
13
class - to
22
49
class - but
21
66
for - class
19
41
to - class
18
31
on - class
15
54
as - class
14
1
or - class
13
32
class - are
12
30
class - as
12
58
class - mr
12
61
class - he
11
15
class - said
11
27
into - class
11
63
class - with
10
64
class - it
10
...
...
...
60
class - at
7
16
class - from
7
38
class - not
7
56
new - class
7
37
class - when
7
25
class - be
7
29
class - this
7
2
billionaire - class
7
76
class - or
7
69
class - inequality
6
71
white - class
6
73
class - even
6
18
class - about
6
33
class - warfare
6
42
first - class
6
10
with - class
6
70
upper - class
6
40
class - by
6
68
class - poor
6
5
class - if
5
74
from - class
5
19
class - had
5
65
is - class
5
20
lower - class
5
24
class - their
5
48
class - they
5
34
class - have
5
35
class - well
5
36
class - war
5
0
his - class
5
78 rows × 2 columns
In [67]:
def get_collocates(word, collocates, min_freq=5):
    """Return the collocation pairs that contain ``word``, most frequent first.

    Parameters
    ----------
    word : str
        Word that must appear, as an exact match, in either position of a pair.
    collocates : object
        Collocation finder exposing ``apply_ngram_filter``,
        ``apply_freq_filter`` and an ``ngram_fd`` mapping of pair -> count.
        A shallow copy is filtered rather than the object passed in.
    min_freq : int, optional
        Pairs occurring fewer than this many times are dropped (default 5).

    Returns
    -------
    pandas.DataFrame
        Columns ``'collocate pair'`` (the two words joined by ``' - '``) and
        ``'freq'``, sorted by descending ``freq``.
    """
    wcolls = copy.copy(collocates)
    # The predicate marks pairs to REMOVE: those where neither word is `word`.
    wcolls.apply_ngram_filter(lambda x, y: word != x and word != y)
    wcolls.apply_freq_filter(min_freq)
    rows = [(' - '.join(pair), freq) for pair, freq in wcolls.ngram_fd.items()]
    wcolls_df = pd.DataFrame(rows, columns=['collocate pair', 'freq'])
    # DataFrame.sort is deprecated and missing from later pandas releases;
    # sort_values is the supported equivalent.
    return wcolls_df.sort_values(by='freq', ascending=False)
In [71]:
# Rank the collocates of "inequality" using the default minimum frequency.
get_collocates('inequality',collocates)
/Users/Matt/anaconda3/lib/python3.4/site-packages/ipykernel/__main__.py:6: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
Out[71]:
collocate pair
freq
97
income - inequality
273
46
of - inequality
207
132
inequality - the
182
152
inequality - and
159
85
inequality - p
137
144
and - inequality
109
89
the - inequality
107
161
inequality - in
104
48
to - inequality
99
20
inequality - is
84
113
inequality - a
73
45
on - inequality
72
86
economic - inequality
68
146
about - inequality
67
1
that - inequality
65
51
inequality - to
64
23
in - inequality
47
81
inequality - has
45
69
inequality - of
43
96
inequality - it
43
56
inequality - that
42
84
p - inequality
41
155
rising - inequality
37
31
poverty - inequality
32
135
inequality - but
30
118
inequality - as
30
65
reduce - inequality
27
170
growing - inequality
26
74
inequality - for
21
60
inequality - not
21
...
...
...
3
inequality - had
5
172
who - inequality
5
115
level - inequality
5
9
inequality - recent
5
112
inequality - there
5
38
inequality - even
5
61
inequality - other
5
63
combat - inequality
5
37
inequality - its
5
67
problem - inequality
5
70
was - inequality
5
83
crisis - inequality
5
30
inequality - individuals
5
87
inequality - public
5
22
inequality - largely
5
111
roots - inequality
5
18
inequality - poor
5
143
inequality - up
5
120
inequality - inevitable
5
123
inequality - most
5
124
increased - inequality
5
125
inequality - years
5
128
inequality - income
5
130
inequality - some
5
133
inequality - those
5
134
inequality - among
5
14
inequality - about
5
136
inequality - you
5
141
inevitable - inequality
5
90
talk - inequality
5
181 rows × 2 columns
In [ ]:
Content source: mbod/intro_python_for_comm
Similar notebooks: