In [1]:
#第二章 获取文本语料和词汇资源
'''
在自然语言处理的实际项目中,通常要使用大量的语言数据或者语料库。 章的目的是 要回答下列问题:
1. 什么是有用的文 语料和词汇资源,我们如何使用 Python 获取它们?
2. 哪些 Python 结构最适合这项工作?
3. 编写 Python 代码时我们如何避免重复的工作?
'''
%matplotlib inline
import nltk
In [2]:
#### 2.1
'''
古腾堡语料库
NLTK 包 古腾堡项目(Project Gutenberg)电子文 档案的经过挑选的一小部分文。该项目大约有 25,000(现在是 36,000 了)
免费电子图书,放在 http://www.gutenberg. org/上。
'''
#列出文本
nltk.corpus.gutenberg.fileids()
Out[2]:
[u'austen-emma.txt',
u'austen-persuasion.txt',
u'austen-sense.txt',
u'bible-kjv.txt',
u'blake-poems.txt',
u'bryant-stories.txt',
u'burgess-busterbrown.txt',
u'carroll-alice.txt',
u'chesterton-ball.txt',
u'chesterton-brown.txt',
u'chesterton-thursday.txt',
u'edgeworth-parents.txt',
u'melville-moby_dick.txt',
u'milton-paradise.txt',
u'shakespeare-caesar.txt',
u'shakespeare-hamlet.txt',
u'shakespeare-macbeth.txt',
u'whitman-leaves.txt']
In [3]:
ws = nltk.corpus.gutenberg.words('austen-emma.txt')
len(ws)
Out[3]:
192427
In [ ]:
# 另一种一种方式
from nltk.corpus import gutenberg
gutenberg.fileids()
Out[ ]:
[u'austen-emma.txt',
u'austen-persuasion.txt',
u'austen-sense.txt',
u'bible-kjv.txt',
u'blake-poems.txt',
u'bryant-stories.txt',
u'burgess-busterbrown.txt',
u'carroll-alice.txt',
u'chesterton-ball.txt',
u'chesterton-brown.txt',
u'chesterton-thursday.txt',
u'edgeworth-parents.txt',
u'melville-moby_dick.txt',
u'milton-paradise.txt',
u'shakespeare-caesar.txt',
u'shakespeare-hamlet.txt',
u'shakespeare-macbeth.txt',
u'whitman-leaves.txt']
In [ ]:
#通过循环遍历前面列出的 gutenberg 文件标识符链表相应的 fileid,然后计算统计每个文
#平均词长、平均句子长度和 文中每个词出现的 平均次数
for fileid in gutenberg.fileids():
num_chars=len(gutenberg.raw(fileid))
num_words=len(gutenberg.words(fileid))
num_sents = len(gutenberg.sents(fileid))
num_vocab = len(set([w.lower for w in gutenberg.words(fileid)]))
print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid
4 24 1 austen-emma.txt
4 26 1 austen-persuasion.txt
4 28 1 austen-sense.txt
In [ ]:
#网络聊天文本
from nltk.corpus import webtext
for fileid in webtext.fileids():
print fileid,webtext.raw(fileid)[:65],'...'
In [1]:
#即时聊天会话语料库
from nltk.corpus import nps_chat
In [5]:
nps_chat.readme
Out[5]:
<bound method NPSChatCorpusReader.readme of <NPSChatCorpusReader in u'/Users/wizardholy/nltk_data/corpora/nps_chat'>>
In [6]:
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
In [10]:
chatroom[123]
Out[10]:
[u'i',
u'do',
u"n't",
u'want',
u'hot',
u'pics',
u'of',
u'a',
u'female',
u',',
u'I',
u'can',
u'look',
u'in',
u'a',
u'mirror',
u'.']
In [11]:
nps_chat.readme
Out[11]:
<bound method NPSChatCorpusReader.readme of <NPSChatCorpusReader in u'/Users/wizardholy/nltk_data/corpora/nps_chat'>>
In [12]:
#布朗语料库,首个百万级别的英语电子资料库
'''
ID 文件 文体 描述
A16 ca16 新闻 news Chicago Tribune: Society Reportage
B02 cb02 社论 editorial Christian Science Monitor: Editorials
C17 cc17 评论 reviews Time Magazine: Reviews
D12 cd12 宗教 religion Underwood: Probing the Ethics of Realtors
E36 ce36 爱好 hobbies Norling: Renting a Car in Europe
F25 cf25 传说 lore Boroff: Jewish Teenage Culture
G22 cg22 纯文学 belles_lettres Reiner: Coping with Runaway Technology
H15 ch15 政府 government US Office of Civil and Defence Mobilization: The Fam ily Fallout Shelter
J17 cj19 博览 learned Mosteller: Probability with Statistical Applications
K04 ck04 小说 fiction W.E.B. Du Bois: Worlds of Color
L13 cl13 推理小说 mystery Hitchens: Footsteps in the Night
M01 cm01 科幻 science_fiction Heinlein: Stranger in a Strange Land
N14 cn15 探险 adventure Field: Rattlesnake Ridge
P12 cp12 言情 romance Callaghan: A Passion in Rome
R06 cr06 幽默 humor Thurber: The Future, If Any, of Comedy
'''
from nltk.corpus import brown
In [15]:
brown.categories()
Out[15]:
[u'adventure',
u'belles_lettres',
u'editorial',
u'fiction',
u'government',
u'hobbies',
u'humor',
u'learned',
u'lore',
u'mystery',
u'news',
u'religion',
u'reviews',
u'romance',
u'science_fiction']
In [17]:
brown.words(categories='adventure')
Out[17]:
[u'Dan', u'Morgan', u'told', u'himself', u'he', ...]
In [19]:
brown.words(fileids=['ca04'])
Out[19]:
[u'Oslo', u'The', u'most', u'positive', u'element', ...]
In [20]:
brown.sents(fileids=['ca04'])
Out[20]:
[[u'Oslo'], [u'The', u'most', u'positive', u'element', u'to', u'emerge', u'from', u'the', u'Oslo', u'meeting', u'of', u'North', u'Atlantic', u'Treaty', u'Organization', u'Foreign', u'Ministers', u'has', u'been', u'the', u'freer', u',', u'franker', u',', u'and', u'wider', u'discussions', u',', u'animated', u'by', u'much', u'better', u'mutual', u'understanding', u'than', u'in', u'past', u'meetings', u'.'], ...]
In [21]:
#布朗语料库是一个研究文体指尖的系统性差异的很方便的资源
#可以先产生特定问题的计数
news_text = brown.words(categories='news')
In [23]:
fdist = nltk.FreqDist([w.lower() for w in news_text])
In [24]:
modals = ['can','could','may','might','must','will']
In [26]:
for m in modals:
print m+":",fdist[m],
can: 94 could: 87 may: 93 might: 38 must: 53 will: 389
In [27]:
#下面对每一个感兴趣的问题进行统计,使用nltk的带条件的频率分布函数进行处理
cfd = nltk.ConditionalFreqDist(
(genre, word) for genre in brown.categories() for word in brown.words(categories=genre)
)
In [28]:
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
In [29]:
modals = ['can','could','may','might','must','will']
In [30]:
cfd.tabulate(conditions=genres, samples=modals)
can could may might must will
news 93 86 66 38 50 389
religion 82 59 78 12 54 71
hobbies 268 58 131 22 83 264
science_fiction 16 49 4 12 8 16
romance 74 193 11 51 45 43
humor 16 30 8 8 9 13
In [32]:
print brown.categories()
[u'adventure', u'belles_lettres', u'editorial', u'fiction', u'government', u'hobbies', u'humor', u'learned', u'lore', u'mystery', u'news', u'religion', u'reviews', u'romance', u'science_fiction']
In [33]:
#路透社语料库路
#透社语料库包 10,788 个新闻文档,共计 130 万字。这些文档分成 90 个主题,按照 “训练”和“测试”分为两组。
from nltk.corpus import reuters
In [34]:
reuters.fileids()
Out[34]:
['test/14826',
'test/14828',
'test/14829',
'test/14832',
'test/14833',
'test/14839',
'test/14840',
'test/14841',
'test/14842',
'test/14843',
'test/14844',
'test/14849',
'test/14852',
'test/14854',
'test/14858',
'test/14859',
'test/14860',
'test/14861',
'test/14862',
'test/14863',
'test/14865',
'test/14867',
'test/14872',
'test/14873',
'test/14875',
'test/14876',
'test/14877',
'test/14881',
'test/14882',
'test/14885',
'test/14886',
'test/14888',
'test/14890',
'test/14891',
'test/14892',
'test/14899',
'test/14900',
'test/14903',
'test/14904',
'test/14907',
'test/14909',
'test/14911',
'test/14912',
'test/14913',
'test/14918',
'test/14919',
'test/14921',
'test/14922',
'test/14923',
'test/14926',
'test/14928',
'test/14930',
'test/14931',
'test/14932',
'test/14933',
'test/14934',
'test/14941',
'test/14943',
'test/14949',
'test/14951',
'test/14954',
'test/14957',
'test/14958',
'test/14959',
'test/14960',
'test/14962',
'test/14963',
'test/14964',
'test/14965',
'test/14967',
'test/14968',
'test/14969',
'test/14970',
'test/14971',
'test/14974',
'test/14975',
'test/14978',
'test/14981',
'test/14982',
'test/14983',
'test/14984',
'test/14985',
'test/14986',
'test/14987',
'test/14988',
'test/14993',
'test/14995',
'test/14998',
'test/15000',
'test/15001',
'test/15002',
'test/15004',
'test/15005',
'test/15006',
'test/15011',
'test/15012',
'test/15013',
'test/15016',
'test/15017',
'test/15020',
'test/15023',
'test/15024',
'test/15026',
'test/15027',
'test/15028',
'test/15029',
'test/15031',
'test/15032',
'test/15033',
'test/15037',
'test/15038',
'test/15043',
'test/15045',
'test/15046',
'test/15048',
'test/15049',
'test/15052',
'test/15053',
'test/15055',
'test/15056',
'test/15060',
'test/15061',
'test/15062',
'test/15063',
'test/15065',
'test/15067',
'test/15069',
'test/15070',
'test/15074',
'test/15077',
'test/15078',
'test/15079',
'test/15082',
'test/15090',
'test/15091',
'test/15092',
'test/15093',
'test/15094',
'test/15095',
'test/15096',
'test/15097',
'test/15103',
'test/15104',
'test/15106',
'test/15107',
'test/15109',
'test/15110',
'test/15111',
'test/15112',
'test/15118',
'test/15119',
'test/15120',
'test/15121',
'test/15122',
'test/15124',
'test/15126',
'test/15128',
'test/15129',
'test/15130',
'test/15132',
'test/15136',
'test/15138',
'test/15141',
'test/15144',
'test/15145',
'test/15146',
'test/15149',
'test/15152',
'test/15153',
'test/15154',
'test/15156',
'test/15157',
'test/15161',
'test/15162',
'test/15171',
'test/15172',
'test/15175',
'test/15179',
'test/15180',
'test/15185',
'test/15188',
'test/15189',
'test/15190',
'test/15193',
'test/15194',
'test/15197',
'test/15198',
'test/15200',
'test/15204',
'test/15205',
'test/15206',
'test/15207',
'test/15208',
'test/15210',
'test/15211',
'test/15212',
'test/15213',
'test/15217',
'test/15219',
'test/15220',
'test/15221',
'test/15222',
'test/15223',
'test/15226',
'test/15227',
'test/15230',
'test/15233',
'test/15234',
'test/15237',
'test/15238',
'test/15239',
'test/15240',
'test/15242',
'test/15243',
'test/15244',
'test/15246',
'test/15247',
'test/15250',
'test/15253',
'test/15254',
'test/15255',
'test/15258',
'test/15259',
'test/15262',
'test/15263',
'test/15264',
'test/15265',
'test/15270',
'test/15271',
'test/15273',
'test/15274',
'test/15276',
'test/15278',
'test/15280',
'test/15281',
'test/15283',
'test/15287',
'test/15290',
'test/15292',
'test/15294',
'test/15295',
'test/15296',
'test/15299',
'test/15300',
'test/15302',
'test/15303',
'test/15306',
'test/15307',
'test/15308',
'test/15309',
'test/15310',
'test/15311',
'test/15312',
'test/15313',
'test/15314',
'test/15315',
'test/15321',
'test/15322',
'test/15324',
'test/15325',
'test/15326',
'test/15327',
'test/15329',
'test/15335',
'test/15336',
'test/15337',
'test/15339',
'test/15341',
'test/15344',
'test/15345',
'test/15348',
'test/15349',
'test/15351',
'test/15352',
'test/15354',
'test/15356',
'test/15357',
'test/15359',
'test/15363',
'test/15364',
'test/15365',
'test/15366',
'test/15367',
'test/15368',
'test/15372',
'test/15375',
'test/15378',
'test/15379',
'test/15380',
'test/15383',
'test/15384',
'test/15386',
'test/15387',
'test/15388',
'test/15389',
'test/15391',
'test/15394',
'test/15396',
'test/15397',
'test/15400',
'test/15404',
'test/15406',
'test/15409',
'test/15410',
'test/15411',
'test/15413',
'test/15415',
'test/15416',
'test/15417',
'test/15420',
'test/15421',
'test/15424',
'test/15425',
'test/15427',
'test/15428',
'test/15429',
'test/15430',
'test/15431',
'test/15432',
'test/15436',
'test/15438',
'test/15441',
'test/15442',
'test/15444',
'test/15446',
'test/15447',
'test/15448',
'test/15449',
'test/15450',
'test/15451',
'test/15452',
'test/15453',
'test/15454',
'test/15455',
'test/15457',
'test/15459',
'test/15460',
'test/15462',
'test/15464',
'test/15467',
'test/15468',
'test/15471',
'test/15472',
'test/15476',
'test/15477',
'test/15478',
'test/15479',
'test/15481',
'test/15482',
'test/15483',
'test/15484',
'test/15485',
'test/15487',
'test/15489',
'test/15494',
'test/15495',
'test/15496',
'test/15500',
'test/15501',
'test/15503',
'test/15504',
'test/15510',
'test/15511',
'test/15515',
'test/15520',
'test/15521',
'test/15522',
'test/15523',
'test/15527',
'test/15528',
'test/15531',
'test/15532',
'test/15535',
'test/15536',
'test/15539',
'test/15540',
'test/15542',
'test/15543',
'test/15544',
'test/15545',
'test/15547',
'test/15548',
'test/15549',
'test/15550',
'test/15551',
'test/15552',
'test/15553',
'test/15556',
'test/15558',
'test/15559',
'test/15560',
'test/15561',
'test/15562',
'test/15563',
'test/15565',
'test/15566',
'test/15567',
'test/15568',
'test/15569',
'test/15570',
'test/15571',
'test/15572',
'test/15573',
'test/15574',
'test/15575',
'test/15578',
'test/15579',
'test/15580',
'test/15581',
'test/15582',
'test/15583',
'test/15584',
'test/15585',
'test/15590',
'test/15591',
'test/15593',
'test/15594',
'test/15595',
'test/15596',
'test/15597',
'test/15598',
'test/15600',
'test/15601',
'test/15602',
'test/15603',
'test/15605',
'test/15607',
'test/15610',
'test/15613',
'test/15615',
'test/15616',
'test/15617',
'test/15618',
'test/15620',
'test/15621',
'test/15623',
'test/15624',
'test/15625',
'test/15626',
'test/15629',
'test/15632',
'test/15634',
'test/15636',
'test/15637',
'test/15639',
'test/15640',
'test/15641',
'test/15642',
'test/15643',
'test/15646',
'test/15648',
'test/15649',
'test/15651',
'test/15653',
'test/15655',
'test/15656',
'test/15664',
'test/15666',
'test/15667',
'test/15668',
'test/15669',
'test/15672',
'test/15674',
'test/15675',
'test/15676',
'test/15677',
'test/15679',
'test/15680',
'test/15682',
'test/15686',
'test/15688',
'test/15689',
'test/15691',
'test/15692',
'test/15694',
'test/15695',
'test/15696',
'test/15698',
'test/15702',
'test/15703',
'test/15704',
'test/15707',
'test/15708',
'test/15709',
'test/15710',
'test/15713',
'test/15715',
'test/15717',
'test/15719',
'test/15720',
'test/15721',
'test/15723',
'test/15725',
'test/15726',
'test/15727',
'test/15728',
'test/15729',
'test/15732',
'test/15733',
'test/15736',
'test/15737',
'test/15739',
'test/15742',
'test/15749',
'test/15751',
'test/15753',
'test/15757',
'test/15759',
'test/15762',
'test/15767',
'test/15768',
'test/15769',
'test/15772',
'test/15777',
'test/15778',
'test/15780',
'test/15782',
'test/15785',
'test/15790',
'test/15793',
'test/15797',
'test/15798',
'test/15800',
'test/15801',
'test/15803',
'test/15804',
'test/15805',
'test/15807',
'test/15808',
'test/15810',
'test/15811',
'test/15816',
'test/15817',
'test/15819',
'test/15821',
'test/15822',
'test/15823',
'test/15829',
'test/15831',
'test/15832',
'test/15833',
'test/15834',
'test/15836',
'test/15838',
'test/15840',
'test/15841',
'test/15842',
'test/15844',
'test/15845',
'test/15846',
'test/15847',
'test/15851',
'test/15852',
'test/15853',
'test/15854',
'test/15855',
'test/15856',
'test/15858',
'test/15859',
'test/15860',
'test/15861',
'test/15863',
'test/15864',
'test/15865',
'test/15866',
'test/15867',
'test/15868',
'test/15869',
'test/15870',
'test/15871',
'test/15872',
'test/15874',
'test/15875',
'test/15876',
'test/15877',
'test/15878',
'test/15879',
'test/15881',
'test/15885',
'test/15886',
'test/15888',
'test/15889',
'test/15890',
'test/15892',
'test/15893',
'test/15894',
'test/15895',
'test/15896',
'test/15897',
'test/15898',
'test/15899',
'test/15900',
'test/15901',
'test/15902',
'test/15903',
'test/15904',
'test/15906',
'test/15908',
'test/15909',
'test/15910',
'test/15911',
'test/15912',
'test/15913',
'test/15914',
'test/15916',
'test/15917',
'test/15918',
'test/15920',
'test/15921',
'test/15922',
'test/15923',
'test/15924',
'test/15925',
'test/15927',
'test/15928',
'test/15929',
'test/15930',
'test/15932',
'test/15933',
'test/15934',
'test/15937',
'test/15939',
'test/15942',
'test/15944',
'test/15949',
'test/15950',
'test/15951',
'test/15952',
'test/15953',
'test/15956',
'test/15959',
'test/15960',
'test/15961',
'test/15963',
'test/15964',
'test/15967',
'test/15968',
'test/15969',
'test/15970',
'test/15973',
'test/15975',
'test/15976',
'test/15977',
'test/15978',
'test/15979',
'test/15980',
'test/15981',
'test/15984',
'test/15985',
'test/15987',
'test/15988',
'test/15989',
'test/15993',
'test/15995',
'test/15996',
'test/15997',
'test/15999',
'test/16002',
'test/16003',
'test/16004',
'test/16005',
'test/16006',
'test/16007',
'test/16009',
'test/16012',
'test/16013',
'test/16014',
'test/16015',
'test/16016',
'test/16021',
'test/16022',
'test/16023',
'test/16026',
'test/16029',
'test/16030',
'test/16033',
'test/16037',
'test/16040',
'test/16041',
'test/16045',
'test/16052',
'test/16053',
'test/16055',
'test/16063',
'test/16066',
'test/16067',
'test/16068',
'test/16069',
'test/16071',
'test/16072',
'test/16074',
'test/16075',
'test/16076',
'test/16077',
'test/16079',
'test/16080',
'test/16083',
'test/16086',
'test/16088',
'test/16091',
'test/16093',
'test/16094',
'test/16095',
'test/16096',
'test/16097',
'test/16098',
'test/16099',
'test/16100',
'test/16103',
'test/16106',
'test/16107',
'test/16108',
'test/16110',
'test/16111',
'test/16112',
'test/16115',
'test/16117',
'test/16118',
'test/16119',
'test/16120',
'test/16122',
'test/16123',
'test/16125',
'test/16126',
'test/16130',
'test/16133',
'test/16134',
'test/16136',
'test/16139',
'test/16140',
'test/16141',
'test/16142',
'test/16143',
'test/16144',
'test/16145',
'test/16146',
'test/16147',
'test/16148',
'test/16149',
'test/16150',
'test/16152',
'test/16155',
'test/16158',
'test/16159',
'test/16161',
'test/16162',
'test/16163',
'test/16164',
'test/16166',
'test/16170',
'test/16171',
'test/16172',
'test/16173',
'test/16175',
'test/16176',
'test/16177',
'test/16179',
'test/16180',
'test/16185',
'test/16188',
'test/16189',
'test/16190',
'test/16193',
'test/16194',
'test/16195',
'test/16196',
'test/16197',
'test/16200',
'test/16201',
'test/16202',
'test/16203',
'test/16206',
'test/16207',
'test/16210',
'test/16211',
'test/16212',
'test/16213',
'test/16214',
'test/16215',
'test/16216',
'test/16219',
'test/16221',
'test/16223',
'test/16225',
'test/16226',
'test/16228',
'test/16230',
'test/16232',
'test/16233',
'test/16234',
'test/16236',
'test/16238',
'test/16241',
'test/16243',
'test/16244',
'test/16246',
'test/16247',
'test/16248',
'test/16250',
'test/16251',
'test/16252',
'test/16255',
'test/16256',
'test/16257',
'test/16258',
'test/16260',
'test/16262',
'test/16263',
'test/16264',
'test/16265',
'test/16266',
'test/16268',
'test/16269',
'test/16270',
'test/16271',
'test/16274',
'test/16275',
'test/16277',
'test/16278',
'test/16279',
'test/16281',
'test/16282',
'test/16283',
'test/16284',
'test/16285',
'test/16286',
'test/16287',
'test/16288',
'test/16289',
'test/16291',
'test/16294',
'test/16297',
'test/16298',
'test/16299',
'test/16300',
'test/16301',
'test/16302',
'test/16303',
'test/16304',
'test/16307',
'test/16310',
'test/16311',
'test/16312',
'test/16314',
'test/16315',
'test/16316',
'test/16317',
'test/16318',
'test/16319',
'test/16320',
'test/16324',
'test/16327',
'test/16331',
'test/16332',
'test/16336',
'test/16337',
'test/16339',
'test/16342',
'test/16343',
'test/16346',
'test/16347',
'test/16348',
'test/16350',
'test/16354',
'test/16357',
'test/16359',
'test/16360',
'test/16362',
'test/16363',
'test/16365',
'test/16366',
'test/16367',
'test/16369',
'test/16370',
'test/16371',
'test/16372',
'test/16374',
'test/16376',
'test/16377',
'test/16379',
'test/16380',
'test/16383',
'test/16385',
'test/16386',
'test/16388',
'test/16390',
'test/16392',
'test/16393',
'test/16394',
'test/16395',
'test/16396',
'test/16398',
'test/16399',
'test/16400',
'test/16401',
'test/16402',
'test/16403',
'test/16404',
'test/16405',
'test/16406',
'test/16407',
'test/16409',
'test/16410',
'test/16415',
'test/16417',
'test/16418',
'test/16419',
'test/16420',
'test/16421',
'test/16422',
'test/16424',
'test/16426',
'test/16427',
'test/16428',
'test/16429',
'test/16430',
'test/16432',
'test/16433',
'test/16434',
'test/16437',
'test/16438',
'test/16440',
'test/16441',
'test/16442',
'test/16443',
'test/16444',
'test/16448',
'test/16449',
'test/16450',
'test/16454',
'test/16457',
'test/16458',
'test/16459',
'test/16460',
'test/16461',
'test/16463',
'test/16465',
'test/16468',
'test/16469',
'test/16470',
'test/16471',
'test/16472',
'test/16473',
'test/16475',
'test/16476',
'test/16478',
'test/16479',
'test/16480',
'test/16481',
'test/16483',
'test/16486',
'test/16487',
'test/16488',
'test/16490',
'test/16492',
'test/16493',
'test/16495',
'test/16496',
'test/16499',
'test/16502',
'test/16505',
'test/16510',
'test/16512',
'test/16513',
'test/16518',
'test/16519',
'test/16521',
'test/16522',
'test/16523',
'test/16525',
'test/16527',
'test/16530',
'test/16531',
'test/16533',
'test/16538',
'test/16539',
'test/16545',
'test/16546',
'test/16549',
'test/16551',
'test/16554',
'test/16555',
'test/16561',
'test/16563',
'test/16564',
'test/16565',
'test/16568',
'test/16569',
'test/16570',
'test/16574',
'test/16577',
'test/16581',
'test/16583',
'test/16584',
'test/16585',
'test/16587',
'test/16588',
'test/16589',
'test/16590',
'test/16591',
...]
In [35]:
reuters.categories()
Out[35]:
[u'acq',
u'alum',
u'barley',
u'bop',
u'carcass',
u'castor-oil',
u'cocoa',
u'coconut',
u'coconut-oil',
u'coffee',
u'copper',
u'copra-cake',
u'corn',
u'cotton',
u'cotton-oil',
u'cpi',
u'cpu',
u'crude',
u'dfl',
u'dlr',
u'dmk',
u'earn',
u'fuel',
u'gas',
u'gnp',
u'gold',
u'grain',
u'groundnut',
u'groundnut-oil',
u'heat',
u'hog',
u'housing',
u'income',
u'instal-debt',
u'interest',
u'ipi',
u'iron-steel',
u'jet',
u'jobs',
u'l-cattle',
u'lead',
u'lei',
u'lin-oil',
u'livestock',
u'lumber',
u'meal-feed',
u'money-fx',
u'money-supply',
u'naphtha',
u'nat-gas',
u'nickel',
u'nkr',
u'nzdlr',
u'oat',
u'oilseed',
u'orange',
u'palladium',
u'palm-oil',
u'palmkernel',
u'pet-chem',
u'platinum',
u'potato',
u'propane',
u'rand',
u'rape-oil',
u'rapeseed',
u'reserves',
u'retail',
u'rice',
u'rubber',
u'rye',
u'ship',
u'silver',
u'sorghum',
u'soy-meal',
u'soy-oil',
u'soybean',
u'strategic-metal',
u'sugar',
u'sun-meal',
u'sun-oil',
u'sunseed',
u'tea',
u'tin',
u'trade',
u'veg-oil',
u'wheat',
u'wpi',
u'yen',
u'zinc']
In [42]:
#与布朗语料库不同,路透社语料库的类别是有互相重叠的,只是因为新闻报道往往涉及
#多个主题。我们可以查找由一个或多个文档涵盖的主题,也可以查找包 在一个或多个类别中的文档。
reuters.categories('test/14833')
Out[42]:
[u'palm-oil', u'veg-oil']
In [43]:
reuters.categories(['test/14833','test/15259'])
Out[43]:
[u'earn', u'palm-oil', u'veg-oil']
In [44]:
reuters.fileids('tea')
Out[44]:
[u'test/16225',
u'test/17494',
u'test/19672',
u'test/19982',
u'training/10268',
u'training/10406',
u'training/12754',
u'training/12907',
u'training/235',
u'training/275',
u'training/7545',
u'training/9153',
u'training/9327']
In [45]:
reuters.fileids(['tea','tin'])
Out[45]:
[u'test/14832',
u'test/14844',
u'test/14877',
u'test/15112',
u'test/15219',
u'test/15624',
u'test/15817',
u'test/16225',
u'test/17494',
u'test/17731',
u'test/18924',
u'test/19065',
u'test/19367',
u'test/19672',
u'test/19982',
u'test/20458',
u'training/10268',
u'training/10332',
u'training/10406',
u'training/11224',
u'training/11801',
u'training/12754',
u'training/12907',
u'training/13185',
u'training/1929',
u'training/235',
u'training/275',
u'training/311',
u'training/4122',
u'training/688',
u'training/6934',
u'training/7533',
u'training/7545',
u'training/7592',
u'training/7877',
u'training/8055',
u'training/8415',
u'training/8416',
u'training/8427',
u'training/8933',
u'training/908',
u'training/9153',
u'training/9327']
In [46]:
reuters.words('training/9865')[:14]
Out[46]:
[u'FRENCH',
u'FREE',
u'MARKET',
u'CEREAL',
u'EXPORT',
u'BIDS',
u'DETAILED',
u'French',
u'operators',
u'have',
u'requested',
u'licences',
u'to',
u'export']
In [47]:
reuters.words(['training/9865', 'training/9880'])
Out[47]:
[u'FRENCH', u'FREE', u'MARKET', u'CEREAL', u'EXPORT', ...]
In [48]:
reuters.words(categories='barley')
Out[48]:
[u'FRENCH', u'FREE', u'MARKET', u'CEREAL', u'EXPORT', ...]
In [49]:
reuters.words(categories=['barley', 'corn'])
Out[49]:
[u'THAI', u'TRADE', u'DEFICIT', u'WIDENS', u'IN', ...]
In [51]:
#就职演说语料库
from nltk.corpus import inaugural
In [52]:
inaugural.fileids()
Out[52]:
[u'1789-Washington.txt',
u'1793-Washington.txt',
u'1797-Adams.txt',
u'1801-Jefferson.txt',
u'1805-Jefferson.txt',
u'1809-Madison.txt',
u'1813-Madison.txt',
u'1817-Monroe.txt',
u'1821-Monroe.txt',
u'1825-Adams.txt',
u'1829-Jackson.txt',
u'1833-Jackson.txt',
u'1837-VanBuren.txt',
u'1841-Harrison.txt',
u'1845-Polk.txt',
u'1849-Taylor.txt',
u'1853-Pierce.txt',
u'1857-Buchanan.txt',
u'1861-Lincoln.txt',
u'1865-Lincoln.txt',
u'1869-Grant.txt',
u'1873-Grant.txt',
u'1877-Hayes.txt',
u'1881-Garfield.txt',
u'1885-Cleveland.txt',
u'1889-Harrison.txt',
u'1893-Cleveland.txt',
u'1897-McKinley.txt',
u'1901-McKinley.txt',
u'1905-Roosevelt.txt',
u'1909-Taft.txt',
u'1913-Wilson.txt',
u'1917-Wilson.txt',
u'1921-Harding.txt',
u'1925-Coolidge.txt',
u'1929-Hoover.txt',
u'1933-Roosevelt.txt',
u'1937-Roosevelt.txt',
u'1941-Roosevelt.txt',
u'1945-Roosevelt.txt',
u'1949-Truman.txt',
u'1953-Eisenhower.txt',
u'1957-Eisenhower.txt',
u'1961-Kennedy.txt',
u'1965-Johnson.txt',
u'1969-Nixon.txt',
u'1973-Nixon.txt',
u'1977-Carter.txt',
u'1981-Reagan.txt',
u'1985-Reagan.txt',
u'1989-Bush.txt',
u'1993-Clinton.txt',
u'1997-Clinton.txt',
u'2001-Bush.txt',
u'2005-Bush.txt',
u'2009-Obama.txt']
In [53]:
[fileid[:4] for fileid in inaugural.fileids()]
Out[53]:
[u'1789',
u'1793',
u'1797',
u'1801',
u'1805',
u'1809',
u'1813',
u'1817',
u'1821',
u'1825',
u'1829',
u'1833',
u'1837',
u'1841',
u'1845',
u'1849',
u'1853',
u'1857',
u'1861',
u'1865',
u'1869',
u'1873',
u'1877',
u'1881',
u'1885',
u'1889',
u'1893',
u'1897',
u'1901',
u'1905',
u'1909',
u'1913',
u'1917',
u'1921',
u'1925',
u'1929',
u'1933',
u'1937',
u'1941',
u'1945',
u'1949',
u'1953',
u'1957',
u'1961',
u'1965',
u'1969',
u'1973',
u'1977',
u'1981',
u'1985',
u'1989',
u'1993',
u'1997',
u'2001',
u'2005',
u'2009']
In [55]:
#让我们来看看词汇 america 和 citizen 随时间推移的使用情况。下面的代码使用 w.lowe
#r()将就职演说语料库中的词汇转换成小写。
#然后用 startswith()检查它们是否以“目 标”词汇 america 或 citizen 开始
cfd = nltk.ConditionalFreqDist(
(target, fileid[:4])
for fileid in inaugural.fileids()
for w in inaugural.words(fileid)
for target in ['america', 'citizen']
if w.lower().startswith(target)
)
In [58]:
cfd.plot()
In [2]:
#其他语言的语料库
nltk.corpus.cess_esp.words()
Out[2]:
[u'El', u'grupo', u'estatal', ...]
In [3]:
nltk.corpus.floresta.words()
Out[3]:
[u'Um', u'revivalismo', u'refrescante', u'O', ...]
In [4]:
nltk.corpus.indian.words('hindi.pos')
Out[4]:
[u'\u092a\u0942\u0930\u094d\u0923', u'\u092a\u094d\u0930\u0924\u093f\u092c\u0902\u0927', ...]
In [5]:
nltk.corpus.udhr.fileids()
Out[5]:
[u'Abkhaz-Cyrillic+Abkh',
u'Abkhaz-UTF8',
u'Achehnese-Latin1',
u'Achuar-Shiwiar-Latin1',
u'Adja-UTF8',
u'Afaan_Oromo_Oromiffa-Latin1',
u'Afrikaans-Latin1',
u'Aguaruna-Latin1',
u'Akuapem_Twi-UTF8',
u'Albanian_Shqip-Latin1',
u'Amahuaca',
u'Amahuaca-Latin1',
u'Amarakaeri-Latin1',
u'Amuesha-Yanesha-UTF8',
u'Arabela-Latin1',
u'Arabic_Alarabia-Arabic',
u'Asante-UTF8',
u'Ashaninca-Latin1',
u'Asheninca-Latin1',
u'Asturian_Bable-Latin1',
u'Aymara-Latin1',
u'Balinese-Latin1',
u'Bambara-UTF8',
u'Baoule-UTF8',
u'Basque_Euskara-Latin1',
u'Batonu_Bariba-UTF8',
u'Belorus_Belaruski-Cyrillic',
u'Belorus_Belaruski-UTF8',
u'Bemba-Latin1',
u'Bengali-UTF8',
u'Beti-UTF8',
u'Bichelamar-Latin1',
u'Bikol_Bicolano-Latin1',
u'Bora-Latin1',
u'Bosnian_Bosanski-Cyrillic',
u'Bosnian_Bosanski-Latin2',
u'Bosnian_Bosanski-UTF8',
u'Breton-Latin1',
u'Bugisnese-Latin1',
u'Bulgarian_Balgarski-Cyrillic',
u'Bulgarian_Balgarski-UTF8',
u'Cakchiquel-Latin1',
u'Campa_Pajonalino-Latin1',
u'Candoshi-Shapra-Latin1',
u'Caquinte-Latin1',
u'Cashibo-Cacataibo-Latin1',
u'Cashinahua-Latin1',
u'Catalan-Latin1',
u'Catalan_Catala-Latin1',
u'Cebuano-Latin1',
u'Chamorro-Latin1',
u'Chayahuita-Latin1',
u'Chechewa_Nyanja-Latin1',
u'Chickasaw-Latin1',
u'Chinanteco-Ajitlan-Latin1',
u'Chinanteco-UTF8',
u'Chinese_Mandarin-GB2312',
u'Chuuk_Trukese-Latin1',
u'Cokwe-Latin1',
u'Corsican-Latin1',
u'Croatian_Hrvatski-Latin2',
u'Czech-Latin2',
u'Czech-UTF8',
u'Czech_Cesky-Latin2',
u'Czech_Cesky-UTF8',
u'Dagaare-UTF8',
u'Dagbani-UTF8',
u'Dangme-UTF8',
u'Danish_Dansk-Latin1',
u'Dendi-UTF8',
u'Ditammari-UTF8',
u'Dutch_Nederlands-Latin1',
u'Edo-Latin1',
u'English-Latin1',
u'Esperanto-UTF8',
u'Estonian_Eesti-Latin1',
u'Ewe_Eve-UTF8',
u'Fante-UTF8',
u'Faroese-Latin1',
u'Farsi_Persian-UTF8',
u'Farsi_Persian-v2-UTF8',
u'Fijian-Latin1',
u'Filipino_Tagalog-Latin1',
u'Finnish_Suomi-Latin1',
u'Fon-UTF8',
u'French_Francais-Latin1',
u'Frisian-Latin1',
u'Friulian_Friulano-Latin1',
u'Ga-UTF8',
u'Gagauz_Gagauzi-UTF8',
u'Galician_Galego-Latin1',
u'Garifuna_Garifuna-Latin1',
u'German_Deutsch-Latin1',
u'Gonja-UTF8',
u'Greek_Ellinika-Greek',
u'Greek_Ellinika-UTF8',
u'Greenlandic_Inuktikut-Latin1',
u'Guarani-Latin1',
u'Guen_Mina-UTF8',
u'HaitianCreole_Kreyol-Latin1',
u'HaitianCreole_Popular-Latin1',
u'Hani-Latin1',
u'Hausa_Haoussa-Latin1',
u'Hawaiian-UTF8',
u'Hebrew_Ivrit-Hebrew',
u'Hebrew_Ivrit-UTF8',
u'Hiligaynon-Latin1',
u'Hindi-UTF8',
u'Hindi_web-UTF8',
u'Hmong_Miao-Sichuan-Guizhou-Yunnan-Latin1',
u'Hmong_Miao-SouthernEast-Guizhou-Latin1',
u'Hmong_Miao_Northern-East-Guizhou-Latin1',
u'Hrvatski_Croatian-Latin2',
u'Huasteco-Latin1',
u'Huitoto_Murui-Latin1',
u'Hungarian_Magyar-Latin1',
u'Hungarian_Magyar-Latin2',
u'Hungarian_Magyar-UTF8',
u'Ibibio_Efik-Latin1',
u'Icelandic_Yslenska-Latin1',
u'Ido-Latin1',
u'Igbo-UTF8',
u'Iloko_Ilocano-Latin1',
u'Indonesian-Latin1',
u'Interlingua-Latin1',
u'Inuktikut_Greenlandic-Latin1',
u'IrishGaelic_Gaeilge-Latin1',
u'Italian-Latin1',
u'Italian_Italiano-Latin1',
u'Japanese_Nihongo-EUC',
u'Japanese_Nihongo-SJIS',
u'Japanese_Nihongo-UTF8',
u'Javanese-Latin1',
u'Jola-Fogny_Diola-UTF8',
u'Kabye-UTF8',
u'Kannada-UTF8',
u'Kaonde-Latin1',
u'Kapampangan-Latin1',
u'Kasem-UTF8',
u'Kazakh-Cyrillic',
u'Kazakh-UTF8',
u'Kiche_Quiche-Latin1',
u'Kicongo-Latin1',
u'Kimbundu_Mbundu-Latin1',
u'Kinyamwezi_Nyamwezi-Latin1',
u'Kinyarwanda-Latin1',
u'Kituba-Latin1',
u'Korean_Hankuko-UTF8',
u'Kpelewo-UTF8',
u'Krio-UTF8',
u'Kurdish-UTF8',
u'Lamnso_Lam-nso-UTF8',
u'Latin_Latina-Latin1',
u'Latin_Latina-v2-Latin1',
u'Latvian-Latin1',
u'Limba-UTF8',
u'Lingala-Latin1',
u'Lithuanian_Lietuviskai-Baltic',
u'Lozi-Latin1',
u'Luba-Kasai_Tshiluba-Latin1',
u'Luganda_Ganda-Latin1',
u'Lunda_Chokwe-lunda-Latin1',
u'Luvale-Latin1',
u'Luxembourgish_Letzebuergeusch-Latin1',
u'Macedonian-UTF8',
u'Madurese-Latin1',
u'Makonde-Latin1',
u'Malagasy-Latin1',
u'Malay_BahasaMelayu-Latin1',
u'Maltese-UTF8',
u'Mam-Latin1',
u'Maninka-UTF8',
u'Maori-Latin1',
u'Mapudungun_Mapuzgun-Latin1',
u'Mapudungun_Mapuzgun-UTF8',
u'Marshallese-Latin1',
u'Matses-Latin1',
u'Mayan_Yucateco-Latin1',
u'Mazahua_Jnatrjo-UTF8',
u'Mazateco-Latin1',
u'Mende-UTF8',
u'Mikmaq_Micmac-Mikmaq-Latin1',
u'Minangkabau-Latin1',
u'Miskito_Miskito-Latin1',
u'Mixteco-Latin1',
u'Mongolian_Khalkha-Cyrillic',
u'Mongolian_Khalkha-UTF8',
u'Moore_More-UTF8',
u'Nahuatl-Latin1',
u'Ndebele-Latin1',
u'Nepali-UTF8',
u'Ngangela_Nyemba-Latin1',
u'NigerianPidginEnglish-Latin1',
u'Nomatsiguenga-Latin1',
u'NorthernSotho_Pedi-Sepedi-Latin1',
u'Norwegian-Latin1',
u'Norwegian_Norsk-Bokmal-Latin1',
u'Norwegian_Norsk-Nynorsk-Latin1',
u'Nyanja_Chechewa-Latin1',
u'Nyanja_Chinyanja-Latin1',
u'Nzema-UTF8',
u'OccitanAuvergnat-Latin1',
u'OccitanLanguedocien-Latin1',
u'Oromiffa_AfaanOromo-Latin1',
u'Osetin_Ossetian-UTF8',
u'Oshiwambo_Ndonga-Latin1',
u'Otomi_Nahnu-Latin1',
u'Paez-Latin1',
u'Palauan-Latin1',
u'Peuhl-UTF8',
u'Picard-Latin1',
u'Pipil-Latin1',
u'Polish-Latin2',
u'Polish_Polski-Latin2',
u'Ponapean-Latin1',
u'Portuguese_Portugues-Latin1',
u'Pulaar-UTF8',
u'Punjabi_Panjabi-UTF8',
u'Purhepecha-UTF8',
u'Qechi_Kekchi-Latin1',
u'Quechua-Latin1',
u'Quichua-Latin1',
u'Rarotongan_MaoriCookIslands-Latin1',
u'Rhaeto-Romance_Rumantsch-Latin1',
u'Romani-Latin1',
u'Romani-UTF8',
u'Romanian-Latin2',
u'Romanian_Romana-Latin2',
u'Rukonzo_Konjo-Latin1',
u'Rundi_Kirundi-Latin1',
u'Runyankore-rukiga_Nkore-kiga-Latin1',
u'Russian-Cyrillic',
u'Russian-UTF8',
u'Russian_Russky-Cyrillic',
u'Russian_Russky-UTF8',
u'Sami_Lappish-UTF8',
u'Sammarinese-Latin1',
u'Samoan-Latin1',
u'Sango_Sangho-Latin1',
u'Sanskrit-UTF8',
u'Saraiki-UTF8',
u'Sardinian-Latin1',
u'ScottishGaelic_GaidhligAlbanach-Latin1',
u'Seereer-UTF8',
u'Serbian_Srpski-Cyrillic',
u'Serbian_Srpski-Latin2',
u'Serbian_Srpski-UTF8',
u'Sharanahua-Latin1',
u'Shipibo-Conibo-Latin1',
u'Shona-Latin1',
u'Sinhala-UTF8',
u'Siswati-Latin1',
u'Slovak-Latin2',
u'Slovak_Slovencina-Latin2',
u'Slovenian_Slovenscina-Latin2',
u'SolomonsPidgin_Pijin-Latin1',
u'Somali-Latin1',
u'Soninke_Soninkanxaane-UTF8',
u'Sorbian-Latin2',
u'SouthernSotho_Sotho-Sesotho-Sutu-Sesutu-Latin1',
u'Spanish-Latin1',
u'Spanish_Espanol-Latin1',
u'Sukuma-Latin1',
u'Sundanese-Latin1',
u'Sussu_Soussou-Sosso-Soso-Susu-UTF8',
u'Swaheli-Latin1',
u'Swahili_Kiswahili-Latin1',
u'Swedish_Svenska-Latin1',
u'Tahitian-UTF8',
u'Tenek_Huasteco-Latin1',
u'Tetum-Latin1',
u'Themne_Temne-UTF8',
u'Tiv-Latin1',
u'Toba-UTF8',
u'Tojol-abal-Latin1',
u'TokPisin-Latin1',
u'Tonga-Latin1',
u'Tongan_Tonga-Latin1',
u'Totonaco-Latin1',
u'Trukese_Chuuk-Latin1',
u'Turkish_Turkce-Turkish',
u'Turkish_Turkce-UTF8',
u'Tzeltal-Latin1',
u'Tzotzil-Latin1',
u'Uighur_Uyghur-Latin1',
u'Uighur_Uyghur-UTF8',
u'Ukrainian-Cyrillic',
u'Ukrainian-UTF8',
u'Umbundu-Latin1',
u'Urarina-Latin1',
u'Uzbek-Latin1',
u'Vietnamese-ALRN-UTF8',
u'Vietnamese-UTF8',
u'Vlach-Latin1',
u'Walloon_Wallon-Latin1',
u'Wama-UTF8',
u'Waray-Latin1',
u'Wayuu-Latin1',
u'Welsh_Cymraeg-Latin1',
u'WesternSotho_Tswana-Setswana-Latin1',
u'Wolof-Latin1',
u'Xhosa-Latin1',
u'Yagua-Latin1',
u'Yao-Latin1',
u'Yapese-Latin1',
u'Yoruba-UTF8',
u'Zapoteco-Latin1',
u'Zapoteco-SanLucasQuiavini-Latin1',
u'Zhuang-Latin1',
u'Zulu-Latin1']
In [6]:
#udhr,是超过 300 种语言的世界人权宣言。语料库的 fileids 包括有关文件所使用的字符编码,如:UTF8 或者 Latin1。
nltk.corpus.udhr.words('Javanese-Latin1')[11:]
Out[6]:
[u'Saben', u'umat', u'manungsa', u'lair', u'kanthi', ...]
In [7]:
from nltk.corpus import udhr
In [8]:
languages = ['Chickasaw', 'English', 'German_Deutsch','Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
In [9]:
cfd = nltk.ConditionalFreqDist(
(lang, len(word))
for lang in languages
for word in udhr.words(lang + '-Latin1'))
In [10]:
cfd.plot()
In [11]:
cfd.plot(cumulative = True)
In [14]:
'''
NLTK中定义的基 语料库函数:使用help(nltk.corpus.reader)可以找到更多的文档, 也可以阅读 http://www.nltk.org/howto 上的在线语料库的 HOWTO。
示例 描述
fileids() 语料库中的文件
fileids([categories]) 这些分类对应的语料库中的文件
categories() 语料库中的分类
categories([fileids]) 这些文件对应的语料库中的分类
raw() 语料库的原始内容
raw(fileids=[f1,f2,f3]) 指定文件的原始内容
raw(categories=[c1,c2]) 指定分类的原始内容
words() 整个语料库中的词汇
words(fileids=[f1,f2,f3]) 指定文件中的词汇
words(categories=[c1,c2]) 指定分类中的词汇
sents() 指定分类中的句子
sents(fileids=[f1,f2,f3]) 指定文件中的句子
sents(categories=[c1,c2]) 指定分类中的句子
abspath(file id) 指定文件在磁盘上的位置
enc oding(fileid) 文件的编码(如果知道的话)
open(fileid) 打开指定语料库文件的文件流
root() 到 地安装的语料库根目录的路径
'''
#使用PlaintextCorpusReader加载自己的语料库
#公开发行的语料库的重要来源是语言数据联盟((LDC)和欧洲语言资源局(EL RA)。提供几十种语言的数以百计的已标注文 和语音语料库。
print ''
In [43]:
#2.2 条件频率分布
#条件和事件 频率分布计算观察到的事件,如文 中出现的词汇。条件频率分布需要给每个时间关联 一个条件,
#所以不是处理一个词序列,我们必须处理的是一个配对序列。
#按文体计数词汇
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
(genre, word)
for genre in brown.categories()
for word in brown.words(categories=genre))
genre_word = [(genre, word)
for genre in ['news', 'romance']
for word in brown.words(categories=genre)]
print len(genre_word)
170576
In [44]:
genre_word[:4]
Out[44]:
[('news', u'The'),
('news', u'Fulton'),
('news', u'County'),
('news', u'Grand')]
In [45]:
genre_word[-4:]
Out[45]:
[('romance', u'afraid'),
('romance', u'not'),
('romance', u"''"),
('romance', u'.')]
In [46]:
cfd = nltk.ConditionalFreqDist(genre_word)
In [47]:
cfd.conditions()
Out[47]:
['romance', 'news']
In [48]:
cfd['news']
Out[48]:
FreqDist({u'sunbonnet': 1,
u'Elevated': 1,
u'narcotic': 2,
u'four': 73,
u'woods': 4,
u'railing': 1,
u'Until': 5,
u'aggression': 1,
u'marching': 2,
u'increase': 24,
u'eligible': 4,
u'electricity': 1,
u'$25-a-plate': 1,
u'wheeled': 2,
u'Casey': 6,
u'all-county': 1,
u'Belgians': 20,
u'Western': 7,
u'dependency': 2,
u'1959-60': 1,
u'Duhagon': 1,
u'sinking': 1,
u'1,119': 1,
u'co-operation': 1,
u'Famed': 1,
u'regional': 2,
u'Charitable': 1,
u'appropriation': 2,
u'yellow': 3,
u'Old': 11,
u'Heights': 1,
u'bringing': 2,
u'Policies': 2,
u'prize': 5,
u'Loen': 1,
u'Publique': 2,
u'wooden': 1,
u'Loeb': 1,
u'specialties': 1,
u'Sands': 1,
u'succession': 1,
u'Paul': 6,
u'shows': 4,
u'commented': 7,
u'Screw': 1,
u'charter': 15,
u'Oslo': 5,
u'tired': 3,
u'pulse': 1,
u'tires': 3,
u'271': 1,
u'second': 35,
u'273': 1,
u'Pampa': 2,
u'DiVarco': 1,
u'errors': 8,
u'Initially': 1,
u'Lucille': 2,
u'boogie': 1,
u'contributed': 5,
u'Seekonk': 2,
u'Hamilton': 2,
u'designing': 2,
u'College': 20,
u'increasing': 2,
u'Presidential': 2,
u'dispatched': 3,
u'hero': 7,
u'Sioux': 1,
u'Foundation': 3,
u'Munoz': 1,
u'error': 2,
u'here': 67,
u'reported': 28,
u'affiliated': 1,
u'Footnotes': 1,
u'Stephanie': 2,
u'doldrums': 1,
u'cyclical': 1,
u'kids': 6,
u'Fernberger': 1,
u'elaborate': 3,
u'climbed': 2,
u'reports': 12,
u'controversy': 5,
u'Boxwood': 1,
u'military': 30,
u'Walters': 1,
u'Isles': 1,
u'rebel': 3,
u'golden': 5,
u'Quincy': 1,
u'ground': 10,
u'Harvey': 7,
u'explained': 13,
u'precincts': 4,
u'Three': 4,
u'replace': 4,
u'brought': 21,
u'beneficiaries': 1,
u'Basic': 1,
u'Wales': 2,
u'Basin': 2,
u'unit': 10,
u'opponents': 1,
u'Ronald': 2,
u'Callan': 1,
u'spoke': 4,
u'tardiness': 1,
u'Slate': 5,
u'century': 2,
u'Admitting': 1,
u'Anticipated': 1,
u'occupying': 1,
u'Vernon': 4,
u'Tex.': 5,
u'music': 12,
u'therefore': 7,
u'passport': 1,
u'unfortunately': 1,
u'strike': 12,
u'heralded': 1,
u'until': 28,
u'Tudor': 1,
u'Stepanovich': 2,
u'females': 1,
u'Christine': 2,
u'successful': 10,
u'brings': 1,
u'whirling': 1,
u'Rule': 2,
u'99': 1,
u'Person': 2,
u'menaced': 1,
u'tying': 1,
u'90': 2,
u'hole': 17,
u'hold': 10,
u'95': 3,
u'circumstances': 5,
u'locked': 1,
u'brutality': 2,
u'Wilderness': 1,
u'homemakers': 1,
u'famed': 1,
u'accomplishment': 1,
u'Professors': 2,
u'Westphalia': 2,
u'temperatures': 1,
u'Travelers': 1,
u'centralization': 2,
u'example': 15,
u'Le': 1,
u'La': 7,
u'household': 3,
u'artillery': 2,
u'organized': 6,
u'Briar': 1,
u'Smith-Colmer': 1,
u'currency': 1,
u'caution': 1,
u'reviewing': 2,
u'want': 16,
u'counseling': 3,
u'Easier': 1,
u'absolute': 1,
u'preferably': 1,
u'hog': 2,
u'hoc': 1,
u'knows': 6,
u'complaining': 1,
u'travel': 2,
u'drying': 2,
u'feature': 4,
u'Gardner': 1,
u'machine': 5,
u'how': 37,
u'hot': 9,
u'significance': 4,
u'Comedian': 1,
u'Gerosa': 2,
u'preferable': 1,
u"He'll": 2,
u'A': 137,
u'beauty': 4,
u'inherent': 2,
u'L.': 25,
u'swing': 1,
u'outlawed': 1,
u'Players': 1,
u'modest': 7,
u'Reese': 3,
u'destined': 1,
u'fourteen-team': 1,
u'sentencing': 1,
u'types': 4,
u'ballroom': 4,
u'effective': 15,
u'down-payments': 1,
u'youths': 4,
u'romped': 1,
u'revolt': 1,
u'headquarters': 18,
u'Walkers': 1,
u'baggage': 2,
u'18th': 4,
u'4-7/8': 1,
u'Another': 7,
u'keeps': 3,
u'democratic': 1,
u'wing': 4,
u'wind': 4,
u'leisurely': 1,
u'Willy': 1,
u'senators': 4,
u'$840,000': 1,
u'welcomed': 2,
u'Edith': 1,
u'Housing': 3,
u'reforms': 4,
u'vary': 2,
u'kickoff': 2,
u'thousands': 7,
u'Dussa': 1,
u'Toll': 1,
u'Ludwig': 1,
u'Commies': 1,
u'His': 29,
u'Hit': 1,
u'surviving': 1,
u'fit': 3,
u'striking': 4,
u"Gardner's": 4,
u'secede': 1,
u'survivors': 1,
u'Harris': 8,
u'Barber': 5,
u'Palsy': 1,
u'hidden': 1,
u'county-wide': 2,
u'Sinfonica': 1,
u'slate': 1,
u'vouchers': 1,
u'detachment': 1,
u'effects': 2,
u'schools': 37,
u'P.m.': 1,
u'undeveloped': 1,
u'silver': 3,
u'glutted': 1,
u'headboard': 1,
u'represents': 2,
u'debut': 4,
u"road's": 3,
u'skills': 3,
u'McCluskey': 1,
u'4-year-old': 1,
u'clientele': 1,
u'Seidel': 2,
u"employers'": 1,
u'Superior': 5,
u'preceded': 3,
u'financial': 11,
u'reputedly': 1,
u'series': 25,
u'finger-paint': 1,
u'Mongolia': 1,
u"NATO's": 1,
u'3-to-o': 1,
u'whiz': 2,
u"we'd": 1,
u'substantially': 1,
u'laboratory': 2,
u'tricked': 1,
u"House's": 2,
u'whip': 2,
u'borne': 1,
u'misfortune': 2,
u'two-and-a-half-mile': 1,
u'ten-concert': 1,
u'5847': 1,
u'flooded': 1,
u'encourage': 3,
u'millions': 9,
u'Super': 1,
u"Simpson's": 1,
u'sexton': 1,
u'foundation': 1,
u'inhabitants': 2,
u'Word': 1,
u"Nugent's": 1,
u'Extension': 1,
u'sellout': 1,
u'University': 42,
u'Work': 1,
u'threatened': 4,
u'3-to-3': 1,
u'Marcus': 1,
u'3-run': 1,
u'Jacques': 1,
u'sheet': 1,
u'estimate': 3,
u'alert': 1,
u'substantiation': 1,
u'cornerstone': 1,
u'enormous': 1,
u'Hord': 1,
u'shelves': 1,
u'24-inch': 1,
u'shipped': 1,
u'musicians': 3,
u'speedy': 1,
u'coeds': 1,
u'Human': 2,
u'1970s': 1,
u'reserving': 1,
u'repealed': 1,
u'Palmer': 42,
u'hearsay': 1,
u'Convair': 1,
u"Al's": 1,
u'Due': 1,
u'channels': 1,
u'wash': 2,
u'$18.9': 1,
u'175': 1,
u'174': 1,
u'173': 1,
u'$18.2': 1,
u'Steelers': 2,
u'basketball': 5,
u'service': 37,
u'engagement': 4,
u'returns': 2,
u'needed': 26,
u'Simmons': 2,
u'master': 5,
u'listed': 10,
u'Dumont': 2,
u'legs': 3,
u'bitter': 5,
u'ranging': 3,
u'listen': 2,
u'collapse': 1,
u'predictably': 1,
u'convention': 2,
u'wisdom': 5,
u'advisement': 2,
u'task': 5,
u'ASDIC': 1,
u'defaulted': 1,
u'Bertoia': 1,
u'peril': 1,
u'outlay': 2,
u'showed': 5,
u'elegant': 2,
u'Carroll': 2,
u'walloped': 1,
u'nations': 12,
u'project': 26,
u'percentages': 1,
u'idle': 2,
u'Ilona': 1,
u'skimmed': 1,
u'feeling': 10,
u'acquisition': 1,
u'Cody': 1,
u'Angelo': 3,
u'acclaim': 1,
u'entail': 1,
u'willingness': 2,
u'Chicago': 22,
u'Woodyard': 1,
u'Mullenax': 2,
u'spent': 12,
u'Mont.': 1,
u"Lanin's": 1,
u'Gursel': 3,
u'picks': 1,
u'Angels': 1,
u'Williams': 14,
u'dozen': 5,
u'Then': 17,
u'person': 9,
u'bleachers': 2,
u'responsible': 5,
u'Myron': 1,
u'Band': 1,
u'recommended': 9,
u'absorbed': 4,
u'Minister': 5,
u'They': 62,
u'season': 43,
u'Ask': 1,
u'grips': 2,
u'Missionary': 1,
u'Jones': 22,
u'Lynn': 4,
u'Wise': 1,
u'shall': 5,
u'Wish': 1,
u'object': 2,
u'vexing': 1,
u'debonair': 1,
u'affirmation': 1,
u'mouth': 2,
u'letter': 7,
u'conceded': 1,
u'putout': 1,
u'Galveston': 2,
u'episode': 2,
u'Texans': 7,
u'professor': 1,
u'camp': 4,
u'Bulloch': 2,
u'Journal-Bulletin': 1,
u'ruthless': 1,
u'independents': 1,
u'prevention': 2,
u'Mansion': 1,
u'detriment': 1,
u'nineteenth': 1,
u'mating': 1,
u'purged': 1,
u'incomplete': 1,
u'marvel': 3,
u'saying': 8,
u'signatures': 5,
u'bomb': 10,
u'reactor': 3,
u'Symonds': 1,
u'U-2': 1,
u'Union': 14,
u'orchestra': 9,
u'meetings': 5,
u'Agency': 1,
u'parolees': 2,
u'nominated': 2,
u'undue': 2,
u"Communism's": 1,
u'cooking': 1,
u'judgeship': 1,
u'Paradise': 2,
u'culminates': 1,
u'driving': 12,
u'Congressional': 3,
u'Meyner': 3,
u'touches': 2,
u'busy': 3,
u'clicked': 1,
u'Extend': 1,
u'695': 1,
u'headline': 2,
u'menu': 1,
u'Moller': 1,
u'than': 138,
u'Tiao': 1,
u'theme': 5,
u'touched': 2,
u'rich': 5,
u'Coliseum': 1,
u'submarine-ball': 1,
u"Berlin's": 1,
u'plate': 5,
u'D.C.': 3,
u'$15': 1,
u'Nevertheless': 1,
u'television': 13,
u"AID's": 1,
u'pocket': 1,
u'Mears': 1,
u"Ruth's": 7,
u'Sports': 4,
u'societies': 2,
u'Senators': 4,
u'greens': 2,
u'bloodstream': 1,
u'ever': 32,
u'Rip': 1,
u'Rio': 2,
u'three-year': 2,
u'flanked': 1,
u'release': 3,
u"leader's": 4,
u'U-I': 1,
u'respond': 1,
u'mandatory': 1,
u'disaster': 1,
u'fair': 10,
u'Bennington': 1,
u'transferred': 1,
u'pads': 1,
u'Brevard': 5,
u'glad': 1,
u'result': 30,
u'fail': 2,
u'Ave.': 10,
u'resigned': 5,
u'best': 29,
u'pricking': 1,
u"Braves'": 1,
u'lots': 3,
u'Heinkel': 2,
u'rings': 2,
u"'20's": 1,
u'injuries': 3,
u'224-170': 1,
u'pressures': 1,
u'score': 11,
u'Lockies': 2,
u'toolmaker': 1,
u'preserve': 4,
u'indecisive': 1,
u'redistricting': 1,
u'never': 38,
u"Meyner's": 1,
u'nationwide': 2,
u'nature': 7,
u'rolled': 2,
u'punted': 1,
u'authorizing': 2,
u'lefthanders': 1,
u'drew': 6,
u'extent': 1,
u'Bronx': 5,
u'Peterson': 4,
u'roller': 1,
u'Capello': 1,
u"war's": 1,
u'accident': 7,
u'met': 8,
u'country': 24,
u'conclusions': 1,
u'demanded': 3,
u'Vacancy': 1,
u'planned': 9,
u'logic': 1,
u'federalism': 1,
u'argue': 2,
u'asked': 34,
u'30th': 1,
u'Apartment': 1,
u'liberal-conservative': 1,
u'troublesome': 1,
u'25%': 1,
u'102': 1,
u'month-long': 1,
u'250': 2,
u'255': 1,
u'relearns': 1,
u'Clarence': 3,
u'reconsideration': 2,
u'Sitting': 1,
u'union': 21,
u'Rizzuto': 1,
u'breakoff': 1,
u'.': 4030,
u'Nischwitz': 3,
u'extraction': 1,
u'startled': 1,
u'stadium': 3,
u'Cherry': 3,
u'privilege': 3,
u'one-week-old': 1,
u'Flowers': 2,
u'dots': 1,
u'Precise': 1,
u'life': 17,
u'retrospect': 1,
u'Tokyo': 1,
u'worker': 2,
u'allotting': 1,
u'Stella': 1,
u'1,212,000': 1,
u'child': 9,
u'worked': 12,
u'Gloriana': 3,
u'Holmes': 9,
u'commerce': 3,
u'presidency': 2,
u'administrative': 3,
u'employ': 2,
u'misconstrued': 1,
u'1213-15': 1,
u'Campbell': 1,
u"Gannon's": 1,
u'Brandt': 5,
u'Zurcher': 2,
u'played': 19,
u'Innumerable': 1,
u'conditioned': 2,
u'player': 6,
u'eighteen': 3,
u'London-based': 1,
u'Courtney': 1,
u'Puerto': 2,
u'churchmen': 2,
u'doorman': 1,
u'specter': 1,
u'trusted': 2,
u'Phouma': 3,
u'damaged': 2,
u'Valley': 1,
u"Dresbachs'": 1,
u'things': 10,
u'cumulative': 1,
u'rebellion': 1,
u'Newman': 1,
u'socialized': 1,
u'300': 6,
u'harmony': 1,
u'babies': 2,
u'pre-school': 1,
u'fairly': 3,
u'Budapest': 2,
u'saluted': 1,
u'Maybe': 2,
u'torpedoes': 1,
u'Angeles': 12,
u'photographers': 1,
u'Peking': 1,
u'Living': 3,
u'5-to-2': 1,
u'5-to-3': 1,
u"Stevenses'": 1,
u'protected': 1,
u'furlough': 1,
u'matters': 3,
u'vice-president': 2,
u'academic': 10,
u"d'etat": 1,
u'telephone': 8,
u'echoes': 1,
u'corporate': 3,
u'Funeral': 5,
u'fittest': 1,
u'opinions': 3,
u'spurred': 1,
u'provocation': 2,
u'capitol': 1,
u'sleeps': 1,
u'Subsequent': 1,
u'distribute': 2,
u'fantastic': 1,
u'plight': 1,
u'rushing': 5,
u'succeeding': 1,
u'previous': 11,
u'ham': 2,
u'duffer': 1,
u'Oscar': 2,
u'ease': 2,
u'Odell': 1,
u'had': 279,
u'emphasis': 5,
u'Leonard': 6,
u'Mohammedanism': 1,
u'Connecticut': 2,
u'collections': 4,
u'easy': 4,
u'prison': 7,
u'has': 300,
u'hat': 1,
u'Apart': 3,
u'municipal': 4,
u'mediocre': 1,
u'Dawson': 1,
u'survival': 2,
u'disagreement': 3,
u'possible': 28,
u"rocket's": 2,
u'firmer': 3,
u'possibly': 3,
u'opener': 3,
u'birth': 5,
u'Missouri': 3,
u'clustered': 1,
u'pertinent': 1,
u'unique': 1,
u'$2,170': 1,
u'desire': 4,
u'county': 26,
u"bridegroom's": 2,
u'seaside': 1,
u'misled': 1,
u'steps': 8,
u'Shrove': 1,
u"court's": 1,
u'Further': 2,
u'Pentagon': 2,
u"Louis's": 1,
u'Warren': 13,
u'attorney': 17,
u'right': 33,
u'old': 23,
u'crowd': 8,
u'$1,000,000,000': 1,
u'creed': 3,
u'Expressways': 1,
u'crown': 2,
u'System': 2,
u'culpas': 1,
u'3,325': 1,
u'Conservation': 4,
u'glove': 4,
u'Noel': 1,
u'Between': 1,
u'enemies': 1,
u'MacDonald': 2,
u'for': 943,
u'bottom': 1,
u'p.m.': 38,
u'contributing': 1,
u'individuals': 5,
u'summoned': 4,
u'pondered': 1,
u'Celebration': 1,
u'Donnelly': 1,
u'Instant': 1,
u'Calls': 2,
u"ol'": 1,
u'dental': 6,
u'6,000': 1,
u'shifting': 2,
u'defensive': 7,
u'losing': 5,
u'brokerage': 1,
u'manufacturing': 5,
u'shaken': 2,
u'Macon': 2,
u'benches': 1,
u'boiling': 1,
u'dollars': 15,
u'citizens': 6,
u'globetrotter': 1,
u'despair': 1,
u'stoked': 1,
u'lacked': 3,
u'slightly': 4,
u'meddle': 1,
u'consulting': 3,
u'statements': 9,
u'Cal.': 1,
u'Blacks': 1,
u'honeymoon': 3,
u'Scotland': 2,
u'son': 22,
u'undermining': 1,
u'Misses': 1,
u'one-fourth': 1,
u'raiser': 1,
u'raises': 3,
u'sow': 1,
u'stockholder': 1,
u'reducing': 2,
u'defendants': 9,
u'Hank': 4,
u'collectors': 1,
u'162': 1,
u'support': 24,
u'constantly': 3,
u'busy-work': 1,
u'Hand': 1,
u'symphony': 1,
u"boy's": 2,
u'10,000,000': 1,
u'resulted': 6,
u'call': 14,
u'happy': 12,
u'offer': 9,
u'understandably': 1,
u'forming': 2,
u'Completing': 1,
u'Acres': 2,
u'talents': 2,
u'understandable': 2,
u'incinerator': 1,
u'underdeveloped': 1,
u'duel': 3,
u"else's": 1,
u'Toni': 1,
u'inside': 4,
u'goutte': 1,
u'Waldorf-Astoria': 2,
u'County': 35,
u'unanimous': 2,
u'Guests': 4,
u'Tony': 3,
u'Hawksley': 10,
u'Enrique': 1,
u'panels': 5,
u'Weatherford': 2,
u'Stallard': 1,
u'8,293': 1,
u'150': 3,
u'juvenile': 5,
u'later': 34,
u'liberal': 4,
u'154': 3,
u'Trooper': 1,
u'Six': 5,
u'proven': 1,
u"''": 702,
u'Virgin': 2,
u'Squad': 1,
u'exist': 1,
u'Pittsboro': 1,
u'Sid': 1,
u'segregationist': 1,
u'acacia': 3,
u'dealer': 4,
u'negotiations': 11,
u'Knoll': 1,
u'McDaniel': 2,
u'college': 18,
u'protested': 1,
u'Noting': 2,
u'Practice': 1,
u'eventual': 3,
u'floor': 14,
u'Track': 1,
u'Possible': 1,
u'crowns': 1,
u'flood': 1,
u'Nolan': 1,
u'republic': 1,
u'amicable': 1,
u'ambitious': 1,
u'entomologist': 1,
u'Norristown': 1,
u'smell': 2,
u'roll': 4,
u'steamship': 1,
u'intend': 3,
u'Lenny': 1,
u'models': 2,
u'high-wage': 1,
u'Western-style': 1,
u'Luthuli': 1,
u'eminent': 1,
u'scale': 1,
u'smelling': 1,
u'persecution': 1,
u'source': 7,
u'Charley': 4,
u'fastened': 1,
u'Mostly': 1,
u'Debutante': 2,
u"workers'": 1,
u'Charles': 22,
u'Quaker': 1,
u"O'Hare": 1,
u'Fifth': 1,
u'time': 97,
u'push': 2,
u'conferred': 1,
u'Empire': 2,
u'Principal': 2,
u'gown': 6,
u'smelts': 1,
u'chain': 2,
u'criteria': 2,
u'Indians': 4,
u'Nicklaus': 1,
u'integration': 6,
u'tee': 4,
u'theaters': 6,
u'645-acre': 1,
u'Wabash': 1,
u'Indiana': 2,
u'chair': 1,
u'$278,877,000': 1,
u'Beyeler': 2,
u'ballet': 7,
u'92': 1,
u'8861': 1,
u'900-student': 1,
u'sweat-suits': 1,
u'shouldda': 1,
u'carpenters': 1,
u'Bahi': 1,
u'96': 1,
u'verbally': 1,
u'recipient': 3,
u'Prize': 1,
u'Political': 1,
u"Howsam's": 1,
u'choice': 6,
u'Lyle': 1,
u'alcoholics': 2,
u'mourn': 1,
u'stays': 1,
u'southpaw': 5,
u'right-handed': 1,
u'exact': 1,
u'minute': 1,
u'Tau': 1,
u'1.10.8': 1,
u'3-month': 1,
u'Fifteen': 1,
u'1.10.4': 1,
u'rights': 3,
u'Tax': 5,
u'make': 43,
u'1.10.1': 1,
u'leave': 5,
u'solved': 2,
u'depositors': 1,
u'settle': 1,
u'team': 33,
u'Patience': 1,
u'prevent': 12,
u'spiritual': 1,
u'$80,738': 2,
u'M.': 22,
u'prediction': 1,
u'sign': 9,
u'Bldg.': 1,
u'3505o': 1,
u'ogled': 1,
u'Lt.': 1,
u'Look': 3,
u'Associations': 1,
u'Adamson': 1,
u'jeopardy': 1,
u'celebrated': 2,
u'locker': 3,
u'melt': 1,
u'current': 13,
u'wayward': 1,
u"Tuttle's": 1,
u'Southwest': 2,
u'boost': 6,
u'Lopez': 1,
u'Me': 2,
u'drafted': 1,
u'jury': 44,
u'funeral': 2,
u'understanding': 5,
u"Leopold's": 1,
u'yards': 22,
u'address': 12,
u'alone': 8,
u'along': 34,
u'$80': 1,
u'My': 9,
u'Godwin': 1,
u'nitroglycerine': 1,
u'passengers': 3,
u'revenues': 13,
u'Associated': 2,
u'Cornell': 1,
u'transition': 3,
u'brilliant': 3,
u'saws': 2,
u'studied': 7,
u'wherever': 1,
u'Casals': 1,
u'accomplished': 4,
u'studies': 2,
u'influx': 1,
u'tasks': 2,
u'love': 3,
u'Hagner': 1,
u'Thornton': 1,
u'prefer': 3,
u"Leavitt's": 1,
u'jolt': 1,
u'Lisle': 1,
u'redevelopers': 1,
u'Davidson': 1,
u'opposes': 2,
u'cocktail': 6,
u'August': 12,
u'working': 16,
u'Sarasota': 1,
u'positive': 2,
u'angry': 3,
u'tightly': 1,
u'Ghormley': 1,
u'cherished': 1,
u'wood': 1,
u'opposed': 9,
u'films': 3,
u'scope': 1,
u'Pinsk': 1,
u'Those': 8,
u'loving': 1,
u'``': 732,
u'Klaus': 1,
u'afford': 4,
u'subsistence': 1,
u'apparent': 7,
u'validity': 1,
u'Jimmy': 4,
u'virtue': 2,
u'Achaeans': 1,
u'scratches': 3,
u'Retail': 1,
u'Opelika': 1,
...})
In [49]:
cfd['romance']
Out[49]:
FreqDist({u'raining': 2,
u'sitters': 1,
u'yellow': 13,
u'keno': 1,
u'four': 8,
u'Does': 2,
u'railing': 1,
u'ringlets': 1,
u'self-pity': 2,
u'snowing': 1,
u'Myra': 24,
u'Ronald': 3,
u'Western': 2,
u'lore': 1,
u'portentous': 1,
u'immature': 1,
u'shaving': 1,
u'Elec': 9,
u'foul': 1,
u'experimentally': 1,
u'bringing': 3,
u'prize': 1,
u'wooden': 3,
u'piling': 1,
u'freckles': 2,
u'persisted': 1,
u'woods': 1,
u'succession': 1,
u'Paul': 1,
u'Jerez': 1,
u'straight': 13,
u'Rachel': 16,
u'tired': 11,
u'hanging': 3,
u'pulse': 2,
u'elegant': 2,
u'second': 15,
u'valiant': 1,
u'sailed': 1,
u'scraped': 1,
u'loathing': 1,
u'nigs': 1,
u"gran'dad": 1,
u"this'll": 1,
u'Initially': 1,
u'thunder': 2,
u'contributed': 1,
u'fingers': 13,
u'Wrong': 1,
u'Hamilton': 1,
u'outfielders': 1,
u'replaced': 2,
u'hero': 1,
u'chins': 1,
u'jubilantly': 1,
u'interdependent': 1,
u'here': 65,
u'reported': 3,
u'chassis': 1,
u'china': 1,
u'hers': 3,
u'shriek': 1,
u'Yuki': 1,
u'kids': 6,
u'unwillingness': 1,
u'elaborate': 2,
u'climbed': 7,
u'cheerfully': 1,
u'golden': 3,
u'explained': 7,
u'Three': 5,
u'brought': 19,
u'remnant': 1,
u'stern': 1,
u'Wales': 3,
u'Pompeii': 5,
u'spoke': 12,
u'moth': 1,
u'symphony': 2,
u'music': 6,
u'telegraph': 1,
u'strike': 1,
u'Brainards': 1,
u'until': 34,
u'holy': 2,
u'populations': 1,
u'successful': 3,
u'brings': 2,
u'whirling': 2,
u'hurt': 7,
u'glass': 7,
u"ever'body": 1,
u'hole': 2,
u'hold': 12,
u'circumstances': 4,
u"captain's": 2,
u'locked': 1,
u'Wilderness': 1,
u'plunged': 2,
u'locker': 2,
u'sweeter': 2,
u'leaped': 1,
u'centralization': 1,
u'example': 2,
u'Le': 1,
u'wand': 1,
u'household': 2,
u'organized': 1,
u'caution': 1,
u'want': 37,
u'pinto': 1,
u'absolute': 1,
u'preferably': 1,
u"cane's": 1,
u'groaned': 1,
u'hon': 2,
u'travel': 2,
u'drying': 2,
u'feature': 1,
u'machine': 3,
u'how': 60,
u'hot': 15,
u'hop': 2,
u'significance': 1,
u"He'll": 1,
u'dignified': 1,
u'fanaticism': 1,
u'A': 48,
u'uselessness': 1,
u'beauty': 6,
u'assimilated': 1,
u'swing': 3,
u'despondent': 1,
u'wrong': 23,
u'chump': 1,
u'outcry': 3,
u"dryin'": 2,
u'Quint': 11,
u'presiding': 1,
u'tulip': 1,
u'Another': 3,
u'keeps': 3,
u'wind': 6,
u'wine': 5,
u'restriction': 1,
u"Daddy's": 1,
u'snugly': 1,
u'dreamed': 4,
u'ice-feeling': 1,
u'wrought': 1,
u'His': 44,
u'Hit': 1,
u'fit': 4,
u'screaming': 3,
u'fig': 2,
u'Him': 1,
u'Implements': 1,
u'put-upon': 1,
u'detachment': 1,
u'schools': 2,
u'sixteen': 1,
u'silver': 3,
u'blazer': 1,
u'arrow': 1,
u'blushed': 1,
u'expectancy': 1,
u'burial': 2,
u'preceded': 2,
u'snakes': 5,
u'series': 2,
u"we'd": 4,
u'mutineer': 1,
u'message': 1,
u'whip': 6,
u'borne': 1,
u'misfortune': 1,
u'drove': 10,
u'encourage': 3,
u'hangouts': 1,
u'engineer': 4,
u'foundation': 1,
u'stamping': 1,
u'assured': 2,
u'Work': 1,
u'assures': 1,
u'Osric': 1,
u'estimate': 3,
u'enormous': 2,
u'ate': 4,
u'moment': 27,
u'disturbed': 3,
u'Human': 1,
u'necessity': 2,
u'disfigured': 2,
u'Please': 5,
u'spinning': 2,
u'Nerves': 1,
u'hear': 16,
u'clarity': 1,
u'fur-piece': 1,
u'bitten': 1,
u'basketball': 1,
u'renovated': 1,
u'service': 6,
u'similarly': 2,
u'Fearless': 2,
u'engagement': 2,
u"Fudo's": 2,
u'tango': 1,
u'needed': 10,
u'blossoms': 1,
u'Straightened': 1,
u'legs': 4,
u'bitter': 6,
u'Alexander': 20,
u'ramming': 1,
u'frowned': 4,
u'wisdom': 3,
u'Shocked': 1,
u'heart-stopping': 1,
u'crawl': 1,
u'showed': 7,
u'handcuffs': 1,
u'tree': 1,
u'idly': 1,
u'shower': 2,
u'pneumonia': 2,
u'idle': 1,
u'exclaimed': 2,
u'feeling': 21,
u'groaning': 1,
u'dozed': 3,
u'Son': 1,
u'Williams': 2,
u'dozen': 2,
u'Then': 42,
u'person': 9,
u'responsible': 1,
u'eagerly': 2,
u'snuggled': 3,
u'Washoe': 1,
u'absorbed': 3,
u'amusing': 2,
u'doors': 3,
u'Ask': 1,
u'Ash': 1,
u'floorshow': 1,
u'shall': 3,
u'tinkers': 1,
u'wells': 1,
u"aunt's": 1,
u'simplify': 1,
u'mouth': 11,
u'letter': 19,
u'entry': 1,
u'drought': 2,
u'morality': 1,
u'episode': 1,
u'cops': 2,
u'camp': 5,
u'Lucille': 11,
u'nineteenth': 1,
u'scream': 2,
u'came': 75,
u'saying': 18,
u'jocular': 1,
u'padded': 1,
u'participate': 1,
u'conclusion': 1,
u'tempted': 2,
u'cheaply': 1,
u'abreast': 1,
u'lessons': 1,
u'busy': 7,
u'clicked': 2,
u'Reaching': 1,
u'quaint': 2,
u"baby's": 4,
u'than': 65,
u'bush': 2,
u'bliss': 1,
u'touched': 3,
u'rich': 6,
u'foolishly': 1,
u'plate': 5,
u'stammered': 1,
u'pocket': 6,
u'altogether': 2,
u'relish': 2,
u"Officers'": 2,
u'shape': 3,
u'patch': 1,
u'eyelids': 1,
u'lurched': 1,
u'release': 4,
u'prayerful': 1,
u'boarded': 1,
u'Clearly': 2,
u'blew': 1,
u'disaster': 1,
u'fair': 5,
u'flaxen': 1,
u'fail': 2,
u'faim': 1,
u'resigned': 1,
u'Dogs': 1,
u'best': 12,
u'Craddock': 1,
u'lots': 5,
u'rings': 2,
u'kind': 34,
u'pressures': 1,
u'scorn': 1,
u'preserve': 2,
u'claws': 1,
u'never': 84,
u'nationwide': 1,
u'nature': 5,
u'rolled': 6,
u'smelled': 8,
u'lapping': 2,
u'twinkling': 1,
u'defiance': 1,
u'debt': 2,
u'debs': 1,
u'pity': 2,
u'accident': 1,
u'disdain': 1,
u'country': 10,
u'pits': 1,
u'readers': 1,
u'adventures': 1,
u'Laura': 5,
u'planned': 3,
u'marrying': 2,
u'argue': 1,
u'asked': 45,
u'twenty-three': 1,
u'irresponsible': 1,
u'wearying': 1,
u'gypsies': 1,
u'gleaming': 1,
u'Sorry': 1,
u'Sitting': 1,
u'union': 1,
u'subside': 1,
u'.': 3736,
u'much': 69,
u'sommelier': 1,
u'superhuman': 1,
u'dollies': 1,
u'life': 51,
u'spit': 1,
u'eastern': 1,
u'lift': 3,
u'child': 17,
u'worked': 8,
u'chill': 4,
u'contemplated': 1,
u'ferreted': 1,
u'Kezziah': 1,
u'miniature': 3,
u'skirts': 1,
u'remembering': 1,
u'played': 6,
u'player': 3,
u'eighteen': 4,
u'aqueducts': 1,
u'specter': 1,
u'trusted': 2,
u'things': 30,
u'shipwrecked': 1,
u'Did': 12,
u'Fairview': 3,
u'babies': 3,
u'Appleby': 1,
u'boiled': 1,
u'Maybe': 12,
u'middle-aged': 1,
u'supper': 2,
u'tune': 2,
u'holystones': 1,
u'echoed': 1,
u'stillness': 1,
u'raindrops': 1,
u'corporate': 1,
u'spurred': 1,
u'Thank': 3,
u'rotated': 1,
u'beset': 1,
u'exclaiming': 1,
u'ham': 6,
u'Oscar': 1,
u'ease': 1,
u'had': 692,
u'advancement': 1,
u'Surely': 4,
u'innocent': 10,
u'prison': 1,
u'has': 26,
u'hat': 5,
u"t's": 1,
u'casually': 1,
u'elders': 1,
u'possible': 9,
u'Broiled': 1,
u'possibly': 4,
u'birth': 1,
u'shadow': 3,
u'unique': 1,
u'occurring': 1,
u'desire': 2,
u'Midshipman': 2,
u'eight-by-ten': 1,
u'remind': 1,
u'steps': 11,
u'finely-spun': 1,
u'Warren': 9,
u"Emma's": 2,
u'right': 55,
u'old': 73,
u'creek': 1,
u'crowd': 4,
u'people': 48,
u'easy': 6,
u'crown': 1,
u"an'": 2,
u'glove': 3,
u'creep': 2,
u'enemies': 1,
u'gasps': 1,
u'ruffled': 1,
u'for': 410,
u'bottom': 6,
u'hulks': 1,
u'forbore': 1,
u'plucked': 1,
u'contributing': 2,
u'fog': 7,
u'summoned': 1,
u'Remy': 1,
u'post': 1,
u'substituted': 1,
u'shifting': 1,
u'starring': 1,
u'bowing': 3,
u'manufacturing': 1,
u'Think': 1,
u'shaken': 2,
u'First': 1,
u'foolish': 3,
u'Caneli': 1,
u'benches': 2,
u'boiling': 3,
u'dollars': 7,
u'rebuffed': 1,
u'Valentine': 1,
u'despair': 2,
u'slightly': 7,
u'expertly': 1,
u'raised': 6,
u'gauze': 1,
u'statements': 1,
u'son': 33,
u'thankful': 3,
u'magazines': 1,
u'Korean': 1,
u'fabric': 1,
u"Thoreau's": 1,
u'support': 2,
u'tame': 2,
u'absolutely': 2,
u"boy's": 4,
u'greatness': 2,
u'call': 24,
u'overhand': 1,
u'happy': 8,
u'offer': 3,
u'fascination': 1,
u'forming': 1,
u'conclusively': 1,
u'shrilly': 1,
u'peppermints': 2,
u'Seeing': 2,
u'inside': 11,
u'devices': 1,
u'County': 1,
u'Tony': 1,
u'Damn': 1,
u'Sis': 2,
u'Sir': 1,
u'later': 19,
u'proved': 5,
u'Sit': 1,
u'steady': 3,
u'wetness': 1,
u'Six': 1,
u'bathrobe': 1,
u'crumble': 1,
u"''": 1044,
u'proves': 1,
u'exist': 2,
u'Francisco': 1,
u'relay': 1,
u"Bartoli's": 1,
u'floor': 13,
u'Weakness': 1,
u'relax': 1,
u'ourselves': 1,
u'overturning': 1,
u"Allstates'": 1,
u'smell': 3,
u'roll': 2,
u'intend': 1,
u'teats': 2,
u'semi-professionally': 1,
u'invested': 1,
u'smelling': 2,
u'rolling': 2,
u'Gardens': 1,
u'congested': 1,
u'Charles': 4,
u'unquenched': 1,
u'time': 93,
u'push': 3,
u'banners': 1,
u'gown': 1,
u'Blackwells': 1,
u'chain': 2,
u'whoever': 1,
u'Indians': 1,
u'bandits': 1,
u'skiing': 1,
u'chair': 5,
u'ballet': 1,
u'religion': 9,
u'rousing': 1,
u'methodically': 1,
u'crates': 1,
u'jerk': 1,
u'choice': 4,
u'alcoholics': 1,
u'stays': 1,
u'spats': 1,
u'fullest': 1,
u'minute': 7,
u'tear': 2,
u'teas': 1,
u'Supply': 1,
u'Tax': 1,
u'make': 49,
u'leave': 24,
u'illustrators': 1,
u'settle': 2,
u'team': 6,
u'Suzanne': 1,
u'unaware': 1,
u'prevent': 1,
u'spiritual': 2,
u'thinkers': 1,
u'meadow': 1,
u'attic': 1,
u'sigh': 1,
u'M.': 1,
u'sign': 6,
u'ogled': 1,
u'depressions': 1,
u'Sewickley': 1,
u'Francie': 4,
u'falling': 2,
u'crackling': 1,
u'Me': 1,
u'banister': 1,
u'banged': 2,
u'assertive': 1,
u'funeral': 6,
u'Mi': 1,
u'understanding': 3,
u'address': 3,
u'alone': 24,
u'along': 33,
u'My': 21,
u'enroll': 1,
u'hurtling': 2,
u'brilliant': 5,
u'studied': 7,
u'wherever': 2,
u'accomplished': 1,
u'sprouted': 1,
u'Doaty': 8,
u'studies': 2,
u'nowhere': 2,
u'love': 32,
u'cacophony': 1,
u'prefer': 1,
u'logical': 1,
u'Alberto': 2,
u'crammed': 1,
u'August': 2,
u'working': 8,
u'wicker': 1,
u'angry': 5,
u'predictions': 1,
u'tightly': 4,
u'Hello': 2,
u'papal': 1,
u'wondering': 6,
u'Those': 5,
u'loving': 2,
u'high-speed': 1,
u'``': 1045,
u'afford': 2,
u'apparent': 1,
u'Telling': 1,
u"She'll": 4,
u'everywhere': 2,
u'virtue': 1,
u'Blackwell': 3,
u'preponderance': 1,
u'anything': 42,
u'Pope': 6,
u'Relatives': 1,
u'values': 1,
u'Showers': 1,
u'believed': 2,
u'Our': 3,
u'detached': 1,
u'Out': 6,
u'Gertrude': 4,
u'admired': 4,
u'frogs': 1,
u'awesome': 1,
u'parachute': 1,
u'hides': 1,
u'admirer': 1,
u'230': 1,
u'Happened': 1,
u'winter': 5,
u'divided': 3,
u'Who': 6,
u'elephant': 1,
u'Why': 28,
u'moon-washed': 1,
u'gumming': 1,
u'spot': 2,
u'Bari': 1,
u'date': 3,
u'such': 21,
u'suck': 1,
u'spouted': 1,
u'revealed': 2,
u'nineties': 1,
u'stress': 1,
u'Captain': 9,
u'natural': 4,
u'conscious': 1,
u'consequently': 1,
u'ordinarily': 1,
u'darkened': 2,
u'so': 174,
u'forebears': 1,
u'swollen': 2,
u'wolves': 1,
u'pulled': 8,
u'Encouraged': 1,
u'years': 34,
u'course': 36,
u'maneuvered': 1,
u'unfavorable': 1,
u'Cromwell': 13,
u'drunker': 1,
u'tore': 3,
u'solitary': 3,
u'thumb': 3,
u'nearsighted': 1,
u'paraded': 1,
u'torn': 3,
u'attraction': 1,
u'thump': 1,
u'Cousin': 22,
u'troubled': 3,
u'parades': 1,
u'mused': 2,
u'apron': 1,
u'civilian': 1,
u'Folly': 3,
u'nation': 2,
u'quok': 1,
u'Gordon': 2,
u'sorted': 1,
u'in-laws': 1,
u'matched': 1,
u'shouted': 11,
u'Yellow': 1,
u'fisherman': 1,
u'veins': 1,
u'quarter': 1,
u'repaired': 1,
u'square': 4,
u'retrieve': 2,
u'bursting': 1,
u'owing': 1,
u'entering': 1,
u'Kong': 3,
u'salads': 1,
u'disasters': 1,
u'rounding': 1,
u'post-operative': 1,
u'Furnaces': 1,
u'seriously': 1,
u'investigation': 2,
u'Joe': 5,
u'bordering': 1,
u'million': 1,
u'possibility': 2,
u'quite': 22,
u'bumps': 1,
u'complicated': 1,
u'Either': 1,
u'intensely': 2,
u'Westfield': 2,
u'training': 4,
u'disguised': 1,
u'modest': 1,
u'aboard': 4,
u'bothersome': 1,
u'puny': 3,
u'emotion': 1,
u'intuition': 1,
u'poling': 2,
u'spoken': 3,
u'Royal': 1,
u'one': 166,
u'chide': 1,
u'potted': 1,
u'open': 18,
u'ripping': 1,
u'city': 10,
u'Miyagi': 2,
u'Monday': 2,
u'bite': 2,
u'shiver': 2,
u'draft': 1,
u'typing': 2,
u'begotten': 1,
u'two-colored': 1,
u'sentinels': 1,
u'padding': 1,
u'ridiculous': 3,
u'slashed': 1,
u'Seven': 4,
u'translate': 1,
u'Ciao': 1,
u'scrumptious': 1,
u'folly': 1,
u'crossroads': 1,
u'future': 6,
u'counselor': 1,
u'janitor': 1,
u'damned': 4,
u'prospect': 1,
u'mountain': 1,
u'illness': 1,
u'flatly': 1,
u'turned': 48,
u'alley': 4,
u'sad': 6,
u'say': 60,
u'rained': 1,
u'buried': 2,
u'dragooned': 1,
u'sap': 1,
u'saw': 47,
u'sat': 32,
u'Esperanza': 1,
u'fashionable': 1,
u'Jewish': 1,
u'aside': 5,
u'Kleenex': 1,
u'note': 13,
u'take': 62,
u'Half': 2,
u'wanting': 3,
u'Hall': 2,
u'to-do': 1,
u'altered': 2,
u'opposite': 4,
u'backyards': 1,
u'knew': 69,
u'remarks': 2,
u'knowingly': 1,
u'inserted': 1,
u'pages': 1,
u'lawn': 2,
u'weather-royal': 1,
u'average': 5,
u'drive': 6,
u'federal': 1,
u'heavily-upholstered': 1,
u'salt': 1,
u'trembled': 1,
u'laws': 2,
u'walking': 12,
u'merit': 1,
u'too-expensive': 1,
u'peaches': 1,
u'propagandist': 1,
u'commissary': 1,
u'aggressive': 1,
u'imagined': 4,
u'Wants': 1,
u'slow': 2,
u'transact': 1,
u'Krishnaists': 1,
u"Myra's": 4,
u'tears': 6,
u'going': 60,
u'robe': 2,
u'clawing': 1,
u'revolutionized': 1,
u'freezing': 3,
u'flowerpot': 2,
u'Speedy': 1,
u'Conneaut': 1,
u'artist': 3,
u'hinges': 1,
u'absurdly': 1,
u'worried': 7,
u'priest': 1,
u"could've": 1,
u'worries': 5,
u'marred': 1,
u'where': 54,
u'vision': 2,
u'orchids': 1,
u'morose': 1,
u'raged': 1,
u'dived': 1,
u'cheesecloth': 1,
u'hastened': 1,
u'aroused': 1,
u'rages': 1,
u'That': 29,
u'jumped': 3,
u'mops': 1,
u'fierceness': 1,
u'bureau': 4,
u'moons': 1,
u'Bible': 2,
u'jobs': 6,
u'screen': 4,
u'aversion': 2,
u'spare': 3,
u'Constance': 1,
u'concentrated': 1,
u'many': 27,
u'thickly': 1,
u'loudly': 1,
u'bare-armed': 1,
u'expression': 1,
u"can't": 24,
u'girl-san': 1,
u'gaiety': 1,
u'twin': 1,
u'sentinel': 1,
u'Riverside': 7,
u'thick-skulled': 1,
u'boat': 2,
u'caring': 2,
u'companionship': 1,
u'teddy': 2,
u'stretch': 1,
u'west': 2,
u'vacation': 2,
u'braving': 1,
u'motives': 1,
u'reflective': 1,
u'Signor': 2,
u'wants': 7,
u'thousand': 3,
u'tightened': 1,
u'Dazed': 1,
u'former': 1,
u'Honshu': 1,
u'pettiness': 1,
u'pretence': 1,
u'straighten': 1,
u'easier': 3,
u'defeatism': 1,
u'newspaper': 2,
u'situation': 5,
u'parboiled': 1,
u'canoe': 1,
u'brow': 2,
u'purse': 2,
u'dubious': 1,
u'quiet': 8,
u'limping': 1,
u'fame': 2,
u'missiles': 1,
u"roulette's": 1,
u'Lovejoy': 3,
u'underestimate': 1,
u'Abernathy': 2,
u'edged': 1,
u'I': 951,
u'edges': 2,
u'barber': 1,
u'tracking': 1,
u'vacant': 1,
u'trains': 1,
u'carpentry': 1,
u'scholarship': 2,
u'summer': 8,
u'sprayed': 1,
u'steamed': 1,
u'being': 49,
u'rest': 17,
u'diapiace': 1,
u'Hurrays': 3,
u'snarled': 1,
u'blondes': 1,
u'grounded': 1,
u'straight-A': 1,
u'instrument': 1,
u'nymphomaniac': 1,
u'dryly': 1,
u"Pietro's": 1,
u'bedstraw': 1,
u"Captain's": 1,
u'aspects': 2,
u'around': 68,
u'gestures': 1,
u'Mont': 1,
u'Aunt': 2,
u'darn': 1,
u'vacuum': 2,
u'world': 26,
u'vague': 1,
u'dare': 2,
u'boast': 2,
u'clam': 1,
u'stranger': 2,
u'Share': 1,
u'souvenirs': 1,
u'discarded': 1,
u'clay': 2,
u'auditorium': 1,
u'seating': 1,
u'Perry': 3,
u'learning': 2,
u'thinks': 4,
u'scholarships': 1,
u'dimensions': 1,
u'strewn': 1,
u'noon': 4,
u'exit': 1,
u'refer': 1,
u'zest': 1,
u'intimate': 3,
u'sprung': 1,
u"You'll": 5,
u'leadership': 7,
u'stone': 6,
u"Roy's": 1,
u'package': 1,
u'industry': 1,
u'Puzzled': 1,
u'favorite': 4,
u'slender': 2,
u'side': 17,
u'Regretfully': 1,
u'neighbor': 1,
u'act': 3,
u'mean': 10,
u'stony': 1,
u'burning': 3,
u'No': 40,
u'image': 8,
u'Acting': 1,
u'lively': 3,
u'parties': 5,
u'bubbly': 1,
u'her': 651,
u'lounging': 1,
u'mindless': 1,
u'sealed': 1,
u'bubble': 2,
u'tireless': 2,
u'Harro': 1,
u'yearned': 1,
u'complete': 3,
u'black-and-yellow': 1,
u"child's": 3,
u'Alternately': 1,
u'foreheads': 1,
u'Small': 3,
u'unreliable': 1,
u'aide': 1,
u'with': 460,
...})
In [50]:
list(cfd['news'])
Out[50]:
[u'stock',
u'sunbonnet',
u'Elevated',
u'narcotic',
u'four',
u'woods',
u'railing',
u'Until',
u'aggression',
u'marching',
u'looking',
u'eligible',
u'electricity',
u'$25-a-plate',
u'consulate',
u'Casey',
u'all-county',
u'Belgians',
u'Western',
u'1959-60',
u'Duhagon',
u'sinking',
u'1,119',
u'co-operation',
u'Famed',
u'regional',
u'Charitable',
u'appropriation',
u'yellow',
u'uncertain',
u'Heights',
u'bringing',
u'prize',
u'Loen',
u'Publique',
u'wooden',
u'Loeb',
u'963',
u'specialties',
u'Sands',
u'succession',
u'Paul',
u'Phyfe',
u'commented',
u'Screw',
u'charter',
u'tired',
u'pulse',
u'tires',
u'271',
u'second',
u'273',
u'Pampa',
u'DiVarco',
u'Electra',
u'errors',
u'fall-off',
u'forgetting',
u'Initially',
u'Lucille',
u'boogie',
u'contributed',
u'Seekonk',
u'Hamilton',
u'designing',
u'replaced',
u'increasing',
u'Presidential',
u'hero',
u'Sioux',
u'whose',
u'Munoz',
u'Church',
u'here',
u'reported',
u'affiliated',
u'Footnotes',
u'Stephanie',
u'doldrums',
u'cyclical',
u'kids',
u'Fernberger',
u'elaborate',
u'climbed',
u'reports',
u'controversy',
u'Pierson',
u'menu',
u'military',
u'Isles',
u'Ervin',
u'golden',
u'Quincy',
u'owed',
u'geography',
u'Harvey',
u'explained',
u'precincts',
u'Three',
u'replace',
u'brought',
u'beneficiaries',
u'Basic',
u'Wales',
u'Basin',
u'unit',
u'opponents',
u'Ronald',
u'ominous',
u'spoke',
u'tardiness',
u'Slate',
u'hungry',
u'Admitting',
u'Anticipated',
u'occupying',
u'Vernon',
u'Tex.',
u'music',
u'therefore',
u'passport',
u'staged',
u'strike',
u'heralded',
u'until',
u'Tudor',
u'Stepanovich',
u'females',
u'Christine',
u'successful',
u'brings',
u'whirling',
u'Rule',
u'99',
u'Person',
u'remembered',
u'menaced',
u'tying',
u'90',
u'hole',
u'hold',
u'95',
u'circumstances',
u'AID',
u'locked',
u'dreadful',
u'Wilderness',
u'Armond',
u'homemakers',
u'famed',
u'accomplishment',
u'Professors',
u'Westphalia',
u'temperatures',
u'132,000',
u'centralization',
u'example',
u'fumes',
u'august',
u'Tournament',
u'La',
u'household',
u'artillery',
u'organized',
u'Briar',
u'Smith-Colmer',
u'currency',
u'caution',
u'reviewing',
u'want',
u'counseling',
u'Whelan',
u'arenas',
u'absolute',
u'preferably',
u'hog',
u'hoc',
u'complaining',
u'travel',
u'drying',
u'stuff',
u'feature',
u'Gardner',
u'machine',
u'how',
u'hot',
u'Delray',
u'significance',
u'Jussel',
u'Stock',
u'preferable',
u"He'll",
u'blue-uniformed',
u'A',
u'beauty',
u'L.',
u'Welfare',
u'206',
u'outlawed',
u'Players',
u'modest',
u'Reese',
u'destined',
u'fourteen-team',
u'sentencing',
u'types',
u'compartment',
u'effective',
u'occasions',
u'down-payments',
u'youths',
u'romped',
u'revolt',
u'headquarters',
u'Walkers',
u'cabled',
u'baggage',
u'18th',
u'4-7/8',
u'Another',
u'keeps',
u'democratic',
u'wing',
u'wind',
u'leisurely',
u'Willy',
u'senators',
u'$840,000',
u'welcomed',
u'Edith',
u'Housing',
u'govern',
u'vary',
u'kickoff',
u'intangible',
u'Dussa',
u'Toll',
u'Ludwig',
u'Commies',
u'some',
u'His',
u'Hit',
u'fit',
u'revenue',
u"Gardner's",
u'secede',
u'survivors',
u'rescued',
u'Harris',
u'Barber',
u'Palsy',
u'hidden',
u'county-wide',
u'virtually',
u'slate',
u'vouchers',
u'detachment',
u'oks',
u'effects',
u'schools',
u'shadows',
u'yourself',
u'undeveloped',
u'silver',
u'Macon',
u'headboard',
u'represents',
u'debut',
u"road's",
u'crops',
u'McCluskey',
u'4-year-old',
u'clientele',
u'Seidel',
u"employers'",
u'Superior',
u'preceded',
u'financial',
u'reputedly',
u'series',
u'finger-paint',
u'Mongolia',
u"NATO's",
u'3-to-o',
u'whiz',
u"we'd",
u'substantially',
u'laboratory',
u"House's",
u'whip',
u'borne',
u'misfortune',
u'drove',
u'ten-concert',
u'ha',
u'Leatherman',
u'freeze',
u'5847',
u'Charges',
u'encourage',
u'engineer',
u'Super',
u"Simpson's",
u'Matisses',
u'foundation',
u'Word',
u'Extension',
u'sellout',
u'looting',
u'University',
u'Work',
u'threatened',
u'3-to-3',
u'3-run',
u'element',
u'checked',
u'estimate',
u'Pakistanis',
u'substantiation',
u'cornerstone',
u'enormous',
u'Hord',
u'shelves',
u'3:57',
u'24-inch',
u'shipped',
u'musicians',
u'speedy',
u'coeds',
u'Human',
u'1970s',
u'reserving',
u'repealed',
u'Espagnol',
u'pastel-like',
u'hearsay',
u'Convair',
u"Al's",
u'Due',
u'channels',
u'wash',
u"Santa's",
u'$18.9',
u'175',
u'174',
u'173',
u'$18.2',
u'Steelers',
u'basketball',
u'service',
u'Lucy',
u'engagement',
u'Skyway',
u'needed',
u'Simmons',
u'master',
u'listed',
u'Dumont',
u'legs',
u'bitter',
u'ranging',
u'listen',
u'collapse',
u'predictably',
u'bounty',
u'nolo',
u'wisdom',
u'advisement',
u'Serving',
u'defaulted',
u'Richmond-Petersburg',
u'Bertoia',
u'peril',
u'outlay',
u'showed',
u'elegant',
u'Inna',
u'Carroll',
u'walloped',
u'nations',
u'project',
u'idle',
u'Ilona',
u'skimmed',
u'feeling',
u'acquisition',
u'Cody',
u'Angelo',
u'Vice',
u'acclaim',
u'entail',
u'willingness',
u'Chicago',
u'Woodyard',
u'Mullenax',
u'craven',
u"Lanin's",
u'Gursel',
u'Son',
u'Angels',
u'Williams',
u'dozen',
u'Then',
u'concrete',
u'bleachers',
u'responsible',
u'Myron',
u'Band',
u'recommended',
u'absorbed',
u'Minister',
u'They',
u'$100,000',
u'Ask',
u'grips',
u'Missionary',
u'Jones',
u'Lynn',
u'Wise',
u'shall',
u'Wish',
u'object',
u'vexing',
u'debonair',
u'affirmation',
u'mouth',
u'letter',
u'conceded',
u'delegate',
u'putout',
u'Galveston',
u'episode',
u'Texans',
u'professor',
u'camp',
u'Bulloch',
u'Journal-Bulletin',
u'ruthless',
u'Democrat',
u'China',
u'paneling',
u'Mansion',
u'detriment',
u'nineteenth',
u'mating',
u'purged',
u'incomplete',
u'marvel',
u'saying',
u'signatures',
u'bomb',
u'reactor',
u'Symonds',
u'U-2',
u'Union',
u'Schenk',
u'meetings',
u'Agency',
u'parolees',
u'nominated',
u'undue',
u"Communism's",
u'cooking',
u'judgeship',
u'Paradise',
u'Congressional',
u'Meyner',
u'touches',
u'busy',
u'clicked',
u'Extend',
u'695',
u'headline',
u'buss',
u'Moller',
u'haze',
u'appreciated',
u'Tiao',
u'theme',
u'touched',
u'rich',
u'Darrow',
u'submarine-ball',
u'Klux',
u'lady',
u'plate',
u'D.C.',
u'cubic',
u'professionals',
u'Nevertheless',
u'$16',
u'untrammeled',
u'pocket',
u'$17',
u"Ruth's",
u'societies',
u'Senators',
u'greens',
u'maverick',
u'Rip',
u'Rio',
u'three-year',
u'adjourned',
u'release',
u"leader's",
u'U-I',
u'generosity',
u'respond',
u'mandatory',
u'disaster',
u'fair',
u'irritable',
u'Bennington',
u'reconvened',
u'pads',
u'Brevard',
u'result',
u'fail',
u'mea',
u'resigned',
u'fastened',
u'best',
u'pricking',
u'Emory',
u"Braves'",
u'lots',
u'Heinkel',
u'rings',
u"'20's",
u'224-170',
u'pressures',
u'score',
u'Lockies',
u'toolmaker',
u'preserve',
u'wage',
u'redistricting',
u'men',
u"Meyner's",
u'nationwide',
u'nature',
u'rolled',
u'impetus',
u'authorizing',
u'lefthanders',
u'Burbank',
u'extent',
u'Bronx',
u'marines',
u'roller',
u'Capello',
u"war's",
u'accident',
u'refinement',
u'country',
u'readers',
u'demanded',
u'Vacancy',
u"today's",
u'erupted',
u'planned',
u'logic',
u'federalism',
u'argue',
u"High's",
u'asked',
u'30th',
u'Apartment',
u'liberal-conservative',
u'Korman',
u'25%',
u'active',
u'rapport',
u'month-long',
u'250',
u'exports',
u'255',
u'relearns',
u'Clarence',
u'reconsideration',
u'shouting',
u'union',
u'Curry',
u'feathers',
u'breakoff',
u'.',
u'Nischwitz',
u'extraction',
u'startled',
u'stadium',
u'Insofar',
u'privilege',
u'one-week-old',
u'Flowers',
u'dots',
u'Precise',
u'life',
u'retrospect',
u'Tokyo',
u'worker',
u'allotting',
u'1,212,000',
u'child',
u'worked',
u'Gloriana',
u'Elected',
u'Holmes',
u'commerce',
u'presidency',
u'Chips',
u'1671',
u'employ',
u'misconstrued',
u'1213-15',
u'Campbell',
u"Gannon's",
u'harvesting',
u'Zurcher',
u'played',
u'Innumerable',
u'conditioned',
u'player',
u'$1,800',
u'eighteen',
u'London-based',
u'Courtney',
u'Puerto',
u'churchmen',
u'doorman',
u'specter',
u'trusted',
u'Phouma',
u'damaged',
u'recover',
u"Dresbachs'",
u'things',
u'cumulative',
u'rebellion',
u'Newman',
u'Cocktails',
u'harmony',
u'babies',
u'bid',
u'fairly',
u'Budapest',
u'3,399',
u'updated',
u'$9',
u'Maybe',
u'torpedoes',
u'Angeles',
u'photographers',
u'Peking',
u'5-to-2',
u'5-to-3',
u"Stevenses'",
u'furlough',
u'Loewe',
u'vice-president',
u'academic',
u'skidding',
u'echoes',
u'corporate',
u'Fazio',
u'fittest',
u'opinions',
u'spurred',
u'capitol',
u'sleeps',
u'Subsequent',
u'distribute',
u'1981',
u'plight',
u'rushing',
u'succeeding',
u'previous',
u'Colonial',
u'ham',
u'duffer',
u'Oscar',
u'ease',
u'Odell',
u'had',
u'ideal',
u'Leonard',
u'Mohammedanism',
u'Connecticut',
u'collections',
u'easy',
u'prison',
u'has',
u'hat',
u'Channel',
u'Apart',
u'municipal',
u'survival',
u'disagreement',
u'possible',
u"rocket's",
u'firmer',
u'possibly',
u'birth',
u'Missouri',
u'clustered',
u'imposed',
u'unique',
u'$2,170',
u'desire',
u'sliced',
u"bridegroom's",
u'seaside',
u'misled',
u'steps',
u'Shrove',
u"court's",
u'Further',
u'continuation',
u"Louis's",
u'Warren',
u'attorney',
u'right',
u'old',
u'crowd',
u'$1,000,000,000',
u'creed',
u'Expressways',
u'crown',
u'System',
u'culpas',
u'3,325',
u'Conservation',
u'Whatever',
u'Noel',
u'Between',
u"symphony's",
u'enemies',
u'chorus',
u'for',
u'bottom',
u'p.m.',
u'contributing',
u'continue',
u'motorist',
u'summoned',
u'pondered',
u'Heideman',
u'Instant',
u'Calls',
u"ol'",
u'dental',
u'6,000',
u'shifting',
u'defensive',
u'losing',
u'brokerage',
u'manufacturing',
u'shaken',
u'balking',
u'benches',
u'visitors',
u'dollars',
u'citizens',
u'globetrotter',
u'despair',
u'stoked',
u'lacked',
u'slightly',
u'meddle',
u'match',
u'consulting',
u'statements',
u'rationale',
u'Blacks',
u'yen',
u'honeymoon',
u'Scotland',
u'son',
u'freshman',
u'Misses',
u'one-fourth',
u'raiser',
u'raises',
u'sow',
u'stockholder',
u'reducing',
u'defendants',
u'Hank',
u'east',
u'Gulf',
u'lining',
u'support',
u'constantly',
u'busy-work',
u'Hand',
u'symphony',
u"boy's",
u'peddlers',
u'resulted',
u'overhead',
u'happy',
u'Vernor',
u'offer',
u'understandably',
u'forming',
u'Completing',
u'oil',
u'talents',
u'understandable',
u"Field's",
u'Couturier',
u'delegation',
u'duel',
u'121',
u"else's",
u'Toni',
u'inside',
u'officiated',
u'Waldorf-Astoria',
u'County',
u'engulfed',
u'unanimous',
u'Guests',
u'Tony',
u'Andy',
u'Enrique',
u'panels',
u'Weatherford',
u'Stallard',
u'8,293',
u'150',
u'juvenile',
u'adopt',
u'liberal',
u'154',
u'Trooper',
u'Six',
u'proven',
u'Letitia',
u"''",
u'Virgin',
u'Squad',
u'exist',
u'Pittsboro',
u'bats',
u'Sid',
u'segregationist',
u'acacia',
u'dealer',
u'negotiations',
u'McDaniel',
u'protested',
u'eventual',
u'floor',
u'Track',
u'Possible',
u'crowns',
u'flood',
u'Nolan',
u'republic',
u'Friend',
u'ambitious',
u'entomologist',
u'Dame',
u'smell',
u'roll',
u'steamship',
u'124',
u'intend',
u'Merrill',
u'Lenny',
u'models',
u'high-wage',
u'Western-style',
u'Luthuli',
u'acquaintance',
u'Arkansas',
u'undersea',
u'persecution',
u'Fuhrmann',
u'unmatched',
u'Charley',
u'reorganization',
u"Mongolia's",
u'Debutante',
u"workers'",
u'godliness',
u'toll-road',
u'Charles',
u'Quaker',
u"Leavitt's",
u'7:30',
u'time',
u'push',
u"Stephen's",
u'Empire',
u'gown',
u'smelts',
u'chain',
u'Indians',
u'Nicklaus',
u'Nehf',
u'theaters',
u'645-acre',
u'Wabash',
u'Indiana',
u'chair',
u'$278,877,000',
u'competition',
u'Beyeler',
u'ballet',
u'92',
u'8861',
u'900-student',
u'sweat-suits',
u'shouldda',
u'carpenters',
u'Bahi',
u'96',
u'eliminate',
u'Prize',
u'recipe',
u"Howsam's",
u'choice',
u'Lyle',
u'alcoholics',
u'mourn',
u'Reedville',
u'stays',
u'southpaw',
u'fifty',
u'exact',
u'minute',
u'Tau',
u'staffs',
u'1.10.8',
u'commentator',
u'1.10.4',
u'Supply',
u'Tax',
u'1.10.1',
u'leave',
u'solved',
u'depositors',
u'settle',
u'team',
u'prevent',
u'spiritual',
u'$80,738',
u'M.',
u'prediction',
u'sign',
u'Bldg.',
u'3505o',
u'soloists',
u'ogled',
u'Augusta',
u'Lt.',
u'Brady',
u'Associations',
u'Adamson',
u'jeopardy',
u'celebrated',
u'realizing',
u'melt',
u'current',
u'wayward',
u'fifth',
u'Southwest',
u'boost',
u'Lopez',
u'Me',
u'drafted',
u'jury',
u'funeral',
u'understanding',
u'guise',
u'competed',
u'well-springs',
u'yards',
u'address',
u'alone',
u'along',
u'Bow',
u'My',
u'Godwin',
u'Tyson',
u'nitroglycerine',
u'passengers',
u'revenues',
u'Associated',
u'brilliant',
u'saws',
u'studied',
u'wherever',
u'Haddix',
u'Casals',
u'accomplished',
u'studies',
u'influx',
u'tasks',
...]
In [52]:
cfd['romance']['could']
Out[52]:
193
In [ ]:
'''
条件频率分布是一个对许多 NLP 任务都有用的数据结构。表 2-4 总结了它们常用的方法。
表 2-4. NLTK 中的条件频率分布:定义、访问和可视化一个计数的条件频率分布的常用方法和习 惯用法
示例 描述
cfdist= ConditionalFreqDist(pairs) 从配对链表中创建条件频率分布
cfdist.conditions() 将条件按字母排序
c fdist[condition] 此条件下的频率分布
c fdist[condition][sample] 此条件下给定样 的频率
c fdist.tabulate() 为条件频率分布制表
cfdist.tabulate(samples, conditions) 指定样 和条件限制下制表
c fdist.plot() 为条件频率分布绘图
cfdist.plot(samples, conditions) 指定样 和条件限制下绘图
cfdist1 < cfdist2 测试样 在cfdist1中出现次数是否小于在cfdist2中出现次 数
'''
In [ ]:
In [ ]:
In [16]:
#2.5 WordNet 是面向语义的英语词典,类似与传统辞典,但具有更丰富的结构。
#NLTK 中包 括英语 WordNet,共有 155,287 个词和 117,659 个同义词集合。
from nltk.corpus import wordnet as wn
In [17]:
#因此,motorcar 只有一个可能的 义,它被定义为 car.n.01,car 的第一个名词意义。
#car.n.01 被称为 synset 或“同义词集”,意义相同的词(或“词条”)的集合
wn.synsets('motorcar')
Out[17]:
[Synset('car.n.01')]
In [20]:
#查看同义词集合
wn.synset('car.n.01').lemma_names()
Out[20]:
[u'car', u'auto', u'automobile', u'machine', u'motorcar']
In [22]:
#同义词集也有一些一 般的定义
wn.synset('car.n.01').definition()
Out[22]:
u'a motor vehicle with four wheels; usually propelled by an internal combustion engine'
In [23]:
#同义词集也有一些一般例句:
wn.synset('car.n.01').examples()
Out[23]:
[u'he needs a car to get to work']
In [24]:
csys = wn.synset('car.n.01')
In [25]:
csys.lemmas()
Out[25]:
[Lemma('car.n.01.car'),
Lemma('car.n.01.auto'),
Lemma('car.n.01.automobile'),
Lemma('car.n.01.machine'),
Lemma('car.n.01.motorcar')]
In [26]:
#car de 同义词集
asynsets = wn.synsets('car')
In [27]:
asynsets
Out[27]:
[Synset('car.n.01'),
Synset('car.n.02'),
Synset('car.n.03'),
Synset('car.n.04'),
Synset('cable_car.n.01')]
In [28]:
for synset in asynsets:
print synset.lemma_names()
[u'car', u'auto', u'automobile', u'machine', u'motorcar']
[u'car', u'railcar', u'railway_car', u'railroad_car']
[u'car', u'gondola']
[u'car', u'elevator_car']
[u'cable_car', u'car']
In [29]:
#访问所有包 词 car 的词条
wn.lemmas('car')
Out[29]:
[Lemma('car.n.01.car'),
Lemma('car.n.02.car'),
Lemma('car.n.03.car'),
Lemma('car.n.04.car'),
Lemma('cable_car.n.01.car')]
In [30]:
#WordNet WordNet 的同义词集对应于抽象的概念,它们并不总是有对应的英语词汇。
#这些概念在 层次结构中相互联系在一起。一些概念也很一般,如实体、状态、事件;这些被称为独一无 二的根同义词集。
motorcar = wn.synset('car.n.01')
In [31]:
types_of_motorcar = motorcar.hyponyms()
In [32]:
types_of_motorcar[26]
Out[32]:
Synset('stanley_steamer.n.01')
In [34]:
sorted([lemma.name for synset in types_of_motorcar for lemma in synset.lemmas])
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-34-1790d4a6c389> in <module>()
----> 1 sorted([lemma.name for synset in types_of_motorcar for lemma in synset.lemmas])
TypeError: 'instancemethod' object is not iterable
In [36]:
#得到一个最一般的上位(或根上位)同义词集
motorcar.root_hypernyms()
Out[36]:
[Synset('entity.n.01')]
In [38]:
#NLTK 中便捷的图形化 WordNet浏览器:nltk.app.wordnet()。 沿着上位词与下位词之间的链接,探索 WordNet 的层次结构
#nltk.app.wordnet()
Content source: klisly/machinelearning
Similar notebooks: