In [1]:
#第二章 获取文本语料和词汇资源
'''
在自然语言处理的实际项目中,通常要使用大量的语言数据或者语料库。 章的目的是 要回答下列问题:
1. 什么是有用的文 语料和词汇资源,我们如何使用 Python 获取它们?
2. 哪些 Python 结构最适合这项工作?
3. 编写 Python 代码时我们如何避免重复的工作?
'''
%matplotlib inline
import nltk

In [2]:
#### 2.1 
'''
古腾堡语料库
NLTK 包 古腾堡项目(Project Gutenberg)电子文 档案的经过挑选的一小部分文。该项目大约有 25,000(现在是 36,000 了) 
免费电子图书,放在 http://www.gutenberg. org/上。
'''
#列出文本
nltk.corpus.gutenberg.fileids()


Out[2]:
[u'austen-emma.txt',
 u'austen-persuasion.txt',
 u'austen-sense.txt',
 u'bible-kjv.txt',
 u'blake-poems.txt',
 u'bryant-stories.txt',
 u'burgess-busterbrown.txt',
 u'carroll-alice.txt',
 u'chesterton-ball.txt',
 u'chesterton-brown.txt',
 u'chesterton-thursday.txt',
 u'edgeworth-parents.txt',
 u'melville-moby_dick.txt',
 u'milton-paradise.txt',
 u'shakespeare-caesar.txt',
 u'shakespeare-hamlet.txt',
 u'shakespeare-macbeth.txt',
 u'whitman-leaves.txt']

In [3]:
ws = nltk.corpus.gutenberg.words('austen-emma.txt')
len(ws)


Out[3]:
192427

In [ ]:
# 另一种一种方式
from nltk.corpus import gutenberg
gutenberg.fileids()


Out[ ]:
[u'austen-emma.txt',
 u'austen-persuasion.txt',
 u'austen-sense.txt',
 u'bible-kjv.txt',
 u'blake-poems.txt',
 u'bryant-stories.txt',
 u'burgess-busterbrown.txt',
 u'carroll-alice.txt',
 u'chesterton-ball.txt',
 u'chesterton-brown.txt',
 u'chesterton-thursday.txt',
 u'edgeworth-parents.txt',
 u'melville-moby_dick.txt',
 u'milton-paradise.txt',
 u'shakespeare-caesar.txt',
 u'shakespeare-hamlet.txt',
 u'shakespeare-macbeth.txt',
 u'whitman-leaves.txt']

In [ ]:
#通过循环遍历前面列出的 gutenberg 文件标识符链表相应的 fileid,然后计算统计每个文  
#平均词长、平均句子长度和 文中每个词出现的 平均次数
for fileid in gutenberg.fileids():
    num_chars=len(gutenberg.raw(fileid))
    num_words=len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower for w in gutenberg.words(fileid)]))
    print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid


4 24 1 austen-emma.txt
4 26 1 austen-persuasion.txt
4 28 1 austen-sense.txt

In [ ]:
#网络聊天文本
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print fileid,webtext.raw(fileid)[:65],'...'

In [1]:
#即时聊天会话语料库
from nltk.corpus import nps_chat

In [5]:
nps_chat.readme


Out[5]:
<bound method NPSChatCorpusReader.readme of <NPSChatCorpusReader in u'/Users/wizardholy/nltk_data/corpora/nps_chat'>>

In [6]:
chatroom = nps_chat.posts('10-19-20s_706posts.xml')

In [10]:
chatroom[123]


Out[10]:
[u'i',
 u'do',
 u"n't",
 u'want',
 u'hot',
 u'pics',
 u'of',
 u'a',
 u'female',
 u',',
 u'I',
 u'can',
 u'look',
 u'in',
 u'a',
 u'mirror',
 u'.']

In [11]:
nps_chat.readme


Out[11]:
<bound method NPSChatCorpusReader.readme of <NPSChatCorpusReader in u'/Users/wizardholy/nltk_data/corpora/nps_chat'>>

In [12]:
#布朗语料库,首个百万级别的英语电子资料库
'''
ID 文件 文体 描述
A16 ca16 新闻 news Chicago Tribune: Society Reportage
B02 cb02 社论 editorial Christian Science Monitor: Editorials
C17 cc17 评论 reviews Time Magazine: Reviews
D12 cd12 宗教 religion Underwood: Probing the Ethics of Realtors
E36 ce36 爱好 hobbies Norling: Renting a Car in Europe
F25 cf25 传说 lore Boroff: Jewish Teenage Culture
G22 cg22 纯文学 belles_lettres Reiner: Coping with Runaway Technology
H15 ch15 政府 government US Office of Civil and Defence Mobilization: The Fam ily Fallout Shelter
J17 cj19 博览 learned Mosteller: Probability with Statistical Applications
K04 ck04 小说 fiction W.E.B. Du Bois: Worlds of Color
L13 cl13 推理小说 mystery Hitchens: Footsteps in the Night
M01 cm01 科幻 science_fiction Heinlein: Stranger in a Strange Land
N14 cn15 探险 adventure Field: Rattlesnake Ridge
P12 cp12 言情 romance Callaghan: A Passion in Rome
R06 cr06 幽默 humor Thurber: The Future, If Any, of Comedy
'''
from nltk.corpus import brown

In [15]:
brown.categories()


Out[15]:
[u'adventure',
 u'belles_lettres',
 u'editorial',
 u'fiction',
 u'government',
 u'hobbies',
 u'humor',
 u'learned',
 u'lore',
 u'mystery',
 u'news',
 u'religion',
 u'reviews',
 u'romance',
 u'science_fiction']

In [17]:
brown.words(categories='adventure')


Out[17]:
[u'Dan', u'Morgan', u'told', u'himself', u'he', ...]

In [19]:
brown.words(fileids=['ca04'])


Out[19]:
[u'Oslo', u'The', u'most', u'positive', u'element', ...]

In [20]:
brown.sents(fileids=['ca04'])


Out[20]:
[[u'Oslo'], [u'The', u'most', u'positive', u'element', u'to', u'emerge', u'from', u'the', u'Oslo', u'meeting', u'of', u'North', u'Atlantic', u'Treaty', u'Organization', u'Foreign', u'Ministers', u'has', u'been', u'the', u'freer', u',', u'franker', u',', u'and', u'wider', u'discussions', u',', u'animated', u'by', u'much', u'better', u'mutual', u'understanding', u'than', u'in', u'past', u'meetings', u'.'], ...]

In [21]:
#布朗语料库是一个研究文体指尖的系统性差异的很方便的资源
#可以先产生特定问题的计数
news_text = brown.words(categories='news')

In [23]:
fdist = nltk.FreqDist([w.lower() for w in news_text])

In [24]:
modals = ['can','could','may','might','must','will']

In [26]:
for m in modals:
    print m+":",fdist[m],


can: 94 could: 87 may: 93 might: 38 must: 53 will: 389

In [27]:
#下面对每一个感兴趣的问题进行统计,使用nltk的带条件的频率分布函数进行处理
cfd = nltk.ConditionalFreqDist(
    (genre, word) for genre in brown.categories() for word in brown.words(categories=genre)
)

In [28]:
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']

In [29]:
modals = ['can','could','may','might','must','will']

In [30]:
cfd.tabulate(conditions=genres, samples=modals)


                  can could   may might  must  will 
           news    93    86    66    38    50   389 
       religion    82    59    78    12    54    71 
        hobbies   268    58   131    22    83   264 
science_fiction    16    49     4    12     8    16 
        romance    74   193    11    51    45    43 
          humor    16    30     8     8     9    13 

In [32]:
print brown.categories()


[u'adventure', u'belles_lettres', u'editorial', u'fiction', u'government', u'hobbies', u'humor', u'learned', u'lore', u'mystery', u'news', u'religion', u'reviews', u'romance', u'science_fiction']

In [33]:
#路透社语料库路
#透社语料库包  10,788 个新闻文档,共计 130 万字。这些文档分成 90 个主题,按照 “训练”和“测试”分为两组。
from nltk.corpus import reuters

In [34]:
reuters.fileids()


Out[34]:
['test/14826',
 'test/14828',
 'test/14829',
 'test/14832',
 'test/14833',
 'test/14839',
 'test/14840',
 'test/14841',
 'test/14842',
 'test/14843',
 'test/14844',
 'test/14849',
 'test/14852',
 'test/14854',
 'test/14858',
 'test/14859',
 'test/14860',
 'test/14861',
 'test/14862',
 'test/14863',
 'test/14865',
 'test/14867',
 'test/14872',
 'test/14873',
 'test/14875',
 'test/14876',
 'test/14877',
 'test/14881',
 'test/14882',
 'test/14885',
 'test/14886',
 'test/14888',
 'test/14890',
 'test/14891',
 'test/14892',
 'test/14899',
 'test/14900',
 'test/14903',
 'test/14904',
 'test/14907',
 'test/14909',
 'test/14911',
 'test/14912',
 'test/14913',
 'test/14918',
 'test/14919',
 'test/14921',
 'test/14922',
 'test/14923',
 'test/14926',
 'test/14928',
 'test/14930',
 'test/14931',
 'test/14932',
 'test/14933',
 'test/14934',
 'test/14941',
 'test/14943',
 'test/14949',
 'test/14951',
 'test/14954',
 'test/14957',
 'test/14958',
 'test/14959',
 'test/14960',
 'test/14962',
 'test/14963',
 'test/14964',
 'test/14965',
 'test/14967',
 'test/14968',
 'test/14969',
 'test/14970',
 'test/14971',
 'test/14974',
 'test/14975',
 'test/14978',
 'test/14981',
 'test/14982',
 'test/14983',
 'test/14984',
 'test/14985',
 'test/14986',
 'test/14987',
 'test/14988',
 'test/14993',
 'test/14995',
 'test/14998',
 'test/15000',
 'test/15001',
 'test/15002',
 'test/15004',
 'test/15005',
 'test/15006',
 'test/15011',
 'test/15012',
 'test/15013',
 'test/15016',
 'test/15017',
 'test/15020',
 'test/15023',
 'test/15024',
 'test/15026',
 'test/15027',
 'test/15028',
 'test/15029',
 'test/15031',
 'test/15032',
 'test/15033',
 'test/15037',
 'test/15038',
 'test/15043',
 'test/15045',
 'test/15046',
 'test/15048',
 'test/15049',
 'test/15052',
 'test/15053',
 'test/15055',
 'test/15056',
 'test/15060',
 'test/15061',
 'test/15062',
 'test/15063',
 'test/15065',
 'test/15067',
 'test/15069',
 'test/15070',
 'test/15074',
 'test/15077',
 'test/15078',
 'test/15079',
 'test/15082',
 'test/15090',
 'test/15091',
 'test/15092',
 'test/15093',
 'test/15094',
 'test/15095',
 'test/15096',
 'test/15097',
 'test/15103',
 'test/15104',
 'test/15106',
 'test/15107',
 'test/15109',
 'test/15110',
 'test/15111',
 'test/15112',
 'test/15118',
 'test/15119',
 'test/15120',
 'test/15121',
 'test/15122',
 'test/15124',
 'test/15126',
 'test/15128',
 'test/15129',
 'test/15130',
 'test/15132',
 'test/15136',
 'test/15138',
 'test/15141',
 'test/15144',
 'test/15145',
 'test/15146',
 'test/15149',
 'test/15152',
 'test/15153',
 'test/15154',
 'test/15156',
 'test/15157',
 'test/15161',
 'test/15162',
 'test/15171',
 'test/15172',
 'test/15175',
 'test/15179',
 'test/15180',
 'test/15185',
 'test/15188',
 'test/15189',
 'test/15190',
 'test/15193',
 'test/15194',
 'test/15197',
 'test/15198',
 'test/15200',
 'test/15204',
 'test/15205',
 'test/15206',
 'test/15207',
 'test/15208',
 'test/15210',
 'test/15211',
 'test/15212',
 'test/15213',
 'test/15217',
 'test/15219',
 'test/15220',
 'test/15221',
 'test/15222',
 'test/15223',
 'test/15226',
 'test/15227',
 'test/15230',
 'test/15233',
 'test/15234',
 'test/15237',
 'test/15238',
 'test/15239',
 'test/15240',
 'test/15242',
 'test/15243',
 'test/15244',
 'test/15246',
 'test/15247',
 'test/15250',
 'test/15253',
 'test/15254',
 'test/15255',
 'test/15258',
 'test/15259',
 'test/15262',
 'test/15263',
 'test/15264',
 'test/15265',
 'test/15270',
 'test/15271',
 'test/15273',
 'test/15274',
 'test/15276',
 'test/15278',
 'test/15280',
 'test/15281',
 'test/15283',
 'test/15287',
 'test/15290',
 'test/15292',
 'test/15294',
 'test/15295',
 'test/15296',
 'test/15299',
 'test/15300',
 'test/15302',
 'test/15303',
 'test/15306',
 'test/15307',
 'test/15308',
 'test/15309',
 'test/15310',
 'test/15311',
 'test/15312',
 'test/15313',
 'test/15314',
 'test/15315',
 'test/15321',
 'test/15322',
 'test/15324',
 'test/15325',
 'test/15326',
 'test/15327',
 'test/15329',
 'test/15335',
 'test/15336',
 'test/15337',
 'test/15339',
 'test/15341',
 'test/15344',
 'test/15345',
 'test/15348',
 'test/15349',
 'test/15351',
 'test/15352',
 'test/15354',
 'test/15356',
 'test/15357',
 'test/15359',
 'test/15363',
 'test/15364',
 'test/15365',
 'test/15366',
 'test/15367',
 'test/15368',
 'test/15372',
 'test/15375',
 'test/15378',
 'test/15379',
 'test/15380',
 'test/15383',
 'test/15384',
 'test/15386',
 'test/15387',
 'test/15388',
 'test/15389',
 'test/15391',
 'test/15394',
 'test/15396',
 'test/15397',
 'test/15400',
 'test/15404',
 'test/15406',
 'test/15409',
 'test/15410',
 'test/15411',
 'test/15413',
 'test/15415',
 'test/15416',
 'test/15417',
 'test/15420',
 'test/15421',
 'test/15424',
 'test/15425',
 'test/15427',
 'test/15428',
 'test/15429',
 'test/15430',
 'test/15431',
 'test/15432',
 'test/15436',
 'test/15438',
 'test/15441',
 'test/15442',
 'test/15444',
 'test/15446',
 'test/15447',
 'test/15448',
 'test/15449',
 'test/15450',
 'test/15451',
 'test/15452',
 'test/15453',
 'test/15454',
 'test/15455',
 'test/15457',
 'test/15459',
 'test/15460',
 'test/15462',
 'test/15464',
 'test/15467',
 'test/15468',
 'test/15471',
 'test/15472',
 'test/15476',
 'test/15477',
 'test/15478',
 'test/15479',
 'test/15481',
 'test/15482',
 'test/15483',
 'test/15484',
 'test/15485',
 'test/15487',
 'test/15489',
 'test/15494',
 'test/15495',
 'test/15496',
 'test/15500',
 'test/15501',
 'test/15503',
 'test/15504',
 'test/15510',
 'test/15511',
 'test/15515',
 'test/15520',
 'test/15521',
 'test/15522',
 'test/15523',
 'test/15527',
 'test/15528',
 'test/15531',
 'test/15532',
 'test/15535',
 'test/15536',
 'test/15539',
 'test/15540',
 'test/15542',
 'test/15543',
 'test/15544',
 'test/15545',
 'test/15547',
 'test/15548',
 'test/15549',
 'test/15550',
 'test/15551',
 'test/15552',
 'test/15553',
 'test/15556',
 'test/15558',
 'test/15559',
 'test/15560',
 'test/15561',
 'test/15562',
 'test/15563',
 'test/15565',
 'test/15566',
 'test/15567',
 'test/15568',
 'test/15569',
 'test/15570',
 'test/15571',
 'test/15572',
 'test/15573',
 'test/15574',
 'test/15575',
 'test/15578',
 'test/15579',
 'test/15580',
 'test/15581',
 'test/15582',
 'test/15583',
 'test/15584',
 'test/15585',
 'test/15590',
 'test/15591',
 'test/15593',
 'test/15594',
 'test/15595',
 'test/15596',
 'test/15597',
 'test/15598',
 'test/15600',
 'test/15601',
 'test/15602',
 'test/15603',
 'test/15605',
 'test/15607',
 'test/15610',
 'test/15613',
 'test/15615',
 'test/15616',
 'test/15617',
 'test/15618',
 'test/15620',
 'test/15621',
 'test/15623',
 'test/15624',
 'test/15625',
 'test/15626',
 'test/15629',
 'test/15632',
 'test/15634',
 'test/15636',
 'test/15637',
 'test/15639',
 'test/15640',
 'test/15641',
 'test/15642',
 'test/15643',
 'test/15646',
 'test/15648',
 'test/15649',
 'test/15651',
 'test/15653',
 'test/15655',
 'test/15656',
 'test/15664',
 'test/15666',
 'test/15667',
 'test/15668',
 'test/15669',
 'test/15672',
 'test/15674',
 'test/15675',
 'test/15676',
 'test/15677',
 'test/15679',
 'test/15680',
 'test/15682',
 'test/15686',
 'test/15688',
 'test/15689',
 'test/15691',
 'test/15692',
 'test/15694',
 'test/15695',
 'test/15696',
 'test/15698',
 'test/15702',
 'test/15703',
 'test/15704',
 'test/15707',
 'test/15708',
 'test/15709',
 'test/15710',
 'test/15713',
 'test/15715',
 'test/15717',
 'test/15719',
 'test/15720',
 'test/15721',
 'test/15723',
 'test/15725',
 'test/15726',
 'test/15727',
 'test/15728',
 'test/15729',
 'test/15732',
 'test/15733',
 'test/15736',
 'test/15737',
 'test/15739',
 'test/15742',
 'test/15749',
 'test/15751',
 'test/15753',
 'test/15757',
 'test/15759',
 'test/15762',
 'test/15767',
 'test/15768',
 'test/15769',
 'test/15772',
 'test/15777',
 'test/15778',
 'test/15780',
 'test/15782',
 'test/15785',
 'test/15790',
 'test/15793',
 'test/15797',
 'test/15798',
 'test/15800',
 'test/15801',
 'test/15803',
 'test/15804',
 'test/15805',
 'test/15807',
 'test/15808',
 'test/15810',
 'test/15811',
 'test/15816',
 'test/15817',
 'test/15819',
 'test/15821',
 'test/15822',
 'test/15823',
 'test/15829',
 'test/15831',
 'test/15832',
 'test/15833',
 'test/15834',
 'test/15836',
 'test/15838',
 'test/15840',
 'test/15841',
 'test/15842',
 'test/15844',
 'test/15845',
 'test/15846',
 'test/15847',
 'test/15851',
 'test/15852',
 'test/15853',
 'test/15854',
 'test/15855',
 'test/15856',
 'test/15858',
 'test/15859',
 'test/15860',
 'test/15861',
 'test/15863',
 'test/15864',
 'test/15865',
 'test/15866',
 'test/15867',
 'test/15868',
 'test/15869',
 'test/15870',
 'test/15871',
 'test/15872',
 'test/15874',
 'test/15875',
 'test/15876',
 'test/15877',
 'test/15878',
 'test/15879',
 'test/15881',
 'test/15885',
 'test/15886',
 'test/15888',
 'test/15889',
 'test/15890',
 'test/15892',
 'test/15893',
 'test/15894',
 'test/15895',
 'test/15896',
 'test/15897',
 'test/15898',
 'test/15899',
 'test/15900',
 'test/15901',
 'test/15902',
 'test/15903',
 'test/15904',
 'test/15906',
 'test/15908',
 'test/15909',
 'test/15910',
 'test/15911',
 'test/15912',
 'test/15913',
 'test/15914',
 'test/15916',
 'test/15917',
 'test/15918',
 'test/15920',
 'test/15921',
 'test/15922',
 'test/15923',
 'test/15924',
 'test/15925',
 'test/15927',
 'test/15928',
 'test/15929',
 'test/15930',
 'test/15932',
 'test/15933',
 'test/15934',
 'test/15937',
 'test/15939',
 'test/15942',
 'test/15944',
 'test/15949',
 'test/15950',
 'test/15951',
 'test/15952',
 'test/15953',
 'test/15956',
 'test/15959',
 'test/15960',
 'test/15961',
 'test/15963',
 'test/15964',
 'test/15967',
 'test/15968',
 'test/15969',
 'test/15970',
 'test/15973',
 'test/15975',
 'test/15976',
 'test/15977',
 'test/15978',
 'test/15979',
 'test/15980',
 'test/15981',
 'test/15984',
 'test/15985',
 'test/15987',
 'test/15988',
 'test/15989',
 'test/15993',
 'test/15995',
 'test/15996',
 'test/15997',
 'test/15999',
 'test/16002',
 'test/16003',
 'test/16004',
 'test/16005',
 'test/16006',
 'test/16007',
 'test/16009',
 'test/16012',
 'test/16013',
 'test/16014',
 'test/16015',
 'test/16016',
 'test/16021',
 'test/16022',
 'test/16023',
 'test/16026',
 'test/16029',
 'test/16030',
 'test/16033',
 'test/16037',
 'test/16040',
 'test/16041',
 'test/16045',
 'test/16052',
 'test/16053',
 'test/16055',
 'test/16063',
 'test/16066',
 'test/16067',
 'test/16068',
 'test/16069',
 'test/16071',
 'test/16072',
 'test/16074',
 'test/16075',
 'test/16076',
 'test/16077',
 'test/16079',
 'test/16080',
 'test/16083',
 'test/16086',
 'test/16088',
 'test/16091',
 'test/16093',
 'test/16094',
 'test/16095',
 'test/16096',
 'test/16097',
 'test/16098',
 'test/16099',
 'test/16100',
 'test/16103',
 'test/16106',
 'test/16107',
 'test/16108',
 'test/16110',
 'test/16111',
 'test/16112',
 'test/16115',
 'test/16117',
 'test/16118',
 'test/16119',
 'test/16120',
 'test/16122',
 'test/16123',
 'test/16125',
 'test/16126',
 'test/16130',
 'test/16133',
 'test/16134',
 'test/16136',
 'test/16139',
 'test/16140',
 'test/16141',
 'test/16142',
 'test/16143',
 'test/16144',
 'test/16145',
 'test/16146',
 'test/16147',
 'test/16148',
 'test/16149',
 'test/16150',
 'test/16152',
 'test/16155',
 'test/16158',
 'test/16159',
 'test/16161',
 'test/16162',
 'test/16163',
 'test/16164',
 'test/16166',
 'test/16170',
 'test/16171',
 'test/16172',
 'test/16173',
 'test/16175',
 'test/16176',
 'test/16177',
 'test/16179',
 'test/16180',
 'test/16185',
 'test/16188',
 'test/16189',
 'test/16190',
 'test/16193',
 'test/16194',
 'test/16195',
 'test/16196',
 'test/16197',
 'test/16200',
 'test/16201',
 'test/16202',
 'test/16203',
 'test/16206',
 'test/16207',
 'test/16210',
 'test/16211',
 'test/16212',
 'test/16213',
 'test/16214',
 'test/16215',
 'test/16216',
 'test/16219',
 'test/16221',
 'test/16223',
 'test/16225',
 'test/16226',
 'test/16228',
 'test/16230',
 'test/16232',
 'test/16233',
 'test/16234',
 'test/16236',
 'test/16238',
 'test/16241',
 'test/16243',
 'test/16244',
 'test/16246',
 'test/16247',
 'test/16248',
 'test/16250',
 'test/16251',
 'test/16252',
 'test/16255',
 'test/16256',
 'test/16257',
 'test/16258',
 'test/16260',
 'test/16262',
 'test/16263',
 'test/16264',
 'test/16265',
 'test/16266',
 'test/16268',
 'test/16269',
 'test/16270',
 'test/16271',
 'test/16274',
 'test/16275',
 'test/16277',
 'test/16278',
 'test/16279',
 'test/16281',
 'test/16282',
 'test/16283',
 'test/16284',
 'test/16285',
 'test/16286',
 'test/16287',
 'test/16288',
 'test/16289',
 'test/16291',
 'test/16294',
 'test/16297',
 'test/16298',
 'test/16299',
 'test/16300',
 'test/16301',
 'test/16302',
 'test/16303',
 'test/16304',
 'test/16307',
 'test/16310',
 'test/16311',
 'test/16312',
 'test/16314',
 'test/16315',
 'test/16316',
 'test/16317',
 'test/16318',
 'test/16319',
 'test/16320',
 'test/16324',
 'test/16327',
 'test/16331',
 'test/16332',
 'test/16336',
 'test/16337',
 'test/16339',
 'test/16342',
 'test/16343',
 'test/16346',
 'test/16347',
 'test/16348',
 'test/16350',
 'test/16354',
 'test/16357',
 'test/16359',
 'test/16360',
 'test/16362',
 'test/16363',
 'test/16365',
 'test/16366',
 'test/16367',
 'test/16369',
 'test/16370',
 'test/16371',
 'test/16372',
 'test/16374',
 'test/16376',
 'test/16377',
 'test/16379',
 'test/16380',
 'test/16383',
 'test/16385',
 'test/16386',
 'test/16388',
 'test/16390',
 'test/16392',
 'test/16393',
 'test/16394',
 'test/16395',
 'test/16396',
 'test/16398',
 'test/16399',
 'test/16400',
 'test/16401',
 'test/16402',
 'test/16403',
 'test/16404',
 'test/16405',
 'test/16406',
 'test/16407',
 'test/16409',
 'test/16410',
 'test/16415',
 'test/16417',
 'test/16418',
 'test/16419',
 'test/16420',
 'test/16421',
 'test/16422',
 'test/16424',
 'test/16426',
 'test/16427',
 'test/16428',
 'test/16429',
 'test/16430',
 'test/16432',
 'test/16433',
 'test/16434',
 'test/16437',
 'test/16438',
 'test/16440',
 'test/16441',
 'test/16442',
 'test/16443',
 'test/16444',
 'test/16448',
 'test/16449',
 'test/16450',
 'test/16454',
 'test/16457',
 'test/16458',
 'test/16459',
 'test/16460',
 'test/16461',
 'test/16463',
 'test/16465',
 'test/16468',
 'test/16469',
 'test/16470',
 'test/16471',
 'test/16472',
 'test/16473',
 'test/16475',
 'test/16476',
 'test/16478',
 'test/16479',
 'test/16480',
 'test/16481',
 'test/16483',
 'test/16486',
 'test/16487',
 'test/16488',
 'test/16490',
 'test/16492',
 'test/16493',
 'test/16495',
 'test/16496',
 'test/16499',
 'test/16502',
 'test/16505',
 'test/16510',
 'test/16512',
 'test/16513',
 'test/16518',
 'test/16519',
 'test/16521',
 'test/16522',
 'test/16523',
 'test/16525',
 'test/16527',
 'test/16530',
 'test/16531',
 'test/16533',
 'test/16538',
 'test/16539',
 'test/16545',
 'test/16546',
 'test/16549',
 'test/16551',
 'test/16554',
 'test/16555',
 'test/16561',
 'test/16563',
 'test/16564',
 'test/16565',
 'test/16568',
 'test/16569',
 'test/16570',
 'test/16574',
 'test/16577',
 'test/16581',
 'test/16583',
 'test/16584',
 'test/16585',
 'test/16587',
 'test/16588',
 'test/16589',
 'test/16590',
 'test/16591',
 ...]

In [35]:
reuters.categories()


Out[35]:
[u'acq',
 u'alum',
 u'barley',
 u'bop',
 u'carcass',
 u'castor-oil',
 u'cocoa',
 u'coconut',
 u'coconut-oil',
 u'coffee',
 u'copper',
 u'copra-cake',
 u'corn',
 u'cotton',
 u'cotton-oil',
 u'cpi',
 u'cpu',
 u'crude',
 u'dfl',
 u'dlr',
 u'dmk',
 u'earn',
 u'fuel',
 u'gas',
 u'gnp',
 u'gold',
 u'grain',
 u'groundnut',
 u'groundnut-oil',
 u'heat',
 u'hog',
 u'housing',
 u'income',
 u'instal-debt',
 u'interest',
 u'ipi',
 u'iron-steel',
 u'jet',
 u'jobs',
 u'l-cattle',
 u'lead',
 u'lei',
 u'lin-oil',
 u'livestock',
 u'lumber',
 u'meal-feed',
 u'money-fx',
 u'money-supply',
 u'naphtha',
 u'nat-gas',
 u'nickel',
 u'nkr',
 u'nzdlr',
 u'oat',
 u'oilseed',
 u'orange',
 u'palladium',
 u'palm-oil',
 u'palmkernel',
 u'pet-chem',
 u'platinum',
 u'potato',
 u'propane',
 u'rand',
 u'rape-oil',
 u'rapeseed',
 u'reserves',
 u'retail',
 u'rice',
 u'rubber',
 u'rye',
 u'ship',
 u'silver',
 u'sorghum',
 u'soy-meal',
 u'soy-oil',
 u'soybean',
 u'strategic-metal',
 u'sugar',
 u'sun-meal',
 u'sun-oil',
 u'sunseed',
 u'tea',
 u'tin',
 u'trade',
 u'veg-oil',
 u'wheat',
 u'wpi',
 u'yen',
 u'zinc']

In [42]:
#与布朗语料库不同,路透社语料库的类别是有互相重叠的,只是因为新闻报道往往涉及
#多个主题。我们可以查找由一个或多个文档涵盖的主题,也可以查找包 在一个或多个类别中的文档。
reuters.categories('test/14833')


Out[42]:
[u'palm-oil', u'veg-oil']

In [43]:
reuters.categories(['test/14833','test/15259'])


Out[43]:
[u'earn', u'palm-oil', u'veg-oil']

In [44]:
reuters.fileids('tea')


Out[44]:
[u'test/16225',
 u'test/17494',
 u'test/19672',
 u'test/19982',
 u'training/10268',
 u'training/10406',
 u'training/12754',
 u'training/12907',
 u'training/235',
 u'training/275',
 u'training/7545',
 u'training/9153',
 u'training/9327']

In [45]:
reuters.fileids(['tea','tin'])


Out[45]:
[u'test/14832',
 u'test/14844',
 u'test/14877',
 u'test/15112',
 u'test/15219',
 u'test/15624',
 u'test/15817',
 u'test/16225',
 u'test/17494',
 u'test/17731',
 u'test/18924',
 u'test/19065',
 u'test/19367',
 u'test/19672',
 u'test/19982',
 u'test/20458',
 u'training/10268',
 u'training/10332',
 u'training/10406',
 u'training/11224',
 u'training/11801',
 u'training/12754',
 u'training/12907',
 u'training/13185',
 u'training/1929',
 u'training/235',
 u'training/275',
 u'training/311',
 u'training/4122',
 u'training/688',
 u'training/6934',
 u'training/7533',
 u'training/7545',
 u'training/7592',
 u'training/7877',
 u'training/8055',
 u'training/8415',
 u'training/8416',
 u'training/8427',
 u'training/8933',
 u'training/908',
 u'training/9153',
 u'training/9327']

In [46]:
reuters.words('training/9865')[:14]


Out[46]:
[u'FRENCH',
 u'FREE',
 u'MARKET',
 u'CEREAL',
 u'EXPORT',
 u'BIDS',
 u'DETAILED',
 u'French',
 u'operators',
 u'have',
 u'requested',
 u'licences',
 u'to',
 u'export']

In [47]:
reuters.words(['training/9865', 'training/9880'])


Out[47]:
[u'FRENCH', u'FREE', u'MARKET', u'CEREAL', u'EXPORT', ...]

In [48]:
reuters.words(categories='barley')


Out[48]:
[u'FRENCH', u'FREE', u'MARKET', u'CEREAL', u'EXPORT', ...]

In [49]:
reuters.words(categories=['barley', 'corn'])


Out[49]:
[u'THAI', u'TRADE', u'DEFICIT', u'WIDENS', u'IN', ...]

In [51]:
#就职演说语料库
from nltk.corpus import inaugural

In [52]:
inaugural.fileids()


Out[52]:
[u'1789-Washington.txt',
 u'1793-Washington.txt',
 u'1797-Adams.txt',
 u'1801-Jefferson.txt',
 u'1805-Jefferson.txt',
 u'1809-Madison.txt',
 u'1813-Madison.txt',
 u'1817-Monroe.txt',
 u'1821-Monroe.txt',
 u'1825-Adams.txt',
 u'1829-Jackson.txt',
 u'1833-Jackson.txt',
 u'1837-VanBuren.txt',
 u'1841-Harrison.txt',
 u'1845-Polk.txt',
 u'1849-Taylor.txt',
 u'1853-Pierce.txt',
 u'1857-Buchanan.txt',
 u'1861-Lincoln.txt',
 u'1865-Lincoln.txt',
 u'1869-Grant.txt',
 u'1873-Grant.txt',
 u'1877-Hayes.txt',
 u'1881-Garfield.txt',
 u'1885-Cleveland.txt',
 u'1889-Harrison.txt',
 u'1893-Cleveland.txt',
 u'1897-McKinley.txt',
 u'1901-McKinley.txt',
 u'1905-Roosevelt.txt',
 u'1909-Taft.txt',
 u'1913-Wilson.txt',
 u'1917-Wilson.txt',
 u'1921-Harding.txt',
 u'1925-Coolidge.txt',
 u'1929-Hoover.txt',
 u'1933-Roosevelt.txt',
 u'1937-Roosevelt.txt',
 u'1941-Roosevelt.txt',
 u'1945-Roosevelt.txt',
 u'1949-Truman.txt',
 u'1953-Eisenhower.txt',
 u'1957-Eisenhower.txt',
 u'1961-Kennedy.txt',
 u'1965-Johnson.txt',
 u'1969-Nixon.txt',
 u'1973-Nixon.txt',
 u'1977-Carter.txt',
 u'1981-Reagan.txt',
 u'1985-Reagan.txt',
 u'1989-Bush.txt',
 u'1993-Clinton.txt',
 u'1997-Clinton.txt',
 u'2001-Bush.txt',
 u'2005-Bush.txt',
 u'2009-Obama.txt']

In [53]:
[fileid[:4] for fileid in inaugural.fileids()]


Out[53]:
[u'1789',
 u'1793',
 u'1797',
 u'1801',
 u'1805',
 u'1809',
 u'1813',
 u'1817',
 u'1821',
 u'1825',
 u'1829',
 u'1833',
 u'1837',
 u'1841',
 u'1845',
 u'1849',
 u'1853',
 u'1857',
 u'1861',
 u'1865',
 u'1869',
 u'1873',
 u'1877',
 u'1881',
 u'1885',
 u'1889',
 u'1893',
 u'1897',
 u'1901',
 u'1905',
 u'1909',
 u'1913',
 u'1917',
 u'1921',
 u'1925',
 u'1929',
 u'1933',
 u'1937',
 u'1941',
 u'1945',
 u'1949',
 u'1953',
 u'1957',
 u'1961',
 u'1965',
 u'1969',
 u'1973',
 u'1977',
 u'1981',
 u'1985',
 u'1989',
 u'1993',
 u'1997',
 u'2001',
 u'2005',
 u'2009']

In [55]:
#让我们来看看词汇 america 和 citizen 随时间推移的使用情况。下面的代码使用 w.lowe
#r()将就职演说语料库中的词汇转换成小写。
#然后用 startswith()检查它们是否以“目 标”词汇 america 或 citizen 开始
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids() 
    for w in inaugural.words(fileid) 
    for target in ['america', 'citizen']
    if w.lower().startswith(target)
)

In [58]:
cfd.plot()



In [2]:
#其他语言的语料库
nltk.corpus.cess_esp.words()


Out[2]:
[u'El', u'grupo', u'estatal', ...]

In [3]:
nltk.corpus.floresta.words()


Out[3]:
[u'Um', u'revivalismo', u'refrescante', u'O', ...]

In [4]:
nltk.corpus.indian.words('hindi.pos')


Out[4]:
[u'\u092a\u0942\u0930\u094d\u0923', u'\u092a\u094d\u0930\u0924\u093f\u092c\u0902\u0927', ...]

In [5]:
nltk.corpus.udhr.fileids()


Out[5]:
[u'Abkhaz-Cyrillic+Abkh',
 u'Abkhaz-UTF8',
 u'Achehnese-Latin1',
 u'Achuar-Shiwiar-Latin1',
 u'Adja-UTF8',
 u'Afaan_Oromo_Oromiffa-Latin1',
 u'Afrikaans-Latin1',
 u'Aguaruna-Latin1',
 u'Akuapem_Twi-UTF8',
 u'Albanian_Shqip-Latin1',
 u'Amahuaca',
 u'Amahuaca-Latin1',
 u'Amarakaeri-Latin1',
 u'Amuesha-Yanesha-UTF8',
 u'Arabela-Latin1',
 u'Arabic_Alarabia-Arabic',
 u'Asante-UTF8',
 u'Ashaninca-Latin1',
 u'Asheninca-Latin1',
 u'Asturian_Bable-Latin1',
 u'Aymara-Latin1',
 u'Balinese-Latin1',
 u'Bambara-UTF8',
 u'Baoule-UTF8',
 u'Basque_Euskara-Latin1',
 u'Batonu_Bariba-UTF8',
 u'Belorus_Belaruski-Cyrillic',
 u'Belorus_Belaruski-UTF8',
 u'Bemba-Latin1',
 u'Bengali-UTF8',
 u'Beti-UTF8',
 u'Bichelamar-Latin1',
 u'Bikol_Bicolano-Latin1',
 u'Bora-Latin1',
 u'Bosnian_Bosanski-Cyrillic',
 u'Bosnian_Bosanski-Latin2',
 u'Bosnian_Bosanski-UTF8',
 u'Breton-Latin1',
 u'Bugisnese-Latin1',
 u'Bulgarian_Balgarski-Cyrillic',
 u'Bulgarian_Balgarski-UTF8',
 u'Cakchiquel-Latin1',
 u'Campa_Pajonalino-Latin1',
 u'Candoshi-Shapra-Latin1',
 u'Caquinte-Latin1',
 u'Cashibo-Cacataibo-Latin1',
 u'Cashinahua-Latin1',
 u'Catalan-Latin1',
 u'Catalan_Catala-Latin1',
 u'Cebuano-Latin1',
 u'Chamorro-Latin1',
 u'Chayahuita-Latin1',
 u'Chechewa_Nyanja-Latin1',
 u'Chickasaw-Latin1',
 u'Chinanteco-Ajitlan-Latin1',
 u'Chinanteco-UTF8',
 u'Chinese_Mandarin-GB2312',
 u'Chuuk_Trukese-Latin1',
 u'Cokwe-Latin1',
 u'Corsican-Latin1',
 u'Croatian_Hrvatski-Latin2',
 u'Czech-Latin2',
 u'Czech-UTF8',
 u'Czech_Cesky-Latin2',
 u'Czech_Cesky-UTF8',
 u'Dagaare-UTF8',
 u'Dagbani-UTF8',
 u'Dangme-UTF8',
 u'Danish_Dansk-Latin1',
 u'Dendi-UTF8',
 u'Ditammari-UTF8',
 u'Dutch_Nederlands-Latin1',
 u'Edo-Latin1',
 u'English-Latin1',
 u'Esperanto-UTF8',
 u'Estonian_Eesti-Latin1',
 u'Ewe_Eve-UTF8',
 u'Fante-UTF8',
 u'Faroese-Latin1',
 u'Farsi_Persian-UTF8',
 u'Farsi_Persian-v2-UTF8',
 u'Fijian-Latin1',
 u'Filipino_Tagalog-Latin1',
 u'Finnish_Suomi-Latin1',
 u'Fon-UTF8',
 u'French_Francais-Latin1',
 u'Frisian-Latin1',
 u'Friulian_Friulano-Latin1',
 u'Ga-UTF8',
 u'Gagauz_Gagauzi-UTF8',
 u'Galician_Galego-Latin1',
 u'Garifuna_Garifuna-Latin1',
 u'German_Deutsch-Latin1',
 u'Gonja-UTF8',
 u'Greek_Ellinika-Greek',
 u'Greek_Ellinika-UTF8',
 u'Greenlandic_Inuktikut-Latin1',
 u'Guarani-Latin1',
 u'Guen_Mina-UTF8',
 u'HaitianCreole_Kreyol-Latin1',
 u'HaitianCreole_Popular-Latin1',
 u'Hani-Latin1',
 u'Hausa_Haoussa-Latin1',
 u'Hawaiian-UTF8',
 u'Hebrew_Ivrit-Hebrew',
 u'Hebrew_Ivrit-UTF8',
 u'Hiligaynon-Latin1',
 u'Hindi-UTF8',
 u'Hindi_web-UTF8',
 u'Hmong_Miao-Sichuan-Guizhou-Yunnan-Latin1',
 u'Hmong_Miao-SouthernEast-Guizhou-Latin1',
 u'Hmong_Miao_Northern-East-Guizhou-Latin1',
 u'Hrvatski_Croatian-Latin2',
 u'Huasteco-Latin1',
 u'Huitoto_Murui-Latin1',
 u'Hungarian_Magyar-Latin1',
 u'Hungarian_Magyar-Latin2',
 u'Hungarian_Magyar-UTF8',
 u'Ibibio_Efik-Latin1',
 u'Icelandic_Yslenska-Latin1',
 u'Ido-Latin1',
 u'Igbo-UTF8',
 u'Iloko_Ilocano-Latin1',
 u'Indonesian-Latin1',
 u'Interlingua-Latin1',
 u'Inuktikut_Greenlandic-Latin1',
 u'IrishGaelic_Gaeilge-Latin1',
 u'Italian-Latin1',
 u'Italian_Italiano-Latin1',
 u'Japanese_Nihongo-EUC',
 u'Japanese_Nihongo-SJIS',
 u'Japanese_Nihongo-UTF8',
 u'Javanese-Latin1',
 u'Jola-Fogny_Diola-UTF8',
 u'Kabye-UTF8',
 u'Kannada-UTF8',
 u'Kaonde-Latin1',
 u'Kapampangan-Latin1',
 u'Kasem-UTF8',
 u'Kazakh-Cyrillic',
 u'Kazakh-UTF8',
 u'Kiche_Quiche-Latin1',
 u'Kicongo-Latin1',
 u'Kimbundu_Mbundu-Latin1',
 u'Kinyamwezi_Nyamwezi-Latin1',
 u'Kinyarwanda-Latin1',
 u'Kituba-Latin1',
 u'Korean_Hankuko-UTF8',
 u'Kpelewo-UTF8',
 u'Krio-UTF8',
 u'Kurdish-UTF8',
 u'Lamnso_Lam-nso-UTF8',
 u'Latin_Latina-Latin1',
 u'Latin_Latina-v2-Latin1',
 u'Latvian-Latin1',
 u'Limba-UTF8',
 u'Lingala-Latin1',
 u'Lithuanian_Lietuviskai-Baltic',
 u'Lozi-Latin1',
 u'Luba-Kasai_Tshiluba-Latin1',
 u'Luganda_Ganda-Latin1',
 u'Lunda_Chokwe-lunda-Latin1',
 u'Luvale-Latin1',
 u'Luxembourgish_Letzebuergeusch-Latin1',
 u'Macedonian-UTF8',
 u'Madurese-Latin1',
 u'Makonde-Latin1',
 u'Malagasy-Latin1',
 u'Malay_BahasaMelayu-Latin1',
 u'Maltese-UTF8',
 u'Mam-Latin1',
 u'Maninka-UTF8',
 u'Maori-Latin1',
 u'Mapudungun_Mapuzgun-Latin1',
 u'Mapudungun_Mapuzgun-UTF8',
 u'Marshallese-Latin1',
 u'Matses-Latin1',
 u'Mayan_Yucateco-Latin1',
 u'Mazahua_Jnatrjo-UTF8',
 u'Mazateco-Latin1',
 u'Mende-UTF8',
 u'Mikmaq_Micmac-Mikmaq-Latin1',
 u'Minangkabau-Latin1',
 u'Miskito_Miskito-Latin1',
 u'Mixteco-Latin1',
 u'Mongolian_Khalkha-Cyrillic',
 u'Mongolian_Khalkha-UTF8',
 u'Moore_More-UTF8',
 u'Nahuatl-Latin1',
 u'Ndebele-Latin1',
 u'Nepali-UTF8',
 u'Ngangela_Nyemba-Latin1',
 u'NigerianPidginEnglish-Latin1',
 u'Nomatsiguenga-Latin1',
 u'NorthernSotho_Pedi-Sepedi-Latin1',
 u'Norwegian-Latin1',
 u'Norwegian_Norsk-Bokmal-Latin1',
 u'Norwegian_Norsk-Nynorsk-Latin1',
 u'Nyanja_Chechewa-Latin1',
 u'Nyanja_Chinyanja-Latin1',
 u'Nzema-UTF8',
 u'OccitanAuvergnat-Latin1',
 u'OccitanLanguedocien-Latin1',
 u'Oromiffa_AfaanOromo-Latin1',
 u'Osetin_Ossetian-UTF8',
 u'Oshiwambo_Ndonga-Latin1',
 u'Otomi_Nahnu-Latin1',
 u'Paez-Latin1',
 u'Palauan-Latin1',
 u'Peuhl-UTF8',
 u'Picard-Latin1',
 u'Pipil-Latin1',
 u'Polish-Latin2',
 u'Polish_Polski-Latin2',
 u'Ponapean-Latin1',
 u'Portuguese_Portugues-Latin1',
 u'Pulaar-UTF8',
 u'Punjabi_Panjabi-UTF8',
 u'Purhepecha-UTF8',
 u'Qechi_Kekchi-Latin1',
 u'Quechua-Latin1',
 u'Quichua-Latin1',
 u'Rarotongan_MaoriCookIslands-Latin1',
 u'Rhaeto-Romance_Rumantsch-Latin1',
 u'Romani-Latin1',
 u'Romani-UTF8',
 u'Romanian-Latin2',
 u'Romanian_Romana-Latin2',
 u'Rukonzo_Konjo-Latin1',
 u'Rundi_Kirundi-Latin1',
 u'Runyankore-rukiga_Nkore-kiga-Latin1',
 u'Russian-Cyrillic',
 u'Russian-UTF8',
 u'Russian_Russky-Cyrillic',
 u'Russian_Russky-UTF8',
 u'Sami_Lappish-UTF8',
 u'Sammarinese-Latin1',
 u'Samoan-Latin1',
 u'Sango_Sangho-Latin1',
 u'Sanskrit-UTF8',
 u'Saraiki-UTF8',
 u'Sardinian-Latin1',
 u'ScottishGaelic_GaidhligAlbanach-Latin1',
 u'Seereer-UTF8',
 u'Serbian_Srpski-Cyrillic',
 u'Serbian_Srpski-Latin2',
 u'Serbian_Srpski-UTF8',
 u'Sharanahua-Latin1',
 u'Shipibo-Conibo-Latin1',
 u'Shona-Latin1',
 u'Sinhala-UTF8',
 u'Siswati-Latin1',
 u'Slovak-Latin2',
 u'Slovak_Slovencina-Latin2',
 u'Slovenian_Slovenscina-Latin2',
 u'SolomonsPidgin_Pijin-Latin1',
 u'Somali-Latin1',
 u'Soninke_Soninkanxaane-UTF8',
 u'Sorbian-Latin2',
 u'SouthernSotho_Sotho-Sesotho-Sutu-Sesutu-Latin1',
 u'Spanish-Latin1',
 u'Spanish_Espanol-Latin1',
 u'Sukuma-Latin1',
 u'Sundanese-Latin1',
 u'Sussu_Soussou-Sosso-Soso-Susu-UTF8',
 u'Swaheli-Latin1',
 u'Swahili_Kiswahili-Latin1',
 u'Swedish_Svenska-Latin1',
 u'Tahitian-UTF8',
 u'Tenek_Huasteco-Latin1',
 u'Tetum-Latin1',
 u'Themne_Temne-UTF8',
 u'Tiv-Latin1',
 u'Toba-UTF8',
 u'Tojol-abal-Latin1',
 u'TokPisin-Latin1',
 u'Tonga-Latin1',
 u'Tongan_Tonga-Latin1',
 u'Totonaco-Latin1',
 u'Trukese_Chuuk-Latin1',
 u'Turkish_Turkce-Turkish',
 u'Turkish_Turkce-UTF8',
 u'Tzeltal-Latin1',
 u'Tzotzil-Latin1',
 u'Uighur_Uyghur-Latin1',
 u'Uighur_Uyghur-UTF8',
 u'Ukrainian-Cyrillic',
 u'Ukrainian-UTF8',
 u'Umbundu-Latin1',
 u'Urarina-Latin1',
 u'Uzbek-Latin1',
 u'Vietnamese-ALRN-UTF8',
 u'Vietnamese-UTF8',
 u'Vlach-Latin1',
 u'Walloon_Wallon-Latin1',
 u'Wama-UTF8',
 u'Waray-Latin1',
 u'Wayuu-Latin1',
 u'Welsh_Cymraeg-Latin1',
 u'WesternSotho_Tswana-Setswana-Latin1',
 u'Wolof-Latin1',
 u'Xhosa-Latin1',
 u'Yagua-Latin1',
 u'Yao-Latin1',
 u'Yapese-Latin1',
 u'Yoruba-UTF8',
 u'Zapoteco-Latin1',
 u'Zapoteco-SanLucasQuiavini-Latin1',
 u'Zhuang-Latin1',
 u'Zulu-Latin1']

In [6]:
#udhr,是超过 300 种语言的世界人权宣言。语料库的 fileids 包括有关文件所使用的字符编码,如:UTF8 或者 Latin1。
nltk.corpus.udhr.words('Javanese-Latin1')[11:]


Out[6]:
[u'Saben', u'umat', u'manungsa', u'lair', u'kanthi', ...]

In [7]:
from nltk.corpus import udhr

In [8]:
languages = ['Chickasaw', 'English', 'German_Deutsch','Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']

In [9]:
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))

In [10]:
cfd.plot()



In [11]:
cfd.plot(cumulative = True)



In [14]:
'''
NLTK中定义的基 语料库函数:使用help(nltk.corpus.reader)可以找到更多的文档, 也可以阅读 http://www.nltk.org/howto 上的在线语料库的 HOWTO。
 
示例 描述
fileids() 语料库中的文件
fileids([categories]) 这些分类对应的语料库中的文件
categories() 语料库中的分类
categories([fileids]) 这些文件对应的语料库中的分类
raw() 语料库的原始内容
raw(fileids=[f1,f2,f3]) 指定文件的原始内容
raw(categories=[c1,c2]) 指定分类的原始内容
words() 整个语料库中的词汇
words(fileids=[f1,f2,f3]) 指定文件中的词汇
words(categories=[c1,c2]) 指定分类中的词汇
sents() 指定分类中的句子
sents(fileids=[f1,f2,f3]) 指定文件中的句子
sents(categories=[c1,c2]) 指定分类中的句子
abspath(file id) 指定文件在磁盘上的位置
enc oding(fileid) 文件的编码(如果知道的话)
open(fileid) 打开指定语料库文件的文件流
root() 到 地安装的语料库根目录的路径
'''
#使用PlaintextCorpusReader加载自己的语料库
#公开发行的语料库的重要来源是语言数据联盟((LDC)和欧洲语言资源局(EL RA)。提供几十种语言的数以百计的已标注文 和语音语料库。
print ''




In [43]:
#2.2 条件频率分布
#条件和事件 频率分布计算观察到的事件,如文 中出现的词汇。条件频率分布需要给每个时间关联 一个条件,
#所以不是处理一个词序列,我们必须处理的是一个配对序列。

#按文体计数词汇
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist( 
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genre_word = [(genre, word)
              for genre in ['news', 'romance']
              for word in brown.words(categories=genre)] 
print len(genre_word)


170576

In [44]:
genre_word[:4]


Out[44]:
[('news', u'The'),
 ('news', u'Fulton'),
 ('news', u'County'),
 ('news', u'Grand')]

In [45]:
genre_word[-4:]


Out[45]:
[('romance', u'afraid'),
 ('romance', u'not'),
 ('romance', u"''"),
 ('romance', u'.')]

In [46]:
cfd = nltk.ConditionalFreqDist(genre_word)

In [47]:
cfd.conditions()


Out[47]:
['romance', 'news']

In [48]:
cfd['news']


Out[48]:
FreqDist({u'sunbonnet': 1,
          u'Elevated': 1,
          u'narcotic': 2,
          u'four': 73,
          u'woods': 4,
          u'railing': 1,
          u'Until': 5,
          u'aggression': 1,
          u'marching': 2,
          u'increase': 24,
          u'eligible': 4,
          u'electricity': 1,
          u'$25-a-plate': 1,
          u'wheeled': 2,
          u'Casey': 6,
          u'all-county': 1,
          u'Belgians': 20,
          u'Western': 7,
          u'dependency': 2,
          u'1959-60': 1,
          u'Duhagon': 1,
          u'sinking': 1,
          u'1,119': 1,
          u'co-operation': 1,
          u'Famed': 1,
          u'regional': 2,
          u'Charitable': 1,
          u'appropriation': 2,
          u'yellow': 3,
          u'Old': 11,
          u'Heights': 1,
          u'bringing': 2,
          u'Policies': 2,
          u'prize': 5,
          u'Loen': 1,
          u'Publique': 2,
          u'wooden': 1,
          u'Loeb': 1,
          u'specialties': 1,
          u'Sands': 1,
          u'succession': 1,
          u'Paul': 6,
          u'shows': 4,
          u'commented': 7,
          u'Screw': 1,
          u'charter': 15,
          u'Oslo': 5,
          u'tired': 3,
          u'pulse': 1,
          u'tires': 3,
          u'271': 1,
          u'second': 35,
          u'273': 1,
          u'Pampa': 2,
          u'DiVarco': 1,
          u'errors': 8,
          u'Initially': 1,
          u'Lucille': 2,
          u'boogie': 1,
          u'contributed': 5,
          u'Seekonk': 2,
          u'Hamilton': 2,
          u'designing': 2,
          u'College': 20,
          u'increasing': 2,
          u'Presidential': 2,
          u'dispatched': 3,
          u'hero': 7,
          u'Sioux': 1,
          u'Foundation': 3,
          u'Munoz': 1,
          u'error': 2,
          u'here': 67,
          u'reported': 28,
          u'affiliated': 1,
          u'Footnotes': 1,
          u'Stephanie': 2,
          u'doldrums': 1,
          u'cyclical': 1,
          u'kids': 6,
          u'Fernberger': 1,
          u'elaborate': 3,
          u'climbed': 2,
          u'reports': 12,
          u'controversy': 5,
          u'Boxwood': 1,
          u'military': 30,
          u'Walters': 1,
          u'Isles': 1,
          u'rebel': 3,
          u'golden': 5,
          u'Quincy': 1,
          u'ground': 10,
          u'Harvey': 7,
          u'explained': 13,
          u'precincts': 4,
          u'Three': 4,
          u'replace': 4,
          u'brought': 21,
          u'beneficiaries': 1,
          u'Basic': 1,
          u'Wales': 2,
          u'Basin': 2,
          u'unit': 10,
          u'opponents': 1,
          u'Ronald': 2,
          u'Callan': 1,
          u'spoke': 4,
          u'tardiness': 1,
          u'Slate': 5,
          u'century': 2,
          u'Admitting': 1,
          u'Anticipated': 1,
          u'occupying': 1,
          u'Vernon': 4,
          u'Tex.': 5,
          u'music': 12,
          u'therefore': 7,
          u'passport': 1,
          u'unfortunately': 1,
          u'strike': 12,
          u'heralded': 1,
          u'until': 28,
          u'Tudor': 1,
          u'Stepanovich': 2,
          u'females': 1,
          u'Christine': 2,
          u'successful': 10,
          u'brings': 1,
          u'whirling': 1,
          u'Rule': 2,
          u'99': 1,
          u'Person': 2,
          u'menaced': 1,
          u'tying': 1,
          u'90': 2,
          u'hole': 17,
          u'hold': 10,
          u'95': 3,
          u'circumstances': 5,
          u'locked': 1,
          u'brutality': 2,
          u'Wilderness': 1,
          u'homemakers': 1,
          u'famed': 1,
          u'accomplishment': 1,
          u'Professors': 2,
          u'Westphalia': 2,
          u'temperatures': 1,
          u'Travelers': 1,
          u'centralization': 2,
          u'example': 15,
          u'Le': 1,
          u'La': 7,
          u'household': 3,
          u'artillery': 2,
          u'organized': 6,
          u'Briar': 1,
          u'Smith-Colmer': 1,
          u'currency': 1,
          u'caution': 1,
          u'reviewing': 2,
          u'want': 16,
          u'counseling': 3,
          u'Easier': 1,
          u'absolute': 1,
          u'preferably': 1,
          u'hog': 2,
          u'hoc': 1,
          u'knows': 6,
          u'complaining': 1,
          u'travel': 2,
          u'drying': 2,
          u'feature': 4,
          u'Gardner': 1,
          u'machine': 5,
          u'how': 37,
          u'hot': 9,
          u'significance': 4,
          u'Comedian': 1,
          u'Gerosa': 2,
          u'preferable': 1,
          u"He'll": 2,
          u'A': 137,
          u'beauty': 4,
          u'inherent': 2,
          u'L.': 25,
          u'swing': 1,
          u'outlawed': 1,
          u'Players': 1,
          u'modest': 7,
          u'Reese': 3,
          u'destined': 1,
          u'fourteen-team': 1,
          u'sentencing': 1,
          u'types': 4,
          u'ballroom': 4,
          u'effective': 15,
          u'down-payments': 1,
          u'youths': 4,
          u'romped': 1,
          u'revolt': 1,
          u'headquarters': 18,
          u'Walkers': 1,
          u'baggage': 2,
          u'18th': 4,
          u'4-7/8': 1,
          u'Another': 7,
          u'keeps': 3,
          u'democratic': 1,
          u'wing': 4,
          u'wind': 4,
          u'leisurely': 1,
          u'Willy': 1,
          u'senators': 4,
          u'$840,000': 1,
          u'welcomed': 2,
          u'Edith': 1,
          u'Housing': 3,
          u'reforms': 4,
          u'vary': 2,
          u'kickoff': 2,
          u'thousands': 7,
          u'Dussa': 1,
          u'Toll': 1,
          u'Ludwig': 1,
          u'Commies': 1,
          u'His': 29,
          u'Hit': 1,
          u'surviving': 1,
          u'fit': 3,
          u'striking': 4,
          u"Gardner's": 4,
          u'secede': 1,
          u'survivors': 1,
          u'Harris': 8,
          u'Barber': 5,
          u'Palsy': 1,
          u'hidden': 1,
          u'county-wide': 2,
          u'Sinfonica': 1,
          u'slate': 1,
          u'vouchers': 1,
          u'detachment': 1,
          u'effects': 2,
          u'schools': 37,
          u'P.m.': 1,
          u'undeveloped': 1,
          u'silver': 3,
          u'glutted': 1,
          u'headboard': 1,
          u'represents': 2,
          u'debut': 4,
          u"road's": 3,
          u'skills': 3,
          u'McCluskey': 1,
          u'4-year-old': 1,
          u'clientele': 1,
          u'Seidel': 2,
          u"employers'": 1,
          u'Superior': 5,
          u'preceded': 3,
          u'financial': 11,
          u'reputedly': 1,
          u'series': 25,
          u'finger-paint': 1,
          u'Mongolia': 1,
          u"NATO's": 1,
          u'3-to-o': 1,
          u'whiz': 2,
          u"we'd": 1,
          u'substantially': 1,
          u'laboratory': 2,
          u'tricked': 1,
          u"House's": 2,
          u'whip': 2,
          u'borne': 1,
          u'misfortune': 2,
          u'two-and-a-half-mile': 1,
          u'ten-concert': 1,
          u'5847': 1,
          u'flooded': 1,
          u'encourage': 3,
          u'millions': 9,
          u'Super': 1,
          u"Simpson's": 1,
          u'sexton': 1,
          u'foundation': 1,
          u'inhabitants': 2,
          u'Word': 1,
          u"Nugent's": 1,
          u'Extension': 1,
          u'sellout': 1,
          u'University': 42,
          u'Work': 1,
          u'threatened': 4,
          u'3-to-3': 1,
          u'Marcus': 1,
          u'3-run': 1,
          u'Jacques': 1,
          u'sheet': 1,
          u'estimate': 3,
          u'alert': 1,
          u'substantiation': 1,
          u'cornerstone': 1,
          u'enormous': 1,
          u'Hord': 1,
          u'shelves': 1,
          u'24-inch': 1,
          u'shipped': 1,
          u'musicians': 3,
          u'speedy': 1,
          u'coeds': 1,
          u'Human': 2,
          u'1970s': 1,
          u'reserving': 1,
          u'repealed': 1,
          u'Palmer': 42,
          u'hearsay': 1,
          u'Convair': 1,
          u"Al's": 1,
          u'Due': 1,
          u'channels': 1,
          u'wash': 2,
          u'$18.9': 1,
          u'175': 1,
          u'174': 1,
          u'173': 1,
          u'$18.2': 1,
          u'Steelers': 2,
          u'basketball': 5,
          u'service': 37,
          u'engagement': 4,
          u'returns': 2,
          u'needed': 26,
          u'Simmons': 2,
          u'master': 5,
          u'listed': 10,
          u'Dumont': 2,
          u'legs': 3,
          u'bitter': 5,
          u'ranging': 3,
          u'listen': 2,
          u'collapse': 1,
          u'predictably': 1,
          u'convention': 2,
          u'wisdom': 5,
          u'advisement': 2,
          u'task': 5,
          u'ASDIC': 1,
          u'defaulted': 1,
          u'Bertoia': 1,
          u'peril': 1,
          u'outlay': 2,
          u'showed': 5,
          u'elegant': 2,
          u'Carroll': 2,
          u'walloped': 1,
          u'nations': 12,
          u'project': 26,
          u'percentages': 1,
          u'idle': 2,
          u'Ilona': 1,
          u'skimmed': 1,
          u'feeling': 10,
          u'acquisition': 1,
          u'Cody': 1,
          u'Angelo': 3,
          u'acclaim': 1,
          u'entail': 1,
          u'willingness': 2,
          u'Chicago': 22,
          u'Woodyard': 1,
          u'Mullenax': 2,
          u'spent': 12,
          u'Mont.': 1,
          u"Lanin's": 1,
          u'Gursel': 3,
          u'picks': 1,
          u'Angels': 1,
          u'Williams': 14,
          u'dozen': 5,
          u'Then': 17,
          u'person': 9,
          u'bleachers': 2,
          u'responsible': 5,
          u'Myron': 1,
          u'Band': 1,
          u'recommended': 9,
          u'absorbed': 4,
          u'Minister': 5,
          u'They': 62,
          u'season': 43,
          u'Ask': 1,
          u'grips': 2,
          u'Missionary': 1,
          u'Jones': 22,
          u'Lynn': 4,
          u'Wise': 1,
          u'shall': 5,
          u'Wish': 1,
          u'object': 2,
          u'vexing': 1,
          u'debonair': 1,
          u'affirmation': 1,
          u'mouth': 2,
          u'letter': 7,
          u'conceded': 1,
          u'putout': 1,
          u'Galveston': 2,
          u'episode': 2,
          u'Texans': 7,
          u'professor': 1,
          u'camp': 4,
          u'Bulloch': 2,
          u'Journal-Bulletin': 1,
          u'ruthless': 1,
          u'independents': 1,
          u'prevention': 2,
          u'Mansion': 1,
          u'detriment': 1,
          u'nineteenth': 1,
          u'mating': 1,
          u'purged': 1,
          u'incomplete': 1,
          u'marvel': 3,
          u'saying': 8,
          u'signatures': 5,
          u'bomb': 10,
          u'reactor': 3,
          u'Symonds': 1,
          u'U-2': 1,
          u'Union': 14,
          u'orchestra': 9,
          u'meetings': 5,
          u'Agency': 1,
          u'parolees': 2,
          u'nominated': 2,
          u'undue': 2,
          u"Communism's": 1,
          u'cooking': 1,
          u'judgeship': 1,
          u'Paradise': 2,
          u'culminates': 1,
          u'driving': 12,
          u'Congressional': 3,
          u'Meyner': 3,
          u'touches': 2,
          u'busy': 3,
          u'clicked': 1,
          u'Extend': 1,
          u'695': 1,
          u'headline': 2,
          u'menu': 1,
          u'Moller': 1,
          u'than': 138,
          u'Tiao': 1,
          u'theme': 5,
          u'touched': 2,
          u'rich': 5,
          u'Coliseum': 1,
          u'submarine-ball': 1,
          u"Berlin's": 1,
          u'plate': 5,
          u'D.C.': 3,
          u'$15': 1,
          u'Nevertheless': 1,
          u'television': 13,
          u"AID's": 1,
          u'pocket': 1,
          u'Mears': 1,
          u"Ruth's": 7,
          u'Sports': 4,
          u'societies': 2,
          u'Senators': 4,
          u'greens': 2,
          u'bloodstream': 1,
          u'ever': 32,
          u'Rip': 1,
          u'Rio': 2,
          u'three-year': 2,
          u'flanked': 1,
          u'release': 3,
          u"leader's": 4,
          u'U-I': 1,
          u'respond': 1,
          u'mandatory': 1,
          u'disaster': 1,
          u'fair': 10,
          u'Bennington': 1,
          u'transferred': 1,
          u'pads': 1,
          u'Brevard': 5,
          u'glad': 1,
          u'result': 30,
          u'fail': 2,
          u'Ave.': 10,
          u'resigned': 5,
          u'best': 29,
          u'pricking': 1,
          u"Braves'": 1,
          u'lots': 3,
          u'Heinkel': 2,
          u'rings': 2,
          u"'20's": 1,
          u'injuries': 3,
          u'224-170': 1,
          u'pressures': 1,
          u'score': 11,
          u'Lockies': 2,
          u'toolmaker': 1,
          u'preserve': 4,
          u'indecisive': 1,
          u'redistricting': 1,
          u'never': 38,
          u"Meyner's": 1,
          u'nationwide': 2,
          u'nature': 7,
          u'rolled': 2,
          u'punted': 1,
          u'authorizing': 2,
          u'lefthanders': 1,
          u'drew': 6,
          u'extent': 1,
          u'Bronx': 5,
          u'Peterson': 4,
          u'roller': 1,
          u'Capello': 1,
          u"war's": 1,
          u'accident': 7,
          u'met': 8,
          u'country': 24,
          u'conclusions': 1,
          u'demanded': 3,
          u'Vacancy': 1,
          u'planned': 9,
          u'logic': 1,
          u'federalism': 1,
          u'argue': 2,
          u'asked': 34,
          u'30th': 1,
          u'Apartment': 1,
          u'liberal-conservative': 1,
          u'troublesome': 1,
          u'25%': 1,
          u'102': 1,
          u'month-long': 1,
          u'250': 2,
          u'255': 1,
          u'relearns': 1,
          u'Clarence': 3,
          u'reconsideration': 2,
          u'Sitting': 1,
          u'union': 21,
          u'Rizzuto': 1,
          u'breakoff': 1,
          u'.': 4030,
          u'Nischwitz': 3,
          u'extraction': 1,
          u'startled': 1,
          u'stadium': 3,
          u'Cherry': 3,
          u'privilege': 3,
          u'one-week-old': 1,
          u'Flowers': 2,
          u'dots': 1,
          u'Precise': 1,
          u'life': 17,
          u'retrospect': 1,
          u'Tokyo': 1,
          u'worker': 2,
          u'allotting': 1,
          u'Stella': 1,
          u'1,212,000': 1,
          u'child': 9,
          u'worked': 12,
          u'Gloriana': 3,
          u'Holmes': 9,
          u'commerce': 3,
          u'presidency': 2,
          u'administrative': 3,
          u'employ': 2,
          u'misconstrued': 1,
          u'1213-15': 1,
          u'Campbell': 1,
          u"Gannon's": 1,
          u'Brandt': 5,
          u'Zurcher': 2,
          u'played': 19,
          u'Innumerable': 1,
          u'conditioned': 2,
          u'player': 6,
          u'eighteen': 3,
          u'London-based': 1,
          u'Courtney': 1,
          u'Puerto': 2,
          u'churchmen': 2,
          u'doorman': 1,
          u'specter': 1,
          u'trusted': 2,
          u'Phouma': 3,
          u'damaged': 2,
          u'Valley': 1,
          u"Dresbachs'": 1,
          u'things': 10,
          u'cumulative': 1,
          u'rebellion': 1,
          u'Newman': 1,
          u'socialized': 1,
          u'300': 6,
          u'harmony': 1,
          u'babies': 2,
          u'pre-school': 1,
          u'fairly': 3,
          u'Budapest': 2,
          u'saluted': 1,
          u'Maybe': 2,
          u'torpedoes': 1,
          u'Angeles': 12,
          u'photographers': 1,
          u'Peking': 1,
          u'Living': 3,
          u'5-to-2': 1,
          u'5-to-3': 1,
          u"Stevenses'": 1,
          u'protected': 1,
          u'furlough': 1,
          u'matters': 3,
          u'vice-president': 2,
          u'academic': 10,
          u"d'etat": 1,
          u'telephone': 8,
          u'echoes': 1,
          u'corporate': 3,
          u'Funeral': 5,
          u'fittest': 1,
          u'opinions': 3,
          u'spurred': 1,
          u'provocation': 2,
          u'capitol': 1,
          u'sleeps': 1,
          u'Subsequent': 1,
          u'distribute': 2,
          u'fantastic': 1,
          u'plight': 1,
          u'rushing': 5,
          u'succeeding': 1,
          u'previous': 11,
          u'ham': 2,
          u'duffer': 1,
          u'Oscar': 2,
          u'ease': 2,
          u'Odell': 1,
          u'had': 279,
          u'emphasis': 5,
          u'Leonard': 6,
          u'Mohammedanism': 1,
          u'Connecticut': 2,
          u'collections': 4,
          u'easy': 4,
          u'prison': 7,
          u'has': 300,
          u'hat': 1,
          u'Apart': 3,
          u'municipal': 4,
          u'mediocre': 1,
          u'Dawson': 1,
          u'survival': 2,
          u'disagreement': 3,
          u'possible': 28,
          u"rocket's": 2,
          u'firmer': 3,
          u'possibly': 3,
          u'opener': 3,
          u'birth': 5,
          u'Missouri': 3,
          u'clustered': 1,
          u'pertinent': 1,
          u'unique': 1,
          u'$2,170': 1,
          u'desire': 4,
          u'county': 26,
          u"bridegroom's": 2,
          u'seaside': 1,
          u'misled': 1,
          u'steps': 8,
          u'Shrove': 1,
          u"court's": 1,
          u'Further': 2,
          u'Pentagon': 2,
          u"Louis's": 1,
          u'Warren': 13,
          u'attorney': 17,
          u'right': 33,
          u'old': 23,
          u'crowd': 8,
          u'$1,000,000,000': 1,
          u'creed': 3,
          u'Expressways': 1,
          u'crown': 2,
          u'System': 2,
          u'culpas': 1,
          u'3,325': 1,
          u'Conservation': 4,
          u'glove': 4,
          u'Noel': 1,
          u'Between': 1,
          u'enemies': 1,
          u'MacDonald': 2,
          u'for': 943,
          u'bottom': 1,
          u'p.m.': 38,
          u'contributing': 1,
          u'individuals': 5,
          u'summoned': 4,
          u'pondered': 1,
          u'Celebration': 1,
          u'Donnelly': 1,
          u'Instant': 1,
          u'Calls': 2,
          u"ol'": 1,
          u'dental': 6,
          u'6,000': 1,
          u'shifting': 2,
          u'defensive': 7,
          u'losing': 5,
          u'brokerage': 1,
          u'manufacturing': 5,
          u'shaken': 2,
          u'Macon': 2,
          u'benches': 1,
          u'boiling': 1,
          u'dollars': 15,
          u'citizens': 6,
          u'globetrotter': 1,
          u'despair': 1,
          u'stoked': 1,
          u'lacked': 3,
          u'slightly': 4,
          u'meddle': 1,
          u'consulting': 3,
          u'statements': 9,
          u'Cal.': 1,
          u'Blacks': 1,
          u'honeymoon': 3,
          u'Scotland': 2,
          u'son': 22,
          u'undermining': 1,
          u'Misses': 1,
          u'one-fourth': 1,
          u'raiser': 1,
          u'raises': 3,
          u'sow': 1,
          u'stockholder': 1,
          u'reducing': 2,
          u'defendants': 9,
          u'Hank': 4,
          u'collectors': 1,
          u'162': 1,
          u'support': 24,
          u'constantly': 3,
          u'busy-work': 1,
          u'Hand': 1,
          u'symphony': 1,
          u"boy's": 2,
          u'10,000,000': 1,
          u'resulted': 6,
          u'call': 14,
          u'happy': 12,
          u'offer': 9,
          u'understandably': 1,
          u'forming': 2,
          u'Completing': 1,
          u'Acres': 2,
          u'talents': 2,
          u'understandable': 2,
          u'incinerator': 1,
          u'underdeveloped': 1,
          u'duel': 3,
          u"else's": 1,
          u'Toni': 1,
          u'inside': 4,
          u'goutte': 1,
          u'Waldorf-Astoria': 2,
          u'County': 35,
          u'unanimous': 2,
          u'Guests': 4,
          u'Tony': 3,
          u'Hawksley': 10,
          u'Enrique': 1,
          u'panels': 5,
          u'Weatherford': 2,
          u'Stallard': 1,
          u'8,293': 1,
          u'150': 3,
          u'juvenile': 5,
          u'later': 34,
          u'liberal': 4,
          u'154': 3,
          u'Trooper': 1,
          u'Six': 5,
          u'proven': 1,
          u"''": 702,
          u'Virgin': 2,
          u'Squad': 1,
          u'exist': 1,
          u'Pittsboro': 1,
          u'Sid': 1,
          u'segregationist': 1,
          u'acacia': 3,
          u'dealer': 4,
          u'negotiations': 11,
          u'Knoll': 1,
          u'McDaniel': 2,
          u'college': 18,
          u'protested': 1,
          u'Noting': 2,
          u'Practice': 1,
          u'eventual': 3,
          u'floor': 14,
          u'Track': 1,
          u'Possible': 1,
          u'crowns': 1,
          u'flood': 1,
          u'Nolan': 1,
          u'republic': 1,
          u'amicable': 1,
          u'ambitious': 1,
          u'entomologist': 1,
          u'Norristown': 1,
          u'smell': 2,
          u'roll': 4,
          u'steamship': 1,
          u'intend': 3,
          u'Lenny': 1,
          u'models': 2,
          u'high-wage': 1,
          u'Western-style': 1,
          u'Luthuli': 1,
          u'eminent': 1,
          u'scale': 1,
          u'smelling': 1,
          u'persecution': 1,
          u'source': 7,
          u'Charley': 4,
          u'fastened': 1,
          u'Mostly': 1,
          u'Debutante': 2,
          u"workers'": 1,
          u'Charles': 22,
          u'Quaker': 1,
          u"O'Hare": 1,
          u'Fifth': 1,
          u'time': 97,
          u'push': 2,
          u'conferred': 1,
          u'Empire': 2,
          u'Principal': 2,
          u'gown': 6,
          u'smelts': 1,
          u'chain': 2,
          u'criteria': 2,
          u'Indians': 4,
          u'Nicklaus': 1,
          u'integration': 6,
          u'tee': 4,
          u'theaters': 6,
          u'645-acre': 1,
          u'Wabash': 1,
          u'Indiana': 2,
          u'chair': 1,
          u'$278,877,000': 1,
          u'Beyeler': 2,
          u'ballet': 7,
          u'92': 1,
          u'8861': 1,
          u'900-student': 1,
          u'sweat-suits': 1,
          u'shouldda': 1,
          u'carpenters': 1,
          u'Bahi': 1,
          u'96': 1,
          u'verbally': 1,
          u'recipient': 3,
          u'Prize': 1,
          u'Political': 1,
          u"Howsam's": 1,
          u'choice': 6,
          u'Lyle': 1,
          u'alcoholics': 2,
          u'mourn': 1,
          u'stays': 1,
          u'southpaw': 5,
          u'right-handed': 1,
          u'exact': 1,
          u'minute': 1,
          u'Tau': 1,
          u'1.10.8': 1,
          u'3-month': 1,
          u'Fifteen': 1,
          u'1.10.4': 1,
          u'rights': 3,
          u'Tax': 5,
          u'make': 43,
          u'1.10.1': 1,
          u'leave': 5,
          u'solved': 2,
          u'depositors': 1,
          u'settle': 1,
          u'team': 33,
          u'Patience': 1,
          u'prevent': 12,
          u'spiritual': 1,
          u'$80,738': 2,
          u'M.': 22,
          u'prediction': 1,
          u'sign': 9,
          u'Bldg.': 1,
          u'3505o': 1,
          u'ogled': 1,
          u'Lt.': 1,
          u'Look': 3,
          u'Associations': 1,
          u'Adamson': 1,
          u'jeopardy': 1,
          u'celebrated': 2,
          u'locker': 3,
          u'melt': 1,
          u'current': 13,
          u'wayward': 1,
          u"Tuttle's": 1,
          u'Southwest': 2,
          u'boost': 6,
          u'Lopez': 1,
          u'Me': 2,
          u'drafted': 1,
          u'jury': 44,
          u'funeral': 2,
          u'understanding': 5,
          u"Leopold's": 1,
          u'yards': 22,
          u'address': 12,
          u'alone': 8,
          u'along': 34,
          u'$80': 1,
          u'My': 9,
          u'Godwin': 1,
          u'nitroglycerine': 1,
          u'passengers': 3,
          u'revenues': 13,
          u'Associated': 2,
          u'Cornell': 1,
          u'transition': 3,
          u'brilliant': 3,
          u'saws': 2,
          u'studied': 7,
          u'wherever': 1,
          u'Casals': 1,
          u'accomplished': 4,
          u'studies': 2,
          u'influx': 1,
          u'tasks': 2,
          u'love': 3,
          u'Hagner': 1,
          u'Thornton': 1,
          u'prefer': 3,
          u"Leavitt's": 1,
          u'jolt': 1,
          u'Lisle': 1,
          u'redevelopers': 1,
          u'Davidson': 1,
          u'opposes': 2,
          u'cocktail': 6,
          u'August': 12,
          u'working': 16,
          u'Sarasota': 1,
          u'positive': 2,
          u'angry': 3,
          u'tightly': 1,
          u'Ghormley': 1,
          u'cherished': 1,
          u'wood': 1,
          u'opposed': 9,
          u'films': 3,
          u'scope': 1,
          u'Pinsk': 1,
          u'Those': 8,
          u'loving': 1,
          u'``': 732,
          u'Klaus': 1,
          u'afford': 4,
          u'subsistence': 1,
          u'apparent': 7,
          u'validity': 1,
          u'Jimmy': 4,
          u'virtue': 2,
          u'Achaeans': 1,
          u'scratches': 3,
          u'Retail': 1,
          u'Opelika': 1,
          ...})

In [49]:
cfd['romance']


Out[49]:
FreqDist({u'raining': 2,
          u'sitters': 1,
          u'yellow': 13,
          u'keno': 1,
          u'four': 8,
          u'Does': 2,
          u'railing': 1,
          u'ringlets': 1,
          u'self-pity': 2,
          u'snowing': 1,
          u'Myra': 24,
          u'Ronald': 3,
          u'Western': 2,
          u'lore': 1,
          u'portentous': 1,
          u'immature': 1,
          u'shaving': 1,
          u'Elec': 9,
          u'foul': 1,
          u'experimentally': 1,
          u'bringing': 3,
          u'prize': 1,
          u'wooden': 3,
          u'piling': 1,
          u'freckles': 2,
          u'persisted': 1,
          u'woods': 1,
          u'succession': 1,
          u'Paul': 1,
          u'Jerez': 1,
          u'straight': 13,
          u'Rachel': 16,
          u'tired': 11,
          u'hanging': 3,
          u'pulse': 2,
          u'elegant': 2,
          u'second': 15,
          u'valiant': 1,
          u'sailed': 1,
          u'scraped': 1,
          u'loathing': 1,
          u'nigs': 1,
          u"gran'dad": 1,
          u"this'll": 1,
          u'Initially': 1,
          u'thunder': 2,
          u'contributed': 1,
          u'fingers': 13,
          u'Wrong': 1,
          u'Hamilton': 1,
          u'outfielders': 1,
          u'replaced': 2,
          u'hero': 1,
          u'chins': 1,
          u'jubilantly': 1,
          u'interdependent': 1,
          u'here': 65,
          u'reported': 3,
          u'chassis': 1,
          u'china': 1,
          u'hers': 3,
          u'shriek': 1,
          u'Yuki': 1,
          u'kids': 6,
          u'unwillingness': 1,
          u'elaborate': 2,
          u'climbed': 7,
          u'cheerfully': 1,
          u'golden': 3,
          u'explained': 7,
          u'Three': 5,
          u'brought': 19,
          u'remnant': 1,
          u'stern': 1,
          u'Wales': 3,
          u'Pompeii': 5,
          u'spoke': 12,
          u'moth': 1,
          u'symphony': 2,
          u'music': 6,
          u'telegraph': 1,
          u'strike': 1,
          u'Brainards': 1,
          u'until': 34,
          u'holy': 2,
          u'populations': 1,
          u'successful': 3,
          u'brings': 2,
          u'whirling': 2,
          u'hurt': 7,
          u'glass': 7,
          u"ever'body": 1,
          u'hole': 2,
          u'hold': 12,
          u'circumstances': 4,
          u"captain's": 2,
          u'locked': 1,
          u'Wilderness': 1,
          u'plunged': 2,
          u'locker': 2,
          u'sweeter': 2,
          u'leaped': 1,
          u'centralization': 1,
          u'example': 2,
          u'Le': 1,
          u'wand': 1,
          u'household': 2,
          u'organized': 1,
          u'caution': 1,
          u'want': 37,
          u'pinto': 1,
          u'absolute': 1,
          u'preferably': 1,
          u"cane's": 1,
          u'groaned': 1,
          u'hon': 2,
          u'travel': 2,
          u'drying': 2,
          u'feature': 1,
          u'machine': 3,
          u'how': 60,
          u'hot': 15,
          u'hop': 2,
          u'significance': 1,
          u"He'll": 1,
          u'dignified': 1,
          u'fanaticism': 1,
          u'A': 48,
          u'uselessness': 1,
          u'beauty': 6,
          u'assimilated': 1,
          u'swing': 3,
          u'despondent': 1,
          u'wrong': 23,
          u'chump': 1,
          u'outcry': 3,
          u"dryin'": 2,
          u'Quint': 11,
          u'presiding': 1,
          u'tulip': 1,
          u'Another': 3,
          u'keeps': 3,
          u'wind': 6,
          u'wine': 5,
          u'restriction': 1,
          u"Daddy's": 1,
          u'snugly': 1,
          u'dreamed': 4,
          u'ice-feeling': 1,
          u'wrought': 1,
          u'His': 44,
          u'Hit': 1,
          u'fit': 4,
          u'screaming': 3,
          u'fig': 2,
          u'Him': 1,
          u'Implements': 1,
          u'put-upon': 1,
          u'detachment': 1,
          u'schools': 2,
          u'sixteen': 1,
          u'silver': 3,
          u'blazer': 1,
          u'arrow': 1,
          u'blushed': 1,
          u'expectancy': 1,
          u'burial': 2,
          u'preceded': 2,
          u'snakes': 5,
          u'series': 2,
          u"we'd": 4,
          u'mutineer': 1,
          u'message': 1,
          u'whip': 6,
          u'borne': 1,
          u'misfortune': 1,
          u'drove': 10,
          u'encourage': 3,
          u'hangouts': 1,
          u'engineer': 4,
          u'foundation': 1,
          u'stamping': 1,
          u'assured': 2,
          u'Work': 1,
          u'assures': 1,
          u'Osric': 1,
          u'estimate': 3,
          u'enormous': 2,
          u'ate': 4,
          u'moment': 27,
          u'disturbed': 3,
          u'Human': 1,
          u'necessity': 2,
          u'disfigured': 2,
          u'Please': 5,
          u'spinning': 2,
          u'Nerves': 1,
          u'hear': 16,
          u'clarity': 1,
          u'fur-piece': 1,
          u'bitten': 1,
          u'basketball': 1,
          u'renovated': 1,
          u'service': 6,
          u'similarly': 2,
          u'Fearless': 2,
          u'engagement': 2,
          u"Fudo's": 2,
          u'tango': 1,
          u'needed': 10,
          u'blossoms': 1,
          u'Straightened': 1,
          u'legs': 4,
          u'bitter': 6,
          u'Alexander': 20,
          u'ramming': 1,
          u'frowned': 4,
          u'wisdom': 3,
          u'Shocked': 1,
          u'heart-stopping': 1,
          u'crawl': 1,
          u'showed': 7,
          u'handcuffs': 1,
          u'tree': 1,
          u'idly': 1,
          u'shower': 2,
          u'pneumonia': 2,
          u'idle': 1,
          u'exclaimed': 2,
          u'feeling': 21,
          u'groaning': 1,
          u'dozed': 3,
          u'Son': 1,
          u'Williams': 2,
          u'dozen': 2,
          u'Then': 42,
          u'person': 9,
          u'responsible': 1,
          u'eagerly': 2,
          u'snuggled': 3,
          u'Washoe': 1,
          u'absorbed': 3,
          u'amusing': 2,
          u'doors': 3,
          u'Ask': 1,
          u'Ash': 1,
          u'floorshow': 1,
          u'shall': 3,
          u'tinkers': 1,
          u'wells': 1,
          u"aunt's": 1,
          u'simplify': 1,
          u'mouth': 11,
          u'letter': 19,
          u'entry': 1,
          u'drought': 2,
          u'morality': 1,
          u'episode': 1,
          u'cops': 2,
          u'camp': 5,
          u'Lucille': 11,
          u'nineteenth': 1,
          u'scream': 2,
          u'came': 75,
          u'saying': 18,
          u'jocular': 1,
          u'padded': 1,
          u'participate': 1,
          u'conclusion': 1,
          u'tempted': 2,
          u'cheaply': 1,
          u'abreast': 1,
          u'lessons': 1,
          u'busy': 7,
          u'clicked': 2,
          u'Reaching': 1,
          u'quaint': 2,
          u"baby's": 4,
          u'than': 65,
          u'bush': 2,
          u'bliss': 1,
          u'touched': 3,
          u'rich': 6,
          u'foolishly': 1,
          u'plate': 5,
          u'stammered': 1,
          u'pocket': 6,
          u'altogether': 2,
          u'relish': 2,
          u"Officers'": 2,
          u'shape': 3,
          u'patch': 1,
          u'eyelids': 1,
          u'lurched': 1,
          u'release': 4,
          u'prayerful': 1,
          u'boarded': 1,
          u'Clearly': 2,
          u'blew': 1,
          u'disaster': 1,
          u'fair': 5,
          u'flaxen': 1,
          u'fail': 2,
          u'faim': 1,
          u'resigned': 1,
          u'Dogs': 1,
          u'best': 12,
          u'Craddock': 1,
          u'lots': 5,
          u'rings': 2,
          u'kind': 34,
          u'pressures': 1,
          u'scorn': 1,
          u'preserve': 2,
          u'claws': 1,
          u'never': 84,
          u'nationwide': 1,
          u'nature': 5,
          u'rolled': 6,
          u'smelled': 8,
          u'lapping': 2,
          u'twinkling': 1,
          u'defiance': 1,
          u'debt': 2,
          u'debs': 1,
          u'pity': 2,
          u'accident': 1,
          u'disdain': 1,
          u'country': 10,
          u'pits': 1,
          u'readers': 1,
          u'adventures': 1,
          u'Laura': 5,
          u'planned': 3,
          u'marrying': 2,
          u'argue': 1,
          u'asked': 45,
          u'twenty-three': 1,
          u'irresponsible': 1,
          u'wearying': 1,
          u'gypsies': 1,
          u'gleaming': 1,
          u'Sorry': 1,
          u'Sitting': 1,
          u'union': 1,
          u'subside': 1,
          u'.': 3736,
          u'much': 69,
          u'sommelier': 1,
          u'superhuman': 1,
          u'dollies': 1,
          u'life': 51,
          u'spit': 1,
          u'eastern': 1,
          u'lift': 3,
          u'child': 17,
          u'worked': 8,
          u'chill': 4,
          u'contemplated': 1,
          u'ferreted': 1,
          u'Kezziah': 1,
          u'miniature': 3,
          u'skirts': 1,
          u'remembering': 1,
          u'played': 6,
          u'player': 3,
          u'eighteen': 4,
          u'aqueducts': 1,
          u'specter': 1,
          u'trusted': 2,
          u'things': 30,
          u'shipwrecked': 1,
          u'Did': 12,
          u'Fairview': 3,
          u'babies': 3,
          u'Appleby': 1,
          u'boiled': 1,
          u'Maybe': 12,
          u'middle-aged': 1,
          u'supper': 2,
          u'tune': 2,
          u'holystones': 1,
          u'echoed': 1,
          u'stillness': 1,
          u'raindrops': 1,
          u'corporate': 1,
          u'spurred': 1,
          u'Thank': 3,
          u'rotated': 1,
          u'beset': 1,
          u'exclaiming': 1,
          u'ham': 6,
          u'Oscar': 1,
          u'ease': 1,
          u'had': 692,
          u'advancement': 1,
          u'Surely': 4,
          u'innocent': 10,
          u'prison': 1,
          u'has': 26,
          u'hat': 5,
          u"t's": 1,
          u'casually': 1,
          u'elders': 1,
          u'possible': 9,
          u'Broiled': 1,
          u'possibly': 4,
          u'birth': 1,
          u'shadow': 3,
          u'unique': 1,
          u'occurring': 1,
          u'desire': 2,
          u'Midshipman': 2,
          u'eight-by-ten': 1,
          u'remind': 1,
          u'steps': 11,
          u'finely-spun': 1,
          u'Warren': 9,
          u"Emma's": 2,
          u'right': 55,
          u'old': 73,
          u'creek': 1,
          u'crowd': 4,
          u'people': 48,
          u'easy': 6,
          u'crown': 1,
          u"an'": 2,
          u'glove': 3,
          u'creep': 2,
          u'enemies': 1,
          u'gasps': 1,
          u'ruffled': 1,
          u'for': 410,
          u'bottom': 6,
          u'hulks': 1,
          u'forbore': 1,
          u'plucked': 1,
          u'contributing': 2,
          u'fog': 7,
          u'summoned': 1,
          u'Remy': 1,
          u'post': 1,
          u'substituted': 1,
          u'shifting': 1,
          u'starring': 1,
          u'bowing': 3,
          u'manufacturing': 1,
          u'Think': 1,
          u'shaken': 2,
          u'First': 1,
          u'foolish': 3,
          u'Caneli': 1,
          u'benches': 2,
          u'boiling': 3,
          u'dollars': 7,
          u'rebuffed': 1,
          u'Valentine': 1,
          u'despair': 2,
          u'slightly': 7,
          u'expertly': 1,
          u'raised': 6,
          u'gauze': 1,
          u'statements': 1,
          u'son': 33,
          u'thankful': 3,
          u'magazines': 1,
          u'Korean': 1,
          u'fabric': 1,
          u"Thoreau's": 1,
          u'support': 2,
          u'tame': 2,
          u'absolutely': 2,
          u"boy's": 4,
          u'greatness': 2,
          u'call': 24,
          u'overhand': 1,
          u'happy': 8,
          u'offer': 3,
          u'fascination': 1,
          u'forming': 1,
          u'conclusively': 1,
          u'shrilly': 1,
          u'peppermints': 2,
          u'Seeing': 2,
          u'inside': 11,
          u'devices': 1,
          u'County': 1,
          u'Tony': 1,
          u'Damn': 1,
          u'Sis': 2,
          u'Sir': 1,
          u'later': 19,
          u'proved': 5,
          u'Sit': 1,
          u'steady': 3,
          u'wetness': 1,
          u'Six': 1,
          u'bathrobe': 1,
          u'crumble': 1,
          u"''": 1044,
          u'proves': 1,
          u'exist': 2,
          u'Francisco': 1,
          u'relay': 1,
          u"Bartoli's": 1,
          u'floor': 13,
          u'Weakness': 1,
          u'relax': 1,
          u'ourselves': 1,
          u'overturning': 1,
          u"Allstates'": 1,
          u'smell': 3,
          u'roll': 2,
          u'intend': 1,
          u'teats': 2,
          u'semi-professionally': 1,
          u'invested': 1,
          u'smelling': 2,
          u'rolling': 2,
          u'Gardens': 1,
          u'congested': 1,
          u'Charles': 4,
          u'unquenched': 1,
          u'time': 93,
          u'push': 3,
          u'banners': 1,
          u'gown': 1,
          u'Blackwells': 1,
          u'chain': 2,
          u'whoever': 1,
          u'Indians': 1,
          u'bandits': 1,
          u'skiing': 1,
          u'chair': 5,
          u'ballet': 1,
          u'religion': 9,
          u'rousing': 1,
          u'methodically': 1,
          u'crates': 1,
          u'jerk': 1,
          u'choice': 4,
          u'alcoholics': 1,
          u'stays': 1,
          u'spats': 1,
          u'fullest': 1,
          u'minute': 7,
          u'tear': 2,
          u'teas': 1,
          u'Supply': 1,
          u'Tax': 1,
          u'make': 49,
          u'leave': 24,
          u'illustrators': 1,
          u'settle': 2,
          u'team': 6,
          u'Suzanne': 1,
          u'unaware': 1,
          u'prevent': 1,
          u'spiritual': 2,
          u'thinkers': 1,
          u'meadow': 1,
          u'attic': 1,
          u'sigh': 1,
          u'M.': 1,
          u'sign': 6,
          u'ogled': 1,
          u'depressions': 1,
          u'Sewickley': 1,
          u'Francie': 4,
          u'falling': 2,
          u'crackling': 1,
          u'Me': 1,
          u'banister': 1,
          u'banged': 2,
          u'assertive': 1,
          u'funeral': 6,
          u'Mi': 1,
          u'understanding': 3,
          u'address': 3,
          u'alone': 24,
          u'along': 33,
          u'My': 21,
          u'enroll': 1,
          u'hurtling': 2,
          u'brilliant': 5,
          u'studied': 7,
          u'wherever': 2,
          u'accomplished': 1,
          u'sprouted': 1,
          u'Doaty': 8,
          u'studies': 2,
          u'nowhere': 2,
          u'love': 32,
          u'cacophony': 1,
          u'prefer': 1,
          u'logical': 1,
          u'Alberto': 2,
          u'crammed': 1,
          u'August': 2,
          u'working': 8,
          u'wicker': 1,
          u'angry': 5,
          u'predictions': 1,
          u'tightly': 4,
          u'Hello': 2,
          u'papal': 1,
          u'wondering': 6,
          u'Those': 5,
          u'loving': 2,
          u'high-speed': 1,
          u'``': 1045,
          u'afford': 2,
          u'apparent': 1,
          u'Telling': 1,
          u"She'll": 4,
          u'everywhere': 2,
          u'virtue': 1,
          u'Blackwell': 3,
          u'preponderance': 1,
          u'anything': 42,
          u'Pope': 6,
          u'Relatives': 1,
          u'values': 1,
          u'Showers': 1,
          u'believed': 2,
          u'Our': 3,
          u'detached': 1,
          u'Out': 6,
          u'Gertrude': 4,
          u'admired': 4,
          u'frogs': 1,
          u'awesome': 1,
          u'parachute': 1,
          u'hides': 1,
          u'admirer': 1,
          u'230': 1,
          u'Happened': 1,
          u'winter': 5,
          u'divided': 3,
          u'Who': 6,
          u'elephant': 1,
          u'Why': 28,
          u'moon-washed': 1,
          u'gumming': 1,
          u'spot': 2,
          u'Bari': 1,
          u'date': 3,
          u'such': 21,
          u'suck': 1,
          u'spouted': 1,
          u'revealed': 2,
          u'nineties': 1,
          u'stress': 1,
          u'Captain': 9,
          u'natural': 4,
          u'conscious': 1,
          u'consequently': 1,
          u'ordinarily': 1,
          u'darkened': 2,
          u'so': 174,
          u'forebears': 1,
          u'swollen': 2,
          u'wolves': 1,
          u'pulled': 8,
          u'Encouraged': 1,
          u'years': 34,
          u'course': 36,
          u'maneuvered': 1,
          u'unfavorable': 1,
          u'Cromwell': 13,
          u'drunker': 1,
          u'tore': 3,
          u'solitary': 3,
          u'thumb': 3,
          u'nearsighted': 1,
          u'paraded': 1,
          u'torn': 3,
          u'attraction': 1,
          u'thump': 1,
          u'Cousin': 22,
          u'troubled': 3,
          u'parades': 1,
          u'mused': 2,
          u'apron': 1,
          u'civilian': 1,
          u'Folly': 3,
          u'nation': 2,
          u'quok': 1,
          u'Gordon': 2,
          u'sorted': 1,
          u'in-laws': 1,
          u'matched': 1,
          u'shouted': 11,
          u'Yellow': 1,
          u'fisherman': 1,
          u'veins': 1,
          u'quarter': 1,
          u'repaired': 1,
          u'square': 4,
          u'retrieve': 2,
          u'bursting': 1,
          u'owing': 1,
          u'entering': 1,
          u'Kong': 3,
          u'salads': 1,
          u'disasters': 1,
          u'rounding': 1,
          u'post-operative': 1,
          u'Furnaces': 1,
          u'seriously': 1,
          u'investigation': 2,
          u'Joe': 5,
          u'bordering': 1,
          u'million': 1,
          u'possibility': 2,
          u'quite': 22,
          u'bumps': 1,
          u'complicated': 1,
          u'Either': 1,
          u'intensely': 2,
          u'Westfield': 2,
          u'training': 4,
          u'disguised': 1,
          u'modest': 1,
          u'aboard': 4,
          u'bothersome': 1,
          u'puny': 3,
          u'emotion': 1,
          u'intuition': 1,
          u'poling': 2,
          u'spoken': 3,
          u'Royal': 1,
          u'one': 166,
          u'chide': 1,
          u'potted': 1,
          u'open': 18,
          u'ripping': 1,
          u'city': 10,
          u'Miyagi': 2,
          u'Monday': 2,
          u'bite': 2,
          u'shiver': 2,
          u'draft': 1,
          u'typing': 2,
          u'begotten': 1,
          u'two-colored': 1,
          u'sentinels': 1,
          u'padding': 1,
          u'ridiculous': 3,
          u'slashed': 1,
          u'Seven': 4,
          u'translate': 1,
          u'Ciao': 1,
          u'scrumptious': 1,
          u'folly': 1,
          u'crossroads': 1,
          u'future': 6,
          u'counselor': 1,
          u'janitor': 1,
          u'damned': 4,
          u'prospect': 1,
          u'mountain': 1,
          u'illness': 1,
          u'flatly': 1,
          u'turned': 48,
          u'alley': 4,
          u'sad': 6,
          u'say': 60,
          u'rained': 1,
          u'buried': 2,
          u'dragooned': 1,
          u'sap': 1,
          u'saw': 47,
          u'sat': 32,
          u'Esperanza': 1,
          u'fashionable': 1,
          u'Jewish': 1,
          u'aside': 5,
          u'Kleenex': 1,
          u'note': 13,
          u'take': 62,
          u'Half': 2,
          u'wanting': 3,
          u'Hall': 2,
          u'to-do': 1,
          u'altered': 2,
          u'opposite': 4,
          u'backyards': 1,
          u'knew': 69,
          u'remarks': 2,
          u'knowingly': 1,
          u'inserted': 1,
          u'pages': 1,
          u'lawn': 2,
          u'weather-royal': 1,
          u'average': 5,
          u'drive': 6,
          u'federal': 1,
          u'heavily-upholstered': 1,
          u'salt': 1,
          u'trembled': 1,
          u'laws': 2,
          u'walking': 12,
          u'merit': 1,
          u'too-expensive': 1,
          u'peaches': 1,
          u'propagandist': 1,
          u'commissary': 1,
          u'aggressive': 1,
          u'imagined': 4,
          u'Wants': 1,
          u'slow': 2,
          u'transact': 1,
          u'Krishnaists': 1,
          u"Myra's": 4,
          u'tears': 6,
          u'going': 60,
          u'robe': 2,
          u'clawing': 1,
          u'revolutionized': 1,
          u'freezing': 3,
          u'flowerpot': 2,
          u'Speedy': 1,
          u'Conneaut': 1,
          u'artist': 3,
          u'hinges': 1,
          u'absurdly': 1,
          u'worried': 7,
          u'priest': 1,
          u"could've": 1,
          u'worries': 5,
          u'marred': 1,
          u'where': 54,
          u'vision': 2,
          u'orchids': 1,
          u'morose': 1,
          u'raged': 1,
          u'dived': 1,
          u'cheesecloth': 1,
          u'hastened': 1,
          u'aroused': 1,
          u'rages': 1,
          u'That': 29,
          u'jumped': 3,
          u'mops': 1,
          u'fierceness': 1,
          u'bureau': 4,
          u'moons': 1,
          u'Bible': 2,
          u'jobs': 6,
          u'screen': 4,
          u'aversion': 2,
          u'spare': 3,
          u'Constance': 1,
          u'concentrated': 1,
          u'many': 27,
          u'thickly': 1,
          u'loudly': 1,
          u'bare-armed': 1,
          u'expression': 1,
          u"can't": 24,
          u'girl-san': 1,
          u'gaiety': 1,
          u'twin': 1,
          u'sentinel': 1,
          u'Riverside': 7,
          u'thick-skulled': 1,
          u'boat': 2,
          u'caring': 2,
          u'companionship': 1,
          u'teddy': 2,
          u'stretch': 1,
          u'west': 2,
          u'vacation': 2,
          u'braving': 1,
          u'motives': 1,
          u'reflective': 1,
          u'Signor': 2,
          u'wants': 7,
          u'thousand': 3,
          u'tightened': 1,
          u'Dazed': 1,
          u'former': 1,
          u'Honshu': 1,
          u'pettiness': 1,
          u'pretence': 1,
          u'straighten': 1,
          u'easier': 3,
          u'defeatism': 1,
          u'newspaper': 2,
          u'situation': 5,
          u'parboiled': 1,
          u'canoe': 1,
          u'brow': 2,
          u'purse': 2,
          u'dubious': 1,
          u'quiet': 8,
          u'limping': 1,
          u'fame': 2,
          u'missiles': 1,
          u"roulette's": 1,
          u'Lovejoy': 3,
          u'underestimate': 1,
          u'Abernathy': 2,
          u'edged': 1,
          u'I': 951,
          u'edges': 2,
          u'barber': 1,
          u'tracking': 1,
          u'vacant': 1,
          u'trains': 1,
          u'carpentry': 1,
          u'scholarship': 2,
          u'summer': 8,
          u'sprayed': 1,
          u'steamed': 1,
          u'being': 49,
          u'rest': 17,
          u'diapiace': 1,
          u'Hurrays': 3,
          u'snarled': 1,
          u'blondes': 1,
          u'grounded': 1,
          u'straight-A': 1,
          u'instrument': 1,
          u'nymphomaniac': 1,
          u'dryly': 1,
          u"Pietro's": 1,
          u'bedstraw': 1,
          u"Captain's": 1,
          u'aspects': 2,
          u'around': 68,
          u'gestures': 1,
          u'Mont': 1,
          u'Aunt': 2,
          u'darn': 1,
          u'vacuum': 2,
          u'world': 26,
          u'vague': 1,
          u'dare': 2,
          u'boast': 2,
          u'clam': 1,
          u'stranger': 2,
          u'Share': 1,
          u'souvenirs': 1,
          u'discarded': 1,
          u'clay': 2,
          u'auditorium': 1,
          u'seating': 1,
          u'Perry': 3,
          u'learning': 2,
          u'thinks': 4,
          u'scholarships': 1,
          u'dimensions': 1,
          u'strewn': 1,
          u'noon': 4,
          u'exit': 1,
          u'refer': 1,
          u'zest': 1,
          u'intimate': 3,
          u'sprung': 1,
          u"You'll": 5,
          u'leadership': 7,
          u'stone': 6,
          u"Roy's": 1,
          u'package': 1,
          u'industry': 1,
          u'Puzzled': 1,
          u'favorite': 4,
          u'slender': 2,
          u'side': 17,
          u'Regretfully': 1,
          u'neighbor': 1,
          u'act': 3,
          u'mean': 10,
          u'stony': 1,
          u'burning': 3,
          u'No': 40,
          u'image': 8,
          u'Acting': 1,
          u'lively': 3,
          u'parties': 5,
          u'bubbly': 1,
          u'her': 651,
          u'lounging': 1,
          u'mindless': 1,
          u'sealed': 1,
          u'bubble': 2,
          u'tireless': 2,
          u'Harro': 1,
          u'yearned': 1,
          u'complete': 3,
          u'black-and-yellow': 1,
          u"child's": 3,
          u'Alternately': 1,
          u'foreheads': 1,
          u'Small': 3,
          u'unreliable': 1,
          u'aide': 1,
          u'with': 460,
          ...})

In [50]:
list(cfd['news'])


Out[50]:
[u'stock',
 u'sunbonnet',
 u'Elevated',
 u'narcotic',
 u'four',
 u'woods',
 u'railing',
 u'Until',
 u'aggression',
 u'marching',
 u'looking',
 u'eligible',
 u'electricity',
 u'$25-a-plate',
 u'consulate',
 u'Casey',
 u'all-county',
 u'Belgians',
 u'Western',
 u'1959-60',
 u'Duhagon',
 u'sinking',
 u'1,119',
 u'co-operation',
 u'Famed',
 u'regional',
 u'Charitable',
 u'appropriation',
 u'yellow',
 u'uncertain',
 u'Heights',
 u'bringing',
 u'prize',
 u'Loen',
 u'Publique',
 u'wooden',
 u'Loeb',
 u'963',
 u'specialties',
 u'Sands',
 u'succession',
 u'Paul',
 u'Phyfe',
 u'commented',
 u'Screw',
 u'charter',
 u'tired',
 u'pulse',
 u'tires',
 u'271',
 u'second',
 u'273',
 u'Pampa',
 u'DiVarco',
 u'Electra',
 u'errors',
 u'fall-off',
 u'forgetting',
 u'Initially',
 u'Lucille',
 u'boogie',
 u'contributed',
 u'Seekonk',
 u'Hamilton',
 u'designing',
 u'replaced',
 u'increasing',
 u'Presidential',
 u'hero',
 u'Sioux',
 u'whose',
 u'Munoz',
 u'Church',
 u'here',
 u'reported',
 u'affiliated',
 u'Footnotes',
 u'Stephanie',
 u'doldrums',
 u'cyclical',
 u'kids',
 u'Fernberger',
 u'elaborate',
 u'climbed',
 u'reports',
 u'controversy',
 u'Pierson',
 u'menu',
 u'military',
 u'Isles',
 u'Ervin',
 u'golden',
 u'Quincy',
 u'owed',
 u'geography',
 u'Harvey',
 u'explained',
 u'precincts',
 u'Three',
 u'replace',
 u'brought',
 u'beneficiaries',
 u'Basic',
 u'Wales',
 u'Basin',
 u'unit',
 u'opponents',
 u'Ronald',
 u'ominous',
 u'spoke',
 u'tardiness',
 u'Slate',
 u'hungry',
 u'Admitting',
 u'Anticipated',
 u'occupying',
 u'Vernon',
 u'Tex.',
 u'music',
 u'therefore',
 u'passport',
 u'staged',
 u'strike',
 u'heralded',
 u'until',
 u'Tudor',
 u'Stepanovich',
 u'females',
 u'Christine',
 u'successful',
 u'brings',
 u'whirling',
 u'Rule',
 u'99',
 u'Person',
 u'remembered',
 u'menaced',
 u'tying',
 u'90',
 u'hole',
 u'hold',
 u'95',
 u'circumstances',
 u'AID',
 u'locked',
 u'dreadful',
 u'Wilderness',
 u'Armond',
 u'homemakers',
 u'famed',
 u'accomplishment',
 u'Professors',
 u'Westphalia',
 u'temperatures',
 u'132,000',
 u'centralization',
 u'example',
 u'fumes',
 u'august',
 u'Tournament',
 u'La',
 u'household',
 u'artillery',
 u'organized',
 u'Briar',
 u'Smith-Colmer',
 u'currency',
 u'caution',
 u'reviewing',
 u'want',
 u'counseling',
 u'Whelan',
 u'arenas',
 u'absolute',
 u'preferably',
 u'hog',
 u'hoc',
 u'complaining',
 u'travel',
 u'drying',
 u'stuff',
 u'feature',
 u'Gardner',
 u'machine',
 u'how',
 u'hot',
 u'Delray',
 u'significance',
 u'Jussel',
 u'Stock',
 u'preferable',
 u"He'll",
 u'blue-uniformed',
 u'A',
 u'beauty',
 u'L.',
 u'Welfare',
 u'206',
 u'outlawed',
 u'Players',
 u'modest',
 u'Reese',
 u'destined',
 u'fourteen-team',
 u'sentencing',
 u'types',
 u'compartment',
 u'effective',
 u'occasions',
 u'down-payments',
 u'youths',
 u'romped',
 u'revolt',
 u'headquarters',
 u'Walkers',
 u'cabled',
 u'baggage',
 u'18th',
 u'4-7/8',
 u'Another',
 u'keeps',
 u'democratic',
 u'wing',
 u'wind',
 u'leisurely',
 u'Willy',
 u'senators',
 u'$840,000',
 u'welcomed',
 u'Edith',
 u'Housing',
 u'govern',
 u'vary',
 u'kickoff',
 u'intangible',
 u'Dussa',
 u'Toll',
 u'Ludwig',
 u'Commies',
 u'some',
 u'His',
 u'Hit',
 u'fit',
 u'revenue',
 u"Gardner's",
 u'secede',
 u'survivors',
 u'rescued',
 u'Harris',
 u'Barber',
 u'Palsy',
 u'hidden',
 u'county-wide',
 u'virtually',
 u'slate',
 u'vouchers',
 u'detachment',
 u'oks',
 u'effects',
 u'schools',
 u'shadows',
 u'yourself',
 u'undeveloped',
 u'silver',
 u'Macon',
 u'headboard',
 u'represents',
 u'debut',
 u"road's",
 u'crops',
 u'McCluskey',
 u'4-year-old',
 u'clientele',
 u'Seidel',
 u"employers'",
 u'Superior',
 u'preceded',
 u'financial',
 u'reputedly',
 u'series',
 u'finger-paint',
 u'Mongolia',
 u"NATO's",
 u'3-to-o',
 u'whiz',
 u"we'd",
 u'substantially',
 u'laboratory',
 u"House's",
 u'whip',
 u'borne',
 u'misfortune',
 u'drove',
 u'ten-concert',
 u'ha',
 u'Leatherman',
 u'freeze',
 u'5847',
 u'Charges',
 u'encourage',
 u'engineer',
 u'Super',
 u"Simpson's",
 u'Matisses',
 u'foundation',
 u'Word',
 u'Extension',
 u'sellout',
 u'looting',
 u'University',
 u'Work',
 u'threatened',
 u'3-to-3',
 u'3-run',
 u'element',
 u'checked',
 u'estimate',
 u'Pakistanis',
 u'substantiation',
 u'cornerstone',
 u'enormous',
 u'Hord',
 u'shelves',
 u'3:57',
 u'24-inch',
 u'shipped',
 u'musicians',
 u'speedy',
 u'coeds',
 u'Human',
 u'1970s',
 u'reserving',
 u'repealed',
 u'Espagnol',
 u'pastel-like',
 u'hearsay',
 u'Convair',
 u"Al's",
 u'Due',
 u'channels',
 u'wash',
 u"Santa's",
 u'$18.9',
 u'175',
 u'174',
 u'173',
 u'$18.2',
 u'Steelers',
 u'basketball',
 u'service',
 u'Lucy',
 u'engagement',
 u'Skyway',
 u'needed',
 u'Simmons',
 u'master',
 u'listed',
 u'Dumont',
 u'legs',
 u'bitter',
 u'ranging',
 u'listen',
 u'collapse',
 u'predictably',
 u'bounty',
 u'nolo',
 u'wisdom',
 u'advisement',
 u'Serving',
 u'defaulted',
 u'Richmond-Petersburg',
 u'Bertoia',
 u'peril',
 u'outlay',
 u'showed',
 u'elegant',
 u'Inna',
 u'Carroll',
 u'walloped',
 u'nations',
 u'project',
 u'idle',
 u'Ilona',
 u'skimmed',
 u'feeling',
 u'acquisition',
 u'Cody',
 u'Angelo',
 u'Vice',
 u'acclaim',
 u'entail',
 u'willingness',
 u'Chicago',
 u'Woodyard',
 u'Mullenax',
 u'craven',
 u"Lanin's",
 u'Gursel',
 u'Son',
 u'Angels',
 u'Williams',
 u'dozen',
 u'Then',
 u'concrete',
 u'bleachers',
 u'responsible',
 u'Myron',
 u'Band',
 u'recommended',
 u'absorbed',
 u'Minister',
 u'They',
 u'$100,000',
 u'Ask',
 u'grips',
 u'Missionary',
 u'Jones',
 u'Lynn',
 u'Wise',
 u'shall',
 u'Wish',
 u'object',
 u'vexing',
 u'debonair',
 u'affirmation',
 u'mouth',
 u'letter',
 u'conceded',
 u'delegate',
 u'putout',
 u'Galveston',
 u'episode',
 u'Texans',
 u'professor',
 u'camp',
 u'Bulloch',
 u'Journal-Bulletin',
 u'ruthless',
 u'Democrat',
 u'China',
 u'paneling',
 u'Mansion',
 u'detriment',
 u'nineteenth',
 u'mating',
 u'purged',
 u'incomplete',
 u'marvel',
 u'saying',
 u'signatures',
 u'bomb',
 u'reactor',
 u'Symonds',
 u'U-2',
 u'Union',
 u'Schenk',
 u'meetings',
 u'Agency',
 u'parolees',
 u'nominated',
 u'undue',
 u"Communism's",
 u'cooking',
 u'judgeship',
 u'Paradise',
 u'Congressional',
 u'Meyner',
 u'touches',
 u'busy',
 u'clicked',
 u'Extend',
 u'695',
 u'headline',
 u'buss',
 u'Moller',
 u'haze',
 u'appreciated',
 u'Tiao',
 u'theme',
 u'touched',
 u'rich',
 u'Darrow',
 u'submarine-ball',
 u'Klux',
 u'lady',
 u'plate',
 u'D.C.',
 u'cubic',
 u'professionals',
 u'Nevertheless',
 u'$16',
 u'untrammeled',
 u'pocket',
 u'$17',
 u"Ruth's",
 u'societies',
 u'Senators',
 u'greens',
 u'maverick',
 u'Rip',
 u'Rio',
 u'three-year',
 u'adjourned',
 u'release',
 u"leader's",
 u'U-I',
 u'generosity',
 u'respond',
 u'mandatory',
 u'disaster',
 u'fair',
 u'irritable',
 u'Bennington',
 u'reconvened',
 u'pads',
 u'Brevard',
 u'result',
 u'fail',
 u'mea',
 u'resigned',
 u'fastened',
 u'best',
 u'pricking',
 u'Emory',
 u"Braves'",
 u'lots',
 u'Heinkel',
 u'rings',
 u"'20's",
 u'224-170',
 u'pressures',
 u'score',
 u'Lockies',
 u'toolmaker',
 u'preserve',
 u'wage',
 u'redistricting',
 u'men',
 u"Meyner's",
 u'nationwide',
 u'nature',
 u'rolled',
 u'impetus',
 u'authorizing',
 u'lefthanders',
 u'Burbank',
 u'extent',
 u'Bronx',
 u'marines',
 u'roller',
 u'Capello',
 u"war's",
 u'accident',
 u'refinement',
 u'country',
 u'readers',
 u'demanded',
 u'Vacancy',
 u"today's",
 u'erupted',
 u'planned',
 u'logic',
 u'federalism',
 u'argue',
 u"High's",
 u'asked',
 u'30th',
 u'Apartment',
 u'liberal-conservative',
 u'Korman',
 u'25%',
 u'active',
 u'rapport',
 u'month-long',
 u'250',
 u'exports',
 u'255',
 u'relearns',
 u'Clarence',
 u'reconsideration',
 u'shouting',
 u'union',
 u'Curry',
 u'feathers',
 u'breakoff',
 u'.',
 u'Nischwitz',
 u'extraction',
 u'startled',
 u'stadium',
 u'Insofar',
 u'privilege',
 u'one-week-old',
 u'Flowers',
 u'dots',
 u'Precise',
 u'life',
 u'retrospect',
 u'Tokyo',
 u'worker',
 u'allotting',
 u'1,212,000',
 u'child',
 u'worked',
 u'Gloriana',
 u'Elected',
 u'Holmes',
 u'commerce',
 u'presidency',
 u'Chips',
 u'1671',
 u'employ',
 u'misconstrued',
 u'1213-15',
 u'Campbell',
 u"Gannon's",
 u'harvesting',
 u'Zurcher',
 u'played',
 u'Innumerable',
 u'conditioned',
 u'player',
 u'$1,800',
 u'eighteen',
 u'London-based',
 u'Courtney',
 u'Puerto',
 u'churchmen',
 u'doorman',
 u'specter',
 u'trusted',
 u'Phouma',
 u'damaged',
 u'recover',
 u"Dresbachs'",
 u'things',
 u'cumulative',
 u'rebellion',
 u'Newman',
 u'Cocktails',
 u'harmony',
 u'babies',
 u'bid',
 u'fairly',
 u'Budapest',
 u'3,399',
 u'updated',
 u'$9',
 u'Maybe',
 u'torpedoes',
 u'Angeles',
 u'photographers',
 u'Peking',
 u'5-to-2',
 u'5-to-3',
 u"Stevenses'",
 u'furlough',
 u'Loewe',
 u'vice-president',
 u'academic',
 u'skidding',
 u'echoes',
 u'corporate',
 u'Fazio',
 u'fittest',
 u'opinions',
 u'spurred',
 u'capitol',
 u'sleeps',
 u'Subsequent',
 u'distribute',
 u'1981',
 u'plight',
 u'rushing',
 u'succeeding',
 u'previous',
 u'Colonial',
 u'ham',
 u'duffer',
 u'Oscar',
 u'ease',
 u'Odell',
 u'had',
 u'ideal',
 u'Leonard',
 u'Mohammedanism',
 u'Connecticut',
 u'collections',
 u'easy',
 u'prison',
 u'has',
 u'hat',
 u'Channel',
 u'Apart',
 u'municipal',
 u'survival',
 u'disagreement',
 u'possible',
 u"rocket's",
 u'firmer',
 u'possibly',
 u'birth',
 u'Missouri',
 u'clustered',
 u'imposed',
 u'unique',
 u'$2,170',
 u'desire',
 u'sliced',
 u"bridegroom's",
 u'seaside',
 u'misled',
 u'steps',
 u'Shrove',
 u"court's",
 u'Further',
 u'continuation',
 u"Louis's",
 u'Warren',
 u'attorney',
 u'right',
 u'old',
 u'crowd',
 u'$1,000,000,000',
 u'creed',
 u'Expressways',
 u'crown',
 u'System',
 u'culpas',
 u'3,325',
 u'Conservation',
 u'Whatever',
 u'Noel',
 u'Between',
 u"symphony's",
 u'enemies',
 u'chorus',
 u'for',
 u'bottom',
 u'p.m.',
 u'contributing',
 u'continue',
 u'motorist',
 u'summoned',
 u'pondered',
 u'Heideman',
 u'Instant',
 u'Calls',
 u"ol'",
 u'dental',
 u'6,000',
 u'shifting',
 u'defensive',
 u'losing',
 u'brokerage',
 u'manufacturing',
 u'shaken',
 u'balking',
 u'benches',
 u'visitors',
 u'dollars',
 u'citizens',
 u'globetrotter',
 u'despair',
 u'stoked',
 u'lacked',
 u'slightly',
 u'meddle',
 u'match',
 u'consulting',
 u'statements',
 u'rationale',
 u'Blacks',
 u'yen',
 u'honeymoon',
 u'Scotland',
 u'son',
 u'freshman',
 u'Misses',
 u'one-fourth',
 u'raiser',
 u'raises',
 u'sow',
 u'stockholder',
 u'reducing',
 u'defendants',
 u'Hank',
 u'east',
 u'Gulf',
 u'lining',
 u'support',
 u'constantly',
 u'busy-work',
 u'Hand',
 u'symphony',
 u"boy's",
 u'peddlers',
 u'resulted',
 u'overhead',
 u'happy',
 u'Vernor',
 u'offer',
 u'understandably',
 u'forming',
 u'Completing',
 u'oil',
 u'talents',
 u'understandable',
 u"Field's",
 u'Couturier',
 u'delegation',
 u'duel',
 u'121',
 u"else's",
 u'Toni',
 u'inside',
 u'officiated',
 u'Waldorf-Astoria',
 u'County',
 u'engulfed',
 u'unanimous',
 u'Guests',
 u'Tony',
 u'Andy',
 u'Enrique',
 u'panels',
 u'Weatherford',
 u'Stallard',
 u'8,293',
 u'150',
 u'juvenile',
 u'adopt',
 u'liberal',
 u'154',
 u'Trooper',
 u'Six',
 u'proven',
 u'Letitia',
 u"''",
 u'Virgin',
 u'Squad',
 u'exist',
 u'Pittsboro',
 u'bats',
 u'Sid',
 u'segregationist',
 u'acacia',
 u'dealer',
 u'negotiations',
 u'McDaniel',
 u'protested',
 u'eventual',
 u'floor',
 u'Track',
 u'Possible',
 u'crowns',
 u'flood',
 u'Nolan',
 u'republic',
 u'Friend',
 u'ambitious',
 u'entomologist',
 u'Dame',
 u'smell',
 u'roll',
 u'steamship',
 u'124',
 u'intend',
 u'Merrill',
 u'Lenny',
 u'models',
 u'high-wage',
 u'Western-style',
 u'Luthuli',
 u'acquaintance',
 u'Arkansas',
 u'undersea',
 u'persecution',
 u'Fuhrmann',
 u'unmatched',
 u'Charley',
 u'reorganization',
 u"Mongolia's",
 u'Debutante',
 u"workers'",
 u'godliness',
 u'toll-road',
 u'Charles',
 u'Quaker',
 u"Leavitt's",
 u'7:30',
 u'time',
 u'push',
 u"Stephen's",
 u'Empire',
 u'gown',
 u'smelts',
 u'chain',
 u'Indians',
 u'Nicklaus',
 u'Nehf',
 u'theaters',
 u'645-acre',
 u'Wabash',
 u'Indiana',
 u'chair',
 u'$278,877,000',
 u'competition',
 u'Beyeler',
 u'ballet',
 u'92',
 u'8861',
 u'900-student',
 u'sweat-suits',
 u'shouldda',
 u'carpenters',
 u'Bahi',
 u'96',
 u'eliminate',
 u'Prize',
 u'recipe',
 u"Howsam's",
 u'choice',
 u'Lyle',
 u'alcoholics',
 u'mourn',
 u'Reedville',
 u'stays',
 u'southpaw',
 u'fifty',
 u'exact',
 u'minute',
 u'Tau',
 u'staffs',
 u'1.10.8',
 u'commentator',
 u'1.10.4',
 u'Supply',
 u'Tax',
 u'1.10.1',
 u'leave',
 u'solved',
 u'depositors',
 u'settle',
 u'team',
 u'prevent',
 u'spiritual',
 u'$80,738',
 u'M.',
 u'prediction',
 u'sign',
 u'Bldg.',
 u'3505o',
 u'soloists',
 u'ogled',
 u'Augusta',
 u'Lt.',
 u'Brady',
 u'Associations',
 u'Adamson',
 u'jeopardy',
 u'celebrated',
 u'realizing',
 u'melt',
 u'current',
 u'wayward',
 u'fifth',
 u'Southwest',
 u'boost',
 u'Lopez',
 u'Me',
 u'drafted',
 u'jury',
 u'funeral',
 u'understanding',
 u'guise',
 u'competed',
 u'well-springs',
 u'yards',
 u'address',
 u'alone',
 u'along',
 u'Bow',
 u'My',
 u'Godwin',
 u'Tyson',
 u'nitroglycerine',
 u'passengers',
 u'revenues',
 u'Associated',
 u'brilliant',
 u'saws',
 u'studied',
 u'wherever',
 u'Haddix',
 u'Casals',
 u'accomplished',
 u'studies',
 u'influx',
 u'tasks',
 ...]

In [52]:
cfd['romance']['could']


Out[52]:
193

In [ ]:
'''
条件频率分布是一个对许多 NLP 任务都有用的数据结构。表 2-4 总结了它们常用的方法。
表 2-4. NLTK 中的条件频率分布:定义、访问和可视化一个计数的条件频率分布的常用方法和习 惯用法
示例 描述
cfdist= ConditionalFreqDist(pairs) 从配对链表中创建条件频率分布
cfdist.conditions() 将条件按字母排序
c fdist[condition] 此条件下的频率分布
c fdist[condition][sample] 此条件下给定样 的频率
c fdist.tabulate() 为条件频率分布制表
cfdist.tabulate(samples, conditions) 指定样 和条件限制下制表
c fdist.plot() 为条件频率分布绘图
cfdist.plot(samples, conditions) 指定样 和条件限制下绘图
cfdist1 < cfdist2 测试样 在cfdist1中出现次数是否小于在cfdist2中出现次 数
'''

In [ ]:


In [ ]:


In [16]:
#2.5 WordNet 是面向语义的英语词典,类似与传统辞典,但具有更丰富的结构。
#NLTK 中包 括英语 WordNet,共有 155,287 个词和 117,659 个同义词集合。
from nltk.corpus import wordnet as wn

In [17]:
#因此,motorcar 只有一个可能的 义,它被定义为 car.n.01,car 的第一个名词意义。
#car.n.01 被称为 synset 或“同义词集”,意义相同的词(或“词条”)的集合
wn.synsets('motorcar')


Out[17]:
[Synset('car.n.01')]

In [20]:
#查看同义词集合
wn.synset('car.n.01').lemma_names()


Out[20]:
[u'car', u'auto', u'automobile', u'machine', u'motorcar']

In [22]:
#同义词集也有一些一 般的定义
wn.synset('car.n.01').definition()


Out[22]:
u'a motor vehicle with four wheels; usually propelled by an internal combustion engine'

In [23]:
#同义词集也有一些一般例句:
wn.synset('car.n.01').examples()


Out[23]:
[u'he needs a car to get to work']

In [24]:
csys = wn.synset('car.n.01')

In [25]:
csys.lemmas()


Out[25]:
[Lemma('car.n.01.car'),
 Lemma('car.n.01.auto'),
 Lemma('car.n.01.automobile'),
 Lemma('car.n.01.machine'),
 Lemma('car.n.01.motorcar')]

In [26]:
#car de 同义词集
asynsets = wn.synsets('car')

In [27]:
asynsets


Out[27]:
[Synset('car.n.01'),
 Synset('car.n.02'),
 Synset('car.n.03'),
 Synset('car.n.04'),
 Synset('cable_car.n.01')]

In [28]:
for synset in asynsets:
    print synset.lemma_names()


[u'car', u'auto', u'automobile', u'machine', u'motorcar']
[u'car', u'railcar', u'railway_car', u'railroad_car']
[u'car', u'gondola']
[u'car', u'elevator_car']
[u'cable_car', u'car']

In [29]:
#访问所有包 词 car 的词条
wn.lemmas('car')


Out[29]:
[Lemma('car.n.01.car'),
 Lemma('car.n.02.car'),
 Lemma('car.n.03.car'),
 Lemma('car.n.04.car'),
 Lemma('cable_car.n.01.car')]

In [30]:
#WordNet WordNet 的同义词集对应于抽象的概念,它们并不总是有对应的英语词汇。
#这些概念在 层次结构中相互联系在一起。一些概念也很一般,如实体、状态、事件;这些被称为独一无 二的根同义词集。
motorcar = wn.synset('car.n.01')

In [31]:
types_of_motorcar = motorcar.hyponyms()

In [32]:
types_of_motorcar[26]


Out[32]:
Synset('stanley_steamer.n.01')

In [34]:
sorted([lemma.name for synset in types_of_motorcar for lemma in synset.lemmas])


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-34-1790d4a6c389> in <module>()
----> 1 sorted([lemma.name for synset in types_of_motorcar for lemma in synset.lemmas])

TypeError: 'instancemethod' object is not iterable

In [36]:
#得到一个最一般的上位(或根上位)同义词集
motorcar.root_hypernyms()


Out[36]:
[Synset('entity.n.01')]

In [38]:
#NLTK 中便捷的图形化 WordNet浏览器:nltk.app.wordnet()。 沿着上位词与下位词之间的链接,探索 WordNet 的层次结构
#nltk.app.wordnet()