This example uses a dataset downloaded from https://www.opensubtitles.org/en/search/vip and the raw data at opus.lingfil.uu.se/OpenSubtitles2016/raw/en. Metadata such as title, actor, and director was scraped from IMDb and is not guaranteed to be complete. This example uses the 5000 most recent movies. The full archive (1.1 GB) is here.
The code does the following:
Be sure to install the following:
pip3 install scikit-learn
pip3 install pandas
pip3 install scipy
In [1]:
import pandas as pd
import sys
# Display the exact interpreter build used to run this notebook.
sys.version
Out[1]:
'3.6.3 |Anaconda, Inc.| (default, Oct 6 2017, 12:04:38) \n[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]'
In [2]:
import tempfile
import zipfile
import os.path
# Zipped corpus, resolved relative to the notebook's working directory.
zipFile = "./openSubtitles-5000.json.zip"
print("Unarchiving ...")
# mkdtemp() never removes the directory on its own, so the extracted file
# remains available to the cells that follow.
temp_dir = tempfile.mkdtemp()
# The context manager closes the archive even when extraction raises.
with zipfile.ZipFile(zipFile, 'r') as zip_ref:
    zip_ref.extractall(temp_dir)
# The archive is expected to contain a member with this exact name.
openSubtitlesFile = os.path.join(temp_dir, "openSubtitles-5000.json")
print("file unarchived to:" + openSubtitlesFile)
Unarchiving ...
file unarchived to:/var/folders/k1/ywpsl_ld2fj1bn5vp9bbgsr40000gn/T/tmp155tiu8f/openSubtitles-5000.json
In [31]:
import json
from sklearn.feature_extraction.text import CountVectorizer
#from log_progress import log_progress
# Default cap on the number of documents read from the corpus file.
maxDocsToload = 50000
# Filled as a side effect of iterating make_corpus(): one title per
# document read, in file order.
titles = []


def make_corpus(file, max_docs=None):
    """Lazily yield the 'Text' field of each document in a JSON-lines file.

    Each line of *file* is parsed as one JSON object. For every document
    read, its 'Title' (or '' when absent) is appended to the module-level
    ``titles`` list and its 'Text' (or '' when absent) is yielded. A progress
    counter is printed every 100 documents.

    Args:
        file: Path of the JSON-lines file to read.
        max_docs: Maximum number of documents to yield. ``None`` (the
            default) means use the module-level ``maxDocsToload``.

    Yields:
        The subtitle text of each document, as a string.
    """
    if max_docs is None:
        max_docs = maxDocsToload
    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(file, encoding="utf-8") as f:
        for i, line in enumerate(f):
            # Check the cap before processing so that exactly max_docs
            # documents are read (checking after the yield reads one extra).
            if i >= max_docs:
                break
            doc = json.loads(line)
            titles.append(doc.get('Title', ''))
            # Optional genre filter, disabled by default:
            # if 'Sci-Fi' not in doc.get('Genre', ''):
            #     continue
            if i % 100 == 0:
                print("%d " % i, end='')
            yield doc.get('Text', '')
# Bag-of-words settings:
#   - tokens are runs of at least three ASCII letters, English stop words dropped
#   - unigrams and bigrams are counted
#   - terms in fewer than 2 documents or in more than 75% of documents are ignored
#   - the vocabulary is capped at 50000 entries
count_vectorizer = CountVectorizer(
    analyzer="word",
    token_pattern="[a-zA-Z]{3,}",
    stop_words='english',
    ngram_range=(1,2),
    min_df=2,
    max_df=0.75,
    max_features=50000,
)

print("Starting load ...")
# The generator is consumed lazily by fit_transform, so documents are
# streamed from disk rather than held in a list first.
textGenerator = make_corpus(openSubtitlesFile)
term_freq_matrix = count_vectorizer.fit_transform(textGenerator)
print("Done.")

# Rows are documents, columns are vocabulary terms.
print("term_freq_matrix shape = %s" % (term_freq_matrix.shape,))
print("term_freq_matrix = \n%s" % term_freq_matrix)
Starting load ...
0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 Done.
term_freq_matrix shape = (5000, 50000)
term_freq_matrix =
(0, 43801) 1
(0, 14746) 1
(0, 44094) 1
(0, 21796) 1
(0, 4112) 1
(0, 10559) 1
(0, 17280) 1
(0, 34971) 1
(0, 38789) 1
(0, 9338) 1
(0, 29011) 1
(0, 31198) 1
(0, 49419) 1
(0, 3751) 1
(0, 9427) 1
(0, 46392) 1
(0, 24453) 1
(0, 27305) 1
(0, 24240) 1
(0, 21301) 1
(0, 25182) 1
(0, 48467) 1
(0, 26134) 1
(0, 36028) 1
(0, 41716) 1
: :
(4999, 6237) 1
(4999, 47667) 2
(4999, 12628) 1
(4999, 6734) 1
(4999, 22751) 1
(4999, 5372) 3
(4999, 19080) 1
(4999, 12840) 1
(4999, 3713) 1
(4999, 34455) 1
(4999, 33739) 1
(4999, 33125) 3
(4999, 4065) 1
(4999, 7763) 2
(4999, 33163) 1
(4999, 19771) 1
(4999, 36837) 2
(4999, 49888) 3
(4999, 27604) 3
(4999, 12677) 6
(4999, 1992) 11
(4999, 48757) 4
(4999, 5647) 4
(4999, 28853) 4
(4999, 47798) 2
In [32]:
print("Vocabulary length = ", len(count_vectorizer.vocabulary_))
word = "data"
# vocabulary_ maps each term to its column index in the term-frequency matrix.
rainingIndex = count_vectorizer.vocabulary_[word]
print("token index for \"%s\" = %d" % (word, rainingIndex))
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement when available and fall back for older installs.
# tolist() keeps feature_names a plain list of str in both cases.
if hasattr(count_vectorizer, "get_feature_names_out"):
    feature_names = count_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = count_vectorizer.get_feature_names()
print("feature_names[%d] = %s" % (rainingIndex, feature_names[rainingIndex]))
Vocabulary length = 50000
token index for "data" = 8419
feature_names[8419] = data
In [33]:
# Preview the first 1000 vocabulary entries. Slicing instead of indexing a
# fixed range avoids an IndexError when the vocabulary is smaller than that.
for i, name in enumerate(feature_names[:1000]):
    print("feature_names[%d] = %s" % (i, name))
feature_names[0] = aaaaah
feature_names[1] = aaaah
feature_names[2] = aaah
feature_names[3] = aaargh
feature_names[4] = aafrin
feature_names[5] = aagh
feature_names[6] = aah
feature_names[7] = aah aah
feature_names[8] = aah did
feature_names[9] = aah don
feature_names[10] = aah god
feature_names[11] = aah grunting
feature_names[12] = aah grunts
feature_names[13] = aah hey
feature_names[14] = aah okay
feature_names[15] = aargh
feature_names[16] = aaron
feature_names[17] = abaddon
feature_names[18] = abalone
feature_names[19] = abandon
feature_names[20] = abandoned
feature_names[21] = abandoning
feature_names[22] = abandonment
feature_names[23] = abba
feature_names[24] = abbas
feature_names[25] = abbey
feature_names[26] = abbi
feature_names[27] = abbie
feature_names[28] = abbott
feature_names[29] = abbs
feature_names[30] = abby
feature_names[31] = abby abby
feature_names[32] = abdomen
feature_names[33] = abdominal
feature_names[34] = abduct
feature_names[35] = abducted
feature_names[36] = abduction
feature_names[37] = abdul
feature_names[38] = abe
feature_names[39] = abed
feature_names[40] = abetting
feature_names[41] = abi
feature_names[42] = abide
feature_names[43] = abigail
feature_names[44] = abilities
feature_names[45] = ability
feature_names[46] = ablaze
feature_names[47] = able
feature_names[48] = able afford
feature_names[49] = able come
feature_names[50] = able control
feature_names[51] = able determine
feature_names[52] = able handle
feature_names[53] = able hear
feature_names[54] = able help
feature_names[55] = able hold
feature_names[56] = able identify
feature_names[57] = able know
feature_names[58] = able leave
feature_names[59] = able live
feature_names[60] = able look
feature_names[61] = able make
feature_names[62] = able pull
feature_names[63] = able reach
feature_names[64] = able save
feature_names[65] = able say
feature_names[66] = able sleep
feature_names[67] = able stop
feature_names[68] = able talk
feature_names[69] = able tell
feature_names[70] = able track
feature_names[71] = able use
feature_names[72] = able walk
feature_names[73] = able work
feature_names[74] = abnormal
feature_names[75] = aboard
feature_names[76] = abode
feature_names[77] = abomination
feature_names[78] = abort
feature_names[79] = aborted
feature_names[80] = abortion
feature_names[81] = abraham
feature_names[82] = abroad
feature_names[83] = abrupt
feature_names[84] = abruptly
feature_names[85] = abs
feature_names[86] = absence
feature_names[87] = absent
feature_names[88] = absolute
feature_names[89] = absolutely
feature_names[90] = absolutely absolutely
feature_names[91] = absolutely don
feature_names[92] = absolutely fine
feature_names[93] = absolutely idea
feature_names[94] = absolutely just
feature_names[95] = absolutely know
feature_names[96] = absolutely let
feature_names[97] = absolutely okay
feature_names[98] = absolutely right
feature_names[99] = absolutely sure
feature_names[100] = absolutely yeah
feature_names[101] = absolve
feature_names[102] = absorb
feature_names[103] = absorbed
feature_names[104] = absorbing
feature_names[105] = abstract
feature_names[106] = absurd
feature_names[107] = abu
feature_names[108] = abu omar
feature_names[109] = abuddin
feature_names[110] = abuela
feature_names[111] = abuelita
feature_names[112] = abundance
feature_names[113] = abuse
feature_names[114] = abused
feature_names[115] = abuser
feature_names[116] = abuses
feature_names[117] = abusing
feature_names[118] = abusive
feature_names[119] = abyss
feature_names[120] = academic
feature_names[121] = academics
feature_names[122] = academy
feature_names[123] = accelerant
feature_names[124] = accelerate
feature_names[125] = accelerated
feature_names[126] = accelerating
feature_names[127] = acceleration
feature_names[128] = accelerator
feature_names[129] = accent
feature_names[130] = accents
feature_names[131] = accept
feature_names[132] = accept apology
feature_names[133] = accept fact
feature_names[134] = acceptable
feature_names[135] = acceptance
feature_names[136] = accepted
feature_names[137] = accepting
feature_names[138] = accepts
feature_names[139] = access
feature_names[140] = accessed
feature_names[141] = accessible
feature_names[142] = accessing
feature_names[143] = accessories
feature_names[144] = accessory
feature_names[145] = accessory murder
feature_names[146] = accident
feature_names[147] = accident did
feature_names[148] = accident didn
feature_names[149] = accident don
feature_names[150] = accident happened
feature_names[151] = accident just
feature_names[152] = accident okay
feature_names[153] = accident right
feature_names[154] = accident wasn
feature_names[155] = accident yeah
feature_names[156] = accidental
feature_names[157] = accidentally
feature_names[158] = accidents
feature_names[159] = accommodate
feature_names[160] = accommodation
feature_names[161] = accommodations
feature_names[162] = accompanied
feature_names[163] = accompany
feature_names[164] = accomplice
feature_names[165] = accomplices
feature_names[166] = accomplish
feature_names[167] = accomplished
feature_names[168] = accomplishment
feature_names[169] = accomplishments
feature_names[170] = accord
feature_names[171] = accordance
feature_names[172] = according
feature_names[173] = according plan
feature_names[174] = accordingly
feature_names[175] = account
feature_names[176] = accountability
feature_names[177] = accountable
feature_names[178] = accountant
feature_names[179] = accountants
feature_names[180] = accounted
feature_names[181] = accounting
feature_names[182] = accounts
feature_names[183] = accuracy
feature_names[184] = accurate
feature_names[185] = accurately
feature_names[186] = accusation
feature_names[187] = accusations
feature_names[188] = accuse
feature_names[189] = accused
feature_names[190] = accusing
feature_names[191] = accustomed
feature_names[192] = ace
feature_names[193] = aces
feature_names[194] = ache
feature_names[195] = aches
feature_names[196] = achieve
feature_names[197] = achieved
feature_names[198] = achievement
feature_names[199] = achievements
feature_names[200] = achieving
feature_names[201] = achilles
feature_names[202] = aching
feature_names[203] = acid
feature_names[204] = acker
feature_names[205] = acknowledge
feature_names[206] = acknowledged
feature_names[207] = acknowledging
feature_names[208] = ackroyd
feature_names[209] = acoustic
feature_names[210] = acquaintance
feature_names[211] = acquaintances
feature_names[212] = acquainted
feature_names[213] = acquire
feature_names[214] = acquired
feature_names[215] = acquisition
feature_names[216] = acquitted
feature_names[217] = acres
feature_names[218] = act
feature_names[219] = act like
feature_names[220] = acted
feature_names[221] = acted like
feature_names[222] = acting
feature_names[223] = acting like
feature_names[224] = acting strange
feature_names[225] = acting weird
feature_names[226] = action
feature_names[227] = actions
feature_names[228] = activate
feature_names[229] = activated
feature_names[230] = activation
feature_names[231] = active
feature_names[232] = actively
feature_names[233] = activist
feature_names[234] = activists
feature_names[235] = activities
feature_names[236] = activity
feature_names[237] = actor
feature_names[238] = actors
feature_names[239] = actress
feature_names[240] = actresses
feature_names[241] = acts
feature_names[242] = acts like
feature_names[243] = actual
feature_names[244] = actually actually
feature_names[245] = actually bad
feature_names[246] = actually believe
feature_names[247] = actually better
feature_names[248] = actually called
feature_names[249] = actually came
feature_names[250] = actually care
feature_names[251] = actually come
feature_names[252] = actually did
feature_names[253] = actually didn
feature_names[254] = actually does
feature_names[255] = actually doing
feature_names[256] = actually don
feature_names[257] = actually feel
feature_names[258] = actually getting
feature_names[259] = actually going
feature_names[260] = actually gonna
feature_names[261] = actually good
feature_names[262] = actually got
feature_names[263] = actually great
feature_names[264] = actually happened
feature_names[265] = actually happening
feature_names[266] = actually haven
feature_names[267] = actually having
feature_names[268] = actually help
feature_names[269] = actually hoping
feature_names[270] = actually just
feature_names[271] = actually kind
feature_names[272] = actually know
feature_names[273] = actually let
feature_names[274] = actually like
feature_names[275] = actually little
feature_names[276] = actually live
feature_names[277] = actually look
feature_names[278] = actually looking
feature_names[279] = actually lot
feature_names[280] = actually love
feature_names[281] = actually make
feature_names[282] = actually makes
feature_names[283] = actually mean
feature_names[284] = actually met
feature_names[285] = actually need
feature_names[286] = actually nice
feature_names[287] = actually okay
feature_names[288] = actually pretty
feature_names[289] = actually quite
feature_names[290] = actually really
feature_names[291] = actually right
feature_names[292] = actually said
feature_names[293] = actually saw
feature_names[294] = actually say
feature_names[295] = actually sure
feature_names[296] = actually talk
feature_names[297] = actually talking
feature_names[298] = actually thing
feature_names[299] = actually think
feature_names[300] = actually thinking
feature_names[301] = actually thought
feature_names[302] = actually time
feature_names[303] = actually took
feature_names[304] = actually true
feature_names[305] = actually trying
feature_names[306] = actually use
feature_names[307] = actually want
feature_names[308] = actually wanted
feature_names[309] = actually wasn
feature_names[310] = actually way
feature_names[311] = actually went
feature_names[312] = actually work
feature_names[313] = actually worked
feature_names[314] = actually working
feature_names[315] = actually works
feature_names[316] = actually yeah
feature_names[317] = actually yes
feature_names[318] = acute
feature_names[319] = ada
feature_names[320] = adalind
feature_names[321] = adam
feature_names[322] = adam adam
feature_names[323] = adam don
feature_names[324] = adam got
feature_names[325] = adam just
feature_names[326] = adam levine
feature_names[327] = adam right
feature_names[328] = adam yeah
feature_names[329] = adamant
feature_names[330] = adamian
feature_names[331] = adams
feature_names[332] = adapt
feature_names[333] = adapted
feature_names[334] = add
feature_names[335] = add little
feature_names[336] = added
feature_names[337] = addict
feature_names[338] = addicted
feature_names[339] = addiction
feature_names[340] = addictive
feature_names[341] = addicts
feature_names[342] = adding
feature_names[343] = addition
feature_names[344] = additional
feature_names[345] = address
feature_names[346] = address right
feature_names[347] = addressed
feature_names[348] = addresses
feature_names[349] = addressing
feature_names[350] = adds
feature_names[351] = addy
feature_names[352] = adele
feature_names[353] = adequate
feature_names[354] = adhere
feature_names[355] = adi
feature_names[356] = adios
feature_names[357] = aditi
feature_names[358] = adjacent
feature_names[359] = adjourned
feature_names[360] = adjust
feature_names[361] = adjusted
feature_names[362] = adjusting
feature_names[363] = adjustment
feature_names[364] = adjustments
feature_names[365] = admin
feature_names[366] = administer
feature_names[367] = administered
feature_names[368] = administration
feature_names[369] = administrative
feature_names[370] = administrator
feature_names[371] = admirable
feature_names[372] = admiral
feature_names[373] = admiration
feature_names[374] = admire
feature_names[375] = admired
feature_names[376] = admirer
feature_names[377] = admiring
feature_names[378] = admissible
feature_names[379] = admission
feature_names[380] = admissions
feature_names[381] = admit
feature_names[382] = admit did
feature_names[383] = admit wrong
feature_names[384] = admits
feature_names[385] = admitted
feature_names[386] = admittedly
feature_names[387] = admitting
feature_names[388] = ado
feature_names[389] = adolf
feature_names[390] = adolf hitler
feature_names[391] = adopt
feature_names[392] = adopted
feature_names[393] = adopting
feature_names[394] = adoption
feature_names[395] = adorable
feature_names[396] = adore
feature_names[397] = adored
feature_names[398] = adrenaline
feature_names[399] = adrian
feature_names[400] = adriana
feature_names[401] = ads
feature_names[402] = adult
feature_names[403] = adult adam
feature_names[404] = adultery
feature_names[405] = adults
feature_names[406] = advance
feature_names[407] = advanced
feature_names[408] = advances
feature_names[409] = advancing
feature_names[410] = advantage
feature_names[411] = advantages
feature_names[412] = adventure
feature_names[413] = adventures
feature_names[414] = adventurous
feature_names[415] = adversary
feature_names[416] = adversity
feature_names[417] = advert
feature_names[418] = advertise
feature_names[419] = advertisement
feature_names[420] = advertising
feature_names[421] = advice
feature_names[422] = advice don
feature_names[423] = advice just
feature_names[424] = advise
feature_names[425] = advised
feature_names[426] = adviser
feature_names[427] = advising
feature_names[428] = advisor
feature_names[429] = advocate
feature_names[430] = aegeus
feature_names[431] = aerial
feature_names[432] = aeroplanes
feature_names[433] = aether
feature_names[434] = afar
feature_names[435] = affair
feature_names[436] = affairs
feature_names[437] = affect
feature_names[438] = affected
feature_names[439] = affecting
feature_names[440] = affection
feature_names[441] = affectionate
feature_names[442] = affects
feature_names[443] = affirmative
feature_names[444] = afford
feature_names[445] = afford lose
feature_names[446] = afford pay
feature_names[447] = affordable
feature_names[448] = afghan
feature_names[449] = afghanistan
feature_names[450] = afloat
feature_names[451] = afraid
feature_names[452] = afraid afraid
feature_names[453] = afraid come
feature_names[454] = afraid dark
feature_names[455] = afraid don
feature_names[456] = afraid going
feature_names[457] = afraid gonna
feature_names[458] = afraid just
feature_names[459] = afraid know
feature_names[460] = afraid let
feature_names[461] = afraid like
feature_names[462] = afraid losing
feature_names[463] = afraid say
feature_names[464] = afraid tell
feature_names[465] = afraid think
feature_names[466] = afraid won
feature_names[467] = africa
feature_names[468] = african
feature_names[469] = african american
feature_names[470] = afterlife
feature_names[471] = aftermath
feature_names[472] = afternoon
feature_names[473] = afternoon sir
feature_names[474] = afterward
feature_names[475] = agatha
feature_names[476] = age
feature_names[477] = age got
feature_names[478] = age just
feature_names[479] = age know
feature_names[480] = age old
feature_names[481] = aged
feature_names[482] = agencies
feature_names[483] = agency
feature_names[484] = agenda
feature_names[485] = agent
feature_names[486] = agent aubrey
feature_names[487] = agent booth
feature_names[488] = agent gibbs
feature_names[489] = agent hardy
feature_names[490] = agent keen
feature_names[491] = agent pride
feature_names[492] = agent ryan
feature_names[493] = agent thomas
feature_names[494] = agents
feature_names[495] = ages
feature_names[496] = ages ago
feature_names[497] = aggravated
feature_names[498] = aggravated assault
feature_names[499] = aggression
feature_names[500] = aggressive
feature_names[501] = aggressively
feature_names[502] = agh
feature_names[503] = agh agh
feature_names[504] = aging
feature_names[505] = agitated
feature_names[506] = agnes
feature_names[507] = agnew
feature_names[508] = ago
feature_names[509] = ago came
feature_names[510] = ago day
feature_names[511] = ago did
feature_names[512] = ago didn
feature_names[513] = ago doesn
feature_names[514] = ago don
feature_names[515] = ago going
feature_names[516] = ago gonna
feature_names[517] = ago good
feature_names[518] = ago got
feature_names[519] = ago happened
feature_names[520] = ago just
feature_names[521] = ago know
feature_names[522] = ago left
feature_names[523] = ago let
feature_names[524] = ago like
feature_names[525] = ago long
feature_names[526] = ago look
feature_names[527] = ago man
feature_names[528] = ago maybe
feature_names[529] = ago okay
feature_names[530] = ago really
feature_names[531] = ago remember
feature_names[532] = ago right
feature_names[533] = ago said
feature_names[534] = ago sorry
feature_names[535] = ago started
feature_names[536] = ago think
feature_names[537] = ago thought
feature_names[538] = ago time
feature_names[539] = ago told
feature_names[540] = ago wanted
feature_names[541] = ago way
feature_names[542] = ago went
feature_names[543] = ago yeah
feature_names[544] = ago years
feature_names[545] = ago yes
feature_names[546] = agonizing
feature_names[547] = agony
feature_names[548] = agos
feature_names[549] = agota
feature_names[550] = agree
feature_names[551] = agree agree
feature_names[552] = agree disagree
feature_names[553] = agree don
feature_names[554] = agree just
feature_names[555] = agree know
feature_names[556] = agree think
feature_names[557] = agreed
feature_names[558] = agreed come
feature_names[559] = agreed let
feature_names[560] = agreed meet
feature_names[561] = agreeing
feature_names[562] = agreement
feature_names[563] = agreements
feature_names[564] = agrees
feature_names[565] = agricultural
feature_names[566] = agriculture
feature_names[567] = agrinext
feature_names[568] = aguilera
feature_names[569] = aha
feature_names[570] = ahdu
feature_names[571] = ahead
feature_names[572] = ahead don
feature_names[573] = ahead going
feature_names[574] = ahead gonna
feature_names[575] = ahead got
feature_names[576] = ahead just
feature_names[577] = ahead let
feature_names[578] = ahead okay
feature_names[579] = ahead right
feature_names[580] = ahead say
feature_names[581] = ahead tell
feature_names[582] = ahead thank
feature_names[583] = ahead time
feature_names[584] = ahem
feature_names[585] = ahh
feature_names[586] = ahh ahh
feature_names[587] = ahhh
feature_names[588] = ahhhh
feature_names[589] = ahmad
feature_names[590] = ahmadi
feature_names[591] = ahmed
feature_names[592] = ahmos
feature_names[593] = ahn
feature_names[594] = ahold
feature_names[595] = aid
feature_names[596] = aid kit
feature_names[597] = aide
feature_names[598] = aiden
feature_names[599] = aiding
feature_names[600] = aiding abetting
feature_names[601] = aids
feature_names[602] = aife
feature_names[603] = aigoo
feature_names[604] = aim
feature_names[605] = aimed
feature_names[606] = aiming
feature_names[607] = ain
feature_names[608] = ain ain
feature_names[609] = ain bad
feature_names[610] = ain coming
feature_names[611] = ain doing
feature_names[612] = ain easy
feature_names[613] = ain getting
feature_names[614] = ain going
feature_names[615] = ain gonna
feature_names[616] = ain good
feature_names[617] = ain got
feature_names[618] = ain just
feature_names[619] = ain like
feature_names[620] = ain nothin
feature_names[621] = ain right
feature_names[622] = ain seen
feature_names[623] = ain way
feature_names[624] = air
feature_names[625] = air air
feature_names[626] = air breathe
feature_names[627] = air conditioning
feature_names[628] = air date
feature_names[629] = air don
feature_names[630] = air force
feature_names[631] = air hissing
feature_names[632] = air know
feature_names[633] = air pollution
feature_names[634] = airborne
feature_names[635] = aircraft
feature_names[636] = aircraft engines
feature_names[637] = airfield
feature_names[638] = airline
feature_names[639] = airlines
feature_names[640] = airlock
feature_names[641] = airplane
feature_names[642] = airport
feature_names[643] = airports
feature_names[644] = airs
feature_names[645] = airspace
feature_names[646] = airtight
feature_names[647] = airway
feature_names[648] = aisha
feature_names[649] = aisle
feature_names[650] = ajay
feature_names[651] = ajj
feature_names[652] = aka
feature_names[653] = akalitus
feature_names[654] = akbar
feature_names[655] = akeela
feature_names[656] = akio
feature_names[657] = aktaion
feature_names[658] = alabama
feature_names[659] = alaikum
feature_names[660] = alak
feature_names[661] = alamo
feature_names[662] = alan
feature_names[663] = alan alan
feature_names[664] = alana
feature_names[665] = alaric
feature_names[666] = alarm
feature_names[667] = alarm beeping
feature_names[668] = alarm beeps
feature_names[669] = alarm blares
feature_names[670] = alarm blaring
feature_names[671] = alarm chirps
feature_names[672] = alarm continues
feature_names[673] = alarm ringing
feature_names[674] = alarm sounding
feature_names[675] = alarm sounds
feature_names[676] = alarm stops
feature_names[677] = alarm wailing
feature_names[678] = alarmed
feature_names[679] = alarming
feature_names[680] = alarms
feature_names[681] = alas
feature_names[682] = alaska
feature_names[683] = alatriste
feature_names[684] = alba
feature_names[685] = albany
feature_names[686] = albeit
feature_names[687] = albert
feature_names[688] = albert einstein
feature_names[689] = alberto
feature_names[690] = album
feature_names[691] = albums
feature_names[692] = albuquerque
feature_names[693] = alcohol
feature_names[694] = alcoholic
feature_names[695] = alden
feature_names[696] = aldnoah
feature_names[697] = ale
feature_names[698] = alec
feature_names[699] = aleister
feature_names[700] = alejandro
feature_names[701] = alert
feature_names[702] = alerted
feature_names[703] = alerts
feature_names[704] = alex
feature_names[705] = alex alex
feature_names[706] = alex don
feature_names[707] = alex just
feature_names[708] = alex know
feature_names[709] = alex need
feature_names[710] = alex okay
feature_names[711] = alexa
feature_names[712] = alexander
feature_names[713] = alexandra
feature_names[714] = alexandria
feature_names[715] = alexis
feature_names[716] = alfie
feature_names[717] = alfred
feature_names[718] = alfredo
feature_names[719] = algebra
feature_names[720] = algorithm
feature_names[721] = algorithm just
feature_names[722] = algorithms
feature_names[723] = ali
feature_names[724] = alias
feature_names[725] = aliases
feature_names[726] = alibi
feature_names[727] = alibis
feature_names[728] = alice
feature_names[729] = alicia
feature_names[730] = alicia florrick
feature_names[731] = alien
feature_names[732] = aliens
feature_names[733] = align
feature_names[734] = aligned
feature_names[735] = alike
feature_names[736] = alimony
feature_names[737] = alison
feature_names[738] = alison dilaurentis
feature_names[739] = alistair
feature_names[740] = alive
feature_names[741] = alive alive
feature_names[742] = alive come
feature_names[743] = alive damn
feature_names[744] = alive dead
feature_names[745] = alive did
feature_names[746] = alive didn
feature_names[747] = alive don
feature_names[748] = alive got
feature_names[749] = alive just
feature_names[750] = alive know
feature_names[751] = alive let
feature_names[752] = alive long
feature_names[753] = alive need
feature_names[754] = alive okay
feature_names[755] = alive right
feature_names[756] = alive time
feature_names[757] = alive today
feature_names[758] = alive want
feature_names[759] = alive yeah
feature_names[760] = alive yes
feature_names[761] = aliyah
feature_names[762] = allah
feature_names[763] = allahu
feature_names[764] = allahu akbar
feature_names[765] = allan
feature_names[766] = allegation
feature_names[767] = allegations
feature_names[768] = alleged
feature_names[769] = allegedly
feature_names[770] = allegiance
feature_names[771] = allen
feature_names[772] = allerdyce
feature_names[773] = allergic
feature_names[774] = allergic reaction
feature_names[775] = allergies
feature_names[776] = allergy
feature_names[777] = alleviate
feature_names[778] = alley
feature_names[779] = alleys
feature_names[780] = alleyway
feature_names[781] = alliance
feature_names[782] = alliances
feature_names[783] = allie
feature_names[784] = allied
feature_names[785] = allies
feature_names[786] = alligator
feature_names[787] = allison
feature_names[788] = allow
feature_names[789] = allow introduce
feature_names[790] = allowance
feature_names[791] = allowed
feature_names[792] = allowed say
feature_names[793] = allowed talk
feature_names[794] = allowing
feature_names[795] = allows
feature_names[796] = alloy
feature_names[797] = allright
feature_names[798] = allsafe
feature_names[799] = ally
feature_names[800] = allyson
feature_names[801] = alma
feature_names[802] = almighty
feature_names[803] = almond
feature_names[804] = almond milk
feature_names[805] = almonds
feature_names[806] = almonte
feature_names[807] = almy
feature_names[808] = aloha
feature_names[809] = alongside
feature_names[810] = alonso
feature_names[811] = alonzo
feature_names[812] = aloud
feature_names[813] = alpha
feature_names[814] = alphabet
feature_names[815] = alphas
feature_names[816] = alqu
feature_names[817] = alqu zar
feature_names[818] = alright
feature_names[819] = alright alright
feature_names[820] = alright come
feature_names[821] = alright don
feature_names[822] = alright going
feature_names[823] = alright gonna
feature_names[824] = alright good
feature_names[825] = alright got
feature_names[826] = alright hey
feature_names[827] = alright just
feature_names[828] = alright know
feature_names[829] = alright let
feature_names[830] = alright listen
feature_names[831] = alright okay
feature_names[832] = alright yeah
feature_names[833] = alright yes
feature_names[834] = als
feature_names[835] = alt
feature_names[836] = altar
feature_names[837] = alter
feature_names[838] = altercation
feature_names[839] = altered
feature_names[840] = altering
feature_names[841] = alternate
feature_names[842] = alternative
feature_names[843] = alternatives
feature_names[844] = altitude
feature_names[845] = altogether
feature_names[846] = alton
feature_names[847] = aluminum
feature_names[848] = alumni
feature_names[849] = alvarez
feature_names[850] = alvin
feature_names[851] = alvis
feature_names[852] = aly
feature_names[853] = alyssa
feature_names[854] = alzate
feature_names[855] = alzheimer
feature_names[856] = ama
feature_names[857] = amanda
feature_names[858] = amanda amanda
feature_names[859] = amanda clarke
feature_names[860] = amantha
feature_names[861] = amara
feature_names[862] = amaro
feature_names[863] = amateur
feature_names[864] = amateurs
feature_names[865] = amaze
feature_names[866] = amazed
feature_names[867] = amazing
feature_names[868] = amazing amazing
feature_names[869] = amazing did
feature_names[870] = amazing don
feature_names[871] = amazing grace
feature_names[872] = amazing job
feature_names[873] = amazing just
feature_names[874] = amazing know
feature_names[875] = amazing like
feature_names[876] = amazing look
feature_names[877] = amazing race
feature_names[878] = amazing really
feature_names[879] = amazing right
feature_names[880] = amazing thank
feature_names[881] = amazing thing
feature_names[882] = amazing things
feature_names[883] = amazing think
feature_names[884] = amazing woman
feature_names[885] = amazing yeah
feature_names[886] = amazingly
feature_names[887] = amazon
feature_names[888] = ambassador
feature_names[889] = amber
feature_names[890] = amber alert
feature_names[891] = ambien
feature_names[892] = ambiguous
feature_names[893] = ambition
feature_names[894] = ambitions
feature_names[895] = ambitious
feature_names[896] = ambo
feature_names[897] = ambulance
feature_names[898] = ambulance way
feature_names[899] = ambulances
feature_names[900] = ambush
feature_names[901] = ambushed
feature_names[902] = amc
feature_names[903] = amelia
feature_names[904] = amen
feature_names[905] = amen amen
feature_names[906] = amendment
feature_names[907] = amends
feature_names[908] = america
feature_names[909] = america saved
feature_names[910] = america works
feature_names[911] = american
feature_names[912] = american accent
feature_names[913] = american citizen
feature_names[914] = american dream
feature_names[915] = american highway
feature_names[916] = american people
feature_names[917] = american soil
feature_names[918] = americans
feature_names[919] = americas
feature_names[920] = ames
feature_names[921] = amethysts
feature_names[922] = amidst
feature_names[923] = amigo
feature_names[924] = amin
feature_names[925] = aminata
feature_names[926] = amir
feature_names[927] = amish
feature_names[928] = amiss
feature_names[929] = ammo
feature_names[930] = ammonia
feature_names[931] = ammunition
feature_names[932] = amnesia
feature_names[933] = amor
feature_names[934] = amos
feature_names[935] = amounts
feature_names[936] = amp
feature_names[937] = amphora
feature_names[938] = ample
feature_names[939] = amplified
feature_names[940] = amram
feature_names[941] = amsterdam
feature_names[942] = amulet
feature_names[943] = amulets
feature_names[944] = amuse
feature_names[945] = amused
feature_names[946] = amusement
feature_names[947] = amusing
feature_names[948] = amy
feature_names[949] = amy amy
feature_names[950] = amy just
feature_names[951] = amy winehouse
feature_names[952] = ana
feature_names[953] = anal
feature_names[954] = analogy
feature_names[955] = analysis
feature_names[956] = analyst
feature_names[957] = analysts
feature_names[958] = analytical
feature_names[959] = analyze
feature_names[960] = analyzed
feature_names[961] = analyzing
feature_names[962] = anandi
feature_names[963] = anaphylaxis
feature_names[964] = anarchy
feature_names[965] = anatomy
feature_names[966] = ancestor
feature_names[967] = ancestors
feature_names[968] = ancestral
feature_names[969] = anchor
feature_names[970] = ancient
feature_names[971] = ancient astronaut
feature_names[972] = ancient history
feature_names[973] = ancient world
feature_names[974] = ancients
feature_names[975] = anders
feature_names[976] = andersen
feature_names[977] = anderson
feature_names[978] = anderssen
feature_names[979] = andes
feature_names[980] = andi
feature_names[981] = andie
feature_names[982] = andit
feature_names[983] = andr
feature_names[984] = andre
feature_names[985] = andrea
feature_names[986] = andreas
feature_names[987] = andrew
feature_names[988] = andrews
feature_names[989] = android
feature_names[990] = andy
feature_names[991] = andy andy
feature_names[992] = anesthesia
feature_names[993] = anesthetic
feature_names[994] = aneurysm
feature_names[995] = anew
feature_names[996] = ang
feature_names[997] = angel
feature_names[998] = angela
feature_names[999] = angeles
In [34]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw term counts with TF-IDF; norm="l2" rescales every row
# (document) of the result to unit Euclidean length.
tfidf = TfidfTransformer(norm="l2")

# Learn the IDF weights from term_freq_matrix and apply them to that same
# matrix in a single step.
tf_idf_matrix = tfidf.fit_transform(term_freq_matrix)
print(tf_idf_matrix)
(0, 1052) 0.0959520615105
(0, 47252) 0.0045643573485
(0, 46375) 0.00987768752281
(0, 47798) 0.0159646781073
(0, 36780) 0.00584869223708
(0, 888) 0.00733040914588
(0, 28853) 0.0133945486906
(0, 36925) 0.460728096203
(0, 11666) 0.0747220017172
(0, 48495) 0.00350520687676
(0, 2586) 0.0213489141115
(0, 5647) 0.00946811293935
(0, 44936) 0.00973571035319
(0, 28826) 0.0112220717293
(0, 33208) 0.0129756461679
(0, 48757) 0.0127876247595
(0, 1992) 0.0101274111868
(0, 3614) 0.0111653214653
(0, 12677) 0.0154170967102
(0, 21158) 0.500197844055
(0, 19051) 0.012990005648
(0, 37797) 0.033066465323
(0, 41887) 0.0100586723564
(0, 27604) 0.00969729004908
(0, 49888) 0.00572522605098
: :
(4999, 5359) 0.0121866541623
(4999, 40315) 0.0109134798469
(4999, 1533) 0.0121131749278
(4999, 42664) 0.0104026648556
(4999, 49858) 0.0378074287042
(4999, 5875) 0.137630935462
(4999, 18612) 0.0107636391144
(4999, 1070) 0.0243733083245
(4999, 31763) 0.0267838081578
(4999, 5436) 0.0136920306161
(4999, 34713) 0.0693071951858
(4999, 14450) 0.0121131749278
(4999, 43992) 0.0224142545624
(4999, 43598) 0.0114477997751
(4999, 48550) 0.0114997220155
(4999, 12720) 0.0121131749278
(4999, 8034) 0.0123424948028
(4999, 43687) 0.0119080413094
(4999, 3746) 0.0121131749278
(4999, 41678) 0.0116079147722
(4999, 26718) 0.0122630156602
(4999, 41457) 0.0122630156602
(4999, 44465) 0.0118442005531
(4999, 44831) 0.0127973355885
(4999, 39416) 0.0120423682656
In [58]:
%%time
from sklearn.cluster import KMeans,MiniBatchKMeans
import numpy
num_clusters = 5
#km = KMeans(n_clusters=num_clusters, verbose=True, init='k-means++', n_init=3, n_jobs=-1)
km = MiniBatchKMeans(n_clusters=num_clusters, verbose=True, init='k-means++', n_init=25, batch_size=2000)
km.fit(tf_idf_matrix)
clusters = km.labels_.tolist()
print ("cluster id for each document = %s" % clusters)
print()
# sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
Init 1/25 with method: k-means++
Inertia for init 1/25: 4784.462815
Init 2/25 with method: k-means++
Inertia for init 2/25: 4796.250436
Init 3/25 with method: k-means++
Inertia for init 3/25: 4784.292116
Init 4/25 with method: k-means++
Inertia for init 4/25: 4786.645619
Init 5/25 with method: k-means++
Inertia for init 5/25: 4798.048409
Init 6/25 with method: k-means++
Inertia for init 6/25: 4777.020875
Init 7/25 with method: k-means++
Inertia for init 7/25: 4791.242440
Init 8/25 with method: k-means++
Inertia for init 8/25: 4798.643628
Init 9/25 with method: k-means++
Inertia for init 9/25: 4794.832302
Init 10/25 with method: k-means++
Inertia for init 10/25: 4789.196281
Init 11/25 with method: k-means++
Inertia for init 11/25: 4783.763361
Init 12/25 with method: k-means++
Inertia for init 12/25: 4793.041913
Init 13/25 with method: k-means++
Inertia for init 13/25: 4794.125226
Init 14/25 with method: k-means++
Inertia for init 14/25: 4792.201854
Init 15/25 with method: k-means++
Inertia for init 15/25: 4796.190211
Init 16/25 with method: k-means++
Inertia for init 16/25: 4796.556150
Init 17/25 with method: k-means++
Inertia for init 17/25: 4784.365138
Init 18/25 with method: k-means++
Inertia for init 18/25: 4785.308107
Init 19/25 with method: k-means++
Inertia for init 19/25: 4788.862678
Init 20/25 with method: k-means++
Inertia for init 20/25: 4785.921268
Init 21/25 with method: k-means++
Inertia for init 21/25: 4796.252422
Init 22/25 with method: k-means++
Inertia for init 22/25: 4793.658573
Init 23/25 with method: k-means++
Inertia for init 23/25: 4787.189239
Init 24/25 with method: k-means++
Inertia for init 24/25: 4797.519752
Init 25/25 with method: k-means++
Inertia for init 25/25: 4791.961419
Minibatch iteration 1/300: mean batch inertia: 0.958124, ewa inertia: 0.958124
Minibatch iteration 2/300: mean batch inertia: 0.955533, ewa inertia: 0.956052
Minibatch iteration 3/300: mean batch inertia: 0.956102, ewa inertia: 0.956092
Minibatch iteration 4/300: mean batch inertia: 0.955150, ewa inertia: 0.955338
Minibatch iteration 5/300: mean batch inertia: 0.955839, ewa inertia: 0.955739
Minibatch iteration 6/300: mean batch inertia: 0.955031, ewa inertia: 0.955172
Minibatch iteration 7/300: mean batch inertia: 0.955068, ewa inertia: 0.955089
Minibatch iteration 8/300: mean batch inertia: 0.955063, ewa inertia: 0.955068
Minibatch iteration 9/300: mean batch inertia: 0.954158, ewa inertia: 0.954340
Minibatch iteration 10/300: mean batch inertia: 0.954057, ewa inertia: 0.954114
Minibatch iteration 11/300: mean batch inertia: 0.955130, ewa inertia: 0.954927
Minibatch iteration 12/300: mean batch inertia: 0.953941, ewa inertia: 0.954138
Minibatch iteration 13/300: mean batch inertia: 0.955071, ewa inertia: 0.954884
Minibatch iteration 14/300: mean batch inertia: 0.954265, ewa inertia: 0.954389
Minibatch iteration 15/300: mean batch inertia: 0.954184, ewa inertia: 0.954225
Minibatch iteration 16/300: mean batch inertia: 0.954633, ewa inertia: 0.954551
Minibatch iteration 17/300: mean batch inertia: 0.954300, ewa inertia: 0.954350
Minibatch iteration 18/300: mean batch inertia: 0.954806, ewa inertia: 0.954715
Minibatch iteration 19/300: mean batch inertia: 0.954108, ewa inertia: 0.954229
Minibatch iteration 20/300: mean batch inertia: 0.953406, ewa inertia: 0.953571
Minibatch iteration 21/300: mean batch inertia: 0.954312, ewa inertia: 0.954164
Minibatch iteration 22/300: mean batch inertia: 0.953547, ewa inertia: 0.953671
Minibatch iteration 23/300: mean batch inertia: 0.953828, ewa inertia: 0.953797
Minibatch iteration 24/300: mean batch inertia: 0.954492, ewa inertia: 0.954353
Minibatch iteration 25/300: mean batch inertia: 0.953573, ewa inertia: 0.953729
Minibatch iteration 26/300: mean batch inertia: 0.953372, ewa inertia: 0.953443
Minibatch iteration 27/300: mean batch inertia: 0.954045, ewa inertia: 0.953924
Minibatch iteration 28/300: mean batch inertia: 0.953677, ewa inertia: 0.953726
Minibatch iteration 29/300: mean batch inertia: 0.954005, ewa inertia: 0.953950
Minibatch iteration 30/300: mean batch inertia: 0.954236, ewa inertia: 0.954179
Minibatch iteration 31/300: mean batch inertia: 0.954736, ewa inertia: 0.954625
Minibatch iteration 32/300: mean batch inertia: 0.953776, ewa inertia: 0.953946
Minibatch iteration 33/300: mean batch inertia: 0.954172, ewa inertia: 0.954127
Minibatch iteration 34/300: mean batch inertia: 0.954216, ewa inertia: 0.954198
Minibatch iteration 35/300: mean batch inertia: 0.953835, ewa inertia: 0.953907
Minibatch iteration 36/300: mean batch inertia: 0.953866, ewa inertia: 0.953874
Converged (lack of improvement in inertia) at iteration 36/300
Computing label assignment and total inertia
cluster id for each document = [0, 3, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 3, 4, 1, 4, 0, 3, 1, 3, 3, 2, 1, 0, 0, 1, 3, 4, 1, 1, 0, 1, 4, 1, 0, 0, 2, 1, 1, 1, 4, 3, 4, 3, 1, 1, 0, 0, 1, 0, 3, 1, 1, 1, 3, 1, 1, 1, 1, 0, 3, 3, 3, 0, 4, 0, 4, 0, 0, 0, 1, 2, 1, 3, 2, 0, 4, 0, 1, 4, 3, 2, 3, 2, 3, 3, 0, 3, 1, 2, 0, 0, 4, 4, 3, 1, 4, 2, 2, 0, 0, 3, 0, 3, 0, 1, 1, 0, 0, 1, 0, 1, 0, 4, 4, 4, 4, 4, 4, 4, 2, 0, 1, 1, 1, 0, 0, 2, 4, 0, 1, 0, 0, 3, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 0, 1, 4, 1, 1, 0, 0, 4, 0, 3, 4, 0, 3, 0, 1, 2, 0, 4, 4, 3, 0, 3, 2, 0, 3, 3, 3, 1, 4, 4, 3, 0, 0, 4, 4, 0, 3, 4, 4, 4, 1, 4, 1, 0, 1, 3, 4, 1, 0, 0, 3, 3, 1, 0, 2, 3, 0, 2, 0, 1, 1, 3, 4, 1, 1, 0, 4, 0, 0, 1, 0, 0, 0, 0, 3, 1, 3, 2, 1, 0, 1, 0, 1, 1, 1, 3, 0, 2, 1, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 2, 2, 2, 2, 0, 2, 2, 2, 2, 1, 2, 2, 4, 4, 4, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 3, 1, 0, 2, 0, 1, 3, 4, 4, 3, 0, 2, 4, 4, 4, 4, 4, 4, 4, 2, 2, 0, 0, 0, 4, 2, 0, 1, 4, 4, 4, 0, 3, 2, 2, 1, 4, 0, 2, 2, 2, 0, 4, 0, 0, 4, 3, 4, 4, 4, 0, 1, 0, 3, 3, 3, 3, 3, 0, 0, 3, 0, 0, 3, 3, 0, 3, 0, 3, 1, 2, 2, 2, 4, 1, 0, 2, 2, 0, 0, 0, 0, 3, 3, 4, 2, 4, 3, 3, 4, 4, 3, 3, 3, 3, 0, 0, 3, 3, 0, 3, 4, 2, 0, 1, 0, 0, 0, 4, 2, 2, 1, 0, 0, 2, 3, 3, 3, 0, 0, 0, 0, 3, 0, 3, 3, 3, 4, 4, 2, 1, 4, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 1, 1, 1, 3, 1, 1, 0, 1, 2, 2, 3, 1, 1, 2, 0, 0, 0, 0, 0, 1, 0, 1, 3, 4, 4, 4, 0, 3, 3, 1, 0, 0, 4, 3, 1, 4, 0, 3, 4, 2, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 2, 0, 0, 0, 0, 3, 3, 2, 0, 2, 0, 0, 2, 1, 3, 3, 2, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 3, 3, 3, 0, 0, 0, 3, 1, 0, 4, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 2, 0, 0, 0, 3, 3, 0, 3, 0, 3, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 0, 0, 0, 0, 2, 2, 3, 0, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 1, 3, 4, 4, 
4, 4, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 3, 3, 1, 3, 2, 1, 2, 0, 0, 2, 0, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 2, 2, 2, 2, 1, 0, 3, 3, 0, 3, 1, 1, 1, 1, 1, 1, 4, 3, 2, 0, 4, 1, 1, 4, 3, 4, 3, 0, 2, 2, 2, 0, 4, 2, 0, 1, 1, 1, 1, 2, 0, 3, 0, 0, 0, 0, 1, 4, 4, 4, 3, 0, 1, 4, 1, 0, 4, 0, 2, 1, 1, 3, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 0, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 4, 4, 2, 1, 2, 4, 4, 4, 4, 4, 4, 4, 4, 3, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 4, 2, 2, 2, 3, 3, 2, 0, 1, 4, 0, 0, 0, 1, 3, 3, 4, 2, 3, 2, 2, 2, 2, 2, 2, 0, 0, 2, 3, 3, 3, 2, 0, 0, 0, 0, 3, 2, 3, 0, 2, 1, 1, 0, 4, 4, 2, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 4, 0, 0, 3, 0, 3, 0, 4, 2, 2, 2, 3, 0, 2, 0, 0, 3, 2, 0, 1, 0, 1, 4, 4, 4, 1, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 1, 3, 0, 3, 3, 3, 1, 4, 4, 3, 0, 1, 1, 2, 3, 3, 3, 2, 2, 3, 3, 0, 0, 0, 0, 0, 1, 1, 0, 0, 3, 2, 3, 0, 4, 4, 4, 4, 3, 0, 3, 4, 2, 2, 2, 2, 2, 0, 3, 3, 2, 3, 0, 2, 3, 3, 0, 2, 2, 2, 1, 1, 1, 0, 4, 1, 1, 2, 1, 0, 1, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 0, 1, 3, 2, 2, 1, 3, 3, 3, 0, 1, 2, 4, 3, 1, 2, 3, 2, 2, 0, 3, 4, 3, 1, 2, 1, 2, 0, 0, 3, 2, 4, 4, 1, 1, 2, 0, 0, 0, 3, 2, 0, 2, 2, 0, 0, 0, 0, 1, 1, 0, 3, 2, 3, 3, 2, 0, 3, 0, 0, 0, 2, 1, 2, 2, 3, 0, 1, 0, 0, 0, 2, 0, 4, 4, 4, 0, 4, 4, 0, 1, 3, 3, 0, 0, 0, 1, 0, 4, 3, 3, 4, 2, 0, 0, 0, 0, 2, 0, 4, 0, 0, 0, 0, 3, 2, 1, 2, 2, 2, 2, 2, 0, 0, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0, 0, 1, 1, 0, 0, 2, 0, 2, 2, 2, 3, 2, 2, 3, 2, 0, 2, 2, 3, 3, 3, 2, 3, 2, 2, 2, 1, 1, 1, 0, 4, 1, 2, 1, 1, 3, 0, 3, 0, 0, 2, 0, 0, 0, 0, 2, 3, 2, 2, 0, 4, 0, 2, 2, 0, 0, 0, 1, 0, 0, 3, 0, 0, 2, 4, 4, 0, 3, 2, 0, 3, 1, 2, 2, 0, 0, 1, 3, 0, 1, 0, 0, 3, 3, 3, 2, 4, 0, 0, 0, 3, 4, 4, 4, 0, 1, 4, 1, 3, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 0, 0, 0, 0, 2, 2, 4, 2, 0, 3, 2, 2, 2, 0, 3, 2, 2, 0, 2, 2, 2, 3, 2, 3, 2, 2, 2, 2, 2, 3, 2, 2, 3, 3, 1, 2, 3, 2, 4, 0, 0, 0, 2, 4, 3, 0, 2, 3, 3, 3, 3, 2, 3, 3, 3, 2, 3, 2, 3, 3, 0, 3, 4, 3, 3, 3, 3, 1, 
3, 4, 2, 3, 1, 2, 2, 2, 0, 0, 0, 2, 2, 0, 2, 4, 4, 3, 4, 4, 4, 2, 2, 2, 1, 1, 4, 2, 0, 0, 0, 0, 0, 4, 2, 0, 2, 2, 1, 0, 2, 1, 3, 3, 3, 2, 2, 3, 3, 3, 3, 2, 3, 3, 3, 2, 2, 0, 2, 1, 1, 0, 3, 3, 4, 3, 0, 0, 0, 2, 0, 0, 0, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0, 3, 1, 4, 3, 3, 3, 2, 2, 0, 3, 1, 0, 0, 0, 0, 0, 0, 3, 4, 2, 0, 4, 4, 4, 0, 0, 0, 1, 3, 2, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 1, 2, 1, 2, 3, 3, 3, 3, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0, 3, 3, 3, 3, 4, 4, 4, 4, 4, 2, 0, 0, 3, 3, 3, 3, 2, 0, 0, 3, 2, 0, 2, 3, 0, 0, 0, 3, 3, 0, 2, 1, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 0, 0, 0, 1, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 2, 0, 3, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 3, 3, 2, 3, 3, 3, 3, 0, 3, 3, 0, 0, 3, 0, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 0, 2, 0, 2, 0, 0, 0, 1, 1, 1, 1, 0, 3, 3, 1, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 3, 1, 0, 1, 3, 0, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 0, 4, 1, 2, 4, 4, 3, 0, 2, 0, 3, 3, 1, 2, 0, 0, 3, 0, 0, 1, 2, 0, 2, 3, 2, 4, 4, 0, 0, 0, 2, 0, 0, 2, 3, 3, 0, 0, 3, 2, 0, 0, 0, 4, 4, 4, 0, 4, 3, 2, 3, 1, 1, 1, 1, 2, 3, 2, 2, 0, 0, 0, 3, 3, 2, 0, 2, 0, 3, 3, 3, 3, 3, 3, 3, 3, 0, 2, 3, 2, 4, 1, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 0, 2, 3, 0, 0, 2, 3, 2, 0, 3, 2, 0, 2, 2, 4, 2, 0, 2, 3, 2, 2, 2, 0, 1, 1, 2, 0, 0, 2, 2, 2, 4, 0, 0, 0, 0, 0, 3, 2, 4, 4, 0, 1, 1, 3, 4, 2, 3, 3, 0, 1, 4, 4, 4, 4, 4, 4, 4, 1, 0, 0, 0, 4, 2, 4, 0, 0, 4, 0, 4, 3, 3, 3, 3, 3, 0, 0, 3, 0, 0, 3, 0, 0, 0, 0, 0, 0, 4, 3, 2, 3, 3, 0, 4, 2, 2, 3, 2, 1, 0, 1, 1, 2, 1, 2, 1, 3, 2, 2, 3, 3, 2, 0, 2, 0, 0, 3, 3, 3, 0, 0, 3, 0, 3, 3, 3, 3, 0, 1, 4, 4, 4, 0, 0, 0, 4, 0, 2, 3, 3, 3, 1, 0, 3, 3, 3, 4, 0, 3, 0, 3, 4, 0, 0, 2, 3, 3, 3, 2, 0, 0, 0, 2, 2, 3, 2, 0, 2, 2, 0, 0, 1, 4, 2, 0, 0, 0, 0, 2, 2, 0, 3, 0, 4, 3, 0, 0, 0, 4, 4, 4, 4, 0, 2, 2, 0, 2, 3, 2, 2, 0, 3, 0, 2, 1, 1, 1, 1, 3, 3, 2, 1, 1, 1, 3, 2, 
1, 1, 1, 0, 3, 4, 2, 2, 1, 2, 2, 2, 0, 0, 3, 3, 3, 1, 3, 3, 3, 3, 3, 4, 0, 3, 2, 3, 2, 2, 3, 3, 3, 3, 3, 2, 1, 2, 2, 0, 1, 1, 1, 2, 3, 3, 2, 0, 0, 2, 4, 3, 2, 2, 0, 2, 1, 1, 2, 0, 3, 2, 2, 2, 0, 3, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 2, 2, 2, 2, 4, 0, 2, 3, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 2, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 1, 2, 2, 4, 4, 2, 4, 0, 2, 2, 1, 2, 3, 1, 0, 3, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 4, 4, 4, 4, 4, 4, 2, 2, 0, 0, 0, 0, 0, 4, 0, 3, 3, 2, 0, 0, 0, 0, 0, 2, 2, 3, 2, 0, 0, 0, 0, 0, 0, 2, 2, 3, 3, 3, 3, 3, 2, 3, 3, 3, 0, 3, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 4, 1, 3, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 4, 0, 0, 0, 3, 4, 3, 3, 2, 4, 0, 0, 2, 0, 2, 3, 2, 4, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 3, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 3, 4, 0, 0, 4, 4, 4, 4, 2, 4, 4, 0, 4, 4, 4, 2, 3, 2, 1, 1, 1, 1, 3, 4, 0, 2, 2, 4, 3, 2, 3, 2, 3, 0, 2, 0, 3, 4, 2, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 0, 3, 2, 3, 3, 0, 2, 2, 2, 2, 0, 0, 2, 2, 0, 2, 0, 2, 1, 1, 1, 4, 1, 1, 1, 1, 4, 3, 2, 2, 2, 0, 3, 3, 1, 0, 3, 3, 2, 2, 2, 2, 0, 2, 2, 3, 0, 3, 4, 3, 2, 0, 0, 0, 3, 0, 2, 2, 0, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 0, 0, 2, 4, 4, 3, 3, 0, 3, 0, 4, 4, 4, 4, 4, 2, 4, 2, 4, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 2, 3, 3, 0, 0, 0, 0, 0, 3, 3, 0, 0, 3, 0, 2, 0, 3, 2, 0, 0, 2, 0, 2, 3, 3, 3, 2, 4, 4, 0, 4, 3, 4, 3, 3, 3, 3, 3, 4, 4, 4, 0, 2, 2, 3, 2, 2, 4, 3, 0, 3, 3, 2, 3, 0, 4, 2, 3, 3, 2, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 0, 2, 4, 0, 0, 1, 2, 4, 3, 2, 2, 4, 3, 4, 0, 0, 0, 3, 2, 1, 2, 3, 0, 3, 2, 2, 2, 2, 3, 3, 3, 2, 0, 2, 0, 2, 0, 4, 0, 0, 0, 0, 0, 3, 2, 0, 3, 2, 4, 2, 3, 3, 2, 0, 2, 4, 2, 0, 3, 0, 4, 3, 3, 0, 0, 0, 3, 0, 0, 0, 2, 2, 4, 4, 0, 0, 2, 4, 3, 2, 0, 1, 3, 2, 3, 2, 3, 0, 0, 3, 0, 3, 3, 4, 3, 4, 4, 0, 3, 2, 0, 0, 0, 0, 0, 2, 
2, 2, 3, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 0, 0, 2, 2, 2, 3, 3, 2, 3, 0, 2, 4, 2, 2, 3, 2, 2, 2, 0, 1, 0, 0, 2, 0, 0, 0, 3, 0, 0, 2, 0, 0, 2, 0, 0, 3, 0, 0, 0, 2, 0, 3, 4, 3, 0, 3, 3, 2, 1, 2, 2, 0, 0, 0, 3, 3, 0, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 2, 3, 2, 3, 2, 0, 0, 1, 1, 1, 2, 0, 0, 0, 0, 3, 1, 2, 0, 4, 4, 3, 4, 1, 0, 4, 4, 3, 4, 3, 1, 1, 3, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 3, 2, 2, 3, 3, 3, 3, 3, 0, 0, 4, 0, 0, 3, 0, 4, 4, 4, 4, 4, 4, 0, 0, 0, 4, 0, 0, 3, 2, 2, 2, 1, 0, 4, 3, 0, 0, 2, 0, 3, 0, 3, 4, 0, 0, 4, 0, 0, 0, 0, 0, 4, 3, 0, 3, 3, 0, 0, 0, 0, 3, 0, 3, 3, 0, 0, 2, 3, 2, 0, 3, 1, 3, 2, 2, 0, 2, 2, 2, 3, 3, 0, 3, 0, 2, 4, 4, 4, 4, 4, 3, 0, 2, 0, 0, 0, 4, 4, 0, 0, 0, 3, 3, 3, 3, 0, 3, 3, 0, 1, 1, 1, 1, 1, 0, 2, 2, 0, 0, 3, 3, 3, 3, 0, 0, 0, 3, 0, 4, 3, 3, 2, 3, 3, 3, 2, 3, 1, 3, 3, 2, 3, 3, 4, 4, 4, 4, 4, 3, 3, 0, 0, 0, 1, 3, 0, 4, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 3, 3, 4, 3, 2, 0, 0, 1, 1, 2, 3, 2, 3, 3, 2, 0, 0, 2, 2, 0, 4, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 1, 0, 2, 2, 3, 0, 0, 0, 0, 3, 1, 0, 0, 3, 3, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 3, 4, 0, 0, 0, 2, 2, 2, 3, 3, 3, 0, 1, 4, 1, 3, 2, 0, 0, 0, 2, 3, 3, 2, 4, 3, 0, 4, 2, 0, 0, 0, 0, 0, 0, 3, 0, 0, 4, 0, 4, 4, 4, 0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 3, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 3, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 3, 0, 2, 2, 3, 3, 0, 0, 4, 4, 3, 2, 2, 2, 0, 0, 0, 3, 2, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 2, 0, 3, 0, 0, 2, 0, 4, 0, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0, 2, 0, 2, 3, 0, 2, 0, 0, 3, 4, 2, 3, 3, 3, 3, 2, 2, 0, 2, 2, 3, 0, 0, 0, 0, 2, 3, 0, 2, 2, 4, 4, 4, 2, 0, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0, 0, 0, 2, 2, 3, 3, 3, 0, 3, 0, 3, 4, 4, 0, 2, 3, 2, 0, 0, 0, 4, 3, 0, 0, 0, 0, 0, 0, 0, 3, 2, 2, 2, 2, 2, 3, 3, 2, 0, 0, 1, 3, 0, 3, 3, 3, 2, 0, 2, 3, 2, 0, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 2, 0, 3, 0, 0, 2, 2, 3, 3, 2, 4, 2, 4, 3, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 0, 0, 0, 
0, 0, 2, 0, 0, 3, 2, 4, 3, 3, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 0, 0, 2, 1, 2, 3, 3, 3, 3, 3, 3, 0, 0, 2, 2, 3, 3, 2, 3, 0, 0, 0, 0, 4, 4, 4, 4, 1, 2, 4, 2, 2, 2, 4, 2, 0, 0, 3, 3, 3, 3, 3, 3, 0, 4, 0, 3, 3, 3, 0, 0, 2, 4, 0, 2, 2, 2, 2, 2, 0, 0, 2, 0, 0, 4, 2, 2, 0, 2, 3, 0, 2, 3, 0, 3, 2, 4, 4, 2, 3, 3, 3, 3, 3, 2, 3, 1, 0, 0, 0, 0, 3, 4, 1, 4, 4, 0, 3, 4, 0, 0, 3, 0, 2, 0, 3, 4, 3, 2, 2, 0, 3, 0, 2, 3, 2, 0, 4, 0, 3, 3, 3, 3, 4, 4, 4, 3, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 2, 2, 0, 4, 4, 3, 3, 3, 3, 0, 0, 0, 4, 4, 2, 2, 2, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 4, 3, 2, 0, 1, 1, 0, 3, 0, 2, 0, 3, 0, 2, 0, 0, 0, 3, 0, 0, 0, 2, 0, 0, 2, 0, 0, 3, 3, 2, 2, 0, 0, 0, 1, 1, 2, 0, 0, 2, 4, 2, 0, 0, 0, 4, 2, 2, 2, 2, 2, 0, 0, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 4, 3, 3, 1, 0, 2, 2, 0, 2, 3, 0, 2, 2, 0, 1, 4, 4, 0, 4, 0, 4, 1, 1, 1, 1, 1, 1, 2, 3, 3, 2, 0, 2, 4, 1, 3, 4, 4, 4, 0, 1, 0, 3, 0, 2, 4, 2, 4, 1, 1, 4, 0, 0, 1, 4, 1, 4, 4, 4, 2, 2, 3, 3, 3, 3, 0, 2, 2, 2, 2, 2, 0, 2, 3, 4, 2, 4, 0, 3, 4, 4, 4, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 0, 3, 3, 4, 3, 2, 3, 0, 3, 3, 0, 0, 2, 2, 2, 3, 4, 4, 4, 4, 4, 4, 4, 3, 3, 1, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 4, 4, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 3, 0, 3, 3, 4, 3, 0, 3, 3, 0, 3, 3, 3, 3, 3, 0, 0, 0, 3, 0, 2, 3, 0, 2, 2, 2, 2, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 3, 2, 3, 4, 3, 0, 3, 0, 0, 0, 4, 4, 4, 0, 0, 0, 0, 1, 3, 2, 2, 1, 2, 4, 4, 0, 3, 2, 2, 2, 0, 3, 0, 3, 3, 0, 2, 3, 2, 0, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 1, 4, 4, 2, 4, 0, 3, 2, 2, 2, 2, 3, 4, 0, 2, 2, 2, 4, 0, 3, 0, 0, 0, 2, 0, 3, 0, 1, 0, 3, 2, 1, 1, 3, 0, 0, 3, 3, 0, 1, 3, 3, 4, 0, 0, 4, 1, 1, 1, 2, 2, 4, 2, 4, 4, 4, 2, 0, 0, 1, 1, 1, 2, 3, 3, 2, 0, 2, 3, 2, 0, 3, 3, 3, 3, 0, 4, 3, 0, 0, 0, 0, 3, 0, 3, 2, 0, 0, 1, 3, 2, 2, 0, 0, 0, 2, 0, 0, 3, 3, 3, 3, 3, 2, 3, 2, 2, 2, 0, 0, 0, 2, 4, 2, 0, 0, 0, 2, 0, 4, 4, 3, 0, 3, 3, 0, 0, 1, 0, 3, 0, 0, 0, 3, 3, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 
0, 0, 0, 0, 4, 2, 0, 2, 0, 4, 4, 0, 0, 0, 0, 0, 2, 2, 4, 2, 0, 0, 0, 0, 0, 0, 4, 3, 4, 0, 0, 0, 3, 0, 0, 3, 3, 0, 0, 0, 0, 1, 2, 3, 3, 3, 0, 0, 0, 0, 0, 4, 0, 2, 4, 2, 2, 2, 2, 2, 2, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 3, 0, 0, 0, 0, 3, 3, 1, 1, 3, 1, 4, 0, 0, 0, 3, 3, 0, 3, 1, 3, 1, 3, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 0, 0, 4, 2, 4, 4, 0, 0, 0, 3, 2, 0, 4, 3, 0, 0, 0, 2, 4, 0, 0, 3, 4, 3, 0, 2, 0, 0, 0, 0, 3, 3, 0, 3, 4, 0, 2, 3, 3, 2, 4, 0, 0, 3, 0, 2, 4, 0, 4, 0, 2, 4, 3, 3, 0, 0, 3, 3, 0, 0, 0, 0, 3, 3, 2, 2, 0, 2, 0, 0, 4, 0, 0, 3, 0, 0, 0, 0, 0, 4, 0, 3, 0, 0, 3, 1, 0, 0, 2, 0, 0, 0, 4, 4, 0, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 3, 2, 3, 3, 3, 3, 3, 3, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 3, 2, 4, 3, 3, 3, 3, 0, 2, 2, 3, 0, 4, 4, 4, 2, 0, 2, 0, 0, 3, 2, 0, 0, 0, 0, 3, 2, 0, 3, 1, 0, 0, 2, 0, 2, 2, 1, 0, 3, 1, 2, 2, 2, 0, 0, 0, 0, 2, 2, 0, 3, 4, 4, 4, 4, 4, 4, 2, 2, 0, 0, 0, 0, 0, 0, 0, 4, 3, 0, 0, 2, 0, 3, 3, 3, 3, 3, 2, 0, 0, 0, 2, 0, 4, 4, 0, 0, 0, 2, 3, 3, 0, 0, 0, 3, 2, 2, 2, 0, 2, 2, 0, 3, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 4, 2, 0, 0, 3, 0, 3, 4, 4, 0, 4, 4, 4, 4, 4, 3, 3, 0, 0, 0, 0, 2, 3, 3, 0, 0, 0, 0, 0, 2, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 2, 2, 2, 0, 0, 0, 4, 4, 3, 0, 0, 0, 3, 0, 3, 0, 2, 0, 3, 0, 0, 0, 2, 4, 4, 0, 3, 0, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 0, 3, 2, 0, 0, 0, 4, 3, 3, 0, 2, 0, 3, 3, 0, 2, 0, 4, 0, 3, 0, 3, 2, 4, 2, 2, 2, 4, 0, 0, 3, 0, 3, 3, 0, 0, 0, 2, 0, 0, 0, 2, 2, 2, 2, 0, 4, 2, 3, 3, 3, 3, 3, 0, 3, 3, 2, 2, 0, 0, 0, 4, 4, 4, 2, 3, 2, 0, 0, 0, 0, 2, 3, 3, 0, 0, 0, 2, 0, 4, 3, 0, 4, 0, 0, 0, 0, 4, 4, 4, 4, 4, 0, 2, 2, 0, 0, 1, 4, 0, 0, 2, 2, 0, 4, 3, 3, 3, 3, 4, 2, 3, 3, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 0, 4, 0, 0, 0, 2, 1, 2, 0, 0, 0, 2, 0, 0, 0, 0, 3, 0, 0, 2, 2, 2, 2, 2, 2, 0, 1, 2, 0, 0, 3, 0, 0, 3, 0, 4, 0, 3, 0, 3, 2, 3, 3, 3, 3, 0, 3, 3, 3, 3, 0, 0, 0, 3, 0, 0, 3, 0, 0, 2, 0, 2, 2, 0, 3, 0, 2, 0, 3, 2, 2, 0, 0, 2, 1, 0, 0, 3, 2, 2, 2, 2, 
2, 2, 2, 2, 0, 1, 0, 0, 2, 2, 2, 0, 2, 2, 3, 2, 0, 3, 0, 0, 0, 2, 0, 2, 0, 0, 2, 3, 2, 3, 2, 3, 3, 2, 2, 0, 3, 2, 3, 4, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 3, 0, 0, 0, 3, 2, 2, 0, 2, 3, 3, 2, 0, 2, 0, 3, 2, 0, 3, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 3, 2, 2, 2, 0, 2, 0, 0, 0, 3, 3, 0, 2, 2, 2, 3, 3, 2, 2, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 2, 0, 3, 3, 3, 0, 2, 3, 0, 0, 0, 0, 2, 2, 0, 0, 2, 2, 2, 0, 2, 0, 4, 0, 0, 2, 0, 0, 2, 3, 2, 2, 3, 0, 2, 2, 0, 2, 2, 0, 0, 0, 2, 3, 3, 2, 0, 2, 2, 2, 3, 3, 3, 3, 0, 0, 4, 4, 0, 0, 2, 2, 2, 2, 2, 3, 0, 2, 2, 0, 0, 0, 3, 0, 0, 2, 0, 3, 2, 2, 0, 2, 0, 0, 2, 0, 0, 3, 0, 2, 3, 3, 0, 2, 2, 2, 0, 3, 2, 0, 2, 2, 2, 0, 0, 0, 0, 3, 3, 0, 3, 0, 2, 0, 3, 3, 3, 0, 3, 2, 0, 3, 0, 0, 0, 0, 3, 3, 3, 3, 3, 2, 3, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 2, 2, 3, 0, 0, 2, 3, 2, 0, 0, 4, 3, 2, 3, 1, 0, 2, 2, 2, 2, 3, 0, 0, 2, 3, 3, 3, 0, 0, 2, 0, 3, 0, 3, 3, 1, 3, 3, 3, 0, 0, 0, 0, 0, 2, 3, 3, 2, 3, 0, 4, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 4, 0, 1, 3, 4, 0]
CPU times: user 18.5 s, sys: 1.89 s, total: 20.4 s
Wall time: 20.3 s
In [60]:
# One row per document holding the cluster id it was assigned to.
labels = pd.DataFrame({'Cluster Labels': clusters})

# Count the documents in each cluster and list the clusters in ascending id order.
per_cluster = labels['Cluster Labels'].value_counts().sort_index()
counts = per_cluster.to_frame(name='Document Count')
display(counts)
Document Count
0
1756
1
415
2
1209
3
1057
4
563
In [61]:
# How many of the highest-weighted terms to list for every cluster.
topNWords = 50

# Build one column per cluster; row r of a column is the term whose index sits
# at position r of that cluster's row in order_centroids.
df = pd.DataFrame()
for i in range(num_clusters):
    df['Cluster %d' % i] = pd.Series(
        [feature_names[ind] for ind in order_centroids[i, :topNWords]]
    )

# Styler methods return a new Styler and leave df untouched, so the styled
# object must be the cell's final expression for the right-alignment to be
# rendered. Previously the Styler was discarded and the plain df was shown.
df.style.set_properties(**{'text-align': 'right'})
Out[61]:
Cluster 0
Cluster 1
Cluster 2
Cluster 3
Cluster 4
0
guys
fuck
sighs
mom
sir
1
music
fucking
chuckles
dad
king
2
laughs
shit
police
guys
father
3
world
guy
phone
baby
men
4
guy
gotta
door
guy
lord
5
whoa
guys
guy
girl
mary
6
shit
money
killed
school
brother
7
huh
wanna
car
cause
majesty
8
sighs
jesus
detective
party
queen
9
hell
fucked
murder
house
mother
10
hello
dad
jane
don want
son
11
money
fuckin
case
sighs
kill
12
car
baby
agent
honey
shall
13
chuckles
ain
kill
mother
captain
14
grunting
phone
killer
family
war
15
joe
alright
dead
hmm
die
16
dad
ray
hell
danny
gods
17
gotta
huh
dad
kids
lady
18
baby
yeah yeah
house
happy
dead
19
team
girl
money
chuckles
child
20
ooh
cause
victim
wedding
death
21
door
music
guys
ooh
wife
22
real
marty
fbi
wow
francis
23
cause
listen
woman
tonight
heart
24
job
sighs
gun
huh
woman
25
sir
mom
ago
money
fight
26
grunts
asshole
went
hello
boy
27
today
job
mom
guess
highness
28
school
fuck fuck
sam
laughs
general
29
cool
ass
father
phone
family
30
bleep
sir
sir
fun
george
31
hmm
car
alex
hey hey
girl
32
house
bullshit
son
job
killed
33
playing
house
saw
mike
army
34
president
bitch
hmm
pretty
world
35
bit
laughs
indistinct
married
husband
36
wanna
real
family
yeah yeah
door
37
hulk
hey hey
henry
care
city
38
girl
door
blood
car
ship
39
family
kids
knew
having
sighs
40
listen
hell
laughs
hell
miss
41
kill
hmm
body
son
dear
42
game
dude
evidence
stuff
daughter
43
phil
brother
mother
today
prince
44
pretty
father
wife
music
speak
45
wow
johnny
job
listen
return
46
woman
kid
took
father
magic
47
guess
motherfucker
girl
cool
john
48
don want
don want
music
real
blood
49
fun
business
emma
friend
power
In [38]:
# Pair each document's cluster label with its title (one row per document).
titlesFrame = pd.DataFrame(
    {'Labels': km.labels_, 'Titles': titles},
    columns=['Labels', 'Titles'],
)

# Order by cluster first, then alphabetically by title within each cluster.
sort = titlesFrame.sort_values(by=['Labels', 'Titles'])

# Show one table per cluster id in 0..num_clusters-1.
for i in range(num_clusters):
    display(sort[sort['Labels'] == i])
Labels
Titles
2584
0
"X Company" Sixes & Sevens (TV Episode 2015)
Labels
Titles
2762
1
"19-2" Babylon (TV Episode 2015)
1633
1
"19-2" Borders (TV Episode 2015)
2553
1
"19-2" Bridges (TV Episode 2015)
870
1
"19-2" Disorder (TV Episode 2015)
2755
1
"19-2" Orphans (TV Episode 2015)
1815
1
"19-2" Property Line (TV Episode 2015)
1676
1
"19-2" Rock Garden (TV Episode 2015)
950
1
"19-2" School (TV Episode 2015)
1814
1
"19-2" Tables (TV Episode 2015)
1381
1
"19-2" Tribes (TV Episode 2015)
2868
1
"Ballers" Ends (TV Episode 2015)
3047
1
"Ballers" Everything Is Everything (TV Episode...
3372
1
"Ballers" Flamingos (TV Episode 2015)
2950
1
"Ballers" Gaslighting (TV Episode 2015)
3431
1
"Ballers" Head-On (TV Episode 2015)
2220
1
"Ballers" Heads Will Roll (TV Episode 2015)
2629
1
"Ballers" Machete Charge (TV Episode 2015)
2129
1
"Ballers" Move the Chains (TV Episode 2015)
413
1
"Ballers" Pilot (TV Episode 2015)
2044
1
"Ballers" Raise Up (TV Episode 2015)
2754
1
"Banana" Episode #1.1 (TV Episode 2015)
3093
1
"Banana" Episode #1.4 (TV Episode 2015)
732
1
"Banshee" A Fixer of Sorts (TV Episode 2015)
936
1
"Banshee" All the Wisdom I Got Left (TV Episod...
1346
1
"Banshee" Even God Doesn't Know What to Make o...
780
1
"Banshee" Real Life Is the Nightmare (TV Episo...
733
1
"Banshee" Tribal (TV Episode 2015)
1347
1
"Banshee" We All Pay Eventually (TV Episode 2015)
935
1
"Banshee" You Can't Hide from the Dead (TV Epi...
125
1
"Black Sails" IX. (TV Episode 2015)
...
...
...
57
1
Misery Loves Comedy (2015)
72
1
Momentum (2015)
1466
1
Muck (2015)
55
1
Nasty Baby (2015)
58
1
No Way Jose (2015)
267
1
Pod (2015)
716
1
Results (2015)
50
1
Road Hard (2015)
215
1
SWAT: Unit 887 (2015)
41
1
See You in Valhalla (2015)
54
1
Skin Traffik (2015)
465
1
Some Kind of Hate (2015)
40
1
Spy (2015)
60
1
Staten Island Summer (2015)
107
1
Stung (2015)
1173
1
Suburra (2015)
1686
1
Tales of Halloween (2015)
717
1
Tangerine (2015)
668
1
The Escort (2016)
124
1
The Night Crew (2015)
1900
1
The Stanford Prison Experiment (2015)
1996
1
The Subjects (2015)
4986
1
The Wedding Ringer (2015)
10
1
Tiger House (2015)
35
1
VANish (2015)
190
1
Vice (2015)
652
1
We Are Your Friends (2015)
2005
1
Wrecker (2015)
132
1
Zipper (2015)
487
1
Zombieworld (2015)
325 rows × 2 columns
Labels
Titles
4683
2
"Rosewood" Fireflies and Fidelity (TV Episode ...
Labels
Titles
3553
3
"Elementary" A Stitch in Time (TV Episode 2015)
1967
3
"Justified" Alive Day (TV Episode 2015)
2057
3
"Justified" Burned (TV Episode 2015)
1654
3
"Justified" Cash Game (TV Episode 2015)
2061
3
"Justified" Collateral (TV Episode 2015)
2058
3
"Justified" Dark as a Dungeon (TV Episode 2015)
388
3
"Justified" Fate's Right Hand (TV Episode 2015)
2060
3
"Justified" Fugitive Number One (TV Episode 2015)
1655
3
"Justified" Noblesse Oblige (TV Episode 2015)
1656
3
"Justified" Sounding (TV Episode 2015)
2056
3
"Justified" The Hunt (TV Episode 2015)
1964
3
"Justified" The Promise (TV Episode 2015)
1683
3
"Justified" The Trash and the Snake (TV Episod...
2059
3
"Justified" Trust (TV Episode 2015)
4911
3
"Last Man Standing" Educating Boyd (TV Episode...
384
3
"Poldark" Episode #1.1 (TV Episode 2015)
998
3
"Poldark" Episode #1.2 (TV Episode 2015)
999
3
"Poldark" Episode #1.3 (TV Episode 2015)
1000
3
"Poldark" Episode #1.4 (TV Episode 2015)
1001
3
"Poldark" Episode #1.5 (TV Episode 2015)
1002
3
"Poldark" Episode #1.6 (TV Episode 2015)
1003
3
"Poldark" Episode #1.7 (TV Episode 2015)
1004
3
"Poldark" Episode #1.8 (TV Episode 2015)
Labels
Titles
1106
4
"Glee" Child Star (TV Episode 2015)
Labels
Titles
2483
5
"Bad Judge" Naked and Afraid (TV Episode 2015)
Labels
Titles
4050
6
"American Dad!" Seizures Suit Stanny (TV Episo...
Labels
Titles
2839
7
"Salem" Blood Kiss (TV Episode 2015)
3421
7
"Salem" Book of Shadows (TV Episode 2015)
3995
7
"Salem" Dead Birds (TV Episode 2015)
3559
7
"Salem" Ill Met by Moonlight (TV Episode 2015)
4119
7
"Salem" Midnight Never Come (TV Episode 2015)
4040
7
"Salem" On Earth as in Hell (TV Episode 2015)
3699
7
"Salem" The Beckoning Fair One (TV Episode 2015)
3422
7
"Salem" The Wine Dark Sea (TV Episode 2015)
4293
7
"Salem" The Witching Hour (TV Episode 2015)
3996
7
"Salem" Til Death Do Us Part (TV Episode 2015)
3889
7
"Salem" Wages of Sin (TV Episode 2015)
Labels
Titles
606
8
4375
8
4376
8
4377
8
4378
8
4379
8
4380
8
4381
8
4382
8
4383
8
1808
8
"12 Monkeys" Arms of Mine (TV Episode 2015)
1540
8
"12 Monkeys" Atari (TV Episode 2015)
1624
8
"12 Monkeys" Divine Move (TV Episode 2015)
1807
8
"12 Monkeys" Paradox (TV Episode 2015)
1806
8
"12 Monkeys" Shonin (TV Episode 2015)
1543
8
"12 Monkeys" The Keys (TV Episode 2015)
1541
8
"12 Monkeys" The Night Room (TV Episode 2015)
1513
8
"12 Monkeys" Tomorrow (TV Episode 2015)
1516
8
"12 Monkeys" Yesterday (TV Episode 2015)
2849
8
"2 Broke Girls" And the Crime Ring (TV Episode...
3363
8
"2 Broke Girls" And the Cupcake Captives (TV E...
3909
8
"2 Broke Girls" And the Disappointing Unit (TV...
2663
8
"2 Broke Girls" And the Fat Cat (TV Episode 2015)
1569
8
"2 Broke Girls" And the Fun Factory (TV Episod...
3908
8
"2 Broke Girls" And the Grate Expectations (TV...
3064
8
"2 Broke Girls" And the Great Unwashed (TV Epi...
2761
8
"2 Broke Girls" And the High Hook-Up (TV Episo...
2951
8
"2 Broke Girls" And the Knock-Off Knockout (TV...
3392
8
"2 Broke Girls" And the Minor Problem (TV Epis...
2726
8
"2 Broke Girls" And the Move-In Meltdown (TV E...
...
...
...
414
8
Uttama Villain (2015)
4988
8
Vanquisher (2016)
368
8
Vares - sheriffi (2015)
566
8
Vendetta (2015)
1299
8
Ventoux (2015)
451
8
Viikossa aikuiseksi (2015)
3914
8
Vikings: Athelstan's Journal (TV Series 2015– )
1821
8
Wasurenai to chikatta boku ga ita (2015)
222
8
We Are Still Here (2015)
66
8
Welcome Back (2015)
963
8
Winnetous Sohn (2015)
1715
8
World of Tomorrow (2015)
4503
8
Wu xin fa shi (TV Series 2015– )
3963
8
Xi you ji zhi da sheng gui lai (2015)
396
8
Yakuza Apocalypse (2015)
2517
8
Yatchan (2015)
2175
8
Yennai Arindhaal (2015)
4638
8
Yeonaeui mat (2015)
4137
8
Yeonpyeong haejeon (2015)
3447
8
Yevade Subramanyam (2015)
4014
8
Yi wan nian yi hou (2015)
4746
8
Yong Pal (TV Series 2015– )
3579
8
You Too Brutus (2015)
510
8
Zero Tolerance (2015)
264
8
Zhan lang (2015)
307
8
Zhong Kui fu mo: Xue yao mo ling (2015)
617
8
Zhuo yao ji (2015)
385
8
Ziarno prawdy (2015)
4158
8
Zombie Shark (TV Movie 2015)
4991
8
Zoolander 2 (2016)
2770 rows × 2 columns
Labels
Titles
3646
9
"American Odyssey" Oscar Mike (TV Episode 2015)
Labels
Titles
3069
10
"Mr. Robinson" School's Out for Summer (TV Epi...
Labels
Titles
4201
11
"Nashville" 'Til the Pain Outwears the Shame (...
719
11
"Nashville" Before You Go Make Sure You Know (...
4199
11
"Nashville" Can't Get Used to Losing You (TV E...
4063
11
"Nashville" Can't Let Go (TV Episode 2015)
4197
11
"Nashville" How Can I Help You Say Goodbye (TV...
912
11
"Nashville" I Can't Keep Away from You (TV Epi...
911
11
"Nashville" I'm Lost Between Right or Wrong (T...
908
11
"Nashville" I'm Not That Good at Goodbye (TV E...
909
11
"Nashville" I've Got Reasons to Hate You (TV E...
918
11
"Nashville" Is the Better Part Over (TV Episod...
915
11
"Nashville" Nobody Knows But Me (TV Episode 2015)
4200
11
"Nashville" Please Help Me, I'm Fallin' (TV Ep...
910
11
"Nashville" Somebody Pick Up My Pieces (TV Epi...
4196
11
"Nashville" Stop the World (And Let Me Off) (T...
913
11
"Nashville" That's the Way Love Goes (TV Episo...
4198
11
"Nashville" The Slender Threads That Bind Us H...
916
11
"Nashville" The Storm Has Just Begun (TV Episo...
914
11
"Nashville" This Just Ain't a Good Day for Lea...
917
11
"Nashville" Time Changes Things (TV Episode 2015)
4104
11
"The Voice" Live Semi-Final Performances (TV E...
4111
11
"The Voice" Live Semi-Final Results (TV Episod...
3947
11
"The Voice" Live Semifinal Performances (TV Ep...
3921
11
"The Voice" Live Top 10 Performances (TV Episo...
3864
11
"The Voice" Live Top 12 Elimination (TV Episod...
3858
11
"The Voice" Live Top 12 Performances (TV Episo...
3946
11
"The Voice" Live Top 8 Eliminations (TV Episod...
4922
11
"The Voice" The Battles Premiere, Part 2 (TV E...
1714
11
"The Voice" The Blind Auditions Premier (TV Ep...
4340
11
"The Voice" The Blind Auditions Premiere (TV E...
4963
11
"The Voice" The Knockouts Premiere (TV Episode...
3717
11
"The Voice" The Knockouts, Part 3 (TV Episode ...
4976
11
"The Voice" The Knockouts, Part 3 (TV Episode ...
3716
11
"The Voice" The Road to the Live Show (TV Epis...
Labels
Titles
3795
12
"Bluestone 42" Episode #3.6 (TV Episode 2015)
Labels
Titles
3875
13
"Saving Hope" Start Me Up (TV Episode 2015)
Labels
Titles
2121
14
"The Strain" Dead End (TV Episode 2015)
Labels
Titles
626
15
"Parks and Recreation" Ron & Jammy (TV Episode...
Labels
Titles
4424
16
"Rick and Morty" Big Trouble in Little Sanchez...
4427
16
"Rick and Morty" The Wedding Squanchers (TV Ep...
4421
16
"Rick and Morty" Total Rickall (TV Episode 2015)
Labels
Titles
2066
17
"Star Wars Rebels" Fire Across the Galaxy (TV ...
2065
17
"Star Wars Rebels" Rebel Resolve (TV Episode 2...
2069
17
"Star Wars Rebels" The Lost Commanders (TV Epi...
Labels
Titles
160
18
Lila & Eve (2015)
Labels
Titles
1539
19
"12 Monkeys" Cassandra Complex (TV Episode 2015)
1537
19
"12 Monkeys" Mentally Divergent (TV Episode 2015)
1538
19
"12 Monkeys" Pilot (TV Episode 2015)
1542
19
"12 Monkeys" The Red Forest (TV Episode 2015)
3391
19
"2 Broke Girls" And the Look of the Irish (TV ...
1908
19
"A to Z" J Is for Jan Vaughan (TV Episode 2015)
1583
19
"A to Z" K Is for Keep Out (TV Episode 2015)
1708
19
"A to Z" L Is for Likability (TV Episode 2015)
1709
19
"A to Z" M Is for Meant to Be (TV Episode 2015)
1570
19
"About a Boy" About a Boy Becoming a Man (TV E...
2216
19
"About a Boy" About a Boyfriend (TV Episode 2015)
2366
19
"About a Boy" About a Cat Party (TV Episode 2015)
2541
19
"About a Boy" About a Love in the Air (TV Epis...
3417
19
"About a Boy" About a Memory Hole (TV Episode ...
2399
19
"About a Boy" About a Prostitute (TV Episode 2...
2639
19
"About a Boy" About a Self Defense (TV Episode...
2367
19
"About a Boy" About a Trunk (TV Episode 2015)
1475
19
"Agent Carter" Bridge and Tunnel (TV Episode 2...
1060
19
"Agent Carter" Now Is Not the End (TV Episode ...
1480
19
"Agent Carter" SNAFU (TV Episode 2015)
1477
19
"Agent Carter" The Blitzkrieg Button (TV Episo...
1476
19
"Agent Carter" Time and Tide (TV Episode 2015)
1481
19
"Agent Carter" Valediction (TV Episode 2015)
2269
19
"Agent X" Pilot (TV Episode 2015)
3587
19
"Agent X" The Enemy of My Enemy (TV Episode 2015)
4216
19
"Agents of S.H.I.E.L.D." A Wanted (Inhu)Man (T...
1456
19
"Agents of S.H.I.E.L.D." Afterlife (TV Episode...
1451
19
"Agents of S.H.I.E.L.D." Aftershocks (TV Episo...
4253
19
"Agents of S.H.I.E.L.D." Among Us Hide... (TV ...
4261
19
"Agents of S.H.I.E.L.D." Chaos Theory (TV Epis...
...
...
...
73
19
The Boy Next Door (2015)
138
19
The Burning Dead (2015)
829
19
The Cokeville Miracle (2015)
1648
19
The Condemned 2 (Video 2015)
261
19
The D Train (2015)
316
19
The Diabolical (2015)
155
19
The End of the Tour (2015)
677
19
The Exorcism of Molly Hartley (2015)
1752
19
The Gift (2015)
12
19
The Lazarus Effect (2015)
233
19
The Marine 4: Moving Target (Video 2015)
2395
19
The Murder Pact (2015)
512
19
The Ouija Experiment 2: Theatre of Death (2015)
2528
19
The Outfield (2015)
741
19
The Overnight (2015)
92
19
The Squeeze (2015)
205
19
The Walking Deceased (2015)
381
19
The Wrong Girl (TV Movie 2015)
1087
19
Tig (2015)
4299
19
Torno indietro e cambio vita (2015)
2142
19
Toute première fois (2015)
333
19
Under the Skin of Design (2015)
63
19
Underdog Kids (2015)
1617
19
Unexpected (2015)
22
19
Visions (2015)
2336
19
What Happened, Miss Simone? (2015)
3056
19
Wild Horses (2015)
940
19
With This Ring (TV Movie 2015)
5
19
Zombie Killers: Elephant's Graveyard (2015)
20
19
i-Lived (2015)
1820 rows × 2 columns
In [ ]:
In [ ]:
In [ ]:
In [ ]:
Content source: david-hagar/NLP-Analytics
Similar notebooks: