In [3]:
%matplotlib inline
import random
import h5py
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
In [5]:
# Load the trained embedding matrix. The trailing [:] copies the whole
# "weights" dataset into an in-memory NumPy array, so the file can be closed
# as soon as the read finishes instead of keeping an open handle for the
# lifetime of the kernel.
with h5py.File("../models/childbook/embeddings.h5", 'r') as h5embeddings:
    embs = h5embeddings['weights'][:]
In [6]:
# Spot-check one row of the loaded embedding matrix (row index 1).
embs[1]
Out[6]:
array([ 0.01146187, 0.04382391, -0.0312936 , 0.00480321, -0.02746573,
0.01411096, -0.02293389, -0.0432489 , -0.02555708, -0.01566228,
0.01497123, -0.04904546, 0.00931979, -0.03597444, -0.04937387,
0.00143139, -0.04722073, -0.03676476, -0.04325166, 0.03005171,
-0.0298416 , 0.01173527, 0.03723064, 0.02457278, -0.04297737,
0.00484916, 0.02017322, 0.02247718, 0.0062704 , -0.00845485,
0.04976286, 0.03567825, 0.02735204, 0.02333726, 0.01753517,
-0.03979453, -0.01860597, 0.00356243, 0.04439086, 0.0170996 ,
0.00968324, -0.00321301, -0.03126408, -0.03470172, -0.02821344,
0.04459572, 0.00312378, -0.04241243, -0.00568274, 0.04713464], dtype=float32)
In [7]:
# Path to the vocabulary file; it is parsed further down as one
# whitespace-separated "<word> <id>" pair per line.
word_dict = "../models/childbook/words.dict"
In [10]:
# Build the two vocabulary lookups from the dictionary file. Each non-blank
# line must hold exactly two whitespace-separated fields: a word followed by
# its integer id. The id is shifted down by one before being stored.
# NOTE(review): the -1 presumably converts 1-based ids into 0-based row
# indices of `embs` -- confirm against whatever wrote words.dict.
id2word = {}
word2id = {}
with open(word_dict, "r") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on the two-field unpack below.
            continue
        word, raw_id = line.split()
        idx = int(raw_id) - 1
        id2word[idx] = word
        word2id[word] = idx
In [13]:
# Cluster the embedding rows into 20 groups. random_state is pinned because
# k-means starts from randomly chosen centroids: without a fixed seed every
# kernel restart can yield a different partition and different cluster
# numbers, so saved assignments and per-cluster listings are not reproducible.
means = KMeans(n_clusters=20, random_state=0)
In [24]:
# Fit k-means on the embedding rows and keep the cluster label of each row.
assignments = means.fit_predict(embs)
In [22]:
# Persist the cluster label of every embedding row as dataset "cluster".
# Mode 'w' creates the file, replacing any existing kmeans.h5. The context
# manager closes the file even if the write raises, which the previous
# manual close() did not guarantee.
# NOTE(review): this cell has to run after the cell that computes
# `assignments`; the recorded execution counters show it was last run before
# fit_predict, so the file on disk may hold labels from an earlier fit.
with h5py.File("../models/childbook/kmeans.h5", 'w') as h5kmeans:
    h5kmeans["cluster"] = assignments
In [25]:
# One empty word list per cluster id (0..19); the count has to match the
# n_clusters value given to KMeans. range() replaces the Python-2-only
# xrange() so the cell runs on both Python 2 and Python 3.
word_clusters = {cluster_id: [] for cluster_id in range(20)}
In [31]:
# Group the vocabulary by cluster: row i of the clustered matrix received
# label assignments[i], and id2word maps that row index to its word.
# In the recorded run there is one more label than vocabulary entries
# (21689 vs 21688), so at least one row has no word. Such rows are skipped
# explicitly and remembered in `unmapped_rows`; this replaces a bare
# `except: pass`, which also hid any unrelated error raised in the loop.
# Re-running this cell without first resetting word_clusters appends
# every word a second time.
unmapped_rows = []
for row, cluster_id in enumerate(assignments):
    word = id2word.get(row)
    if word is None:
        unmapped_rows.append(row)
        continue
    word_clusters[cluster_id].append(word)
In [32]:
# Number of cluster labels (one per row passed to fit_predict).
len(assignments)
Out[32]:
21689
In [33]:
# Number of distinct ids read from the vocabulary file.
len(id2word)
Out[33]:
21688
In [34]:
# Number of rows in the embedding matrix.
len(embs)
Out[34]:
21689
In [41]:
# List the words that were assigned to cluster 6.
word_clusters[6]
Out[41]:
['one',
'spite',
'History',
'full',
'portraits',
'out',
'ancestress',
'approve',
'kind',
'drop',
'none',
'dens',
'ogres',
'end',
'parcel',
'pair',
'pores',
'remarks',
'rudeness',
'force',
'sums',
'Chancellor',
'part',
'accent',
'German',
'bit',
'pound',
'broiled',
'rid',
'whichever',
'tackle',
'glory',
'fun',
'aware',
'tops',
'built',
'lout',
'lot',
'hopes',
'insisted',
'torrents',
'presence',
'pursuit',
'arguments',
'scenes',
'dignity',
'ashamed',
'cleverness',
'deal',
'annoyance',
'those',
'allowance',
'conduct',
'heap',
'sorts',
'collection',
'direction',
'midst',
'plenty',
'uniform',
'lap',
'crowd',
'cries',
'examination',
'ca',
'suit',
'pairs',
'names',
'fond',
'Duke',
'point',
'beloved',
'spyglass',
'wo',
'fountains',
'information',
'slice',
'sides',
'shower',
'mass',
'bodies',
'east',
'edge',
'movement',
'folds',
'front',
'sight',
'vapour',
'roars',
'spine',
'scourges',
'weight',
'pint',
'remains',
'miss',
'pretence',
'hath',
'guilty',
'hearts',
'Castle',
'Furiosus',
'strain',
'Government',
'remind',
'libbuty',
'nature',
'number',
'aid',
'possession',
'murmur',
'premises',
'Caitiff',
'space',
'choice',
'state',
'prerogative',
'clap',
'Which',
'favour',
'shouts',
'success',
'contempt',
'maids',
'border',
'million',
'members',
'discourtesy',
'Rigour',
'positions',
'flash',
'unpleasantness',
'virtue',
'mouthful',
'patches',
'rays',
'heaps',
'vessels',
'drops',
'Fountain',
'store',
'need',
'share',
'troop',
'notion',
'Last',
'threat',
'affairs',
'honeymoon',
'tone',
'aloud',
'water-vessels',
'Water',
'Troubles',
'behaviour',
'Cap',
'hold',
'brace',
'object',
'Shoes',
'Sword',
'three-quarters',
'taste',
'tire',
'sounds',
'middle',
'frame',
'signs',
'sheen',
'howl',
'drinks',
'eclipse',
'science',
'sort',
'sentiments',
'laws',
'Adventure',
'rolls',
'cups',
'measure',
'refrain',
'deprive',
'worthy',
'owner',
'trace',
'compassion',
'astonishment',
'absence',
'act',
'value',
'touch',
'powers',
'judgment',
'Purse',
'cost',
'stroke',
'gnat',
'instruments',
'consisted',
'Daughter',
'audience',
'attitude',
'affirming',
'welfare',
'interests',
'toils',
'Path',
'Arms',
'ignorant',
'smells',
'herd',
'fame',
'blud',
'whack',
'descendant',
'caus',
'principle',
'characters',
'grounds',
'locks',
'hundreds',
'Gentlemen',
'standard',
'amusements',
'reign',
'Enough',
'possess',
'brutes',
'thousands',
'Think',
'Hang',
'prime',
'exercises',
'blighting',
'rubies',
'parts',
'Bottle',
'generosity',
'shape',
'praise',
'mark',
'advantage',
'deprived',
'language',
'complexions',
'sense',
'intention',
'motive',
'precaution',
'approved',
'sum',
'purchase',
'description',
'rows',
'appearance',
'lighting',
'praises',
'mention',
'effects',
'mince-collops',
'thoughts',
'lines',
'marks',
'bits',
'dint',
'None',
'corners',
'fragments',
'feat',
'form',
'ranges',
'whirlpool',
'habits',
'centre',
'port',
'stupidity',
'quantities',
'ingenuity',
'salaams',
'multitude',
'flight',
'coasts',
'harvest',
'Wand',
'summit',
'Mountains',
'blackness',
'showers',
'kinds',
'masses',
'lump',
'crest',
'towers',
'assistance',
'faces',
'patterns',
'groves',
'helmets',
'Cord',
'Incas',
'prophesied',
'crater',
'verge',
'thrill',
'uttered',
'foundations',
'Temple',
'delight',
'region',
'religion',
'merits',
'points',
'Cathedral',
'note',
'color',
'loads',
'Alas',
'consist',
'employment',
'couple',
'outskirts',
'genii',
'quarter',
'fiercest',
'1',
'Traditions',
'stories',
'fits',
'lack',
'talks',
'faults',
'virtues',
'defects',
'vex',
'splinters',
'west',
'loan',
'charity',
'stays',
'instead',
'howls',
'barrels',
'numbers',
'hairs',
'occurrence',
'hotly',
'approach',
'arches',
'beings',
'beware',
'train',
'bales',
'devices',
'memory',
'search',
'garlands',
'fate',
'palm',
'nosegays',
'ceremonies',
'decree',
'palms',
'bethought',
'prediction',
'stride',
'sample',
'avenue',
'ranks',
'discourse',
'government',
'sup',
'despaired',
'smell',
'sweetness',
'qualities',
'laces',
'beauties',
'prayer',
'Slave',
'bestow',
'basins',
'brimful',
'Genie',
'banks',
'robbed',
'head-dress',
'wines',
'shriek',
'required',
'grasp',
'dollars',
'sha',
'scruff',
'swarms',
'rabble',
'roll',
'chests',
'list',
'troubles',
'expense',
'composed',
'arrival',
'remembrance',
'quantity',
'pawing',
'admirer',
'parrots',
'repent',
'splendor',
'hired',
'tastes',
'twinkling',
'remainder',
'flock',
'piece',
'handful',
'bushels',
'one-half',
'Out',
'behalf',
'Hall',
'tubs',
'grinding',
'freights',
'portion',
'strings',
'abundance',
'deceits',
'Proud',
'suits',
'estates',
'smallest',
'buildings',
'conferred',
'canopy',
'shade',
'swarm',
'forms',
'ages',
'voices',
'histories',
'fricassee',
'costume',
'goblet',
'draught',
'contents',
'display',
'nutshell',
'woven',
'bedside',
'plumes',
'brightness',
'Weary',
'regiment',
'tongues',
'reared',
'bondage',
'delivered',
'greetings',
'moaning',
'Full',
'wagon-loads',
'necklaces',
'borders',
'Sister',
'Island',
'rattle',
'harbor',
'games',
'secrets',
'glimmer',
'rhymes',
'strokes',
'Instead',
'bushel',
'savored',
'crossness',
'partake',
'ill-usage',
'earnestness',
'excused',
'depression',
'Company',
'Academy',
'conquest',
'Land',
'collars',
'pleasantness',
'complains',
'wrists',
'Treasures',
'shadows',
'exclamations',
'syllable',
'jobs',
'cases',
'circumstance',
'borrowed',
'backs',
'partly',
'pretense',
'plot',
'grandchildren',
'worry',
'support',
'bunch',
'chips',
'boulders',
'ingratitude',
'glitter',
'gust',
'puff',
'deeds',
'accused',
'wickedness',
'counsels',
'deliverance',
'faithfulness',
'Numbers',
'blades',
'quit',
'tameness',
'ranged',
'beholds',
'bowls',
'tons',
'forepart',
'Number',
'flights',
'ounces',
'lightness',
'squeezed',
'5th',
'tidings',
'hogsheads',
'smarts',
'breadth',
'awe',
'performance',
'admiral',
'translation',
'1,724',
'twelfth',
'Empire',
'prow',
'knot',
'hatred',
'suspicion',
'amount',
'gloves',
'13th',
'eve',
'one-third',
'two-thirds',
'ecstacy',
'lustre',
'possessor',
'thickness',
'experiments',
'philosopher',
'recovery',
'acknowledgment',
'decision',
'concourse',
'richness',
'sconces',
'choruses',
'acclamations',
'complaining',
'Son',
'acquit',
'capable',
'snares',
'sheets',
'tires',
'clews',
'clew',
'demonstrations',
'execution',
'echo',
'Sultaness',
'den',
'billet',
'log',
'Cout',
'telled',
'leavings',
'aught',
'Ane',
'ane',
'com',
'stirrup',
'suite',
'consequential',
'bitterness',
'Holder',
'jug-jug',
'platform',
'bestowed',
'Waq',
'8',
'dependency',
'slab',
'Causer',
'Beware',
'fashioned',
'vagabond',
'society',
'threads',
'Names',
'gait',
'Scorpion',
'saddle-bags',
'Place',
'watches',
'grip',
'stages',
'gore',
'deliverer',
'portions',
'anguish',
'bereft',
'control',
'notables',
'solving',
'collyrium',
'ahead',
'herds',
'scrap',
'Bureau',
'Bird',
'supply',
'clump',
'grove',
'glimpse',
'wasting',
'weave',
'Journal',
'scuffling',
'grains',
'chances',
'Story',
'wreaths',
'attacks',
'attraction',
'traces',
'tips',
'depths',
'notes',
'streak',
'pangs',
'columns',
'Tales',
'token',
'scores',
'thong',
'thickets',
'claps',
'sewn',
'pailful',
'sackful',
'Husband',
'gnawing',
'group',
'tuft',
'coverings',
'sets',
'bays',
'ledge',
'bundles',
'sleeve',
'ray',
'breakfasted',
'messages',
'strip',
'whisk',
'court-yard',
'spadeful',
'rob',
'sill',
'recollection',
'energy',
'fates',
'tinkling',
'piles',
'paint',
'bands',
'team',
'doubts',
'stores',
'gleam',
'lasts',
'skein',
'tortures',
'intrusion',
'screen',
'rim',
'disappearance',
'outburst',
'tangle',
'Loveliest',
'trellis',
'deeps',
'boundary',
'worried',
'flickering',
'Officers',
'packet',
'breed',
'expenses',
'swing',
'blaring',
'foolishness',
'ledges',
'peered',
'myriads',
'Tale',
'Knights',
'flap',
'grating',
'troops',
'Fairest',
'Language',
'copy',
'sleeves',
'Heart',
'Hundreds',
'trails',
'baying',
'campful',
'murmurs',
'forgetfulness',
'shrinking',
'rumour',
'nickname',
'glow',
'woe',
'bemoan',
'hoard',
'fancies',
'Horn',
'Plenty',
'layer',
'profits',
'tunic',
'spadefuls',
'Kingdom',
'complaints',
'blinds',
'streaks',
'shores',
'boasting',
'falsehood',
'whips',
'reports',
'specimen',
'management',
'Wonder',
'halves',
'multitudes',
'World',
'cuff',
'bucketfuls',
'joints',
'stopper',
'movements',
'range',
'cupful',
'twang',
'agony',
'savings',
'Pacha',
'Sheik',
'tones',
'masterpiece',
'pupil',
'proofs',
'descendants',
'fonder',
'observant',
'large-minded',
'braying',
'misdeed',
'suddenness',
'hem',
'whereabouts',
'disorder',
'dyke',
'Rath',
'Fictions',
'flower-beds',
'tufts',
'sprig',
'sparkles',
'nought',
'armful',
'reels',
'knots',
'sheaf',
'eyelids',
'bragged',
'babes',
'jet',
'accommodate',
'inmates',
'slowness',
'Foremost',
'regiments',
'Crabs',
'nimbleness',
'frisk',
'thundering',
'Bones',
'Salmon',
'dominion',
'flip',
'carcase',
'pinafores',
'bracelets',
'Stones',
'Rear-Guard',
'horrors',
'snatches',
'occupants',
'handfuls',
'skeletons',
'incapable',
'Greatest',
'bowels',
'corridor',
'News',
'splinter',
'flagon',
'hilts',
'quest',
'peal',
'images',
'jests',
'Aziliez',
'hardness',
'lots',
'isle',
'car',
'Escape',
'tumult',
'packs',
'plateful',
'Brownie',
'Winning',
'war-horn',
'loops',
'burnisher',
'Abbey',
'Tradition',
'clods',
'clod',
'war-canoes',
'archives',
'Angel',
'boundaries',
'links',
'doer',
'scraps',
'cracks',
'fluff',
'steaks',
'web',
'regardless',
'resolves',
'prices',
'rear',
'rood',
'rattling',
'dose',
'Adventures',
'founder',
'Rover',
'posy',
'detail',
'etiquette',
'fatigues',
'terrors',
'shoal',
'Wedges',
'Arch',
'chaplet',
'inhabitant',
'stings',
'sprigs',
'sensation',
'regent',
'flagging',
'Gorla',
'Ardan',
'bearer',
'Dog',
'three-thirds',
'antlers',
'gills',
'teller',
'examples',
'screams',
'paddles',
'series',
'feats',
'dozens',
'disposed',
'ruffian',
'counterpane',
'fathoms',
'Slaying',
'affectation',
'assortment',
'hymn',
'nick',
'scents',
'bud',
'Dolls',
'soberly',
'babel',
'freshness',
'keg',
'bakings',
'kegs',
'raged',
'overseer',
'shipload',
'Sprig',
'trimmings',
'pane',
's',
'bristles',
'bristle',
'protectress',
'territory',
'mercies',
'pint-bottle',
'crusts',
'wells',
'margin',
'growls',
'consisting',
'turrets',
'peals',
'trains',
'mince-meat',
'pies',
'hive',
'jurisdiction',
'process',
'desirous',
'diversions',
'instruction',
'Steed',
'afflictions',
'eldest-looking',
'samples',
'pocketful',
'curiosities',
'stealers',
'swamps',
'avalanches',
'be-thought',
'scourge',
'pease-soup',
'potful',
'subsistence',
'reverse',
'provision',
'plait',
'salutations',
'mindful',
'firkin',
'cathedral',
'millions',
'retreats',
'Transylvanians',
'offences',
'clusters',
'carcasses',
'Prior',
'prongs',
'Fearing',
'chimes',
'consciousness',
'fore-part',
'cellarful',
'bi',
'maker',
'Fearless',
'Helm',
'whip-hand',
'bliss',
'calves',
'fore-leg',
'yolk',
'scratcher',
'tankful',
'glimpses',
'Small',
'Goddess',
...]
In [ ]:
Content source: HendrikStrobelt/LSTMVis
Similar notebooks: