In [3]:
%matplotlib inline
import random
import h5py
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

In [5]:
# Load the embedding matrix into memory. The `with` block closes the HDF5
# file as soon as the data has been copied out ([:] reads the whole dataset),
# so no open handle is left dangling in the kernel.
with h5py.File("../models/childbook/embeddings.h5", 'r') as h5embeddings:
    embs = h5embeddings['weights'][:]

In [6]:
# Spot-check a single embedding row to confirm the weights loaded as expected.
embs[1]


Out[6]:
array([ 0.01146187,  0.04382391, -0.0312936 ,  0.00480321, -0.02746573,
        0.01411096, -0.02293389, -0.0432489 , -0.02555708, -0.01566228,
        0.01497123, -0.04904546,  0.00931979, -0.03597444, -0.04937387,
        0.00143139, -0.04722073, -0.03676476, -0.04325166,  0.03005171,
       -0.0298416 ,  0.01173527,  0.03723064,  0.02457278, -0.04297737,
        0.00484916,  0.02017322,  0.02247718,  0.0062704 , -0.00845485,
        0.04976286,  0.03567825,  0.02735204,  0.02333726,  0.01753517,
       -0.03979453, -0.01860597,  0.00356243,  0.04439086,  0.0170996 ,
        0.00968324, -0.00321301, -0.03126408, -0.03470172, -0.02821344,
        0.04459572,  0.00312378, -0.04241243, -0.00568274,  0.04713464], dtype=float32)

In [7]:
# Path to the vocabulary file; each line is later split into a word and an integer id.
word_dict = "../models/childbook/words.dict"

In [10]:
# Build the two lookup tables between words and embedding row indices.
# Each non-empty line of the dictionary file holds "<word> <id>"; the id is
# shifted down by one to get the row index (presumably the file uses 1-based
# ids -- confirm against whatever wrote words.dict).
id2word = {}
word2id = {}
with open(word_dict, "r") as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Blank line (e.g. a trailing newline at end of file): nothing to parse.
            continue
        # Still raises ValueError on a malformed line, which is intentional:
        # a silently mis-parsed vocabulary would corrupt every later lookup.
        word, raw_id = fields
        row = int(raw_id) - 1
        id2word[row] = word
        word2id[word] = row

In [13]:
# Cluster the embeddings into 20 groups. random_state is pinned so that a
# fresh "Restart & Run All" yields the same clusters; k-means initialisation
# is otherwise random, so both membership and cluster numbering would change
# from run to run.
means = KMeans(n_clusters=20, random_state=0)

In [24]:
# Fit k-means on the embedding matrix and keep the cluster label assigned to each row.
assignments = means.fit_predict(embs)

In [22]:
# Persist the cluster labels next to the embeddings. Mode 'w' replaces any
# existing file; the `with` block guarantees the file is flushed and closed
# even if writing the dataset raises.
with h5py.File("../models/childbook/kmeans.h5", 'w') as h5kmeans:
    h5kmeans["cluster"] = assignments

In [25]:
# One empty word list per cluster id. The count (20) must match the
# n_clusters value given to KMeans. `range` is used instead of `xrange`
# so the cell runs on both Python 2 and Python 3 kernels.
word_clusters = {cluster_id: [] for cluster_id in range(20)}

In [31]:
# Group the vocabulary by cluster label.
# Start from empty lists so re-running this cell does not append every word
# a second time (a no-op on a clean top-to-bottom run).
for members in word_clusters.values():
    del members[:]

# Embedding rows with no vocabulary entry are recorded here rather than
# being swallowed by a bare `except`, so the mismatch stays visible.
unmapped_rows = []
for row, cluster in enumerate(assignments):
    word = id2word.get(row)
    if word is None:
        unmapped_rows.append(row)
        continue
    # An unexpected cluster id now raises KeyError instead of being ignored.
    word_clusters[cluster].append(word)

In [32]:
# Number of cluster labels produced (one per embedding row).
len(assignments)


Out[32]:
21689

In [33]:
# Number of vocabulary entries read from the dictionary file. In the recorded
# output this is one less than len(assignments), i.e. one row has no word.
len(id2word)


Out[33]:
21688

In [34]:
# Number of embedding rows; expected to equal len(assignments).
len(embs)


Out[34]:
21689

In [41]:
# Inspect the words that landed in one cluster (the index is only a label chosen by k-means).
word_clusters[6]


Out[41]:
['one',
 'spite',
 'History',
 'full',
 'portraits',
 'out',
 'ancestress',
 'approve',
 'kind',
 'drop',
 'none',
 'dens',
 'ogres',
 'end',
 'parcel',
 'pair',
 'pores',
 'remarks',
 'rudeness',
 'force',
 'sums',
 'Chancellor',
 'part',
 'accent',
 'German',
 'bit',
 'pound',
 'broiled',
 'rid',
 'whichever',
 'tackle',
 'glory',
 'fun',
 'aware',
 'tops',
 'built',
 'lout',
 'lot',
 'hopes',
 'insisted',
 'torrents',
 'presence',
 'pursuit',
 'arguments',
 'scenes',
 'dignity',
 'ashamed',
 'cleverness',
 'deal',
 'annoyance',
 'those',
 'allowance',
 'conduct',
 'heap',
 'sorts',
 'collection',
 'direction',
 'midst',
 'plenty',
 'uniform',
 'lap',
 'crowd',
 'cries',
 'examination',
 'ca',
 'suit',
 'pairs',
 'names',
 'fond',
 'Duke',
 'point',
 'beloved',
 'spyglass',
 'wo',
 'fountains',
 'information',
 'slice',
 'sides',
 'shower',
 'mass',
 'bodies',
 'east',
 'edge',
 'movement',
 'folds',
 'front',
 'sight',
 'vapour',
 'roars',
 'spine',
 'scourges',
 'weight',
 'pint',
 'remains',
 'miss',
 'pretence',
 'hath',
 'guilty',
 'hearts',
 'Castle',
 'Furiosus',
 'strain',
 'Government',
 'remind',
 'libbuty',
 'nature',
 'number',
 'aid',
 'possession',
 'murmur',
 'premises',
 'Caitiff',
 'space',
 'choice',
 'state',
 'prerogative',
 'clap',
 'Which',
 'favour',
 'shouts',
 'success',
 'contempt',
 'maids',
 'border',
 'million',
 'members',
 'discourtesy',
 'Rigour',
 'positions',
 'flash',
 'unpleasantness',
 'virtue',
 'mouthful',
 'patches',
 'rays',
 'heaps',
 'vessels',
 'drops',
 'Fountain',
 'store',
 'need',
 'share',
 'troop',
 'notion',
 'Last',
 'threat',
 'affairs',
 'honeymoon',
 'tone',
 'aloud',
 'water-vessels',
 'Water',
 'Troubles',
 'behaviour',
 'Cap',
 'hold',
 'brace',
 'object',
 'Shoes',
 'Sword',
 'three-quarters',
 'taste',
 'tire',
 'sounds',
 'middle',
 'frame',
 'signs',
 'sheen',
 'howl',
 'drinks',
 'eclipse',
 'science',
 'sort',
 'sentiments',
 'laws',
 'Adventure',
 'rolls',
 'cups',
 'measure',
 'refrain',
 'deprive',
 'worthy',
 'owner',
 'trace',
 'compassion',
 'astonishment',
 'absence',
 'act',
 'value',
 'touch',
 'powers',
 'judgment',
 'Purse',
 'cost',
 'stroke',
 'gnat',
 'instruments',
 'consisted',
 'Daughter',
 'audience',
 'attitude',
 'affirming',
 'welfare',
 'interests',
 'toils',
 'Path',
 'Arms',
 'ignorant',
 'smells',
 'herd',
 'fame',
 'blud',
 'whack',
 'descendant',
 'caus',
 'principle',
 'characters',
 'grounds',
 'locks',
 'hundreds',
 'Gentlemen',
 'standard',
 'amusements',
 'reign',
 'Enough',
 'possess',
 'brutes',
 'thousands',
 'Think',
 'Hang',
 'prime',
 'exercises',
 'blighting',
 'rubies',
 'parts',
 'Bottle',
 'generosity',
 'shape',
 'praise',
 'mark',
 'advantage',
 'deprived',
 'language',
 'complexions',
 'sense',
 'intention',
 'motive',
 'precaution',
 'approved',
 'sum',
 'purchase',
 'description',
 'rows',
 'appearance',
 'lighting',
 'praises',
 'mention',
 'effects',
 'mince-collops',
 'thoughts',
 'lines',
 'marks',
 'bits',
 'dint',
 'None',
 'corners',
 'fragments',
 'feat',
 'form',
 'ranges',
 'whirlpool',
 'habits',
 'centre',
 'port',
 'stupidity',
 'quantities',
 'ingenuity',
 'salaams',
 'multitude',
 'flight',
 'coasts',
 'harvest',
 'Wand',
 'summit',
 'Mountains',
 'blackness',
 'showers',
 'kinds',
 'masses',
 'lump',
 'crest',
 'towers',
 'assistance',
 'faces',
 'patterns',
 'groves',
 'helmets',
 'Cord',
 'Incas',
 'prophesied',
 'crater',
 'verge',
 'thrill',
 'uttered',
 'foundations',
 'Temple',
 'delight',
 'region',
 'religion',
 'merits',
 'points',
 'Cathedral',
 'note',
 'color',
 'loads',
 'Alas',
 'consist',
 'employment',
 'couple',
 'outskirts',
 'genii',
 'quarter',
 'fiercest',
 '1',
 'Traditions',
 'stories',
 'fits',
 'lack',
 'talks',
 'faults',
 'virtues',
 'defects',
 'vex',
 'splinters',
 'west',
 'loan',
 'charity',
 'stays',
 'instead',
 'howls',
 'barrels',
 'numbers',
 'hairs',
 'occurrence',
 'hotly',
 'approach',
 'arches',
 'beings',
 'beware',
 'train',
 'bales',
 'devices',
 'memory',
 'search',
 'garlands',
 'fate',
 'palm',
 'nosegays',
 'ceremonies',
 'decree',
 'palms',
 'bethought',
 'prediction',
 'stride',
 'sample',
 'avenue',
 'ranks',
 'discourse',
 'government',
 'sup',
 'despaired',
 'smell',
 'sweetness',
 'qualities',
 'laces',
 'beauties',
 'prayer',
 'Slave',
 'bestow',
 'basins',
 'brimful',
 'Genie',
 'banks',
 'robbed',
 'head-dress',
 'wines',
 'shriek',
 'required',
 'grasp',
 'dollars',
 'sha',
 'scruff',
 'swarms',
 'rabble',
 'roll',
 'chests',
 'list',
 'troubles',
 'expense',
 'composed',
 'arrival',
 'remembrance',
 'quantity',
 'pawing',
 'admirer',
 'parrots',
 'repent',
 'splendor',
 'hired',
 'tastes',
 'twinkling',
 'remainder',
 'flock',
 'piece',
 'handful',
 'bushels',
 'one-half',
 'Out',
 'behalf',
 'Hall',
 'tubs',
 'grinding',
 'freights',
 'portion',
 'strings',
 'abundance',
 'deceits',
 'Proud',
 'suits',
 'estates',
 'smallest',
 'buildings',
 'conferred',
 'canopy',
 'shade',
 'swarm',
 'forms',
 'ages',
 'voices',
 'histories',
 'fricassee',
 'costume',
 'goblet',
 'draught',
 'contents',
 'display',
 'nutshell',
 'woven',
 'bedside',
 'plumes',
 'brightness',
 'Weary',
 'regiment',
 'tongues',
 'reared',
 'bondage',
 'delivered',
 'greetings',
 'moaning',
 'Full',
 'wagon-loads',
 'necklaces',
 'borders',
 'Sister',
 'Island',
 'rattle',
 'harbor',
 'games',
 'secrets',
 'glimmer',
 'rhymes',
 'strokes',
 'Instead',
 'bushel',
 'savored',
 'crossness',
 'partake',
 'ill-usage',
 'earnestness',
 'excused',
 'depression',
 'Company',
 'Academy',
 'conquest',
 'Land',
 'collars',
 'pleasantness',
 'complains',
 'wrists',
 'Treasures',
 'shadows',
 'exclamations',
 'syllable',
 'jobs',
 'cases',
 'circumstance',
 'borrowed',
 'backs',
 'partly',
 'pretense',
 'plot',
 'grandchildren',
 'worry',
 'support',
 'bunch',
 'chips',
 'boulders',
 'ingratitude',
 'glitter',
 'gust',
 'puff',
 'deeds',
 'accused',
 'wickedness',
 'counsels',
 'deliverance',
 'faithfulness',
 'Numbers',
 'blades',
 'quit',
 'tameness',
 'ranged',
 'beholds',
 'bowls',
 'tons',
 'forepart',
 'Number',
 'flights',
 'ounces',
 'lightness',
 'squeezed',
 '5th',
 'tidings',
 'hogsheads',
 'smarts',
 'breadth',
 'awe',
 'performance',
 'admiral',
 'translation',
 '1,724',
 'twelfth',
 'Empire',
 'prow',
 'knot',
 'hatred',
 'suspicion',
 'amount',
 'gloves',
 '13th',
 'eve',
 'one-third',
 'two-thirds',
 'ecstacy',
 'lustre',
 'possessor',
 'thickness',
 'experiments',
 'philosopher',
 'recovery',
 'acknowledgment',
 'decision',
 'concourse',
 'richness',
 'sconces',
 'choruses',
 'acclamations',
 'complaining',
 'Son',
 'acquit',
 'capable',
 'snares',
 'sheets',
 'tires',
 'clews',
 'clew',
 'demonstrations',
 'execution',
 'echo',
 'Sultaness',
 'den',
 'billet',
 'log',
 'Cout',
 'telled',
 'leavings',
 'aught',
 'Ane',
 'ane',
 'com',
 'stirrup',
 'suite',
 'consequential',
 'bitterness',
 'Holder',
 'jug-jug',
 'platform',
 'bestowed',
 'Waq',
 '8',
 'dependency',
 'slab',
 'Causer',
 'Beware',
 'fashioned',
 'vagabond',
 'society',
 'threads',
 'Names',
 'gait',
 'Scorpion',
 'saddle-bags',
 'Place',
 'watches',
 'grip',
 'stages',
 'gore',
 'deliverer',
 'portions',
 'anguish',
 'bereft',
 'control',
 'notables',
 'solving',
 'collyrium',
 'ahead',
 'herds',
 'scrap',
 'Bureau',
 'Bird',
 'supply',
 'clump',
 'grove',
 'glimpse',
 'wasting',
 'weave',
 'Journal',
 'scuffling',
 'grains',
 'chances',
 'Story',
 'wreaths',
 'attacks',
 'attraction',
 'traces',
 'tips',
 'depths',
 'notes',
 'streak',
 'pangs',
 'columns',
 'Tales',
 'token',
 'scores',
 'thong',
 'thickets',
 'claps',
 'sewn',
 'pailful',
 'sackful',
 'Husband',
 'gnawing',
 'group',
 'tuft',
 'coverings',
 'sets',
 'bays',
 'ledge',
 'bundles',
 'sleeve',
 'ray',
 'breakfasted',
 'messages',
 'strip',
 'whisk',
 'court-yard',
 'spadeful',
 'rob',
 'sill',
 'recollection',
 'energy',
 'fates',
 'tinkling',
 'piles',
 'paint',
 'bands',
 'team',
 'doubts',
 'stores',
 'gleam',
 'lasts',
 'skein',
 'tortures',
 'intrusion',
 'screen',
 'rim',
 'disappearance',
 'outburst',
 'tangle',
 'Loveliest',
 'trellis',
 'deeps',
 'boundary',
 'worried',
 'flickering',
 'Officers',
 'packet',
 'breed',
 'expenses',
 'swing',
 'blaring',
 'foolishness',
 'ledges',
 'peered',
 'myriads',
 'Tale',
 'Knights',
 'flap',
 'grating',
 'troops',
 'Fairest',
 'Language',
 'copy',
 'sleeves',
 'Heart',
 'Hundreds',
 'trails',
 'baying',
 'campful',
 'murmurs',
 'forgetfulness',
 'shrinking',
 'rumour',
 'nickname',
 'glow',
 'woe',
 'bemoan',
 'hoard',
 'fancies',
 'Horn',
 'Plenty',
 'layer',
 'profits',
 'tunic',
 'spadefuls',
 'Kingdom',
 'complaints',
 'blinds',
 'streaks',
 'shores',
 'boasting',
 'falsehood',
 'whips',
 'reports',
 'specimen',
 'management',
 'Wonder',
 'halves',
 'multitudes',
 'World',
 'cuff',
 'bucketfuls',
 'joints',
 'stopper',
 'movements',
 'range',
 'cupful',
 'twang',
 'agony',
 'savings',
 'Pacha',
 'Sheik',
 'tones',
 'masterpiece',
 'pupil',
 'proofs',
 'descendants',
 'fonder',
 'observant',
 'large-minded',
 'braying',
 'misdeed',
 'suddenness',
 'hem',
 'whereabouts',
 'disorder',
 'dyke',
 'Rath',
 'Fictions',
 'flower-beds',
 'tufts',
 'sprig',
 'sparkles',
 'nought',
 'armful',
 'reels',
 'knots',
 'sheaf',
 'eyelids',
 'bragged',
 'babes',
 'jet',
 'accommodate',
 'inmates',
 'slowness',
 'Foremost',
 'regiments',
 'Crabs',
 'nimbleness',
 'frisk',
 'thundering',
 'Bones',
 'Salmon',
 'dominion',
 'flip',
 'carcase',
 'pinafores',
 'bracelets',
 'Stones',
 'Rear-Guard',
 'horrors',
 'snatches',
 'occupants',
 'handfuls',
 'skeletons',
 'incapable',
 'Greatest',
 'bowels',
 'corridor',
 'News',
 'splinter',
 'flagon',
 'hilts',
 'quest',
 'peal',
 'images',
 'jests',
 'Aziliez',
 'hardness',
 'lots',
 'isle',
 'car',
 'Escape',
 'tumult',
 'packs',
 'plateful',
 'Brownie',
 'Winning',
 'war-horn',
 'loops',
 'burnisher',
 'Abbey',
 'Tradition',
 'clods',
 'clod',
 'war-canoes',
 'archives',
 'Angel',
 'boundaries',
 'links',
 'doer',
 'scraps',
 'cracks',
 'fluff',
 'steaks',
 'web',
 'regardless',
 'resolves',
 'prices',
 'rear',
 'rood',
 'rattling',
 'dose',
 'Adventures',
 'founder',
 'Rover',
 'posy',
 'detail',
 'etiquette',
 'fatigues',
 'terrors',
 'shoal',
 'Wedges',
 'Arch',
 'chaplet',
 'inhabitant',
 'stings',
 'sprigs',
 'sensation',
 'regent',
 'flagging',
 'Gorla',
 'Ardan',
 'bearer',
 'Dog',
 'three-thirds',
 'antlers',
 'gills',
 'teller',
 'examples',
 'screams',
 'paddles',
 'series',
 'feats',
 'dozens',
 'disposed',
 'ruffian',
 'counterpane',
 'fathoms',
 'Slaying',
 'affectation',
 'assortment',
 'hymn',
 'nick',
 'scents',
 'bud',
 'Dolls',
 'soberly',
 'babel',
 'freshness',
 'keg',
 'bakings',
 'kegs',
 'raged',
 'overseer',
 'shipload',
 'Sprig',
 'trimmings',
 'pane',
 's',
 'bristles',
 'bristle',
 'protectress',
 'territory',
 'mercies',
 'pint-bottle',
 'crusts',
 'wells',
 'margin',
 'growls',
 'consisting',
 'turrets',
 'peals',
 'trains',
 'mince-meat',
 'pies',
 'hive',
 'jurisdiction',
 'process',
 'desirous',
 'diversions',
 'instruction',
 'Steed',
 'afflictions',
 'eldest-looking',
 'samples',
 'pocketful',
 'curiosities',
 'stealers',
 'swamps',
 'avalanches',
 'be-thought',
 'scourge',
 'pease-soup',
 'potful',
 'subsistence',
 'reverse',
 'provision',
 'plait',
 'salutations',
 'mindful',
 'firkin',
 'cathedral',
 'millions',
 'retreats',
 'Transylvanians',
 'offences',
 'clusters',
 'carcasses',
 'Prior',
 'prongs',
 'Fearing',
 'chimes',
 'consciousness',
 'fore-part',
 'cellarful',
 'bi',
 'maker',
 'Fearless',
 'Helm',
 'whip-hand',
 'bliss',
 'calves',
 'fore-leg',
 'yolk',
 'scratcher',
 'tankful',
 'glimpses',
 'Small',
 'Goddess',
 ...]

In [ ]: