In [1]:
import zipfile
import collections
import math
import random
import os

In [3]:
import numpy as np
from six.moves import urllib
from six.moves import xrange
import tensorflow as tf

In [7]:
with zipfile.ZipFile('text8.zip') as f:
    data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    print(f.namelist())


['text8']

In [5]:
type(data)


Out[5]:
list

In [6]:
data[0]


Out[6]:
'anarchism'

In [8]:
words = data
print('Data size', len(words))


Data size 17005207

In [9]:
count = [['UNK', -1]]
count.extend(collections.Counter(words).most_common(50000 - 1))

In [10]:
collections.Counter(words)


Out[10]:
Counter({'receipe': 1,
         'bbbb': 21,
         'tyrannos': 1,
         'taufb': 1,
         'weizenbock': 2,
         'plucked': 39,
         'cioppo': 1,
         'paracelsian': 1,
         'seventh': 440,
         'haven': 203,
         'garc': 66,
         'aterballeto': 1,
         'sikkerhetstjeneste': 1,
         'yereko': 1,
         'archeologia': 1,
         'poseidon': 84,
         'socks': 56,
         'addo': 2,
         'metrodome': 14,
         'dellingr': 1,
         'shahanshah': 14,
         'eldey': 1,
         'bhaktamara': 1,
         'gunsight': 1,
         'stuttered': 2,
         'two': 192644,
         'shirra': 1,
         'civile': 4,
         'boros': 2,
         'siyyid': 1,
         'ventadorn': 1,
         'obsidio': 1,
         'rickard': 3,
         'detachable': 10,
         'midfootprint': 1,
         'contrapunctus': 2,
         'fraktur': 3,
         'deuet': 1,
         'burkes': 2,
         'prompting': 86,
         'jaff': 1,
         'ebbing': 3,
         'richie': 36,
         'beachey': 1,
         'sselsprung': 2,
         'adaptec': 1,
         'maniac': 40,
         'mouthful': 4,
         'gnostica': 2,
         'pqcc': 1,
         'shadowcat': 2,
         'szef': 1,
         'unpronounced': 1,
         'sliaswich': 1,
         'latform': 1,
         'antispasmodics': 1,
         'embedix': 1,
         'promissory': 3,
         'latticework': 1,
         'gafur': 1,
         'limon': 6,
         'variations': 790,
         'residential': 196,
         'zoundweogo': 2,
         'lassitude': 1,
         'tschudi': 2,
         'bragga': 2,
         'icelandair': 3,
         'iuj': 1,
         'mausoleums': 5,
         'fagioli': 2,
         'enos': 13,
         'ghadir': 1,
         'adelstein': 2,
         'haganah': 22,
         'collision': 196,
         'romeyn': 1,
         'boedo': 1,
         'caramelizes': 1,
         'psychotria': 7,
         'resnais': 3,
         'turbobooksnob': 1,
         'tensilon': 1,
         'contact': 1111,
         'ddn': 4,
         'emplacing': 1,
         'serenades': 1,
         'stephenson': 81,
         'swaffham': 1,
         'pbx': 6,
         'zhiveli': 1,
         'needleless': 1,
         'anabasis': 4,
         'thaxter': 1,
         'xds': 2,
         'esala': 1,
         'indycar': 2,
         'taskforce': 4,
         'ueueteotl': 1,
         'kafr': 3,
         'codename': 40,
         'snue': 3,
         'decarburized': 1,
         'burchill': 2,
         'caymanian': 8,
         'taikong': 1,
         'dhtml': 21,
         'alderaan': 1,
         'juliani': 2,
         'thrombosis': 5,
         'petrovaradin': 1,
         'musters': 1,
         'tahb': 2,
         'agapanthaceae': 4,
         'benzyl': 12,
         'selwood': 1,
         'gibbon': 74,
         'seafire': 2,
         'ostrogradsky': 1,
         'whitt': 1,
         'blockstackers': 2,
         'korda': 6,
         'schelenker': 1,
         'cottingham': 2,
         'gcj': 4,
         'eparchia': 1,
         'ohmss': 1,
         'messianic': 76,
         'nfp': 1,
         'endocytosed': 1,
         'surnameweb': 1,
         'mostadir': 2,
         'sugarland': 1,
         'ency': 3,
         'tackhead': 1,
         'athonite': 4,
         'ninhursag': 25,
         'singamen': 1,
         'sandwich': 76,
         'hexaaqua': 1,
         'ministerthis': 1,
         'respire': 2,
         'yates': 45,
         'bargain': 59,
         'rspb': 3,
         'toiled': 2,
         'bocharova': 1,
         'surratt': 6,
         'warp': 56,
         'discala': 1,
         'dknf': 3,
         'miyamoto': 24,
         'kembla': 1,
         'moulmien': 1,
         'leaving': 1050,
         'jarma': 2,
         'anise': 23,
         'margretetorp': 2,
         'dirais': 1,
         'walkout': 1,
         'boltysh': 1,
         'bruner': 4,
         'kawakawa': 1,
         'sculptors': 58,
         'neesa': 1,
         'phantasy': 2,
         'basilius': 3,
         'counterpoint': 146,
         'grunewaldturm': 1,
         'margulis': 11,
         'equidistributed': 1,
         'philopateer': 1,
         'sopwith': 9,
         'hemoglobin': 107,
         'birchall': 1,
         'mikiver': 1,
         'zzuh': 1,
         'conditionalities': 2,
         'drape': 3,
         'ostrogski': 1,
         'baldwinsville': 2,
         'pentaphylacaceae': 1,
         'folkefiende': 1,
         'silesian': 23,
         'discussing': 198,
         'sequentially': 39,
         'jearim': 5,
         'isches': 2,
         'emplaced': 7,
         'nitra': 4,
         'subbytes': 4,
         'kephalaia': 1,
         'ohrenzeuge': 1,
         'effluunt': 1,
         'toco': 1,
         'crisps': 8,
         'callin': 3,
         'entourages': 1,
         'topically': 5,
         'koshanis': 1,
         'bondar': 2,
         'feo': 1,
         'multipath': 7,
         'timson': 1,
         'graysubject': 1,
         'hourani': 2,
         'pstn': 17,
         'lockerroom': 1,
         'lantana': 2,
         'sdsm': 4,
         'nakatta': 1,
         'crticism': 1,
         'suwanose': 1,
         'torres': 41,
         'lazaro': 1,
         'counterpulsation': 1,
         'teletypes': 4,
         'awali': 1,
         'lxiv': 3,
         'sonangol': 4,
         'miscarries': 1,
         'dentary': 2,
         'pull': 246,
         'guruma': 1,
         'amuses': 3,
         'elinvar': 1,
         'neuendorfer': 1,
         'coquaternion': 1,
         'aat': 3,
         'tetraol': 1,
         'iridales': 4,
         'adress': 1,
         'munificence': 2,
         'kitted': 1,
         'scarone': 1,
         'gentilly': 1,
         'agronomist': 4,
         'suprematism': 3,
         'baeyer': 2,
         'petraroia': 1,
         'kind': 1496,
         'lusavorich': 1,
         'intoxication': 45,
         'kesserich': 1,
         'eghlid': 1,
         'leibenstien': 1,
         'couronne': 1,
         'rituals': 192,
         'categorical': 54,
         'condensate': 44,
         'pean': 1,
         'fanu': 13,
         'gother': 1,
         'astorga': 1,
         'aaate': 1,
         'deitifying': 1,
         'almanack': 6,
         'outrage': 62,
         'avoid': 1062,
         'realists': 15,
         'wickets': 69,
         'quintus': 42,
         'ngurah': 2,
         'bsb': 2,
         'biophys': 2,
         'inutile': 1,
         'len': 62,
         'valdemar': 27,
         'eragon': 2,
         'debriefers': 2,
         'parlor': 9,
         'ceridwen': 1,
         'gcg': 2,
         'escombra': 1,
         'aspern': 5,
         'hyracoid': 1,
         'montogomery': 1,
         'budokan': 1,
         'fuliginosa': 1,
         'ande': 1,
         'nease': 2,
         'ay': 47,
         'necroticism': 1,
         'deletes': 6,
         'analogues': 29,
         'digitiser': 1,
         'levitcus': 1,
         'zambian': 12,
         'musonius': 5,
         'solim': 3,
         'watchfulness': 2,
         'permament': 4,
         'wipo': 91,
         'memoriam': 16,
         'costo': 1,
         'westprussia': 1,
         'spinoza': 109,
         'theodora': 34,
         'lins': 5,
         'glennie': 2,
         'wichman': 1,
         'unpressurised': 1,
         'membrane': 411,
         'xuanwu': 1,
         'odevaere': 1,
         'cemach': 1,
         'colonic': 2,
         'kitur': 1,
         'teena': 3,
         'arastirma': 1,
         'gnowee': 1,
         'spinibarbus': 1,
         'shneiderman': 1,
         'kosmische': 2,
         'pearcey': 2,
         'danann': 14,
         'hashd': 1,
         'zoromes': 1,
         'usnewslink': 2,
         'saxophonist': 50,
         'roja': 3,
         'dharmic': 7,
         'gdp': 1169,
         'lennart': 5,
         'schabir': 1,
         'nakhla': 1,
         'mcas': 2,
         'gedanite': 1,
         'palminteri': 3,
         'bretella': 1,
         'wayans': 8,
         'battus': 3,
         'songshan': 1,
         'ionescu': 1,
         'inhabits': 17,
         'ogooue': 3,
         'ariyya': 1,
         'revenge': 249,
         'kultur': 5,
         'massacred': 52,
         'mostest': 1,
         'tandem': 43,
         'netx': 1,
         'rambo': 6,
         'tripes': 1,
         'beat': 536,
         'melanoleuca': 9,
         'vasaplatsen': 1,
         'megillah': 3,
         'mineptah': 1,
         'stomachs': 14,
         'shapiro': 40,
         'operculum': 3,
         'devereaux': 1,
         'kayko': 1,
         'chimpanzees': 47,
         'sct': 3,
         'ffp': 1,
         'lasfar': 1,
         'dont': 15,
         'bavin': 1,
         'dellacroce': 2,
         'methimazole': 1,
         'daeraouenn': 1,
         'hercegovine': 2,
         'subplot': 10,
         'friemsprooch': 1,
         'cellcom': 3,
         'remarry': 15,
         'vanda': 2,
         'inx': 6,
         'certifiable': 1,
         'bichvinta': 1,
         'technicum': 2,
         'bosetti': 1,
         'drips': 7,
         'refract': 4,
         'woodsy': 1,
         'inchbonny': 1,
         'cscr': 1,
         'endorse': 43,
         'daniell': 15,
         'reitpony': 1,
         'christocentric': 1,
         'korsun': 1,
         'kankkunen': 1,
         'huda': 2,
         'wandra': 1,
         'dysrhythmia': 5,
         'hinkle': 2,
         'resumable': 1,
         'eiserne': 2,
         'holmr': 1,
         'holocaust': 498,
         'inescutcheon': 1,
         'keo': 4,
         'horsk': 1,
         'ambalat': 4,
         'hoovers': 3,
         'euxine': 4,
         'edukacine': 1,
         'chechahcos': 3,
         'impuestos': 1,
         'lymington': 5,
         'eslpdpro': 1,
         'muttra': 2,
         'klitou': 2,
         'gynaecologists': 2,
         'lateritic': 2,
         'tarkan': 1,
         'blackjacks': 6,
         'krupp': 50,
         'kawasaki': 17,
         'rapsur': 1,
         'comment': 324,
         'paschkis': 2,
         'tughluq': 1,
         'masurien': 1,
         'tufnel': 3,
         'railton': 1,
         'dusis': 1,
         'seyon': 1,
         'mcnamara': 30,
         'brizio': 1,
         'grappled': 2,
         'undernourished': 6,
         'biomolecular': 3,
         'morroco': 3,
         'torngasoak': 1,
         'footbal': 1,
         'slackness': 1,
         'goleta': 1,
         'comb': 71,
         'agust': 15,
         'klowdan': 2,
         'bretthorst': 1,
         'frusoni': 1,
         'songcatchers': 1,
         'roimh': 2,
         'dechei': 1,
         'seznec': 2,
         'hopefully': 38,
         'ezenarro': 1,
         'krebbs': 1,
         'berthier': 4,
         'pegged': 48,
         'domestically': 45,
         'torpediniformes': 1,
         'nofziger': 1,
         'wschodzi': 1,
         'atrophicus': 1,
         'saarinen': 12,
         'ifi': 1,
         'statesman': 380,
         'albertsons': 2,
         'chazelle': 1,
         'prepositioning': 7,
         'galeatus': 2,
         'honfleur': 4,
         'machen': 10,
         'bmx': 9,
         'ostfriesland': 2,
         'carlist': 18,
         'valera': 74,
         'confuse': 76,
         'iterators': 5,
         'spingovics': 1,
         'omeros': 3,
         'accelerating': 64,
         'vlbi': 3,
         'mortise': 2,
         'phlomobacter': 1,
         'strapline': 2,
         'faaglar': 1,
         'skel': 8,
         'aznan': 1,
         'csascii': 1,
         'rosbanen': 1,
         'barbarossa': 51,
         'discover': 276,
         'wexfordman': 1,
         'responses': 216,
         'laurentianus': 3,
         'iaorg': 1,
         'reuse': 33,
         'malt': 65,
         'demarchy': 2,
         'forre': 1,
         'konaseema': 1,
         'wulfila': 3,
         'illiterates': 1,
         'maneggio': 1,
         'cgtm': 1,
         'deum': 12,
         'informationsschriften': 1,
         'coleophora': 1,
         'qbz': 1,
         'contrapunctal': 1,
         'prefrences': 1,
         'illo': 2,
         'melia': 1,
         'ammah': 2,
         'collisions': 116,
         'gymnocercus': 1,
         'feline': 32,
         'yadav': 5,
         'revalent': 1,
         'danaides': 1,
         'antihero': 3,
         'velocites': 2,
         'leipziginfo': 1,
         'deusdedit': 1,
         'givant': 1,
         'connectors': 66,
         'cundinamarcensis': 1,
         'telencephalisation': 1,
         'nationalpark': 1,
         'extraperitoneal': 1,
         'bundesmarine': 5,
         'nazianzos': 4,
         'antilinear': 1,
         'ager': 5,
         'rossetta': 2,
         'abdelazar': 2,
         'yesand': 1,
         'speccie': 1,
         'adamou': 2,
         'azurite': 1,
         'deoxyhemoglobin': 2,
         'camiller': 2,
         'botaniates': 4,
         'grantees': 1,
         'spataro': 1,
         'barbe': 2,
         'merinid': 2,
         'matematicheskikh': 1,
         'doron': 5,
         'ergosphere': 6,
         'pentaamminechlorocobalt': 1,
         'feudatories': 7,
         'planetfall': 11,
         'strafe': 4,
         'powermacs': 5,
         'preen': 1,
         'kekouan': 1,
         'accugroove': 1,
         'glr': 7,
         'chali': 1,
         'sarcophagi': 10,
         'scannd': 1,
         'ozhypnosis': 1,
         'brittannica': 1,
         'colin': 178,
         'vdots': 8,
         'hispano': 13,
         'enigmatoze': 1,
         'mordant': 13,
         'tavas': 3,
         'ige': 17,
         'goretex': 1,
         'gymnopaedia': 1,
         'dsc': 4,
         'alfen': 2,
         'justicialist': 2,
         'yallop': 1,
         'btfsplk': 2,
         'psinet': 1,
         'transportas': 1,
         'biochemically': 2,
         'torona': 1,
         'hercules': 160,
         'multis': 2,
         'gallbladder': 21,
         'ragnit': 1,
         'wltw': 1,
         'cata': 2,
         'eliminator': 3,
         'lipolysis': 8,
         'superquadrics': 1,
         'litt': 29,
         'insecticide': 24,
         'antimicrobial': 11,
         'libi': 17,
         'proprioception': 8,
         'curonians': 4,
         'silat': 2,
         'petrodiesel': 1,
         'posion': 1,
         'chaux': 5,
         'barrows': 9,
         'grrm': 29,
         'fving': 1,
         'soliti': 1,
         'styrofoam': 7,
         'alito': 4,
         'thymomas': 1,
         'damascius': 6,
         'civil': 3443,
         'halflings': 2,
         'stated': 1383,
         'panache': 3,
         'sabbatical': 9,
         'landstreicher': 1,
         'mashita': 1,
         'pleads': 25,
         'algerie': 3,
         'eea': 36,
         'gagliano': 2,
         'baltikum': 2,
         'egy': 2,
         'chilterns': 1,
         'advocaat': 1,
         'bartlowicz': 1,
         'munam': 2,
         'party': 6943,
         'unction': 17,
         'jezkov': 1,
         'gekommen': 1,
         'malkin': 3,
         'transzendentale': 1,
         'esters': 71,
         'sandlin': 1,
         'telefantasy': 1,
         'nyby': 1,
         'solanine': 1,
         'tago': 2,
         'gdcl': 1,
         'hesperoleucus': 1,
         'servius': 11,
         'cicuta': 2,
         'civitatis': 1,
         'juliae': 1,
         'proposal': 460,
         'magnaflow': 1,
         'peliz': 1,
         'researchers': 615,
         'athiest': 1,
         'localtalk': 2,
         'makah': 6,
         'rennovation': 1,
         'soursop': 2,
         'housekeepers': 2,
         'bretons': 9,
         'vle': 4,
         'kasim': 1,
         'ceolmhar': 1,
         'scholfield': 1,
         'thousander': 2,
         'rajput': 14,
         'decibannage': 1,
         'foreheads': 3,
         'multimark': 1,
         'transferral': 1,
         'schistosomal': 1,
         'eccentrically': 2,
         'midibus': 1,
         'mappings': 12,
         'rabban': 13,
         'fadlallah': 1,
         'absconding': 1,
         'maajka': 1,
         'gedae': 1,
         'olandt': 1,
         'unzip': 2,
         'prealps': 1,
         'aboot': 4,
         'attacotti': 2,
         'antianxiety': 1,
         'decimals': 13,
         'perish': 34,
         'thorne': 19,
         'ahikuntaka': 1,
         'dcnos': 1,
         'rajasthani': 5,
         'nemrod': 2,
         'trimethoprim': 1,
         'kamman': 1,
         'katrina': 82,
         'cwiss': 1,
         'fromuth': 2,
         'unwinding': 7,
         'manipular': 1,
         'charityware': 1,
         'aara': 6,
         'flyingfish': 1,
         'netherlandish': 3,
         'premised': 6,
         'forelock': 1,
         'infertile': 18,
         'alyat': 1,
         'voyages': 99,
         'altarpiece': 16,
         'encre': 1,
         'menticide': 1,
         'fittest': 15,
         'nishkam': 1,
         'proliant': 2,
         'absu': 1,
         'intoxicating': 6,
         'lookahead': 9,
         'conscience': 145,
         'costas': 79,
         'duquesne': 6,
         'pino': 8,
         'unabsehbare': 1,
         'desmosedici': 2,
         'herod': 66,
         'ceti': 16,
         'overflight': 4,
         'maba': 1,
         'reiger': 1,
         'callipyge': 1,
         'durie': 3,
         'abenner': 3,
         'katakana': 99,
         'shabak': 3,
         'sackheim': 1,
         'nepente': 1,
         'boopsie': 10,
         'radiotherapeutic': 1,
         'nachtigal': 1,
         'diseconomy': 1,
         'mangoes': 5,
         'satchmo': 11,
         'nitrile': 13,
         'oyster': 43,
         'arabe': 5,
         'entertwined': 1,
         'shearman': 3,
         'rolighed': 1,
         'lillywhite': 1,
         'undertakings': 19,
         'bernarr': 3,
         'beauvoir': 23,
         'cosmonaut': 67,
         'aramite': 1,
         'campana': 6,
         'barragan': 1,
         'baylon': 2,
         'cultic': 24,
         'prog': 13,
         'frotteurism': 5,
         'creationists': 61,
         'extradition': 40,
         'skowron': 1,
         'qaim': 2,
         'resveratrol': 2,
         'stava': 1,
         'kumbalom': 1,
         'conseillers': 5,
         'kerneltrap': 1,
         'leijonhufvud': 1,
         'hymenaios': 1,
         'foes': 41,
         'licorne': 2,
         'chemosensory': 1,
         'zeropaid': 1,
         'sybian': 1,
         'thermoelectricity': 3,
         'baptismo': 2,
         'mallorquin': 4,
         'chongzhi': 1,
         'abipones': 5,
         'pelje': 2,
         'keisker': 1,
         'wingen': 1,
         'grosse': 23,
         'hubbed': 3,
         'caciocavallo': 1,
         'capillarity': 2,
         'aftab': 3,
         'husked': 1,
         'wiecino': 17,
         'canuti': 2,
         'attenuators': 1,
         'murena': 3,
         'relegating': 6,
         'reapportionment': 1,
         'xu': 25,
         'devised': 217,
         'laotians': 8,
         'temazepam': 2,
         'demolishment': 1,
         'ventricle': 15,
         'pownal': 3,
         'pageants': 5,
         'uttu': 3,
         'peaky': 1,
         'boye': 3,
         'canadice': 2,
         'gorditas': 1,
         'flores': 57,
         'stigmas': 5,
         'bludgeonings': 1,
         'titoist': 1,
         'merchants': 260,
         'chambres': 3,
         'haggada': 2,
         'lustreless': 1,
         'underbrace': 6,
         'subtilisin': 1,
         'retorica': 1,
         'sabbaths': 5,
         'blindheit': 1,
         'talbot': 44,
         'anagalida': 1,
         'sariel': 2,
         'selenia': 2,
         'kisumu': 4,
         'todd': 166,
         'infano': 1,
         'tinymuse': 1,
         'almsgiving': 7,
         'rowena': 9,
         'ironmonger': 2,
         'borradori': 1,
         'atypicals': 4,
         'smithkline': 1,
         'bareki': 1,
         'chione': 2,
         'ellus': 1,
         'strop': 2,
         'nationalizing': 4,
         'wakame': 2,
         'licori': 1,
         'recalculated': 6,
         'jq': 1,
         'martial': 639,
         'kronheimer': 1,
         'opresi': 1,
         'earthworm': 32,
         'lpu': 6,
         'kyukoku': 1,
         'weigard': 1,
         'jaden': 2,
         'grumiaux': 1,
         'dignified': 28,
         'juventud': 4,
         'serogroup': 1,
         'autokefalicznego': 1,
         'barroco': 2,
         'hawking': 51,
         'baluchestan': 3,
         'uldis': 1,
         'cultured': 60,
         'medicinally': 6,
         'leegot': 1,
         'paleckis': 1,
         'malcolmus': 1,
         'chronograms': 1,
         'kinneging': 1,
         'neuropterous': 1,
         'mokele': 6,
         'somar': 2,
         'umbrinus': 1,
         'arcweb': 1,
         'quijano': 2,
         'pituriaspida': 1,
         'liverani': 1,
         'yamachi': 1,
         'plowboy': 1,
         'polymixiiformes': 1,
         'pack': 248,
         'hymnwriter': 2,
         'lambic': 4,
         'gansters': 1,
         'quizzically': 1,
         'macqueen': 1,
         'ginits': 1,
         'paton': 12,
         'payame': 2,
         'hmac': 21,
         'photoplay': 5,
         'shantipur': 1,
         'artillerie': 1,
         'iots': 1,
         'liking': 36,
         'wize': 1,
         'quilts': 1,
         'tardigradus': 1,
         'cardelli': 6,
         'rihtiniemi': 2,
         'iste': 1,
         'alphekka': 2,
         'pinewood': 5,
         'ntruencrypt': 1,
         'comedians': 106,
         'eichlami': 1,
         'koniag': 1,
         'kawaimina': 1,
         'attiret': 2,
         'fleener': 2,
         'unpopularity': 27,
         'aiwaz': 1,
         'dongsishitiao': 1,
         'baffin': 25,
         'articulator': 1,
         'bullhead': 1,
         'ujaama': 1,
         'marketability': 3,
         'albumsbob': 1,
         'strawman': 2,
         'sinofsky': 1,
         'enthusiasms': 6,
         'lasdehnen': 1,
         'parodically': 2,
         'dohrn': 1,
         'ulnar': 1,
         'faired': 4,
         'wannabe': 1,
         'ijc': 1,
         'superscript': 16,
         'merarites': 1,
         'kdwb': 1,
         'iguanids': 1,
         'virgin': 446,
         'kajn': 1,
         'aceramic': 2,
         'cyclone': 50,
         'intelligible': 75,
         'ergonomie': 1,
         'viz': 39,
         'britcom': 1,
         'ackey': 1,
         'gann': 7,
         'galand': 1,
         'tauros': 2,
         'ikammanen': 1,
         'evacuating': 12,
         'checagou': 1,
         'sword': 596,
         'turps': 3,
         'knorr': 2,
         'didactylus': 3,
         'deleted': 83,
         'microfortnight': 1,
         'exemptus': 1,
         'sauraseni': 1,
         'vicarage': 12,
         'theodorus': 7,
         'shimane': 3,
         'carnoy': 1,
         'proteolytic': 3,
         'wienerwald': 1,
         'waitemata': 8,
         'brews': 18,
         'bumiputras': 1,
         'ambiguous': 195,
         'renaldo': 5,
         'pambazos': 1,
         'kacem': 1,
         'adel': 12,
         'lingua': 78,
         'keyn': 1,
         'waid': 11,
         'hypnotherapy': 23,
         'belleek': 1,
         'cathedral': 550,
         'inculcated': 5,
         'polglase': 1,
         'exil': 2,
         'chava': 1,
         'emerick': 1,
         'nazims': 2,
         'copulatio': 1,
         'empelor': 1,
         'promontory': 19,
         'constabulary': 22,
         'imperforation': 1,
         'mdir': 1,
         'musik': 37,
         'puffin': 6,
         'corita': 1,
         'benaud': 4,
         'dependents': 27,
         'seefahrer': 1,
         'salaca': 1,
         'zinovievna': 1,
         'uspenskaia': 1,
         'ginster': 1,
         'lithographer': 3,
         'morgaine': 2,
         'ratae': 1,
         'gadaba': 1,
         'rbau': 1,
         'ats': 21,
         'lldin': 1,
         'eritreans': 19,
         'chapterhouse': 15,
         'strangled': 26,
         'patterns': 718,
         'joppy': 1,
         'mboxg': 1,
         'penrose': 40,
         'mimimalists': 1,
         'oceania': 100,
         'laish': 1,
         'southamerican': 1,
         'figuere': 1,
         'turchetta': 1,
         ...})

In [11]:
dictionary = dict()
for word, _ in count:
    dictionary[word] = len(dictionary)

In [12]:
type(dictionary)


Out[12]:
dict

In [13]:
dictionary


Out[13]:
{'kyrie': 38567,
 'bbbb': 30107,
 'factually': 39531,
 'deane': 36772,
 'interlude': 34358,
 'seventh': 3833,
 'haven': 7089,
 'linebarger': 24744,
 'garc': 15348,
 'stephenson': 13505,
 'pln': 40141,
 'theron': 24290,
 'mime': 11248,
 'formations': 7101,
 'rulebase': 29374,
 'turkey': 2222,
 'counternarcotics': 44613,
 'poseidon': 13142,
 'socks': 17009,
 'ginza': 48584,
 'vulnerabilities': 21888,
 'transforms': 11624,
 'yukio': 45944,
 'metrodome': 37431,
 'realization': 10831,
 'shahanshah': 37432,
 'warnings': 14737,
 'laren': 29375,
 'activating': 25909,
 'wrecking': 46562,
 'unguarded': 46563,
 'mamluk': 34659,
 'jacques': 3757,
 'ioannis': 49345,
 'detachable': 44612,
 'rameses': 49435,
 'vincendeau': 34058,
 'newsstand': 49219,
 'bonfires': 35768,
 'faithful': 6944,
 'channel': 1091,
 'prompting': 12954,
 'mandating': 38207,
 'richie': 22154,
 'excitation': 24987,
 'anaximander': 20716,
 'lap': 14312,
 'adverb': 27212,
 'moranis': 48597,
 'maniac': 20848,
 'nepotism': 41512,
 'endorse': 20171,
 'habilitation': 37596,
 'cpa': 39365,
 'motivating': 25766,
 'lengthened': 28111,
 'novel': 841,
 'alois': 24869,
 'corcovado': 37096,
 'variations': 2290,
 'subliminal': 27046,
 'residential': 7261,
 'hopefully': 21630,
 'feud': 16067,
 'amalthea': 32128,
 'donkeys': 41513,
 'missy': 45929,
 'discogs': 43333,
 'vivekananda': 45265,
 'dowling': 43496,
 'juan': 4340,
 'experienced': 2707,
 'rosette': 36773,
 'haganah': 29373,
 'aor': 47045,
 'grandmaster': 17658,
 'distilling': 34510,
 'contact': 1622,
 'orchid': 26752,
 'tranquillity': 41514,
 'fares': 18393,
 'gwh': 8123,
 'hammers': 23332,
 'announced': 1423,
 'culmination': 20238,
 'owe': 13844,
 'begged': 23974,
 'cheques': 26521,
 'xof': 45945,
 'dhtml': 30109,
 'hydrate': 44081,
 'grok': 44028,
 'consist': 3959,
 'sumer': 26044,
 'beekeeping': 41996,
 'benzyl': 40563,
 'gibbon': 14260,
 'teschen': 44082,
 'stony': 26476,
 'aliyah': 39338,
 'foraging': 19968,
 'sosa': 27663,
 'messianic': 14023,
 'pay': 1898,
 'bee': 8613,
 'birley': 38212,
 'wan': 14563,
 'carmelite': 40142,
 'continuations': 35197,
 'ninhursag': 27350,
 'externally': 17520,
 'jamo': 12518,
 'nanometers': 36081,
 'sandwich': 14024,
 'adjacency': 48353,
 'fernandez': 23637,
 'yates': 19496,
 'bargain': 16552,
 'ridgway': 45266,
 'recording': 2094,
 'warp': 17010,
 'gander': 33301,
 'stockholders': 36774,
 'unsatisfactory': 27664,
 'leaving': 1717,
 'anise': 28630,
 'dependents': 26477,
 'plotinus': 19308,
 'approximated': 18207,
 'eritreans': 32352,
 'feelin': 47379,
 'omnivores': 49346,
 'sculptors': 16633,
 'fairies': 16415,
 'leet': 9443,
 'counterpoint': 8984,
 'develop': 1691,
 'protein': 2362,
 'kummer': 48598,
 'enrollment': 15097,
 'analects': 22069,
 'sopwith': 47136,
 'hemoglobin': 11222,
 'bookmarks': 38022,
 'semantic': 9023,
 'cannery': 48599,
 'babel': 19373,
 'eyebrows': 26622,
 'kip': 30965,
 'silesian': 28631,
 'discussing': 7215,
 'sequentially': 21149,
 'separation': 3487,
 'anxious': 18394,
 'collectivization': 22667,
 'diethyl': 31686,
 'wikiquote': 34364,
 'kib': 24414,
 'landscape': 4641,
 'device': 1499,
 'clarinetist': 37438,
 'mannerheim': 25910,
 'memetic': 17627,
 'ascendancy': 25357,
 'maceo': 40578,
 'defer': 37634,
 'torres': 20565,
 'batsman': 18030,
 'gro': 22430,
 'gaff': 49347,
 'pull': 6146,
 'bail': 20991,
 'mauro': 28475,
 'eller': 49348,
 'gematria': 27354,
 'evangelica': 49768,
 'certificate': 10475,
 'offerings': 12357,
 'sephardic': 20239,
 'resolutely': 45946,
 'mining': 3429,
 'kind': 1190,
 'cones': 13481,
 'trade': 513,
 'intoxication': 19497,
 'meteorological': 17301,
 'livestock': 7418,
 'jamie': 15877,
 'encourage': 5506,
 'petrified': 30332,
 'rituals': 7384,
 'categorical': 17413,
 'indicator': 9039,
 'condensate': 19751,
 'ratios': 7825,
 'retroactive': 39339,
 'outrage': 15946,
 'hutchison': 41515,
 'avoid': 1699,
 'realists': 36075,
 'metaphysical': 9055,
 'ours': 16337,
 'quintus': 20300,
 'mrna': 9528,
 'viridis': 40144,
 'microprogram': 23863,
 'len': 15947,
 'valdemar': 26185,
 'parlor': 47137,
 'mentally': 8825,
 'vos': 34265,
 'roman': 255,
 'lifeless': 33302,
 'u': 146,
 'moss': 14463,
 'activation': 11208,
 'utterances': 25768,
 'analysis': 900,
 'commune': 13589,
 'downloaded': 14555,
 'disposition': 14738,
 'ay': 18967,
 'chalukyas': 34359,
 'analogues': 25098,
 'feathers': 12096,
 'laissez': 12971,
 'hodder': 30647,
 'inventions': 9104,
 'coexisted': 41516,
 'wipo': 12453,
 'memoriam': 34948,
 'burgeoning': 19844,
 'positively': 12407,
 'theodora': 22893,
 'scientifically': 12551,
 'bj': 6368,
 'acted': 5158,
 'aptitude': 28476,
 'landowners': 18802,
 'fcc': 16036,
 'membrane': 4054,
 'indecisive': 32825,
 'gift': 4822,
 'rna': 6645,
 'anglia': 23432,
 'golgi': 18438,
 'danann': 37433,
 'botvinnik': 45267,
 'lifts': 18803,
 'bs': 14990,
 'consult': 9684,
 'gdp': 1532,
 'partners': 3671,
 'highlights': 10562,
 'annexes': 25346,
 'forgotten': 6082,
 'confinement': 17157,
 'inhabits': 33801,
 'typified': 21403,
 'revenge': 6093,
 'rstenberg': 29738,
 'massacred': 17838,
 'tandem': 20028,
 'luiz': 49541,
 'beat': 3249,
 'melanoleuca': 47138,
 'dodecahedron': 14966,
 'minds': 6251,
 'stomachs': 37434,
 'albany': 14780,
 'horizontally': 12179,
 'rakyat': 45948,
 'chimpanzees': 18968,
 'logarithmic': 15236,
 'hezekiah': 12495,
 'bearable': 49350,
 'antwerp': 11909,
 'evenings': 27044,
 'hq': 24402,
 'subplot': 44614,
 'priests': 4051,
 'remarry': 36076,
 'ruptured': 48106,
 'nig': 28285,
 'mishneh': 32129,
 'windowing': 37099,
 'terrace': 16130,
 'negligent': 47240,
 'aidan': 23119,
 'licensed': 5175,
 'dell': 10542,
 'rhyolite': 43519,
 'confounding': 49855,
 'loves': 10672,
 'yag': 27666,
 'smoky': 27357,
 'francophones': 28944,
 'magnify': 32353,
 'vitro': 16590,
 'templars': 24870,
 'girolamo': 23008,
 'pelham': 27045,
 'holocaust': 3453,
 'misinterpreted': 30746,
 'liam': 16967,
 'copa': 37448,
 'catalana': 42992,
 'correctness': 17051,
 'symphonies': 17579,
 'strange': 4084,
 'gurdjieff': 49842,
 'affirm': 18660,
 'brandon': 17519,
 'kawasaki': 33802,
 'comment': 4956,
 'hc': 21707,
 'dane': 23975,
 'huck': 34155,
 'apec': 27667,
 'mcnamara': 24625,
 'crush': 12298,
 'indictees': 38569,
 'ignosticism': 45594,
 'kell': 31202,
 'comb': 14623,
 'agust': 36078,
 'abbreviate': 48601,
 'boxcar': 47864,
 'buried': 2781,
 'intersystems': 41351,
 'lithuania': 3003,
 'lands': 2112,
 'man': 243,
 'snorri': 15846,
 'unravel': 44084,
 'sns': 33063,
 'pegged': 18742,
 'decorating': 32592,
 'languedoc': 43498,
 'domestically': 19498,
 'lees': 39130,
 'maitreya': 39401,
 'saarinen': 40566,
 'statesman': 4345,
 'zalta': 48602,
 'values': 1140,
 'bmx': 47139,
 'valera': 14261,
 'glycogen': 19520,
 'swift': 8228,
 'accelerating': 15644,
 'ventilated': 46567,
 'rehabilitated': 35770,
 'nayla': 45269,
 'coldcut': 39341,
 'subordinate': 8547,
 'catalyze': 34376,
 'doubled': 9009,
 'cultivated': 10320,
 'discover': 5621,
 'grabs': 34059,
 'articles': 1097,
 'responses': 6754,
 'enos': 38935,
 'reuse': 23331,
 'malt': 15505,
 'grassy': 26623,
 'differed': 11944,
 'suda': 42000,
 'deum': 40567,
 'almagest': 24521,
 'february': 596,
 'closed': 1631,
 'upn': 25236,
 'purify': 21392,
 'qualifiers': 43499,
 'toot': 45386,
 'collisions': 10573,
 'budding': 24871,
 'dara': 28477,
 'laboratories': 9633,
 'feline': 23737,
 'subtraction': 21236,
 'shalt': 29926,
 'peasant': 8213,
 'seminole': 31444,
 'connectors': 15349,
 'shahada': 49351,
 'redeem': 31434,
 'bangs': 21580,
 'hangzhou': 43764,
 'phonograph': 24872,
 'bookrags': 48817,
 'situs': 48603,
 'infections': 7477,
 'evaluations': 30541,
 'baking': 16914,
 'aylwin': 45950,
 'turner': 8129,
 'omari': 25911,
 'schubert': 25792,
 'planetfall': 42434,
 'bruno': 7401,
 'orcs': 46568,
 'methionine': 34060,
 'rupee': 23120,
 'spikes': 24059,
 'sarcophagi': 44615,
 'colin': 7809,
 'rsync': 49352,
 'advantageous': 15024,
 'fight': 1784,
 'gin': 8390,
 'anthropomorphism': 25769,
 'ige': 33803,
 'landmasses': 41059,
 'crackers': 23337,
 'screens': 9158,
 'ray': 1959,
 'hercules': 8400,
 'gallbladder': 30110,
 'shifter': 43528,
 'suitably': 29025,
 'litt': 25099,
 'insecticide': 28284,
 'antimicrobial': 42435,
 'citizenship': 3639,
 'radek': 39725,
 'andrei': 13746,
 'risen': 11834,
 'libi': 33804,
 'storks': 38209,
 'catholic': 566,
 'ascertaining': 39726,
 'quasars': 33311,
 'possessive': 17889,
 'convention': 1185,
 'intramolecular': 48604,
 'barrows': 47141,
 'grrm': 25100,
 'awakened': 24291,
 'implant': 26624,
 'dendritic': 35198,
 'emotionally': 14471,
 'https': 14249,
 'aeolian': 27047,
 'neil': 5118,
 'stated': 1290,
 'sabbatical': 47142,
 'alces': 34660,
 'obfuscated': 41546,
 'pleads': 27351,
 'rideau': 39342,
 'eea': 22155,
 'belfast': 9142,
 'vandals': 12180,
 'psi': 5402,
 'party': 202,
 'unction': 33805,
 'impossibility': 19294,
 'lob': 37833,
 'esters': 14624,
 'servius': 42436,
 'naples': 7914,
 'proposal': 3697,
 'semi': 1941,
 'erebus': 34061,
 'basque': 3198,
 'gernika': 47143,
 'stranger': 10835,
 'symbolizes': 20920,
 'deceiver': 44015,
 'harlequin': 36432,
 'dtv': 23433,
 'juneau': 35771,
 'bretons': 47144,
 'subtlety': 25237,
 'rajput': 37435,
 'logistic': 19392,
 'provability': 49720,
 'moulton': 45951,
 'toothpaste': 39727,
 'deride': 42302,
 'heresiologists': 43501,
 'kalahari': 21555,
 'tischendorf': 26891,
 'vagina': 15267,
 'calculated': 4900,
 'mappings': 40568,
 'rabban': 38938,
 'shreveport': 35471,
 'tezuka': 26625,
 'galahad': 38571,
 'zn': 36776,
 'topaz': 39802,
 'perish': 22894,
 'thorne': 31893,
 'appendicitis': 40146,
 'battles': 3087,
 'katrina': 13361,
 'equilateral': 36434,
 'male': 645,
 'liberate': 20101,
 'taoist': 21237,
 'pub': 8140,
 'infertile': 32816,
 'schottky': 46889,
 'clytemnestra': 29540,
 'ghosting': 47869,
 'statistical': 3298,
 'genocidal': 46570,
 'voyages': 11816,
 'governorship': 21889,
 'altarpiece': 34949,
 'fittest': 36079,
 'jotham': 46552,
 'permian': 21066,
 'hydroelectricity': 46041,
 'trigonometric': 15111,
 'cielo': 45952,
 'lookahead': 47145,
 'eldest': 6675,
 'costas': 13675,
 'norte': 16315,
 'halas': 23223,
 'herod': 15350,
 'id': 4060,
 'masorti': 22070,
 'frontier': 4538,
 'mammals': 4145,
 'anschluss': 15430,
 'comma': 16869,
 'lambda': 4849,
 'discontinuing': 47870,
 'amphibians': 20102,
 'boopsie': 45048,
 'raison': 30542,
 'shorthair': 39028,
 'brunswick': 5957,
 'vociferous': 41519,
 'satchmo': 42437,
 'nitrile': 38939,
 'wenders': 49767,
 'theremin': 14428,
 'publique': 27049,
 'flat': 2178,
 'rajendra': 45953,
 'recreational': 7175,
 'vite': 48863,
 'undertakings': 31894,
 'filler': 39533,
 'carbines': 22796,
 'wichita': 21867,
 'leaflet': 37101,
 'scoreless': 34661,
 'lengths': 7508,
 'sabo': 30748,
 'cultic': 27958,
 'karzai': 25912,
 'creationists': 16109,
 'offset': 8013,
 'extradition': 20939,
 'wigs': 43539,
 'foes': 20566,
 'bactria': 26626,
 'anatomists': 47503,
 'stiffer': 46571,
 'imprisoned': 5849,
 'reflector': 18439,
 'insure': 26478,
 'scrupulously': 40147,
 'verification': 13298,
 'flanagan': 35772,
 'joaqu': 32131,
 'bettors': 46572,
 'anatta': 45986,
 'grosse': 28633,
 'willed': 30543,
 'methodists': 18854,
 'designations': 8890,
 'grenadines': 21789,
 'synagogues': 13093,
 'wiecino': 33806,
 'catalans': 40148,
 'bedroom': 15125,
 'devised': 6730,
 'improvising': 39728,
 'ventricle': 36080,
 'neo': 3856,
 'elsewhere': 2908,
 'sign': 1442,
 'kzinti': 45273,
 'lamo': 28478,
 'temporary': 3610,
 'flores': 16821,
 'deleuze': 12406,
 'browse': 24832,
 'nationalist': 4038,
 'oxidize': 35773,
 'bahadur': 35922,
 'tragically': 34662,
 'sore': 25681,
 'talbot': 19752,
 'ismail': 14813,
 'intellect': 12311,
 'communally': 39729,
 'democracies': 10492,
 'todd': 8200,
 'nightingale': 35774,
 'larval': 33150,
 'gained': 1511,
 'flamebait': 44086,
 'secretive': 23224,
 'rowena': 47146,
 'eyepiece': 39193,
 'openoffice': 23864,
 'obedient': 34663,
 'vincent': 5722,
 'vegetarianism': 19569,
 'raskin': 29186,
 'riverine': 42486,
 'veterinary': 14068,
 'divers': 13590,
 'autonomously': 45984,
 'aztlan': 48607,
 'martial': 2779,
 'ratify': 16229,
 'earthworm': 23738,
 'antinomies': 48475,
 'hampering': 46573,
 'dignified': 25637,
 'sandwiched': 32008,
 'caesarion': 42003,
 'hawking': 18036,
 'kinetochores': 37102,
 'cultured': 16267,
 'samsung': 35775,
 'mille': 32354,
 'wcl': 20103,
 'aardvarks': 34950,
 'bloch': 14702,
 'keswick': 46574,
 'clarendon': 16230,
 'grigori': 30136,
 'buttocks': 27911,
 'eredivisie': 45954,
 'pryor': 24873,
 'jakarta': 9841,
 'inlets': 34664,
 'teleprinter': 36430,
 'amp': 11884,
 'ksr': 40611,
 'mouthpieces': 38573,
 'pack': 6107,
 'portrait': 3740,
 'defying': 40149,
 'vijayanagara': 44087,
 'untold': 26604,
 'outputting': 48608,
 'paton': 40570,
 'hmac': 30112,
 'colonize': 26253,
 'hendrix': 5976,
 'vernacular': 9849,
 'transient': 15389,
 'walton': 18342,
 'quaestor': 38574,
 'invisibly': 49354,
 'liking': 22156,
 'passau': 49355,
 'botha': 31436,
 'burt': 19885,
 'gillian': 35470,
 'brushing': 40150,
 'slip': 11725,
 'pillars': 11345,
 'coals': 35776,
 'railways': 4696,
 'comedians': 11281,
 'symptomatic': 30544,
 'gygax': 21980,
 'aviv': 13905,
 'baffin': 27352,
 'trisomy': 26342,
 'familia': 14988,
 'epicurus': 18747,
 'montenegro': 6249,
 'basso': 20719,
 'thespis': 44684,
 'fecal': 34360,
 'shrunk': 23064,
 'hatred': 8064,
 'molds': 23434,
 'willful': 38392,
 'supplier': 13435,
 'waterhouse': 43204,
 'skill': 3383,
 'lifeform': 45274,
 'sagrada': 32838,
 'tensions': 6199,
 'stoiber': 23525,
 'cyclone': 18277,
 'intelligible': 14163,
 'purpurea': 37541,
 'hypothyroidism': 29928,
 'aerials': 42004,
 'machen': 46755,
 'cabins': 49417,
 'lifeforms': 28848,
 'demotic': 35472,
 'rdoba': 17930,
 'migrants': 14171,
 'sword': 2970,
 'highland': 7373,
 'deleted': 13259,
 'abeda': 48609,
 'vicarage': 40572,
 'vertov': 15938,
 'cyclical': 27670,
 'brews': 32818,
 'received': 639,
 'ambiguous': 7294,
 'humourous': 35778,
 'knit': 15893,
 'hobson': 46939,
 'adel': 40573,
 'lingua': 13795,
 'relevant': 3888,
 'endures': 37315,
 'hypnotherapy': 28634,
 'rusty': 26892,
 'cuc': 34665,
 'cathedral': 3191,
 'hotline': 45992,
 'thermionic': 36111,
 'cope': 10321,
 'scales': 4796,
 'budo': 33065,
 'pascha': 36082,
 'ardent': 18395,
 'bottoms': 19570,
 'lepidus': 30749,
 'gordie': 42439,
 'twenty': 1674,
 'musik': 21794,
 'heading': 9240,
 'wildlife': 5591,
 'spivak': 42440,
 'csx': 47873,
 'keywords': 20639,
 'consubstantiation': 34801,
 'lazuli': 42005,
 'archaeology': 6431,
 'xinhua': 49358,
 'vegeta': 29739,
 'merriam': 24745,
 'viewers': 6818,
 'branches': 2647,
 'govern': 9302,
 'ats': 30113,
 'philistine': 34802,
 'chapterhouse': 36083,
 'strangled': 26754,
 'patterns': 2507,
 'marshland': 38239,
 'legis': 47874,
 'censored': 18089,
 'oxidized': 17357,
 'rosewood': 46576,
 'parking': 10136,
 'portmanteau': 24746,
 'flamsteed': 19571,
 'chaim': 23534,
 'ness': 9431,
 'sectors': 5120,
 'gladiatorial': 34361,
 'xxxix': 33066,
 'motivate': 25928,
 'isothermal': 43505,
 'mayonnaise': 28849,
 'vector': 2292,
 'baur': 45276,
 'timor': 6868,
 'impeller': 42006,
 'serve': 1486,
 'constantine': 2897,
 'percentage': 3102,
 'multipoint': 42994,
 'atat': 25638,
 'warship': 15990,
 'supposition': 26405,
 'coordinator': 11556,
 'moran': 25516,
 'spacey': 20856,
 'worships': 38024,
 'walter': 2755,
 'fl': 8595,
 'workplace': 14896,
 'innovators': 30701,
 'bos': 21631,
 'godzilla': 4332,
 'jl': 36113,
 'hfs': 32356,
 'nachrichten': 47875,
 'charlie': 5109,
 'manifolds': 11305,
 'canister': 38213,
 'dreamers': 25770,
 'sulaiman': 43506,
 'margulis': 42433,
 'announcers': 33541,
 'katsuhiro': 48611,
 'dramatists': 20992,
 'lounge': 21969,
 'overpowered': 38576,
 'endorsement': 14763,
 'thebaid': 47655,
 'cyborgs': 27050,
 'peripheral': 9056,
 'drinker': 24748,
 'harden': 42007,
 'defraud': 44089,
 'polar': 5365,
 'thousand': 2333,
 'gettysburg': 8685,
 'subdisciplines': 34853,
 'decried': 27368,
 'pundits': 31895,
 'enciphered': 48612,
 'quizzes': 44618,
 'kellerman': 37456,
 'grasped': 32819,
 'coler': 44793,
 'deoxyribonucleic': 47148,
 'spoofs': 43507,
 'renunciation': 21984,
 'acknowledgement': 26916,
 'arsenal': 6373,
 'rattle': 31896,
 'surplus': 8333,
 'fiercely': 18208,
 'mandela': 17794,
 'pads': 13362,
 'hemophilia': 33807,
 'alderney': 29542,
 'delusional': 28635,
 'drill': 14739,
 'groom': 14286,
 'sprout': 39776,
 'doubting': 41061,
 'youssef': 34952,
 'clinical': 4169,
 'refurbished': 30334,
 'unwillingness': 19759,
 'waite': 34666,
 'sociobiology': 27532,
 'photoelectric': 25358,
 'catalysts': 16679,
 'uninhabitable': 29740,
 'habsburgs': 15201,
 'unfold': 33808,
 'strassman': 46577,
 'sza': 34953,
 'indication': 7581,
 'marcius': 43508,
 'mayen': 31897,
 'den': 7591,
 'weston': 25101,
 'actresses': 13227,
 'freeways': 13165,
 'continuous': 2157,
 'layman': 21002,
 'dnow': 29188,
 'nims': 47150,
 'choreography': 25908,
 'erne': 42995,
 'mgm': 14881,
 'fk': 40703,
 'apicomplexa': 41521,
 'fifa': 7815,
 'woodcuts': 34994,
 'vietnam': 2342,
 'junius': 25102,
 'kuan': 15923,
 'augustinians': 47253,
 'korps': 37834,
 'introns': 27671,
 'bamboo': 12844,
 'cod': 16268,
 'chuck': 7693,
 'doj': 31433,
 'colorless': 20105,
 'etiology': 27051,
 'proofing': 41997,
 'democratically': 13612,
 'wallenberg': 43861,
 'free': 247,
 'kodansha': 33542,
 'subclass': 16231,
 'dafoe': 48615,
 'bestows': 47151,
 'music': 166,
 'orientalist': 31204,
 'pectin': 42441,
 'laes': 40621,
 'few': 289,
 'punks': 43509,
 'sikhs': 15390,
 'amnesty': 7609,
 'harmonize': 34455,
 'pledging': 37103,
 'pronouncements': 32357,
 'schuler': 43510,
 'equitable': 17011,
 'pula': 31898,
 'ligature': 23740,
 'xaf': 42442,
 'ind': 37437,
 'principles': 1557,
 'correspondence': 4626,
 'wikibook': 47877,
 'ghostbusters': 40577,
 'malts': 37104,
 'kobayashi': 33303,
 'sundance': 34505,
 'geologists': 14178,
 'uganda': 11065,
 'peloponnesus': 36807,
 'furigana': 21313,
 'proportionally': 30144,
 'punk': 4465,
 'semifinals': 31449,
 'mediator': 15171,
 'rotates': 20240,
 'deva': 31201,
 'pomp': 30336,
 'conductive': 19225,
 'nitro': 30750,
 'sexually': 6790,
 'existentialist': 19886,
 'richly': 22346,
 'pressure': 1025,
 'maneuver': 12647,
 'honeywell': 20640,
 'seneca': 17101,
 'tunisia': 9346,
 'historiographical': 45721,
 'drains': 15815,
 'melisende': 32358,
 'office': 591,
 'risc': 8891,
 'hanns': 38940,
 'archetypes': 27959,
 'generalized': 7161,
 'ris': 31457,
 'julius': 4414,
 'epochs': 24989,
 'anthropomorphic': 14989,
 'brock': 21890,
 'decennial': 49877,
 'tze': 34062,
 'negation': 11707,
 'truly': 4159,
 'exploring': 7928,
 'host': 1560,
 'learned': 2662,
 'transformers': 25103,
 'retaliatory': 40394,
 'consciously': 14122,
 'editor': 2266,
 'maori': 26479,
 'morita': 25238,
 'sda': 46579,
 'merits': 10955,
 'tails': 12592,
 'schindler': 28788,
 'banana': 10958,
 'medes': 20993,
 'eider': 33304,
 'cac': 45277,
 'cured': 13250,
 'mistreatment': 25771,
 'initiate': 11957,
 'magdalen': 27829,
 'anions': 29377,
 'orfeo': 43559,
 'mobility': 9227,
 'wheeler': 14164,
 'aeolus': 20781,
 'fluids': 9785,
 'expired': 13130,
 'fsm': 14263,
 'dorchester': 27825,
 'bartos': 45278,
 'artist': 1377,
 'podkopayeva': 41578,
 'mattel': 16369,
 'diab': 30546,
 'breviary': 34397,
 'suffolk': 10343,
 ...}

In [15]:
dictionary['bbbb']


Out[15]:
30107

In [16]:
print(len(dictionary))
data = list()
unk_count = 0


50000

In [18]:
for word in words:
    if word in dictionary:
        index = dictionary[word]
    else:
        index = 0
        unk_count += 1
    data.append(index)

In [19]:
data


Out[19]:
[5243,
 3083,
 12,
 6,
 195,
 2,
 3134,
 46,
 59,
 156,
 128,
 742,
 477,
 10636,
 134,
 1,
 27494,
 2,
 1,
 103,
 855,
 3,
 1,
 15181,
 0,
 2,
 1,
 151,
 855,
 3581,
 1,
 195,
 11,
 191,
 59,
 5,
 6,
 10740,
 215,
 7,
 1326,
 105,
 455,
 20,
 59,
 2734,
 363,
 7,
 3675,
 1,
 709,
 2,
 372,
 27,
 41,
 37,
 54,
 540,
 98,
 12,
 6,
 1426,
 2760,
 19,
 568,
 687,
 7099,
 1,
 248,
 5243,
 11,
 1053,
 28,
 1,
 321,
 249,
 45850,
 2878,
 793,
 187,
 5243,
 12,
 6,
 201,
 603,
 11,
 1,
 1135,
 20,
 2623,
 26,
 8987,
 3,
 280,
 32,
 4158,
 142,
 60,
 26,
 6445,
 4190,
 2,
 154,
 33,
 363,
 5243,
 37,
 1138,
 7,
 448,
 345,
 1819,
 20,
 4870,
 1,
 6764,
 2,
 7585,
 1775,
 567,
 1,
 94,
 1,
 248,
 11121,
 12,
 52,
 7099,
 90,
 27,
 271,
 38,
 5956,
 4863,
 20493,
 29,
 0,
 42,
 318,
 6,
 25947,
 528,
 7585,
 372,
 5,
 259,
 2,
 154,
 26,
 1207,
 12,
 7585,
 201,
 1578,
 3,
 15320,
 333,
 1775,
 7099,
 4870,
 345,
 765,
 161,
 407,
 5703,
 756,
 2,
 4116,
 1132,
 4338,
 1537,
 3,
 568,
 8132,
 99,
 5243,
 11,
 52,
 1409,
 687,
 19,
 154,
 27,
 11,
 156,
 7099,
 37,
 2035,
 1426,
 8187,
 2,
 154,
 47,
 694,
 7,
 32,
 6,
 4159,
 247,
 372,
 77,
 949,
 79,
 311,
 31,
 4790,
 372,
 508,
 140,
 2315,
 3556,
 365,
 24,
 1823,
 7,
 1906,
 60,
 11,
 37,
 8429,
 79,
 311,
 6,
 247,
 372,
 508,
 32,
 754,
 79,
 1737,
 3,
 8057,
 24323,
 3,
 276,
 1694,
 20,
 152,
 1035,
 96,
 225,
 372,
 18,
 1817,
 25,
 4790,
 1557,
 52,
 8128,
 1468,
 24323,
 3,
 12773,
 5,
 6138,
 20,
 4189,
 21241,
 2432,
 40,
 16626,
 3,
 7245,
 862,
 2,
 1195,
 10210,
 2515,
 29,
 15187,
 188,
 3,
 49,
 1124,
 914,
 7,
 1050,
 470,
 12354,
 7099,
 134,
 1,
 1,
 11121,
 3060,
 3,
 12125,
 735,
 4790,
 6601,
 5,
 14628,
 28,
 451,
 486,
 24323,
 199,
 296,
 949,
 5,
 20133,
 20528,
 2,
 0,
 301,
 7,
 24323,
 20528,
 16615,
 1,
 30323,
 2,
 1,
 94,
 45,
 3836,
 3,
 0,
 3,
 3813,
 1,
 3417,
 2,
 1,
 1782,
 188,
 2,
 1,
 627,
 1,
 13187,
 2,
 4,
 22,
 91,
 123,
 285,
 26,
 244,
 233,
 7,
 32,
 437,
 26684,
 2,
 174,
 5243,
 7735,
 2538,
 5,
 30,
 96,
 2,
 290,
 603,
 4316,
 20,
 1,
 13187,
 16615,
 50,
 188,
 118,
 47,
 360,
 20,
 1,
 385,
 243,
 97,
 32,
 5239,
 35,
 332,
 2728,
 19,
 1,
 839,
 1378,
 28,
 33,
 8782,
 47,
 6481,
 35,
 3246,
 1,
 27494,
 29,
 552,
 42867,
 40,
 31,
 128,
 0,
 436,
 85,
 1,
 65,
 2,
 1,
 103,
 464,
 84,
 3,
 26,
 233,
 19,
 48,
 12,
 26684,
 2,
 174,
 5243,
 5,
 1,
 174,
 840,
 1,
 46,
 7,
 90,
 1,
 195,
 7,
 1016,
 1044,
 43,
 70,
 4863,
 18,
 938,
 15009,
 4579,
 165,
 0,
 5,
 30,
 42066,
 11816,
 22953,
 260,
 1226,
 0,
 0,
 4,
 23,
 8,
 17,
 100,
 36,
 629,
 1,
 2665,
 64,
 372,
 34,
 49,
 76,
 94,
 688,
 14795,
 4051,
 29,
 818,
 918,
 12,
 102,
 5,
 11121,
 2538,
 363,
 6,
 6075,
 3,
 663,
 5,
 1,
 64,
 702,
 436,
 41,
 4219,
 1290,
 20,
 36,
 11,
 31,
 4790,
 3,
 95,
 26,
 50,
 30,
 5224,
 5,
 4,
 23,
 9,
 17,
 5,
 1,
 5360,
 2,
 1,
 151,
 855,
 470,
 12354,
 335,
 31,
 17205,
 2435,
 201,
 1294,
 142,
 12354,
 216,
 38,
 90,
 1,
 248,
 5243,
 56,
 117,
 7099,
 39,
 1207,
 33,
 198,
 12,
 1,
 46,
 182,
 4790,
 530,
 3,
 12354,
 12,
 1,
 1837,
 2,
 1916,
 5243,
 42,
 35,
 33,
 229,
 76,
 4790,
 436,
 734,
 2405,
 3,
 1,
 195,
 0,
 18,
 87,
 1074,
 12,
 31,
 14096,
 30669,
 19,
 1,
 12111,
 0,
 35,
 51,
 2421,
 617,
 5,
 1,
 151,
 855,
 1,
 46,
 568,
 8738,
 4790,
 2938,
 1034,
 14526,
 27,
 11,
 637,
 360,
 20,
 27,
 5392,
 155,
 192,
 2938,
 1034,
 14526,
 335,
 154,
 11,
 918,
 5,
 4,
 13,
 21,
 8,
 20,
 1,
 195,
 4790,
 18,
 990,
 12,
 6,
 568,
 1609,
 27,
 11,
 14,
 33,
 891,
 20,
 48,
 849,
 14526,
 12,
 1,
 1837,
 2,
 174,
 4790,
 209,
 5,
 154,
 11,
 918,
 14526,
 5469,
 24,
 1,
 420,
 12998,
 918,
 11,
 6896,
 5,
 33,
 140,
 36,
 1347,
 1,
 2881,
 2,
 15187,
 918,
 0,
 155,
 100,
 3468,
 39,
 653,
 446,
 7,
 90,
 3,
 3134,
 44,
 918,
 12,
 47,
 3592,
 55,
 12,
 15486,
 1556,
 14,
 2966,
 5,
 45,
 259,
 14526,
 1166,
 154,
 36,
 89,
 3226,
 1132,
 53,
 39,
 752,
 446,
 7,
 90,
 1050,
 523,
 3,
 1794,
 5,
 5819,
 24,
 1557,
 2,
 5132,
 3,
 1294,
 14526,
 15,
 2657,
 2,
 11121,
 34,
 36,
 89,
 29397,
 0,
 968,
 31,
 1335,
 519,
 100,
 1132,
 3,
 309,
 160,
 513,
 1,
 807,
 2,
 44,
 1259,
 219,
 1259,
 1038,
 34,
 1392,
 1,
 909,
 2,
 742,
 65,
 968,
 5,
 443,
 33,
 74,
 2918,
 20,
 76,
 4,
 74,
 2966,
 28,
 1,
 1259,
 2,
 276,
 1556,
 160,
 4330,
 2522,
 499,
 5,
 728,
 10660,
 18938,
 31,
 763,
 247,
 987,
 74,
 32,
 178,
 98,
 7,
 741,
 3273,
 24,
 914,
 7,
 1,
 363,
 2,
 443,
 14526,
 15,
 949,
 40,
 2026,
 223,
 151,
 742,
 477,
 1819,
 3,
 30,
 2987,
 40,
 1022,
 5,
 1,
 855,
 2,
 4,
 13,
 21,
 13,
 5,
 303,
 14526,
 15,
 603,
 2,
 918,
 11,
 648,
 27,
 18,
 348,
 5,
 6,
 113,
 2,
 312,
 80,
 30,
 3574,
 3,
 60,
 26,
 6445,
 4190,
 2,
 48,
 2,
 30,
 949,
 14,
 51,
 2161,
 1590,
 68,
 622,
 2750,
 26441,
 15,
 15765,
 5,
 30,
 1,
 10476,
 3,
 45,
 200,
 26441,
 1461,
 20,
 52,
 637,
 1047,
 345,
 1775,
 134,
 1,
 2582,
 2,
 94,
 918,
 12,
 6,
 245,
 390,
 446,
 5,
 153,
 3,
 1,
 148,
 2582,
 2,
 372,
 40,
 4583,
 21019,
 29,
 9221,
 5,
 1,
 941,
 1975,
 2,
 372,
 20,
 1,
 1132,
 26,
 45,
 1730,
 36,
 5233,
 15765,
 3,
 6,
 144,
 2,
 0,
 5,
 34,
 1132,
 74,
 9758,
 5,
 5839,
 2,
 0,
 67,
 62,
 27,
 18,
 5,
 44,
 568,
 763,
 7,
 176,
 95,
 14,
 119,
 918,
 685,
 1147,
 79,
 124,
 508,
 13107,
 5578,
 311,
 7,
 370,
 7,
 4115,
 1,
 1799,
 7,
 119,
 5470,
 918,
 3,
 154,
 72,
 39,
 5,
 609,
 184,
 20,
 11,
 609,
 200,
 95,
 185,
 12,
 72,
 6196,
 9049,
 12,
 7204,
 72,
 1226,
 1,
 34751,
 2,
 1,
 1799,
 26441,
 364,
 89,
 392,
 31,
 4790,
 36,
 1047,
 67,
 1,
 2760,
 42709,
 2070,
 30,
 949,
 40,
 2026,
 25,
 56,
 0,
 9936,
 7099,
 142,
 4190,
 2,
 30,
 502,
 26,
 3237,
 ...]

In [20]:
unk_count


Out[20]:
418391

In [21]:
count[0][1] = unk_count

In [22]:
count[0]


Out[22]:
['UNK', 418391]

In [23]:
reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

In [24]:
del words

In [26]:
print(count[:15])


[['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764), ('in', 372201), ('a', 325873), ('to', 316376), ('zero', 264975), ('nine', 250430), ('two', 192644), ('is', 183153), ('as', 131815), ('eight', 125285), ('for', 118445)]

In [28]:
print(data[:10], [reverse_dictionary[i] for i in data[:10]])


[5243, 3083, 12, 6, 195, 2, 3134, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']

In [29]:
data_index = 0

In [30]:
batch = np.ndarray(shape=(8), dtype=np.int32)
labels = np.ndarray(shape=(8,1), dtype=np.int32)

In [32]:
span = 2 * 1 + 1
buffer = collections.deque(maxlen=span)
for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)

In [36]:
for i in range(8 // 2):
    target = 1
    targets_to_avoid = [1]
    for j in range(2):
        while target in targets_to_avoid:
            target = random.randint(0, span -1)
        target_to_avoid.append(target)
        batch[i * 2 + j] = buffer[1]
        labels[i * 2 + j, 0] = buffer[target]
    buffer.append(data[data_index])
    data_index = (data_index + len(data) - span) % len(data)

In [37]:
batch


Out[37]:
array([3083, 3083,   12,   12,    6,    6, 5243, 5243], dtype=int32)

In [38]:
labels


Out[38]:
array([[  12],
       [  12],
       [3083],
       [3083],
       [5243],
       [5243],
       [   6],
       [   6]], dtype=int32)

In [39]:
data_index


Out[39]:
17005198

In [40]:
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])


3083 originated -> 12 as
3083 originated -> 12 as
12 as -> 3083 originated
12 as -> 3083 originated
6 a -> 5243 anarchism
6 a -> 5243 anarchism
5243 anarchism -> 6 a
5243 anarchism -> 6 a

In [64]:
batch_size = 128
embedding_size = 128
skip_window = 1
num_skips = 2

valid_size = 16
valid_window = 100
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64

In [42]:
valid_examples


Out[42]:
array([37, 88, 29, 72, 52, 54, 21, 73, 74,  5, 19, 69, 53, 36, 25, 18])

In [57]:
vocabulary_size = 50000
graph = tf.Graph()
with graph.as_default():
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    with tf.device('/cpu:0'):
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
    
        nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
        
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                biases=nce_biases,
                labels=train_labels,
                inputs=embed,
                num_sampled=num_sampled,
                num_classes=vocabulary_size))
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)

    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
    init = tf.global_variables_initializer()

In [52]:
print(embeddings)


Tensor("Variable/read:0", shape=(50000, 128), dtype=float32, device=/device:CPU:0)

In [59]:
def generate_batch(batch_size, num_skips, skip_window):
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)
  for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  for i in range(batch_size // num_skips):
    target = skip_window  # target label at the center of the buffer
    targets_to_avoid = [skip_window]
    for j in range(num_skips):
      while target in targets_to_avoid:
        target = random.randint(0, span - 1)
      targets_to_avoid.append(target)
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[target]
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  # Backtrack a little bit to avoid skipping words in the end of a batch
  data_index = (data_index + len(data) - span) % len(data)
  return batch, labels

In [60]:
num_steps = 100001

In [94]:
with tf.Session(graph=graph) as session:
    init.run()
    print("init..")
    
    average_loss = 0
    
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs:batch_inputs, train_labels:batch_labels}
        
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
        
        if step%2000 == 0:
            if step > 0:
                average_loss /= 2000
            print("Average loss at step", step, ":", average_loss)
        
            average_loss = 0
        
        if step%10000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8
                nearest = (-sim[i,:]).argsort()[1:top_k + 1]
                # print(nearest)
                log_str = "Nearest to %s:" %valid_word
                for k in xrange(top_k):
                    # print(k)
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)
        
    final_embeddings = normalized_embeddings.eval()


init..
Average loss at step 0 : 255.736083984
Nearest to it: materiel, there, larry, boundaries, methylene, aerobatics, macrovision, submarines,
Nearest to s: dietary, besieged, usually, autistics, welfare, malvaceae, autocode, guevara,
Nearest to new: geocentric, parametric, confirm, scavenging, infected, impenetrable, embalmed, depriving,
Nearest to six: science, siliceous, strenuous, flunitrazepam, screenplay, ym, opec, syringe,
Nearest to time: expecting, methodism, mesolithic, thorns, manzikert, tess, trunks, rectum,
Nearest to people: mango, attract, clothes, secessionist, superintendence, profiles, flawless, acquis,
Nearest to there: kyanite, strawberry, mak, stanzas, tsvetaeva, stockpile, inheriting, macross,
Nearest to three: exporter, meteors, management, jakob, eclecticism, genitive, nephites, endangering,
Nearest to united: outstripped, clinging, yvonne, joked, connolly, ineffectiveness, bytes, nazareth,
Nearest to over: southeast, verdeans, debaters, spate, lair, solstice, minimi, sloti,
Nearest to has: coarse, electing, instruct, jaffe, vertical, libration, discontinuities, canvas,
Nearest to also: lager, immaculate, anemometers, arctos, crisp, anteaters, belgium, blackwell,
Nearest to five: utr, tachyon, corroborating, sultanates, roskilde, disillusionment, ingredient, built,
Nearest to in: dns, dreamer, conjunctions, jeremiah, whims, consignee, methanol, educating,
Nearest to that: doses, mathematische, main, prepositional, gash, ryaku, alcoholic, fleischmann,
Nearest to had: cke, tte, accessing, coherence, submission, peacocks, war, uk,
Average loss at step 2000 : 113.675760889
Average loss at step 4000 : 53.0733066626
Average loss at step 6000 : 32.8648398995
Average loss at step 8000 : 23.5830843294
Average loss at step 10000 : 17.5470844959
Nearest to it: there, richer, kiang, control, it, patriarch, each, event,
Nearest to s: dietary, usually, vessels, welfare, deviation, besieged, om, britain,
Nearest to new: and, UNK, the, is, reconcile, parents, two, thousands,
Nearest to six: science, UNK, adopted, deluded, ten, ell, widehat, weakness,
Nearest to time: lithium, expecting, methodism, UNK, humor, sometimes, band, diver,
Nearest to people: wrote, psychology, attack, UNK, luncheon, attract, yamato, sided,
Nearest to there: two, zero, arch, kiang, widehat, seven, one, UNK,
Nearest to three: management, sacrament, clockwork, pair, spins, echidna, mob, patience,
Nearest to united: woke, kiang, eph, husband, spectroscopy, voiceless, rain, pigeon,
Nearest to over: and, of, from, on, for, to, UNK, by,
Nearest to has: UNK, in, and, as, was, with, supported, ferus,
Nearest to also: flagella, requests, can, blackwell, UNK, belgium, exile, hermann,
Nearest to five: nine, disillusionment, UNK, utr, iteration, may, hilbert, built,
Nearest to in: it, dns, readers, judaism, zero, so, jeremiah, and,
Nearest to that: in, main, UNK, waterford, one, and, of, doses,
Nearest to had: is, by, supports, in, dec, has, so, and,
Average loss at step 12000 : 13.7723090439
Average loss at step 14000 : 11.5381830882
Average loss at step 16000 : 9.67109122252
Average loss at step 18000 : 8.84459714115
Average loss at step 20000 : 8.18229899728
Nearest to it: there, richer, it, taunus, that, which, still, patriarch,
Nearest to s: dietary, usually, vessels, om, welfare, should, deviation, sui,
Nearest to new: and, two, brie, zero, three, eight, is, taunus,
Nearest to six: science, strenuous, lindsay, litre, deluded, ten, drag, weakness,
Nearest to time: lithium, ovary, expecting, coolidge, methodism, diver, part, humor,
Nearest to people: is, wrote, marino, amine, had, psychology, by, galesburg,
Nearest to there: zero, seven, two, nine, eight, one, kiang, six,
Nearest to three: management, galesburg, sacrament, conjunctions, above, narrators, exporter, clockwork,
Nearest to united: to, bluffs, woke, eph, can, is, edged, will,
Nearest to over: and, of, from, at, for, on, by, kiang,
Nearest to has: and, was, in, as, for, amine, with, from,
Nearest to also: can, flagella, craft, blackwell, requests, neo, to, nt,
Nearest to five: may, to, iteration, disillusionment, nine, utr, cauchy, diocese,
Nearest to in: it, dns, xs, she, who, and, readers, so,
Nearest to that: in, main, benzyl, for, with, and, hak, waterford,
Nearest to had: is, by, has, and, were, dec, taunus, are,
Average loss at step 22000 : 7.31877236044
Average loss at step 24000 : 6.7772724818
Average loss at step 26000 : 7.22241799635
Average loss at step 28000 : 6.13721638286
Average loss at step 30000 : 6.07491606796
Nearest to it: there, richer, it, which, that, still, taunus, interrupt,
Nearest to s: dietary, should, vessels, usually, inertia, welfare, om, deviation,
Nearest to new: and, barb, tetra, seven, three, brie, six, eight,
Nearest to six: science, strenuous, modula, they, deluded, four, flunitrazepam, lindsay,
Nearest to time: amd, coolidge, lithium, ovary, methodism, diver, expecting, auschwitz,
Nearest to people: had, be, was, by, marino, were, wrote, amine,
Nearest to there: seven, eight, six, two, three, five, one, nine,
Nearest to three: d, galesburg, management, narrators, koontz, ffts, conjunctions, above,
Nearest to united: will, can, to, may, eph, bluffs, woke, rain,
Nearest to over: and, on, at, from, for, of, with, nine,
Nearest to has: was, in, with, and, as, for, bytecode, from,
Nearest to also: can, lager, to, flagella, nt, would, requests, craft,
Nearest to five: may, should, would, will, to, could, iteration, disillusionment,
Nearest to in: it, who, she, they, dns, xs, and, so,
Nearest to that: in, main, and, benzyl, for, waterford, bytecode, hak,
Nearest to had: is, were, by, has, sparc, frites, be, sibilant,
Average loss at step 32000 : 5.8268452996
Average loss at step 34000 : 5.61028611898
Average loss at step 36000 : 5.45080244505
Average loss at step 38000 : 5.56756269145
Average loss at step 40000 : 5.57747373378
Nearest to it: there, richer, which, still, it, that, not, taunus,
Nearest to s: dietary, when, and, should, where, vessels, om, inertia,
Nearest to new: and, barb, tetra, gru, glas, stadtbahn, eight, seven,
Nearest to six: they, strenuous, science, gino, we, modula, deluded, lindsay,
Nearest to time: coolidge, ovary, diver, lithium, glas, methodism, amd, gru,
Nearest to people: be, was, had, by, were, galesburg, amine, marino,
Nearest to there: six, five, eight, seven, three, zero, two, nine,
Nearest to three: d, management, narrators, mm, above, sacrament, celestial, galesburg,
Nearest to united: can, will, may, to, should, steen, must, rain,
Nearest to over: and, at, from, on, with, of, for, taunus,
Nearest to has: was, in, with, for, from, be, bytecode, and,
Nearest to also: can, would, will, should, lager, arctos, flagella, nt,
Nearest to five: may, would, should, will, could, must, to, might,
Nearest to in: it, she, who, they, dns, xs, and, toppled,
Nearest to that: in, main, gru, benzyl, and, from, hak, waterford,
Nearest to had: is, were, has, by, had, proleptic, sparc, be,
Average loss at step 42000 : 5.45841832423
Average loss at step 44000 : 5.33954357231
Average loss at step 46000 : 5.28882217085
Average loss at step 48000 : 5.19656235886
Average loss at step 50000 : 5.02684125626
Nearest to it: which, there, richer, that, still, it, sometimes, taunus,
Nearest to s: dietary, when, where, should, vessels, inertia, om, gordie,
Nearest to new: and, barb, tetra, gru, glas, stadtbahn, brie, UNK,
Nearest to six: they, strenuous, we, science, siliceous, gino, she, ii,
Nearest to time: coolidge, more, ovary, diver, auschwitz, glas, gru, methodism,
Nearest to people: was, be, had, were, by, marino, become, galesburg,
Nearest to there: six, five, seven, three, eight, two, zero, nine,
Nearest to three: d, UNK, narrators, eight, mm, management, celestial, four,
Nearest to united: can, will, may, to, should, must, steen, cannot,
Nearest to over: on, at, from, and, during, nine, with, of,
Nearest to has: was, bytecode, with, for, were, be, from, amine,
Nearest to also: can, would, will, should, could, lager, nt, arctos,
Nearest to five: may, would, will, should, could, must, to, might,
Nearest to in: it, she, they, who, zero, dns, there, xs,
Nearest to that: in, gru, main, benzyl, with, at, into, hak,
Nearest to had: is, were, has, had, by, proleptic, when, became,
Average loss at step 52000 : 5.11451469243
Average loss at step 54000 : 5.0601797725
Average loss at step 56000 : 4.99067440438
Average loss at step 58000 : 5.02971422517
Average loss at step 60000 : 5.02644173288
Nearest to it: which, there, still, richer, that, sometimes, not, it,
Nearest to s: when, dietary, landesverband, where, should, inertia, hyi, vessels,
Nearest to new: and, hyi, landesverband, barb, gru, tetra, stadtbahn, four,
Nearest to six: strenuous, ii, they, we, gino, siliceous, UNK, science,
Nearest to time: more, coolidge, diver, many, ass, ovary, landesverband, auschwitz,
Nearest to people: be, had, was, were, by, become, never, also,
Nearest to there: five, six, three, seven, eight, nine, two, zero,
Nearest to three: d, UNK, four, mm, narrators, directness, eight, two,
Nearest to united: can, will, may, should, must, to, cannot, could,
Nearest to over: during, at, landesverband, from, and, on, stadtbahn, hyi,
Nearest to has: was, in, with, be, landesverband, bytecode, amine, from,
Nearest to also: can, would, will, should, could, cannot, lager, nt,
Nearest to five: may, will, would, should, could, must, might, cannot,
Nearest to in: it, she, they, who, there, zero, bloody, landesverband,
Nearest to that: in, gru, benzyl, landesverband, at, main, upon, bytecode,
Nearest to had: is, has, were, had, by, be, landesverband, proleptic,
Average loss at step 62000 : 4.91109006059
Average loss at step 64000 : 4.94881534314
Average loss at step 66000 : 4.87608850288
Average loss at step 68000 : 4.89872267771
Average loss at step 70000 : 4.62613409173
Nearest to it: which, still, there, richer, sometimes, that, not, it,
Nearest to s: when, where, dietary, landesverband, should, though, inertia, hyi,
Nearest to new: and, landesverband, hyi, gru, four, than, barb, stadtbahn,
Nearest to six: they, we, strenuous, ii, gino, she, siliceous, he,
Nearest to time: more, many, coolidge, ass, ovary, diver, yamato, landesverband,
Nearest to people: be, was, become, were, had, by, galesburg, never,
Nearest to there: five, six, three, seven, eight, two, nine, landesverband,
Nearest to three: d, four, UNK, diluted, narrators, three, two, landsmannschaft,
Nearest to united: can, will, may, should, must, could, cannot, to,
Nearest to over: on, during, at, from, landesverband, and, taunus, with,
Nearest to has: was, be, bytecode, landesverband, in, were, amine, for,
Nearest to also: can, would, will, should, could, cannot, must, nt,
Nearest to five: may, will, would, should, could, must, might, cannot,
Nearest to in: it, she, they, who, there, zero, landesverband, dns,
Nearest to that: in, gru, main, landesverband, benzyl, at, and, bytecode,
Nearest to had: is, has, were, had, by, became, proleptic, be,
Average loss at step 72000 : 4.75129728556
Average loss at step 74000 : 4.84498063374
Average loss at step 76000 : 4.77063242346
Average loss at step 78000 : 4.7683170042
Average loss at step 80000 : 4.67875431442
Nearest to it: which, still, sometimes, richer, who, generally, there, that,
Nearest to s: when, where, landesverband, dietary, though, hyi, before, inertia,
Nearest to new: and, landesverband, hyi, gru, barb, than, stadtbahn, but,
Nearest to six: we, they, strenuous, ii, she, gino, deluded, t,
Nearest to time: more, many, coolidge, ass, diver, ovary, ecommerce, zut,
Nearest to people: be, become, were, was, had, by, inquest, galesburg,
Nearest to there: five, six, seven, three, eight, two, zero, nine,
Nearest to three: d, UNK, four, diluted, three, narrators, m, riso,
Nearest to united: will, can, may, should, must, could, might, cannot,
Nearest to over: during, at, landesverband, on, nine, hyi, from, stadtbahn,
Nearest to has: was, be, with, as, in, bytecode, landesverband, were,
Nearest to also: can, would, will, could, should, cannot, must, might,
Nearest to five: may, will, would, should, could, must, might, cannot,
Nearest to in: it, she, they, who, there, seven, zero, dns,
Nearest to that: in, main, gru, upon, landesverband, benzyl, into, through,
Nearest to had: is, were, has, had, became, proleptic, by, be,
Average loss at step 82000 : 4.69178952706
Average loss at step 84000 : 4.48726032615
Average loss at step 86000 : 4.4134451797
Average loss at step 88000 : 4.48546737504
Average loss at step 90000 : 4.51784078217
Nearest to it: which, still, sometimes, richer, who, it, generally, often,
Nearest to s: when, where, landesverband, dietary, though, before, should, but,
Nearest to new: and, landesverband, hyi, barb, gru, than, but, stadtbahn,
Nearest to six: we, they, ii, strenuous, t, you, she, icj,
Nearest to time: more, many, coolidge, ass, diver, ovary, highly, zut,
Nearest to people: be, become, was, had, were, by, inquest, galesburg,
Nearest to there: five, six, seven, three, eight, zero, nine, two,
Nearest to three: d, clockwork, celestial, closing, fenrir, canadian, french, ffts,
Nearest to united: will, can, may, should, must, could, might, cannot,
Nearest to over: during, at, from, landesverband, of, within, on, hyi,
Nearest to has: bytecode, be, was, landesverband, were, for, with, is,
Nearest to also: can, would, will, could, should, cannot, must, might,
Nearest to five: will, would, may, should, could, must, might, cannot,
Nearest to in: it, she, they, who, there, but, landesverband, and,
Nearest to that: in, gru, main, upon, two, at, benzyl, landesverband,
Nearest to had: is, had, has, were, became, when, be, landesverband,
Average loss at step 92000 : 4.58907672191
Average loss at step 94000 : 4.60324448931
Average loss at step 96000 : 4.62912453699
Average loss at step 98000 : 4.52983195758
Average loss at step 100000 : 4.70787091136
Nearest to it: which, still, sometimes, often, richer, now, generally, then,
Nearest to s: when, where, landesverband, dietary, should, though, before, hyi,
Nearest to new: and, landesverband, hyi, barb, gru, than, stadtbahn, taunus,
Nearest to six: we, they, ii, strenuous, you, t, she, deluded,
Nearest to time: more, many, coolidge, some, ass, highly, ovary, diver,
Nearest to people: be, become, was, had, were, by, inquest, galesburg,
Nearest to there: five, three, six, seven, eight, two, landesverband, zero,
Nearest to three: d, nine, fenrir, clockwork, ffts, celestial, riso, closing,
Nearest to united: will, can, may, should, could, must, might, cannot,
Nearest to over: during, at, landesverband, from, on, hyi, within, stadtbahn,
Nearest to has: was, bytecode, with, be, for, landesverband, in, were,
Nearest to also: can, would, could, will, should, cannot, must, might,
Nearest to five: will, would, could, may, should, must, might, cannot,
Nearest to in: she, it, they, who, there, but, zero, zut,
Nearest to that: in, upon, gru, through, two, benzyl, main, at,
Nearest to had: is, has, had, were, became, proleptic, by, been,

In [80]:
sim[0,:].argsort()[1:9]


Out[80]:
array([ 0.14506491, -0.12284749,  0.00486149, ...,  0.13809943,
        0.2045745 , -0.09087169], dtype=float32)

In [92]:
with tf.Session(graph=graph) as session:
  # We must initialize all variables before we use them.
  init.run()
  print("Initialized")

  average_loss = 0
  for step in xrange(num_steps):
    batch_inputs, batch_labels = generate_batch(
        batch_size, num_skips, skip_window)
    feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

    # We perform one update step by evaluating the optimizer op (including it
    # in the list of returned values for session.run()
    _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += loss_val

    if step % 2000 == 0:
      if step > 0:
        average_loss /= 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print("Average loss at step ", step, ": ", average_loss)
      average_loss = 0

    # Note that this is expensive (~20% slowdown if computed every 500 steps)
    if step % 10000 == 0:
      sim = similarity.eval()
      for i in xrange(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8  # number of nearest neighbors
        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
        print(nearest)
        print(-sim[i, :].argsort()[1:8])
        log_str = "Nearest to %s:" % valid_word
        for k in xrange(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log_str = "%s %s," % (log_str, close_word)
        print(log_str)
  final_embeddings = normalized_embeddings.eval()


Initialized
Average loss at step  0 :  267.653686523
[23017 37668 20779 40596 49225 22181 15769 43092]
[-19431  -6871 -35573 -11581 -26246   -158  -5486]
Nearest to it: menelik, unscathed, hannover, inositol, mutinies, jays, courtyard, abol,
[39444 14898 15037   253 25775  4823 15385 37577]
[ -3372 -45240 -46665   -316 -45683 -42560 -17680]
Nearest to s: bst, undertaking, essenes, given, antiparticle, kick, heather, amulets,
[ 5903 35781  2644 25624 36358 14144 41578 11243]
[-44743  -3361  -5499 -17097 -36631 -43582 -33402]
Nearest to new: merchants, leucine, selling, pai, meander, pricing, podkopayeva, lone,
[49260 32636 37792 18909 42806 29688 15682 41013]
[-35746 -45707 -28887 -27463  -5251 -27165 -23507]
Nearest to six: cinq, bulky, magnetopause, clare, somber, andalus, seleucid, grappelli,
[20030 14446 27308 32091 22815  5447  6657  9685]
[-40277  -3259 -27551 -10944  -7551 -24546 -39634]
Nearest to time: dryden, immersion, outwardly, philippi, barth, viruses, rotating, proofs,
[31109  2635 26359 20538  9847 10636 26306 49823]
[-47708 -48278  -3449  -3204 -23017  -5894 -28781]
Nearest to people: squeezed, harry, antiparticles, stigma, catcher, radicals, chamorro, dwan,
[13459 29231 39063 33468 48458  1600  9191 38290]
[-26961 -49831 -42283 -32985 -40115 -40326 -18662]
Nearest to there: secretariat, overfishing, coens, inappropriately, isospin, winning, defects, suvs,
[15558  3390 18952 42936  3347  4767 25593 20553]
[-45016 -45842 -19470 -16778 -11527 -40040 -49831]
Nearest to three: radcliffe, fighter, pohl, timeout, seconds, convinced, kalmar, tunings,
[20169 24911 26849 17138 28076 20595 14942  9926]
[ -5543 -30130 -27931 -43727 -39159  -7927 -48209]
Nearest to united: yamato, hamburger, carnatic, explanatory, wrongful, encyclopedic, airliner, complaint,
[31641 13614  5062  6990  6060 31938 41788 24189]
[-47953 -47313  -1549 -33828 -12352  -4629 -16517]
Nearest to over: glycol, rupert, latitude, analytic, senses, renting, istv, singularities,
[ 3005 48251 35086  8387   792 29220 30588 35183]
[ -1275 -27658 -43439 -26604 -43495 -23704 -17998]
Nearest to has: execution, newsreels, flirting, precious, peace, neanderthal, tantra, tov,
[29086 44422  2121 26496 39289 18254 32625 14867]
[-41716 -13622 -49166 -13495 -24972 -21518 -32721]
Nearest to also: obsidian, fiesole, cable, guessing, unconquered, revolts, mathematik, nursing,
[14473 15672 17536 45503  9325 44134 44026  3050]
[ -5886 -10869 -43279 -17069  -7071 -34870   -104]
Nearest to five: framed, minerva, detectable, dever, centred, pwnage, fairway, residence,
[42795 18458 28605 25338  7191  7133  2862 39895]
[  -983 -11790 -11894 -34411 -31675  -8378 -21107]
Nearest to in: auditioning, dea, unwieldy, cartilage, cartoonist, junction, spaces, preis,
[10601 47267 20049 13581  5912 27067 33187  7477]
[-37048 -16705 -22946 -12769 -40029 -23071 -26514]
Nearest to that: sanctioned, occidentalis, transsexual, navarre, reduces, alexandrine, lakers, infections,
[ 6826  5386 48246 30123  5310 10635 49416 39416]
[ -7026 -20849 -36185   -969 -28863 -43325   -565]
Nearest to had: bowling, commitment, weaning, stately, throw, fermi, eusebio, pentecostalism,
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-92-deb0fe55db90> in <module>()
     12     # We perform one update step by evaluating the optimizer op (including it
     13     # in the list of returned values for session.run()
---> 14     _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
     15     average_loss += loss_val
     16 

/usr/local/lib/python3.5/site-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
    764     try:
    765       result = self._run(None, fetches, feed_dict, options_ptr,
--> 766                          run_metadata_ptr)
    767       if run_metadata:
    768         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/usr/local/lib/python3.5/site-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
    962     if final_fetches or final_targets:
    963       results = self._do_run(handle, final_targets, final_fetches,
--> 964                              feed_dict_string, options, run_metadata)
    965     else:
    966       results = []

/usr/local/lib/python3.5/site-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1012     if handle is None:
   1013       return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
-> 1014                            target_list, options, run_metadata)
   1015     else:
   1016       return self._do_call(_prun_fn, self._session, handle, feed_dict,

/usr/local/lib/python3.5/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1019   def _do_call(self, fn, *args):
   1020     try:
-> 1021       return fn(*args)
   1022     except errors.OpError as e:
   1023       message = compat.as_text(e.message)

/usr/local/lib/python3.5/site-packages/tensorflow/python/client/session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
   1001         return tf_session.TF_Run(session, options,
   1002                                  feed_dict, fetch_list, target_list,
-> 1003                                  status, run_metadata)
   1004 
   1005     def _prun_fn(session, handle, feed_dict, fetch_list):

KeyboardInterrupt: 

In [99]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
plt.figure(figsize=(18,18))


Out[99]:
<matplotlib.figure.Figure at 0x15181bc18>

In [106]:
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
plot_only = 500
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only,:])
labels = [reverse_dictionary[i] for i in xrange(plot_only)]
for i, label in enumerate(labels):
    x, y = low_dim_embs[i, :]
    plt.scatter(x, y)
    plt.annotate(label, xy=(x,y), xytext=(5,2), textcoords='offset points', ha='right', va='bottom')

In [107]:
plt.show()



In [108]:
type(final_embeddings)


Out[108]:
numpy.ndarray

In [110]:
final_embeddings[:50, :]


Out[110]:
array([[ 0.02535484, -0.21756044, -0.04685999, ..., -0.04695626,
         0.13432126, -0.21053489],
       [-0.04397225, -0.07383451, -0.04950885, ...,  0.04359524,
         0.05993943, -0.0320558 ],
       [ 0.06078843, -0.08979254, -0.02595252, ...,  0.08226719,
         0.18126218, -0.05112049],
       ..., 
       [-0.03486119, -0.20499289, -0.08066029, ...,  0.12775902,
        -0.14620858, -0.06338742],
       [ 0.05147786,  0.0296588 , -0.13042139, ..., -0.00699105,
        -0.01828911, -0.06534279],
       [-0.0376003 ,  0.02312498,  0.02433509, ...,  0.15502302,
        -0.08109225,  0.01435901]], dtype=float32)

In [112]:
low_dim_embs.shape


Out[112]:
(500, 2)

In [113]:
labels


Out[113]:
['UNK',
 'the',
 'of',
 'and',
 'one',
 'in',
 'a',
 'to',
 'zero',
 'nine',
 'two',
 'is',
 'as',
 'eight',
 'for',
 's',
 'five',
 'three',
 'was',
 'by',
 'that',
 'four',
 'six',
 'seven',
 'with',
 'on',
 'are',
 'it',
 'from',
 'or',
 'his',
 'an',
 'be',
 'this',
 'which',
 'at',
 'he',
 'also',
 'not',
 'have',
 'were',
 'has',
 'but',
 'other',
 'their',
 'its',
 'first',
 'they',
 'some',
 'had',
 'all',
 'more',
 'most',
 'can',
 'been',
 'such',
 'many',
 'who',
 'new',
 'used',
 'there',
 'after',
 'when',
 'into',
 'american',
 'time',
 'these',
 'only',
 'see',
 'may',
 'than',
 'world',
 'i',
 'b',
 'would',
 'd',
 'no',
 'however',
 'between',
 'about',
 'over',
 'years',
 'states',
 'people',
 'war',
 'during',
 'united',
 'known',
 'if',
 'called',
 'use',
 'th',
 'system',
 'often',
 'state',
 'so',
 'history',
 'will',
 'up',
 'while',
 'where',
 'city',
 'being',
 'english',
 'then',
 'any',
 'both',
 'under',
 'out',
 'made',
 'well',
 'her',
 'e',
 'number',
 'government',
 'them',
 'm',
 'later',
 'since',
 'him',
 'part',
 'name',
 'c',
 'century',
 'through',
 'because',
 'x',
 'university',
 'early',
 'life',
 'british',
 'year',
 'like',
 'same',
 'including',
 'became',
 'example',
 'day',
 'each',
 'even',
 'work',
 'language',
 'although',
 'several',
 'form',
 'john',
 'u',
 'national',
 'very',
 'much',
 'g',
 'french',
 'before',
 'general',
 'what',
 't',
 'against',
 'n',
 'high',
 'links',
 'could',
 'based',
 'those',
 'now',
 'second',
 'de',
 'music',
 'another',
 'large',
 'she',
 'f',
 'external',
 'german',
 'different',
 'modern',
 'great',
 'do',
 'common',
 'set',
 'list',
 'south',
 'series',
 'major',
 'game',
 'power',
 'long',
 'country',
 'king',
 'law',
 'group',
 'film',
 'still',
 'until',
 'north',
 'international',
 'term',
 'we',
 'end',
 'book',
 'found',
 'own',
 'political',
 'party',
 'order',
 'usually',
 'president',
 'church',
 'you',
 'death',
 'theory',
 'area',
 'around',
 'include',
 'god',
 'ii',
 'way',
 'did',
 'military',
 'population',
 'using',
 'though',
 'small',
 'following',
 'within',
 'non',
 'human',
 'left',
 'main',
 'among',
 'point',
 'r',
 'due',
 'p',
 'considered',
 'public',
 'popular',
 'computer',
 'west',
 'family',
 'east',
 'information',
 'important',
 'european',
 'man',
 'sometimes',
 'right',
 'old',
 'free',
 'word',
 'without',
 'last',
 'us',
 'members',
 'given',
 'times',
 'roman',
 'make',
 'h',
 'age',
 'place',
 'l',
 'thus',
 'science',
 'case',
 'become',
 'systems',
 'union',
 'born',
 'york',
 'line',
 'countries',
 'does',
 'isbn',
 'st',
 'control',
 'various',
 'others',
 'house',
 'article',
 'island',
 'should',
 'led',
 'back',
 'period',
 'player',
 'europe',
 'languages',
 'central',
 'water',
 'few',
 'western',
 'home',
 'began',
 'generally',
 'less',
 'k',
 'similar',
 'written',
 'original',
 'best',
 'must',
 'according',
 'school',
 'france',
 'air',
 'single',
 'force',
 'v',
 'land',
 'groups',
 'down',
 'how',
 'works',
 'development',
 'official',
 'support',
 'england',
 'j',
 'rather',
 'space',
 'data',
 'greek',
 'km',
 'named',
 'germany',
 'just',
 'games',
 'said',
 'version',
 'late',
 'earth',
 'company',
 'every',
 'economic',
 'short',
 'published',
 'black',
 'army',
 'off',
 'london',
 'million',
 'body',
 'field',
 'christian',
 'either',
 'social',
 'empire',
 'o',
 'developed',
 'standard',
 'court',
 'service',
 'kingdom',
 'along',
 'college',
 'republic',
 'sea',
 'america',
 'today',
 'result',
 'held',
 'team',
 'light',
 'means',
 'never',
 'especially',
 'third',
 'further',
 'character',
 'forces',
 'take',
 'men',
 'society',
 'show',
 'open',
 'possible',
 'fact',
 'battle',
 'took',
 'former',
 'books',
 'soviet',
 'river',
 'children',
 'having',
 'good',
 'local',
 'son',
 'current',
 'process',
 'natural',
 'present',
 'himself',
 'islands',
 'total',
 'near',
 'white',
 'days',
 'person',
 'itself',
 'seen',
 'culture',
 'little',
 'above',
 'software',
 'largest',
 'words',
 'upon',
 'level',
 'father',
 'side',
 'created',
 'red',
 'references',
 'press',
 'full',
 'region',
 'almost',
 'image',
 'al',
 'famous',
 'play',
 'came',
 'role',
 'once',
 'certain',
 'league',
 'jewish',
 'james',
 'january',
 'site',
 'again',
 'art',
 'numbers',
 'member',
 'areas',
 'movement',
 'religious',
 'type',
 'march',
 'community',
 'story',
 'played',
 'production',
 'released',
 'center',
 'rights',
 'real',
 'related',
 'foreign',
 'low',
 'ancient',
 'terms',
 'view',
 'source',
 'act',
 'minister',
 'change',
 'energy',
 'produced',
 'research',
 'actor',
 'making',
 'december',
 'civil',
 'women',
 'special',
 'style',
 'japanese',
 'design',
 'william',
 'available',
 'chinese',
 'forms',
 'canada',
 'northern',
 'died',
 'class',
 'living',
 'next',
 'particular',
 'program',
 'council',
 'television',
 'head',
 'david',
 'china',
 'middle',
 'established',
 'hand',
 'bc',
 'far',
 'july',
 'function',
 'position',
 'y',
 'built',
 'george',
 'band',
 'together']

In [115]:
low_dim_embs[3,:]


Out[115]:
array([ 18.15663014, -13.72264153])

In [116]:
valid_examples


Out[116]:
array([27, 15, 58, 22, 65, 83, 60, 17, 86, 80, 41, 37, 16,  5, 20, 49])

In [118]:
valid_word_t = reverse_dictionary[27]

In [119]:
valid_word_t


Out[119]:
'it'

In [ ]: