In [1]:
import zipfile
import collections
import math
import random
import os
In [3]:
import numpy as np
from six.moves import urllib
from six.moves import xrange
import tensorflow as tf
In [7]:
with zipfile.ZipFile('text8.zip') as f:
data = tf.compat.as_str(f.read(f.namelist()[0])).split()
print(f.namelist())
['text8']
In [5]:
type(data)
Out[5]:
list
In [6]:
data[0]
Out[6]:
'anarchism'
In [8]:
words = data
print('Data size', len(words))
Data size 17005207
In [9]:
count = [['UNK', -1]]
count.extend(collections.Counter(words).most_common(50000 - 1))
In [10]:
collections.Counter(words)
Out[10]:
Counter({'receipe': 1,
'bbbb': 21,
'tyrannos': 1,
'taufb': 1,
'weizenbock': 2,
'plucked': 39,
'cioppo': 1,
'paracelsian': 1,
'seventh': 440,
'haven': 203,
'garc': 66,
'aterballeto': 1,
'sikkerhetstjeneste': 1,
'yereko': 1,
'archeologia': 1,
'poseidon': 84,
'socks': 56,
'addo': 2,
'metrodome': 14,
'dellingr': 1,
'shahanshah': 14,
'eldey': 1,
'bhaktamara': 1,
'gunsight': 1,
'stuttered': 2,
'two': 192644,
'shirra': 1,
'civile': 4,
'boros': 2,
'siyyid': 1,
'ventadorn': 1,
'obsidio': 1,
'rickard': 3,
'detachable': 10,
'midfootprint': 1,
'contrapunctus': 2,
'fraktur': 3,
'deuet': 1,
'burkes': 2,
'prompting': 86,
'jaff': 1,
'ebbing': 3,
'richie': 36,
'beachey': 1,
'sselsprung': 2,
'adaptec': 1,
'maniac': 40,
'mouthful': 4,
'gnostica': 2,
'pqcc': 1,
'shadowcat': 2,
'szef': 1,
'unpronounced': 1,
'sliaswich': 1,
'latform': 1,
'antispasmodics': 1,
'embedix': 1,
'promissory': 3,
'latticework': 1,
'gafur': 1,
'limon': 6,
'variations': 790,
'residential': 196,
'zoundweogo': 2,
'lassitude': 1,
'tschudi': 2,
'bragga': 2,
'icelandair': 3,
'iuj': 1,
'mausoleums': 5,
'fagioli': 2,
'enos': 13,
'ghadir': 1,
'adelstein': 2,
'haganah': 22,
'collision': 196,
'romeyn': 1,
'boedo': 1,
'caramelizes': 1,
'psychotria': 7,
'resnais': 3,
'turbobooksnob': 1,
'tensilon': 1,
'contact': 1111,
'ddn': 4,
'emplacing': 1,
'serenades': 1,
'stephenson': 81,
'swaffham': 1,
'pbx': 6,
'zhiveli': 1,
'needleless': 1,
'anabasis': 4,
'thaxter': 1,
'xds': 2,
'esala': 1,
'indycar': 2,
'taskforce': 4,
'ueueteotl': 1,
'kafr': 3,
'codename': 40,
'snue': 3,
'decarburized': 1,
'burchill': 2,
'caymanian': 8,
'taikong': 1,
'dhtml': 21,
'alderaan': 1,
'juliani': 2,
'thrombosis': 5,
'petrovaradin': 1,
'musters': 1,
'tahb': 2,
'agapanthaceae': 4,
'benzyl': 12,
'selwood': 1,
'gibbon': 74,
'seafire': 2,
'ostrogradsky': 1,
'whitt': 1,
'blockstackers': 2,
'korda': 6,
'schelenker': 1,
'cottingham': 2,
'gcj': 4,
'eparchia': 1,
'ohmss': 1,
'messianic': 76,
'nfp': 1,
'endocytosed': 1,
'surnameweb': 1,
'mostadir': 2,
'sugarland': 1,
'ency': 3,
'tackhead': 1,
'athonite': 4,
'ninhursag': 25,
'singamen': 1,
'sandwich': 76,
'hexaaqua': 1,
'ministerthis': 1,
'respire': 2,
'yates': 45,
'bargain': 59,
'rspb': 3,
'toiled': 2,
'bocharova': 1,
'surratt': 6,
'warp': 56,
'discala': 1,
'dknf': 3,
'miyamoto': 24,
'kembla': 1,
'moulmien': 1,
'leaving': 1050,
'jarma': 2,
'anise': 23,
'margretetorp': 2,
'dirais': 1,
'walkout': 1,
'boltysh': 1,
'bruner': 4,
'kawakawa': 1,
'sculptors': 58,
'neesa': 1,
'phantasy': 2,
'basilius': 3,
'counterpoint': 146,
'grunewaldturm': 1,
'margulis': 11,
'equidistributed': 1,
'philopateer': 1,
'sopwith': 9,
'hemoglobin': 107,
'birchall': 1,
'mikiver': 1,
'zzuh': 1,
'conditionalities': 2,
'drape': 3,
'ostrogski': 1,
'baldwinsville': 2,
'pentaphylacaceae': 1,
'folkefiende': 1,
'silesian': 23,
'discussing': 198,
'sequentially': 39,
'jearim': 5,
'isches': 2,
'emplaced': 7,
'nitra': 4,
'subbytes': 4,
'kephalaia': 1,
'ohrenzeuge': 1,
'effluunt': 1,
'toco': 1,
'crisps': 8,
'callin': 3,
'entourages': 1,
'topically': 5,
'koshanis': 1,
'bondar': 2,
'feo': 1,
'multipath': 7,
'timson': 1,
'graysubject': 1,
'hourani': 2,
'pstn': 17,
'lockerroom': 1,
'lantana': 2,
'sdsm': 4,
'nakatta': 1,
'crticism': 1,
'suwanose': 1,
'torres': 41,
'lazaro': 1,
'counterpulsation': 1,
'teletypes': 4,
'awali': 1,
'lxiv': 3,
'sonangol': 4,
'miscarries': 1,
'dentary': 2,
'pull': 246,
'guruma': 1,
'amuses': 3,
'elinvar': 1,
'neuendorfer': 1,
'coquaternion': 1,
'aat': 3,
'tetraol': 1,
'iridales': 4,
'adress': 1,
'munificence': 2,
'kitted': 1,
'scarone': 1,
'gentilly': 1,
'agronomist': 4,
'suprematism': 3,
'baeyer': 2,
'petraroia': 1,
'kind': 1496,
'lusavorich': 1,
'intoxication': 45,
'kesserich': 1,
'eghlid': 1,
'leibenstien': 1,
'couronne': 1,
'rituals': 192,
'categorical': 54,
'condensate': 44,
'pean': 1,
'fanu': 13,
'gother': 1,
'astorga': 1,
'aaate': 1,
'deitifying': 1,
'almanack': 6,
'outrage': 62,
'avoid': 1062,
'realists': 15,
'wickets': 69,
'quintus': 42,
'ngurah': 2,
'bsb': 2,
'biophys': 2,
'inutile': 1,
'len': 62,
'valdemar': 27,
'eragon': 2,
'debriefers': 2,
'parlor': 9,
'ceridwen': 1,
'gcg': 2,
'escombra': 1,
'aspern': 5,
'hyracoid': 1,
'montogomery': 1,
'budokan': 1,
'fuliginosa': 1,
'ande': 1,
'nease': 2,
'ay': 47,
'necroticism': 1,
'deletes': 6,
'analogues': 29,
'digitiser': 1,
'levitcus': 1,
'zambian': 12,
'musonius': 5,
'solim': 3,
'watchfulness': 2,
'permament': 4,
'wipo': 91,
'memoriam': 16,
'costo': 1,
'westprussia': 1,
'spinoza': 109,
'theodora': 34,
'lins': 5,
'glennie': 2,
'wichman': 1,
'unpressurised': 1,
'membrane': 411,
'xuanwu': 1,
'odevaere': 1,
'cemach': 1,
'colonic': 2,
'kitur': 1,
'teena': 3,
'arastirma': 1,
'gnowee': 1,
'spinibarbus': 1,
'shneiderman': 1,
'kosmische': 2,
'pearcey': 2,
'danann': 14,
'hashd': 1,
'zoromes': 1,
'usnewslink': 2,
'saxophonist': 50,
'roja': 3,
'dharmic': 7,
'gdp': 1169,
'lennart': 5,
'schabir': 1,
'nakhla': 1,
'mcas': 2,
'gedanite': 1,
'palminteri': 3,
'bretella': 1,
'wayans': 8,
'battus': 3,
'songshan': 1,
'ionescu': 1,
'inhabits': 17,
'ogooue': 3,
'ariyya': 1,
'revenge': 249,
'kultur': 5,
'massacred': 52,
'mostest': 1,
'tandem': 43,
'netx': 1,
'rambo': 6,
'tripes': 1,
'beat': 536,
'melanoleuca': 9,
'vasaplatsen': 1,
'megillah': 3,
'mineptah': 1,
'stomachs': 14,
'shapiro': 40,
'operculum': 3,
'devereaux': 1,
'kayko': 1,
'chimpanzees': 47,
'sct': 3,
'ffp': 1,
'lasfar': 1,
'dont': 15,
'bavin': 1,
'dellacroce': 2,
'methimazole': 1,
'daeraouenn': 1,
'hercegovine': 2,
'subplot': 10,
'friemsprooch': 1,
'cellcom': 3,
'remarry': 15,
'vanda': 2,
'inx': 6,
'certifiable': 1,
'bichvinta': 1,
'technicum': 2,
'bosetti': 1,
'drips': 7,
'refract': 4,
'woodsy': 1,
'inchbonny': 1,
'cscr': 1,
'endorse': 43,
'daniell': 15,
'reitpony': 1,
'christocentric': 1,
'korsun': 1,
'kankkunen': 1,
'huda': 2,
'wandra': 1,
'dysrhythmia': 5,
'hinkle': 2,
'resumable': 1,
'eiserne': 2,
'holmr': 1,
'holocaust': 498,
'inescutcheon': 1,
'keo': 4,
'horsk': 1,
'ambalat': 4,
'hoovers': 3,
'euxine': 4,
'edukacine': 1,
'chechahcos': 3,
'impuestos': 1,
'lymington': 5,
'eslpdpro': 1,
'muttra': 2,
'klitou': 2,
'gynaecologists': 2,
'lateritic': 2,
'tarkan': 1,
'blackjacks': 6,
'krupp': 50,
'kawasaki': 17,
'rapsur': 1,
'comment': 324,
'paschkis': 2,
'tughluq': 1,
'masurien': 1,
'tufnel': 3,
'railton': 1,
'dusis': 1,
'seyon': 1,
'mcnamara': 30,
'brizio': 1,
'grappled': 2,
'undernourished': 6,
'biomolecular': 3,
'morroco': 3,
'torngasoak': 1,
'footbal': 1,
'slackness': 1,
'goleta': 1,
'comb': 71,
'agust': 15,
'klowdan': 2,
'bretthorst': 1,
'frusoni': 1,
'songcatchers': 1,
'roimh': 2,
'dechei': 1,
'seznec': 2,
'hopefully': 38,
'ezenarro': 1,
'krebbs': 1,
'berthier': 4,
'pegged': 48,
'domestically': 45,
'torpediniformes': 1,
'nofziger': 1,
'wschodzi': 1,
'atrophicus': 1,
'saarinen': 12,
'ifi': 1,
'statesman': 380,
'albertsons': 2,
'chazelle': 1,
'prepositioning': 7,
'galeatus': 2,
'honfleur': 4,
'machen': 10,
'bmx': 9,
'ostfriesland': 2,
'carlist': 18,
'valera': 74,
'confuse': 76,
'iterators': 5,
'spingovics': 1,
'omeros': 3,
'accelerating': 64,
'vlbi': 3,
'mortise': 2,
'phlomobacter': 1,
'strapline': 2,
'faaglar': 1,
'skel': 8,
'aznan': 1,
'csascii': 1,
'rosbanen': 1,
'barbarossa': 51,
'discover': 276,
'wexfordman': 1,
'responses': 216,
'laurentianus': 3,
'iaorg': 1,
'reuse': 33,
'malt': 65,
'demarchy': 2,
'forre': 1,
'konaseema': 1,
'wulfila': 3,
'illiterates': 1,
'maneggio': 1,
'cgtm': 1,
'deum': 12,
'informationsschriften': 1,
'coleophora': 1,
'qbz': 1,
'contrapunctal': 1,
'prefrences': 1,
'illo': 2,
'melia': 1,
'ammah': 2,
'collisions': 116,
'gymnocercus': 1,
'feline': 32,
'yadav': 5,
'revalent': 1,
'danaides': 1,
'antihero': 3,
'velocites': 2,
'leipziginfo': 1,
'deusdedit': 1,
'givant': 1,
'connectors': 66,
'cundinamarcensis': 1,
'telencephalisation': 1,
'nationalpark': 1,
'extraperitoneal': 1,
'bundesmarine': 5,
'nazianzos': 4,
'antilinear': 1,
'ager': 5,
'rossetta': 2,
'abdelazar': 2,
'yesand': 1,
'speccie': 1,
'adamou': 2,
'azurite': 1,
'deoxyhemoglobin': 2,
'camiller': 2,
'botaniates': 4,
'grantees': 1,
'spataro': 1,
'barbe': 2,
'merinid': 2,
'matematicheskikh': 1,
'doron': 5,
'ergosphere': 6,
'pentaamminechlorocobalt': 1,
'feudatories': 7,
'planetfall': 11,
'strafe': 4,
'powermacs': 5,
'preen': 1,
'kekouan': 1,
'accugroove': 1,
'glr': 7,
'chali': 1,
'sarcophagi': 10,
'scannd': 1,
'ozhypnosis': 1,
'brittannica': 1,
'colin': 178,
'vdots': 8,
'hispano': 13,
'enigmatoze': 1,
'mordant': 13,
'tavas': 3,
'ige': 17,
'goretex': 1,
'gymnopaedia': 1,
'dsc': 4,
'alfen': 2,
'justicialist': 2,
'yallop': 1,
'btfsplk': 2,
'psinet': 1,
'transportas': 1,
'biochemically': 2,
'torona': 1,
'hercules': 160,
'multis': 2,
'gallbladder': 21,
'ragnit': 1,
'wltw': 1,
'cata': 2,
'eliminator': 3,
'lipolysis': 8,
'superquadrics': 1,
'litt': 29,
'insecticide': 24,
'antimicrobial': 11,
'libi': 17,
'proprioception': 8,
'curonians': 4,
'silat': 2,
'petrodiesel': 1,
'posion': 1,
'chaux': 5,
'barrows': 9,
'grrm': 29,
'fving': 1,
'soliti': 1,
'styrofoam': 7,
'alito': 4,
'thymomas': 1,
'damascius': 6,
'civil': 3443,
'halflings': 2,
'stated': 1383,
'panache': 3,
'sabbatical': 9,
'landstreicher': 1,
'mashita': 1,
'pleads': 25,
'algerie': 3,
'eea': 36,
'gagliano': 2,
'baltikum': 2,
'egy': 2,
'chilterns': 1,
'advocaat': 1,
'bartlowicz': 1,
'munam': 2,
'party': 6943,
'unction': 17,
'jezkov': 1,
'gekommen': 1,
'malkin': 3,
'transzendentale': 1,
'esters': 71,
'sandlin': 1,
'telefantasy': 1,
'nyby': 1,
'solanine': 1,
'tago': 2,
'gdcl': 1,
'hesperoleucus': 1,
'servius': 11,
'cicuta': 2,
'civitatis': 1,
'juliae': 1,
'proposal': 460,
'magnaflow': 1,
'peliz': 1,
'researchers': 615,
'athiest': 1,
'localtalk': 2,
'makah': 6,
'rennovation': 1,
'soursop': 2,
'housekeepers': 2,
'bretons': 9,
'vle': 4,
'kasim': 1,
'ceolmhar': 1,
'scholfield': 1,
'thousander': 2,
'rajput': 14,
'decibannage': 1,
'foreheads': 3,
'multimark': 1,
'transferral': 1,
'schistosomal': 1,
'eccentrically': 2,
'midibus': 1,
'mappings': 12,
'rabban': 13,
'fadlallah': 1,
'absconding': 1,
'maajka': 1,
'gedae': 1,
'olandt': 1,
'unzip': 2,
'prealps': 1,
'aboot': 4,
'attacotti': 2,
'antianxiety': 1,
'decimals': 13,
'perish': 34,
'thorne': 19,
'ahikuntaka': 1,
'dcnos': 1,
'rajasthani': 5,
'nemrod': 2,
'trimethoprim': 1,
'kamman': 1,
'katrina': 82,
'cwiss': 1,
'fromuth': 2,
'unwinding': 7,
'manipular': 1,
'charityware': 1,
'aara': 6,
'flyingfish': 1,
'netherlandish': 3,
'premised': 6,
'forelock': 1,
'infertile': 18,
'alyat': 1,
'voyages': 99,
'altarpiece': 16,
'encre': 1,
'menticide': 1,
'fittest': 15,
'nishkam': 1,
'proliant': 2,
'absu': 1,
'intoxicating': 6,
'lookahead': 9,
'conscience': 145,
'costas': 79,
'duquesne': 6,
'pino': 8,
'unabsehbare': 1,
'desmosedici': 2,
'herod': 66,
'ceti': 16,
'overflight': 4,
'maba': 1,
'reiger': 1,
'callipyge': 1,
'durie': 3,
'abenner': 3,
'katakana': 99,
'shabak': 3,
'sackheim': 1,
'nepente': 1,
'boopsie': 10,
'radiotherapeutic': 1,
'nachtigal': 1,
'diseconomy': 1,
'mangoes': 5,
'satchmo': 11,
'nitrile': 13,
'oyster': 43,
'arabe': 5,
'entertwined': 1,
'shearman': 3,
'rolighed': 1,
'lillywhite': 1,
'undertakings': 19,
'bernarr': 3,
'beauvoir': 23,
'cosmonaut': 67,
'aramite': 1,
'campana': 6,
'barragan': 1,
'baylon': 2,
'cultic': 24,
'prog': 13,
'frotteurism': 5,
'creationists': 61,
'extradition': 40,
'skowron': 1,
'qaim': 2,
'resveratrol': 2,
'stava': 1,
'kumbalom': 1,
'conseillers': 5,
'kerneltrap': 1,
'leijonhufvud': 1,
'hymenaios': 1,
'foes': 41,
'licorne': 2,
'chemosensory': 1,
'zeropaid': 1,
'sybian': 1,
'thermoelectricity': 3,
'baptismo': 2,
'mallorquin': 4,
'chongzhi': 1,
'abipones': 5,
'pelje': 2,
'keisker': 1,
'wingen': 1,
'grosse': 23,
'hubbed': 3,
'caciocavallo': 1,
'capillarity': 2,
'aftab': 3,
'husked': 1,
'wiecino': 17,
'canuti': 2,
'attenuators': 1,
'murena': 3,
'relegating': 6,
'reapportionment': 1,
'xu': 25,
'devised': 217,
'laotians': 8,
'temazepam': 2,
'demolishment': 1,
'ventricle': 15,
'pownal': 3,
'pageants': 5,
'uttu': 3,
'peaky': 1,
'boye': 3,
'canadice': 2,
'gorditas': 1,
'flores': 57,
'stigmas': 5,
'bludgeonings': 1,
'titoist': 1,
'merchants': 260,
'chambres': 3,
'haggada': 2,
'lustreless': 1,
'underbrace': 6,
'subtilisin': 1,
'retorica': 1,
'sabbaths': 5,
'blindheit': 1,
'talbot': 44,
'anagalida': 1,
'sariel': 2,
'selenia': 2,
'kisumu': 4,
'todd': 166,
'infano': 1,
'tinymuse': 1,
'almsgiving': 7,
'rowena': 9,
'ironmonger': 2,
'borradori': 1,
'atypicals': 4,
'smithkline': 1,
'bareki': 1,
'chione': 2,
'ellus': 1,
'strop': 2,
'nationalizing': 4,
'wakame': 2,
'licori': 1,
'recalculated': 6,
'jq': 1,
'martial': 639,
'kronheimer': 1,
'opresi': 1,
'earthworm': 32,
'lpu': 6,
'kyukoku': 1,
'weigard': 1,
'jaden': 2,
'grumiaux': 1,
'dignified': 28,
'juventud': 4,
'serogroup': 1,
'autokefalicznego': 1,
'barroco': 2,
'hawking': 51,
'baluchestan': 3,
'uldis': 1,
'cultured': 60,
'medicinally': 6,
'leegot': 1,
'paleckis': 1,
'malcolmus': 1,
'chronograms': 1,
'kinneging': 1,
'neuropterous': 1,
'mokele': 6,
'somar': 2,
'umbrinus': 1,
'arcweb': 1,
'quijano': 2,
'pituriaspida': 1,
'liverani': 1,
'yamachi': 1,
'plowboy': 1,
'polymixiiformes': 1,
'pack': 248,
'hymnwriter': 2,
'lambic': 4,
'gansters': 1,
'quizzically': 1,
'macqueen': 1,
'ginits': 1,
'paton': 12,
'payame': 2,
'hmac': 21,
'photoplay': 5,
'shantipur': 1,
'artillerie': 1,
'iots': 1,
'liking': 36,
'wize': 1,
'quilts': 1,
'tardigradus': 1,
'cardelli': 6,
'rihtiniemi': 2,
'iste': 1,
'alphekka': 2,
'pinewood': 5,
'ntruencrypt': 1,
'comedians': 106,
'eichlami': 1,
'koniag': 1,
'kawaimina': 1,
'attiret': 2,
'fleener': 2,
'unpopularity': 27,
'aiwaz': 1,
'dongsishitiao': 1,
'baffin': 25,
'articulator': 1,
'bullhead': 1,
'ujaama': 1,
'marketability': 3,
'albumsbob': 1,
'strawman': 2,
'sinofsky': 1,
'enthusiasms': 6,
'lasdehnen': 1,
'parodically': 2,
'dohrn': 1,
'ulnar': 1,
'faired': 4,
'wannabe': 1,
'ijc': 1,
'superscript': 16,
'merarites': 1,
'kdwb': 1,
'iguanids': 1,
'virgin': 446,
'kajn': 1,
'aceramic': 2,
'cyclone': 50,
'intelligible': 75,
'ergonomie': 1,
'viz': 39,
'britcom': 1,
'ackey': 1,
'gann': 7,
'galand': 1,
'tauros': 2,
'ikammanen': 1,
'evacuating': 12,
'checagou': 1,
'sword': 596,
'turps': 3,
'knorr': 2,
'didactylus': 3,
'deleted': 83,
'microfortnight': 1,
'exemptus': 1,
'sauraseni': 1,
'vicarage': 12,
'theodorus': 7,
'shimane': 3,
'carnoy': 1,
'proteolytic': 3,
'wienerwald': 1,
'waitemata': 8,
'brews': 18,
'bumiputras': 1,
'ambiguous': 195,
'renaldo': 5,
'pambazos': 1,
'kacem': 1,
'adel': 12,
'lingua': 78,
'keyn': 1,
'waid': 11,
'hypnotherapy': 23,
'belleek': 1,
'cathedral': 550,
'inculcated': 5,
'polglase': 1,
'exil': 2,
'chava': 1,
'emerick': 1,
'nazims': 2,
'copulatio': 1,
'empelor': 1,
'promontory': 19,
'constabulary': 22,
'imperforation': 1,
'mdir': 1,
'musik': 37,
'puffin': 6,
'corita': 1,
'benaud': 4,
'dependents': 27,
'seefahrer': 1,
'salaca': 1,
'zinovievna': 1,
'uspenskaia': 1,
'ginster': 1,
'lithographer': 3,
'morgaine': 2,
'ratae': 1,
'gadaba': 1,
'rbau': 1,
'ats': 21,
'lldin': 1,
'eritreans': 19,
'chapterhouse': 15,
'strangled': 26,
'patterns': 718,
'joppy': 1,
'mboxg': 1,
'penrose': 40,
'mimimalists': 1,
'oceania': 100,
'laish': 1,
'southamerican': 1,
'figuere': 1,
'turchetta': 1,
...})
In [11]:
dictionary = dict()
for word, _ in count:
dictionary[word] = len(dictionary)
In [12]:
type(dictionary)
Out[12]:
dict
In [13]:
dictionary
Out[13]:
{'kyrie': 38567,
'bbbb': 30107,
'factually': 39531,
'deane': 36772,
'interlude': 34358,
'seventh': 3833,
'haven': 7089,
'linebarger': 24744,
'garc': 15348,
'stephenson': 13505,
'pln': 40141,
'theron': 24290,
'mime': 11248,
'formations': 7101,
'rulebase': 29374,
'turkey': 2222,
'counternarcotics': 44613,
'poseidon': 13142,
'socks': 17009,
'ginza': 48584,
'vulnerabilities': 21888,
'transforms': 11624,
'yukio': 45944,
'metrodome': 37431,
'realization': 10831,
'shahanshah': 37432,
'warnings': 14737,
'laren': 29375,
'activating': 25909,
'wrecking': 46562,
'unguarded': 46563,
'mamluk': 34659,
'jacques': 3757,
'ioannis': 49345,
'detachable': 44612,
'rameses': 49435,
'vincendeau': 34058,
'newsstand': 49219,
'bonfires': 35768,
'faithful': 6944,
'channel': 1091,
'prompting': 12954,
'mandating': 38207,
'richie': 22154,
'excitation': 24987,
'anaximander': 20716,
'lap': 14312,
'adverb': 27212,
'moranis': 48597,
'maniac': 20848,
'nepotism': 41512,
'endorse': 20171,
'habilitation': 37596,
'cpa': 39365,
'motivating': 25766,
'lengthened': 28111,
'novel': 841,
'alois': 24869,
'corcovado': 37096,
'variations': 2290,
'subliminal': 27046,
'residential': 7261,
'hopefully': 21630,
'feud': 16067,
'amalthea': 32128,
'donkeys': 41513,
'missy': 45929,
'discogs': 43333,
'vivekananda': 45265,
'dowling': 43496,
'juan': 4340,
'experienced': 2707,
'rosette': 36773,
'haganah': 29373,
'aor': 47045,
'grandmaster': 17658,
'distilling': 34510,
'contact': 1622,
'orchid': 26752,
'tranquillity': 41514,
'fares': 18393,
'gwh': 8123,
'hammers': 23332,
'announced': 1423,
'culmination': 20238,
'owe': 13844,
'begged': 23974,
'cheques': 26521,
'xof': 45945,
'dhtml': 30109,
'hydrate': 44081,
'grok': 44028,
'consist': 3959,
'sumer': 26044,
'beekeeping': 41996,
'benzyl': 40563,
'gibbon': 14260,
'teschen': 44082,
'stony': 26476,
'aliyah': 39338,
'foraging': 19968,
'sosa': 27663,
'messianic': 14023,
'pay': 1898,
'bee': 8613,
'birley': 38212,
'wan': 14563,
'carmelite': 40142,
'continuations': 35197,
'ninhursag': 27350,
'externally': 17520,
'jamo': 12518,
'nanometers': 36081,
'sandwich': 14024,
'adjacency': 48353,
'fernandez': 23637,
'yates': 19496,
'bargain': 16552,
'ridgway': 45266,
'recording': 2094,
'warp': 17010,
'gander': 33301,
'stockholders': 36774,
'unsatisfactory': 27664,
'leaving': 1717,
'anise': 28630,
'dependents': 26477,
'plotinus': 19308,
'approximated': 18207,
'eritreans': 32352,
'feelin': 47379,
'omnivores': 49346,
'sculptors': 16633,
'fairies': 16415,
'leet': 9443,
'counterpoint': 8984,
'develop': 1691,
'protein': 2362,
'kummer': 48598,
'enrollment': 15097,
'analects': 22069,
'sopwith': 47136,
'hemoglobin': 11222,
'bookmarks': 38022,
'semantic': 9023,
'cannery': 48599,
'babel': 19373,
'eyebrows': 26622,
'kip': 30965,
'silesian': 28631,
'discussing': 7215,
'sequentially': 21149,
'separation': 3487,
'anxious': 18394,
'collectivization': 22667,
'diethyl': 31686,
'wikiquote': 34364,
'kib': 24414,
'landscape': 4641,
'device': 1499,
'clarinetist': 37438,
'mannerheim': 25910,
'memetic': 17627,
'ascendancy': 25357,
'maceo': 40578,
'defer': 37634,
'torres': 20565,
'batsman': 18030,
'gro': 22430,
'gaff': 49347,
'pull': 6146,
'bail': 20991,
'mauro': 28475,
'eller': 49348,
'gematria': 27354,
'evangelica': 49768,
'certificate': 10475,
'offerings': 12357,
'sephardic': 20239,
'resolutely': 45946,
'mining': 3429,
'kind': 1190,
'cones': 13481,
'trade': 513,
'intoxication': 19497,
'meteorological': 17301,
'livestock': 7418,
'jamie': 15877,
'encourage': 5506,
'petrified': 30332,
'rituals': 7384,
'categorical': 17413,
'indicator': 9039,
'condensate': 19751,
'ratios': 7825,
'retroactive': 39339,
'outrage': 15946,
'hutchison': 41515,
'avoid': 1699,
'realists': 36075,
'metaphysical': 9055,
'ours': 16337,
'quintus': 20300,
'mrna': 9528,
'viridis': 40144,
'microprogram': 23863,
'len': 15947,
'valdemar': 26185,
'parlor': 47137,
'mentally': 8825,
'vos': 34265,
'roman': 255,
'lifeless': 33302,
'u': 146,
'moss': 14463,
'activation': 11208,
'utterances': 25768,
'analysis': 900,
'commune': 13589,
'downloaded': 14555,
'disposition': 14738,
'ay': 18967,
'chalukyas': 34359,
'analogues': 25098,
'feathers': 12096,
'laissez': 12971,
'hodder': 30647,
'inventions': 9104,
'coexisted': 41516,
'wipo': 12453,
'memoriam': 34948,
'burgeoning': 19844,
'positively': 12407,
'theodora': 22893,
'scientifically': 12551,
'bj': 6368,
'acted': 5158,
'aptitude': 28476,
'landowners': 18802,
'fcc': 16036,
'membrane': 4054,
'indecisive': 32825,
'gift': 4822,
'rna': 6645,
'anglia': 23432,
'golgi': 18438,
'danann': 37433,
'botvinnik': 45267,
'lifts': 18803,
'bs': 14990,
'consult': 9684,
'gdp': 1532,
'partners': 3671,
'highlights': 10562,
'annexes': 25346,
'forgotten': 6082,
'confinement': 17157,
'inhabits': 33801,
'typified': 21403,
'revenge': 6093,
'rstenberg': 29738,
'massacred': 17838,
'tandem': 20028,
'luiz': 49541,
'beat': 3249,
'melanoleuca': 47138,
'dodecahedron': 14966,
'minds': 6251,
'stomachs': 37434,
'albany': 14780,
'horizontally': 12179,
'rakyat': 45948,
'chimpanzees': 18968,
'logarithmic': 15236,
'hezekiah': 12495,
'bearable': 49350,
'antwerp': 11909,
'evenings': 27044,
'hq': 24402,
'subplot': 44614,
'priests': 4051,
'remarry': 36076,
'ruptured': 48106,
'nig': 28285,
'mishneh': 32129,
'windowing': 37099,
'terrace': 16130,
'negligent': 47240,
'aidan': 23119,
'licensed': 5175,
'dell': 10542,
'rhyolite': 43519,
'confounding': 49855,
'loves': 10672,
'yag': 27666,
'smoky': 27357,
'francophones': 28944,
'magnify': 32353,
'vitro': 16590,
'templars': 24870,
'girolamo': 23008,
'pelham': 27045,
'holocaust': 3453,
'misinterpreted': 30746,
'liam': 16967,
'copa': 37448,
'catalana': 42992,
'correctness': 17051,
'symphonies': 17579,
'strange': 4084,
'gurdjieff': 49842,
'affirm': 18660,
'brandon': 17519,
'kawasaki': 33802,
'comment': 4956,
'hc': 21707,
'dane': 23975,
'huck': 34155,
'apec': 27667,
'mcnamara': 24625,
'crush': 12298,
'indictees': 38569,
'ignosticism': 45594,
'kell': 31202,
'comb': 14623,
'agust': 36078,
'abbreviate': 48601,
'boxcar': 47864,
'buried': 2781,
'intersystems': 41351,
'lithuania': 3003,
'lands': 2112,
'man': 243,
'snorri': 15846,
'unravel': 44084,
'sns': 33063,
'pegged': 18742,
'decorating': 32592,
'languedoc': 43498,
'domestically': 19498,
'lees': 39130,
'maitreya': 39401,
'saarinen': 40566,
'statesman': 4345,
'zalta': 48602,
'values': 1140,
'bmx': 47139,
'valera': 14261,
'glycogen': 19520,
'swift': 8228,
'accelerating': 15644,
'ventilated': 46567,
'rehabilitated': 35770,
'nayla': 45269,
'coldcut': 39341,
'subordinate': 8547,
'catalyze': 34376,
'doubled': 9009,
'cultivated': 10320,
'discover': 5621,
'grabs': 34059,
'articles': 1097,
'responses': 6754,
'enos': 38935,
'reuse': 23331,
'malt': 15505,
'grassy': 26623,
'differed': 11944,
'suda': 42000,
'deum': 40567,
'almagest': 24521,
'february': 596,
'closed': 1631,
'upn': 25236,
'purify': 21392,
'qualifiers': 43499,
'toot': 45386,
'collisions': 10573,
'budding': 24871,
'dara': 28477,
'laboratories': 9633,
'feline': 23737,
'subtraction': 21236,
'shalt': 29926,
'peasant': 8213,
'seminole': 31444,
'connectors': 15349,
'shahada': 49351,
'redeem': 31434,
'bangs': 21580,
'hangzhou': 43764,
'phonograph': 24872,
'bookrags': 48817,
'situs': 48603,
'infections': 7477,
'evaluations': 30541,
'baking': 16914,
'aylwin': 45950,
'turner': 8129,
'omari': 25911,
'schubert': 25792,
'planetfall': 42434,
'bruno': 7401,
'orcs': 46568,
'methionine': 34060,
'rupee': 23120,
'spikes': 24059,
'sarcophagi': 44615,
'colin': 7809,
'rsync': 49352,
'advantageous': 15024,
'fight': 1784,
'gin': 8390,
'anthropomorphism': 25769,
'ige': 33803,
'landmasses': 41059,
'crackers': 23337,
'screens': 9158,
'ray': 1959,
'hercules': 8400,
'gallbladder': 30110,
'shifter': 43528,
'suitably': 29025,
'litt': 25099,
'insecticide': 28284,
'antimicrobial': 42435,
'citizenship': 3639,
'radek': 39725,
'andrei': 13746,
'risen': 11834,
'libi': 33804,
'storks': 38209,
'catholic': 566,
'ascertaining': 39726,
'quasars': 33311,
'possessive': 17889,
'convention': 1185,
'intramolecular': 48604,
'barrows': 47141,
'grrm': 25100,
'awakened': 24291,
'implant': 26624,
'dendritic': 35198,
'emotionally': 14471,
'https': 14249,
'aeolian': 27047,
'neil': 5118,
'stated': 1290,
'sabbatical': 47142,
'alces': 34660,
'obfuscated': 41546,
'pleads': 27351,
'rideau': 39342,
'eea': 22155,
'belfast': 9142,
'vandals': 12180,
'psi': 5402,
'party': 202,
'unction': 33805,
'impossibility': 19294,
'lob': 37833,
'esters': 14624,
'servius': 42436,
'naples': 7914,
'proposal': 3697,
'semi': 1941,
'erebus': 34061,
'basque': 3198,
'gernika': 47143,
'stranger': 10835,
'symbolizes': 20920,
'deceiver': 44015,
'harlequin': 36432,
'dtv': 23433,
'juneau': 35771,
'bretons': 47144,
'subtlety': 25237,
'rajput': 37435,
'logistic': 19392,
'provability': 49720,
'moulton': 45951,
'toothpaste': 39727,
'deride': 42302,
'heresiologists': 43501,
'kalahari': 21555,
'tischendorf': 26891,
'vagina': 15267,
'calculated': 4900,
'mappings': 40568,
'rabban': 38938,
'shreveport': 35471,
'tezuka': 26625,
'galahad': 38571,
'zn': 36776,
'topaz': 39802,
'perish': 22894,
'thorne': 31893,
'appendicitis': 40146,
'battles': 3087,
'katrina': 13361,
'equilateral': 36434,
'male': 645,
'liberate': 20101,
'taoist': 21237,
'pub': 8140,
'infertile': 32816,
'schottky': 46889,
'clytemnestra': 29540,
'ghosting': 47869,
'statistical': 3298,
'genocidal': 46570,
'voyages': 11816,
'governorship': 21889,
'altarpiece': 34949,
'fittest': 36079,
'jotham': 46552,
'permian': 21066,
'hydroelectricity': 46041,
'trigonometric': 15111,
'cielo': 45952,
'lookahead': 47145,
'eldest': 6675,
'costas': 13675,
'norte': 16315,
'halas': 23223,
'herod': 15350,
'id': 4060,
'masorti': 22070,
'frontier': 4538,
'mammals': 4145,
'anschluss': 15430,
'comma': 16869,
'lambda': 4849,
'discontinuing': 47870,
'amphibians': 20102,
'boopsie': 45048,
'raison': 30542,
'shorthair': 39028,
'brunswick': 5957,
'vociferous': 41519,
'satchmo': 42437,
'nitrile': 38939,
'wenders': 49767,
'theremin': 14428,
'publique': 27049,
'flat': 2178,
'rajendra': 45953,
'recreational': 7175,
'vite': 48863,
'undertakings': 31894,
'filler': 39533,
'carbines': 22796,
'wichita': 21867,
'leaflet': 37101,
'scoreless': 34661,
'lengths': 7508,
'sabo': 30748,
'cultic': 27958,
'karzai': 25912,
'creationists': 16109,
'offset': 8013,
'extradition': 20939,
'wigs': 43539,
'foes': 20566,
'bactria': 26626,
'anatomists': 47503,
'stiffer': 46571,
'imprisoned': 5849,
'reflector': 18439,
'insure': 26478,
'scrupulously': 40147,
'verification': 13298,
'flanagan': 35772,
'joaqu': 32131,
'bettors': 46572,
'anatta': 45986,
'grosse': 28633,
'willed': 30543,
'methodists': 18854,
'designations': 8890,
'grenadines': 21789,
'synagogues': 13093,
'wiecino': 33806,
'catalans': 40148,
'bedroom': 15125,
'devised': 6730,
'improvising': 39728,
'ventricle': 36080,
'neo': 3856,
'elsewhere': 2908,
'sign': 1442,
'kzinti': 45273,
'lamo': 28478,
'temporary': 3610,
'flores': 16821,
'deleuze': 12406,
'browse': 24832,
'nationalist': 4038,
'oxidize': 35773,
'bahadur': 35922,
'tragically': 34662,
'sore': 25681,
'talbot': 19752,
'ismail': 14813,
'intellect': 12311,
'communally': 39729,
'democracies': 10492,
'todd': 8200,
'nightingale': 35774,
'larval': 33150,
'gained': 1511,
'flamebait': 44086,
'secretive': 23224,
'rowena': 47146,
'eyepiece': 39193,
'openoffice': 23864,
'obedient': 34663,
'vincent': 5722,
'vegetarianism': 19569,
'raskin': 29186,
'riverine': 42486,
'veterinary': 14068,
'divers': 13590,
'autonomously': 45984,
'aztlan': 48607,
'martial': 2779,
'ratify': 16229,
'earthworm': 23738,
'antinomies': 48475,
'hampering': 46573,
'dignified': 25637,
'sandwiched': 32008,
'caesarion': 42003,
'hawking': 18036,
'kinetochores': 37102,
'cultured': 16267,
'samsung': 35775,
'mille': 32354,
'wcl': 20103,
'aardvarks': 34950,
'bloch': 14702,
'keswick': 46574,
'clarendon': 16230,
'grigori': 30136,
'buttocks': 27911,
'eredivisie': 45954,
'pryor': 24873,
'jakarta': 9841,
'inlets': 34664,
'teleprinter': 36430,
'amp': 11884,
'ksr': 40611,
'mouthpieces': 38573,
'pack': 6107,
'portrait': 3740,
'defying': 40149,
'vijayanagara': 44087,
'untold': 26604,
'outputting': 48608,
'paton': 40570,
'hmac': 30112,
'colonize': 26253,
'hendrix': 5976,
'vernacular': 9849,
'transient': 15389,
'walton': 18342,
'quaestor': 38574,
'invisibly': 49354,
'liking': 22156,
'passau': 49355,
'botha': 31436,
'burt': 19885,
'gillian': 35470,
'brushing': 40150,
'slip': 11725,
'pillars': 11345,
'coals': 35776,
'railways': 4696,
'comedians': 11281,
'symptomatic': 30544,
'gygax': 21980,
'aviv': 13905,
'baffin': 27352,
'trisomy': 26342,
'familia': 14988,
'epicurus': 18747,
'montenegro': 6249,
'basso': 20719,
'thespis': 44684,
'fecal': 34360,
'shrunk': 23064,
'hatred': 8064,
'molds': 23434,
'willful': 38392,
'supplier': 13435,
'waterhouse': 43204,
'skill': 3383,
'lifeform': 45274,
'sagrada': 32838,
'tensions': 6199,
'stoiber': 23525,
'cyclone': 18277,
'intelligible': 14163,
'purpurea': 37541,
'hypothyroidism': 29928,
'aerials': 42004,
'machen': 46755,
'cabins': 49417,
'lifeforms': 28848,
'demotic': 35472,
'rdoba': 17930,
'migrants': 14171,
'sword': 2970,
'highland': 7373,
'deleted': 13259,
'abeda': 48609,
'vicarage': 40572,
'vertov': 15938,
'cyclical': 27670,
'brews': 32818,
'received': 639,
'ambiguous': 7294,
'humourous': 35778,
'knit': 15893,
'hobson': 46939,
'adel': 40573,
'lingua': 13795,
'relevant': 3888,
'endures': 37315,
'hypnotherapy': 28634,
'rusty': 26892,
'cuc': 34665,
'cathedral': 3191,
'hotline': 45992,
'thermionic': 36111,
'cope': 10321,
'scales': 4796,
'budo': 33065,
'pascha': 36082,
'ardent': 18395,
'bottoms': 19570,
'lepidus': 30749,
'gordie': 42439,
'twenty': 1674,
'musik': 21794,
'heading': 9240,
'wildlife': 5591,
'spivak': 42440,
'csx': 47873,
'keywords': 20639,
'consubstantiation': 34801,
'lazuli': 42005,
'archaeology': 6431,
'xinhua': 49358,
'vegeta': 29739,
'merriam': 24745,
'viewers': 6818,
'branches': 2647,
'govern': 9302,
'ats': 30113,
'philistine': 34802,
'chapterhouse': 36083,
'strangled': 26754,
'patterns': 2507,
'marshland': 38239,
'legis': 47874,
'censored': 18089,
'oxidized': 17357,
'rosewood': 46576,
'parking': 10136,
'portmanteau': 24746,
'flamsteed': 19571,
'chaim': 23534,
'ness': 9431,
'sectors': 5120,
'gladiatorial': 34361,
'xxxix': 33066,
'motivate': 25928,
'isothermal': 43505,
'mayonnaise': 28849,
'vector': 2292,
'baur': 45276,
'timor': 6868,
'impeller': 42006,
'serve': 1486,
'constantine': 2897,
'percentage': 3102,
'multipoint': 42994,
'atat': 25638,
'warship': 15990,
'supposition': 26405,
'coordinator': 11556,
'moran': 25516,
'spacey': 20856,
'worships': 38024,
'walter': 2755,
'fl': 8595,
'workplace': 14896,
'innovators': 30701,
'bos': 21631,
'godzilla': 4332,
'jl': 36113,
'hfs': 32356,
'nachrichten': 47875,
'charlie': 5109,
'manifolds': 11305,
'canister': 38213,
'dreamers': 25770,
'sulaiman': 43506,
'margulis': 42433,
'announcers': 33541,
'katsuhiro': 48611,
'dramatists': 20992,
'lounge': 21969,
'overpowered': 38576,
'endorsement': 14763,
'thebaid': 47655,
'cyborgs': 27050,
'peripheral': 9056,
'drinker': 24748,
'harden': 42007,
'defraud': 44089,
'polar': 5365,
'thousand': 2333,
'gettysburg': 8685,
'subdisciplines': 34853,
'decried': 27368,
'pundits': 31895,
'enciphered': 48612,
'quizzes': 44618,
'kellerman': 37456,
'grasped': 32819,
'coler': 44793,
'deoxyribonucleic': 47148,
'spoofs': 43507,
'renunciation': 21984,
'acknowledgement': 26916,
'arsenal': 6373,
'rattle': 31896,
'surplus': 8333,
'fiercely': 18208,
'mandela': 17794,
'pads': 13362,
'hemophilia': 33807,
'alderney': 29542,
'delusional': 28635,
'drill': 14739,
'groom': 14286,
'sprout': 39776,
'doubting': 41061,
'youssef': 34952,
'clinical': 4169,
'refurbished': 30334,
'unwillingness': 19759,
'waite': 34666,
'sociobiology': 27532,
'photoelectric': 25358,
'catalysts': 16679,
'uninhabitable': 29740,
'habsburgs': 15201,
'unfold': 33808,
'strassman': 46577,
'sza': 34953,
'indication': 7581,
'marcius': 43508,
'mayen': 31897,
'den': 7591,
'weston': 25101,
'actresses': 13227,
'freeways': 13165,
'continuous': 2157,
'layman': 21002,
'dnow': 29188,
'nims': 47150,
'choreography': 25908,
'erne': 42995,
'mgm': 14881,
'fk': 40703,
'apicomplexa': 41521,
'fifa': 7815,
'woodcuts': 34994,
'vietnam': 2342,
'junius': 25102,
'kuan': 15923,
'augustinians': 47253,
'korps': 37834,
'introns': 27671,
'bamboo': 12844,
'cod': 16268,
'chuck': 7693,
'doj': 31433,
'colorless': 20105,
'etiology': 27051,
'proofing': 41997,
'democratically': 13612,
'wallenberg': 43861,
'free': 247,
'kodansha': 33542,
'subclass': 16231,
'dafoe': 48615,
'bestows': 47151,
'music': 166,
'orientalist': 31204,
'pectin': 42441,
'laes': 40621,
'few': 289,
'punks': 43509,
'sikhs': 15390,
'amnesty': 7609,
'harmonize': 34455,
'pledging': 37103,
'pronouncements': 32357,
'schuler': 43510,
'equitable': 17011,
'pula': 31898,
'ligature': 23740,
'xaf': 42442,
'ind': 37437,
'principles': 1557,
'correspondence': 4626,
'wikibook': 47877,
'ghostbusters': 40577,
'malts': 37104,
'kobayashi': 33303,
'sundance': 34505,
'geologists': 14178,
'uganda': 11065,
'peloponnesus': 36807,
'furigana': 21313,
'proportionally': 30144,
'punk': 4465,
'semifinals': 31449,
'mediator': 15171,
'rotates': 20240,
'deva': 31201,
'pomp': 30336,
'conductive': 19225,
'nitro': 30750,
'sexually': 6790,
'existentialist': 19886,
'richly': 22346,
'pressure': 1025,
'maneuver': 12647,
'honeywell': 20640,
'seneca': 17101,
'tunisia': 9346,
'historiographical': 45721,
'drains': 15815,
'melisende': 32358,
'office': 591,
'risc': 8891,
'hanns': 38940,
'archetypes': 27959,
'generalized': 7161,
'ris': 31457,
'julius': 4414,
'epochs': 24989,
'anthropomorphic': 14989,
'brock': 21890,
'decennial': 49877,
'tze': 34062,
'negation': 11707,
'truly': 4159,
'exploring': 7928,
'host': 1560,
'learned': 2662,
'transformers': 25103,
'retaliatory': 40394,
'consciously': 14122,
'editor': 2266,
'maori': 26479,
'morita': 25238,
'sda': 46579,
'merits': 10955,
'tails': 12592,
'schindler': 28788,
'banana': 10958,
'medes': 20993,
'eider': 33304,
'cac': 45277,
'cured': 13250,
'mistreatment': 25771,
'initiate': 11957,
'magdalen': 27829,
'anions': 29377,
'orfeo': 43559,
'mobility': 9227,
'wheeler': 14164,
'aeolus': 20781,
'fluids': 9785,
'expired': 13130,
'fsm': 14263,
'dorchester': 27825,
'bartos': 45278,
'artist': 1377,
'podkopayeva': 41578,
'mattel': 16369,
'diab': 30546,
'breviary': 34397,
'suffolk': 10343,
...}
In [15]:
dictionary['bbbb']
Out[15]:
30107
In [16]:
print(len(dictionary))
data = list()
unk_count = 0
50000
In [18]:
for word in words:
if word in dictionary:
index = dictionary[word]
else:
index = 0
unk_count += 1
data.append(index)
In [19]:
data
Out[19]:
[5243,
3083,
12,
6,
195,
2,
3134,
46,
59,
156,
128,
742,
477,
10636,
134,
1,
27494,
2,
1,
103,
855,
3,
1,
15181,
0,
2,
1,
151,
855,
3581,
1,
195,
11,
191,
59,
5,
6,
10740,
215,
7,
1326,
105,
455,
20,
59,
2734,
363,
7,
3675,
1,
709,
2,
372,
27,
41,
37,
54,
540,
98,
12,
6,
1426,
2760,
19,
568,
687,
7099,
1,
248,
5243,
11,
1053,
28,
1,
321,
249,
45850,
2878,
793,
187,
5243,
12,
6,
201,
603,
11,
1,
1135,
20,
2623,
26,
8987,
3,
280,
32,
4158,
142,
60,
26,
6445,
4190,
2,
154,
33,
363,
5243,
37,
1138,
7,
448,
345,
1819,
20,
4870,
1,
6764,
2,
7585,
1775,
567,
1,
94,
1,
248,
11121,
12,
52,
7099,
90,
27,
271,
38,
5956,
4863,
20493,
29,
0,
42,
318,
6,
25947,
528,
7585,
372,
5,
259,
2,
154,
26,
1207,
12,
7585,
201,
1578,
3,
15320,
333,
1775,
7099,
4870,
345,
765,
161,
407,
5703,
756,
2,
4116,
1132,
4338,
1537,
3,
568,
8132,
99,
5243,
11,
52,
1409,
687,
19,
154,
27,
11,
156,
7099,
37,
2035,
1426,
8187,
2,
154,
47,
694,
7,
32,
6,
4159,
247,
372,
77,
949,
79,
311,
31,
4790,
372,
508,
140,
2315,
3556,
365,
24,
1823,
7,
1906,
60,
11,
37,
8429,
79,
311,
6,
247,
372,
508,
32,
754,
79,
1737,
3,
8057,
24323,
3,
276,
1694,
20,
152,
1035,
96,
225,
372,
18,
1817,
25,
4790,
1557,
52,
8128,
1468,
24323,
3,
12773,
5,
6138,
20,
4189,
21241,
2432,
40,
16626,
3,
7245,
862,
2,
1195,
10210,
2515,
29,
15187,
188,
3,
49,
1124,
914,
7,
1050,
470,
12354,
7099,
134,
1,
1,
11121,
3060,
3,
12125,
735,
4790,
6601,
5,
14628,
28,
451,
486,
24323,
199,
296,
949,
5,
20133,
20528,
2,
0,
301,
7,
24323,
20528,
16615,
1,
30323,
2,
1,
94,
45,
3836,
3,
0,
3,
3813,
1,
3417,
2,
1,
1782,
188,
2,
1,
627,
1,
13187,
2,
4,
22,
91,
123,
285,
26,
244,
233,
7,
32,
437,
26684,
2,
174,
5243,
7735,
2538,
5,
30,
96,
2,
290,
603,
4316,
20,
1,
13187,
16615,
50,
188,
118,
47,
360,
20,
1,
385,
243,
97,
32,
5239,
35,
332,
2728,
19,
1,
839,
1378,
28,
33,
8782,
47,
6481,
35,
3246,
1,
27494,
29,
552,
42867,
40,
31,
128,
0,
436,
85,
1,
65,
2,
1,
103,
464,
84,
3,
26,
233,
19,
48,
12,
26684,
2,
174,
5243,
5,
1,
174,
840,
1,
46,
7,
90,
1,
195,
7,
1016,
1044,
43,
70,
4863,
18,
938,
15009,
4579,
165,
0,
5,
30,
42066,
11816,
22953,
260,
1226,
0,
0,
4,
23,
8,
17,
100,
36,
629,
1,
2665,
64,
372,
34,
49,
76,
94,
688,
14795,
4051,
29,
818,
918,
12,
102,
5,
11121,
2538,
363,
6,
6075,
3,
663,
5,
1,
64,
702,
436,
41,
4219,
1290,
20,
36,
11,
31,
4790,
3,
95,
26,
50,
30,
5224,
5,
4,
23,
9,
17,
5,
1,
5360,
2,
1,
151,
855,
470,
12354,
335,
31,
17205,
2435,
201,
1294,
142,
12354,
216,
38,
90,
1,
248,
5243,
56,
117,
7099,
39,
1207,
33,
198,
12,
1,
46,
182,
4790,
530,
3,
12354,
12,
1,
1837,
2,
1916,
5243,
42,
35,
33,
229,
76,
4790,
436,
734,
2405,
3,
1,
195,
0,
18,
87,
1074,
12,
31,
14096,
30669,
19,
1,
12111,
0,
35,
51,
2421,
617,
5,
1,
151,
855,
1,
46,
568,
8738,
4790,
2938,
1034,
14526,
27,
11,
637,
360,
20,
27,
5392,
155,
192,
2938,
1034,
14526,
335,
154,
11,
918,
5,
4,
13,
21,
8,
20,
1,
195,
4790,
18,
990,
12,
6,
568,
1609,
27,
11,
14,
33,
891,
20,
48,
849,
14526,
12,
1,
1837,
2,
174,
4790,
209,
5,
154,
11,
918,
14526,
5469,
24,
1,
420,
12998,
918,
11,
6896,
5,
33,
140,
36,
1347,
1,
2881,
2,
15187,
918,
0,
155,
100,
3468,
39,
653,
446,
7,
90,
3,
3134,
44,
918,
12,
47,
3592,
55,
12,
15486,
1556,
14,
2966,
5,
45,
259,
14526,
1166,
154,
36,
89,
3226,
1132,
53,
39,
752,
446,
7,
90,
1050,
523,
3,
1794,
5,
5819,
24,
1557,
2,
5132,
3,
1294,
14526,
15,
2657,
2,
11121,
34,
36,
89,
29397,
0,
968,
31,
1335,
519,
100,
1132,
3,
309,
160,
513,
1,
807,
2,
44,
1259,
219,
1259,
1038,
34,
1392,
1,
909,
2,
742,
65,
968,
5,
443,
33,
74,
2918,
20,
76,
4,
74,
2966,
28,
1,
1259,
2,
276,
1556,
160,
4330,
2522,
499,
5,
728,
10660,
18938,
31,
763,
247,
987,
74,
32,
178,
98,
7,
741,
3273,
24,
914,
7,
1,
363,
2,
443,
14526,
15,
949,
40,
2026,
223,
151,
742,
477,
1819,
3,
30,
2987,
40,
1022,
5,
1,
855,
2,
4,
13,
21,
13,
5,
303,
14526,
15,
603,
2,
918,
11,
648,
27,
18,
348,
5,
6,
113,
2,
312,
80,
30,
3574,
3,
60,
26,
6445,
4190,
2,
48,
2,
30,
949,
14,
51,
2161,
1590,
68,
622,
2750,
26441,
15,
15765,
5,
30,
1,
10476,
3,
45,
200,
26441,
1461,
20,
52,
637,
1047,
345,
1775,
134,
1,
2582,
2,
94,
918,
12,
6,
245,
390,
446,
5,
153,
3,
1,
148,
2582,
2,
372,
40,
4583,
21019,
29,
9221,
5,
1,
941,
1975,
2,
372,
20,
1,
1132,
26,
45,
1730,
36,
5233,
15765,
3,
6,
144,
2,
0,
5,
34,
1132,
74,
9758,
5,
5839,
2,
0,
67,
62,
27,
18,
5,
44,
568,
763,
7,
176,
95,
14,
119,
918,
685,
1147,
79,
124,
508,
13107,
5578,
311,
7,
370,
7,
4115,
1,
1799,
7,
119,
5470,
918,
3,
154,
72,
39,
5,
609,
184,
20,
11,
609,
200,
95,
185,
12,
72,
6196,
9049,
12,
7204,
72,
1226,
1,
34751,
2,
1,
1799,
26441,
364,
89,
392,
31,
4790,
36,
1047,
67,
1,
2760,
42709,
2070,
30,
949,
40,
2026,
25,
56,
0,
9936,
7099,
142,
4190,
2,
30,
502,
26,
3237,
...]
In [20]:
unk_count
Out[20]:
418391
In [21]:
count[0][1] = unk_count
In [22]:
count[0]
Out[22]:
['UNK', 418391]
In [23]:
reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
In [24]:
del words
In [26]:
print(count[:15])
[['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764), ('in', 372201), ('a', 325873), ('to', 316376), ('zero', 264975), ('nine', 250430), ('two', 192644), ('is', 183153), ('as', 131815), ('eight', 125285), ('for', 118445)]
In [28]:
print(data[:10], [reverse_dictionary[i] for i in data[:10]])
[5243, 3083, 12, 6, 195, 2, 3134, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
In [29]:
data_index = 0
In [30]:
batch = np.ndarray(shape=(8), dtype=np.int32)
labels = np.ndarray(shape=(8,1), dtype=np.int32)
In [32]:
span = 2 * 1 + 1
buffer = collections.deque(maxlen=span)
for _ in range(span):
buffer.append(data[data_index])
data_index = (data_index + 1) % len(data)
In [36]:
for i in range(8 // 2):
target = 1
targets_to_avoid = [1]
for j in range(2):
while target in targets_to_avoid:
target = random.randint(0, span -1)
target_to_avoid.append(target)
batch[i * 2 + j] = buffer[1]
labels[i * 2 + j, 0] = buffer[target]
buffer.append(data[data_index])
data_index = (data_index + len(data) - span) % len(data)
In [37]:
batch
Out[37]:
array([3083, 3083, 12, 12, 6, 6, 5243, 5243], dtype=int32)
In [38]:
labels
Out[38]:
array([[ 12],
[ 12],
[3083],
[3083],
[5243],
[5243],
[ 6],
[ 6]], dtype=int32)
In [39]:
data_index
Out[39]:
17005198
In [40]:
for i in range(8):
print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])
3083 originated -> 12 as
3083 originated -> 12 as
12 as -> 3083 originated
12 as -> 3083 originated
6 a -> 5243 anarchism
6 a -> 5243 anarchism
5243 anarchism -> 6 a
5243 anarchism -> 6 a
In [64]:
batch_size = 128
embedding_size = 128
skip_window = 1
num_skips = 2
valid_size = 16
valid_window = 100
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64
In [42]:
valid_examples
Out[42]:
array([37, 88, 29, 72, 52, 54, 21, 73, 74, 5, 19, 69, 53, 36, 25, 18])
In [57]:
vocabulary_size = 50000
graph = tf.Graph()
with graph.as_default():
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
with tf.device('/cpu:0'):
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, train_inputs)
nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
loss = tf.reduce_mean(
tf.nn.nce_loss(weights=nce_weights,
biases=nce_biases,
labels=train_labels,
inputs=embed,
num_sampled=num_sampled,
num_classes=vocabulary_size))
optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
init = tf.global_variables_initializer()
In [52]:
print(embeddings)
Tensor("Variable/read:0", shape=(50000, 128), dtype=float32, device=/device:CPU:0)
In [59]:
def generate_batch(batch_size, num_skips, skip_window):
global data_index
assert batch_size % num_skips == 0
assert num_skips <= 2 * skip_window
batch = np.ndarray(shape=(batch_size), dtype=np.int32)
labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
span = 2 * skip_window + 1 # [ skip_window target skip_window ]
buffer = collections.deque(maxlen=span)
for _ in range(span):
buffer.append(data[data_index])
data_index = (data_index + 1) % len(data)
for i in range(batch_size // num_skips):
target = skip_window # target label at the center of the buffer
targets_to_avoid = [skip_window]
for j in range(num_skips):
while target in targets_to_avoid:
target = random.randint(0, span - 1)
targets_to_avoid.append(target)
batch[i * num_skips + j] = buffer[skip_window]
labels[i * num_skips + j, 0] = buffer[target]
buffer.append(data[data_index])
data_index = (data_index + 1) % len(data)
# Backtrack a little bit to avoid skipping words in the end of a batch
data_index = (data_index + len(data) - span) % len(data)
return batch, labels
In [60]:
num_steps = 100001
In [94]:
with tf.Session(graph=graph) as session:
init.run()
print("init..")
average_loss = 0
for step in xrange(num_steps):
batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
feed_dict = {train_inputs:batch_inputs, train_labels:batch_labels}
_, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
average_loss += loss_val
if step%2000 == 0:
if step > 0:
average_loss /= 2000
print("Average loss at step", step, ":", average_loss)
average_loss = 0
if step%10000 == 0:
sim = similarity.eval()
for i in xrange(valid_size):
valid_word = reverse_dictionary[valid_examples[i]]
top_k = 8
nearest = (-sim[i,:]).argsort()[1:top_k + 1]
# print(nearest)
log_str = "Nearest to %s:" %valid_word
for k in xrange(top_k):
# print(k)
close_word = reverse_dictionary[nearest[k]]
log_str = "%s %s," % (log_str, close_word)
print(log_str)
final_embeddings = normalized_embeddings.eval()
init..
Average loss at step 0 : 255.736083984
Nearest to it: materiel, there, larry, boundaries, methylene, aerobatics, macrovision, submarines,
Nearest to s: dietary, besieged, usually, autistics, welfare, malvaceae, autocode, guevara,
Nearest to new: geocentric, parametric, confirm, scavenging, infected, impenetrable, embalmed, depriving,
Nearest to six: science, siliceous, strenuous, flunitrazepam, screenplay, ym, opec, syringe,
Nearest to time: expecting, methodism, mesolithic, thorns, manzikert, tess, trunks, rectum,
Nearest to people: mango, attract, clothes, secessionist, superintendence, profiles, flawless, acquis,
Nearest to there: kyanite, strawberry, mak, stanzas, tsvetaeva, stockpile, inheriting, macross,
Nearest to three: exporter, meteors, management, jakob, eclecticism, genitive, nephites, endangering,
Nearest to united: outstripped, clinging, yvonne, joked, connolly, ineffectiveness, bytes, nazareth,
Nearest to over: southeast, verdeans, debaters, spate, lair, solstice, minimi, sloti,
Nearest to has: coarse, electing, instruct, jaffe, vertical, libration, discontinuities, canvas,
Nearest to also: lager, immaculate, anemometers, arctos, crisp, anteaters, belgium, blackwell,
Nearest to five: utr, tachyon, corroborating, sultanates, roskilde, disillusionment, ingredient, built,
Nearest to in: dns, dreamer, conjunctions, jeremiah, whims, consignee, methanol, educating,
Nearest to that: doses, mathematische, main, prepositional, gash, ryaku, alcoholic, fleischmann,
Nearest to had: cke, tte, accessing, coherence, submission, peacocks, war, uk,
Average loss at step 2000 : 113.675760889
Average loss at step 4000 : 53.0733066626
Average loss at step 6000 : 32.8648398995
Average loss at step 8000 : 23.5830843294
Average loss at step 10000 : 17.5470844959
Nearest to it: there, richer, kiang, control, it, patriarch, each, event,
Nearest to s: dietary, usually, vessels, welfare, deviation, besieged, om, britain,
Nearest to new: and, UNK, the, is, reconcile, parents, two, thousands,
Nearest to six: science, UNK, adopted, deluded, ten, ell, widehat, weakness,
Nearest to time: lithium, expecting, methodism, UNK, humor, sometimes, band, diver,
Nearest to people: wrote, psychology, attack, UNK, luncheon, attract, yamato, sided,
Nearest to there: two, zero, arch, kiang, widehat, seven, one, UNK,
Nearest to three: management, sacrament, clockwork, pair, spins, echidna, mob, patience,
Nearest to united: woke, kiang, eph, husband, spectroscopy, voiceless, rain, pigeon,
Nearest to over: and, of, from, on, for, to, UNK, by,
Nearest to has: UNK, in, and, as, was, with, supported, ferus,
Nearest to also: flagella, requests, can, blackwell, UNK, belgium, exile, hermann,
Nearest to five: nine, disillusionment, UNK, utr, iteration, may, hilbert, built,
Nearest to in: it, dns, readers, judaism, zero, so, jeremiah, and,
Nearest to that: in, main, UNK, waterford, one, and, of, doses,
Nearest to had: is, by, supports, in, dec, has, so, and,
Average loss at step 12000 : 13.7723090439
Average loss at step 14000 : 11.5381830882
Average loss at step 16000 : 9.67109122252
Average loss at step 18000 : 8.84459714115
Average loss at step 20000 : 8.18229899728
Nearest to it: there, richer, it, taunus, that, which, still, patriarch,
Nearest to s: dietary, usually, vessels, om, welfare, should, deviation, sui,
Nearest to new: and, two, brie, zero, three, eight, is, taunus,
Nearest to six: science, strenuous, lindsay, litre, deluded, ten, drag, weakness,
Nearest to time: lithium, ovary, expecting, coolidge, methodism, diver, part, humor,
Nearest to people: is, wrote, marino, amine, had, psychology, by, galesburg,
Nearest to there: zero, seven, two, nine, eight, one, kiang, six,
Nearest to three: management, galesburg, sacrament, conjunctions, above, narrators, exporter, clockwork,
Nearest to united: to, bluffs, woke, eph, can, is, edged, will,
Nearest to over: and, of, from, at, for, on, by, kiang,
Nearest to has: and, was, in, as, for, amine, with, from,
Nearest to also: can, flagella, craft, blackwell, requests, neo, to, nt,
Nearest to five: may, to, iteration, disillusionment, nine, utr, cauchy, diocese,
Nearest to in: it, dns, xs, she, who, and, readers, so,
Nearest to that: in, main, benzyl, for, with, and, hak, waterford,
Nearest to had: is, by, has, and, were, dec, taunus, are,
Average loss at step 22000 : 7.31877236044
Average loss at step 24000 : 6.7772724818
Average loss at step 26000 : 7.22241799635
Average loss at step 28000 : 6.13721638286
Average loss at step 30000 : 6.07491606796
Nearest to it: there, richer, it, which, that, still, taunus, interrupt,
Nearest to s: dietary, should, vessels, usually, inertia, welfare, om, deviation,
Nearest to new: and, barb, tetra, seven, three, brie, six, eight,
Nearest to six: science, strenuous, modula, they, deluded, four, flunitrazepam, lindsay,
Nearest to time: amd, coolidge, lithium, ovary, methodism, diver, expecting, auschwitz,
Nearest to people: had, be, was, by, marino, were, wrote, amine,
Nearest to there: seven, eight, six, two, three, five, one, nine,
Nearest to three: d, galesburg, management, narrators, koontz, ffts, conjunctions, above,
Nearest to united: will, can, to, may, eph, bluffs, woke, rain,
Nearest to over: and, on, at, from, for, of, with, nine,
Nearest to has: was, in, with, and, as, for, bytecode, from,
Nearest to also: can, lager, to, flagella, nt, would, requests, craft,
Nearest to five: may, should, would, will, to, could, iteration, disillusionment,
Nearest to in: it, who, she, they, dns, xs, and, so,
Nearest to that: in, main, and, benzyl, for, waterford, bytecode, hak,
Nearest to had: is, were, by, has, sparc, frites, be, sibilant,
Average loss at step 32000 : 5.8268452996
Average loss at step 34000 : 5.61028611898
Average loss at step 36000 : 5.45080244505
Average loss at step 38000 : 5.56756269145
Average loss at step 40000 : 5.57747373378
Nearest to it: there, richer, which, still, it, that, not, taunus,
Nearest to s: dietary, when, and, should, where, vessels, om, inertia,
Nearest to new: and, barb, tetra, gru, glas, stadtbahn, eight, seven,
Nearest to six: they, strenuous, science, gino, we, modula, deluded, lindsay,
Nearest to time: coolidge, ovary, diver, lithium, glas, methodism, amd, gru,
Nearest to people: be, was, had, by, were, galesburg, amine, marino,
Nearest to there: six, five, eight, seven, three, zero, two, nine,
Nearest to three: d, management, narrators, mm, above, sacrament, celestial, galesburg,
Nearest to united: can, will, may, to, should, steen, must, rain,
Nearest to over: and, at, from, on, with, of, for, taunus,
Nearest to has: was, in, with, for, from, be, bytecode, and,
Nearest to also: can, would, will, should, lager, arctos, flagella, nt,
Nearest to five: may, would, should, will, could, must, to, might,
Nearest to in: it, she, who, they, dns, xs, and, toppled,
Nearest to that: in, main, gru, benzyl, and, from, hak, waterford,
Nearest to had: is, were, has, by, had, proleptic, sparc, be,
Average loss at step 42000 : 5.45841832423
Average loss at step 44000 : 5.33954357231
Average loss at step 46000 : 5.28882217085
Average loss at step 48000 : 5.19656235886
Average loss at step 50000 : 5.02684125626
Nearest to it: which, there, richer, that, still, it, sometimes, taunus,
Nearest to s: dietary, when, where, should, vessels, inertia, om, gordie,
Nearest to new: and, barb, tetra, gru, glas, stadtbahn, brie, UNK,
Nearest to six: they, strenuous, we, science, siliceous, gino, she, ii,
Nearest to time: coolidge, more, ovary, diver, auschwitz, glas, gru, methodism,
Nearest to people: was, be, had, were, by, marino, become, galesburg,
Nearest to there: six, five, seven, three, eight, two, zero, nine,
Nearest to three: d, UNK, narrators, eight, mm, management, celestial, four,
Nearest to united: can, will, may, to, should, must, steen, cannot,
Nearest to over: on, at, from, and, during, nine, with, of,
Nearest to has: was, bytecode, with, for, were, be, from, amine,
Nearest to also: can, would, will, should, could, lager, nt, arctos,
Nearest to five: may, would, will, should, could, must, to, might,
Nearest to in: it, she, they, who, zero, dns, there, xs,
Nearest to that: in, gru, main, benzyl, with, at, into, hak,
Nearest to had: is, were, has, had, by, proleptic, when, became,
Average loss at step 52000 : 5.11451469243
Average loss at step 54000 : 5.0601797725
Average loss at step 56000 : 4.99067440438
Average loss at step 58000 : 5.02971422517
Average loss at step 60000 : 5.02644173288
Nearest to it: which, there, still, richer, that, sometimes, not, it,
Nearest to s: when, dietary, landesverband, where, should, inertia, hyi, vessels,
Nearest to new: and, hyi, landesverband, barb, gru, tetra, stadtbahn, four,
Nearest to six: strenuous, ii, they, we, gino, siliceous, UNK, science,
Nearest to time: more, coolidge, diver, many, ass, ovary, landesverband, auschwitz,
Nearest to people: be, had, was, were, by, become, never, also,
Nearest to there: five, six, three, seven, eight, nine, two, zero,
Nearest to three: d, UNK, four, mm, narrators, directness, eight, two,
Nearest to united: can, will, may, should, must, to, cannot, could,
Nearest to over: during, at, landesverband, from, and, on, stadtbahn, hyi,
Nearest to has: was, in, with, be, landesverband, bytecode, amine, from,
Nearest to also: can, would, will, should, could, cannot, lager, nt,
Nearest to five: may, will, would, should, could, must, might, cannot,
Nearest to in: it, she, they, who, there, zero, bloody, landesverband,
Nearest to that: in, gru, benzyl, landesverband, at, main, upon, bytecode,
Nearest to had: is, has, were, had, by, be, landesverband, proleptic,
Average loss at step 62000 : 4.91109006059
Average loss at step 64000 : 4.94881534314
Average loss at step 66000 : 4.87608850288
Average loss at step 68000 : 4.89872267771
Average loss at step 70000 : 4.62613409173
Nearest to it: which, still, there, richer, sometimes, that, not, it,
Nearest to s: when, where, dietary, landesverband, should, though, inertia, hyi,
Nearest to new: and, landesverband, hyi, gru, four, than, barb, stadtbahn,
Nearest to six: they, we, strenuous, ii, gino, she, siliceous, he,
Nearest to time: more, many, coolidge, ass, ovary, diver, yamato, landesverband,
Nearest to people: be, was, become, were, had, by, galesburg, never,
Nearest to there: five, six, three, seven, eight, two, nine, landesverband,
Nearest to three: d, four, UNK, diluted, narrators, three, two, landsmannschaft,
Nearest to united: can, will, may, should, must, could, cannot, to,
Nearest to over: on, during, at, from, landesverband, and, taunus, with,
Nearest to has: was, be, bytecode, landesverband, in, were, amine, for,
Nearest to also: can, would, will, should, could, cannot, must, nt,
Nearest to five: may, will, would, should, could, must, might, cannot,
Nearest to in: it, she, they, who, there, zero, landesverband, dns,
Nearest to that: in, gru, main, landesverband, benzyl, at, and, bytecode,
Nearest to had: is, has, were, had, by, became, proleptic, be,
Average loss at step 72000 : 4.75129728556
Average loss at step 74000 : 4.84498063374
Average loss at step 76000 : 4.77063242346
Average loss at step 78000 : 4.7683170042
Average loss at step 80000 : 4.67875431442
Nearest to it: which, still, sometimes, richer, who, generally, there, that,
Nearest to s: when, where, landesverband, dietary, though, hyi, before, inertia,
Nearest to new: and, landesverband, hyi, gru, barb, than, stadtbahn, but,
Nearest to six: we, they, strenuous, ii, she, gino, deluded, t,
Nearest to time: more, many, coolidge, ass, diver, ovary, ecommerce, zut,
Nearest to people: be, become, were, was, had, by, inquest, galesburg,
Nearest to there: five, six, seven, three, eight, two, zero, nine,
Nearest to three: d, UNK, four, diluted, three, narrators, m, riso,
Nearest to united: will, can, may, should, must, could, might, cannot,
Nearest to over: during, at, landesverband, on, nine, hyi, from, stadtbahn,
Nearest to has: was, be, with, as, in, bytecode, landesverband, were,
Nearest to also: can, would, will, could, should, cannot, must, might,
Nearest to five: may, will, would, should, could, must, might, cannot,
Nearest to in: it, she, they, who, there, seven, zero, dns,
Nearest to that: in, main, gru, upon, landesverband, benzyl, into, through,
Nearest to had: is, were, has, had, became, proleptic, by, be,
Average loss at step 82000 : 4.69178952706
Average loss at step 84000 : 4.48726032615
Average loss at step 86000 : 4.4134451797
Average loss at step 88000 : 4.48546737504
Average loss at step 90000 : 4.51784078217
Nearest to it: which, still, sometimes, richer, who, it, generally, often,
Nearest to s: when, where, landesverband, dietary, though, before, should, but,
Nearest to new: and, landesverband, hyi, barb, gru, than, but, stadtbahn,
Nearest to six: we, they, ii, strenuous, t, you, she, icj,
Nearest to time: more, many, coolidge, ass, diver, ovary, highly, zut,
Nearest to people: be, become, was, had, were, by, inquest, galesburg,
Nearest to there: five, six, seven, three, eight, zero, nine, two,
Nearest to three: d, clockwork, celestial, closing, fenrir, canadian, french, ffts,
Nearest to united: will, can, may, should, must, could, might, cannot,
Nearest to over: during, at, from, landesverband, of, within, on, hyi,
Nearest to has: bytecode, be, was, landesverband, were, for, with, is,
Nearest to also: can, would, will, could, should, cannot, must, might,
Nearest to five: will, would, may, should, could, must, might, cannot,
Nearest to in: it, she, they, who, there, but, landesverband, and,
Nearest to that: in, gru, main, upon, two, at, benzyl, landesverband,
Nearest to had: is, had, has, were, became, when, be, landesverband,
Average loss at step 92000 : 4.58907672191
Average loss at step 94000 : 4.60324448931
Average loss at step 96000 : 4.62912453699
Average loss at step 98000 : 4.52983195758
Average loss at step 100000 : 4.70787091136
Nearest to it: which, still, sometimes, often, richer, now, generally, then,
Nearest to s: when, where, landesverband, dietary, should, though, before, hyi,
Nearest to new: and, landesverband, hyi, barb, gru, than, stadtbahn, taunus,
Nearest to six: we, they, ii, strenuous, you, t, she, deluded,
Nearest to time: more, many, coolidge, some, ass, highly, ovary, diver,
Nearest to people: be, become, was, had, were, by, inquest, galesburg,
Nearest to there: five, three, six, seven, eight, two, landesverband, zero,
Nearest to three: d, nine, fenrir, clockwork, ffts, celestial, riso, closing,
Nearest to united: will, can, may, should, could, must, might, cannot,
Nearest to over: during, at, landesverband, from, on, hyi, within, stadtbahn,
Nearest to has: was, bytecode, with, be, for, landesverband, in, were,
Nearest to also: can, would, could, will, should, cannot, must, might,
Nearest to five: will, would, could, may, should, must, might, cannot,
Nearest to in: she, it, they, who, there, but, zero, zut,
Nearest to that: in, upon, gru, through, two, benzyl, main, at,
Nearest to had: is, has, had, were, became, proleptic, by, been,
In [80]:
sim[0,:].argsort()[1:9]
Out[80]:
array([ 0.14506491, -0.12284749, 0.00486149, ..., 0.13809943,
0.2045745 , -0.09087169], dtype=float32)
In [92]:
with tf.Session(graph=graph) as session:
# We must initialize all variables before we use them.
init.run()
print("Initialized")
average_loss = 0
for step in xrange(num_steps):
batch_inputs, batch_labels = generate_batch(
batch_size, num_skips, skip_window)
feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
# We perform one update step by evaluating the optimizer op (including it
# in the list of returned values for session.run()
_, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
average_loss += loss_val
if step % 2000 == 0:
if step > 0:
average_loss /= 2000
# The average loss is an estimate of the loss over the last 2000 batches.
print("Average loss at step ", step, ": ", average_loss)
average_loss = 0
# Note that this is expensive (~20% slowdown if computed every 500 steps)
if step % 10000 == 0:
sim = similarity.eval()
for i in xrange(valid_size):
valid_word = reverse_dictionary[valid_examples[i]]
top_k = 8 # number of nearest neighbors
nearest = (-sim[i, :]).argsort()[1:top_k + 1]
print(nearest)
print(-sim[i, :].argsort()[1:8])
log_str = "Nearest to %s:" % valid_word
for k in xrange(top_k):
close_word = reverse_dictionary[nearest[k]]
log_str = "%s %s," % (log_str, close_word)
print(log_str)
final_embeddings = normalized_embeddings.eval()
Initialized
Average loss at step 0 : 267.653686523
[23017 37668 20779 40596 49225 22181 15769 43092]
[-19431 -6871 -35573 -11581 -26246 -158 -5486]
Nearest to it: menelik, unscathed, hannover, inositol, mutinies, jays, courtyard, abol,
[39444 14898 15037 253 25775 4823 15385 37577]
[ -3372 -45240 -46665 -316 -45683 -42560 -17680]
Nearest to s: bst, undertaking, essenes, given, antiparticle, kick, heather, amulets,
[ 5903 35781 2644 25624 36358 14144 41578 11243]
[-44743 -3361 -5499 -17097 -36631 -43582 -33402]
Nearest to new: merchants, leucine, selling, pai, meander, pricing, podkopayeva, lone,
[49260 32636 37792 18909 42806 29688 15682 41013]
[-35746 -45707 -28887 -27463 -5251 -27165 -23507]
Nearest to six: cinq, bulky, magnetopause, clare, somber, andalus, seleucid, grappelli,
[20030 14446 27308 32091 22815 5447 6657 9685]
[-40277 -3259 -27551 -10944 -7551 -24546 -39634]
Nearest to time: dryden, immersion, outwardly, philippi, barth, viruses, rotating, proofs,
[31109 2635 26359 20538 9847 10636 26306 49823]
[-47708 -48278 -3449 -3204 -23017 -5894 -28781]
Nearest to people: squeezed, harry, antiparticles, stigma, catcher, radicals, chamorro, dwan,
[13459 29231 39063 33468 48458 1600 9191 38290]
[-26961 -49831 -42283 -32985 -40115 -40326 -18662]
Nearest to there: secretariat, overfishing, coens, inappropriately, isospin, winning, defects, suvs,
[15558 3390 18952 42936 3347 4767 25593 20553]
[-45016 -45842 -19470 -16778 -11527 -40040 -49831]
Nearest to three: radcliffe, fighter, pohl, timeout, seconds, convinced, kalmar, tunings,
[20169 24911 26849 17138 28076 20595 14942 9926]
[ -5543 -30130 -27931 -43727 -39159 -7927 -48209]
Nearest to united: yamato, hamburger, carnatic, explanatory, wrongful, encyclopedic, airliner, complaint,
[31641 13614 5062 6990 6060 31938 41788 24189]
[-47953 -47313 -1549 -33828 -12352 -4629 -16517]
Nearest to over: glycol, rupert, latitude, analytic, senses, renting, istv, singularities,
[ 3005 48251 35086 8387 792 29220 30588 35183]
[ -1275 -27658 -43439 -26604 -43495 -23704 -17998]
Nearest to has: execution, newsreels, flirting, precious, peace, neanderthal, tantra, tov,
[29086 44422 2121 26496 39289 18254 32625 14867]
[-41716 -13622 -49166 -13495 -24972 -21518 -32721]
Nearest to also: obsidian, fiesole, cable, guessing, unconquered, revolts, mathematik, nursing,
[14473 15672 17536 45503 9325 44134 44026 3050]
[ -5886 -10869 -43279 -17069 -7071 -34870 -104]
Nearest to five: framed, minerva, detectable, dever, centred, pwnage, fairway, residence,
[42795 18458 28605 25338 7191 7133 2862 39895]
[ -983 -11790 -11894 -34411 -31675 -8378 -21107]
Nearest to in: auditioning, dea, unwieldy, cartilage, cartoonist, junction, spaces, preis,
[10601 47267 20049 13581 5912 27067 33187 7477]
[-37048 -16705 -22946 -12769 -40029 -23071 -26514]
Nearest to that: sanctioned, occidentalis, transsexual, navarre, reduces, alexandrine, lakers, infections,
[ 6826 5386 48246 30123 5310 10635 49416 39416]
[ -7026 -20849 -36185 -969 -28863 -43325 -565]
Nearest to had: bowling, commitment, weaning, stately, throw, fermi, eusebio, pentecostalism,
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
<ipython-input-92-deb0fe55db90> in <module>()
12 # We perform one update step by evaluating the optimizer op (including it
13 # in the list of returned values for session.run()
---> 14 _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
15 average_loss += loss_val
16
/usr/local/lib/python3.5/site-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
764 try:
765 result = self._run(None, fetches, feed_dict, options_ptr,
--> 766 run_metadata_ptr)
767 if run_metadata:
768 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
/usr/local/lib/python3.5/site-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
962 if final_fetches or final_targets:
963 results = self._do_run(handle, final_targets, final_fetches,
--> 964 feed_dict_string, options, run_metadata)
965 else:
966 results = []
/usr/local/lib/python3.5/site-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1012 if handle is None:
1013 return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
-> 1014 target_list, options, run_metadata)
1015 else:
1016 return self._do_call(_prun_fn, self._session, handle, feed_dict,
/usr/local/lib/python3.5/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1019 def _do_call(self, fn, *args):
1020 try:
-> 1021 return fn(*args)
1022 except errors.OpError as e:
1023 message = compat.as_text(e.message)
/usr/local/lib/python3.5/site-packages/tensorflow/python/client/session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
1001 return tf_session.TF_Run(session, options,
1002 feed_dict, fetch_list, target_list,
-> 1003 status, run_metadata)
1004
1005 def _prun_fn(session, handle, feed_dict, fetch_list):
KeyboardInterrupt:
In [99]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
plt.figure(figsize=(18,18))
Out[99]:
<matplotlib.figure.Figure at 0x15181bc18>
In [106]:
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
plot_only = 500
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only,:])
labels = [reverse_dictionary[i] for i in xrange(plot_only)]
for i, label in enumerate(labels):
x, y = low_dim_embs[i, :]
plt.scatter(x, y)
plt.annotate(label, xy=(x,y), xytext=(5,2), textcoords='offset points', ha='right', va='bottom')
In [107]:
plt.show()
In [108]:
type(final_embeddings)
Out[108]:
numpy.ndarray
In [110]:
final_embeddings[:50, :]
Out[110]:
array([[ 0.02535484, -0.21756044, -0.04685999, ..., -0.04695626,
0.13432126, -0.21053489],
[-0.04397225, -0.07383451, -0.04950885, ..., 0.04359524,
0.05993943, -0.0320558 ],
[ 0.06078843, -0.08979254, -0.02595252, ..., 0.08226719,
0.18126218, -0.05112049],
...,
[-0.03486119, -0.20499289, -0.08066029, ..., 0.12775902,
-0.14620858, -0.06338742],
[ 0.05147786, 0.0296588 , -0.13042139, ..., -0.00699105,
-0.01828911, -0.06534279],
[-0.0376003 , 0.02312498, 0.02433509, ..., 0.15502302,
-0.08109225, 0.01435901]], dtype=float32)
In [112]:
low_dim_embs.shape
Out[112]:
(500, 2)
In [113]:
labels
Out[113]:
['UNK',
'the',
'of',
'and',
'one',
'in',
'a',
'to',
'zero',
'nine',
'two',
'is',
'as',
'eight',
'for',
's',
'five',
'three',
'was',
'by',
'that',
'four',
'six',
'seven',
'with',
'on',
'are',
'it',
'from',
'or',
'his',
'an',
'be',
'this',
'which',
'at',
'he',
'also',
'not',
'have',
'were',
'has',
'but',
'other',
'their',
'its',
'first',
'they',
'some',
'had',
'all',
'more',
'most',
'can',
'been',
'such',
'many',
'who',
'new',
'used',
'there',
'after',
'when',
'into',
'american',
'time',
'these',
'only',
'see',
'may',
'than',
'world',
'i',
'b',
'would',
'd',
'no',
'however',
'between',
'about',
'over',
'years',
'states',
'people',
'war',
'during',
'united',
'known',
'if',
'called',
'use',
'th',
'system',
'often',
'state',
'so',
'history',
'will',
'up',
'while',
'where',
'city',
'being',
'english',
'then',
'any',
'both',
'under',
'out',
'made',
'well',
'her',
'e',
'number',
'government',
'them',
'm',
'later',
'since',
'him',
'part',
'name',
'c',
'century',
'through',
'because',
'x',
'university',
'early',
'life',
'british',
'year',
'like',
'same',
'including',
'became',
'example',
'day',
'each',
'even',
'work',
'language',
'although',
'several',
'form',
'john',
'u',
'national',
'very',
'much',
'g',
'french',
'before',
'general',
'what',
't',
'against',
'n',
'high',
'links',
'could',
'based',
'those',
'now',
'second',
'de',
'music',
'another',
'large',
'she',
'f',
'external',
'german',
'different',
'modern',
'great',
'do',
'common',
'set',
'list',
'south',
'series',
'major',
'game',
'power',
'long',
'country',
'king',
'law',
'group',
'film',
'still',
'until',
'north',
'international',
'term',
'we',
'end',
'book',
'found',
'own',
'political',
'party',
'order',
'usually',
'president',
'church',
'you',
'death',
'theory',
'area',
'around',
'include',
'god',
'ii',
'way',
'did',
'military',
'population',
'using',
'though',
'small',
'following',
'within',
'non',
'human',
'left',
'main',
'among',
'point',
'r',
'due',
'p',
'considered',
'public',
'popular',
'computer',
'west',
'family',
'east',
'information',
'important',
'european',
'man',
'sometimes',
'right',
'old',
'free',
'word',
'without',
'last',
'us',
'members',
'given',
'times',
'roman',
'make',
'h',
'age',
'place',
'l',
'thus',
'science',
'case',
'become',
'systems',
'union',
'born',
'york',
'line',
'countries',
'does',
'isbn',
'st',
'control',
'various',
'others',
'house',
'article',
'island',
'should',
'led',
'back',
'period',
'player',
'europe',
'languages',
'central',
'water',
'few',
'western',
'home',
'began',
'generally',
'less',
'k',
'similar',
'written',
'original',
'best',
'must',
'according',
'school',
'france',
'air',
'single',
'force',
'v',
'land',
'groups',
'down',
'how',
'works',
'development',
'official',
'support',
'england',
'j',
'rather',
'space',
'data',
'greek',
'km',
'named',
'germany',
'just',
'games',
'said',
'version',
'late',
'earth',
'company',
'every',
'economic',
'short',
'published',
'black',
'army',
'off',
'london',
'million',
'body',
'field',
'christian',
'either',
'social',
'empire',
'o',
'developed',
'standard',
'court',
'service',
'kingdom',
'along',
'college',
'republic',
'sea',
'america',
'today',
'result',
'held',
'team',
'light',
'means',
'never',
'especially',
'third',
'further',
'character',
'forces',
'take',
'men',
'society',
'show',
'open',
'possible',
'fact',
'battle',
'took',
'former',
'books',
'soviet',
'river',
'children',
'having',
'good',
'local',
'son',
'current',
'process',
'natural',
'present',
'himself',
'islands',
'total',
'near',
'white',
'days',
'person',
'itself',
'seen',
'culture',
'little',
'above',
'software',
'largest',
'words',
'upon',
'level',
'father',
'side',
'created',
'red',
'references',
'press',
'full',
'region',
'almost',
'image',
'al',
'famous',
'play',
'came',
'role',
'once',
'certain',
'league',
'jewish',
'james',
'january',
'site',
'again',
'art',
'numbers',
'member',
'areas',
'movement',
'religious',
'type',
'march',
'community',
'story',
'played',
'production',
'released',
'center',
'rights',
'real',
'related',
'foreign',
'low',
'ancient',
'terms',
'view',
'source',
'act',
'minister',
'change',
'energy',
'produced',
'research',
'actor',
'making',
'december',
'civil',
'women',
'special',
'style',
'japanese',
'design',
'william',
'available',
'chinese',
'forms',
'canada',
'northern',
'died',
'class',
'living',
'next',
'particular',
'program',
'council',
'television',
'head',
'david',
'china',
'middle',
'established',
'hand',
'bc',
'far',
'july',
'function',
'position',
'y',
'built',
'george',
'band',
'together']
In [115]:
low_dim_embs[3,:]
Out[115]:
array([ 18.15663014, -13.72264153])
In [116]:
valid_examples
Out[116]:
array([27, 15, 58, 22, 65, 83, 60, 17, 86, 80, 41, 37, 16, 5, 20, 49])
In [118]:
valid_word_t = reverse_dictionary[27]
In [119]:
valid_word_t
Out[119]:
'it'
In [ ]:
Content source: LogicWang/ml
Similar notebooks: