In [3]:
import tensorflow as tf
import collections
import math
import os
import random
import zipfile

import numpy as np

In [4]:
with open('./text8', 'r') as f:
    words = tf.compat.as_str(f.read()).split()
print('Data size', len(words))


('Data size', 17005207)
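
The zipfile import above is unused in this cell because ./text8 was extracted ahead of time. If the file is missing, it can be fetched and unpacked first; a minimal sketch, assuming the usual hosting of the text8 corpus at mattmahoney.net:

# Optional: download and extract text8 if it is not already on disk.
try:
  from urllib.request import urlretrieve  # Python 3
except ImportError:
  from urllib import urlretrieve  # Python 2

if not os.path.exists('./text8'):
  urlretrieve('http://mattmahoney.net/dc/text8.zip', 'text8.zip')
  with zipfile.ZipFile('text8.zip') as z:
    z.extractall('.')  # produces ./text8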

In [5]:
# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50000


def build_dataset(words):
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
  dictionary = dict()
  for word, _ in count:
    dictionary[word] = len(dictionary)
  data = list()
  unk_count = 0
  for word in words:
    if word in dictionary:
      index = dictionary[word]
    else:
      index = 0  # dictionary['UNK']
      unk_count += 1
    data.append(index)
  count[0][1] = unk_count
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)
del words  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])


('Most common words (+UNK)', [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)])
('Sample data', [5239, 3084, 12, 6, 195, 2, 3137, 46, 59, 156], ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against'])
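
As a quick sanity check (not in the original notebook), dictionary and reverse_dictionary should be exact inverses, and every id in data should fall inside the vocabulary:

# Sanity check: the two mappings invert each other and all ids are in range.
assert all(reverse_dictionary[idx] == word for word, idx in dictionary.items())
assert len(dictionary) == vocabulary_size
assert 0 <= min(data) and max(data) < vocabulary_size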

In [6]:
dictionary


Out[6]:
{'fawn': 45848,
 'homomorphism': 9648,
 'nordisk': 39343,
 'nunnery': 36075,
 'chthonic': 33554,
 'sowell': 40562,
 'woods': 6263,
 'china': 486,
 'music': 166,
 'nature': 544,
 ...}

In [7]:
data_index = 0

# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)
  for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  for i in range(batch_size // num_skips):
    target = skip_window  # target label at the center of the buffer
    targets_to_avoid = [skip_window]
    for j in range(num_skips):
      while target in targets_to_avoid:
        target = random.randint(0, span - 1)
      targets_to_avoid.append(target)
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[target]
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  return batch, labels

batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)

In [8]:
# Decode each (input, label) pair in the batch back into words.
for i in range(8):
  print(batch[i], reverse_dictionary[batch[i]],
        '->', labels[i, 0], reverse_dictionary[labels[i, 0]])


(3084, 'originated', '->', 5239, 'anarchism')
(3084, 'originated', '->', 12, 'as')
(12, 'as', '->', 3084, 'originated')
(12, 'as', '->', 6, 'a')
(6, 'a', '->', 12, 'as')
(6, 'a', '->', 195, 'term')
(195, 'term', '->', 2, 'of')
(195, 'term', '->', 6, 'a')
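
To make the windowing explicit, here is a pure-Python reference (not part of the original notebook) that enumerates every (center, context) pair; generate_batch samples num_skips of these pairs per center word, which is exactly what is printed above:

# Enumerate all skip-gram pairs over the first few ids with skip_window=1.
def all_pairs(ids, skip_window):
  for i in range(skip_window, len(ids) - skip_window):
    for j in range(i - skip_window, i + skip_window + 1):
      if j != i:
        yield ids[i], ids[j]

for center, context in all_pairs(data[:5], skip_window=1):
  print(reverse_dictionary[center], '->', reverse_dictionary[context])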

In [9]:
# Step 4: Build and train a skip-gram model.

batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64    # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default():

  # Input data.
  train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

  # Ops and variables pinned to the CPU because of a missing GPU implementation.
  with tf.device('/cpu:0'):
    # Look up embeddings for inputs.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Construct the variables for the NCE loss
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

  # Compute the average NCE loss for the batch.
  # tf.nn.nce_loss automatically draws a new sample of the negative labels
  # each time we evaluate the loss.
  loss = tf.reduce_mean(
      tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
                     labels=train_labels,
                     inputs=embed,
                     num_sampled=num_sampled,
                     num_classes=vocabulary_size))

  # Construct the SGD optimizer using a learning rate of 1.0.
  optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

  # Compute the cosine similarity between minibatch examples and all embeddings.
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(
      normalized_embeddings, valid_dataset)
  similarity = tf.matmul(
      valid_embeddings, normalized_embeddings, transpose_b=True)

  # Add variable initializer.
  init = tf.global_variables_initializer()
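
The similarity op above is plain cosine similarity: every row of normalized_embeddings has unit L2 norm, so a matmul against the transpose gives the cosine between each validation word and every vocabulary word. A NumPy mirror on a toy array (emb is a stand-in, not the trained embeddings):

# NumPy mirror of the graph's similarity op, on a toy stand-in array.
emb = np.random.uniform(-1.0, 1.0, size=(10, 4))
emb = emb / np.sqrt(np.sum(np.square(emb), axis=1, keepdims=True))
sim_toy = emb[[0, 1]].dot(emb.T)  # cosine similarity of rows 0 and 1 vs. all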

In [11]:
num_steps = 100001

with tf.Session(graph=graph) as session:
  # We must initialize all variables before we use them.
  init.run()
  print("Initialized")

  average_loss = 0
  for step in xrange(num_steps):
    batch_inputs, batch_labels = generate_batch(
        batch_size, num_skips, skip_window)
    feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

    # We perform one update step by evaluating the optimizer op (including it
    # in the list of returned values for session.run()).
    _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += loss_val

    if step % 2000 == 0:
      if step > 0:
        average_loss /= 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print("Average loss at step ", step, ": ", average_loss)
      average_loss = 0

    # Note that this is expensive (~20% slowdown if computed every 500 steps)
    if step % 10000 == 0:
      sim = similarity.eval()
      for i in xrange(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8  # number of nearest neighbors
        nearest = (-sim[i, :]).argsort()[1:top_k + 1]  # skip index 0, the word itself
        log_str = "Nearest to %s:" % valid_word
        for k in xrange(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log_str = "%s %s," % (log_str, close_word)
        print(log_str)
  final_embeddings = normalized_embeddings.eval()


Initialized
('Average loss at step ', 0, ': ', 271.43389892578125)
Nearest to first: greenery, unsatisfied, shit, priscus, objectivism, interred, patricia, outer,
Nearest to UNK: powerplant, sow, intergovernmentalism, counters, pentium, educated, tahiti, apical,
Nearest to has: benoit, spacecraft, paradise, compressors, circ, decry, mia, etext,
Nearest to used: modify, refreshing, decommissioned, sectors, collectivism, tidy, mattress, signifies,
Nearest to may: walras, dartmoor, santiago, defied, service, portfolios, noting, autos,
Nearest to so: funneled, sari, misgivings, hetzel, performances, redaction, estuary, hawaii,
Nearest to while: watterson, tilden, bearing, alam, patenting, electron, electronegative, ige,
Nearest to i: domingue, ghetto, ka, bcp, partita, mere, unlimited, outre,
Nearest to were: wisconsin, displayed, stagnant, ostia, succinctly, andropov, zhdanov, curiosity,
Nearest to by: spiegel, aqdas, netherland, metastasis, rediscovery, distinguishing, modulator, lowered,
Nearest to he: hype, bev, kassel, crannog, backslash, method, hog, thunderbirds,
Nearest to if: poked, vicarage, report, gott, rewards, bentley, kournikova, pop,
Nearest to history: prospective, steelers, meriwether, predominated, adjectives, evers, girolamo, brooklyn,
Nearest to its: formulating, trintignant, portable, greenish, electronegativity, peroxides, tupelo, esc,
Nearest to a: irs, wit, lansdowne, cardiff, critically, cult, newbies, expressways,
Nearest to will: reconnection, disable, woodcut, ailing, casinos, sao, pirates, recorder,
('Average loss at step ', 2000, ': ', 113.16442505550384)
('Average loss at step ', 4000, ': ', 52.546211807250977)
('Average loss at step ', 6000, ': ', 33.443033806920049)
('Average loss at step ', 8000, ': ', 23.762722144126894)
('Average loss at step ', 10000, ': ', 18.265776746153833)
Nearest to first: objectivism, achill, priscus, akira, both, in, interred, settles,
Nearest to UNK: and, one, the, vs, reginae, guilty, mathbf, basins,
Nearest to has: is, paradise, spacecraft, strength, authors, mia, altenberg, reginae,
Nearest to used: metaphysical, collectivism, consists, modify, sectors, jpg, selden, amo,
Nearest to may: nine, fins, leader, tongue, ghosts, service, kournikova, aol,
Nearest to so: hawaii, performances, fao, rudolph, vs, england, followed, legs,
Nearest to while: electron, bearing, amo, aquarius, lot, beginning, allies, mans,
Nearest to i: reginae, ghetto, chloride, abdali, trick, mere, ka, economics,
Nearest to were: are, plaster, displayed, earlier, wisconsin, experience, encounters, retreated,
Nearest to by: in, as, and, from, users, dogmatic, two, phi,
Nearest to he: it, crannog, killed, been, cracking, flanders, gollancz, correctly,
Nearest to if: report, pop, abwehr, excavation, voice, ages, vicarage, kournikova,
Nearest to history: prospective, steelers, crowd, holocaust, good, vs, upon, brooklyn,
Nearest to its: the, one, becoming, selznick, offensive, resigned, sr, portable,
Nearest to a: the, and, UNK, mathbf, cardiff, fins, gb, another,
Nearest to will: victoriae, barbuda, and, printed, asteroid, reflection, transcontinental, fins,
('Average loss at step ', 12000, ': ', 14.056990735054017)
('Average loss at step ', 14000, ': ', 11.810415421843528)
('Average loss at step ', 16000, ': ', 9.8365392937660214)
('Average loss at step ', 18000, ': ', 8.6671083641052249)
('Average loss at step ', 20000, ': ', 7.7571109436750412)
Nearest to first: agouti, objectivism, in, achill, interred, both, zero, akira,
Nearest to UNK: agouti, dasyprocta, and, vs, two, four, hbox, three,
Nearest to has: is, had, was, paradise, circ, compiler, have, authors,
Nearest to used: metaphysical, dasyprocta, collectivism, browed, tenth, ashmore, consists, amo,
Nearest to may: nine, fins, defied, seven, six, limb, tongue, would,
Nearest to so: hawaii, dasyprocta, he, palatal, fao, performances, hg, followed,
Nearest to while: bearing, electron, amo, lf, sitting, from, and, aquarius,
Nearest to i: UNK, reginae, abdali, ghetto, chloride, trick, mere, picks,
Nearest to were: are, was, is, by, plaster, displayed, yin, had,
Nearest to by: as, in, from, for, and, was, with, two,
Nearest to he: it, backslash, they, and, been, eight, nine, crannog,
Nearest to if: pop, report, excavation, abwehr, vicarage, kbit, already, ages,
Nearest to history: prospective, agouti, annales, crowd, brooklyn, dasyprocta, girardeau, edicts,
Nearest to its: the, his, their, a, agouti, one, offensive, abadan,
Nearest to a: the, agouti, another, dasyprocta, this, or, one, its,
Nearest to will: and, recorder, agouti, printed, produces, sunda, fins, dasyprocta,
('Average loss at step ', 22000, ': ', 7.2026611722707745)
('Average loss at step ', 24000, ': ', 6.9355187491178514)
('Average loss at step ', 26000, ': ', 6.6781790490150454)
('Average loss at step ', 28000, ': ', 6.249697914242744)
('Average loss at step ', 30000, ': ', 6.160901928067207)
Nearest to first: agouti, objectivism, in, interred, achill, both, patricia, with,
Nearest to UNK: agouti, dasyprocta, tunings, vs, four, abakan, backslash, reginae,
Nearest to has: had, is, was, have, paradise, compiler, circ, altenberg,
Nearest to used: metaphysical, torgau, dasyprocta, aba, browed, decommissioned, collectivism, expand,
Nearest to may: would, can, nine, categorised, defied, six, fins, seven,
Nearest to so: abitibi, but, hawaii, dasyprocta, hg, palatal, fao, he,
Nearest to while: and, aba, bearing, were, from, electron, amo, ige,
Nearest to i: UNK, reginae, ghetto, abdali, chloride, trick, mere, four,
Nearest to were: are, was, is, by, had, plaster, while, be,
Nearest to by: in, as, and, from, with, was, for, abet,
Nearest to he: it, they, backslash, she, who, and, there, then,
Nearest to if: when, pop, report, doubtful, excavation, vicarage, bentley, abwehr,
Nearest to history: prospective, agouti, aba, chaldean, brooklyn, annales, crowd, steelers,
Nearest to its: the, their, his, a, agouti, pairing, offensive, bpp,
Nearest to a: the, agouti, dasyprocta, another, this, or, vojt, abitibi,
Nearest to will: would, can, and, might, or, primigenius, anthroposophy, must,
('Average loss at step ', 32000, ': ', 5.8942106403112415)
('Average loss at step ', 34000, ': ', 5.8399711575508118)
('Average loss at step ', 36000, ': ', 5.6933053689002993)
('Average loss at step ', 38000, ': ', 5.2934979794025425)
('Average loss at step ', 40000, ': ', 5.4946520501375202)
Nearest to first: agouti, objectivism, achill, interred, both, patricia, unsatisfied, second,
Nearest to UNK: dasyprocta, agouti, and, backslash, tunings, three, one, reginae,
Nearest to has: had, is, was, have, compiler, circ, paradise, altenberg,
Nearest to used: metaphysical, dasyprocta, torgau, aba, amo, reuptake, browed, galois,
Nearest to may: can, would, categorised, will, nine, defied, to, should,
Nearest to so: abitibi, but, transferring, it, threatening, he, dasyprocta, palatal,
Nearest to while: and, aba, were, bearing, from, ige, amo, electron,
Nearest to i: reginae, UNK, t, ghetto, abdali, trick, chloride, picks,
Nearest to were: are, was, is, by, had, while, have, be,
Nearest to by: was, from, as, with, were, in, hellfire, and,
Nearest to he: it, she, they, who, backslash, there, then, later,
Nearest to if: when, pop, abwehr, vicarage, bentley, excavation, doubtful, ing,
Nearest to history: prospective, agouti, aba, chaldean, brooklyn, steelers, annales, peptide,
Nearest to its: their, the, his, pairing, agouti, a, butler, some,
Nearest to a: the, agouti, abitibi, another, dasyprocta, this, vojt, eight,
Nearest to will: would, can, may, must, might, and, to, sed,
('Average loss at step ', 42000, ': ', 5.2884035185575486)
('Average loss at step ', 44000, ': ', 5.3145419737100603)
('Average loss at step ', 46000, ': ', 5.2635524272918701)
('Average loss at step ', 48000, ': ', 5.0368763834238051)
('Average loss at step ', 50000, ': ', 5.1556160471439361)
Nearest to first: objectivism, second, agouti, interred, patricia, next, achill, greenery,
Nearest to UNK: agouti, dasyprocta, four, reginae, three, victoriae, backslash, reuptake,
Nearest to has: had, is, was, have, biconditional, compiler, altenberg, already,
Nearest to used: metaphysical, prat, dasyprocta, torgau, aba, reuptake, amo, galois,
Nearest to may: can, would, will, categorised, should, nine, to, seven,
Nearest to so: abitibi, but, threatening, transferring, residues, palatal, dasyprocta, hg,
Nearest to while: and, aba, from, thibetanus, were, bearing, though, but,
Nearest to i: t, reginae, ghetto, UNK, abdali, he, trick, immensely,
Nearest to were: are, was, is, had, have, be, being, by,
Nearest to by: was, with, from, as, thibetanus, for, in, be,
Nearest to he: it, she, they, who, there, backslash, then, this,
Nearest to if: when, pop, but, abwehr, after, vicarage, for, doubtful,
Nearest to history: prospective, aba, agouti, steelers, chaldean, brooklyn, peptide, annales,
Nearest to its: their, the, his, pairing, agouti, roshan, a, escalation,
Nearest to a: the, another, agouti, this, vojt, dasyprocta, eight, gourd,
Nearest to will: would, can, may, must, might, could, to, and,
('Average loss at step ', 52000, ': ', 5.1628256613016132)
('Average loss at step ', 54000, ': ', 5.0801254163980483)
('Average loss at step ', 56000, ': ', 5.0409505732059481)
('Average loss at step ', 58000, ': ', 5.1087362531423572)
('Average loss at step ', 60000, ': ', 4.9483141461610796)
Nearest to first: second, objectivism, agouti, interred, next, patricia, callithrix, greenery,
Nearest to UNK: callithrix, agouti, dasyprocta, tamarin, cebus, ssbn, microsite, marmoset,
Nearest to has: had, have, is, was, biconditional, compiler, altenberg, enables,
Nearest to used: metaphysical, prat, dasyprocta, tamarin, aba, reuptake, galois, amo,
Nearest to may: can, would, will, categorised, should, could, might, nine,
Nearest to so: abitibi, but, threatening, transferring, residues, dasyprocta, palatal, excavation,
Nearest to while: and, or, aba, thibetanus, though, but, were, however,
Nearest to i: t, reginae, ghetto, UNK, abdali, chasing, trick, ssbn,
Nearest to were: are, was, have, had, is, being, be, while,
Nearest to by: was, with, as, ssbn, be, thibetanus, from, abet,
Nearest to he: it, she, they, who, there, backslash, then, scranton,
Nearest to if: when, but, callithrix, pop, doubtful, after, vicarage, for,
Nearest to history: aba, prospective, agouti, tamarin, steelers, dasyprocta, peptide, solicitation,
Nearest to its: their, his, the, pairing, some, agouti, cebus, butler,
Nearest to a: the, another, callithrix, marmoset, vojt, agouti, cebus, this,
Nearest to will: would, can, may, must, might, could, to, microcebus,
('Average loss at step ', 62000, ': ', 4.7917307525873181)
('Average loss at step ', 64000, ': ', 4.8071960885524749)
('Average loss at step ', 66000, ': ', 4.9775575051307674)
('Average loss at step ', 68000, ': ', 4.9157073162794109)
('Average loss at step ', 70000, ': ', 4.7652808930873869)
Nearest to first: second, objectivism, next, agouti, interred, callithrix, greenery, patricia,
Nearest to UNK: callithrix, agouti, dasyprocta, cebus, tamarin, ssbn, tunings, microsite,
Nearest to has: had, have, is, was, biconditional, naaman, altenberg, already,
Nearest to used: metaphysical, dasyprocta, prat, tamarin, reuptake, amo, aba, known,
Nearest to may: can, would, will, could, should, might, categorised, must,
Nearest to so: abitibi, thaler, transferring, threatening, residues, but, dasyprocta, palatal,
Nearest to while: and, or, however, thibetanus, aba, but, though, were,
Nearest to i: t, UNK, reginae, ghetto, you, we, they, abdali,
Nearest to were: are, was, have, had, be, being, is, while,
Nearest to by: was, specimens, with, ssbn, thibetanus, as, be, from,
Nearest to he: it, she, they, who, there, backslash, then, scranton,
Nearest to if: when, for, callithrix, abwehr, although, doubtful, after, but,
Nearest to history: prospective, aba, agouti, steelers, tamarin, peptide, chaldean, dasyprocta,
Nearest to its: their, his, the, pairing, some, cebus, escalation, agouti,
Nearest to a: the, another, marmoset, agouti, callithrix, cebus, vojt, abitibi,
Nearest to will: would, can, may, must, could, might, to, should,
('Average loss at step ', 72000, ': ', 4.8034837681055071)
('Average loss at step ', 74000, ': ', 4.7866865113079546)
('Average loss at step ', 76000, ': ', 4.848878457486629)
('Average loss at step ', 78000, ': ', 4.8069599908590313)
('Average loss at step ', 80000, ': ', 4.8158067229986194)
Nearest to first: second, cegep, next, objectivism, callithrix, triangles, agouti, interred,
Nearest to UNK: agouti, callithrix, dasyprocta, upanija, cegep, tamarin, three, cebus,
Nearest to has: had, have, is, was, biconditional, naaman, absalom, altenberg,
Nearest to used: known, dasyprocta, metaphysical, prat, tamarin, amo, reuptake, aba,
Nearest to may: can, would, will, could, should, might, must, categorised,
Nearest to so: abitibi, thaler, transferring, threatening, residues, dasyprocta, palatal, excavation,
Nearest to while: and, however, but, thibetanus, or, aba, though, upanija,
Nearest to i: UNK, t, reginae, you, we, ghetto, umbilical, they,
Nearest to were: are, was, have, had, be, being, is, while,
Nearest to by: was, specimens, be, with, ssbn, thibetanus, as, from,
Nearest to he: it, she, they, who, there, backslash, later, scranton,
Nearest to if: when, callithrix, candide, although, abwehr, for, bracing, after,
Nearest to history: prospective, aba, steelers, agouti, peptide, tamarin, chaldean, haman,
Nearest to its: their, his, the, pairing, escalation, cebus, some, cegep,
Nearest to a: the, another, marmoset, cegep, agouti, callithrix, cebus, microsite,
Nearest to will: would, can, may, must, could, might, should, to,
('Average loss at step ', 82000, ': ', 4.8188700177669528)
('Average loss at step ', 84000, ': ', 4.7771393737792973)
('Average loss at step ', 86000, ': ', 4.7510095541477204)
('Average loss at step ', 88000, ': ', 4.6768265293836597)
('Average loss at step ', 90000, ': ', 4.7664554728269577)
Nearest to first: second, cegep, next, callithrix, agouti, objectivism, interred, priscus,
Nearest to UNK: callithrix, dasyprocta, tamarin, agouti, upanija, cegep, microsite, cebus,
Nearest to has: had, have, is, was, biconditional, already, fsm, naaman,
Nearest to used: known, dasyprocta, metaphysical, boutros, prat, considered, tamarin, reuptake,
Nearest to may: can, would, will, could, should, might, must, categorised,
Nearest to so: transferring, abitibi, thaler, threatening, residues, palatal, otherwise, dasyprocta,
Nearest to while: and, however, but, though, thibetanus, or, although, aba,
Nearest to i: t, reginae, you, we, they, g, umbilical, ssbn,
Nearest to were: are, was, had, have, being, be, while, is,
Nearest to by: specimens, was, with, thibetanus, through, ssbn, be, for,
Nearest to he: it, she, they, there, who, then, backslash, but,
Nearest to if: when, callithrix, for, candide, although, where, agouti, bracing,
Nearest to history: aba, prospective, steelers, tamarin, peptide, agouti, microsite, haman,
Nearest to its: their, his, the, pairing, boutros, some, escalation, cebus,
Nearest to a: the, another, marmoset, cegep, callithrix, agouti, cebus, dasyprocta,
Nearest to will: would, can, may, must, could, might, should, to,
('Average loss at step ', 92000, ': ', 4.7146851714849474)
('Average loss at step ', 94000, ': ', 4.6265769623517992)
('Average loss at step ', 96000, ': ', 4.7305508151054383)
('Average loss at step ', 98000, ': ', 4.6262534256279473)
('Average loss at step ', 100000, ': ', 4.6827038342952729)
Nearest to first: second, cegep, next, callithrix, agouti, objectivism, interred, crb,
Nearest to UNK: callithrix, agouti, dasyprocta, tamarin, marmoset, cegep, tunings, abitibi,
Nearest to has: had, have, is, was, biconditional, fsm, already, altenberg,
Nearest to used: known, dasyprocta, considered, boutros, prat, tamarin, reuptake, metaphysical,
Nearest to may: can, would, will, could, should, must, might, categorised,
Nearest to so: abitibi, transferring, thaler, threatening, otherwise, residues, dasyprocta, palatal,
Nearest to while: and, but, however, though, thibetanus, although, when, aba,
Nearest to i: t, you, we, reginae, they, g, umbilical, microinstruction,
Nearest to were: are, was, have, had, be, being, while, is,
Nearest to by: with, through, specimens, was, be, as, thibetanus, gnutella,
Nearest to he: it, she, they, there, who, later, backslash, this,
Nearest to if: when, where, although, for, callithrix, candide, before, abwehr,
Nearest to history: aba, prospective, steelers, tamarin, peptide, agouti, microsite, chaldean,
Nearest to its: their, his, the, pairing, boutros, escalation, her, cebus,
Nearest to a: another, the, cegep, marmoset, agouti, cebus, callithrix, superfluid,
Nearest to will: would, can, may, must, could, might, should, to,

In [25]:
import scipy.spatial

# Cosine distance of a vector with itself should be ~0 (up to floating-point
# error); related words should be closer together than unrelated ones.
print(final_embeddings[dictionary['flow']])
print(scipy.spatial.distance.cosine(final_embeddings[dictionary['flow']], final_embeddings[dictionary['flow']]))

print(scipy.spatial.distance.cosine(final_embeddings[dictionary['tensor']], final_embeddings[dictionary['flow']]))
print(scipy.spatial.distance.cosine(final_embeddings[dictionary['stream']], final_embeddings[dictionary['flow']]))
print(scipy.spatial.distance.cosine(final_embeddings[dictionary['red']], final_embeddings[dictionary['blue']]))


[ 0.1657137   0.03434748  0.06846502  0.0070382   0.0350267   0.03630972
  0.13870713 -0.08952355 -0.01397907  0.10646137  0.20203994 -0.02784512
  0.00405212  0.07034153 -0.08157731 -0.09949311 -0.06979465  0.07609912
  0.126651    0.1981343  -0.01641454  0.10250082  0.01758348 -0.04216308
 -0.15933873  0.07945434 -0.01796242 -0.00106697 -0.02517156 -0.12261134
  0.11137157  0.03639388 -0.1277702   0.00541975  0.07960305  0.03566069
 -0.04528071 -0.09769962  0.06677049 -0.02655403 -0.03063397  0.04496832
 -0.06530884  0.06109088 -0.04010385  0.01271805  0.05815612 -0.16572201
  0.00262617 -0.03960225 -0.09188902  0.14104255  0.12343952  0.08305998
 -0.04416177 -0.09624919 -0.09049979 -0.10931598  0.20486616 -0.00828655
  0.13724695  0.03226418  0.05047261  0.06022134  0.12492619  0.15182598
 -0.06894512 -0.02406714  0.07673304  0.02638871  0.02326527 -0.07070541
 -0.09337375 -0.08966452  0.16761334 -0.12091971 -0.02916223 -0.06994145
 -0.07964484 -0.01511631  0.03522532  0.06124158  0.02416194  0.0876618
  0.13634762  0.00068257  0.02688454  0.02144787  0.02604954  0.26445743
  0.03154384  0.07350118 -0.03664459  0.07665622 -0.00189574 -0.11288846
  0.16751142 -0.00503308  0.02526451  0.01647182 -0.13779892  0.05948371
  0.01406884  0.00629795 -0.11318459 -0.11795699  0.19943622 -0.15585676
  0.02766428 -0.05309179 -0.08386849 -0.12239943  0.03040705  0.01117246
 -0.05888936 -0.02056031 -0.08114543  0.10561201 -0.12420179 -0.01755991
 -0.00239956 -0.02879796 -0.0455617  -0.02028885 -0.00581089 -0.04240141
  0.09448038 -0.08710189]
-1.19209246918e-07
0.772016452075
0.864356577396
0.675594030214
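
The same nearest-neighbor lookup used during training can be run offline against final_embeddings, whose rows are already unit-normalized. A minimal sketch (the helper name is ours):

# Nearest neighbors by cosine similarity; index 0 of the argsort is the
# query word itself, so skip it.
def nearest(word, top_k=8):
  sim = final_embeddings.dot(final_embeddings[dictionary[word]])
  return [reverse_dictionary[i] for i in (-sim).argsort()[1:top_k + 1]]

print(nearest('flow'))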

In [32]:
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
  assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
  plt.figure(figsize=(18, 18))  # in inches
  for i, label in enumerate(labels):
    x, y = low_dim_embs[i, :]
    plt.scatter(x, y)
    plt.annotate(label,
                 xy=(x, y),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')

  plt.savefig(filename)

try:
  from sklearn.manifold import TSNE
  import matplotlib.pyplot as plt

  tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
  plot_only = 500
  low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
  labels = [reverse_dictionary[i] for i in xrange(plot_only)]
  plot_with_labels(low_dim_embs, labels)

except ImportError:
  print("Please install sklearn, matplotlib, and scipy to visualize embeddings.")