In [130]:
import numpy as np
import theano
import six.moves.cPickle
import os, re, json
import operator

from keras.preprocessing import sequence, text
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils, generic_utils
from keras.models import Sequential
from keras.layers.embeddings import WordContextProduct, Embedding
from six.moves import range
from six.moves import zip

In [189]:
# Run configuration shared by the cells below.
max_features = 1000 # handed to text.Tokenizer as nb_words when the tokenizer is built
skip_top = 0 # number of most common words to ignore (0 = ignore none)
nb_epoch = 10 # presumably the number of training epochs; not used in the visible cells -- TODO confirm
dim_proj = 35 # embedding space dimension

In [100]:
# Location of the raw text corpus: <home directory>/Downloads/sample.txt.
# Built with os.path.join (rather than string concatenation) so the path
# separators are correct on every platform.
data_path = os.path.join(os.path.expanduser("~"), "Downloads", "sample.txt")

In [164]:
# text preprocessing utils

# Anything that looks like an HTML/XML tag: '<', the shortest run of
# characters (newlines excluded), then '>'.
html_tags = re.compile(r'<.*?>')

# Character references that are turned into their literal character before the
# remaining references are blanked out.
to_replace = [('&#x27;', "'")]

# A well-formed HTML character reference: decimal (&#38;), hexadecimal
# (&#x26;) or named (&amp;).  Deliberately stricter than r'&.*?;', which would
# also swallow ordinary text lying between a bare '&' and the next ';'
# (e.g. "R&D is hard; really" would lose "&D is hard;").
hex_tags = re.compile(r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);')

def clean_comment(comment):
    """Strip HTML markup from ``comment`` and return the cleaned string.

    Steps, in order:
      1. ``comment`` is converted with ``str()``.
      2. every match of ``html_tags`` is replaced by a single space;
      3. each ``(tag, char)`` pair in ``to_replace`` is substituted literally;
      4. every remaining character reference matched by ``hex_tags`` is
         replaced by a single space;
      5. leading and trailing whitespace is stripped.

    Inner runs of spaces left behind by steps 2 and 4 are not collapsed.
    """
    c = str(comment)
    c = html_tags.sub(' ', c)
    # Must run before hex_tags: otherwise the references listed in to_replace
    # would be blanked out instead of being mapped to their character.
    for tag, char in to_replace:
        c = c.replace(tag, char)
    c = hex_tags.sub(' ', c)
    return c.strip()

def text_generator(path=data_path):
    """Yield the cleaned sentences of the text file at ``path``.

    The file is read in one go, split on ``"."`` and every piece is passed
    through ``clean_comment`` before being yielded, in file order.  Pieces
    that are empty after cleaning are still yielded.

    ``path`` defaults to the value ``data_path`` had when this function was
    defined; rebinding ``data_path`` afterwards does not change the default.
    """
    # The whole content is loaded eagerly, so the handle can be released
    # immediately.  Closing it only after the loop would leak the file
    # whenever the consumer stops iterating early or an exception is raised.
    with open(path) as f:
        raw_text = f.read()
    for sentence in raw_text.split("."):
        yield clean_comment(sentence)

In [190]:
# Build the vocabulary from the corpus: every cleaned sentence produced by
# text_generator() is fed to a Keras Tokenizer created with nb_words=max_features.
print("Fit tokenizer...")
tokenizer = text.Tokenizer(nb_words=max_features)
# The generator is consumed here; later cells read document_count, word_index
# and word_counts from the fitted tokenizer.
tokenizer.fit_on_texts(text_generator())


Fit tokenizer...

In [191]:
# Sanity-check the fitted tokenizer: number of texts seen, then the
# word -> index mapping.  print(...) with a single argument produces the same
# output on Python 2 and Python 3, unlike the bare print statement.
print(tokenizer.document_count)
print(tokenizer.word_index)


351
{'essay': 350, 'limited': 611, 'all': 80, 'founder': 251, 'impression': 612, 'caused': 613, 'results': 614, 'deliveries': 615, 'existing': 252, 'leads': 443, 'go': 253, 'shot': 905, 'decisions': 616, 'children': 352, 'resourceful': 445, 'seemed': 254, 'increase': 485, 'careful': 998, 'depend': 617, "startup's": 447, 'producing': 716, 'technique': 448, 'young': 619, 'collogic': 620, 'to': 2, 'behave': 750, 'those': 1000, 'determinations': 621, 'under': 622, 'discovering': 451, 'disapproved': 623, 'fatal': 624, 'worth': 290, 'backwant': 625, 'town': 452, 'force': 1017, 'risk': 626, 'advantage': 355, 'permissive': 627, 'very': 130, 'implicitly': 628, 'story': 1096, 'focus': 630, 'every': 376, 'decide': 255, 'advised': 632, 'telling': 453, 'trouble': 633, 'bottleneck': 634, 'ramen': 635, 'method': 930, 'school': 256, 'impressive': 454, 'presented': 455, 'turns': 456, 'list': 233, "they've": 637, 'relentlessly': 638, 'standards': 639, 'large': 131, 'solved': 356, 'phase': 1133, 'small': 223, 'people\xe2\x80\x94in': 1110, 'leonardo': 642, 'round': 203, 'the': 1, 'smaller': 458, 'says': 358, "you'd": 257, 'battle': 649, 'trend': 645, 'dealing': 359, 'reacted': 859, 'direct': 647, 'sign': 291, 'past': 648, 'disciplined': 520, 'second': 333, 'cost': 460, 'design': 292, 'uncomfortable': 650, 'markets': 707, 'choices': 651, 'odd': 652, 'shows': 1167, 'even': 62, 'what': 45, 'business': 247, 'giving': 444, 'sum': 654, 'drifting': 655, 'profitability': 293, 'networks': 656, 'current': 657, 'experiment': 259, 'dismission': 1185, 'costing': 1060, 'capital': 660, 'new': 78, 'learned': 294, 'increasing': 661, 'ever': 295, 'public': 296, 'told': 462, 'niche': 662, 'degree': 360, 'nonmoly': 663, 'denial': 1028, 'never': 173, 'component': 664, 'answers': 665, 'delivering': 666, 'protection': 667, 'let': 297, 'suspectively': 668, 'groups': 361, 'meet': 362, "hadn't": 669, 'path': 363, 'along': 670, "aren't": 364, 'substantial': 1010, 'implicit': 671, 'change': 132, 'extreme': 672, 
'sending': 674, 'great': 224, 'kids': 225, 'shift': 675, 'employees': 640, 'changed': 677, 'funded': 678, 'completely': 366, 'experience': 174, 'outding': 679, 'through': 680, 'paris': 681, 'amount': 226, "i'd": 682, 'products': 683, "weren't": 298, 'internet\xe2\x80\x94the': 684, 'pick': 475, 'usually': 260, 'opinion': 686, "i'm": 463, 'sound': 1144, 'makes': 127, 'genuinely': 688, 'hub\xe2\x80\x94committing': 689, 'love': 690, 'apple': 691, 'extra': 446, 'merely': 261, 'prefer': 465, 'compare': 886, 'conversations': 1034, 'put': 694, 'acception': 695, 'uncompanies': 696, 'trained': 697, 'fake': 698, 'frontpage': 699, 'market': 204, 'yourself': 205, 'use': 175, 'from': 53, 'wealth': 466, 'working': 149, 'prove': 367, 'sacche': 701, 'until': 1151, 'two': 340, 'next': 141, 'few': 159, 'live': 703, 'explicitly': 704, 'call': 705, 'iphone': 468, '6': 469, 'survive': 470, 'type': 235, 'tell': 150, 'more': 23, 'sort': 113, 'attractive': 471, 'optimizing': 708, 'rich': 709, 'advance': 472, 'started': 227, 'initial': 436, 'company': 42, 'factor': 596, 'yahoo': 712, 'it': 14, 'granted': 1001, 'known': 473, 'cases': 368, 'effort': 618, 'must': 717, 'me': 206, 'organizations': 262, 'none': 718, 'word': 719, 'g': 820, 'hour': 721, 'this': 85, 'science': 722, 'work': 114, 'radom': 723, 'tv': 724, 'undergrad': 540, 'obvious': 727, 'can': 43, 'learn': 142, 'exceptions': 728, 'making': 160, 'revenues': 474, 'my': 369, 'example': 228, 'complain': 729, 'numbers': 769, 'control': 299, 'crazy': 476, 'figure': 207, 'disuselves': 731, 'predict': 300, 'doal': 732, 'limitate': 733, "didn't": 176, 'accept': 734, 'holdless': 747, 'high': 477, 'heard': 370, 'critic': 735, 'something': 54, 'want': 51, 'sense': 177, 'sharp': 736, 'purposed': 737, 'times': 418, 'simple': 739, 'worse': 740, 'huge': 725, 'needs': 478, 'productive': 741, 'end': 229, 'goal': 371, 'conversation': 351, 'country': 567, 'means': 138, 'everydo': 745, 'brain': 929, '1': 415, 'how': 90, 'hot': 748, 'winners': 749, 
'answer': 479, 'instead': 480, 'ordinary': 752, 'explains': 753, 'stock': 301, 'credential': 658, 'product': 263, 'whereas': 1163, 'information': 755, 'may': 126, 'watch': 756, 'after': 302, 'mexist': 757, 'applications': 481, 'wrong': 372, 'produce': 303, 'designed': 482, 'curiosity': 759, 'ready': 1026, 'such': 133, 'choice': 760, 'grow': 208, 'laptors': 761, 'undergrads': 762, 'a': 3, 'distinctions': 1138, 'realized': 394, 'remember': 373, 'succeeded': 763, 'conscious': 374, 'maybe': 764, 'desperation': 765, 'preced': 766, 'so': 30, 'pay': 354, 'taking': 1015, 'facebook': 631, 'order': 486, 'talk': 377, "that's": 120, 'excepted': 768, 'help': 378, "don't": 24, 'gradual': 770, 'over': 771, 'move': 776, 'vary': 772, 'choose': 955, 'published': 1040, 'years': 134, 'course': 491, 'industrial': 713, "won't": 230, 'treed': 926, 'mentioned': 774, 'segreated': 775, 'still': 379, 'before': 588, 'admirably': 778, 'write': 746, 'degreed': 780, 'group': 211, 'interesting': 135, 'fix': 781, 'decaded': 782, 'borned': 783, 'complex': 1032, 'writing': 264, 'better': 97, 'production': 507, 'differently': 629, 'founders': 56, 'main': 488, 'might': 178, 'create': 987, "haven't": 265, 'outside': 786, 'then': 231, 'them': 21, 'someone': 69, 'return': 787, 'food': 788, 'existences': 789, 'yc': 489, 'practice': 490, "we'll": 790, 'brutally': 834, 'complete': 1143, "you've": 304, 'they': 13, 'half': 791, 'not': 15, 'now': 179, 'investor': 305, 'realize': 143, 'bigger': 492, 'term': 493, 'several': 339, 'failed': 792, 'always': 98, 'university': 307, 'sufficiently': 494, 'productions': 793, 'mode': 794, 'reasonable': 308, 'each': 380, 'found': 795, 'organized': 710, 'european': 1037, 'side': 381, "isn't": 495, 'mean': 309, 'subset': 382, 'domain': 496, 'everyone': 144, 'financial': 798, 'significantly': 266, 'spoy': 754, 'doing': 91, 'companies': 81, 'hard': 99, 'reduce': 800, 'idea': 86, 'senment': 801, 'vote': 836, 'books': 497, 'expect': 498, 'year': 383, 'our': 603, 'programmer': 
310, 'avoided': 1160, 'tenth': 805, 'message': 509, 'out': 100, 'hacker': 806, 'try': 151, '100k': 807, 'factors': 809, 'since': 311, 'truth': 1112, 'your': 27, 'looking': 202, 'similarly': 822, 'seriously': 811, 'vcs': 180, "shouldn't": 312, "they'll": 214, 'internet': 313, 'she': 501, 'colleges': 812, "you're": 12, 'million': 502, 'given': 715, 'free': 386, 'quite': 813, 'lead': 1179, 'extraordinary': 814, 'reason': 110, 'complicated': 503, 'york': 504, "people's": 505, 'ask': 314, 'wanted': 267, 'beginning': 815, 'airborning': 816, 'care': 387, 'definition': 234, 'hardware': 817, 'organic': 818, 'service': 636, 'language': 720, 'launch': 388, 'starts': 821, 'could': 70, 'adplice': 730, 'programming': 389, 'moment': 823, 'keep': 315, 'silicon': 824, 'thing': 121, 'american': 825, 'explored': 826, 'place': 827, 'due': 1132, 'threat': 828, 'suve': 829, 'consequence': 390, 'think': 145, 'first': 71, 'already': 210, 'seeming': 831, 'revenue': 506, 'feel': 161, 'third': 832, 'lumping': 833, 'powerful': 268, 'number': 212, 'one': 46, 'clear': 893, 'syrelicon': 935, 'done': 316, 'fast': 248, 'investors': 47, 'carry': 802, 'impossible': 837, 'sounds': 499, 'precisely': 391, 'open': 838, 'size': 182, "doesn't": 181, 'little': 385, 'likely': 510, 'needed': 269, 'leading': 317, 'top': 487, 'least': 162, 'response': 392, 'anyone': 842, 'their': 68, 'returns': 843, 'assumption': 384, 'too': 237, 'confliction': 844, 'developer': 845, 'legally': 846, 'percentage': 847, 'perfectly': 518, 'west': 1124, 'friend': 511, 'low': 437, 'acquisitions': 849, 'selling': 512, 'way': 87, 'that': 7, 'humans': 936, 'exactly': 513, 'refuse': 483, 'rejected': 514, 'discover': 851, 'acquirers': 852, 'part': 515, 'zire': 853, 'fall': 854, 'convinced': 856, 'than': 57, '10': 516, 'kind': 517, 'hundred': 848, 'unfortunately': 519, 'prevent': 643, 'gotten': 521, 'terms': 944, 'rate': 459, 'roberts': 860, 'matter': 861, 'future': 188, 'supposed': 270, 'were': 58, "company's": 522, 'feeling': 523, 
'acquisition': 862, 'result': 863, 'and': 10, 'deciding': 396, 'mieways': 864, 'failing': 644, 'false': 866, 'angel': 238, 'slowment': 867, 'face': 1159, 'mind': 524, 'argument': 868, 'jided': 954, 'talking': 397, 'say': 64, 'conspiracy': 869, 'unlikely': 870, 'have': 11, 'sentences': 871, 'need': 96, 'seen': 398, 'seem': 59, 'turn': 742, 'saw': 873, 'any': 183, 'mooney': 841, 'normal': 1102, 'sell': 239, 'lie': 874, 'speakers': 875, 'experted': 876, 'offering': 877, 'zero': 525, 'refute': 526, 'offer': 318, 'able': 92, 'ideas': 122, 'client': 1048, 'also': 152, 'schleps': 878, 'take': 137, 'which': 93, 'performance': 399, 'willful': 879, 'techniques': 885, 'becoming': 1027, 'preventous': 881, 'universities': 882, "who've": 527, 'experienced': 184, 'unless': 400, 'committing': 883, 'opposite': 751, 'programmers': 101, 'price': 528, 'who': 40, 'most': 63, 'lust': 1024, 'regular': 529, 'significant': 887, 'position': 530, 'nothing': 320, 'carculal': 888, 'extremely': 890, 'why': 111, 'stupid': 891, 'surprising': 185, 'considered': 892, 'went': 796, 'later': 321, 'stronger': 894, 'drive': 895, 'spend': 421, 'traditional': 896, 'investing': 461, 'competitors': 287, 'demoralized': 604, 'error': 702, 'bending': 1054, 'pattern': 948, 'professional': 901, 'painting': 902, 'fact': 186, 'precise': 904, 'saying': 146, 'promising': 1084, 'particularly': 172, 'reasons\xe2\x80\x94when': 906, 'meetings': 907, 'labels': 531, 'recipe': 532, 'anyway': 908, 'discovered': 533, 'religions': 840, 'businesses': 910, 'solutions': 911, 'piggest': 641, 'sentenced': 975, 'subtle': 913, 'find': 163, 'time\xe2\x80\x94what': 914, 'partners': 534, 'consequenced': 897, 'knowledge': 917, 'valuation': 323, 'explain': 324, 'winner': 535, 'enough': 155, 'should': 107, 'values': 918, 'increasingly': 240, 'only': 102, 'going': 55, 'buying': 536, 'pretty': 326, 'expensively': 537, 'interested': 580, 'sure': 117, "investors'": 919, 'do': 20, 'suggest': 767, 'his': 327, 'destribe': 920, 'get': 19, 'air': 
921, 'stop': 539, 'lucky': 758, 'cheaper': 922, "they'd": 187, 'ones': 153, 'alimination': 923, 'rest': 850, 'hif': 925, '2': 393, 'google': 595, 'him': 272, 'example\xe2\x80\x94quickly': 927, 'processes': 928, 'reading': 952, 'proposes': 653, 'generally': 406, 'median': 403, 'worried': 804, 'bad': 139, 'stuff': 541, 'common': 328, 'dangerous': 404, 'x': 659, 'fixed': 405, 'where': 154, 'steam': 932, 'consciously': 933, 'supplys': 934, 'set': 835, 'startup': 22, 'notice': 401, 'sex': 855, 'relative': 940, 'see': 116, 'decided': 164, 'college': 241, 'are': 25, 'desperate': 942, 'optimization': 1064, 'fail': 943, 'jobs': 858, 'yes': 542, 'best': 103, 'subject': 543, 'afflicted': 945, 'project': 946, 'said': 457, 'lots': 329, 'movie': 900, 'away': 903, 'currently': 950, 'sites': 951, 'tend': 407, 'initially': 958, '3': 325, "there's": 236, 'correctly': 545, 'between': 189, 'probably': 83, 'bunch': 777, 'approach': 953, 'checks': 941, 'knowing': 947, 'discipline': 1063, 'we': 60, 'recently': 547, 'ability': 830, 'unconscious': 957, 'missing': 548, 'evolving': 915, 'deliver': 959, 'screen': 549, 'attention': 330, 'points': 898, 'willing': 784, 'job': 961, 'succeed': 217, 'extent': 331, 'article': 554, 'intelligent': 963, 'come': 964, 'improve': 550, 'both': 412, 'figured': 966, 'last': 242, 'avoid': 1101, 'reddit': 937, 'deals': 165, "wouldn't": 104, 'minds': 969, 'beliefs': 551, 'sensitive': 970, 'share': 552, 'expense': 972, 'whole': 273, "can't": 84, 'negotiation': 974, 'among': 646, 'degeneral': 553, 'key': 976, 'hackers': 977, 'point': 140, 'reasons': 274, 'seems': 244, 'forgot': 978, 'whatever': 979, 'alone': 980, 'simply': 409, 'learning': 410, 'news': 962, 'disaspect': 1077, 'prepare': 1137, 'actually': 555, '100': 982, 'expensive': 983, 'adults': 1156, 'considering': 332, 'late': 984, 'unusual': 985, 'described': 986, 'raise': 123, 'asks': 544, "it's": 34, 'smoth': 700, 'three': 556, 'been': 72, 'quickly': 557, 'cared': 989, 'confident': 558, 'difference': 990, 
'much': 65, 'treat': 559, 'delay': 779, 'judging': 560, 'expected': 335, 'parents': 581, 'meeting': 561, '4': 343, 'wants': 992, 'life': 275, 'partly': 562, 'valuations': 993, 'restrictions': 965, 'turned': 563, "what's": 994, 'search': 692, 'else': 166, 'fund': 996, 'engage': 997, 'understand': 337, 'ways': 1145, 'worked': 276, 'an': 79, 'seed': 999, 'present': 450, 'honest': 738, 'case': 190, 'prospect': 1002, 'replaces': 1003, 'look': 191, 'unlike': 1004, 'launches': 1005, 'straight': 1006, 'schlep': 564, 'harder': 413, 'single': 565, 'value': 1007, 'n': 414, 'will': 31, 'successes': 967, 'while': 1009, '1980': 819, 'accidently': 1011, 'smart': 566, 'many': 319, 'hubs': 1012, 'situation': 1013, 'primise': 1014, 'property': 568, 'study': 676, 'launched': 1016, 'mistake': 338, 'supply': 578, 'hurton': 693, 'vc': 1018, 'eventually': 1019, 'almost': 1020, 'demo': 1021, 'is': 9, 'surprisingly': 156, 'expenses': 1023, 'respected': 743, 'good': 33, 'brutal': 1025, 'in': 8, 'cook': 773, 'partner': 569, 'technology': 167, 'worry': 570, 'if': 17, 'different': 88, 'regulate': 872, "factor's": 1029, 'perhaps': 1030, 'things': 112, 'make': 49, 'inborn': 1161, 'same': 76, 'clearly': 1031, 'another': 508, 'strange': 968, 'speaker': 416, 'ares': 1033, 'weakness': 808, 'encourage': 1035, 'advise': 1036, 'gets': 306, 'bring': 909, 'higher': 232, 'development': 1038, 'responses': 956, 'used': 192, 'imply': 1039, 'social': 685, 'disagree': 571, 'effect': 417, 'driven': 419, 'presing': 1041, 'running': 277, 'moving': 572, 'purpose': 744, 'student': 1042, 'changing': 1043, 'opportunity': 573, 'expand': 1044, 'recent': 1045, 'judgement': 1079, 'lower': 1046, 'off': 278, 'subsets': 1047, 'i': 61, 'changes': 687, 'depending': 574, 'well': 218, 'obviously': 279, 'person': 193, 'without': 546, 'solve': 341, 'does': 1105, 'components': 1049, 'grasped': 988, 'organization': 194, 'model': 215, 'corporate': 1051, 'investments': 322, 'left': 1053, 'problem\xe2\x80\x94people': 981, "couldn't": 
1055, 'responsible': 1056, 'stores': 1057, 'just': 36, 'less': 168, 'being': 94, 'money': 44, 'removed': 1058, 'presumably': 1059, 'valuable': 342, 'schools': 422, 'kill': 1061, 'credentials': 365, 'useful': 1062, 'startups': 28, 'yet': 195, 'previous': 576, 'profitable': 408, 'web': 577, 'field': 605, 'thinking': 1065, "we've": 424, 'implications\xe2\x80\x94that': 1066, 'danger': 464, 'had': 243, 'except': 425, 'day': 426, 'thousand': 1067, 'source': 579, 'verse': 1068, 'spread': 423, 'deliberately': 411, 'board': 336, 'easy': 1069, 'plenty': 1070, '5': 1165, 'has': 147, 'app': 1072, 'surprised': 245, 'useless': 1073, 'real': 1074, 'customers': 440, 'optimizations': 1076, 'around': 938, 'depends': 1078, 'freedom': 711, 'read': 427, 'big': 89, 'couple': 196, "who's": 582, 'disagreement': 583, 'possible': 584, 'early': 420, 'traffic': 1080, 'know': 67, 'average': 1081, 'world': 105, "who'd": 1082, 'projects': 1083, 'helpful': 428, 'developers': 865, 'nice': 1168, 'like': 29, 'alarms': 1050, '55': 1085, 'suppliers': 1086, '50': 1087, 'acquirer': 1088, 'specific': 280, 'true': 281, 'radiar': 1089, 'become': 585, 'works': 586, 'signal': 1091, 'because': 39, 'old': 282, 'often': 429, 'deal': 246, 'people': 26, 'absolutely': 1092, 'flatter': 1093, 'scared': 1094, 'some': 157, 'back': 1095, 'growth': 334, 'choosing': 430, 'multiple': 1097, 'guess': 839, 'scares': 1098, 'leaders': 714, 'ignore': 587, 'gradually': 431, 'scale': 219, 'describing': 1100, 'for': 18, 'decision': 889, 'though': 884, 'audience': 1103, 'comments': 1104, 'conservative': 589, 'everything': 432, 'asking': 449, 'tends': 1106, 'happened': 1090, 'kleries': 1108, 'content': 500, 'straightforward': 1109, 'refer': 357, 'be': 6, 'process': 344, 'cluelessly': 1111, 'run': 590, 'dramatical': 1052, 'power': 258, 'schedule': 433, 'convincing': 345, 'flacked': 785, 'failure\xe2\x80\x94why': 1114, 'technology\xe2\x80\x94probably': 1115, 'broken': 1116, 'step': 434, 'shifting': 1117, 'losing': 1118, 'clumber': 
1119, 'exciting': 1120, 'super': 197, 'by': 48, 'stage': 706, 'on': 41, 'about': 37, 'would': 73, 'anything': 198, 'getting': 591, 'conceal': 1121, 'of': 4, 'industry': 592, 'months': 435, 'meaning': 1173, 'o': 1122, 'linkable': 1123, 'evidences': 1107, 'plus': 346, 'trajectory': 973, 'efforts': 1125, 'slightly': 1126, 'or': 108, 'confidence': 1113, 'software': 594, 'consulting': 209, 'own': 283, 'fundraising': 118, 'burning': 1128, 'wasting': 1129, 'into': 128, 'predictions': 1130, 'commitment': 1131, "world's": 467, 'down': 220, 'device': 353, 'right': 200, 'formidable': 375, 'batch': 1135, 'rounds': 213, '1985': 1136, 'computer': 597, 'famous': 810, 'responding': 1008, 'there': 82, 'question': 598, 'long': 136, 'class': 599, 'strip': 1134, 'start': 74, 'describes': 1140, "we're": 216, 'lot': 66, 'series': 221, 'biggest': 199, 'valley': 600, 'was': 75, 'persisting': 1150, 'launching': 1141, 'function': 1142, 'building': 601, 'buy': 347, 'interest': 991, 'form': 438, "i've": 169, 'brand': 880, 'recommend': 797, 'whould': 1146, 'but': 16, 'palt': 1147, 'failure': 1148, 'hear': 439, 'survived': 1149, 'e': 1178, 'trying': 170, 'with': 50, 'raising': 222, 'he': 284, "they're": 38, 'hire': 402, 'made': 285, 'whether': 249, 'angels': 158, 'up': 124, 'us': 250, 'slow': 916, 'convince': 286, 'barbershop': 960, 'critical': 1152, 'structure': 1176, 'site': 441, 'problem': 77, 'stages': 1153, 'called': 348, 'nature': 1154, 'fractions': 1155, 'incompeted': 673, 'describe': 602, 'store': 1157, 'general': 271, 'as': 35, 'at': 52, 'stoping': 1022, 'unwere': 1158, 'mistaken': 924, '20': 1139, 'physical': 484, 'complaint': 1075, 'raised': 1127, 'embition': 726, 'no': 125, 'amounts': 1162, 'user': 949, 'when': 32, "you'll": 106, 'tablets': 931, 'occasional': 1164, 'successful': 115, 'badly': 799, 'other': 119, 'role': 1071, 'ambitious': 606, 'becomes': 1166, 'compete': 1182, 'you': 5, 'really': 109, 'acceptable': 857, 'principle': 899, 'happens': 1099, 'media': 1169, 'users': 129, 
'living': 1170, 'models': 607, 'valuate': 608, 'felt': 609, 'problems': 171, 'prepared': 610, "let's": 1171, 'push': 1172, 'chance': 593, 'important': 148, 'variable': 1174, 'product\xe2\x80\x94as': 1175, 'friends': 349, 'crossors': 803, 'began': 1177, 'ago': 442, 'individual': 395, 'faster': 912, 'using': 575, 'scenes': 939, 'invest': 201, 'rule': 1180, 'safe': 1181, 'students': 971, 'situations': 1183, 'time': 95, 'far': 288, "she's": 1184, 'once': 1186, 'convert': 995, 'having': 289, 'hope': 538}

In [135]:
# Word frequencies, most frequent first: the (word, count) pairs are sorted on
# the count.  print(...) is used so the cell also runs on Python 3.
x = tokenizer.word_counts
print(sorted(x.items(), key=operator.itemgetter(1), reverse=True))


[('the', 485), ('to', 484), ('a', 289), ('of', 265), ('you', 216), ('be', 180), ('that', 161), ('in', 149), ('is', 145), ('and', 125), ('have', 119), ("you're", 110), ('they', 105), ('it', 104), ('not', 92), ('but', 83), ('if', 81), ('for', 77), ('get', 70), ('do', 70), ('them', 69), ('startup', 68), ('more', 67), ("don't", 67), ('are', 61), ('people', 59), ('your', 56), ('startups', 55), ('like', 55), ('so', 51), ('will', 51), ('when', 51), ('good', 50), ("it's", 50), ('as', 48), ('just', 47), ('about', 47), ("they're", 47), ('because', 46), ('who', 45), ('on', 45), ('company', 44), ('can', 44), ('money', 44), ('what', 43), ('one', 43), ('investors', 42), ('by', 41), ('make', 40), ('with', 40), ('want', 38), ('at', 37), ('from', 33), ('something', 33), ('going', 32), ('founders', 31), ('than', 31), ('were', 31), ('seem', 31), ('we', 31), ('i', 31), ('even', 30), ('most', 30), ('say', 29), ('much', 29), ('lot', 29), ('know', 28), ('their', 26), ('someone', 26), ('could', 25), ('first', 25), ('been', 25), ('would', 25), ('start', 25), ('was', 25), ('same', 24), ('problem', 24), ('new', 23), ('an', 23), ('all', 22), ('companies', 22), ('there', 22), ('probably', 22), ("can't", 22), ('this', 21), ('idea', 21), ('way', 21), ('different', 21), ('big', 21), ('how', 20), ('doing', 20), ('able', 20), ('which', 20), ('being', 20), ('time', 20), ('need', 19), ('better', 18), ('always', 18), ('hard', 18), ('out', 18), ('programmers', 18), ('only', 18), ('best', 18), ("wouldn't", 18), ('world', 18), ("you'll", 17), ('should', 17), ('or', 17), ('really', 17), ('reason', 16), ('why', 16), ('things', 16), ('sort', 15), ('work', 15), ('successful', 15), ('see', 15), ('sure', 15), ('fundraising', 15), ('other', 15), ("that's", 14), ('thing', 14), ('ideas', 14), ('raise', 14), ('up', 14), ('no', 14), ('may', 13), ('makes', 13), ('into', 13), ('users', 13), ('very', 12), ('large', 12), ('change', 12), ('such', 12), ('years', 12), ('interesting', 12), ('long', 12), ('take', 12), 
('means', 12), ('bad', 12), ('point', 12), ('next', 11), ('learn', 11), ('realize', 11), ('everyone', 11), ('think', 11), ('saying', 11), ('has', 11), ('important', 11), ('working', 10), ('tell', 10), ('try', 10), ('also', 10), ('ones', 10), ('where', 10), ('enough', 10), ('surprisingly', 10), ('some', 10), ('angels', 10), ('few', 9), ('making', 9), ('feel', 9), ('least', 9), ('find', 9), ('decided', 9), ('deals', 9), ('else', 9), ('technology', 9), ('less', 9), ("i've", 9), ('trying', 9), ('problems', 9), ('particularly', 8), ('never', 8), ('experience', 8), ('use', 8), ("didn't", 8), ('sense', 8), ('might', 8), ('now', 8), ('vcs', 8), ("doesn't", 8), ('size', 8), ('any', 8), ('experienced', 8), ('surprising', 8), ('fact', 8), ("they'd", 8), ('future', 8), ('between', 8), ('case', 8), ('look', 8), ('used', 8), ('person', 8), ('organization', 8), ('yet', 8), ('couple', 8), ('super', 8), ('anything', 8), ('biggest', 8), ('right', 8), ('invest', 8), ('looking', 7), ('round', 7), ('market', 7), ('yourself', 7), ('me', 7), ('figure', 7), ('grow', 7), ('consulting', 7), ('already', 7), ('group', 7), ('number', 7), ('rounds', 7), ("they'll", 7), ('model', 7), ("we're", 7), ('succeed', 7), ('well', 7), ('scale', 7), ('down', 7), ('series', 7), ('raising', 7), ('small', 6), ('great', 6), ('kids', 6), ('amount', 6), ('started', 6), ('example', 6), ('end', 6), ("won't", 6), ('then', 6), ('higher', 6), ('list', 6), ('definition', 6), ('type', 6), ("there's", 6), ('too', 6), ('angel', 6), ('sell', 6), ('increasingly', 6), ('college', 6), ('last', 6), ('had', 6), ('seems', 6), ('surprised', 6), ('deal', 6), ('business', 6), ('fast', 6), ('whether', 6), ('us', 6), ('founder', 5), ('existing', 5), ('go', 5), ('seemed', 5), ('decide', 5), ('school', 5), ("you'd", 5), ('power', 5), ('experiment', 5), ('usually', 5), ('merely', 5), ('organizations', 5), ('product', 5), ('writing', 5), ("haven't", 5), ('significantly', 5), ('wanted', 5), ('powerful', 5), ('needed', 5), ('supposed', 
5), ('general', 5), ('him', 5), ('whole', 5), ('reasons', 5), ('life', 5), ('worked', 5), ('running', 5), ('off', 5), ('obviously', 5), ('specific', 5), ('true', 5), ('old', 5), ('own', 5), ('he', 5), ('made', 5), ('convince', 5), ('competitors', 5), ('far', 5), ('having', 5), ('worth', 4), ('sign', 4), ('design', 4), ('profitability', 4), ('learned', 4), ('ever', 4), ('public', 4), ('let', 4), ("weren't", 4), ('control', 4), ('predict', 4), ('stock', 4), ('after', 4), ('produce', 4), ("you've", 4), ('investor', 4), ('gets', 4), ('university', 4), ('reasonable', 4), ('mean', 4), ('programmer', 4), ('since', 4), ("shouldn't", 4), ('internet', 4), ('ask', 4), ('keep', 4), ('done', 4), ('leading', 4), ('offer', 4), ('many', 4), ('nothing', 4), ('later', 4), ('investments', 4), ('valuation', 4), ('explain', 4), ('3', 4), ('pretty', 4), ('his', 4), ('common', 4), ('lots', 4), ('attention', 4), ('extent', 4), ('considering', 4), ('second', 4), ('growth', 4), ('expected', 4), ('board', 4), ('understand', 4), ('mistake', 4), ('several', 4), ('two', 4), ('solve', 4), ('valuable', 4), ('4', 4), ('process', 4), ('convincing', 4), ('plus', 4), ('buy', 4), ('called', 4), ('friends', 4), ('essay', 3), ('conversation', 3), ('children', 3), ('device', 3), ('pay', 3), ('advantage', 3), ('solved', 3), ('refer', 3), ('says', 3), ('dealing', 3), ('degree', 3), ('groups', 3), ('meet', 3), ('path', 3), ("aren't", 3), ('credentials', 3), ('completely', 3), ('prove', 3), ('cases', 3), ('my', 3), ('heard', 3), ('goal', 3), ('wrong', 3), ('remember', 3), ('conscious', 3), ('formidable', 3), ('every', 3), ('talk', 3), ('help', 3), ('still', 3), ('each', 3), ('side', 3), ('subset', 3), ('year', 3), ('assumption', 3), ('little', 3), ('free', 3), ('care', 3), ('launch', 3), ('programming', 3), ('consequence', 3), ('precisely', 3), ('response', 3), ('2', 3), ('realized', 3), ('individual', 3), ('deciding', 3), ('talking', 3), ('seen', 3), ('performance', 3), ('unless', 3), ('notice', 3), 
('hire', 3), ('median', 3), ('dangerous', 3), ('fixed', 3), ('generally', 3), ('tend', 3), ('profitable', 3), ('simply', 3), ('learning', 3), ('deliberately', 3), ('both', 3), ('harder', 3), ('n', 3), ('1', 3), ('speaker', 3), ('effect', 3), ('times', 3), ('driven', 3), ('early', 3), ('spend', 3), ('schools', 3), ('spread', 3), ("we've", 3), ('except', 3), ('day', 3), ('read', 3), ('helpful', 3), ('often', 3), ('choosing', 3), ('gradually', 3), ('everything', 3), ('schedule', 3), ('step', 3), ('months', 3), ('initial', 3), ('low', 3), ('form', 3), ('hear', 3), ('customers', 3), ('site', 3), ('ago', 3), ('leads', 2), ('giving', 2), ('resourceful', 2), ('extra', 2), ("startup's", 2), ('technique', 2), ('asking', 2), ('present', 2), ('discovering', 2), ('town', 2), ('telling', 2), ('impressive', 2), ('presented', 2), ('turns', 2), ('said', 2), ('smaller', 2), ('rate', 2), ('cost', 2), ('investing', 2), ('told', 2), ("i'm", 2), ('danger', 2), ('prefer', 2), ('wealth', 2), ("world's", 2), ('iphone', 2), ('6', 2), ('survive', 2), ('attractive', 2), ('advance', 2), ('known', 2), ('revenues', 2), ('pick', 2), ('crazy', 2), ('high', 2), ('needs', 2), ('answer', 2), ('instead', 2), ('applications', 2), ('designed', 2), ('refuse', 2), ('physical', 2), ('increase', 2), ('order', 2), ('top', 2), ('main', 2), ('yc', 2), ('practice', 2), ('course', 2), ('bigger', 2), ('term', 2), ('sufficiently', 2), ("isn't", 2), ('domain', 2), ('books', 2), ('expect', 2), ('sounds', 2), ('content', 2), ('she', 2), ('million', 2), ('complicated', 2), ('york', 2), ("people's", 2), ('revenue', 2), ('production', 2), ('another', 2), ('message', 2), ('likely', 2), ('friend', 2), ('selling', 2), ('exactly', 2), ('rejected', 2), ('part', 2), ('10', 2), ('kind', 2), ('perfectly', 2), ('unfortunately', 2), ('disciplined', 2), ('gotten', 2), ("company's", 2), ('feeling', 2), ('mind', 2), ('zero', 2), ('refute', 2), ("who've", 2), ('price', 2), ('regular', 2), ('position', 2), ('labels', 2), ('recipe', 
2), ('discovered', 2), ('partners', 2), ('winner', 2), ('buying', 2), ('expensively', 2), ('hope', 2), ('stop', 2), ('undergrad', 2), ('stuff', 2), ('yes', 2), ('subject', 2), ('asks', 2), ('correctly', 2), ('without', 2), ('recently', 2), ('missing', 2), ('screen', 2), ('improve', 2), ('beliefs', 2), ('share', 2), ('degeneral', 2), ('article', 2), ('actually', 2), ('three', 2), ('quickly', 2), ('confident', 2), ('treat', 2), ('judging', 2), ('meeting', 2), ('partly', 2), ('turned', 2), ('schlep', 2), ('single', 2), ('smart', 2), ('country', 2), ('property', 2), ('partner', 2), ('worry', 2), ('disagree', 2), ('moving', 2), ('opportunity', 2), ('depending', 2), ('using', 2), ('previous', 2), ('web', 2), ('supply', 2), ('source', 2), ('interested', 2), ('parents', 2), ("who's", 2), ('disagreement', 2), ('possible', 2), ('become', 2), ('works', 2), ('ignore', 2), ('before', 2), ('conservative', 2), ('run', 2), ('getting', 2), ('industry', 2), ('chance', 2), ('software', 2), ('google', 2), ('factor', 2), ('computer', 2), ('question', 2), ('class', 2), ('valley', 2), ('building', 2), ('describe', 2), ('our', 2), ('demoralized', 2), ('field', 2), ('ambitious', 2), ('models', 2), ('valuate', 2), ('felt', 2), ('prepared', 2), ('limited', 1), ('impression', 1), ('caused', 1), ('results', 1), ('deliveries', 1), ('decisions', 1), ('depend', 1), ('effort', 1), ('young', 1), ('collogic', 1), ('determinations', 1), ('under', 1), ('disapproved', 1), ('fatal', 1), ('backwant', 1), ('risk', 1), ('permissive', 1), ('implicitly', 1), ('differently', 1), ('focus', 1), ('facebook', 1), ('advised', 1), ('trouble', 1), ('bottleneck', 1), ('ramen', 1), ('service', 1), ("they've", 1), ('relentlessly', 1), ('standards', 1), ('employees', 1), ('piggest', 1), ('leonardo', 1), ('prevent', 1), ('failing', 1), ('trend', 1), ('among', 1), ('direct', 1), ('past', 1), ('battle', 1), ('uncomfortable', 1), ('choices', 1), ('odd', 1), ('proposes', 1), ('sum', 1), ('drifting', 1), ('networks', 1), 
('current', 1), ('credential', 1), ('x', 1), ('capital', 1), ('increasing', 1), ('niche', 1), ('nonmoly', 1), ('component', 1), ('answers', 1), ('delivering', 1), ('protection', 1), ('suspectively', 1), ("hadn't", 1), ('along', 1), ('implicit', 1), ('extreme', 1), ('incompeted', 1), ('sending', 1), ('shift', 1), ('study', 1), ('changed', 1), ('funded', 1), ('outding', 1), ('through', 1), ('paris', 1), ("i'd", 1), ('products', 1), ('internet\xe2\x80\x94the', 1), ('social', 1), ('opinion', 1), ('changes', 1), ('genuinely', 1), ('hub\xe2\x80\x94committing', 1), ('love', 1), ('apple', 1), ('search', 1), ('hurton', 1), ('put', 1), ('acception', 1), ('uncompanies', 1), ('trained', 1), ('fake', 1), ('frontpage', 1), ('smoth', 1), ('sacche', 1), ('error', 1), ('live', 1), ('explicitly', 1), ('call', 1), ('stage', 1), ('markets', 1), ('optimizing', 1), ('rich', 1), ('organized', 1), ('freedom', 1), ('yahoo', 1), ('industrial', 1), ('leaders', 1), ('given', 1), ('producing', 1), ('must', 1), ('none', 1), ('word', 1), ('language', 1), ('hour', 1), ('science', 1), ('radom', 1), ('tv', 1), ('huge', 1), ('embition', 1), ('obvious', 1), ('exceptions', 1), ('complain', 1), ('adplice', 1), ('disuselves', 1), ('doal', 1), ('limitate', 1), ('accept', 1), ('critic', 1), ('sharp', 1), ('purposed', 1), ('honest', 1), ('simple', 1), ('worse', 1), ('productive', 1), ('turn', 1), ('respected', 1), ('purpose', 1), ('everydo', 1), ('write', 1), ('holdless', 1), ('hot', 1), ('winners', 1), ('behave', 1), ('opposite', 1), ('ordinary', 1), ('explains', 1), ('spoy', 1), ('information', 1), ('watch', 1), ('mexist', 1), ('lucky', 1), ('curiosity', 1), ('choice', 1), ('laptors', 1), ('undergrads', 1), ('succeeded', 1), ('maybe', 1), ('desperation', 1), ('preced', 1), ('suggest', 1), ('excepted', 1), ('numbers', 1), ('gradual', 1), ('over', 1), ('vary', 1), ('cook', 1), ('mentioned', 1), ('segreated', 1), ('move', 1), ('bunch', 1), ('admirably', 1), ('delay', 1), ('degreed', 1), ('fix', 1), 
('decaded', 1), ('borned', 1), ('willing', 1), ('flacked', 1), ('outside', 1), ('return', 1), ('food', 1), ('existences', 1), ("we'll", 1), ('half', 1), ('failed', 1), ('productions', 1), ('mode', 1), ('found', 1), ('went', 1), ('recommend', 1), ('financial', 1), ('badly', 1), ('reduce', 1), ('senment', 1), ('carry', 1), ('crossors', 1), ('worried', 1), ('tenth', 1), ('hacker', 1), ('100k', 1), ('weakness', 1), ('factors', 1), ('famous', 1), ('seriously', 1), ('colleges', 1), ('quite', 1), ('extraordinary', 1), ('beginning', 1), ('airborning', 1), ('hardware', 1), ('organic', 1), ('1980', 1), ('g', 1), ('starts', 1), ('similarly', 1), ('moment', 1), ('silicon', 1), ('american', 1), ('explored', 1), ('place', 1), ('threat', 1), ('suve', 1), ('ability', 1), ('seeming', 1), ('third', 1), ('lumping', 1), ('brutally', 1), ('set', 1), ('vote', 1), ('impossible', 1), ('open', 1), ('guess', 1), ('religions', 1), ('mooney', 1), ('anyone', 1), ('returns', 1), ('confliction', 1), ('developer', 1), ('legally', 1), ('percentage', 1), ('hundred', 1), ('acquisitions', 1), ('rest', 1), ('discover', 1), ('acquirers', 1), ('zire', 1), ('fall', 1), ('sex', 1), ('convinced', 1), ('acceptable', 1), ('jobs', 1), ('reacted', 1), ('roberts', 1), ('matter', 1), ('acquisition', 1), ('result', 1), ('mieways', 1), ('developers', 1), ('false', 1), ('slowment', 1), ('argument', 1), ('conspiracy', 1), ('unlikely', 1), ('sentences', 1), ('regulate', 1), ('saw', 1), ('lie', 1), ('speakers', 1), ('experted', 1), ('offering', 1), ('schleps', 1), ('willful', 1), ('brand', 1), ('preventous', 1), ('universities', 1), ('committing', 1), ('though', 1), ('techniques', 1), ('compare', 1), ('significant', 1), ('carculal', 1), ('decision', 1), ('extremely', 1), ('stupid', 1), ('considered', 1), ('clear', 1), ('stronger', 1), ('drive', 1), ('traditional', 1), ('consequenced', 1), ('points', 1), ('principle', 1), ('movie', 1), ('professional', 1), ('painting', 1), ('away', 1), ('precise', 1), ('shot', 1), 
('reasons\xe2\x80\x94when', 1), ('meetings', 1), ('anyway', 1), ('bring', 1), ('businesses', 1), ('solutions', 1), ('faster', 1), ('subtle', 1), ('time\xe2\x80\x94what', 1), ('evolving', 1), ('slow', 1), ('knowledge', 1), ('values', 1), ("investors'", 1), ('destribe', 1), ('air', 1), ('cheaper', 1), ('alimination', 1), ('mistaken', 1), ('hif', 1), ('treed', 1), ('example\xe2\x80\x94quickly', 1), ('processes', 1), ('brain', 1), ('method', 1), ('tablets', 1), ('steam', 1), ('consciously', 1), ('supplys', 1), ('syrelicon', 1), ('humans', 1), ('reddit', 1), ('around', 1), ('scenes', 1), ('relative', 1), ('checks', 1), ('desperate', 1), ('fail', 1), ('terms', 1), ('afflicted', 1), ('project', 1), ('knowing', 1), ('pattern', 1), ('user', 1), ('currently', 1), ('sites', 1), ('reading', 1), ('approach', 1), ('jided', 1), ('choose', 1), ('responses', 1), ('unconscious', 1), ('initially', 1), ('deliver', 1), ('barbershop', 1), ('job', 1), ('news', 1), ('intelligent', 1), ('come', 1), ('restrictions', 1), ('figured', 1), ('successes', 1), ('strange', 1), ('minds', 1), ('sensitive', 1), ('students', 1), ('expense', 1), ('trajectory', 1), ('negotiation', 1), ('sentenced', 1), ('key', 1), ('hackers', 1), ('forgot', 1), ('whatever', 1), ('alone', 1), ('problem\xe2\x80\x94people', 1), ('100', 1), ('expensive', 1), ('late', 1), ('unusual', 1), ('described', 1), ('create', 1), ('grasped', 1), ('cared', 1), ('difference', 1), ('interest', 1), ('wants', 1), ('valuations', 1), ("what's", 1), ('convert', 1), ('fund', 1), ('engage', 1), ('careful', 1), ('seed', 1), ('those', 1), ('granted', 1), ('prospect', 1), ('replaces', 1), ('unlike', 1), ('launches', 1), ('straight', 1), ('value', 1), ('responding', 1), ('while', 1), ('substantial', 1), ('accidently', 1), ('hubs', 1), ('situation', 1), ('primise', 1), ('taking', 1), ('launched', 1), ('force', 1), ('vc', 1), ('eventually', 1), ('almost', 1), ('demo', 1), ('stoping', 1), ('expenses', 1), ('lust', 1), ('brutal', 1), ('ready', 1), 
('becoming', 1), ('denial', 1), ("factor's", 1), ('perhaps', 1), ('clearly', 1), ('complex', 1), ('ares', 1), ('conversations', 1), ('encourage', 1), ('advise', 1), ('european', 1), ('development', 1), ('imply', 1), ('published', 1), ('presing', 1), ('student', 1), ('changing', 1), ('expand', 1), ('recent', 1), ('lower', 1), ('subsets', 1), ('client', 1), ('components', 1), ('alarms', 1), ('corporate', 1), ('dramatical', 1), ('left', 1), ('bending', 1), ("couldn't", 1), ('responsible', 1), ('stores', 1), ('removed', 1), ('presumably', 1), ('costing', 1), ('kill', 1), ('useful', 1), ('discipline', 1), ('optimization', 1), ('thinking', 1), ('implications\xe2\x80\x94that', 1), ('thousand', 1), ('verse', 1), ('easy', 1), ('plenty', 1), ('role', 1), ('app', 1), ('useless', 1), ('real', 1), ('complaint', 1), ('optimizations', 1), ('disaspect', 1), ('depends', 1), ('judgement', 1), ('traffic', 1), ('average', 1), ("who'd", 1), ('projects', 1), ('promising', 1), ('55', 1), ('suppliers', 1), ('50', 1), ('acquirer', 1), ('radiar', 1), ('happened', 1), ('signal', 1), ('absolutely', 1), ('flatter', 1), ('scared', 1), ('back', 1), ('story', 1), ('multiple', 1), ('scares', 1), ('happens', 1), ('describing', 1), ('avoid', 1), ('normal', 1), ('audience', 1), ('comments', 1), ('does', 1), ('tends', 1), ('evidences', 1), ('kleries', 1), ('straightforward', 1), ('people\xe2\x80\x94in', 1), ('cluelessly', 1), ('truth', 1), ('confidence', 1), ('failure\xe2\x80\x94why', 1), ('technology\xe2\x80\x94probably', 1), ('broken', 1), ('shifting', 1), ('losing', 1), ('clumber', 1), ('exciting', 1), ('conceal', 1), ('o', 1), ('linkable', 1), ('west', 1), ('efforts', 1), ('slightly', 1), ('raised', 1), ('burning', 1), ('wasting', 1), ('predictions', 1), ('commitment', 1), ('due', 1), ('phase', 1), ('strip', 1), ('batch', 1), ('1985', 1), ('prepare', 1), ('distinctions', 1), ('20', 1), ('describes', 1), ('launching', 1), ('function', 1), ('complete', 1), ('sound', 1), ('ways', 1), ('whould', 1), 
('palt', 1), ('failure', 1), ('survived', 1), ('persisting', 1), ('until', 1), ('critical', 1), ('stages', 1), ('nature', 1), ('fractions', 1), ('adults', 1), ('store', 1), ('unwere', 1), ('face', 1), ('avoided', 1), ('inborn', 1), ('amounts', 1), ('whereas', 1), ('occasional', 1), ('5', 1), ('becomes', 1), ('shows', 1), ('nice', 1), ('media', 1), ('living', 1), ("let's", 1), ('push', 1), ('meaning', 1), ('variable', 1), ('product\xe2\x80\x94as', 1), ('structure', 1), ('began', 1), ('e', 1), ('lead', 1), ('rule', 1), ('safe', 1), ('compete', 1), ('situations', 1), ("she's", 1), ('dismission', 1), ('once', 1)]

In [192]:
# Build the skip-gram model: a Sequential container holding a single
# WordContextProduct layer over a vocabulary of `max_features` word ids,
# with `dim_proj`-dimensional embeddings initialised uniformly.
print('Build model...')
model = Sequential()
model.add(WordContextProduct(max_features, proj_dim=dim_proj, init="uniform"))
# Mean-squared error against the 0/1 skip-gram labels, optimised with RMSprop.
model.compile(loss='mse', optimizer='rmsprop')


Build model...

In [193]:
# Table handed to sequence.skipgrams in the training loop (as its
# `sampling_table` argument), sized to the `max_features` vocabulary.
# Presumably per-word-rank sampling probabilities that down-weight very
# frequent words -- confirm against the Keras documentation.
sampling_table = sequence.make_sampling_table(max_features)

In [194]:
# Train the skip-gram model for `nb_epoch` passes over the corpus, performing
# one gradient update per sentence.
for e in range(nb_epoch):
    # Each message is formatted into a single string: this notebook runs under
    # the Python 2 print statement, where print('Epoch', e) prints a tuple.
    print('-' * 40)
    print('Epoch %d' % e)
    print('-' * 40)

    progbar = generic_utils.Progbar(tokenizer.document_count)
    samples_seen = 0
    losses = []  # losses accumulated since the last progress-bar refresh

    for i, seq in enumerate(tokenizer.texts_to_sequences_generator(text_generator())):
        # get skipgram couples for one text in the dataset
        couples, labels = sequence.skipgrams(seq, max_features, window_size=4, negative_samples=1., sampling_table=sampling_table)
        if not couples:
            # the sentence produced no (word, context) pairs: nothing to train on
            continue

        # one gradient update per sentence
        X = np.array(couples, dtype="int32")
        loss = model.train(X, labels)
        losses.append(loss)
        samples_seen += len(labels)

        # refresh the bar every 100 updates with the mean loss of that window
        if len(losses) % 100 == 0:
            progbar.update(i, values=[("loss", np.mean(losses))])
            losses = []

    # Drive the bar to its target so that the losses of the last (partial)
    # window are reported and the bar's line is terminated before the summary.
    final_values = [("loss", np.mean(losses))] if losses else []
    progbar.update(tokenizer.document_count, values=final_values)
    print('Samples seen: %d' % samples_seen)
print("Training completed!")


----------------------------------------
('Epoch', 0)
----------------------------------------
281/351 [=======================>......] - ETA: 0s - loss: 0.2500('Samples seen:', 8154)
----------------------------------------
('Epoch', 1)
----------------------------------------
285/351 [=======================>......] - ETA: 0s - loss: 0.2500('Samples seen:', 8254)
----------------------------------------
('Epoch', 2)
----------------------------------------
259/351 [=====================>........] - ETA: 0s - loss: 0.2500('Samples seen:', 8356)
----------------------------------------
('Epoch', 3)
----------------------------------------
263/351 [=====================>........] - ETA: 0s - loss: 0.2500('Samples seen:', 8660)
----------------------------------------
('Epoch', 4)
----------------------------------------
284/351 [=======================>......] - ETA: 0s - loss: 0.2499('Samples seen:', 8088)
----------------------------------------
('Epoch', 5)
----------------------------------------
278/351 [======================>.......] - ETA: 0s - loss: 0.2499('Samples seen:', 8070)
----------------------------------------
('Epoch', 6)
----------------------------------------
283/351 [=======================>......] - ETA: 0s - loss: 0.2499('Samples seen:', 7952)
----------------------------------------
('Epoch', 7)
----------------------------------------
267/351 [=====================>........] - ETA: 0s - loss: 0.2499('Samples seen:', 8026)
----------------------------------------
('Epoch', 8)
----------------------------------------
275/351 [======================>.......] - ETA: 0s - loss: 0.2498('Samples seen:', 8412)
----------------------------------------
('Epoch', 9)
----------------------------------------
254/351 [====================>.........] - ETA: 0s - loss: 0.2498('Samples seen:', 9016)
Training completed!

In [195]:
# recover the embedding weights trained with skipgram:
# layers[0] is the WordContextProduct layer (the only layer added to the
# model); index 0 selects the first of its weight arrays -- presumably the
# word (pivot) embedding matrix rather than the context one; confirm against
# the Keras source.
weights = model.layers[0].get_weights()[0]

In [196]:
# Expected shape: (max_features, dim_proj), i.e. (1000, 35) with the values
# set in the configuration cell.
weights.shape


Out[196]:
(1000, 35)

In [ ]:
# The embedding matrix has already been extracted into `weights`, so the
# model object is no longer needed by the similarity queries that follow.
del model

In [197]:
# Optional: zero the rows of the `skip_top` most frequent words (left disabled).
# weights[:skip_top] = np.zeros((skip_top, dim_proj))

# Normalise the embedding rows so that the dot products computed by
# closest_to_point act as similarity scores (in the recorded query output
# every word scores ~1.0 against itself, consistent with unit-length rows).
norm_weights = np_utils.normalize(weights)

# word -> index mapping from the fitted tokenizer, and its inverse
# (index -> word) for turning embedding row numbers back into words.
word_index = tokenizer.word_index
reverse_word_index = {index: word for word, index in word_index.items()}

def embed_word(w):
    """Return the row of `norm_weights` for word `w`, or None.

    None is returned when `w` is absent from `word_index`, when its index is
    0, or when the index lies outside the range [skip_top, max_features).
    """
    idx = word_index.get(w)
    if idx and skip_top <= idx < max_features:
        return norm_weights[idx]
    return None

def closest_to_point(point, nb_closest=10):
    """Return the `nb_closest` rows of `norm_weights` scoring highest against `point`.

    The score of a row is its dot product with `point`. The result is a list
    of (word, score) tuples in descending score order; the word is looked up
    in `reverse_word_index` and is None for row numbers that have no entry.
    """
    scores = np.dot(norm_weights, point)
    # stable sort, highest score first (ties keep ascending row order)
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    return [(reverse_word_index.get(row), score) for row, score in ranked[:nb_closest]]

def closest_to_word(w, nb_closest=10):
    """Return the `nb_closest` nearest (word, score) neighbours of word `w`.

    An empty list is returned when `w` is absent from `word_index`, when its
    index is 0, or when the index lies outside [skip_top, max_features).
    Otherwise the word's row of `norm_weights` is passed to closest_to_point.
    """
    idx = word_index.get(w)
    usable = bool(idx) and skip_top <= idx < max_features
    if not usable:
        return []
    return closest_to_point(norm_weights[idx].T, nb_closest)

In [198]:
# Sanity check: normalisation should leave the (max_features, dim_proj) shape unchanged.
norm_weights.shape


Out[198]:
(1000, 35)

In [201]:
# Show the nearest neighbours (by embedding dot product) of a few probe words.
words = ["first", "someone", "very"]

for w in words:
    res = closest_to_word(w)
    # Single formatted string: under the Python 2 print statement used by this
    # notebook, print('====', w) would print the tuple ('====', w).
    print('==== %s' % w)
    for r in res:
        # r is a (word, score) tuple
        print(r)


('====', 'first')
('first', 0.99999999999999978)
('seriously', 0.49790643756829794)
('an', 0.49406799268412138)
('seed', 0.47541329321783088)
('previous', 0.4513666896799422)
('backwant', 0.43745369720771504)
("who's", 0.43460943539188535)
('under', 0.42033294106264929)
('extraordinary', 0.41741061676681934)
('syrelicon', 0.40209655048546522)
('====', 'someone')
('someone', 1.0000000000000002)
("can't", 0.47500323676311418)
('say', 0.46329861592896882)
('new', 0.45458308748423015)
('next', 0.39691060199573591)
('raise', 0.39534702898483132)
('advance', 0.38937610117978871)
('partly', 0.37741839592987986)
('founder', 0.37450721730308095)
('term', 0.37288162180278317)
('====', 'very')
('very', 0.99999999999999978)
('answer', 0.48319412652317617)
('future', 0.43628154602186681)
('goal', 0.43514353022550423)
('roberts', 0.43177315436033947)
('went', 0.42850009557969904)
('existences', 0.41884230963944535)
('offering', 0.41849965587078797)
('slow', 0.41655929049425955)
('convincing', 0.40552226659025647)

In [188]:
# Number of distinct words the tokenizer recorded. This can exceed
# max_features: embed_word/closest_to_word reject indices >= max_features,
# so such words have no usable embedding row.
len(word_index)


Out[188]:
1186