Importing required packages


In [1]:
from __future__ import print_function

import json
import os
import numpy as np
import sys
import h5py

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.layers import Embedding
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM, Bidirectional
from keras.preprocessing import sequence
from intersect_embeddings import Embeddings
from keras.callbacks import ModelCheckpoint

from nltk.tokenize import word_tokenize
import random
from itertools import groupby


Using TensorFlow backend.

Instantiate Embeddings


In [2]:
# Build the project-local Embeddings helper that the following cells query for the
# word2vec model, the vocabulary and the index-encoded sentences.
# NOTE(review): the four positional arguments are not named here. 300 presumably is the
# embedding dimensionality (the weight matrix shown later is 300 wide); confirm the
# meaning of all four against intersect_embeddings.Embeddings.
embeddings = Embeddings(300, 4, 1, 4)

Getting data from preprocessing


In [3]:
# Fetch the word2vec model, the two vocabulary lookups and the index-encoded sentences
# from the Embeddings helper.
word2vec_model = embeddings.get_intersected_model()
word2index, index2word = embeddings.get_vocabulary()
# Raw embedding matrix of the word2vec model.
# NOTE(review): `wv.syn0` is the legacy gensim attribute name (newer releases call it
# `wv.vectors`) — confirm against the gensim version this notebook is pinned to.
word2vec_weights = word2vec_model.wv.syn0
tokenized_indexed_sentences = embeddings.get_indexed_sentences()


Loading Indexed Sentences...

In [4]:
# Shift every vocabulary index up by one so that index 0 is not used by any real word;
# later cells put an all-zero row at position 0 of the embedding matrix and mask it.
# NOTE: this cell is not idempotent — running it twice shifts the indices twice.
word2index = dict((word, index + 1) for word, index in word2index.items())
# Inverse lookup, rebuilt from the already shifted mapping.
index2word = dict((index, word) for word, index in word2index.items())

In [5]:
# Inspect the shifted word -> index mapping (the notebook truncates the displayed dict).
word2index


Out[5]:
{'bmi': 33763,
 "'the_warriors'_movie": 47826,
 'kaddu': 38007,
 'aston_villa': 26424,
 'alignment': 9963,
 'old_frisian': 41768,
 'the_first_few_years': 30269,
 'tariah': 33355,
 'undeveloped': 33266,
 'temur_khan': 31423,
 'universal_postal_union': 33915,
 'the_admiral_of': 26364,
 'mid-1980': 35582,
 'b-plug': 43613,
 'the_new_york_cosmos': 29231,
 'sexually': 9178,
 'niet': 33854,
 'the_greatest_uk_science_fiction_series_ever': 28901,
 'avoided': 3926,
 'march_26': 30685,
 'tai-lo': 48160,
 'southern_hills_country_club': 36418,
 'pinkertons': 48061,
 'exile': 6159,
 'mau': 5766,
 'conventa': 36450,
 'experimenters': 30998,
 'hurricane': 6017,
 'novo': 17718,
 'conetemporary': 25138,
 'house_of_dereon': 28979,
 'ora': 50508,
 'five_million': 29109,
 'the_fourt_macedonian_war': 27873,
 'pope_pius_xxiii': 34632,
 '1917-1920': 43813,
 'tom_kenny': 34129,
 'lemon_v_kurtzman': 19837,
 'video-conference': 10772,
 'conquistador': 15291,
 'pillar': 21269,
 'zinke': 20228,
 'prabhakara': 11285,
 'exemption': 5845,
 'diversified': 14193,
 '1570': 52500,
 'pure_theory_of_capital': 46766,
 'fallen': 5158,
 'wi-fi': 11507,
 'stanley': 17937,
 'teh': 8490,
 'langgasse': 40335,
 'faint': 12678,
 'the_hall_of_the_mountain_king': 44714,
 'murray_gold': 52189,
 'fervor': 50009,
 'francis_marcus_say': 39875,
 'bombing': 1818,
 'the_barbican_area': 30132,
 'grundgesetz': 49262,
 'stacked': 50759,
 'enclaves': 36425,
 'describes': 1083,
 'burgundians': 37458,
 'transverse': 12694,
 'vetoed': 17279,
 'run-off': 45158,
 'non-governmental': 17105,
 'amc': 14580,
 'recessions': 52306,
 'uraniumvi': 13764,
 'waiting': 9871,
 'gandhi_smriti': 41958,
 'contemptible': 29976,
 'buffa': 40519,
 'square-shaped': 25622,
 'survivied': 47734,
 'nearly_half': 23486,
 'johnathon_israel': 30199,
 'ihdi': 5973,
 'greek_cypriots': 18177,
 'communicates': 41376,
 'northern_greece': 30943,
 'rococo': 16784,
 'diabetics': 26377,
 'heaquarter': 27753,
 'soldiers': 1227,
 'falls': 4629,
 'maltose': 36451,
 'hyatt': 33721,
 'personality': 9261,
 're-introduction': 17200,
 'the_college_of_minor_canons': 30024,
 'reprised': 47343,
 'licchavi-era': 43322,
 'marlene_dietrich': 51518,
 'wls': 33838,
 'tashrih_al-badan': 33874,
 'polygynous': 19586,
 'merlin': 39215,
 'doctrines': 5454,
 'cabooses': 32070,
 'von_nemann': 18032,
 'the_deutscher_werkbund': 21329,
 'arminian': 36580,
 'dissidents': 40165,
 'energy-efficient': 17465,
 'spending': 3063,
 'straight': 7837,
 'sterilization': 43962,
 'metz': 13183,
 'issied': 30357,
 'co-led': 20099,
 'the_british_architectural_library': 42744,
 'glassworkers': 50879,
 'sunk': 5676,
 'iron': 2016,
 'pidgins': 37975,
 'cover': 902,
 'the_jerusalem_center': 20070,
 '1809': 18214,
 'the_city_of_god': 19986,
 'darkness': 12430,
 'many_miles': 8746,
 'sedition': 37684,
 'the_eastern_army': 26513,
 'der': 8365,
 'recorders': 22099,
 'solstice': 26529,
 'crucification': 39381,
 'ama': 49702,
 'tonian': 23120,
 'federal_constitution': 27114,
 'germanisation': 28156,
 'abbaye': 45918,
 'broiler': 13454,
 'dick_dale': 35623,
 'orson_welles': 48647,
 'catgories': 26159,
 'rivers': 2168,
 'elexcon': 20564,
 'enterprise': 7047,
 'january_1991': 34351,
 '1580': 34875,
 'hiller': 23923,
 'abnormalities': 26596,
 'palace_of_westminster': 15031,
 '15-1_in': 33366,
 'returns': 22974,
 'tb': 1364,
 'footballer': 7961,
 'firefox': 10382,
 'destry': 21714,
 'week_six_of_the_regular_season': 41273,
 'fish': 2937,
 'mill': 5543,
 'tubman': 47312,
 'robert_brown': 25995,
 'gses': 23760,
 'investor': 6463,
 'francis_bacon': 13815,
 'appropriately': 38464,
 'miles_per_hour': 26884,
 'takers': 40971,
 'hmadi': 37785,
 'super': 2500,
 'reoccupy': 50484,
 'millbay': 50500,
 'rite': 29512,
 'tintin': 13023,
 'consistently': 15509,
 'the_third_dynasty': 22365,
 'granparents': 27399,
 'programmable': 11228,
 'malarial': 25275,
 'the_metropolitan_police_service': 31914,
 'open_world_taekwondo_challenge': 24965,
 "'yoh-hu": 47102,
 'the_canonbury_tavern': 26541,
 'quit': 7025,
 'independence_day': 20551,
 'quotes': 35738,
 'forums': 41040,
 'the_decade_of_the_brain': 37731,
 'low-end': 23013,
 'the_times_atlas': 30549,
 'nixon': 7964,
 'dholak': 51734,
 'thousand_days': 14758,
 'turnover': 9753,
 'op_10': 26224,
 'statement': 1717,
 'bedrock': 50512,
 'psychiatrists': 30001,
 'reciprocity': 12657,
 'post-translational': 31078,
 'roman_god': 50470,
 'alexis_de_tocqueville': 19037,
 'the_middle_way': 23415,
 'habit': 13785,
 'the_national_philatelic_museum': 44255,
 'sheptycki': 11074,
 'interpretated': 52658,
 'evita': 12049,
 'regualtions': 47786,
 'preachers': 5882,
 'museo_tamayo': 51547,
 'lantern': 9984,
 'jointly': 11269,
 'the_us_congress': 46604,
 'spokesman': 11181,
 'abu_sufyan': 14025,
 'plymouth_gin': 45007,
 'november_30_1853': 44813,
 'abrasive': 50177,
 'choosing': 12988,
 'glycemic': 15340,
 'ranger': 14109,
 'distant': 10906,
 "'vision": 28720,
 'the_santa_monica_junior_college': 35025,
 'static': 6406,
 'vytautas': 48673,
 'stain': 7720,
 'afghanistan': 3294,
 'middle_english': 18780,
 'james_keith': 40865,
 'chair': 3514,
 'quintessentially': 41878,
 'royal_prussia': 45246,
 'gradute': 30097,
 'the_south_saharan': 45813,
 'yoroi': 41122,
 'emboldened': 48690,
 'dichotomous': 35658,
 'isomorphism': 12974,
 'zayed_university': 27453,
 'warship': 8760,
 'pro-tibetan': 48971,
 'mahatma_gandhi': 14641,
 'numbe': 47738,
 'orbit': 2897,
 'benign': 47593,
 'the_coldest_month': 46437,
 'permanently': 5378,
 'thorn_ey': 27647,
 'the_french_truck': 38573,
 'between_1927-1934': 29834,
 'osborne': 13278,
 'march_26_2013': 17650,
 'downfall': 5822,
 'olympic_torch_relay': 29917,
 'speciation': 48529,
 'rice_university': 42740,
 'slowing': 21005,
 'new_york_botanical_gardens': 37620,
 'donna_noble': 28553,
 'inspection': 7428,
 'protons': 13222,
 '1774': 11716,
 'incentives': 6842,
 'cartoonish': 36775,
 'interjection': 43811,
 'ethnographic': 24253,
 'dennis_ferrer': 42543,
 'analyzers': 14122,
 'kim_il-sung': 11138,
 'the_jin_dynasty_end': 45162,
 "'the_world_in_grey": 15293,
 'djoser': 25564,
 'contrabassoon': 29082,
 'privileged': 9759,
 'investigation': 3066,
 'the_african_lodge': 23199,
 'the_medical_research_council': 38816,
 'the_medal_of_the_order_of': 32246,
 'uziece': 48200,
 'august': 4613,
 'sarvastivada': 34237,
 'sharks': 18950,
 'rudi': 28168,
 'words': 828,
 'contributor': 7721,
 '7bit': 25829,
 'spielbergo': 30884,
 'censured': 14162,
 'bower': 13364,
 'agriculture': 1912,
 'reuben_greenberg': 41295,
 'the_windows_manager': 28640,
 'six_days': 18148,
 'adhesive': 21856,
 'baptised': 20069,
 'the_city_of_london': 6805,
 'renaissance': 2534,
 'recievers': 46323,
 'between_june_2007': 24423,
 "'melting": 47655,
 'diamondback': 37719,
 'goespel': 47246,
 'nullification': 23276,
 'assault': 5710,
 'during': 43,
 'microprocessors': 37074,
 'fort_duquesne': 11386,
 'arians': 6422,
 'digestive': 7748,
 'ins': 20396,
 'the_labout_party': 22885,
 'beidou_navigation_satellite_system': 20600,
 'waterfalls': 31151,
 'knell': 49814,
 'bombardments': 26190,
 'the_canadian_armed_forced': 31047,
 'waitangi': 37114,
 'rodriguez': 38400,
 'shenouda': 38433,
 'building': 219,
 'poverty': 3174,
 'intruz': 33287,
 'replace': 1323,
 'relationships': 2274,
 'firefighters': 4612,
 'frozen': 9455,
 '30000': 20794,
 'legislatures': 21157,
 'the_treaty_of_kyakhta': 34520,
 'the_trinity_church': 30755,
 'shia': 4090,
 'montgelas': 45924,
 'movements': 2245,
 'steamer': 24765,
 'recirculating': 37453,
 'revolutionized': 12460,
 'vestiges': 13404,
 'the_paris_basin': 42203,
 'wills': 45490,
 '23_april_1775': 41872,
 'the_west_bank': 17427,
 'bharatpur': 43744,
 'uthman': 20348,
 'tighten': 16921,
 'crowded': 25485,
 'harassed': 35942,
 'corruption': 1203,
 'a_treatise_on_algebra': 50650,
 'cortisol': 19180,
 'isolating': 20796,
 'the_fertile_crescent': 13486,
 'the_international_photography_hall_of_fame': 45064,
 'wells': 12592,
 'silent': 11795,
 'euorpeans': 39120,
 'comes': 1433,
 'datnet': 32211,
 'commander': 2328,
 'st_nicholas': 48727,
 'jacques_derrida': 46175,
 'boulogne-sur-mer_harbour': 39305,
 'propaganda': 7307,
 'encyclopedic': 43743,
 'their_primitive_days': 45531,
 'flac': 31130,
 'rescue_the_french_republic': 32047,
 'perk': 22714,
 'names': 723,
 'clover': 52505,
 'hind': 42693,
 '54th': 27509,
 'processions': 33843,
 'ravine': 37158,
 'muscle': 6388,
 'plan_de_guadalupe': 17297,
 'accomlish': 36095,
 'noveau': 37967,
 'tonearm': 48239,
 'agrree': 32350,
 'book': 164,
 'muso_soseki': 50311,
 'enemy': 3406,
 'sasanin_persians': 18157,
 'materially': 32928,
 'nicolas': 33517,
 'wildcard': 14205,
 'yiu_yung-chin': 45434,
 'popper': 654,
 'phase': 1566,
 'cyborg': 11443,
 'weinrich': 24971,
 'ozawa': 7992,
 'sealevel': 41746,
 'ammunition': 19316,
 'zhi_guang': 16480,
 'bait': 33271,
 'perceive': 4709,
 'such': 779,
 'the_papal_schism': 28633,
 'profiling': 23117,
 'trier': 20192,
 'hadaway': 24717,
 'prefrontal': 32282,
 'the_korean_communist_party': 25066,
 'chelm': 20375,
 'indiviual': 40677,
 'miami_central_station': 30774,
 'city-dwellers': 24480,
 'pasupata': 31552,
 'adjectives': 8777,
 'cheap': 10018,
 'hideout': 49863,
 'vikramaditya_ii': 31587,
 'pangenesis': 49302,
 'wc-130h': 32635,
 'british_basketball_league': 27618,
 'bounces': 28600,
 'zakria': 25739,
 'halal': 14208,
 'manuel_oribe': 16517,
 'bells_beach_in_victoria': 50069,
 'mesoglea': 21255,
 'the_national_security_act': 46862,
 'new_zealand': 2386,
 'two-layer': 29760,
 'the_bronx_museum_of_the_arts': 20849,
 'oswald': 7894,
 'standardization': 10873,
 'homeowner': 45052,
 'desegregate': 19364,
 'the_final_week': 46103,
 'pittsburg': 43966,
 'seaway': 49693,
 'shot': 2243,
 'duke': 4875,
 'the_city_of_malindi': 35071,
 'charitable': 6070,
 'nanak': 37213,
 'the_golf_club_of_punta_carretas': 48205,
 'protestors': 19005,
 'mike_marino': 48931,
 'backend': 25222,
 'gluconate': 40291,
 'secretery': 52263,
 'raising': 5986,
 '142': 41453,
 'librarian': 42312,
 'many_academy_awards': 47462,
 'mstislav': 46984,
 'apparant': 22116,
 'hotel_yak__yeti': 33724,
 'multi-color': 8087,
 'science_of_logic': 19326,
 'enlarged_homestead_act': 28604,
 'organs': 4330,
 'cathars': 10520,
 'bare': 7388,
 'opportunistic': 19373,
 'greater_than_30000': 36670,
 '1701': 13834,
 'emanating': 52399,
 'the_dervish_state': 17011,
 'marks': 2803,
 'nested': 24713,
 'denominationally': 44048,
 'the_senate_committee_on_commerce': 27510,
 'spawn': 23166,
 'rinbung': 22942,
 'xudong_an': 40177,
 'desert_national_park': 13396,
 'scapegoat': 30318,
 'invited': 3246,
 'para': 33775,
 'premature': 25526,
 'whar': 17928,
 'alfabia': 38379,
 'rupert_murdoch': 11840,
 'conducting': 9522,
 'captions': 20496,
 'disucussed': 51345,
 'blvd': 33334,
 'the_associations_incorporation_act': 17470,
 'emulsion': 23367,
 'the_nags_head': 31137,
 '244_yards': 48935,
 'melodramas': 21982,
 'david_brown': 38504,
 'cultivating': 24287,
 'max_weinreich': 44415,
 'augustan': 34640,
 'cadenza': 43494,
 'antisemitism': 8826,
 'carr_supreme_court': 39229,
 'rountines': 28434,
 'fliegerkorps': 34287,
 'the_finnish_infantry_regiment': 31708,
 'dairy': 6250,
 'seleucid': 6254,
 'the_gorton_government': 44863,
 'suggests': 3609,
 'metacritic': 23268,
 'bin': 12050,
 'article_10_of_the_berne_convention': 47554,
 'the_1st_marines': 51981,
 'the_disco_eye-cued_system': 25323,
 'mcveigh': 49290,
 'digimon': 1440,
 'plymstock': 14355,
 'excelled': 42897,
 'the_mann_act': 21817,
 'reflecting': 13936,
 'habitations': 20289,
 'tathgatagarbha': 41979,
 'between_1617_and_1619': 49788,
 'the_war_of_the_fourth_coalition': 33236,
 'contradiction': 35452,
 'conifer': 41137,
 'mound': 47567,
 'tends': 6078,
 'interiors': 20591,
 'august_5_1865': 27141,
 'shines': 43502,
 'the_early_1900s': 8930,
 'solace': 40582,
 'lee_strasberg': 26708,
 'ena': 31850,
 'yagi-uda': 22602,
 'thomas_de_maiziere': 26347,
 'kinks': 40117,
 'qingchuan_county': 29548,
 'captain_cook': 47784,
 'stamens': 29937,
 'guilt-innocence': 42095,
 'conference_of_yugoslavia': 23099,
 'the_early_1700': 45143,
 'hospira': 15777,
 'eton_dorney': 12163,
 'the_capital_city_-_abc': 25426,
 'imperial_institute': 51421,
 'temporary': 3302,
 'the_rialto_building': 37882,
 'ill-will': 49708,
 'world_war_1': 8690,
 'pong': 35134,
 'socialist': 5069,
 'bpn': 13281,
 'the_1860s': 18042,
 'traducere': 51990,
 'neoprene': 42264,
 'secretprojectrevolution': 35298,
 'the_delta': 44319,
 'texture': 12828,
 'harb': 46249,
 '1851': 8801,
 'intermarriage': 8211,
 'talented': 27243,
 'screened': 13410,
 'cinematographer': 48316,
 'veins': 32198,
 'synnecrosis': 18698,
 'tefu': 25880,
 'self-esteem': 8966,
 'confederalism': 33049,
 'contingents': 37890,
 'ostrogothic': 39428,
 'more_rush_hour': 46031,
 'ta': 11558,
 'caboclos': 43816,
 'connectors': 5736,
 'claris': 19841,
 'denounce': 43346,
 'w-vhs': 17954,
 'erich_raeder': 40256,
 'beds': 9328,
 'pope_leo_x': 36022,
 'whig': 10913,
 'seguranca': 50478,
 'piasts': 32443,
 'cloron': 29338,
 'weavings': 28310,
 'yogacarins': 32277,
 'ncdc': 26943,
 'atlantic': 3956,
 'asia': 1961,
 'agence': 39290,
 'new_york': 502,
 'between_1956_and_1998': 49168,
 'costal': 43089,
 'bath_spa_gain_university': 42588,
 'valuable': 3958,
 'bush': 2487,
 'roughly_200000': 25409,
 'qaynuqa': 21352,
 'situations': 6596,
 'diets': 6478,
 'ordination': 11258,
 'militant': 22937,
 'melting': 7701,
 'warsaw_pact': 15367,
 'questing': 31205,
 'flag-carrier': 37266,
 'cascajal': 41385,
 'ability': 1262,
 'calatrava': 14745,
 'talbot_hughes': 26618,
 'helpful': 6951,
 'rosenfield': 34663,
 'centrality': 10337,
 'olivia_newton-john': 34489,
 'patronage': 12324,
 'excavated': 11473,
 'blend': 10694,
 'precursor': 6758,
 'arthur_cronquist': 37889,
 'gateway_arch': 36802,
 'culturally': 6777,
 'curlews': 40265,
 'splintered': 39899,
 'the_performing_arts': 13888,
 'aide': 13048,
 'the_howard_hughes_medical_institute': 28543,
 'fabbrica': 29594,
 'the_abc_radio': 43436,
 'holy_roman_empire': 29035,
 'jeffrey_bub': 29255,
 'dimophism': 47870,
 'involced': 41995,
 'ibrahim_pasha': 20462,
 'sunlight': 4851,
 'excited': 9750,
 'collaboration': 18656,
 'legionnaires': 37669,
 'the_end_of_2011': 33951,
 'ww-ii': 16955,
 'calculations': 6475,
 'herculoids': 37236,
 'plastic': 4670,
 'the_boy_scouts': 27357,
 'promulgated': 20409,
 'settlement': 987,
 'thunder': 7054,
 'bathyctena': 19648,
 'durham_cathedral': 36067,
 'maristems': 30159,
 'the_british_indian_empire': 43536,
 'polarized': 14262,
 'dificult': 43727,
 'the_congress_of_erfurt': 22067,
 'sponsoring': 42213,
 'the_early_seventies': 52459,
 'jones_hall': 18303,
 'alvise_cadamosto': 30069,
 'asylum': 10817,
 'james_hutton': 14468,
 'coerced': 48802,
 'summer_olympics_sailing': 43446,
 'detailed': 4248,
 'sliver': 28170,
 'accompaniment': 11606,
 'medicinal': 8282,
 'baggage': 19610,
 'uncials': 28557,
 'george_robey': 34354,
 'issaac_newton': 35975,
 'jewish_german': 28044,
 'lacemakers': 25531,
 'contractor': 9079,
 'super_bowl_xli': 23471,
 'farenheit': 37123,
 'mark_childress': 36311,
 'ub_2': 46317,
 'justin_timberlake': 42056,
 'non-otg': 33451,
 'russian': 547,
 'prices': 2281,
 'north_germanic': 25027,
 'previously-patented': 26601,
 'serf': 34214,
 'yeezus_was': 42496,
 'november_of_1942': 39049,
 'nama': 30025,
 'principle': 1479,
 'improperly': 22462,
 'musique': 24551,
 'swaziland': 1472,
 'pine': 5890,
 'the_international_bell_telephone_company': 23858,
 'the_same_years': 42157,
 'blocking': 33780,
 'suburibicarian': 24769,
 'opportunites': 46040,
 'tyranny': 16611,
 'centers': 2582,
 'the_licensing_act': 29649,
 '101': 23340,
 'traction': 6983,
 'david_atherton': 17719,
 'gaddafi_begin': 41230,
 'simultaneous': 13240,
 'starlin_castro': 27079,
 'syria': 2728,
 'mantis': 16865,
 'shells': 8196,
 'non-hispanic': 5072,
 'constantius': 18192,
 'the_council_vote': 36387,
 '290': 48948,
 'the_bey_hive': 23796,
 'humble': 20526,
 'constituent': 7062,
 'grand_mastership': 38109,
 'urbium': 46632,
 'revitalize': 21711,
 'friedrich_ratzel': 12526,
 'mute': 21900,
 'the_fiscal_year': 43538,
 'courted': 44594,
 'seer': 20550,
 'the_ohio_valley': 51781,
 'gay-lussac': 46532,
 'ibs': 15661,
 'auto-sembly': 11379,
 'ru': 41954,
 'tuned': 11196,
 'old_truman_brewery': 32492,
 'fifty_to_one_hundred': 21811,
 'nucleus-orbiting': 37360,
 'advisor': 6438,
 'construes': 37929,
 'rappers': 45364,
 'fc_barca': 33984,
 'stressed': 8543,
 'effected': 5515,
 'oklahoma_cities': 10437,
 'health': 741,
 'consisted': 3577,
 "queen's_news": 23733,
 'the_white_rose': 26026,
 'koch': 7308,
 'terminologies': 26214,
 'antigonid': 30658,
 'close': 825,
 'organozinc': 45191,
 'jcpc': 48138,
 'persuaded': 49119,
 'engravers': 39962,
 'pegasus': 36128,
 'julian': 3819,
 'expertise': 22268,
 'unenforceable': 22310,
 'angel': 4967,
 '100646': 41049,
 'lagrime': 37291,
 'current-dependent': 25635,
 'qing_china': 16488,
 'sermon': 8570,
 'michael_j_shea': 42923,
 'rao': 44544,
 'correspond': 5321,
 'the_virginia_biotechnology_research_park': 34624,
 'mortgage': 4485,
 'pituitary': 52009,
 'jock': 31019,
 'the_deccan_plateau': 39206,
 'interplanetary': 36822,
 'exiles': 11915,
 'two_months_later': 28317,
 'luther_place': 27862,
 'ripper': 45589,
 'nation_of_islam': 33749,
 'arminianism': 42578,
 'isuppli': 28503,
 'joint-degree': 38622,
 'the_new_right': 32538,
 'hydra': 22708,
 'done': 671,
 'advising': 26539,
 'shool': 43285,
 'falter': 48293,
 'st_bartholomew_chapel': 36201,
 'ethics': 7818,
 'wipo': 43361,
 'first_week': 27527,
 'drinkable': 17285,
 'grazes': 33202,
 'the_far_east': 47976,
 'docklands': 20190,
 'angela_merkel': 25759,
 'carnival_corporation': 26107,
 'montevideo_bay': 24406,
 'greylag': 21587,
 'ballarat': 10650,
 'suround': 37309,
 'eugene_atget': 52438,
 'chihuahua_city': 16583,
 'actinide': 35739,
 'kilopascals': 49735,
 'peacocks': 32061,
 'cog-railways': 39204,
 'orchestrated': 36229,
 'december_of_1971': 49383,
 'surry_nuclear_generating_station': 35521,
 'croat': 17015,
 'seamount': 14153,
 'uncoiled': 51583,
 'christians': 1039,
 'supporting': 2974,
 "the_papalote_children's_museum": 31463,
 'gauguin': 25475,
 'spec': 48832,
 'memorialized': 42725,
 'forbidden': 3522,
 'patriotic_democratic': 25174,
 'hakka': 17890,
 '64': 18411,
 'nucleons': 36871,
 'firebox': 14456,
 'salmon': 14523,
 'edward_long': 32985,
 'mazda': 42691,
 'william_halfpenny': 49562,
 'कनतपर': 38451,
 'built': 155,
 'fiancee': 18028,
 'peasant': 6430,
 'queue': 16941,
 'replacement': 3309,
 '1500s': 12295,
 'riffs': 38286,
 'gallifrey': 40686,
 'the_atlanta-based_intercontinental_exchange': 35109,
 'polytechnic': 3233,
 'conkling': 31105,
 'bruni': 34802,
 "'usui": 52316,
 'pacific_time': 28513,
 'the_stage_award_for_best_regional_theatre': 42560,
 'pants': 17945,
 'the_season_prior_to': 44652,
 'isuzu': 44472,
 'gigantism': 32994,
 'injustice': 47309,
 'exceeding': 17552,
 'dominant-party': 33010,
 'shar': 31242,
 'mount_tangjia': 25795,
 'millions_of_years_old': 33839,
 'vintage': 45059,
 'forget': 16577,
 'the_brooklyn_nets': 49277,
 'inheritance': 5364,
 'january_22': 25495,
 'spirochete': 44244,
 'distrusted': 23720,
 'diplomats': 10491,
 'inspriting': 39479,
 '1539': 20096,
 'penetrated': 36699,
 'essentials': 8943,
 '1839': 5538,
 'project_nepal': 31514,
 'analog': 3973,
 'late_2009': 15686,
 'central_asia': 10433,
 'fiers': 47259,
 'memior': 38651,
 'appalachian_mountains': 29154,
 'world_war_i': 3007,
 'sergel': 45473,
 'the_eleventh_doctor': 24261,
 'britanica': 34733,
 'jk': 14263,
 'cladding': 34660,
 'historica': 35748,
 'national_nature_reserves': 27426,
 'the_confederation_from': 44656,
 'lutheranism': 12897,
 'threaded': 30550,
 'paul_sheehy': 30418,
 'lange': 15456,
 'calidris': 23381,
 'volatiles': 38462,
 'the_south_west_england_region': 32243,
 '45s_to_the': 43403,
 'kevin_faulconer': 27802,
 'crest': 3400,
 'elms': 46143,
 'flawed': 36370,
 'may_30_1971': 30329,
 'housing': 1563,
 'reincarnate': 44087,
 'nhl': 11118,
 'recommendations': 6541,
 'illuminated': 14705,
 'advises': 8432,
 'busan_south_korea': 48748,
 'the_killers': 26582,
 'renounced': 10273,
 'living_black': 19919,
 'computable_numbers': 50412,
 'lo_rat_penat': 50039,
 'simulcast': 20544,
 'policing': 4025,
 'shall': 12973,
 'officeworks': 18858,
 'liability': 10594,
 'fireplace': 17390,
 'mission_san_diego': 41761,
 'axiomatization': 51515,
 'extict': 36897,
 'breaking': 4384,
 '1309': 23001,
 'snet': 14895,
 'pita': 29713,
 'the_plymouth_raiders': 17872,
 'sessions': 9277,
 'expended': 47467,
 'manner': 3749,
 'diocese': 10199,
 'sailsbury': 48079,
 'st_lucia': 30467,
 'newsmagazine': 36355,
 'bound': 3856,
 'hydroplanes': 44202,
 'sept_1760': 31119,
 'the_jagiellonian_university': 46212,
 '2oth_century': 34947,
 'the_melcher_center': 51293,
 'rumble': 16975,
 'vibration': 20100,
 'anwar_el_sadat': 32797,
 'migrant': 20189,
 'surrendering': 11289,
 'appears': 3830,
 'bing_crosby': 21175,
 'travel_+': 48206,
 'spotlight': 41927,
 'slavonic-speaking': 40562,
 'single': 406,
 'specified': 4409,
 'little_rock_central_high_school': 27196,
 'fracture': 40495,
 'disasterous': 35139,
 'grieved': 47942,
 'spawning': 29985,
 'mores': 27775,
 'theatre': 3142,
 'the_prussian_guard': 27344,
 'harvardradcliffe': 47205,
 'paramylon': 52066,
 'al-qarawiyin': 36482,
 'the_erector_square': 23172,
 "'crowns": 33926,
 'mhayana': 40288,
 'sunni_hadith': 43478,
 'christmases': 38868,
 'lord_salisbury': 20091,
 'summoned': 8332,
 'far_eastern': 14892,
 'circumpolar': 32612,
 'cold-mix': 38149,
 'snugs': 38341,
 'concreteness': 37091,
 'esatap': 26202,
 'occurs': 1982,
 'maurice_schlesinger': 33834,
 'cultural_autonomy': 29261,
 'anticipated': 8849,
 'kitab_rudjdjar': 46874,
 'purporting': 52612,
 'isaiah_berlin': 35188,
 'carnot': 49261,
 'saronic': 50081,
 'the_delhi_metro': 8742,
 'nabta_playa': 26495,
 'the_mature_indus_civilization': 37333,
 'the_rocky_and_bullwinkle_show': 24448,
 'special_clerical_court': 29317,
 'cousins': 29817,
 'remembrance_day': 26079,
 'county_antrim': 40478,
 'constituted': 6760,
 'shaping': 23738,
 'rajamangala_university_of_technology': 25953,
 'gaulish': 51253,
 'ifl': 45945,
 'peformance': 31892,
 'the_eid_ul-fitr': 47103,
 'acquittal': 35419,
 'subsidized': 11475,
 "a_commonwealth_writers'_prize": 31468,
 'arms': 2387,
 'non-catholic': 27385,
 'interbrand': 52372,
 'february_1861': 30988,
 'intrusive': 27911,
 'the_end_of_the_500s': 28918,
 'knibb': 46377,
 'the_arctic_ocean': 35583,
 'frederick_i_barbarossa': 31854,
 'the_laser-debakey_clinical_medical_research_aware': 34485,
 'bleaching': 14989,
 ...}

In [6]:
# First index-encoded sentence as delivered by the Embeddings helper (before the +1 shift).
tokenized_indexed_sentences[0]


Out[6]:
[1, 3, 2206, 9, 388, 498, 93, 108, 5, 0]

In [7]:
# Drop empty sentences and add 1 to every token index so the sentences agree with the
# shifted word2index mapping and index 0 stays free for padding.
# NOTE: not idempotent — re-running this cell shifts the indices again.
tokenized_indexed_sentences = [np.array(sentence) + 1 for sentence in tokenized_indexed_sentences if len(sentence) > 0]

In [8]:
# Same sentence after the shift: every index should be one higher than before.
tokenized_indexed_sentences[0]


Out[8]:
array([   2,    4, 2207,   10,  389,  499,   94,  109,    6,    1])

In [9]:
# Single all-zero row, as wide as a word2vec vector; it becomes the embedding of index 0.
new_weights = np.zeros((1, word2vec_weights.shape[1]))

In [10]:
# Stack the word2vec matrix below the zero row, so every original row moves down by one
# (matching the +1 shift applied to the vocabulary indices).
# NOTE: re-running this cell appends the word2vec matrix a second time.
new_weights = np.append(new_weights, word2vec_weights, axis=0)

In [11]:
# Sanity check: one more row than word2vec_weights, same number of columns.
new_weights.shape


Out[11]:
(52731, 300)

In [12]:
# Last row of the matrix; 52730 is hard-coded and only valid for a 52731-row matrix.
new_weights[52730]


Out[12]:
array([ 0.35742188,  0.03369141, -0.03881836,  0.07666016, -0.06079102,
        0.6328125 ,  0.05615234,  0.04345703,  0.00265503, -0.21582031,
        0.40234375, -0.0559082 , -0.15820312,  0.21289062,  0.28710938,
        0.54296875, -0.13085938,  0.14746094,  0.06738281, -0.171875  ,
        0.07373047, -0.0006485 , -0.10986328, -0.13476562,  0.06152344,
       -0.03833008, -0.07519531, -0.00221252,  0.09179688, -0.37890625,
       -0.31054688, -0.07666016, -0.484375  , -0.0546875 , -0.13183594,
       -0.33203125,  0.20996094,  0.25      ,  0.0534668 ,  0.08496094,
       -0.1875    ,  0.09960938,  0.24902344, -0.07714844, -0.01123047,
       -0.06787109,  0.21191406, -0.11865234, -0.01660156,  0.22265625,
       -0.37695312,  0.36914062, -0.51171875,  0.06640625, -0.19726562,
       -0.01818848,  0.0612793 , -0.21582031,  0.13574219, -0.08154297,
        0.18652344,  0.3203125 ,  0.26367188,  0.24609375,  0.01208496,
        0.04931641,  0.18652344,  0.29296875,  0.21289062,  0.06884766,
        0.13476562, -0.17480469, -0.02246094,  0.25195312,  0.02380371,
       -0.00354004,  0.09228516,  0.1953125 , -0.07763672, -0.13867188,
        0.05175781,  0.17578125, -0.02124023, -0.38476562, -0.16992188,
       -0.12597656, -0.11376953,  0.13671875, -0.06835938, -0.00921631,
        0.04394531, -0.27148438, -0.45703125, -0.08837891,  0.04321289,
        0.15332031,  0.1796875 , -0.02099609,  0.20507812, -0.05688477,
        0.10839844, -0.12011719,  0.203125  , -0.31054688,  0.28125   ,
       -0.23828125, -0.44921875, -0.02966309,  0.19628906, -0.36523438,
        0.05761719,  0.15527344,  0.10742188,  0.23242188, -0.13085938,
        0.14257812, -0.32617188,  0.09423828,  0.32421875,  0.00183868,
        0.20703125,  0.203125  ,  0.07617188, -0.17285156, -0.10449219,
        0.15136719, -0.06542969,  0.11083984,  0.04956055,  0.25585938,
       -0.10302734, -0.08886719, -0.23925781,  0.11328125,  0.14941406,
       -0.03833008,  0.12402344,  0.13085938,  0.19433594,  0.03613281,
        0.0072937 , -0.0612793 , -0.20703125,  0.38867188,  0.12988281,
        0.23925781, -0.36523438,  0.265625  , -0.50390625,  0.21679688,
        0.26367188,  0.05566406, -0.26757812,  0.23632812, -0.171875  ,
       -0.11181641,  0.16796875, -0.23925781, -0.02380371,  0.10400391,
       -0.20117188,  0.12402344, -0.10546875,  0.02233887,  0.15234375,
        0.31640625,  0.05322266, -0.20214844,  0.13769531,  0.00442505,
        0.14550781, -0.0703125 ,  0.17382812, -0.18457031, -0.21191406,
        0.07275391, -0.1640625 , -0.01660156, -0.0019989 ,  0.01361084,
        0.04223633, -0.01116943,  0.05786133, -0.13378906, -0.24804688,
       -0.38085938, -0.03100586, -0.10839844,  0.21386719,  0.03686523,
       -0.11279297,  0.34765625,  0.20507812, -0.14941406,  0.05175781,
       -0.06445312, -0.16210938, -0.15917969, -0.12792969, -0.18554688,
       -0.08007812,  0.02526855, -0.17285156,  0.14550781,  0.1015625 ,
        0.04321289, -0.19433594,  0.10107422,  0.11279297, -0.14453125,
       -0.53125   , -0.21679688, -0.02258301, -0.09277344, -0.02612305,
        0.14453125,  0.10839844, -0.23242188, -0.34765625, -0.09423828,
        0.19726562,  0.06054688,  0.11230469, -0.16894531, -0.21972656,
        0.12011719,  0.18164062, -0.0177002 , -0.18359375, -0.22070312,
        0.09863281, -0.06787109, -0.36132812,  0.09423828, -0.00601196,
        0.38085938,  0.2890625 , -0.02661133, -0.14550781,  0.02832031,
        0.3984375 , -0.1328125 ,  0.09960938,  0.04882812,  0.03466797,
       -0.05078125,  0.25390625,  0.09716797,  0.06152344, -0.16210938,
       -0.11523438,  0.07861328,  0.47851562,  0.546875  ,  0.05859375,
       -0.15039062, -0.08398438, -0.22070312,  0.25195312,  0.14355469,
       -0.18457031, -0.22851562, -0.11767578,  0.21386719, -0.14746094,
        0.02307129, -0.20019531, -0.09423828,  0.1015625 , -0.04589844,
       -0.09472656,  0.03320312, -0.06835938,  0.05566406,  0.30273438,
       -0.0456543 , -0.02111816,  0.18847656,  0.33007812,  0.3984375 ,
        0.12695312, -0.03173828,  0.35742188, -0.12792969,  0.28320312,
       -0.17773438, -0.29101562, -0.10839844, -0.13183594,  0.15527344,
        0.20800781,  0.2734375 , -0.14355469, -0.11865234, -0.16699219,
       -0.04101562, -0.03564453,  0.00174713, -0.0859375 , -0.17773438])

Generating training data


In [13]:
# NOTE(review): window_size is not referenced by any other cell visible in this notebook.
window_size = 5
# Number of real vocabulary entries (the padding index 0 is not part of word2index).
vocab_size = len(word2index)
print(vocab_size)


52730

In [14]:
# Length of the longest sentence.
# NOTE(review): maxlen is not passed to pad_sequences in the following cell.
maxlen = max(map(len, tokenized_indexed_sentences))

In [15]:
# Pad all sentences to a common length using the pad_sequences defaults (per the Keras
# documentation: zeros are inserted in front, up to the length of the longest sequence).
# The result replaces the list of variable-length arrays.
tokenized_indexed_sentences = sequence.pad_sequences(tokenized_indexed_sentences)

In [16]:
# Build the training pairs: the input is the padded index sequence, the target is the
# sequence of embedding vectors of the *next* token at every position.
tokenized_indexed_sentences = [row for row in tokenized_indexed_sentences if len(row) > 0]

seq_in = []
seq_out = []
for row in tokenized_indexed_sentences:
    # Target indices: the input shifted left by one; the final index is repeated so the
    # target keeps the same length as the input.
    next_indices = np.append(row[1:], row[-1])
    seq_in.append(row)
    # Look up all target vectors at once (one embedding row per target index).
    seq_out.append(new_weights[next_indices])

# Turn both lists into numpy arrays.
seq_in = np.array(seq_in)
seq_out = np.array(seq_out)
n_samples = len(seq_in)
print("Number of samples : ", n_samples)


Number of samples :  97974

Defining model


In [19]:
# Changes to the model to be done here
# Architecture: pretrained embedding lookup (index 0 masked as padding) followed by two
# stacked bidirectional LSTMs whose forward and backward outputs are averaged. The last
# layer emits one 300-dimensional vector per timestep, which the cosine-proximity loss
# compares against the target word vectors.
model = Sequential()
model.add(Embedding(input_dim=new_weights.shape[0], output_dim=new_weights.shape[1], weights=[new_weights], mask_zero=True))
# 1024 units: the original call had no unit count (a syntax error). 1024 is the width
# recorded in the model summary for this layer (10,854,400 parameters) and in the name of
# the checkpoint directory loaded below, so the stored weights fit the layer.
model.add(Bidirectional(LSTM(1024, return_sequences=True), merge_mode="ave"))
model.add(Bidirectional(LSTM(300, return_sequences=True), merge_mode="ave"))
# Restore previously trained weights (file index 29 — presumably the last of 30 epochs).
model.load_weights("../weights/bidirectional-lstm-2-1024-300-batchsize-256-epochs-30-Sequence/weights.29.hdf5")
model.compile(loss='cosine_proximity', optimizer='adam',metrics=['accuracy'])
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_2 (Embedding)      (None, None, 300)         15819300  
_________________________________________________________________
bidirectional_3 (Bidirection (None, None, 1024)        10854400  
_________________________________________________________________
bidirectional_4 (Bidirection (None, None, 300)         3180000   
=================================================================
Total params: 29,853,700
Trainable params: 29,853,700
Non-trainable params: 0
_________________________________________________________________

In [18]:
# Directory that receives the checkpoint files; created if it does not exist yet.
# NOTE(review): this name ("lstm-3-1024-1024-...") differs from the
# "bidirectional-lstm-2-1024-300-..." directory the model cell loads its weights from —
# confirm which run these checkpoints belong to.
model_weights_path = "../weights/lstm-3-1024-1024-batchsize-256-epochs-30-Sequence"
if not os.path.exists(model_weights_path):
    os.makedirs(model_weights_path)
# The {epoch:02d} placeholder is left in the path for ModelCheckpoint to fill in.
checkpoint_path = model_weights_path + '/weights.{epoch:02d}.hdf5'
# save_best_only=False: no "best" model selection is requested.
# NOTE(review): `mode` normally only matters when a monitored metric decides what to
# keep, so mode='max' likely has no effect here — confirm against the Keras docs.
checkpoint = ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_best_only=False, mode='max')

Train Model


In [20]:
# model.fit(seq_in, seq_out, epochs=1, verbose=30, batch_size=256, callbacks=[checkpoint])

Model prediction


In [45]:
# Greedy generation: encode a seed phrase, then repeatedly ask the model for the vector
# at the last timestep, map it to the closest word and feed that word back in.
start = 0
sentence_test = " who is the"
indexed_sentences = embeddings.get_indexed_query(sentence_test)
print("indexed_sentences ",indexed_sentences)
# Apply the same +1 shift that was applied to the vocabulary and the training sentences.
sent = np.array(indexed_sentences) + 1
#pattern = list(seq_in[start])
pattern = list(sent)
print("\"",' '.join(index2word[token] for token in pattern))
for step in range(2):
    prediction = model.predict(np.array([pattern]))
    print(len(prediction[0]))
    # Output vector of the final timestep = prediction for the word that follows.
    last_step_vector = prediction[0][-1]
    nearest_words = word2vec_model.similar_by_vector(last_step_vector)
    pred_word = nearest_words[0][0]
    sys.stdout.write(pred_word+" ")
    # Extend the running sequence with the (shifted) index of the predicted word.
    pattern.append(word2index[pred_word])


indexed_sentences  [1, 12, 8, 2]
" squadstart who is the
4
the 5
the 

In [34]:
# First weight array of the second layer (the first bidirectional LSTM in the model).
model.layers[1].get_weights()[0]


Out[34]:
array([[-0.01984103,  0.00866925,  0.00596858, ..., -0.01896135,
         0.01253303, -0.01682685],
       [ 0.02944827,  0.03790607,  0.01673041, ..., -0.03337999,
         0.0015724 ,  0.01733136],
       [-0.00504541,  0.00439127, -0.01940113, ..., -0.02757988,
         0.01667905, -0.00050718],
       ..., 
       [ 0.04402941,  0.03036774,  0.00817353, ...,  0.02618253,
         0.01713058,  0.00941994],
       [-0.00202136,  0.00781708, -0.03179857, ..., -0.01009496,
        -0.00812547, -0.04278539],
       [-0.03006271,  0.02193416, -0.00921515, ...,  0.00953403,
         0.01885799, -0.0187456 ]], dtype=float32)

In [37]:
# Number of sentences kept for training.
len(tokenized_indexed_sentences)


Out[37]:
97974

In [ ]:
#e_model = embeddings.get_model()

In [ ]:
#e_model.similar_by_word("profitabl")

Accuracy


In [ ]:
def accuracy(n_samples=None):
    """Measure next-word accuracy of ``model`` on the ``seq_in``/``seq_out`` pairs.

    For every non-padding timestep, the predicted output vector and the target
    vector are each mapped to their nearest word in ``word2vec_model``. The
    timestep counts as correct when both map to the same word.

    Every scored timestep costs two nearest-neighbour searches over the whole
    vocabulary, so use ``n_samples`` to keep the run time manageable.

    Args:
        n_samples: Optional cap on the number of sentences evaluated (taken from
            the start of ``seq_in``). ``None`` (the default) evaluates all.

    Returns:
        The fraction of scored timesteps that were predicted correctly, as a
        float; ``0.0`` when there was nothing to score.
    """
    inputs = seq_in if n_samples is None else seq_in[:n_samples]
    targets = seq_out if n_samples is None else seq_out[:n_samples]
    count = 0
    correct = 0
    for sub_sample_in, sub_sample_out in zip(inputs, targets):
        # The model emits one vector per timestep, so ypred is a matrix
        # (timesteps x embedding width), not a single vector.
        ypred = model.predict_on_batch(np.expand_dims(sub_sample_in, axis=0))[0]
        for token_index, pred_vector, true_vector in zip(sub_sample_in, ypred, sub_sample_out):
            if token_index == 0:
                # Padding position: masked by the embedding layer, nothing to score.
                continue
            pred_word = word2vec_model.similar_by_vector(pred_vector, topn=1)[0][0]
            true_word = word2vec_model.similar_by_vector(true_vector, topn=1)[0][0]
            # Compare the words themselves: a floating-point similarity of exactly 1
            # is not guaranteed even when both words are identical.
            if pred_word == true_word:
                correct += 1
            count += 1
    # float() keeps this a true division on Python 2; guard against empty input.
    result = float(correct) / count if count else 0.0
    print("Accuracy {0}".format(result))
    return result

In [ ]:
#seq_out[0]

In [ ]:
# accuracy()

In [ ]:
#model_results = model_fit_summary.history

In [ ]:
#model_results.update(model_fit_summary.params)

In [ ]:
#model_results["train_accuracy"] = accuracy()

In [ ]:
# n = no. of predictions
# accuracy = accuracy(400)
#print(model_results)

In [ ]:
#text_file_path = "../weights/lstm-2-1024-512-batchsize-128-epochs-25/model_results.json"

In [ ]:
#with open(text_file_path, "w") as f:
        #json.dump(model_results, f)

In [ ]: