Importing require packages


In [1]:
from __future__ import print_function

import json
import os
import numpy as np
import sys
import h5py

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.engine import Input
from keras.layers import Embedding, merge
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.preprocessing import sequence
from intersect_embeddings import Embeddings
from keras.callbacks import ModelCheckpoint

from nltk.tokenize import word_tokenize
import random
from itertools import groupby


Using TensorFlow backend.

Setting Parameters


In [2]:
model_name = "lstm-2-1024-300-batchsize-256-epochs-25-Sequence"
word_embedding_dimension = 300
word_embedding_window_size = 4
batch_size = 512 # 32, 64, 128
epochs = 25 # 10, 15, 30
window_size = "None" # 3, 4, 5
accuracy_threshold = 1
activation = 'softmax' # sigmoid, relu, softmax
custom_accuracy = 0
loss_function = 'cosine_proximity' # mse

Instantiate Embeddings


In [3]:
embeddings = Embeddings(word_embedding_dimension, word_embedding_window_size, 1, 4)

Getting data from preprocessing


In [4]:
word2vec_model = embeddings.get_intersected_model()
word2index, index2word = embeddings.get_vocabulary()
word2vec_weights = word2vec_model.wv.syn0
tokenized_indexed_sentences = embeddings.get_indexed_sentences()


Loading Indexed Sentences...

In [5]:
word2index = {word:index+1 for word, index in word2index.items()}
index2word = {index:word for word, index in word2index.items()}

In [6]:
word2index


Out[6]:
{'chanting': 20108,
 'xinhua': 7553,
 'bicycle': 6861,
 'allow': 642,
 'living': 620,
 'antenna': 874,
 'energies': 12525,
 'cropanzano': 16707,
 'mascot': 7123,
 'hæmatococcus': 47333,
 'december_12_1964': 26511,
 'kosher': 31340,
 'cup-europa': 52165,
 'greek_communities': 47006,
 'the_melbourne_internation_exhibition': 47770,
 'commander-in-chief': 9247,
 'poitiers': 15956,
 'branches': 2586,
 'belongs': 5178,
 'lake': 2040,
 'interspersed': 45436,
 'efcc': 47498,
 'caridinal': 32826,
 'fleet': 1753,
 'multiplied': 19853,
 'forgien': 32254,
 'reproducing': 23445,
 'calm': 8683,
 'the_national_and_public_library_servies': 41682,
 'excommunicated': 47819,
 "'vinyl": 47481,
 'lester_brown': 24464,
 'wreaths': 19535,
 'claris': 19841,
 'wooster_square_cheery_blossom_festival': 44056,
 'the_great_northern_war': 22458,
 'adobe_flash': 48091,
 '1-1': 47562,
 'the_museo_carrillo_gil': 30763,
 "'abdu'l-hamid": 41434,
 'extras': 12279,
 'leonard_goldenson': 15185,
 'tofu': 41143,
 '512k': 36298,
 'dow': 45982,
 'landfall': 20711,
 'christianity': 1504,
 'removable': 16708,
 'something': 1844,
 'janners': 45228,
 'el_centro': 34661,
 '1_million': 48592,
 '190': 39279,
 'physicalist': 47120,
 'isabel_i': 49514,
 'the_trust_for_public_land': 34572,
 'mou': 12292,
 'the_freedom': 30615,
 'alfonso_xxii': 41847,
 'hoengseong': 41658,
 'supervisor': 12582,
 'solutes': 30742,
 'renounce': 22457,
 'york': 6634,
 'excitation': 20299,
 'advetisments': 32863,
 'advaita': 45173,
 'lodgings': 18910,
 'the_nazi_party': 30030,
 '1839': 5538,
 '1805': 12293,
 'ochoa': 37674,
 'hemu': 41777,
 'uefa_euro_2012': 52318,
 'the_islamic_liberation_party': 41072,
 'the_gurukul_school_of_theatre': 43251,
 'literature': 1011,
 'owning': 15564,
 'h__j_smith': 19202,
 'paul_delouvrier': 20250,
 'soulages': 18455,
 'feast_days': 31594,
 'caused': 147,
 'well-fed': 35652,
 'camp_meade': 11553,
 'the_church_of_scotland': 21120,
 "super_slim's_design": 40466,
 'biologically': 8427,
 'lenited': 22263,
 'financial_times': 30985,
 'algerians': 50259,
 'the_supreme_court_of_appeal': 33057,
 'miles_per_hour': 26884,
 'darian_stewart': 21505,
 'landmarks': 23875,
 'jose_de_quiroga': 25195,
 'venezuelans': 8098,
 'entrusted': 36931,
 'nimitz-class': 39226,
 'juiveniles': 31422,
 'national_aboriginal_day': 46629,
 'connsidered': 50710,
 'iq_tests': 32988,
 'solidified': 23187,
 'the_troubles': 47578,
 'corner': 7550,
 'safety_canada': 47650,
 'alike': 42181,
 'classical_sumer': 31045,
 'non-theists': 31494,
 'widening': 14632,
 'brest': 19619,
 '63': 45921,
 'yang_sanbao': 15573,
 'amniotes': 13920,
 'january_1_1934': 30129,
 'gunboats': 24960,
 'europian': 28400,
 'meals': 8052,
 'flot': 13350,
 'tipping': 13343,
 'steinstossen': 23977,
 'the_college_football_playoff_system': 29696,
 'twelve_tribes': 33685,
 'crevices': 19944,
 'republics': 7595,
 'process_and_reality': 16464,
 'paraphrased': 49947,
 'ex-wife': 15442,
 'consider': 835,
 'apprimately': 41128,
 'the_primera_division': 38021,
 'tonearm': 48239,
 'idols_thirteenth_season': 52514,
 '1920x1080p25': 20325,
 'dvaita_vedanta': 46081,
 'recruits': 21223,
 'silver_nemesis': 33610,
 'directorial': 16180,
 'wave-affiliated': 50372,
 'conseil': 34338,
 'hormones': 5473,
 'between_20-64': 34365,
 'perrin': 30283,
 'the_scientific_crime_detection_laboratory': 43545,
 'ctenophhores': 51499,
 'zuccotti_park': 26127,
 'honorees': 41445,
 'lion': 10680,
 'polymath': 49171,
 'mueller': 8463,
 'trukestan': 34477,
 'the_triple_alliance': 30624,
 'the_st_helena_independent': 44230,
 '1720s': 26688,
 'pharaohs': 28504,
 'nama-herero': 18790,
 'classes': 1298,
 'hop': 6845,
 'plummet': 39639,
 'polonization': 36731,
 'fallacy': 33459,
 'the_beagle_expedition': 40652,
 'cried': 28556,
 'hydraulic': 5552,
 'reza_shan': 28496,
 'the_university_of_southampton': 7725,
 '238u': 16801,
 'holds': 1347,
 'midford': 30039,
 'the_plymouth_borough': 43233,
 'decedent': 23050,
 'the_university_of_westminster': 39437,
 'austroalpine': 17189,
 'popularized': 6063,
 'the_galapagos_islands': 31310,
 'monotonous': 25775,
 'pregame': 21482,
 'fly': 3188,
 'styria': 33446,
 'pdo': 26297,
 'chan': 11338,
 'intruz': 33287,
 'toilet': 9109,
 'sophene': 48122,
 'emperor_kanmu': 44083,
 'october_15': 42697,
 "the_huguenot_element_in_charleston's_provincialisms": 36656,
 'the_somali_post': 24031,
 'ziyad': 35202,
 'accusations': 6594,
 'crazy_little_thing': 26845,
 'conflicted': 20543,
 'the_ministry_of_defence_act': 19571,
 'city_council': 11319,
 'adherence': 7329,
 'waihopai': 45885,
 'new_york_harbor': 44488,
 'diring': 36982,
 'pick': 5197,
 'cpm': 48545,
 'captivity': 11827,
 'us_district_court': 49836,
 'karadzic': 36097,
 'guangwu': 19581,
 'mycobacteria': 13593,
 'problemsome': 34544,
 'earthworm': 14012,
 'the_university_of_london_network': 29897,
 'cbnrm': 38529,
 'gathering': 3254,
 'thomas_barnett': 35518,
 'reticulum': 14730,
 'bookstores': 25270,
 'the_world_wealth_report': 27284,
 'the_sackler_center': 50627,
 'renaixenca': 52046,
 'mixture': 4757,
 'kbits': 9413,
 'ṣalībī': 17251,
 'deity': 5712,
 'the_academy_of_art': 36033,
 'regis_mckenna': 39887,
 'august_2009': 16761,
 'the_boer_war': 14634,
 'appearance-based': 42984,
 'rump-austria': 35044,
 'princes': 6350,
 'redeem': 26119,
 '750-line': 28727,
 'os': 5503,
 'chimpanzees': 10674,
 'lional_bailey_budden': 33622,
 'science_of_logic': 19326,
 'victoria_park': 34683,
 'concerto': 23465,
 'mid-to-late': 37231,
 'vary': 3226,
 'st_helena': 22193,
 'first_12_month': 51911,
 'sorcerer': 33811,
 'injunction': 25534,
 'existed': 1594,
 "d'etat": 8281,
 'distinctly': 9013,
 'things-as-interacted-by-us': 46771,
 'zhuge_liang': 18063,
 'agenda': 5582,
 'poisoning': 24207,
 'pala': 15707,
 'akritic': 27963,
 'serialized': 10832,
 'chumik': 38554,
 'keivan_rus': 52244,
 'rich': 3036,
 'starters': 27815,
 'duke_yansheng_kong_duanyou': 17049,
 'twilight_princess_hd': 44118,
 'patti_austin': 25420,
 'realart_pictures': 50133,
 'enhanced': 8397,
 'endeavor': 11828,
 'segmented': 43586,
 "the_end_of_von_neumann's_life": 31114,
 'woolco': 32885,
 'louis-phillipe': 30116,
 'long-term': 5269,
 'differencing': 19931,
 'informing': 11706,
 'cultivator': 48013,
 'nunneries': 21168,
 'firewire': 10463,
 'posited': 11855,
 'numbered': 6275,
 'the_new_haven_arena': 52407,
 'the_far_east': 47976,
 'missouri': 7517,
 'frescoes': 18572,
 '1603': 24283,
 'vrs': 13199,
 'ten_years_ago': 48536,
 'the_foreign_language_student_residence': 21488,
 'scuttle': 44843,
 '16-game': 23888,
 'the_orthodox_greek_church': 40625,
 'sadomasochism': 33350,
 'calyx': 32077,
 '2000-01': 20466,
 'hebron': 28951,
 'getcybersafe': 33575,
 'the_afl': 44948,
 'the_winter_bash': 52477,
 'abnormalities': 26596,
 'diego': 8724,
 'coevolved': 32822,
 '1160': 26165,
 'het_scheur': 27863,
 'quantitatively': 24398,
 '1673': 16703,
 'agnostic': 25260,
 'avl': 14465,
 'stabilisation': 17161,
 'oeiras': 20346,
 'concealed': 23362,
 'aquarium': 12331,
 'the_world_wildlife_fund': 33566,
 'second-driest': 50007,
 'eal': 52122,
 'defender': 21601,
 'dirctor': 33504,
 'pay': 765,
 'abusers': 35635,
 'beta-lactam': 30792,
 't-ray': 51252,
 'the_milken_institute': 31939,
 '1363': 34059,
 '2n1301': 48471,
 'san_francesco': 27479,
 'disappointment': 24168,
 'homeland': 10517,
 'actinide': 35739,
 'chrysaetos': 38682,
 'ifat': 24013,
 'princeton_university': 29644,
 'haa': 12187,
 'bake': 36049,
 'flota': 19309,
 'towers': 3862,
 'innkeeper': 18114,
 'vigorous': 40283,
 'romansh': 21800,
 'tilak': 25940,
 'in_square_miles': 17726,
 'the_orthodox_church_of': 11798,
 'modems': 31018,
 'henri_iv': 38317,
 'overlooks': 27936,
 'madhav_deshpande': 44809,
 'line-up': 18548,
 'foster': 7030,
 'arius': 23040,
 'lgbt': 4832,
 'altaians': 30841,
 'bare-board': 39778,
 'such': 779,
 'the_6th_century': 7088,
 'heroic': 11242,
 'forward': 2981,
 'circumnavigate': 32408,
 'the_council_of_trent': 10167,
 'flatworm': 24831,
 'the_irish_parliamentary_party': 38617,
 'mediator': 19177,
 'japanses': 18257,
 'grooves': 12349,
 'strongly': 2748,
 'doctrina_christiana': 18128,
 'solid-fuel': 36894,
 'us_census_bureau': 52209,
 'uhd': 47515,
 'thiazide': 25524,
 'employer': 4811,
 'unbroken': 20301,
 'the_prince_hall_lodge': 20182,
 'spaniard': 49845,
 'janis_ian': 40523,
 'maltese': 36801,
 'thunders': 13195,
 'encyclopedias': 7845,
 'lucerne': 31903,
 'garuda': 17578,
 'fight': 1096,
 'the_impact_team': 25044,
 'kokang': 35097,
 'technical_universities': 33434,
 'the_eastern_pyrenees': 37490,
 'stand-alone': 33445,
 'the_presidential_proclamation': 47480,
 'biomorphism': 40594,
 'devonport_dockyard': 45198,
 'mee': 21079,
 'mexico': 1608,
 'megabus': 16978,
 'dartmoor': 41794,
 'disseminate': 24025,
 'the_past_three_years': 44792,
 'learnt': 21013,
 'shipped': 5062,
 'the_treaty_of_brest-litovsk': 48691,
 'frazer': 41613,
 'located': 63,
 'richmond_international': 20938,
 'exerts': 49450,
 'decongestant': 24936,
 'outages': 40438,
 'roger_ii': 31287,
 'rms_queen': 19216,
 'enormous': 15431,
 'the_karabakh_war': 21911,
 'mark_childress': 36311,
 'the_winter_music_conference': 26010,
 'listening': 3491,
 'repressed': 24914,
 'resolving': 22910,
 'reveal': 3163,
 'theeastindiacompanychangedhowitdidbusinessbymovingfromfocusingonroyalpatronagetowhattypeofbusiness': 27798,
 'orbiting': 14684,
 'advocating': 50176,
 'beliefs': 1714,
 'the_menninger_clinic': 32895,
 'funfair': 21927,
 'recent': 1244,
 'reformist': 13228,
 'portray': 6001,
 'hemudu': 17327,
 'pharmaceuticals': 11544,
 'the_great_redan': 31338,
 'neil_ramirez': 29648,
 'proverbs': 19654,
 'pieta': 40819,
 'indicated': 4558,
 "'pax_mongolica": 46000,
 'refraining': 41088,
 'acta': 19514,
 'michael_dorf': 45833,
 'faithful': 20146,
 'gloucester': 18162,
 'eastern_prussia': 31026,
 'the_royal_society_of_london': 33650,
 'western_xia': 22823,
 'the_middle_of_summer': 51491,
 'duccio': 28713,
 'mendota': 25547,
 'jared_allen': 37062,
 'unfccc': 42173,
 'esztergom': 45449,
 'rhb': 33816,
 'qi': 41699,
 '84-gun_two-deckers': 42442,
 'physiology': 15280,
 'max_clifford': 29678,
 'charles_aznavour': 31771,
 'confucians': 52353,
 'uthman': 20348,
 'westminster_hall': 27112,
 'liaoning': 10921,
 'as_little_as_possible': 36926,
 'attitudes': 8590,
 'johann_mouse': 37809,
 'grahm': 47896,
 'bafta': 13615,
 'gas-lasers': 35010,
 'lps': 10032,
 'anglican': 3899,
 'foley': 43035,
 'mycolic': 42210,
 'john_harrison': 17805,
 'henry_guisan': 24670,
 'spanish_priest': 32598,
 'interlaced': 11271,
 'creditworthiness': 32800,
 'hume': 4314,
 'favorite_architecture': 29207,
 'non-jewish': 21056,
 'transposons': 19941,
 'beagle': 36994,
 'vacate': 18221,
 'yardage': 14049,
 'overrun': 42877,
 'the_test_drive': 22176,
 'long_island': 15744,
 'the_end_of_the_18th_century': 23412,
 'the_pillow_book': 34584,
 '24th': 9997,
 'alexis_de_tocqueville': 19037,
 'tacoma-seattle': 35439,
 'embarked': 12259,
 'profesional': 24380,
 'formalistic': 48369,
 'aug_2000-may_2001': 19716,
 'the_life_of_pablo': 30594,
 'grand_lodges_of': 30499,
 'the_appeals_chamber': 48715,
 'the_chola_empire': 18980,
 'nong_shu': 50797,
 'congregating': 49320,
 'pregnancy': 5461,
 '1975': 3499,
 'curia_regis': 52151,
 'edc': 17831,
 '1999-2000': 33054,
 'kithara': 47292,
 'warmia': 35108,
 'vacillating': 27318,
 'condensed': 18850,
 'the_great_famine': 6310,
 "''deja_vu": 35516,
 'touches': 13210,
 'nicholas_vavilov': 39929,
 'statutes': 3700,
 'bugatti_veyron_grand_sport_vitesse': 50334,
 'admire': 10826,
 'schoolmates': 31466,
 'features': 1158,
 'kenia': 27007,
 'hms_queen_elizabeth': 27522,
 'gross': 2796,
 'make-up': 9722,
 'moral': 4493,
 'diet': 2460,
 'citizens': 848,
 'uziece': 48200,
 'the_european_convention_on_human_rights': 18231,
 'chaffinch': 33565,
 'handful': 47546,
 'avalonia': 31360,
 'yâhud': 35169,
 '101': 23340,
 'icc': 22398,
 'expropriated': 21682,
 'ascend': 11804,
 'burdens': 11322,
 'rigiorous': 44852,
 'monarchical': 29008,
 'vannevar_bush': 41069,
 'eight': 2469,
 'lodging': 21603,
 'disturbance': 24692,
 'peloponnesian': 33950,
 'clairvaux': 18229,
 'sugar': 4393,
 'team-up': 44025,
 'necrotic': 33812,
 'chunks': 34272,
 '1150': 13632,
 'the_early_16th_century': 52328,
 'matrilineal': 35957,
 'penis-like': 29720,
 'ethnographic': 24253,
 'jane_anderson': 46550,
 'silkworms': 38331,
 'solves': 31814,
 'bats': 8316,
 'iclude': 30028,
 'margarette_reglerin': 42220,
 'qcd': 30923,
 'tefu': 25880,
 'rubbed': 27770,
 'wends': 51278,
 'strategic_defence_review': 29200,
 'nfc': 8898,
 'early_french': 29388,
 'passy': 38069,
 'levies': 17059,
 'concours': 24132,
 'l2c': 51653,
 'injected': 21951,
 'shemanic': 25261,
 'the_literary_capital': 39619,
 'makeshift': 39798,
 'nelson': 18291,
 'tabula': 42292,
 'eduard_shevardnadze': 50875,
 'mummified': 13427,
 'the_107_yuan': 47424,
 'water-balloons': 32352,
 'the_great_fire_of_london': 25757,
 'correspondant': 17388,
 'vampirovibrio': 37625,
 'beginnings': 8022,
 'funded': 2928,
 'insects': 804,
 'fines': 8485,
 'executor': 17145,
 'metamorphic': 20970,
 'proximaty': 45770,
 'hmg-coa': 37807,
 'the_english_football': 34120,
 'insecticide': 17817,
 'paris_urban_area': 42951,
 '1889': 9415,
 'religous': 19197,
 'liquidize': 22471,
 'joseph_schelling': 29219,
 'mount_everest': 22241,
 'typegenre': 38502,
 'congo': 1618,
 'zeppelin': 8043,
 'gaius_marius': 17408,
 'the_western_hemisphere': 19349,
 'vernaculars': 13122,
 'the_consumer_price_index': 31678,
 'stimulus': 6308,
 'geologic': 5042,
 'exective': 48817,
 'william_enston_home': 35861,
 'seating': 7729,
 'bypassed': 20394,
 'creek_oil': 28895,
 'the_great_appalachian_valley': 24787,
 'condoms': 22476,
 'launced': 42788,
 'the_warsaw_uprising': 20535,
 'st_louis': 9880,
 'viceroy': 13006,
 'sura_19_of': 31837,
 'redigitize': 33704,
 'dunlins': 30567,
 'philbin': 45634,
 'herals': 45314,
 'aphis': 23042,
 'the_pashupatinath_temple': 16128,
 'chelator': 46142,
 'alaskan_jews': 24458,
 'company_sabmiller': 44930,
 'over_months': 51992,
 "sant'ambrogio": 49715,
 'fronting': 35747,
 'biracial': 10757,
 'wooster_square': 16920,
 'colleges': 2344,
 'the_age_of': 5403,
 'x2a_are': 29175,
 '6-10_volume': 50635,
 'committees': 4884,
 'non-bird': 49807,
 'native_hawaiian': 34997,
 'tucker': 11479,
 'dpon-chen': 18873,
 'slant': 26565,
 'blogger': 22179,
 'january_24_2000': 34016,
 'the_doomsday_project': 18945,
 'ps3_playstation': 46598,
 'subcommittee': 35468,
 'variance': 7704,
 'the_mariana_islands': 27560,
 'open_skies_policy': 27267,
 'librarian': 42312,
 'herbivores': 12967,
 'july_1992': 45749,
 'chaotic': 18183,
 'the_african_great_lakes': 25680,
 'nazi_germans': 49785,
 'vapors': 40586,
 'ncaa_national_championships': 17580,
 'decorum': 39842,
 'walker': 22694,
 'emac': 22161,
 'misuse': 20812,
 'james_gleck': 44899,
 'utilized': 3215,
 'deveops': 29989,
 'et': 4915,
 'central_or_south_america': 35862,
 'bsod': 16847,
 'interest-only': 33701,
 'elliot': 31264,
 'million_square_feet': 24680,
 'eldest': 10552,
 'george_stinney': 38309,
 'patronized': 43315,
 '1957': 3685,
 'the_last_five_missions': 33855,
 'nicknamed': 6346,
 'act_2012': 32434,
 'phase_ii': 24562,
 'more_than_a_year': 44890,
 'fixture': 10085,
 'mr': 2993,
 'the_2nd_millennium': 41141,
 'woodward': 22182,
 'warrior-like': 52343,
 'sensitization': 45107,
 'non-open_directory': 25137,
 'mongol-tibetan': 18603,
 'new_delhi_municipal_council': 42318,
 'elevators': 1962,
 'hyperactive': 26648,
 'goulet': 46818,
 'benin_burkina_faso': 43663,
 '1170s': 35997,
 'mission_bay': 27529,
 'exportation': 26050,
 'candadites': 24888,
 'lava': 28639,
 'the_nelke_theatre': 48807,
 'the_green_power_partnership': 24857,
 'the_coral_sea': 18280,
 'others': 1337,
 'indignation': 16858,
 'redused': 52494,
 'familly-festival': 47783,
 'castes': 51285,
 'inner-groove': 47941,
 'james_o_fraser': 38961,
 '3_july_2012': 46712,
 'the_serbo-croat_language': 52562,
 '677': 30227,
 'abillity': 26037,
 'centeno': 30000,
 'icrisat': 21289,
 'university_college': 22518,
 'saturday': 6771,
 'emmerge': 30502,
 'lombard': 44676,
 'joe_royle': 14625,
 'crimean_scythians': 24721,
 'clark': 8030,
 'greek_orthodox_church': 46067,
 'deleterious': 21945,
 'short': 1325,
 'depart': 4587,
 'punala': 34108,
 'the_fertile_crescent': 13486,
 'the_racial_contract': 51042,
 'mount_olympus': 20450,
 'eastern_europe': 6083,
 'tentilla': 23091,
 'ner': 47979,
 'dermis': 17272,
 '1688': 24976,
 'the_centennial_exposition': 44439,
 'bud_light': 37153,
 'centenari': 32170,
 'newt_gingrich': 23261,
 'plenty': 27716,
 'popped': 23466,
 'fredicton': 35124,
 'accusation': 9144,
 'axially': 37771,
 'recipe': 8509,
 'origen': 46435,
 'denounced': 9592,
 'boulangism': 36042,
 'fold3': 15715,
 'forming': 4286,
 'akiko_komoto': 31612,
 'date_records': 14329,
 'burgoyne': 20294,
 'wether': 18896,
 'pere_vincent_jandel': 43046,
 'organs': 4330,
 'promoter': 10492,
 'cytokinins': 21614,
 'franz_liszt': 21165,
 '21st_street': 14906,
 'competitiveness': 45476,
 'sandisk': 14140,
 're-released': 16671,
 'manually': 13623,
 'outraged': 44473,
 'hayy_ibn_yaqdha': 22094,
 'roman_classical': 48508,
 '1_or': 32245,
 'between_2': 28160,
 'parentalia': 22008,
 'clump': 36485,
 'history_of_the_inductive_sciences': 50003,
 'route_66_begin': 40670,
 'the_nizam_club': 48750,
 'mark_woods': 11690,
 'stratigraphers': 23183,
 'dubstep': 49773,
 'anthonio_colve': 43307,
 '1993-2013': 34795,
 'phoenicians': 10670,
 'pope_pius_iv': 28163,
 'government-sponsored': 31895,
 'commensal': 18541,
 'the_day_of_the_dead_scene': 52394,
 'berd': 28225,
 'commonalities': 45303,
 'problems': 681,
 'stagnate': 15714,
 '37nn': 43549,
 'november_1783': 41136,
 'all-in-one': 33632,
 'the_gospel_of_john': 42315,
 'option': 3500,
 'the_hub_bid': 45881,
 'young': 1631,
 'dislike': 5372,
 'pathans': 24676,
 'dreaming': 39896,
 'the_opera_tour': 32031,
 'vertaling': 26080,
 'disproportionately': 16521,
 'dich': 49123,
 'economically': 5742,
 'the_parliamentary_labour_party': 36772,
 'inductees': 42887,
 'turkana_boy': 36705,
 'symphony': 6522,
 'the_problem_of_the_middle_east': 42084,
 'the_rocky_and_bullwinkle_show': 24448,
 'imperial_china': 17008,
 'the_democratic_republic_of': 24630,
 'bank_of_america': 16519,
 'bomber-aimer': 30219,
 'will_rogers': 23590,
 'trust': 4977,
 'backstreet_boys': 22334,
 'parchmentised': 47336,
 'mainain': 37060,
 'enrichment': 19924,
 'imitated': 27748,
 'qma': 38996,
 '82_million': 42375,
 'open_client': 46769,
 'widened': 12194,
 'mere': 41931,
 'videogames': 25402,
 'the_longmenshan_fault': 36489,
 'louis_ii': 26293,
 'alberti': 28011,
 'bulging': 34110,
 'jābir_al-ḥarrānī_al-battānī': 21952,
 'haploid': 21171,
 'conformities': 27128,
 'aclu': 13299,
 'sibling': 12667,
 'part': 69,
 'the_national_conference': 50794,
 'wonthaggi': 21935,
 'the_federal_open_market_committee': 49547,
 'mitchell_tower': 49809,
 '1767': 11344,
 'the_2009': 44457,
 'frenchtown': 35066,
 'articulated': 24225,
 'al-faqih': 30708,
 'shields': 15723,
 'the_ministry_of_foreign_affairs': 15750,
 'many_miles': 8746,
 'albanians': 42731,
 'eukaryotes': 5451,
 'old_dutch': 13002,
 'lined': 14319,
 'eastern_massachusetts': 51365,
 'previously-typed': 40533,
 'until_season_ten': 26215,
 'urgan': 27352,
 '280': 40340,
 'the_killers': 26582,
 'san_giovanni_evangelista': 20371,
 'materially': 32928,
 'anarchists': 12141,
 'the_thar_desert': 18818,
 'specialty': 6575,
 'western_electric': 40477,
 'aquila': 32128,
 'created': 114,
 '59-mile': 29470,
 'the_nanjing_massacre_memorial_hall': 51188,
 'inventiveness': 33437,
 'breakdown': 7875,
 'conecept': 42414,
 'chronic': 4848,
 'depict': 5445,
 "5'3": 19890,
 'five_days': 49675,
 'populous': 3122,
 'laurisilva': 22396,
 'assesed': 40172,
 'the_trident_regional_medical_center': 29467,
 'versions': 1841,
 'the_25th': 48983,
 'fare': 8249,
 'the_greek_language': 47400,
 'afro-caribbean': 19270,
 'co-discovery': 17009,
 'northumbrian': 19265,
 'praising': 43142,
 'chiffon': 26963,
 'north_american_lutherans': 51378,
 'takes': 1638,
 'nm': 12420,
 'south_florida': 22919,
 'the_tower_district': 14304,
 'bruno_wenn': 29743,
 'deprivation': 23476,
 'northwest': 4050,
 'dubai': 19653,
 'stomp': 19662,
 'practice': 766,
 'locations': 2096,
 'jaws': 7139,
 '1766': 27389,
 'stretched': 10798,
 'rotor': 5410,
 'possibly': 2927,
 'non-mainland': 43195,
 'mm': 21253,
 'ben_franklin': 13358,
 'rooms': 2688,
 'hosokibara': 28514,
 'byu_jerusalem_center': 24575,
 'microprobes': 26440,
 'wise': 10359,
 'cromwell_gardens': 20159,
 'hours': 2126,
 'articulations': 36014,
 'two_weeks': 15172,
 'sealing': 40660,
 'kaunitz': 21265,
 'homosexuals': 16445,
 'brigadier': 25991,
 'famiyl': 33600,
 'martyr': 25064,
 'bulletin': 31743,
 'alternate-day': 28807,
 'hosted': 1459,
 'the_hildebrand_rarity': 44766,
 'a_17th_century': 50750,
 'bigger': 5554,
 '1567': 42277,
 'itchen': 45154,
 'the_galápagos_islands': 34817,
 "l'écho": 45138,
 'susrutasamhita': 42409,
 'their_closing_hours': 36702,
 "'new": 32816,
 'apg': 13567,
 'reclamation': 16888,
 'balta': 21209,
 'zhu': 17544,
 'agra': 48853,
 "world's_most": 42031,
 'to': 7,
 'january_30th': 28530,
 'intended': 1292,
 'jokes': 41113,
 'scotty': 40244,
 'antiochus_the_great': 42947,
 'where': 28,
 'omitted': 16626,
 'trimmer': 31619,
 'miramar': 36057,
 'acctepted': 35686,
 'governor-equipped': 36649,
 'over_500_pounds': 40842,
 'dipole': 5483,
 'romances': 43228,
 'premium': 14507,
 'morrison': 8319,
 'iec': 9919,
 'heavier-than-air': 30746,
 'miamisouth_florida': 31355,
 'intervening': 14953,
 'uncle': 8059,
 'warp': 42542,
 'institutional': 10902,
 'claim': 507,
 'the_hongguang_emperor': 18591,
 'responsa': 47009,
 'cousing': 34834,
 'follow': 1286,
 'empiricists': 46301,
 'serenades': 35973,
 'outgassing': 22203,
 'peter_minuit': 46933,
 'the_super_slim': 33551,
 'nominating': 16336,
 'lumber': 7473,
 'deteriorated': 16522,
 'lactating': 23908,
 'surjective': 23068,
 '90_to': 38953,
 'relationships': 2274,
 'the_maurienne_valley': 18434,
 'maritime': 6849,
 'jonassohn': 11816,
 'portarlington': 30946,
 'super-boom': 37167,
 'mennonites': 9775,
 'vhf': 37722,
 'the_archaemenid_empire': 8187,
 '2012_holiday_season': 26047,
 'body': 202,
 'self-reported': 20302,
 'rayleigh__jeans': 21383,
 'assorted': 39455,
 'rev': 24222,
 'slangs': 44448,
 'the_swazi_house_of_assembly': 13401,
 'pigmented': 37727,
 'embarcadero': 37800,
 'danny_trevathan': 40680,
 'the_kievan_rus': 21179,
 'standalone': 22237,
 'nov_11_1911': 43027,
 'a171': 34399,
 'safeguards': 49954,
 'niche': 13746,
 'theropods': 20678,
 'proctodaeum': 31148,
 'prosperous': 18918,
 'ipcc_fourth_assessment_report': 24952,
 'cantabile': 29011,
 ...}

In [7]:
tokenized_indexed_sentences[0]


Out[7]:
[1, 3, 2206, 9, 388, 498, 93, 108, 5, 0]

In [8]:
tokenized_indexed_sentences = [np.array(sentence) + 1 for sentence in tokenized_indexed_sentences if len(sentence) > 0]

In [9]:
tokenized_indexed_sentences[0]


Out[9]:
array([   2,    4, 2207,   10,  389,  499,   94,  109,    6,    1])

In [10]:
new_weights = np.zeros((1, word2vec_weights.shape[1]))

In [11]:
new_weights = np.append(new_weights, word2vec_weights, axis=0)

generating training data


In [12]:
# window_size = 5
vocab_size = len(word2index)
print(vocab_size)


52730

In [13]:
maxlen = max([len(sentence) for sentence in tokenized_indexed_sentences])

In [14]:
tokenized_indexed_sentences = sequence.pad_sequences(tokenized_indexed_sentences)

In [19]:
seq_in = np.zeros_like(tokenized_indexed_sentences)
seq_out = np.zeros((tokenized_indexed_sentences.shape[0], tokenized_indexed_sentences.shape[1], 300))

# Generating Dataset
for index, sentence in enumerate(tokenized_indexed_sentences):
    y = np.append(sentence[1:], np.array(sentence[len(sentence)-1]))
    seq_in[index] += sentence
    seq_out[index] += [new_weights[i] for i in y]

n_samples = len(seq_in)
print ("Number of samples : ", n_samples)


Number of samples :  97974

Defining model


In [35]:
# Changes to the model to be done here
model = Sequential()
model.add(Embedding(input_dim=new_weights.shape[0], output_dim=new_weights.shape[1], weights=[new_weights], mask_zero=True))
model.add(LSTM(1024,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(300, return_sequences=True, activation=activation))
# model.load_weights("../weights/lstm-2-1024-300-batchsize-512-epochs-25-Sequence/weights.24.hdf5")
model.compile(loss=loss_function, optimizer='adam',metrics=['accuracy'])
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_2 (Embedding)      (None, None, 300)         15819300  
_________________________________________________________________
lstm_3 (LSTM)                (None, None, 1024)        5427200   
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 1024)        0         
_________________________________________________________________
lstm_4 (LSTM)                (None, None, 300)         1590000   
=================================================================
Total params: 22,836,500
Trainable params: 22,836,500
Non-trainable params: 0
_________________________________________________________________

Creating Weights Directory


In [36]:
model_weights_path = "../weights/" + model_name
if not os.path.exists(model_weights_path):
    os.makedirs(model_weights_path)
checkpoint_path = model_weights_path + '/weights.{epoch:02d}.hdf5'
checkpoint = ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_best_only=False, mode='max')

Train Model


In [37]:
model_fit_summary = model.fit(seq_in, seq_out, epochs=epochs, verbose=1, batch_size=batch_size, callbacks=[checkpoint])


Epoch 1/1
Epoch 00000: saving model to ../weights/lstm-2-1024-300-batchsize-512-epochs-25-Sequence/weights.00.hdf5
2/2 [==============================] - 1s - loss: 0.0700 - acc: 0.0000e+00

Predictions


In [38]:
start = 0
sentence_test = "In which regions in particular did"
indexed_sentences = embeddings.get_indexed_query(sentence_test)
print("indexed_sentences ",indexed_sentences)
sent = np.array(indexed_sentences)
#pattern = list(seq_in[start])
pattern = list(sent)
print("\"",' '.join(index2word[index] for index in pattern))
for i in range(10):
    prediction = model.predict(np.array([pattern]))
    pred_word = word2vec_model.similar_by_vector(prediction[0][prediction.shape[1] - 1])[0][0]
    sys.stdout.write(pred_word+" ")
    pattern.append(word2index[pred_word])
    pattern = pattern[:len(pattern)]


indexed_sentences  [1, 5, 17, 875, 5, 1707, 9]
" squadend of when indigenous of ban is
indoe indoe indoe indoe indoe indoe indoe indoe indoe indoe 

Model Summary


In [33]:
model_results = model_fit_summary.history
model_results.update(model_fit_summary.params)
model_results["word_embedding_dimension"] = word_embedding_dimension
model_results["word_embedding_window_size"] = word_embedding_window_size
model_results["window_size"] = window_size
model_results["batch_size"] = batch_size
model_results["epochs"] = epochs
model_results["model_name"] = model_name
model_results["accuracy_threshold"] = accuracy_threshold
model_results["activation"] = activation 
model_results["custom_accuracy"] = custom_accuracy
model_results["loss_function"] = loss_function
model_results["layers"] = []
model_results["dropouts"] = []
for layer in model.layers:
    if hasattr(layer, "units"):
        layer_summary = {}
        layer_summary["units"] = layer.get_config()["units"]
        layer_summary["name"] = layer.name
        model_results["layers"].append(layer_summary)
    if hasattr(layer, "rate"):
        dropout_summary = {}
        dropout_summary["rate"] = layer.get_config()["rate"]
        model_results["dropouts"].append(dropout_summary)
text_file_path = "../weights/{0}/model_results.json".format(model_name)
with open(text_file_path, "w") as f:
        json.dump(model_results, f)