Chapter 3.5 - Classifying newswires The Reuters dataset


In [1]:
# Loading the Reuters dataset
from keras.datasets import reuters


Using TensorFlow backend.

In [2]:
num_words = 10000

In [3]:
(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words = num_words)

In [4]:
# Training data
len(train_data)


Out[4]:
8982

In [5]:
# Test data
len(test_data)


Out[5]:
2246

In [6]:
# The data is a list of integers
train_data[0]


Out[6]:
[1,
 2,
 2,
 8,
 43,
 10,
 447,
 5,
 25,
 207,
 270,
 5,
 3095,
 111,
 16,
 369,
 186,
 90,
 67,
 7,
 89,
 5,
 19,
 102,
 6,
 19,
 124,
 15,
 90,
 67,
 84,
 22,
 482,
 26,
 7,
 48,
 4,
 49,
 8,
 864,
 39,
 209,
 154,
 6,
 151,
 6,
 83,
 11,
 15,
 22,
 155,
 11,
 15,
 7,
 48,
 9,
 4579,
 1005,
 504,
 6,
 258,
 6,
 272,
 11,
 15,
 22,
 134,
 44,
 11,
 15,
 16,
 8,
 197,
 1245,
 90,
 67,
 52,
 29,
 209,
 30,
 32,
 132,
 6,
 109,
 15,
 17,
 12]

Decoding newswires back to text


In [7]:
# Getting index of words
word_index = reuters.get_word_index()

In [8]:
word_index


Out[8]:
{'mdbl': 10996,
 'fawc': 16260,
 'degussa': 12089,
 'woods': 8803,
 'hanging': 13796,
 'localized': 20672,
 'sation': 20673,
 'chanthaburi': 20675,
 'refunding': 10997,
 'hermann': 8804,
 'passsengers': 20676,
 'stipulate': 20677,
 'heublein': 8352,
 'screaming': 20713,
 'tcby': 16261,
 'four': 185,
 'grains': 1642,
 'broiler': 20680,
 'wooden': 12090,
 'wednesday': 1220,
 'highveld': 13797,
 'duffour': 7593,
 '0053': 20681,
 'elections': 3914,
 '270': 2563,
 '271': 3551,
 '272': 5113,
 '273': 3552,
 '274': 3400,
 'rudman': 7975,
 '276': 3401,
 '277': 3478,
 '278': 3632,
 '279': 4309,
 'dormancy': 9381,
 'errors': 7247,
 'deferred': 3086,
 'sptnd': 20683,
 'cooking': 8805,
 'stratabit': 20684,
 'designing': 16262,
 'metalurgicos': 20685,
 'databank': 13798,
 '300er': 20686,
 'shocks': 20687,
 'nawg': 7972,
 'tnta': 20688,
 'perforations': 20689,
 'affiliates': 2891,
 '27p': 20690,
 'ching': 16263,
 'china': 595,
 'wagyu': 16264,
 'affiliated': 3189,
 'chino': 16265,
 'chinh': 16266,
 'slickline': 20692,
 'doldrums': 13799,
 'kids': 12092,
 'climbed': 3028,
 'controversy': 6693,
 'kidd': 20693,
 'spotty': 12093,
 'rebel': 12639,
 'millimetres': 9382,
 'golden': 4007,
 'projection': 5689,
 'stern': 12094,
 "hudson's": 7903,
 'dna': 10066,
 'dnc': 20695,
 'hodler': 20696,
 'lme': 2394,
 'insolvancy': 20697,
 'music': 13800,
 'therefore': 1984,
 'dns': 10998,
 'distortions': 6959,
 'thassos': 13801,
 'populations': 20698,
 'meteorologist': 8806,
 'loss': 43,
 'exco': 9383,
 'adventist': 20813,
 'murchison': 16267,
 'locked': 10999,
 'kampala': 13802,
 'arndt': 20699,
 'nakasone': 1267,
 'steinweg': 20700,
 "india's": 3633,
 'wang': 3029,
 'wane': 10067,
 'unjust': 13803,
 'titanium': 13804,
 'want': 850,
 'pinto': 20701,
 "institutes'": 16268,
 'absolute': 7973,
 'travel': 4677,
 'cutback': 6422,
 'nazmi': 16269,
 'modest': 1858,
 'shopwell': 16270,
 'sedi': 20702,
 'adoped': 20703,
 'tulis': 16271,
 '18th': 20704,
 "wmc's": 20705,
 'menlo': 20706,
 'reiners': 11000,
 'farmlands': 12095,
 'nonsensical': 20707,
 'elisra': 20708,
 'welcomed': 2461,
 'peup': 20709,
 "holiday's": 16272,
 'activating': 20711,
 'avondale': 16273,
 'interational': 16274,
 'welcomes': 20712,
 'fip': 16275,
 'tailings': 11001,
 'fit': 4205,
 'lifeline': 16276,
 'bringing': 1916,
 'fix': 4819,
 '624': 6164,
 'naturalite': 12096,
 'wales': 6165,
 'fin': 8807,
 'fio': 11129,
 'ceremenony': 20714,
 'sovr': 20715,
 "yeo's": 20716,
 'effects': 1788,
 'sixteen': 13805,
 'undeveloped': 8808,
 'glutted': 13806,
 'barton': 20717,
 'froday': 20718,
 'arrow': 10089,
 'stabilises': 11002,
 'allan': 6960,
 '374p': 20719,
 '393': 3891,
 '392': 4008,
 '391': 4206,
 '390': 3079,
 '397': 4550,
 '396': 6166,
 '395': 6423,
 '394': 4207,
 '399': 6961,
 '398': 4208,
 'stabilised': 7595,
 'smelters': 5114,
 'oprah': 20720,
 'orginially': 20721,
 "tvx's": 20722,
 'ponomarev': 16278,
 'enviroment': 20723,
 "reeves'": 20724,
 'mason': 8363,
 'encourage': 1670,
 'adapt': 7596,
 'abbott': 12776,
 'stamping': 13808,
 'colquiri': 20726,
 'ambrit': 11003,
 'strata': 8353,
 'corrects': 4821,
 'sandra': 11922,
 'estimate': 859,
 'universally': 20727,
 'chlorine': 20728,
 'competes': 16279,
 'leiner': 10068,
 'ministries': 8809,
 'disturbed': 8810,
 'competed': 13809,
 'juergen': 8811,
 'kfw': 13810,
 'turben': 11004,
 'reintroduced': 9384,
 'maladies': 20729,
 'chevron': 4101,
 'lazere': 16280,
 'antilles': 8812,
 'dti': 11907,
 'specially': 9070,
 'bilzerian': 4678,
 'bakelite': 13811,
 'renovated': 20730,
 'service': 568,
 'payless': 16281,
 'spiegler': 20731,
 'needed': 831,
 'wigglesworth': 16282,
 'master': 6962,
 'antonson': 13812,
 'genesis': 20732,
 'vismara': 13813,
 'organically': 20734,
 "accords'": 20735,
 'task': 5940,
 'positively': 7974,
 'feasibility': 3479,
 'ahmed': 6963,
 "suralco's": 13814,
 'awacs': 20736,
 'idly': 16283,
 'regulator': 20737,
 'pseudorabies': 12097,
 'staubli': 16284,
 'nzi': 8813,
 'feeling': 5115,
 '275': 3127,
 '6819': 20738,
 'gorman': 16285,
 'sustaining': 8354,
 'spectrum': 9385,
 'consenting': 20739,
 'recapitalized': 12098,
 'sailed': 11562,
 'dozen': 7597,
 'affairs': 1985,
 'courier': 2253,
 'kremlin': 8355,
 'shipments': 895,
 "aquino's": 16286,
 'committing': 10070,
 'sugarcane': 5293,
 'diminishing': 9386,
 'vexing': 16287,
 'simplify': 11005,
 'mouth': 6167,
 'steinhardt': 7248,
 'conceded': 8814,
 'bradford': 9387,
 'singer': 7976,
 '5602': 20740,
 "1987's": 13816,
 'tech': 4950,
 'teck': 6424,
 'majv': 20741,
 'saying': 666,
 'dickey': 16477,
 'sweetner': 20742,
 'teresa': 21149,
 'ulcer': 20743,
 'cheaply': 13817,
 'thai': 2361,
 'orleans': 6964,
 'excavator': 16290,
 'rico': 6168,
 'lube': 12099,
 'rick': 13818,
 'rich': 4679,
 'kerna': 13819,
 'rice': 950,
 'rica': 4209,
 'plate': 5503,
 'platt': 16291,
 'altogether': 8356,
 'jaguar': 8815,
 'dynair': 20744,
 'patch': 8816,
 'ldp': 2892,
 'boarded': 13820,
 'precluding': 16292,
 'clarified': 11006,
 'sensitivity': 16293,
 'alternative': 1511,
 'clarifies': 11007,
 'lots': 5116,
 'irs': 7598,
 'irv': 20745,
 'iri': 13821,
 'ira': 13822,
 'timber': 5690,
 'ire': 20746,
 'discipline': 5219,
 'extend': 1937,
 'nature': 3634,
 "amb's": 16295,
 'dunhill': 16296,
 'extent': 2142,
 'restrcitions': 20747,
 'heating': 2396,
 "mannesmann's": 11008,
 'outsanding': 20748,
 'multimillions': 20749,
 'sarcinelli': 13824,
 'southeastern': 6694,
 'eradicate': 10071,
 'libyan': 9388,
 'foreclosing': 20750,
 'maclaine': 12101,
 'fra': 20751,
 'union': 353,
 'frn': 11009,
 'much': 386,
 'fry': 12102,
 'mothball': 20752,
 'chlorazepate': 10072,
 'dxns': 12103,
 'toyko': 19981,
 'spit': 20753,
 '007050': 16297,
 'freehold': 16298,
 'davy': 13825,
 'dave': 11010,
 'spie': 12177,
 'aguayo': 10117,
 'wildcat': 12104,
 'fecs': 10069,
 'kennan': 20754,
 'intal': 16299,
 'contingencies': 9389,
 'professionally': 16551,
 'microbiological': 16300,
 'misconstrued': 20756,
 'k': 409,
 'securitiesd': 20757,
 'deferring': 16301,
 'kohl': 5941,
 'conditioned': 3030,
 'fnhb': 20758,
 "october's": 16302,
 'memorial': 13954,
 'democracies': 6965,
 'conformed': 27520,
 'split': 464,
 "bond's": 12105,
 'thinly': 11112,
 'dunkirk': 16515,
 'cavanaugh': 16303,
 "securities'": 13827,
 'marches': 21345,
 'issam': 16304,
 'workforce': 2020,
 'meinert': 12106,
 'boiler': 13828,
 "bp's": 5294,
 'torpedoed': 16305,
 'indidate': 20762,
 'downwardly': 13829,
 'viviez': 20763,
 'vladiminovich': 20764,
 'academic': 16306,
 'architecural': 20765,
 'corporate': 1117,
 'appropriately': 16307,
 'teicc': 20766,
 "hanover's": 20767,
 'aristech': 8817,
 'portrayed': 20768,
 'raffineries': 21383,
 'hai': 20770,
 'hal': 7599,
 'ham': 13830,
 'han': 10073,
 'e15b': 20771,
 'had': 61,
 'hay': 20772,
 'botchwey': 13831,
 'haq': 10074,
 'has': 37,
 'hat': 13832,
 'hav': 20773,
 'fortin': 20774,
 'municipal': 8818,
 'osman': 20775,
 'fsical': 20776,
 'elders': 3480,
 'survival': 12107,
 'unequivocally': 16308,
 'objective': 2519,
 'indicative': 6695,
 'shadow': 10075,
 'riskiness': 21411,
 'positiive': 20778,
 "american's": 10076,
 'alick': 16309,
 'harima': 16310,
 'alice': 12108,
 'altschul': 20779,
 'festivities': 16311,
 'medecines': 20780,
 'beneficial': 2942,
 'yoweri': 12109,
 'crowd': 13833,
 'crowe': 9390,
 'crown': 3553,
 'topping': 13679,
 'captive': 8819,
 'billboard': 12110,
 'fiduciary': 6169,
 'bottom': 3402,
 'plucked': 20782,
 'locksmithing': 20783,
 'ecopetrol': 9391,
 'pipestone': 24018,
 "growers'": 5505,
 'borrows': 20785,
 'eduard': 16312,
 'venpres': 13834,
 'bamboo': 16313,
 'foolish': 13835,
 'uruguyan': 20786,
 'officeholders': 20787,
 'economiques': 20788,
 'aden': 16314,
 'maxwell': 4822,
 'marshall': 4680,
 'honeymoon': 16315,
 'administer': 16316,
 'shoots': 20790,
 'rubbertech': 16317,
 'johsen': 16318,
 'reciprocity': 10077,
 'fabric': 13836,
 'suffice': 20791,
 'spokemsan': 20792,
 "sonora's": 20793,
 '5865': 16319,
 "systems'": 16320,
 'perfumes': 20794,
 'halycon': 20795,
 'nonvoting': 20796,
 'safeguard': 7250,
 'sawdust': 21538,
 "else's": 20797,
 'arrays': 13837,
 'aza': 20798,
 'smasher': 20799,
 'complications': 12111,
 'pesos': 1813,
 'relabelling': 20800,
 'passenger': 3722,
 "avon's": 12112,
 'megahertz': 20801,
 'mirror': 10683,
 'minas': 8357,
 'bourdain': 16322,
 'crownx': 20802,
 'eventual': 6425,
 'crowns': 1207,
 'role': 1369,
 'obliges': 20803,
 'rolf': 16323,
 'vegetative': 13838,
 'rolm': 20804,
 'roll': 4419,
 'intend': 2463,
 'palms': 16324,
 'denys': 19255,
 'transported': 13839,
 'moresby': 20805,
 'devon': 16325,
 'intent': 1351,
 "camco's": 20806,
 'variable': 5942,
 'transporter': 20807,
 'danske': 16326,
 'friedhelm': 13840,
 'hawker': 8358,
 "sand's": 17774,
 'preseving': 20808,
 '80386': 12113,
 'bnls': 16328,
 'ordination': 19984,
 'overturned': 11011,
 'erred': 16329,
 'cincinnati': 6696,
 'corps': 16710,
 'whoever': 20809,
 'osp': 16330,
 'osr': 13841,
 'ost': 12114,
 'chair': 16331,
 '690': 5647,
 'grapples': 20810,
 'megawatts': 13842,
 'photocopiers': 20811,
 'sconninx': 20812,
 'circumstances': 2274,
 'oversight': 13843,
 "paradyne's": 20814,
 '691': 6363,
 'paychecks': 20815,
 "stadelmann's": 13844,
 'choice': 3241,
 'vastagh': 11012,
 'embark': 8820,
 'gloomy': 9392,
 'stays': 9393,
 'exact': 4009,
 'minute': 5117,
 'kittiwake': 11892,
 'picul': 20816,
 'skewed': 20817,
 'cooke': 11013,
 'defaults': 10078,
 'reimpose': 11014,
 'hindered': 9394,
 'lengthened': 20818,
 'chopping': 16333,
 'mckiernan': 13845,
 'collaspe': 20819,
 'corazon': 7251,
 'antwerp': 7600,
 'abdullah': 13846,
 'goldston': 13847,
 '300': 442,
 'cassa': 20821,
 'casse': 20822,
 '695': 4081,
 'ground': 2979,
 'boost': 839,
 'azusa': 16334,
 'drafted': 9395,
 '303': 4823,
 'climbs': 13848,
 'honour': 7601,
 'vanderbilt': 20823,
 '305': 3968,
 'address': 3031,
 'dwindling': 8821,
 'benson': 7252,
 'enroll': 12115,
 'revenues': 501,
 'impacted': 12116,
 'queue': 20826,
 'accomplished': 10079,
 'throughput': 7602,
 'influx': 9396,
 'stockbuilding': 10080,
 'aproximates': 20827,
 'petroleo': 13849,
 'sistemas': 16335,
 'feretti': 14053,
 'opposes': 5943,
 'working': 882,
 'perished': 20829,
 'oldham': 13850,
 '27000': 20830,
 'optimize': 19245,
 'vigour': 20832,
 'opposed': 1580,
 'liberalizing': 16336,
 'wvz': 20833,
 'dampness': 20834,
 'approving': 13851,
 'sierra': 13496,
 'entrepot': 20835,
 'currency': 224,
 'originally': 1499,
 'tindemans': 20837,
 'valorem': 16337,
 'following': 477,
 'fossen': 20838,
 'locke': 11016,
 'employess': 20839,
 'rotberg': 12117,
 'parachute': 16338,
 'locks': 11017,
 'incremental': 12255,
 'woolowrth': 16339,
 'listens': 20841,
 'litre': 7253,
 'edouard': 3554,
 'ounce': 1377,
 'nicanor': 20843,
 'sucocitrico': 20844,
 'minicomputers': 16340,
 "silva's": 16341,
 'restitutions': 11018,
 'custer': 16342,
 '3rd': 2590,
 'fueled': 10081,
 'trydahl': 20845,
 'aice': 11019,
 'harmon': 12118,
 'conscious': 10082,
 'herbicidesand': 20846,
 'subdivisions': 20847,
 "veslefrikk's": 20848,
 'swollen': 11020,
 'pulled': 7978,
 'tilney': 20849,
 'years': 203,
 'structuring': 20850,
 'episodes': 20851,
 'sportscene': 16343,
 "northair's": 16344,
 'jig': 20852,
 'jin': 20853,
 'jim': 3403,
 'troubles': 8359,
 'workforces': 13852,
 'suspension': 2362,
 'troubled': 3892,
 'fondiaria': 16345,
 'modestly': 6697,
 'recipients': 12119,
 'civilian': 7979,
 'indigenous': 13853,
 'overpowering': 20854,
 'drilling': 1051,
 'sorted': 16346,
 'lichtenstein': 16347,
 'bedevil': 20855,
 'dispite': 20856,
 'battleships': 16843,
 'instability': 4824,
 'quarter': 95,
 'salado': 20857,
 'honduras': 5692,
 "chevron's": 13855,
 "lazere's": 12273,
 'receipt': 2660,
 'sponsor': 8360,
 'entering': 4825,
 "kcbt's": 16349,
 'nowicki': 19987,
 'salads': 13856,
 'augar': 16351,
 '797': 7980,
 '796': 7254,
 '795': 8361,
 '794': 5295,
 '793': 5118,
 '792': 6170,
 '791': 5296,
 '790': 4826,
 "nikko's": 20858,
 'unsaleable': 20859,
 '799': 5720,
 '798': 5693,
 'seriously': 2143,
 'trauma': 16352,
 'tvbh': 20860,
 'macedon': 20861,
 'disintegrated': 21906,
 'adddition': 21909,
 'incentives': 2244,
 'complicated': 5944,
 'reevaluating': 20864,
 'thatching': 21921,
 'brasil': 7981,
 '79p': 20865,
 'wrong': 4951,
 'initiate': 8822,
 'aboard': 16353,
 'saving': 7255,
 'spoken': 8823,
 'parkinson': 16364,
 'one': 65,
 'ont': 20867,
 'concert': 7256,
 "boston's": 16354,
 'stifled': 13859,
 'types': 4622,
 'lingering': 20868,
 'surges': 16356,
 'hurdman': 20869,
 'herds': 16357,
 'absorbs': 14114,
 'surged': 4681,
 'dalkon': 14211,
 'crossroads': 13860,
 'shakeup': 20870,
 'disasterous': 20871,
 'illness': 11021,
 'turned': 3242,
 'locations': 3801,
 'tyranite': 12120,
 'minesweepers': 13861,
 'turner': 7257,
 'borough': 20872,
 'underlines': 12358,
 "bancorporation's": 20873,
 'fashionable': 20874,
 "ae's": 20875,
 'dilutions': 16358,
 'goodman': 9472,
 'unlawfully': 10510,
 'mayer': 16359,
 'printer': 16360,
 'offload': 20877,
 'opposite': 13862,
 'buffer': 738,
 'printed': 9398,
 'pequiven': 16361,
 'panoche': 13863,
 'knowingly': 20878,
 'ecusta': 16362,
 'thsl': 20879,
 'phil': 8825,
 'jitters': 13864,
 'touche': 16363,
 'jittery': 20881,
 'friction': 3291,
 'fecal': 16365,
 'resurgance': 22068,
 'heeding': 20882,
 'soviets': 2363,
 'imagined': 16366,
 'transact': 16367,
 'califoirnia': 20883,
 "chrysler's": 9399,
 'respecitvely': 16368,
 'presse': 16369,
 'euromarket': 10084,
 'guarded': 12121,
 'satisfacotry': 16371,
 'authroization': 20884,
 'simplistic': 20885,
 'monde': 20886,
 'awaiting': 4102,
 'recombinant': 13865,
 'refinancement': 20887,
 'comserv': 20888,
 'kitakyushu': 20889,
 'pima': 16372,
 'basle': 11022,
 '6250': 20891,
 'choudhury': 16373,
 'vision': 8826,
 'interruptible': 20892,
 'weatherford': 13866,
 '832': 7982,
 '833': 5694,
 '830': 4420,
 '831': 5119,
 '836': 5297,
 '837': 4553,
 '834': 6172,
 '835': 4952,
 'alarming': 22144,
 '838': 5695,
 '839': 6173,
 '524p': 20893,
 'sponsorship': 20894,
 'vendex': 12122,
 "amsouth's": 20895,
 'kilometer': 20896,
 'enjoys': 10086,
 'illiberal': 20897,
 'punta': 6174,
 'punte': 20898,
 'girozentrale': 10087,
 'missstatements': 20899,
 'marietta': 10088,
 'awards': 6175,
 'concentrated': 3635,
 '83p': 20900,
 'developpement': 13867,
 'rhodes': 13868,
 'matheson': 5696,
 '1720': 20901,
 'paring': 20902,
 's': 35,
 'concentrates': 4953,
 "can's": 16374,
 'polysaturated': 22183,
 'parini': 20903,
 'baden': 13869,
 'bader': 20904,
 'buoyancy': 12123,
 'erdem': 20905,
 'properites': 16375,
 'comparitive': 20906,
 'practises': 12124,
 'collides': 20907,
 'west': 189,
 'wess': 20908,
 'collided': 13870,
 'practised': 20909,
 "amalgamated's": 20910,
 'motives': 20911,
 'wants': 1378,
 'formed': 1273,
 'readings': 20912,
 'geothermal': 12125,
 'tightened': 7315,
 "d'or": 11023,
 'former': 1109,
 'venezulean': 20913,
 'curd': 19935,
 'squeezes': 12126,
 'newspaper': 1019,
 'situation': 817,
 'ivey': 13871,
 'engaged': 3636,
 'dubious': 13872,
 'cayacq': 17061,
 'cobol': 20916,
 'limping': 20917,
 'technology': 883,
 'koerner': 20919,
 'debilitating': 16376,
 'verified': 7983,
 'otto': 4010,
 '7770': 20920,
 'emulsions': 16377,
 "onic's": 16378,
 'slate': 9075,
 'wires': 20921,
 'edged': 5506,
 'assigns': 20922,
 'singapore': 1341,
 'deflate': 20923,
 "strategy's": 20924,
 'walesa': 16379,
 'advertisement': 4554,
 'luyten': 20925,
 'shrortly': 20926,
 'corpoartion': 20927,
 'preferance': 22290,
 'tracking': 16380,
 'sunnyvale': 13874,
 'colorants': 20928,
 'persistently': 16381,
 "officers'": 16382,
 "his's": 20929,
 'being': 367,
 'divestitures': 7259,
 'steamer': 20930,
 'rover': 20931,
 'grounded': 8362,
 "businessmen's": 16383,
 'cyanidation': 16384,
 'overthrow': 20932,
 'partnerhip': 20933,
 'sumt': 16385,
 'sums': 8827,
 'oelmuehle': 16386,
 'unveil': 16387,
 'gestures': 13875,
 'penta': 20934,
 'traffic': 2544,
 'preference': 2428,
 'sumi': 20935,
 'world': 166,
 'postal': 9400,
 'bced': 16388,
 'dornbush': 12128,
 'confine': 14215,
 '2555': 20936,
 "zambia's": 5945,
 'superiority': 20937,
 'militate': 20938,
 'satisfactory': 2395,
 'superintendent': 20939,
 'tvx': 5946,
 'tvt': 16389,
 'magma': 6698,
 'diving': 20940,
 'tvb': 15548,
 'seaman': 13876,
 'matsunaga': 11025,
 '919': 4827,
 '918': 5298,
 'refundable': 17070,
 '914': 5947,
 '917': 7260,
 '916': 6699,
 '911': 5507,
 '910': 4828,
 'restoring': 10213,
 '912': 4555,
 'squabble': 20942,
 'retains': 7261,
 "partner's": 20943,
 'leadership': 5300,
 'graaf': 11026,
 'spacelab': 20944,
 'thailand': 1800,
 'graan': 9402,
 'exasperating': 20945,
 'hartmarx': 12129,
 'frights': 16390,
 'niall': 20946,
 'johnston': 11027,
 '91p': 16391,
 'sensitively': 16392,
 'porsche': 6016,
 'prepares': 15494,
 'lively': 12130,
 'stoppages': 10686,
 "associated's": 16394,
 'pivot': 12131,
 'series': 1037,
 'sese': 24050,
 'bubble': 7604,
 'trusses': 16395,
 'interestate': 20949,
 'continents': 20950,
 'societal': 20951,
 'with': 28,
 'pull': 6176,
 'rush': 6700,
 'monopoly': 6222,
 'operationally': 20953,
 'dirty': 20954,
 'abuses': 10090,
 'prudhoe': 7262,
 'pulp': 5949,
 'rust': 16396,
 'hellman': 20955,
 'amdec': 20956,
 'australasian': 16397,
 'watches': 13878,
 'hypertension': 20957,
 "hemdale's": 20958,
 'formulation': 16398,
 'watched': 7605,
 'jargon': 20959,
 'cream': 13879,
 'ideally': 9404,
 'ryavec': 11028,
 'microoganisms': 20960,
 'indemnify': 13880,
 'wincenty': 20961,
 'waving': 20962,
 "multifood's": 20963,
 'midges': 20964,
 'natalie': 11029,
 'crosbie': 13881,
 'posible': 20965,
 'omnibus': 13882,
 'assetsof': 20966,
 'tricks': 13883,
 'rs': 16399,
 'kilogram': 20967,
 'pruning': 25363,
 'dyer': 13884,
 'dyes': 20968,
 'legislatures': 20969,
 'scm': 16400,
 'sci': 9405,
 'riedel': 20970,
 'ceramic': 16401,
 'unitholders': 6701,
 'scb': 13885,
 'dn11': 20971,
 'conditionality': 20972,
 "stock's": 13807,
 'masland': 20973,
 'causes': 7606,
 'riots': 10091,
 'norf': 20974,
 'nord': 9406,
 'midwest': 3893,
 'tamils': 13886,
 'ofthe': 16402,
 "colombia's": 3421,
 '24th': 11030,
 'sant': 20975,
 'moines': 10092,
 'electrotechnical': 22577,
 'proceeded': 24534,
 'sanz': 20976,
 'insufficiently': 13887,
 'sang': 20977,
 'sand': 5950,
 'bracho': 16404,
 'small': 805,
 'workloads': 20978,
 'sank': 6702,
 'kemper': 20979,
 'abbreviated': 16405,
 'quicker': 13888,
 '199': 3802,
 '198': 3243,
 '195': 2661,
 '194': 3080,
 '197': 4310,
 '196': 3894,
 '191': 2850,
 '190': 2199,
 '193': 3481,
 '192': 3350,
 'past': 582,
 'fractionation': 20980,
 'displays': 20981,
 'pass': 3081,
 'investment': 202,
 'quals': 27062,
 'quicken': 16406,
 "centronic's": 20983,
 'menswear': 20984,
 'clock': 16407,
 'teape': 20985,
 'teapa': 20986,
 'prevailed': 10093,
 'hebei': 9407,
 ...}

In [9]:
# Reversing the index from value, key to key, value
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

In [10]:
reverse_word_index


Out[10]:
{10996: 'mdbl',
 16260: 'fawc',
 12089: 'degussa',
 8803: 'woods',
 13796: 'hanging',
 20672: 'localized',
 20673: 'sation',
 20675: 'chanthaburi',
 10997: 'refunding',
 8804: 'hermann',
 20676: 'passsengers',
 20677: 'stipulate',
 8352: 'heublein',
 20713: 'screaming',
 16261: 'tcby',
 185: 'four',
 1642: 'grains',
 20680: 'broiler',
 12090: 'wooden',
 1220: 'wednesday',
 13797: 'highveld',
 7593: 'duffour',
 20681: '0053',
 3914: 'elections',
 2563: '270',
 3551: '271',
 5113: '272',
 3552: '273',
 3400: '274',
 7975: 'rudman',
 3401: '276',
 3478: '277',
 3632: '278',
 4309: '279',
 9381: 'dormancy',
 7247: 'errors',
 3086: 'deferred',
 20683: 'sptnd',
 8805: 'cooking',
 20684: 'stratabit',
 16262: 'designing',
 20685: 'metalurgicos',
 13798: 'databank',
 20686: '300er',
 20687: 'shocks',
 7972: 'nawg',
 20688: 'tnta',
 20689: 'perforations',
 2891: 'affiliates',
 20690: '27p',
 16263: 'ching',
 595: 'china',
 16264: 'wagyu',
 3189: 'affiliated',
 16265: 'chino',
 16266: 'chinh',
 20692: 'slickline',
 13799: 'doldrums',
 12092: 'kids',
 3028: 'climbed',
 6693: 'controversy',
 20693: 'kidd',
 12093: 'spotty',
 12639: 'rebel',
 9382: 'millimetres',
 4007: 'golden',
 5689: 'projection',
 12094: 'stern',
 7903: "hudson's",
 10066: 'dna',
 20695: 'dnc',
 20696: 'hodler',
 2394: 'lme',
 20697: 'insolvancy',
 13800: 'music',
 1984: 'therefore',
 10998: 'dns',
 6959: 'distortions',
 13801: 'thassos',
 20698: 'populations',
 8806: 'meteorologist',
 43: 'loss',
 9383: 'exco',
 20813: 'adventist',
 16267: 'murchison',
 10999: 'locked',
 13802: 'kampala',
 20699: 'arndt',
 1267: 'nakasone',
 20700: 'steinweg',
 3633: "india's",
 3029: 'wang',
 10067: 'wane',
 13803: 'unjust',
 13804: 'titanium',
 850: 'want',
 20701: 'pinto',
 16268: "institutes'",
 7973: 'absolute',
 4677: 'travel',
 6422: 'cutback',
 16269: 'nazmi',
 1858: 'modest',
 16270: 'shopwell',
 20702: 'sedi',
 20703: 'adoped',
 16271: 'tulis',
 20704: '18th',
 20705: "wmc's",
 20706: 'menlo',
 11000: 'reiners',
 12095: 'farmlands',
 20707: 'nonsensical',
 20708: 'elisra',
 2461: 'welcomed',
 20709: 'peup',
 16272: "holiday's",
 20711: 'activating',
 16273: 'avondale',
 16274: 'interational',
 20712: 'welcomes',
 16275: 'fip',
 11001: 'tailings',
 4205: 'fit',
 16276: 'lifeline',
 1916: 'bringing',
 4819: 'fix',
 6164: '624',
 12096: 'naturalite',
 6165: 'wales',
 8807: 'fin',
 11129: 'fio',
 20714: 'ceremenony',
 20715: 'sovr',
 20716: "yeo's",
 1788: 'effects',
 13805: 'sixteen',
 8808: 'undeveloped',
 13806: 'glutted',
 20717: 'barton',
 20718: 'froday',
 10089: 'arrow',
 11002: 'stabilises',
 6960: 'allan',
 20719: '374p',
 3891: '393',
 4008: '392',
 4206: '391',
 3079: '390',
 4550: '397',
 6166: '396',
 6423: '395',
 4207: '394',
 6961: '399',
 4208: '398',
 7595: 'stabilised',
 5114: 'smelters',
 20720: 'oprah',
 20721: 'orginially',
 20722: "tvx's",
 16278: 'ponomarev',
 20723: 'enviroment',
 20724: "reeves'",
 8363: 'mason',
 1670: 'encourage',
 7596: 'adapt',
 12776: 'abbott',
 13808: 'stamping',
 20726: 'colquiri',
 11003: 'ambrit',
 8353: 'strata',
 4821: 'corrects',
 11922: 'sandra',
 859: 'estimate',
 20727: 'universally',
 20728: 'chlorine',
 16279: 'competes',
 10068: 'leiner',
 8809: 'ministries',
 8810: 'disturbed',
 13809: 'competed',
 8811: 'juergen',
 13810: 'kfw',
 11004: 'turben',
 9384: 'reintroduced',
 20729: 'maladies',
 4101: 'chevron',
 16280: 'lazere',
 8812: 'antilles',
 11907: 'dti',
 9070: 'specially',
 4678: 'bilzerian',
 13811: 'bakelite',
 20730: 'renovated',
 568: 'service',
 16281: 'payless',
 20731: 'spiegler',
 831: 'needed',
 16282: 'wigglesworth',
 6962: 'master',
 13812: 'antonson',
 20732: 'genesis',
 13813: 'vismara',
 20734: 'organically',
 20735: "accords'",
 5940: 'task',
 7974: 'positively',
 3479: 'feasibility',
 6963: 'ahmed',
 13814: "suralco's",
 20736: 'awacs',
 16283: 'idly',
 20737: 'regulator',
 12097: 'pseudorabies',
 16284: 'staubli',
 8813: 'nzi',
 5115: 'feeling',
 3127: '275',
 20738: '6819',
 16285: 'gorman',
 8354: 'sustaining',
 9385: 'spectrum',
 20739: 'consenting',
 12098: 'recapitalized',
 11562: 'sailed',
 7597: 'dozen',
 1985: 'affairs',
 2253: 'courier',
 8355: 'kremlin',
 895: 'shipments',
 16286: "aquino's",
 10070: 'committing',
 5293: 'sugarcane',
 9386: 'diminishing',
 16287: 'vexing',
 11005: 'simplify',
 6167: 'mouth',
 7248: 'steinhardt',
 8814: 'conceded',
 9387: 'bradford',
 7976: 'singer',
 20740: '5602',
 13816: "1987's",
 4950: 'tech',
 6424: 'teck',
 20741: 'majv',
 666: 'saying',
 16477: 'dickey',
 20742: 'sweetner',
 21149: 'teresa',
 20743: 'ulcer',
 13817: 'cheaply',
 2361: 'thai',
 6964: 'orleans',
 16290: 'excavator',
 6168: 'rico',
 12099: 'lube',
 13818: 'rick',
 4679: 'rich',
 13819: 'kerna',
 950: 'rice',
 4209: 'rica',
 5503: 'plate',
 16291: 'platt',
 8356: 'altogether',
 8815: 'jaguar',
 20744: 'dynair',
 8816: 'patch',
 2892: 'ldp',
 13820: 'boarded',
 16292: 'precluding',
 11006: 'clarified',
 16293: 'sensitivity',
 1511: 'alternative',
 11007: 'clarifies',
 5116: 'lots',
 7598: 'irs',
 20745: 'irv',
 13821: 'iri',
 13822: 'ira',
 5690: 'timber',
 20746: 'ire',
 5219: 'discipline',
 1937: 'extend',
 3634: 'nature',
 16295: "amb's",
 16296: 'dunhill',
 2142: 'extent',
 20747: 'restrcitions',
 2396: 'heating',
 11008: "mannesmann's",
 20748: 'outsanding',
 20749: 'multimillions',
 13824: 'sarcinelli',
 6694: 'southeastern',
 10071: 'eradicate',
 9388: 'libyan',
 20750: 'foreclosing',
 12101: 'maclaine',
 20751: 'fra',
 353: 'union',
 11009: 'frn',
 386: 'much',
 12102: 'fry',
 20752: 'mothball',
 10072: 'chlorazepate',
 12103: 'dxns',
 19981: 'toyko',
 20753: 'spit',
 16297: '007050',
 16298: 'freehold',
 13825: 'davy',
 11010: 'dave',
 12177: 'spie',
 10117: 'aguayo',
 12104: 'wildcat',
 10069: 'fecs',
 20754: 'kennan',
 16299: 'intal',
 9389: 'contingencies',
 16551: 'professionally',
 16300: 'microbiological',
 20756: 'misconstrued',
 409: 'k',
 20757: 'securitiesd',
 16301: 'deferring',
 5941: 'kohl',
 3030: 'conditioned',
 20758: 'fnhb',
 16302: "october's",
 13954: 'memorial',
 6965: 'democracies',
 27520: 'conformed',
 464: 'split',
 12105: "bond's",
 11112: 'thinly',
 16515: 'dunkirk',
 16303: 'cavanaugh',
 13827: "securities'",
 21345: 'marches',
 16304: 'issam',
 2020: 'workforce',
 12106: 'meinert',
 13828: 'boiler',
 5294: "bp's",
 16305: 'torpedoed',
 20762: 'indidate',
 13829: 'downwardly',
 20763: 'viviez',
 20764: 'vladiminovich',
 16306: 'academic',
 20765: 'architecural',
 1117: 'corporate',
 16307: 'appropriately',
 20766: 'teicc',
 20767: "hanover's",
 8817: 'aristech',
 20768: 'portrayed',
 21383: 'raffineries',
 20770: 'hai',
 7599: 'hal',
 13830: 'ham',
 10073: 'han',
 20771: 'e15b',
 61: 'had',
 20772: 'hay',
 13831: 'botchwey',
 10074: 'haq',
 37: 'has',
 13832: 'hat',
 20773: 'hav',
 20774: 'fortin',
 8818: 'municipal',
 20775: 'osman',
 20776: 'fsical',
 3480: 'elders',
 12107: 'survival',
 16308: 'unequivocally',
 2519: 'objective',
 6695: 'indicative',
 10075: 'shadow',
 21411: 'riskiness',
 20778: 'positiive',
 10076: "american's",
 16309: 'alick',
 16310: 'harima',
 12108: 'alice',
 20779: 'altschul',
 16311: 'festivities',
 20780: 'medecines',
 2942: 'beneficial',
 12109: 'yoweri',
 13833: 'crowd',
 9390: 'crowe',
 3553: 'crown',
 13679: 'topping',
 8819: 'captive',
 12110: 'billboard',
 6169: 'fiduciary',
 3402: 'bottom',
 20782: 'plucked',
 20783: 'locksmithing',
 9391: 'ecopetrol',
 24018: 'pipestone',
 5505: "growers'",
 20785: 'borrows',
 16312: 'eduard',
 13834: 'venpres',
 16313: 'bamboo',
 13835: 'foolish',
 20786: 'uruguyan',
 20787: 'officeholders',
 20788: 'economiques',
 16314: 'aden',
 4822: 'maxwell',
 4680: 'marshall',
 16315: 'honeymoon',
 16316: 'administer',
 20790: 'shoots',
 16317: 'rubbertech',
 16318: 'johsen',
 10077: 'reciprocity',
 13836: 'fabric',
 20791: 'suffice',
 20792: 'spokemsan',
 20793: "sonora's",
 16319: '5865',
 16320: "systems'",
 20794: 'perfumes',
 20795: 'halycon',
 20796: 'nonvoting',
 7250: 'safeguard',
 21538: 'sawdust',
 20797: "else's",
 13837: 'arrays',
 20798: 'aza',
 20799: 'smasher',
 12111: 'complications',
 1813: 'pesos',
 20800: 'relabelling',
 3722: 'passenger',
 12112: "avon's",
 20801: 'megahertz',
 10683: 'mirror',
 8357: 'minas',
 16322: 'bourdain',
 20802: 'crownx',
 6425: 'eventual',
 1207: 'crowns',
 1369: 'role',
 20803: 'obliges',
 16323: 'rolf',
 13838: 'vegetative',
 20804: 'rolm',
 4419: 'roll',
 2463: 'intend',
 16324: 'palms',
 19255: 'denys',
 13839: 'transported',
 20805: 'moresby',
 16325: 'devon',
 1351: 'intent',
 20806: "camco's",
 5942: 'variable',
 20807: 'transporter',
 16326: 'danske',
 13840: 'friedhelm',
 8358: 'hawker',
 17774: "sand's",
 20808: 'preseving',
 12113: '80386',
 16328: 'bnls',
 19984: 'ordination',
 11011: 'overturned',
 16329: 'erred',
 6696: 'cincinnati',
 16710: 'corps',
 20809: 'whoever',
 16330: 'osp',
 13841: 'osr',
 12114: 'ost',
 16331: 'chair',
 5647: '690',
 20810: 'grapples',
 13842: 'megawatts',
 20811: 'photocopiers',
 20812: 'sconninx',
 2274: 'circumstances',
 13843: 'oversight',
 20814: "paradyne's",
 6363: '691',
 20815: 'paychecks',
 13844: "stadelmann's",
 3241: 'choice',
 11012: 'vastagh',
 8820: 'embark',
 9392: 'gloomy',
 9393: 'stays',
 4009: 'exact',
 5117: 'minute',
 11892: 'kittiwake',
 20816: 'picul',
 20817: 'skewed',
 11013: 'cooke',
 10078: 'defaults',
 11014: 'reimpose',
 9394: 'hindered',
 20818: 'lengthened',
 16333: 'chopping',
 13845: 'mckiernan',
 20819: 'collaspe',
 7251: 'corazon',
 7600: 'antwerp',
 13846: 'abdullah',
 13847: 'goldston',
 442: '300',
 20821: 'cassa',
 20822: 'casse',
 4081: '695',
 2979: 'ground',
 839: 'boost',
 16334: 'azusa',
 9395: 'drafted',
 4823: '303',
 13848: 'climbs',
 7601: 'honour',
 20823: 'vanderbilt',
 3968: '305',
 3031: 'address',
 8821: 'dwindling',
 7252: 'benson',
 12115: 'enroll',
 501: 'revenues',
 12116: 'impacted',
 20826: 'queue',
 10079: 'accomplished',
 7602: 'throughput',
 9396: 'influx',
 10080: 'stockbuilding',
 20827: 'aproximates',
 13849: 'petroleo',
 16335: 'sistemas',
 14053: 'feretti',
 5943: 'opposes',
 882: 'working',
 20829: 'perished',
 13850: 'oldham',
 20830: '27000',
 19245: 'optimize',
 20832: 'vigour',
 1580: 'opposed',
 16336: 'liberalizing',
 20833: 'wvz',
 20834: 'dampness',
 13851: 'approving',
 13496: 'sierra',
 20835: 'entrepot',
 224: 'currency',
 1499: 'originally',
 20837: 'tindemans',
 16337: 'valorem',
 477: 'following',
 20838: 'fossen',
 11016: 'locke',
 20839: 'employess',
 12117: 'rotberg',
 16338: 'parachute',
 11017: 'locks',
 12255: 'incremental',
 16339: 'woolowrth',
 20841: 'listens',
 7253: 'litre',
 3554: 'edouard',
 1377: 'ounce',
 20843: 'nicanor',
 20844: 'sucocitrico',
 16340: 'minicomputers',
 16341: "silva's",
 11018: 'restitutions',
 16342: 'custer',
 2590: '3rd',
 10081: 'fueled',
 20845: 'trydahl',
 11019: 'aice',
 12118: 'harmon',
 10082: 'conscious',
 20846: 'herbicidesand',
 20847: 'subdivisions',
 20848: "veslefrikk's",
 11020: 'swollen',
 7978: 'pulled',
 20849: 'tilney',
 203: 'years',
 20850: 'structuring',
 20851: 'episodes',
 16343: 'sportscene',
 16344: "northair's",
 20852: 'jig',
 20853: 'jin',
 3403: 'jim',
 8359: 'troubles',
 13852: 'workforces',
 2362: 'suspension',
 3892: 'troubled',
 16345: 'fondiaria',
 6697: 'modestly',
 12119: 'recipients',
 7979: 'civilian',
 13853: 'indigenous',
 20854: 'overpowering',
 1051: 'drilling',
 16346: 'sorted',
 16347: 'lichtenstein',
 20855: 'bedevil',
 20856: 'dispite',
 16843: 'battleships',
 4824: 'instability',
 95: 'quarter',
 20857: 'salado',
 5692: 'honduras',
 13855: "chevron's",
 12273: "lazere's",
 2660: 'receipt',
 8360: 'sponsor',
 4825: 'entering',
 16349: "kcbt's",
 19987: 'nowicki',
 13856: 'salads',
 16351: 'augar',
 7980: '797',
 7254: '796',
 8361: '795',
 5295: '794',
 5118: '793',
 6170: '792',
 5296: '791',
 4826: '790',
 20858: "nikko's",
 20859: 'unsaleable',
 5720: '799',
 5693: '798',
 2143: 'seriously',
 16352: 'trauma',
 20860: 'tvbh',
 20861: 'macedon',
 21906: 'disintegrated',
 21909: 'adddition',
 2244: 'incentives',
 5944: 'complicated',
 20864: 'reevaluating',
 21921: 'thatching',
 7981: 'brasil',
 20865: '79p',
 4951: 'wrong',
 8822: 'initiate',
 16353: 'aboard',
 7255: 'saving',
 8823: 'spoken',
 16364: 'parkinson',
 65: 'one',
 20867: 'ont',
 7256: 'concert',
 16354: "boston's",
 13859: 'stifled',
 4622: 'types',
 20868: 'lingering',
 16356: 'surges',
 20869: 'hurdman',
 16357: 'herds',
 14114: 'absorbs',
 4681: 'surged',
 14211: 'dalkon',
 13860: 'crossroads',
 20870: 'shakeup',
 20871: 'disasterous',
 11021: 'illness',
 3242: 'turned',
 3801: 'locations',
 12120: 'tyranite',
 13861: 'minesweepers',
 7257: 'turner',
 20872: 'borough',
 12358: 'underlines',
 20873: "bancorporation's",
 20874: 'fashionable',
 20875: "ae's",
 16358: 'dilutions',
 9472: 'goodman',
 10510: 'unlawfully',
 16359: 'mayer',
 16360: 'printer',
 20877: 'offload',
 13862: 'opposite',
 738: 'buffer',
 9398: 'printed',
 16361: 'pequiven',
 13863: 'panoche',
 20878: 'knowingly',
 16362: 'ecusta',
 20879: 'thsl',
 8825: 'phil',
 13864: 'jitters',
 16363: 'touche',
 20881: 'jittery',
 3291: 'friction',
 16365: 'fecal',
 22068: 'resurgance',
 20882: 'heeding',
 2363: 'soviets',
 16366: 'imagined',
 16367: 'transact',
 20883: 'califoirnia',
 9399: "chrysler's",
 16368: 'respecitvely',
 16369: 'presse',
 10084: 'euromarket',
 12121: 'guarded',
 16371: 'satisfacotry',
 20884: 'authroization',
 20885: 'simplistic',
 20886: 'monde',
 4102: 'awaiting',
 13865: 'recombinant',
 20887: 'refinancement',
 20888: 'comserv',
 20889: 'kitakyushu',
 16372: 'pima',
 11022: 'basle',
 20891: '6250',
 16373: 'choudhury',
 8826: 'vision',
 20892: 'interruptible',
 13866: 'weatherford',
 7982: '832',
 5694: '833',
 4420: '830',
 5119: '831',
 5297: '836',
 4553: '837',
 6172: '834',
 4952: '835',
 22144: 'alarming',
 5695: '838',
 6173: '839',
 20893: '524p',
 20894: 'sponsorship',
 12122: 'vendex',
 20895: "amsouth's",
 20896: 'kilometer',
 10086: 'enjoys',
 20897: 'illiberal',
 6174: 'punta',
 20898: 'punte',
 10087: 'girozentrale',
 20899: 'missstatements',
 10088: 'marietta',
 6175: 'awards',
 3635: 'concentrated',
 20900: '83p',
 13867: 'developpement',
 13868: 'rhodes',
 5696: 'matheson',
 20901: '1720',
 20902: 'paring',
 35: 's',
 4953: 'concentrates',
 16374: "can's",
 22183: 'polysaturated',
 20903: 'parini',
 13869: 'baden',
 20904: 'bader',
 12123: 'buoyancy',
 20905: 'erdem',
 16375: 'properites',
 20906: 'comparitive',
 12124: 'practises',
 20907: 'collides',
 189: 'west',
 20908: 'wess',
 13870: 'collided',
 20909: 'practised',
 20910: "amalgamated's",
 20911: 'motives',
 1378: 'wants',
 1273: 'formed',
 20912: 'readings',
 12125: 'geothermal',
 7315: 'tightened',
 11023: "d'or",
 1109: 'former',
 20913: 'venezulean',
 19935: 'curd',
 12126: 'squeezes',
 1019: 'newspaper',
 817: 'situation',
 13871: 'ivey',
 3636: 'engaged',
 13872: 'dubious',
 17061: 'cayacq',
 20916: 'cobol',
 20917: 'limping',
 883: 'technology',
 20919: 'koerner',
 16376: 'debilitating',
 7983: 'verified',
 4010: 'otto',
 20920: '7770',
 16377: 'emulsions',
 16378: "onic's",
 9075: 'slate',
 20921: 'wires',
 5506: 'edged',
 20922: 'assigns',
 1341: 'singapore',
 20923: 'deflate',
 20924: "strategy's",
 16379: 'walesa',
 4554: 'advertisement',
 20925: 'luyten',
 20926: 'shrortly',
 20927: 'corpoartion',
 22290: 'preferance',
 16380: 'tracking',
 13874: 'sunnyvale',
 20928: 'colorants',
 16381: 'persistently',
 16382: "officers'",
 20929: "his's",
 367: 'being',
 7259: 'divestitures',
 20930: 'steamer',
 20931: 'rover',
 8362: 'grounded',
 16383: "businessmen's",
 16384: 'cyanidation',
 20932: 'overthrow',
 20933: 'partnerhip',
 16385: 'sumt',
 8827: 'sums',
 16386: 'oelmuehle',
 16387: 'unveil',
 13875: 'gestures',
 20934: 'penta',
 2544: 'traffic',
 2428: 'preference',
 20935: 'sumi',
 166: 'world',
 9400: 'postal',
 16388: 'bced',
 12128: 'dornbush',
 14215: 'confine',
 20936: '2555',
 5945: "zambia's",
 20937: 'superiority',
 20938: 'militate',
 2395: 'satisfactory',
 20939: 'superintendent',
 5946: 'tvx',
 16389: 'tvt',
 6698: 'magma',
 20940: 'diving',
 15548: 'tvb',
 13876: 'seaman',
 11025: 'matsunaga',
 4827: '919',
 5298: '918',
 17070: 'refundable',
 5947: '914',
 7260: '917',
 6699: '916',
 5507: '911',
 4828: '910',
 10213: 'restoring',
 4555: '912',
 20942: 'squabble',
 7261: 'retains',
 20943: "partner's",
 5300: 'leadership',
 11026: 'graaf',
 20944: 'spacelab',
 1800: 'thailand',
 9402: 'graan',
 20945: 'exasperating',
 12129: 'hartmarx',
 16390: 'frights',
 20946: 'niall',
 11027: 'johnston',
 16391: '91p',
 16392: 'sensitively',
 6016: 'porsche',
 15494: 'prepares',
 12130: 'lively',
 10686: 'stoppages',
 16394: "associated's",
 12131: 'pivot',
 1037: 'series',
 24050: 'sese',
 7604: 'bubble',
 16395: 'trusses',
 20949: 'interestate',
 20950: 'continents',
 20951: 'societal',
 28: 'with',
 6176: 'pull',
 6700: 'rush',
 6222: 'monopoly',
 20953: 'operationally',
 20954: 'dirty',
 10090: 'abuses',
 7262: 'prudhoe',
 5949: 'pulp',
 16396: 'rust',
 20955: 'hellman',
 20956: 'amdec',
 16397: 'australasian',
 13878: 'watches',
 20957: 'hypertension',
 20958: "hemdale's",
 16398: 'formulation',
 7605: 'watched',
 20959: 'jargon',
 13879: 'cream',
 9404: 'ideally',
 11028: 'ryavec',
 20960: 'microoganisms',
 13880: 'indemnify',
 20961: 'wincenty',
 20962: 'waving',
 20963: "multifood's",
 20964: 'midges',
 11029: 'natalie',
 13881: 'crosbie',
 20965: 'posible',
 13882: 'omnibus',
 20966: 'assetsof',
 13883: 'tricks',
 16399: 'rs',
 20967: 'kilogram',
 25363: 'pruning',
 13884: 'dyer',
 20968: 'dyes',
 20969: 'legislatures',
 16400: 'scm',
 9405: 'sci',
 20970: 'riedel',
 16401: 'ceramic',
 6701: 'unitholders',
 13885: 'scb',
 20971: 'dn11',
 20972: 'conditionality',
 13807: "stock's",
 20973: 'masland',
 7606: 'causes',
 10091: 'riots',
 20974: 'norf',
 9406: 'nord',
 3893: 'midwest',
 13886: 'tamils',
 16402: 'ofthe',
 3421: "colombia's",
 11030: '24th',
 20975: 'sant',
 10092: 'moines',
 22577: 'electrotechnical',
 24534: 'proceeded',
 20976: 'sanz',
 13887: 'insufficiently',
 20977: 'sang',
 5950: 'sand',
 16404: 'bracho',
 805: 'small',
 20978: 'workloads',
 6702: 'sank',
 20979: 'kemper',
 16405: 'abbreviated',
 13888: 'quicker',
 3802: '199',
 3243: '198',
 2661: '195',
 3080: '194',
 4310: '197',
 3894: '196',
 2850: '191',
 2199: '190',
 3481: '193',
 3350: '192',
 582: 'past',
 20980: 'fractionation',
 20981: 'displays',
 3081: 'pass',
 202: 'investment',
 27062: 'quals',
 16406: 'quicken',
 20983: "centronic's",
 20984: 'menswear',
 16407: 'clock',
 20985: 'teape',
 20986: 'teapa',
 10093: 'prevailed',
 9407: 'hebei',
 ...}

In [11]:
# Decoding
decoded_newswire = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])

In [12]:
decoded_newswire


Out[12]:
'? ? ? said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3'

Encoding the data


In [13]:
import numpy as np

In [14]:
def vectorize_sequences(sequences, dimension = num_words):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results

In [15]:
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

In [16]:
x_train.shape


Out[16]:
(8982, 10000)

One-hot encoding the labels


In [17]:
# You can use keras' to_categorical function or following code:
def to_one_hot(labels, dimension = 46):
    results = np.zeros((len(labels), dimension))
    for i, label in enumerate(labels):
        results[i, label] = 1.
    return results

In [18]:
one_hot_train_labels = to_one_hot(train_labels)
one_hot_test_labels = to_one_hot(test_labels)

In [19]:
one_hot_train_labels.shape


Out[19]:
(8982, 46)

Building the network


In [20]:
from keras.models import Sequential
from keras.layers import Dense

In [21]:
model = Sequential()

In [22]:
model.add(Dense(units = 64, 
                activation = 'relu', 
                input_shape = (10000,)))
model.add(Dense(units = 64, 
                activation = 'relu'))
# Multiclass clasification => Softmax
model.add(Dense(units = 46, 
                activation = 'softmax'))

In [23]:
# Compiling the model
model.compile(optimizer = 'rmsprop', 
              loss = 'categorical_crossentropy',
              metrics = ['accuracy'])

Creating a validation set


In [24]:
x_val = x_train[:1000]
partial_x_train = x_train[1000:]

y_val = one_hot_train_labels[:1000]
partial_y_train = one_hot_train_labels[1000:]

In [25]:
history = model.fit(x = partial_x_train,
                    y = partial_y_train,
                    epochs = 20,
                    batch_size = 512,
                    validation_data = (x_val, y_val))


Train on 7982 samples, validate on 1000 samples
Epoch 1/20
7982/7982 [==============================] - 2s - loss: 2.5306 - acc: 0.4962 - val_loss: 1.7180 - val_acc: 0.6120
Epoch 2/20
7982/7982 [==============================] - 0s - loss: 1.4430 - acc: 0.6878 - val_loss: 1.3435 - val_acc: 0.7060
Epoch 3/20
7982/7982 [==============================] - 0s - loss: 1.0929 - acc: 0.7661 - val_loss: 1.1704 - val_acc: 0.7430
Epoch 4/20
7982/7982 [==============================] - 0s - loss: 0.8682 - acc: 0.8166 - val_loss: 1.0788 - val_acc: 0.7600
Epoch 5/20
7982/7982 [==============================] - 1s - loss: 0.7020 - acc: 0.8483 - val_loss: 0.9844 - val_acc: 0.7830
Epoch 6/20
7982/7982 [==============================] - 0s - loss: 0.5666 - acc: 0.8796 - val_loss: 0.9401 - val_acc: 0.8030
Epoch 7/20
7982/7982 [==============================] - 0s - loss: 0.4592 - acc: 0.9039 - val_loss: 0.9090 - val_acc: 0.8010
Epoch 8/20
7982/7982 [==============================] - 0s - loss: 0.3704 - acc: 0.9226 - val_loss: 0.9359 - val_acc: 0.7890
Epoch 9/20
7982/7982 [==============================] - 0s - loss: 0.3036 - acc: 0.9308 - val_loss: 0.8912 - val_acc: 0.8070
Epoch 10/20
7982/7982 [==============================] - 0s - loss: 0.2539 - acc: 0.9412 - val_loss: 0.9059 - val_acc: 0.8110
Epoch 11/20
7982/7982 [==============================] - 0s - loss: 0.2185 - acc: 0.9471 - val_loss: 0.9152 - val_acc: 0.8120
Epoch 12/20
7982/7982 [==============================] - 0s - loss: 0.1872 - acc: 0.9511 - val_loss: 0.9045 - val_acc: 0.8150
Epoch 13/20
7982/7982 [==============================] - 0s - loss: 0.1696 - acc: 0.9523 - val_loss: 0.9338 - val_acc: 0.8090
Epoch 14/20
7982/7982 [==============================] - 0s - loss: 0.1531 - acc: 0.9554 - val_loss: 0.9644 - val_acc: 0.8090
Epoch 15/20
7982/7982 [==============================] - 0s - loss: 0.1387 - acc: 0.9555 - val_loss: 0.9697 - val_acc: 0.8120
Epoch 16/20
7982/7982 [==============================] - 0s - loss: 0.1310 - acc: 0.9562 - val_loss: 1.0280 - val_acc: 0.8040
Epoch 17/20
7982/7982 [==============================] - 0s - loss: 0.1214 - acc: 0.9577 - val_loss: 1.0307 - val_acc: 0.7950
Epoch 18/20
7982/7982 [==============================] - 0s - loss: 0.1193 - acc: 0.9582 - val_loss: 1.0454 - val_acc: 0.8080
Epoch 19/20
7982/7982 [==============================] - 0s - loss: 0.1136 - acc: 0.9595 - val_loss: 1.1013 - val_acc: 0.7950
Epoch 20/20
7982/7982 [==============================] - 0s - loss: 0.1104 - acc: 0.9595 - val_loss: 1.0710 - val_acc: 0.8020

Plotting the training and validation loss


In [26]:
import matplotlib.pyplot as plt
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)

plt.plot(epochs, 
         loss, 
         'bo', 
         label = 'Training loss')
plt.plot(epochs, 
         val_loss, 
         'b', 
         label = 'Validation loss')

plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()


Plotting the training and validation accuracy


In [27]:
# Clearing the figure
plt.clf()
acc = history.history['acc']
val_acc = history.history['val_acc']

plt.plot(epochs, 
         acc, 
         'bo', 
         label = 'Training acc')
plt.plot(epochs, 
         val_acc, 
         'b', 
         label = 'Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()


The model overfits after the 9th epoch.

The network will be re-trained from scratch.


In [28]:
model = Sequential()
model.add(Dense(units = 64, 
                       activation = 'relu', 
                       input_shape = (10000,)))
model.add(Dense(units = 64, 
                       activation = 'relu'))
model.add(Dense(units = 46, 
                       activation = 'softmax'))

model.compile(optimizer = 'rmsprop',
              loss='categorical_crossentropy', 
              metrics=['accuracy'])
# Training process
model.fit(x = partial_x_train,
          y = partial_y_train,
          epochs = 9,
          batch_size = 512,
          validation_data = (x_val, y_val))
results = model.evaluate(x_test, one_hot_test_labels)


Train on 7982 samples, validate on 1000 samples
Epoch 1/9
7982/7982 [==============================] - 1s - loss: 2.5401 - acc: 0.5226 - val_loss: 1.6792 - val_acc: 0.6540
Epoch 2/9
7982/7982 [==============================] - 0s - loss: 1.3785 - acc: 0.7096 - val_loss: 1.2825 - val_acc: 0.7210
Epoch 3/9
7982/7982 [==============================] - 0s - loss: 1.0207 - acc: 0.7781 - val_loss: 1.1321 - val_acc: 0.7550
Epoch 4/9
7982/7982 [==============================] - 0s - loss: 0.8003 - acc: 0.8257 - val_loss: 1.0532 - val_acc: 0.7580
Epoch 5/9
7982/7982 [==============================] - 0s - loss: 0.6392 - acc: 0.8629 - val_loss: 0.9753 - val_acc: 0.7950
Epoch 6/9
7982/7982 [==============================] - 0s - loss: 0.5112 - acc: 0.8930 - val_loss: 0.9097 - val_acc: 0.8130
Epoch 7/9
7982/7982 [==============================] - 0s - loss: 0.4115 - acc: 0.9141 - val_loss: 0.8914 - val_acc: 0.8240
Epoch 8/9
7982/7982 [==============================] - 0s - loss: 0.3357 - acc: 0.9283 - val_loss: 0.8726 - val_acc: 0.8280
Epoch 9/9
7982/7982 [==============================] - 1s - loss: 0.2787 - acc: 0.9371 - val_loss: 0.9343 - val_acc: 0.8010
2112/2246 [===========================>..] - ETA: 0s

In [29]:
results


Out[29]:
[1.0224752447377967, 0.77738201251968353]

Comparison to a random classifier


In [30]:
import copy
test_labels_copy = copy.copy(test_labels)
np.random.shuffle(test_labels_copy)
hits_array = np.array(test_labels) == np.array(test_labels_copy)
print(float(np.sum(hits_array)) / len(test_labels) * 100, '%')


18.2546749777382 %

Predicting the label


In [31]:
predictions = model.predict(x_test)

In [32]:
predictions.shape


Out[32]:
(2246, 46)

In [33]:
# Picking max probability for the first example (with index 0)
np.argmax(predictions[0])


Out[33]:
3

In [34]:
# Probability of this class to be correct (according to the neural network) in %
np.max(predictions[0]) * 100


Out[34]:
97.245413064956665