In [1]:
from __future__ import division
import numpy as np
import pandas as pd
import codecs
import json
import ast
import json

In [ ]:


In [7]:
# Load the token-level treebank table (tab-separated, UTF-8).
# Per the original author's note: only test sentences, no demo sentences.
tokenfile = pd.read_csv('../LearningTreebanks_gr.csv', encoding='utf-8', sep='\t')

# reset_index() inserts the previous index as a new first column, shifting every
# original column one position to the right. The positions below rely on that.
tokenfile = tokenfile.reset_index()

# Drop the last four characters of every stimulus name (the '.png' suffix).
# NOTE(review): this is unconditional -- assumes every name ends in '.png'.
tokenfile.Stimuliname = tokenfile.Stimuliname.str[:-4]

# Column positions (counted after reset_index) used to build the mapping.
# NOTE(review): position 1 presumably is the sentence id (Stimuliname) and
# position 3 the word form -- confirm against the CSV header.
ID_COL_POS = 1
TOKEN_COL_POS = 3

# Map each id to the list of its tokens, in file order.
# itertuples(index=False, name=None) yields plain tuples of the column values,
# so indexing is explicitly positional. The previous `row[1][1]` on the Series
# produced by iterrows() relied on integer keys falling back to positions on a
# string-labelled index, which newer pandas versions deprecate.
token_dict = dict()
for record in tokenfile.itertuples(index=False, name=None):
    token_dict.setdefault(record[ID_COL_POS], []).append(record[TOKEN_COL_POS])

# Display the mapping: ids as keys, lists of the sentence's words as values.
token_dict


Out[7]:
{'HDL0003': ['Jewelry', 'Makers', 'Copy', 'Cosmetics', 'Sales', 'Ploys'],
 'HDL0005': ['Braumeisters',
  'Ltd.',
  'tests',
  'a',
  'beer',
  'brewed',
  'with',
  'oat',
  'bran',
  ',',
  'rather',
  'than',
  'rice',
  'or',
  'corn',
  '.'],
 'HDL0016': ['HUGO', 'FELLED', 'vast', 'timberlands', '.'],
 'HDL0017': ['HE',
  'RODE',
  'HIS',
  'HOBBY',
  ',',
  'but',
  'he',
  'could',
  "n't",
  'milk',
  'it',
  ',',
  'the',
  'Tax',
  'Court',
  'says',
  '.'],
 'HDL0020': ['LUTHER',
  'BURBANK',
  'CROSS',
  '-',
  'BRED',
  'PLANTS',
  'to',
  'produce',
  'the',
  'billion',
  '-',
  'dollar',
  'Idaho',
  'potato',
  '.'],
 'HDL0039': ['BROKERAGE',
  'HIRING',
  'languishes',
  'amid',
  'market',
  'turmoil',
  '.'],
 'HDL0042': ['THE',
  'IRS',
  'may',
  'taketh',
  'what',
  'the',
  'Labor',
  'Department',
  'giveth',
  '.'],
 'HDL0043': ['CORPORATE', 'DOWNSIZING', 'digs', 'deeper', '.'],
 'HDL0049': ['Hospital', 'Regulation', 'Sparks', 'Kentucky', 'Feud'],
 'HDL0051': ['Related', 'Roommates', 'Trim', 'Hospital', 'Bills'],
 'HDL0055': ['In', 'other', 'commodity', 'markets', 'yesterday', ':'],
 'HDL0067': ['WHITMAN',
  '&',
  'RANSOM',
  'recruits',
  'lawyers',
  'from',
  'disbanding',
  'firm',
  ':'],
 'HDL0068': ['SHORT', 'SKIRTS', 'not', 'welcome', 'in', 'Texas', 'court', ':'],
 'HDL0072': ['Rochester', 'Community', 'Savings', 'Bank', '--'],
 'HDL0073': ['South',
  'Australian',
  'Government',
  'Finance',
  'Authority',
  '-LRB-',
  'agency',
  '-RRB-',
  '--'],
 'HDL0075': ['EAST',
  'GERMANS',
  'RALLIED',
  'in',
  'three',
  'cities',
  'to',
  'demand',
  'democratic',
  'freedoms',
  '.'],
 'HDL0079': ['APARTHEID',
  'FOES',
  'STAGED',
  'a',
  'massive',
  'anti-government',
  'rally',
  'in',
  'South',
  'Africa',
  '.'],
 'HDL0080': ['CONGRESSIONAL',
  'LEADERS',
  'BACKED',
  'Bush',
  "'s",
  'criticism',
  'of',
  'Nicaragua',
  "'s",
  'Ortega',
  '.'],
 'HDL0081': ['Wives', 'May', 'Not', 'Benefit', 'When', 'Men', 'Do', 'Chores'],
 'HDL0083': ['Reagan', 'Era', 'Young', 'Hold', 'Liberal', 'Views'],
 'HDL0093': ['Closed', 'End', 'Bond', 'Funds'],
 'HDL0095': ['Specialized', 'Equity', 'and', 'Convertible', 'Funds'],
 'HDL0100': ['California', 'Thefts', 'Make', 'Travel', 'Agents', 'Jittery'],
 'HDL0101': ['Texans', 'Get', 'Reasonable', 'Car', 'Rental', 'Insurance'],
 'HDL0102': ['Flight', 'Attendants', 'Lag', 'Before', 'Jets', 'Even', 'Land'],
 'HDL0104': ['CRESTMONT',
  'FEDERAL',
  'SAVINGS',
  '&',
  'LOAN',
  'ASSOCIATION',
  '-LRB-',
  'Edison',
  ',',
  'N.J.',
  '-RRB-',
  '--'],
 'HDL0119': ['Odd', 'Cars', ',', 'Funny', 'Names'],
 'HDL0125': ['Office', 'Market', 'Weakens', 'In', 'Overbuilt', 'Northeast'],
 'HDL0127': ['Housing',
  'Developers',
  'Try',
  'Brand',
  '-',
  'Name',
  'Buildings'],
 'HDL0129': ['HEALTH',
  'CLUBS',
  'gear',
  'up',
  'for',
  'a',
  'graying',
  'clientele',
  '.'],
 'HDL0130': ['``',
  'HOT',
  "''",
  'TOPAZ',
  'sparks',
  'regulator',
  ',',
  'jeweler',
  'concern',
  'over',
  'import',
  'of',
  'irradiated',
  'stones',
  '.'],
 'HDL0131': ['CAPITAL',
  'TRAVELS',
  'to',
  'Europe',
  'as',
  '1992',
  'unification',
  'nears',
  '.'],
 'HDL0133': ['SUSPECT',
  '``',
  'SALES',
  "''",
  'ads',
  'are',
  'challenged',
  'by',
  'the',
  'Better',
  'Business',
  'Bureau',
  'of',
  'Metropolitan',
  'New',
  'York',
  '.'],
 'HDL0145': ['MURDER',
  'THREAT',
  'charged',
  'in',
  'Haas',
  'Securities',
  'Corp',
  '.',
  'stock',
  '-',
  'manipulation',
  'trial',
  '.'],
 'HDL0146': ['TRUSTEE',
  'WHO',
  'MONITORED',
  'settlement',
  'payments',
  'to',
  'Dalkon',
  'Shield',
  'claimants',
  'quits',
  '.'],
 'HDL0147': ['CHICAGO',
  'LAW',
  'FIRM',
  'recruits',
  'American',
  'Express',
  'Co.',
  'vice',
  'president',
  ':'],
 'HDL0154': ['From',
  'the',
  'Sept.',
  '30',
  '-',
  'Oct',
  '.',
  '4',
  'issue',
  'of',
  'The',
  'Economist',
  ':'],
 'HDL0159': ['PENSION', 'AND', 'PROFIT', '-', 'SHARING', 'RULES', ':'],
 'HDL0162': ['Yeast',
  'Adapted',
  'to',
  'Make',
  'Gene',
  '-',
  'Spliced',
  'Drugs'],
 'HDL0166': ['PAY',
  'FOR',
  'PERFORMANCE',
  'hangs',
  'mostly',
  'on',
  'boss',
  "'s",
  'subjective',
  'view',
  '.'],
 'HDL0167': ['JAPANESE',
  'COMPANIES',
  'fare',
  'best',
  'in',
  'U.S.',
  'when',
  'they',
  'give',
  'Americans',
  'more',
  'say',
  '.'],
 'HDL0178': ['DESPITE',
  'VICTORIES',
  'this',
  'year',
  ',',
  'small',
  'business',
  'fears',
  'losing',
  'parental',
  '-',
  'leave',
  'war',
  '.'],
 'HDL0179': ['IN',
  'LOS',
  'ANGELES',
  ',',
  'more',
  'small',
  'businesses',
  'ponder',
  'adopting',
  'a',
  'child',
  '-',
  'care',
  'policy',
  '.'],
 'HDL0180': ['NOVEMBER',
  'BALLOTS',
  'will',
  'contain',
  'few',
  'referendum',
  'or',
  'initiative',
  'issues',
  'that',
  'especially',
  'affect',
  'small',
  'business',
  '.'],
 'HDL0182': ['CALIFORNIA',
  ',',
  'A',
  'TREND',
  '-',
  'SETTER',
  'in',
  'franchising',
  'rules',
  ',',
  'stirs',
  'a',
  'controversy',
  '.'],
 'HDL0198': ['DALKON',
  'SHIELD',
  'CLAIMANTS',
  'hope',
  'to',
  'stop',
  'reorganization',
  '-',
  'plan',
  'appeal',
  '.'],
 'HDL0200': ['TED',
  "BUNDY'S",
  'LAWYERS',
  'switch',
  'to',
  'victims',
  "'",
  'side',
  'in',
  'death',
  '-',
  'sentence',
  'case',
  '.'],
 'HDL0201': ['THE', 'CASE', 'OF', 'THE', 'FAKE', 'DALIS', ':'],
 'HDL0202': ['BRISTOL',
  '-',
  'MYERS',
  'SQUIBB',
  'Co',
  '.',
  '-LRB-',
  'New',
  'York',
  '-RRB-',
  '--'],
 'HDL0206': ['Showing', 'Up', 'in', 'Court', 'Without', 'Being', 'There'],
 'MAI0032': ['call', 'them', 'at', '303-832-8160', '.'],
 'MAI0058': ['do',
  'you',
  'think',
  'they',
  'are',
  'cool',
  'b/c',
  'of',
  'the',
  'taco',
  'bell',
  'dog',
  '?'],
 'MAI0067': ['I',
  'was',
  'thinking',
  'Kenneally',
  "'s",
  'at',
  'around',
  '5',
  '.'],
 'MAI0130': ['Otherwise',
  ',',
  'I',
  'will',
  'be',
  'sending',
  'it',
  'to',
  'Peoples',
  'as',
  'our',
  'final',
  'revision',
  'by',
  'mid',
  'morning',
  '.'],
 'MAI0145': ['Attached',
  'please',
  'find',
  'the',
  'latest',
  'enovate',
  'risk',
  'policy',
  '.'],
 'MAI0153': ['She',
  'is',
  'very',
  'conscientious',
  'about',
  'what',
  'she',
  'signs',
  ',',
  'and',
  'who',
  'initials',
  'what',
  '.'],
 'MAI0166': ['I',
  'have',
  'a',
  'fax',
  'machine',
  'at',
  'home',
  ',',
  'though',
  ',',
  'if',
  'you',
  'prefer',
  '.'],
 'MAI0171': ['If',
  'not',
  ',',
  'is',
  'there',
  'someone',
  'else',
  'that',
  'will',
  '?'],
 'MAI0188': ['Thanks',
  'for',
  'your',
  'prompt',
  'attention',
  'to',
  'this',
  '.'],
 'MAI0252': ['the', 'time', ':', '10:00', 'AM', '-', '11:00', 'AM', 'CST'],
 'MAI0261': ['Is',
  'this',
  'related',
  'to',
  'the',
  'problems',
  'he',
  'is',
  'having',
  'getting',
  'around',
  '-LRB-',
  'circulatory',
  '?',
  '-RRB-',
  '?'],
 'MAI0270': ['I',
  'will',
  'manage',
  'client',
  'expectations',
  'accordingly',
  '.'],
 'MAI0303': ['the', 'place', ':', 'EB', '3143C'],
 'MAI0304': ['the',
  'subject',
  ':',
  'Turbine',
  '1',
  'and',
  'Turbine',
  '2',
  'Purchase',
  'Agreement'],
 'MAI0329': ['I',
  'can',
  'recommend',
  'some',
  'good',
  'restaurants',
  'since',
  'I',
  'took',
  'Ric',
  'there',
  'last',
  'year',
  'for',
  'his',
  'birthday',
  '.'],
 'MAI0330': ['We',
  'stayed',
  'at',
  'the',
  'Menger',
  'and',
  'had',
  'a',
  'great',
  'time',
  '.'],
 'MAI0333': ['This',
  'Friday',
  '-',
  'Michael',
  'goes',
  'for',
  'a',
  'visit',
  'at',
  'St.',
  'Francis',
  ',',
  'which',
  'may',
  'be',
  'his',
  'new',
  'school',
  '-LRB-',
  'so',
  'far',
  ',',
  'so',
  'good',
  '-RRB-',
  '.'],
 'MAI0340': ['It',
  'is',
  'the',
  'officer',
  "'s",
  'meeting',
  'for',
  'Enterprise',
  ',',
  'and',
  'spouses',
  'are',
  'invited',
  '.'],
 'MAI0344': ['I',
  "'m",
  'watching',
  'for',
  'some',
  'good',
  'vacation',
  'days',
  ',',
  'also',
  '...'],
 'MAI0350': ['I',
  'have',
  'a',
  'concern',
  'that',
  'the',
  'Enron',
  'optionality',
  'bug',
  'could',
  'bite',
  'us',
  'on',
  'the',
  'backside',
  'with',
  'that',
  'one',
  '.'],
 'MAI0404': ['Also',
  ',',
  'we',
  'have',
  'attached',
  'a',
  'pdf',
  'black',
  '-',
  'line',
  'of',
  'the',
  'Guarantee',
  'vs',
  'the',
  'form',
  'of',
  'guarantee',
  'in',
  'the',
  'Turbine',
  'Contract',
  '.'],
 'MAI0406': ['Do',
  'not',
  'hesitate',
  'to',
  'call',
  'us',
  'with',
  'any',
  'questions',
  '.'],
 'MAI0419': ['They',
  'are',
  'beautiful',
  'and',
  'will',
  'add',
  'a',
  'lot',
  'to',
  'our',
  'collection',
  '.'],
 'MAI0427': ['They',
  'are',
  'kind',
  'of',
  'in',
  'rank',
  'order',
  'but',
  'as',
  'I',
  'stated',
  'if',
  'I',
  'find',
  'the',
  'piece',
  'that',
  'I',
  'like',
  'we',
  'will',
  'purchase',
  'it',
  '.'],
 'MAI0437': ['Other',
  'impressionist',
  'or',
  'post',
  'impressionist',
  'lithos'],
 'MAI0453': ['Huskers', 'drool', 'over', 'Sooners', '.'],
 'MAI0491': ['I',
  'just',
  'got',
  'your',
  'email',
  'and',
  'I',
  'certainly',
  'concur',
  'with',
  'Jeff',
  'making',
  'the',
  'call',
  '.'],
 'MAI0492': ['He',
  'has',
  'maintained',
  'a',
  'good',
  'relationship',
  'with',
  'Mulva',
  '.'],
 'MAI0498': ['I',
  'do',
  "n't",
  'know',
  'if',
  'there',
  'is',
  'anything',
  'I',
  'can',
  'do',
  'but',
  'I',
  "'m",
  'always',
  'willing',
  'to',
  'help',
  '.'],
 'MAI0506': ['Thank', 'you', 'for', 'you', 'patience', '.'],
 'MAI0511': ['The',
  'Dow',
  'then',
  'sank',
  'to',
  '631',
  'in',
  'December',
  'of',
  "'70",
  '.'],
 'MAI0555': ['If',
  'you',
  'have',
  'any',
  'other',
  'questions',
  ',',
  'please',
  'let',
  'me',
  'know',
  '.'],
 'MAI0567': ['Adobe',
  'Acrobat',
  'Reader',
  '4.0',
  'may',
  'be',
  'downloaded',
  'for',
  'FREE',
  'from',
  'www.adobe.com',
  '.'],
 'MAI0598': ['What', "'s", 'going', 'on', 'dude', '?'],
 'MAI0606': ['Let', 's', 'get', 'together', 'soon', '.'],
 'MAI0612': ['How', 'is', 'it', 'going', '?'],
 'MAI0630': ['-LRB-',
  'See',
  'attached',
  'file',
  ':',
  'Constellation',
  'Power',
  '-LRB-',
  'GISB',
  'draft',
  '-RRB-',
  '.doc',
  '-RRB-',
  '-LRB-',
  'See',
  'attached',
  'file',
  ':',
  'Sam3102.doc',
  '-RRB-'],
 'MAI0634': ['I',
  'have',
  'sent',
  'your',
  'question',
  're',
  'on',
  'line',
  'trading',
  'to',
  'that',
  'area',
  '.'],
 'MAI0639': ['Jackie',
  'Taylor',
  '-',
  'she',
  'is',
  'located',
  'at',
  'Court',
  'House',
  'Concessionaire',
  'and',
  'under',
  'her',
  'name',
  'in',
  'the',
  'directory',
  '.'],
 'MAI0652': ['Attached', 'is', 'an', 'image', 'of', 'the', 'GISB', '.'],
 'MAI0698': ['Please',
  'forward',
  'a',
  'copy',
  'of',
  'the',
  'J.M.',
  'Huber',
  'Corporation',
  'Guaranty',
  'to',
  'my',
  'attention',
  '.'],
 'MAI0710': ['Would', 'love', 'for', 'you', 'to', 'join', 'us', '.'],
 'MAI0723': ['I', 'better', 'pass', 'on', 'the', 'Comets', 'game', '.'],
 'MAI0724': ['My',
  'weekends',
  'seem',
  'to',
  'be',
  'taken',
  'up',
  'with',
  'condo',
  'matters',
  ',',
  'house',
  'hunting',
  '.'],
 'MAI0732': ['The',
  'game',
  'is',
  'at',
  '12',
  ':',
  'Sat',
  '@',
  'Compaq',
  'Center',
  '.'],
 'MAI0742': ['I', 'better', 'pass', 'on', 'the', 'Comets', 'gam', '.'],
 'MAI0747': ['Let', 'me', 'know', 'if', 'you', 'are', 'interested', '.'],
 'MAI0758': ["Monday's",
  'starting',
  'next',
  'week',
  'at',
  '4',
  '???????????????'],
 'MAI0759': ['Wednesday', 'does', "n't", 'work', 'for', 'me', '.'],
 'MAI0771': ['He',
  'is',
  'concerned',
  'about',
  'the',
  'allocation',
  'amongst',
  'categories',
  '-',
  'in',
  'particular',
  ',',
  'Real',
  'Time',
  'Traders',
  '.'],
 'TWI0000': ['Having',
  'read',
  'many',
  'D5000',
  'previews',
  'I',
  "'m",
  'worried',
  '.'],
 'TWI0003': ['I',
  'have',
  'no',
  '``',
  'but',
  "''",
  'to',
  'add',
  'to',
  'qualify',
  'that',
  'sentence',
  'with',
  '.'],
 'TWI0004': ['Really',
  'lovely',
  'evening',
  'spent',
  'with',
  'people',
  'I',
  'used',
  'to',
  'work',
  'with',
  '.'],
 'TWI0005': ['Take',
  'note',
  'This',
  'Life',
  'and',
  'Red',
  'Dwarf',
  ',',
  'reunions',
  'do',
  "n't",
  'have',
  'to',
  'be',
  'shit',
  '.'],
 'TWI0007': ['I',
  'just',
  'think',
  'he',
  'looks',
  'like',
  'a',
  'big',
  'baby',
  ',',
  'and',
  'ppl',
  'USED',
  'to',
  'call',
  'him',
  'that',
  '.'],
 'TWI0010': ['The',
  'Canon',
  'is',
  'definitely',
  'the',
  'better',
  'of',
  'the',
  'two',
  '.'],
 'TWI0012': ['Been',
  'using',
  'Safari',
  '4',
  '(',
  'OSX',
  ')',
  'today',
  'and',
  'it',
  'has',
  "n't",
  'been',
  'going',
  'well',
  '.'],
 'TWI0015': ['Still', 'laughing', 'at', 'man', 'utd', '.'],
 'TWI0016': ['new',
  'Red',
  'Dwarf',
  'character',
  "'s",
  'accent',
  'is',
  'already',
  'annoying',
  'me'],
 'TWI0017': ['First', 'half', 'of', 'new', 'Red', 'Dwarf', ':', 'Poor', '.'],
 'TWI0018': ['Does',
  'anyone',
  'else',
  'think',
  'Lloyds',
  'TSB',
  'went',
  'under',
  'because',
  'of',
  'the',
  'horrible',
  'music',
  'on',
  'their',
  'TV',
  'adverts',
  '?'],
 'TWI0019': ['NYC',
  'promoting',
  'LGBT',
  'summer',
  'tourism',
  ',',
  'hopefully',
  'prompting',
  'Rush',
  'Limbaugh',
  'to',
  'keep',
  'his',
  'promise',
  'to',
  'leave',
  'the',
  'City'],
 'TWI0020': ['Breaking',
  'news',
  ':',
  'admin',
  'official',
  'says',
  'Chrysler',
  'will',
  'file',
  'for',
  'Chapter',
  '11',
  'bankruptcy',
  '.'],
 'TWI0023': ['ok',
  ',',
  'let',
  "'s",
  'be',
  'honest',
  ',',
  'is',
  'the',
  'iphone',
  'really',
  'that',
  'great',
  '?'],
 'TWI0025': ['Obama',
  'Condemns',
  'North',
  'Korea',
  'Launch',
  ',',
  'Calls',
  'for',
  'Nuclear',
  'Free',
  'World',
  '-',
  'voice',
  'of',
  'America',
  ':'],
 'TWI0027': ['Oh',
  'geez',
  ',',
  'another',
  'kiddie',
  'eddie',
  'murphy',
  'movie',
  '.'],
 'TWI0032': ['No', 'broadband', 'and', 'mobile', 'phones', 'banned'],
 'TWI0033': ['DM',
  ':',
  'You',
  'may',
  'forgive',
  'him',
  ',',
  'Rihanna',
  ',',
  'but',
  'battered',
  'women',
  'wo',
  "n't",
  '.'],
 'TWI0034': ['Gordon',
  'Brown',
  'refuses',
  'to',
  'hand',
  'back',
  '$',
  '3m',
  'pension'],
 'TWI0039': ['Bono', 'is', 'NOT', 'god', '!'],
 'TWI0040': ['Props', 'to', 'Jamba', 'Juice', '.'],
 'TWI0042': ['I',
  'do',
  "n't",
  'see',
  'how',
  'Leno',
  'at',
  '10pm',
  'would',
  'be',
  'the',
  'least',
  'bit',
  'profitable',
  'for',
  'NBC',
  '.'],
 'TWI0043': ['Cheap',
  'is',
  'all',
  'that',
  'it',
  "'s",
  'got',
  'going',
  'for',
  'it',
  '.'],
 'TWI0044': ['Sky',
  'News',
  'URL',
  'I',
  'just',
  'quoted',
  'was',
  '301characters',
  'long',
  '!',
  '!',
  '!'],
 'TWI0049': ['Fuck',
  'the',
  'mets',
  'and',
  'their',
  '$',
  '100',
  'tickets',
  '.'],
 'TWI0050': ['Rush', 'Limbaugh', 'is', 'not', 'panel', 'material', '.'],
 'TWI0051': ['Did',
  'you',
  'see',
  'his',
  'interview',
  'with',
  'Barbara',
  'Walters',
  '?'],
 'TWI0056': ['I',
  "'m",
  'gald',
  'that',
  'the',
  'Dallas',
  'Cowboys',
  'dumped',
  'that',
  'whinner',
  'Terrell',
  'Owens',
  '.'],
 'TWI0058': ['Username',
  'I',
  'have',
  'science',
  'on',
  'my',
  'side',
  'Urlname',
  'Urlname',
  'you',
  'have',
  'Rush',
  'Limbaugh',
  'and',
  'empty',
  'rhetoric',
  '.'],
 'TWI0059': ['Obama',
  'puts',
  'GM',
  ',',
  'Chrysler',
  'on',
  'short',
  'leash',
  'Urlname'],
 'TWI0061': ['ACMA',
  'doing',
  'it',
  "'s",
  'part',
  'to',
  'kill',
  'of',
  'tv',
  '.'],
 'TWI0062': ['Urlname', 'we', 'liked', 'Underbelly', 'Uncut', '.'],
 'TWI0063': ['Does',
  'anyone',
  'really',
  'think',
  'a',
  'big-screen',
  'Kindle',
  'can',
  'save',
  'newspapers',
  '?',
  ':'],
 'TWI0064': ['Here',
  'comes',
  'another',
  'possible',
  'savior',
  'for',
  'the',
  'de',
  '...'],
 'TWI0065': ['trying',
  'to',
  'figure',
  'out',
  'why',
  'rush',
  'limbaugh',
  "'s",
  'head',
  'keeps',
  'getting',
  'bigger',
  ',',
  'it',
  "'s",
  'usually',
  'just',
  'his',
  'nose',
  '.'],
 'TWI0067': ['North',
  'Korea',
  'will',
  'never',
  'be',
  'able',
  'to',
  'live',
  'this',
  'one',
  'down',
  '!'],
 'TWI0071': ['73',
  'Executives',
  'got',
  'millions',
  'at',
  'AIG',
  'and',
  '11',
  'of',
  'them',
  'do',
  "n't",
  'even',
  'work',
  'there',
  'any',
  'more',
  '!'],
 'TWI0073': ['BACK', 'TO', 'NAIL', 'BUSINESS', '...'],
 'TWI0075': ['Call',
  'me',
  'crazy',
  'but',
  'I',
  'love',
  'the',
  'smell',
  'of',
  'old',
  'books',
  'f',
  '...'],
 'TWI0076': ['RT',
  'Username',
  ':',
  '``',
  'Look',
  'for',
  'Oracle',
  'to',
  'do',
  'a',
  'little',
  'pruning',
  'before',
  'it',
  'blows',
  'any',
  'dough',
  'on',
  'Sun',
  "'s",
  'hardware',
  'business',
  ',',
  "''",
  'Urlname'],
 'TWI0079': ['Should',
  'be',
  'sellers',
  'this',
  'year',
  '-',
  'parts',
  'r',
  'worth',
  'far',
  'more',
  'than',
  'the',
  'whole',
  '.'],
 'TWI0080': ['Go', 'with', 'youth', '-', 'enough', 'already'],
 'TWI0081': ['Just',
  'told',
  'Lloyds',
  'to',
  'go',
  'do',
  'one',
  'their',
  'new',
  'lending',
  'is',
  '3.5',
  '%',
  'over',
  'base',
  'rate',
  'on',
  'well',
  'secured',
  'loans',
  '!'],
 'TWI0083': ['RP',
  "'s",
  'response',
  'to',
  'Obama',
  ',',
  'JUST',
  'up',
  ':',
  'Barack',
  'Obama',
  'is',
  '``',
  'preaching',
  'inflation',
  '...',
  'economic',
  'fascism',
  "''",
  'Urlname',
  '#tlot',
  '...'],
 'TWI0084': ['Obama',
  'notes',
  'earlier',
  'today',
  'to',
  'ABC',
  'that',
  'he',
  'is',
  '``',
  'unaware',
  'of',
  'tea',
  'parties',
  '.',
  "''"],
 'TWI0086': ['RT',
  'Username',
  'Sky',
  'News',
  'creates',
  'a',
  'Twitter',
  'corresp',
  '.'],
 'TWI0088': ['Should',
  'be',
  'rolling',
  'that',
  'into',
  'regular',
  'work',
  'rather',
  'th',
  '...'],
 'TWI0091': ['Username',
  'first',
  'line',
  'from',
  'ABC',
  'news',
  ':',
  'North',
  'Korea',
  'defiantly',
  'launched',
  '...'],
 'TWI0092': ['Adding',
  'the',
  'word',
  'defiantly',
  'perpetrates',
  'that',
  'is',
  'was',
  'wrong',
  '.'],
 'TWI0094': ['Lloyds',
  'bonuses',
  'should',
  'be',
  'taxed',
  '99',
  'p',
  'in',
  'the',
  'pound',
  '.'],
 'WBL0018': ['Does', 'anybody', 'use', 'it', 'for', 'anything', 'else', '?'],
 'WBL0049': ['As',
  'recently',
  'as',
  'last',
  'week',
  'the',
  'official',
  'line',
  'stated',
  'they',
  'had',
  'no',
  'knowledge',
  'he',
  'had',
  'entered',
  'the',
  'country',
  '.'],
 'WBL0058': ['-LRB-',
  'Ask',
  'Terry',
  'Nichols',
  'about',
  'the',
  'Philippines',
  '.',
  '-RRB-'],
 'WBL0061': ['And',
  'to',
  'those',
  'who',
  'do',
  "n't",
  'even',
  'know',
  'their',
  'crimes',
  ',',
  'not',
  'even',
  'that',
  '.'],
 'WBL0062': ['But', 'will', 'Posada', 'be', 'given', 'up', '?'],
 'WBL0065': ['The',
  'hope',
  'may',
  'be',
  'that',
  'Posada',
  'can',
  'soon',
  'be',
  'offered',
  'to',
  'a',
  'post-Chavez',
  'Allawi',
  '-',
  'like',
  'puppet',
  'in',
  'Caracas',
  '.'],
 'WBL0067': ['But',
  'Posada',
  "'s",
  'nearly',
  '80',
  'years',
  'old',
  ',',
  'and',
  'the',
  'Venezuelan',
  'people',
  'will',
  'ensure',
  'that',
  "'s",
  'a',
  'vain',
  'hope',
  '.'],
 'WBL0071': ['His',
  'case',
  'threatens',
  'the',
  'consensus',
  'fiction',
  'of',
  'the',
  '"',
  'War',
  'on',
  'Terror',
  '.',
  '"'],
 'WBL0083': ['Wilson',
  'was',
  'claiming',
  'that',
  'he',
  'had',
  'been',
  'working',
  'for',
  'the',
  'CIA',
  'when',
  'he',
  'sold',
  'the',
  'C',
  '-',
  '4',
  'to',
  'Quaddaffi',
  '.'],
 'WBL0119': ['It', 'pretty', 'much', 'covers', 'dating', 'stuff', '.'],
 'WBL0120': ['I', "'ll", 'be', 'sure', 'to', 'come', 'back', '.'],
 'WBL0129': ['You', 'CAN', 'Do', 'It', '.'],
 'WBL0130': ['And',
  'It',
  "'s",
  'Not',
  'Hard',
  'To',
  'Do',
  '....',
  'IF',
  'You',
  'Know',
  'HOW',
  '.'],
 'WBL0136': ['And',
  'you',
  'search',
  'in',
  'vain',
  'to',
  'find',
  'just',
  'one',
  'law',
  'abiding',
  'citizen'],
 'WBL0138': ['Afraid',
  'I',
  'do',
  "n't",
  'have',
  'time',
  'today',
  'to',
  'discuss',
  'these',
  ',',
  'but',
  'some',
  'stories',
  'need',
  'attention',
  ':'],
 'WBL0141': ['Some',
  '200,000',
  'guns',
  'the',
  'US',
  'sent',
  'to',
  'Iraqi',
  'security',
  'forces',
  'may',
  'have',
  'been',
  'smuggled',
  'to',
  'terrorists',
  ',',
  'it',
  'was',
  'feared',
  'yesterday',
  '.'],
 'WBL0143': ['But',
  'the',
  'four',
  'planeloads',
  'of',
  'arms',
  'have',
  'vanished',
  '.'],
 'WBL0145': ['But',
  'the',
  'work',
  'was',
  'contracted',
  'out',
  'via',
  'a',
  'complex',
  'web',
  'of',
  'private',
  'arms',
  'traders',
  '.'],
 'WBL0150': ['American',
  'defence',
  'chiefs',
  'hired',
  'a',
  'US',
  'firm',
  'to',
  'take',
  'the',
  'guns',
  ',',
  'from',
  'the',
  '90s',
  'Bosnian',
  'war',
  ',',
  'to',
  'Iraq',
  '.'],
 'WBL0182': ['The', 'will', 'is', 'another', 'matter', '.'],
 'WBL0206': ['No', 'need', 'to', 'worry', '.'],
 'WBL0212': ['Then',
  ',',
  'of',
  'course',
  ',',
  'there',
  'is',
  'the',
  'evidence',
  'the',
  'jury',
  'did',
  'not',
  'hear',
  'about',
  'in',
  'the',
  'Robinson',
  'case',
  '...'],
 'WBL0228': ['no',
  'one',
  'was',
  'charged',
  'for',
  'forced',
  'marriage',
  ',',
  'only',
  'the',
  'beating',
  '.'],
 'WBL0241': ['A',
  'country',
  'deserves',
  'the',
  'leaders',
  'it',
  'has',
  ',',
  'my',
  'friends',
  '...'],
 'WBL0242': ['And',
  'MEK',
  '--',
  'the',
  'Iranian',
  '-LRB-',
  'not',
  'Iraqi',
  '-RRB-',
  'terror',
  'group',
  'in',
  'question',
  '--',
  'is',
  'itself',
  'unquestionably',
  'a',
  'cult',
  ':'],
 'WBL0267': ['"',
  'Another',
  'generation',
  'of',
  'heavy',
  'metal',
  'has',
  'taken',
  'over',
  ',',
  'and',
  '--',
  'sorry',
  '--',
  'it',
  'ai',
  "n't",
  'just',
  'about',
  'strippers',
  'and',
  'dope',
  '.'],
 'WBL0268': ['Okay',
  ',',
  'it',
  "'s",
  'partly',
  'about',
  'strippers',
  'and',
  'dope',
  '.'],
 'WBL0279': ['You',
  'know',
  ',',
  'nature',
  'hates',
  'a',
  'void',
  '.',
  ':-RRB-'],
 'WBL0285': ['They',
  'were',
  'death',
  'metal',
  'brainwashed',
  'fans',
  ',',
  'literally',
  'fulfilling',
  'the',
  'death',
  'metal',
  'paradigm',
  ',',
  'er',
  '...',
  'morality',
  '.'],
 'WBL0288': ['I',
  'detect',
  'the',
  'hissing',
  'lisp',
  'of',
  'the',
  'lying',
  'serpent',
  'in',
  'this',
  'article',
  '.'],
 'WBL0290': ['I',
  'think',
  'the',
  'jury',
  "'s",
  'still',
  'out',
  'on',
  'exactly',
  'who',
  'did',
  'the',
  'brainwashing',
  'when',
  'in',
  'regard',
  'to',
  'the',
  'Columbine',
  'killers',
  '.'],
 'WBL0300': ['Imaginary',
  'evil',
  'is',
  'romantic',
  'and',
  'varied',
  ';',
  'real',
  'evil',
  'is',
  'gloomy',
  ',',
  'monotonous',
  ',',
  'barren',
  ',',
  'boring',
  '.'],
 'WBL0324': ['During',
  'this',
  'search',
  ',',
  'Michele',
  'Tollis',
  'became',
  'convinced',
  'that',
  'satanism',
  'had',
  'something',
  'to',
  'do',
  'with',
  'his',
  'son',
  "'s",
  'disappearance',
  '.'],
 'WBL0362': ['Why', 'do', "n't", 'you', 'state', 'it', '?'],
 'WBL0365': ['John', 'Balance', 'from', 'Coil', '.'],
 'WBL0382': ['I',
  'was',
  'amazed',
  'at',
  'the',
  'spiel',
  'they',
  'delivered',
  '.'],
 'WBL0406': ['I',
  "'m",
  'wary',
  'of',
  'jumping',
  'into',
  'this',
  'fray',
  'without',
  'backup',
  ',',
  'that',
  "'s",
  'for',
  'damn',
  'sure',
  '.',
  ':-RRB-'],
 'WBL0412': ['Similarly',
  ',',
  'is',
  'invoking',
  'ancient',
  'Egyptian',
  'gods',
  'and',
  'goddesses',
  'a',
  'reaffirmation',
  'of',
  'an',
  'oppressive',
  'and',
  'hierarchical',
  'system',
  '?'],
 'WBL0419': ['A',
  'bit',
  'of',
  'a',
  'knee',
  '-',
  'jerk',
  'reaction',
  'from',
  'me',
  'there',
  ',',
  'and',
  'I',
  'apologize',
  'for',
  'my',
  'over-generalizations',
  '.'],
 'WBL0444': ['I', "'m", 'just', 'speculating', 'now', '.'],
 'WBL0450': ['It',
  "'s",
  'on',
  'loan',
  ',',
  'by',
  'the',
  'way',
  ',',
  'from',
  'a',
  'guy',
  'named',
  'Joe',
  "O'Neill",
  'in',
  'Midland',
  ',',
  'Texas',
  '.'],
 'WBL0459': ['That',
  "'s",
  'a',
  'Senate',
  'term',
  '--',
  'particularly',
  'on',
  'good',
  'judges',
  '.'],
 'WBL0470': ['And',
  'I',
  'gave',
  'it',
  'all',
  'my',
  'heart',
  ',',
  'all',
  'my',
  'energy',
  ',',
  'based',
  'upon',
  'principles',
  'that',
  'did',
  'not',
  'change',
  'once',
  'I',
  'got',
  'into',
  'the',
  'Oval',
  'Office',
  '.'],
 'WBL0490': ['Russia',
  'also',
  'announced',
  'that',
  'it',
  'was',
  'seeking',
  'and',
  'building',
  'the',
  'best',
  'nukes',
  'the',
  'world',
  "'s",
  'ever',
  'seen',
  '.'],
 'WBL0491': ['President',
  'Vladimir',
  'Putin',
  'said',
  'Russia',
  'is',
  'will',
  'have',
  'new',
  'nuclear',
  'weapons',
  'that',
  'other',
  'countries',
  'do',
  'not',
  'and',
  'will',
  'not',
  'have',
  '.'],
 'WBL0498': ['It',
  'is',
  'rumored',
  'that',
  'North',
  'Korea',
  'has',
  'at',
  'least',
  'a',
  'couple',
  'nuclear',
  'weapons',
  '.'],
 'WBL0516': ['All',
  'this',
  'is',
  'highly',
  'unlikely',
  ',',
  'as',
  'with',
  'most',
  'al',
  '-',
  'Qaeda',
  'crackpot',
  'schemes',
  '.'],
 'WBL0532': ['Response',
  ':',
  'Iraq',
  'is',
  'actually',
  'hostile',
  'territory',
  'for',
  'al',
  '-',
  'Qaeda',
  ',',
  'and',
  'without',
  'Iraqi',
  'sympathizers',
  'it',
  'can',
  'not',
  'succeed',
  'there',
  '.'],
 'WBL0545': ['The',
  'clerics',
  'demanded',
  'talks',
  'with',
  'local',
  'US',
  'commanders',
  '.'],
 'WBL0554': ['The',
  'US',
  'troops',
  'fired',
  'into',
  'the',
  'hostile',
  'crowd',
  ',',
  'killing',
  '4',
  '.'],
 'WSJ0058': ['Many',
  'have',
  'raised',
  'cash',
  'levels',
  ',',
  'which',
  'act',
  'as',
  'a',
  'buffer',
  'against',
  'steep',
  'market',
  'declines',
  '.'],
 'WSJ0067': ['Also',
  ',',
  'persistent',
  'redemptions',
  'would',
  'force',
  'some',
  'fund',
  'managers',
  'to',
  'dump',
  'stocks',
  'to',
  'raise',
  'cash',
  '.'],
 'WSJ0068': ['But',
  'a',
  'strong',
  'level',
  'of',
  'investor',
  'withdrawals',
  'is',
  'much',
  'more',
  'unlikely',
  'this',
  'time',
  'around',
  ',',
  'fund',
  'managers',
  'said',
  '.'],
 'WSJ0101': ['She',
  'added',
  ',',
  '``',
  'If',
  'they',
  'all',
  'were',
  'bullish',
  ',',
  'I',
  "'d",
  'really',
  'be',
  'upset',
  '.',
  "''"],
 'WSJ0108': ['``', 'The', 'projects', 'are', 'big', '.'],
 'WSJ0136': ['``',
  'We',
  'usually',
  'operate',
  'in',
  'that',
  'conservative',
  'manner',
  '.',
  "''"],
 'WSJ0150': ['``',
  'The',
  'studies',
  '-LCB-',
  'on',
  'closing',
  'the',
  'unit',
  '-RCB-',
  'could',
  "n't",
  'be',
  'completed',
  'until',
  'now',
  ',',
  "''",
  'he',
  'said',
  '.'],
 'WSJ0166': ['Meanwhile',
  ',',
  'Brazil',
  'is',
  'expected',
  'to',
  'increase',
  'its',
  'allowance',
  'from',
  'the',
  '1.43',
  '%',
  'share',
  'it',
  'has',
  'had',
  'in',
  'recent',
  'years',
  '.'],
 'WSJ0167': ['The',
  'EC',
  'and',
  'Japan',
  '--',
  'the',
  'U.S.',
  "'s",
  'largest',
  'steel',
  'suppliers',
  '--',
  'have',
  "n't",
  'been',
  'filling',
  'their',
  'quotas',
  'to',
  'the',
  'full',
  'extent',
  '.'],
 'WSJ0171': ['But',
  'its',
  'quota',
  'has',
  'been',
  'as',
  'high',
  'as',
  '6.9',
  '%',
  'in',
  '1984',
  '.'],
 'WSJ0188': ['``',
  'They',
  'do',
  "n't",
  'want',
  'to',
  'get',
  'caught',
  'again',
  ',',
  "''",
  'says',
  'one',
  'industry',
  'watcher',
  '.'],
 'WSJ0193': ['The',
  'company',
  'introduced',
  'a',
  'superconcentrated',
  'Lemon',
  'Cheer',
  'in',
  'Japan',
  'after',
  'watching',
  'the',
  'success',
  'of',
  'Attack',
  '.'],
 'WSJ0194': ['When',
  'Attack',
  'hit',
  'the',
  'shelves',
  'in',
  '1987',
  ',',
  'P&G',
  "'s",
  'share',
  'of',
  'the',
  'Japanese',
  'market',
  'fell',
  'to',
  'about',
  '8',
  '%',
  'from',
  'more',
  'than',
  '20',
  '%',
  '.'],
 'WSJ0200': ['But',
  'the',
  'product',
  ',',
  'which',
  'was',
  "n't",
  'as',
  'concentrated',
  'as',
  'the',
  'new',
  'Cheer',
  ',',
  'bombed',
  'in',
  'a',
  'market',
  'test',
  'in',
  'Denver',
  'and',
  'was',
  'dropped',
  '.'],
 'WSJ0202': ['But',
  'P&G',
  'contends',
  'the',
  'new',
  'Cheer',
  'is',
  'a',
  'unique',
  'formula',
  'that',
  'also',
  'offers',
  'an',
  'ingredient',
  'that',
  'prevents',
  'colors',
  'from',
  'fading',
  '.'],
 'WSJ0215': ['They',
  'expected',
  'a',
  'new',
  'barrage',
  'of',
  'demands',
  'that',
  'Japan',
  'do',
  'something',
  'quickly',
  'to',
  'reduce',
  'its',
  'trade',
  'surplus',
  'with',
  'the',
  'U.S.',
  '.'],
 'WSJ0241': ['``',
  'But',
  'I',
  'am',
  'relieved',
  'to',
  'see',
  'that',
  'you',
  'are',
  'beautiful',
  'and',
  'gentle',
  'and',
  'intelligent',
  'and',
  'a',
  'person',
  'of',
  'integrity',
  '.',
  "''"],
 'WSJ0252': ['The',
  'units',
  'that',
  'filed',
  'the',
  'suit',
  'are',
  'Southeastern',
  'Newspapers',
  'Corp.',
  'and',
  'Florida',
  'Publishing',
  'Co',
  '.'],
 'WSJ0270': ['Beginning',
  'in',
  'mid-1987',
  ',',
  'prices',
  'began',
  'accelerating',
  'as',
  'a',
  'growing',
  'U.S.',
  'economy',
  'and',
  'the',
  'weak',
  'dollar',
  'spurred',
  'demand',
  '.'],
 'WSJ0277': ['In',
  'addition',
  ',',
  'crude',
  'oil',
  'prices',
  'were',
  'up',
  'some',
  'from',
  'a',
  'year',
  'earlier',
  ',',
  'further',
  'pressuring',
  'profitability',
  '.'],
 'WSJ0279': ['During',
  'the',
  '1988',
  'second',
  'half',
  ',',
  'many',
  'companies',
  'posted',
  'record',
  'gasoline',
  'and',
  'chemical',
  'profits',
  '.'],
 'WSJ0290': ['``',
  'We',
  "'ve",
  'been',
  'very',
  'disappointed',
  'in',
  'the',
  'performance',
  'of',
  'natural',
  'gas',
  'prices',
  ',',
  "''",
  'said',
  'Mr.',
  'Cox',
  ',',
  'Phillips',
  "'s",
  'president',
  '.'],
 'WSJ0292': ['Going',
  'into',
  'the',
  'fourth',
  'quarter',
  ',',
  'natural',
  'gas',
  'prices',
  'are',
  'anywhere',
  'from',
  '8',
  '%',
  'to',
  '17',
  '%',
  'lower',
  'than',
  'a',
  'year',
  'earlier',
  '.'],
 'WSJ0300': ['Olivetti',
  'reportedly',
  'began',
  'shipping',
  'these',
  'tools',
  'in',
  '1984',
  '.'],
 'WSJ0303': ['President',
  'Bush',
  'called',
  'his',
  'attention',
  'to',
  'the',
  'matter',
  'during',
  'the',
  'Italian',
  'leader',
  "'s",
  'visit',
  'here',
  'last',
  'week',
  '.'],
 'WSJ0316': ['Officials',
  'at',
  'Drexel',
  'said',
  'they',
  'had',
  "n't",
  'seen',
  'the',
  'suit',
  'and',
  'thus',
  'could',
  "n't",
  'comment',
  '.'],
 'WSJ0322': ['However',
  ',',
  'the',
  'agreement',
  'was',
  'canceled',
  'in',
  'June',
  '1984',
  '.'],
 'WSJ0333': ['And',
  'they',
  "'re",
  'likely',
  'to',
  'stay',
  'that',
  'way',
  'for',
  'months',
  'to',
  'come',
  ',',
  'analysts',
  'say',
  '.'],
 'WSJ0344': ['Until',
  'then',
  ',',
  'limited',
  'stocks',
  'are',
  'likely',
  'to',
  'keep',
  'prices',
  'near',
  'the',
  '$',
  '4',
  '-',
  'a',
  '-',
  'bushel',
  'level',
  ',',
  'analysts',
  'say',
  '.'],
 'WSJ0350': ['Ferruzzi',
  'has',
  'denied',
  'it',
  'was',
  'trying',
  'to',
  'manipulate',
  'the',
  'soybean',
  'futures',
  'market',
  '.'],
 'WSJ0419': ['Bozell',
  'joins',
  'Backer',
  'Spielvogel',
  'Bates',
  'and',
  'Ogilvy',
  'Group',
  'as',
  'U.S.',
  'agencies',
  'with',
  'interests',
  'in',
  'Korean',
  'agencies',
  '.'],
 'WSJ0422': ['The',
  'company',
  'asked',
  'for',
  'a',
  '15',
  '-',
  'day',
  'extension',
  'Sept.',
  '30',
  ',',
  'when',
  'the',
  'financial',
  'reports',
  'were',
  'due',
  '.'],
 'WSJ0444': ['Now',
  ',',
  'GM',
  'appears',
  'to',
  'be',
  'stepping',
  'up',
  'the',
  'pace',
  'of',
  'its',
  'factory',
  'consolidation',
  'to',
  'get',
  'in',
  'shape',
  'for',
  'the',
  '1990s',
  '.'],
 'WSJ0453': ['The',
  'shutdowns',
  'will',
  'idle',
  'about',
  '3,000',
  'Canadian',
  'assembly',
  'workers',
  'and',
  'about',
  '2,500',
  'workers',
  'in',
  'Ohio',
  '.'],
 'WSJ0458': ['That',
  'announcement',
  'left',
  'union',
  'officials',
  'in',
  'Van',
  'Nuys',
  'and',
  'Oklahoma',
  'City',
  'uncertain',
  'about',
  'their',
  'futures',
  '.'],
 'WSJ0470': ['Weatherford',
  'said',
  'market',
  'conditions',
  'led',
  'to',
  'the',
  'cancellation',
  'of',
  'the',
  'planned',
  'exchange',
  '.'],
 'WSJ0491': ['``',
  'Without',
  'official',
  'knowledge',
  'of',
  'sex',
  'or',
  'death',
  ',',
  'we',
  'flirted',
  'with',
  'both',
  ',',
  "''",
  'she',
  'writes',
  '.'],
 'WSJ0492': ['She',
  'analyzed',
  'families',
  'by',
  'their',
  'sleeping',
  'arrangements',
  '.'],
 'WSJ0511': ['But',
  'it',
  'does',
  "n't",
  'take',
  'too',
  'many',
  'lines',
  'to',
  'figure',
  'Harry',
  'out',
  '.'],
 'WSJ0532': ['``',
  'This',
  'further',
  'confuses',
  'retailers',
  ',',
  "''",
  'she',
  'says',
  '.'],
 'WSJ0544': ['Hardest',
  'hit',
  'are',
  'what',
  'he',
  'calls',
  '``',
  'secondary',
  "''",
  'sites',
  'that',
  'primarily',
  'serve',
  'neighborhood',
  'residents',
  '.'],
 'WSJ0555': ['By',
  'contrast',
  ',',
  'rentals',
  'in',
  'the',
  'best',
  'retail',
  'locations',
  'in',
  'Boston',
  ',',
  'San',
  'Francisco',
  'and',
  'Chicago',
  'rarely',
  'top',
  '$',
  '100',
  'a',
  'square',
  'foot',
  '.'],
 'WSJ0574': ['Justices',
  'Brennan',
  'and',
  'Stevens',
  'appear',
  'philosophical',
  'about',
  'it',
  ';',
  'Justices',
  'Marshall',
  'and',
  'Blackmun',
  'appear',
  'fighting',
  'mad',
  '.'],
 'WSJ0606': ['In',
  'these',
  'four',
  ',',
  'for',
  'instance',
  ',',
  'the',
  'RTC',
  'is',
  'stuck',
  'with',
  '$',
  '4.51',
  'billion',
  'in',
  'bad',
  'assets',
  '.'],
 'WSJ0616': ['NCNB',
  'will',
  'acquire',
  'University',
  'Federal',
  'Savings',
  'Association',
  ',',
  'Houston',
  ',',
  'which',
  'had',
  'assets',
  'of',
  '$',
  '2.8',
  'billion',
  '.'],
 'WSJ0630': ['Limited',
  'Inc.',
  ',',
  'offering',
  'of',
  'up',
  'to',
  '$',
  '300',
  'million',
  'of',
  'debt',
  'securities',
  'and',
  'warrants',
  '.'],
 'WSJ0634': ['Trans',
  'World',
  'Airlines',
  'Inc.',
  ',',
  'offering',
  'of',
  '$',
  '150',
  'million',
  'senior',
  'notes',
  ',',
  'via',
  'Drexel',
  'Burnham',
  '.'],
 'WSJ0643': ['Besides',
  ',',
  'Time',
  'executives',
  'think',
  'selling',
  'a',
  'news',
  'magazine',
  'with',
  'a',
  'clock',
  'radio',
  'is',
  'tacky',
  '.'],
 'WSJ0658': ['Both',
  'magazines',
  'are',
  'expected',
  'to',
  'announce',
  'their',
  'ad',
  'rates',
  'and',
  'circulation',
  'levels',
  'for',
  '1990',
  'within',
  'a',
  'month',
  '.'],
 'WSJ0663': ['Its',
  'niche',
  'as',
  'the',
  '``',
  'network',
  'of',
  'record',
  "''",
  'during',
  'major',
  'crises',
  'draws',
  'elite',
  'audiences',
  'around',
  'the',
  'world',
  '.']}

In [11]:
# Load one recorded session from the page-recording app.
# The encoding is given explicitly (UTF-8, like the token CSV read earlier in this
# notebook) so decoding does not depend on the platform's default locale.
with open('../Pagerecordings/2015-12-10_22-11-52test', encoding='utf-8') as datafile:
    flaskdata = json.load(datafile)  # a list of dicts, one per recorded screen

In [12]:
# Build flask_dict: one entry per sentence screen, shaped as
#   [ 'start time:<ts>', 'end time:<ts>', {'tokens': [ {token<i>word: [...]}, ... ]} ]
flask_dict = dict()
all_timestamps = []  # (screen_id, timestamp) of every recorded screen, including fixation screens

# First pass: decode every screen in recording order. Each element of flaskdata is a
# dict whose key is the string repr of the screen content, so it has to be turned
# back into a Python object before use.
screens = []
for letterscreen in flaskdata:
    for screencontent in letterscreen:  # only one per letterscreen
        screencontent = ast.literal_eval(screencontent)
        screens.append(screencontent)
        all_timestamps.append((screencontent['screen_id'], screencontent['timestamp']))

# Second pass: build the entry for every non-fixation screen.
for position, screencontent in enumerate(screens):
    screenid = screencontent['screen_id']
    if 'screen' in screenid:  # fixation-cross screens only contribute their timestamp
        continue
    timestamp = screencontent['timestamp']

    # The end time of a sentence is the timestamp of the screen recorded right after it.
    # If the recording stops on a sentence screen there is no following screen, so the
    # end time is recorded as None instead of raising an IndexError.
    if position + 1 < len(all_timestamps):
        end_timestamp = all_timestamps[position + 1][1]
    else:
        end_timestamp = None

    token_sentence = token_dict[screenid]  # tokenised sentence for this screen
    letter_boxes = list(screencontent['boxes'])  # copied into a list for slicing

    token = []
    for i, word in enumerate(token_sentence):
        if not isinstance(word, str):  # non-string entries are skipped
            continue
        token_number = 'token%sword' % i  # use i+1 to start numbering at 1 instead of 0

        # NOTE(review): this slice always starts at box 0, so every token receives the
        # coordinates of the first len(word) boxes of the sentence rather than its own.
        # A running offset is probably needed, but the right step depends on whether
        # 'boxes' contains entries for whitespace -- confirm against the recorded data.
        word_in_sent = letter_boxes[0:len(word)]
        top_coor = word_in_sent[0]['top']
        bottom_coor = word_in_sent[0]['bottom']
        left_coor = word_in_sent[0]['left']
        right_coor = word_in_sent[-1]['right']

        token.append({token_number: [word,
                                     'top_coor:%s' % top_coor,
                                     'bottom_coor:%s' % bottom_coor,
                                     'left_coor:%s' % left_coor,
                                     'right_coor:%s' % right_coor]})

    # Start time, end time and tokens are written together, so the positions below
    # stay fixed even if the same screen id occurs more than once in a recording.
    flask_dict[screenid] = ['start time:%s' % timestamp,
                            'end time:%s' % end_timestamp,
                            {'tokens': token}]

# Access pattern for a sentence id, e.g. 'MAI0498':
#   flask_dict['MAI0498'][0]            -> start time
#   flask_dict['MAI0498'][1]            -> end time
#   flask_dict['MAI0498'][2]['tokens']  -> list of token dicts
flask_dict.keys()  # sentence ids found in this recording


Out[12]:
dict_keys(['WBL0324', 'HDL0100', 'WSJ0643', 'MAI0498', 'HDL0039', 'TWI0018', 'WBL0450', 'TWI0088', 'TWI0042', 'WSJ0292', 'WBL0268', 'TWI0061', 'HDL0068', 'TWI0010'])

In [82]:
# Sketch of the output structure:
# {Sentid_1: {starttime: ..., endtime: ..., tokens: [{token1word: We, top_coor: ..., bottom_coor: ..., left_coor: ..., right_coor: ...}]},
#  Sentid_2: (etc.)
# }

In [ ]:


In [ ]: