In [1]:
from __future__ import division
import numpy as np
import pandas as pd
import codecs
import json
import ast
import json
In [ ]:
In [7]:
# Token-level annotations, one row per token. Only test sentences are in this
# file; demo sentences are not.
tokenfile = pd.read_csv('../LearningTreebanks_gr.csv', encoding='utf-8', sep='\t')
tokenfile = tokenfile.reset_index()
tokenfile.Stimuliname = tokenfile.Stimuliname.str[:-4]  # drop the last 4 characters, i.e. the '.png' suffix

# Build {sentence id: [token, token, ...]} with the tokens in file order.
# The two columns are still picked by position (1 and 3, counted after
# reset_index() prepended the old index), but explicitly through .iloc:
# integer keys on a label-indexed row Series (row[1][1]) rely on a positional
# fallback that pandas has deprecated.
# NOTE(review): position 1 is presumably Stimuliname and position 3 the token
# text -- confirm against the CSV header.
sentence_ids = tokenfile.iloc[:, 1]
sentence_tokens = tokenfile.iloc[:, 3]

token_dict = dict()
for sentence_id, word in zip(sentence_ids, sentence_tokens):
    token_dict.setdefault(sentence_id, []).append(word)

# Dictionary with sentence ids as keys and the list of that sentence's tokens as values.
token_dict
Out[7]:
{'HDL0003': ['Jewelry', 'Makers', 'Copy', 'Cosmetics', 'Sales', 'Ploys'],
'HDL0005': ['Braumeisters',
'Ltd.',
'tests',
'a',
'beer',
'brewed',
'with',
'oat',
'bran',
',',
'rather',
'than',
'rice',
'or',
'corn',
'.'],
'HDL0016': ['HUGO', 'FELLED', 'vast', 'timberlands', '.'],
'HDL0017': ['HE',
'RODE',
'HIS',
'HOBBY',
',',
'but',
'he',
'could',
"n't",
'milk',
'it',
',',
'the',
'Tax',
'Court',
'says',
'.'],
'HDL0020': ['LUTHER',
'BURBANK',
'CROSS',
'-',
'BRED',
'PLANTS',
'to',
'produce',
'the',
'billion',
'-',
'dollar',
'Idaho',
'potato',
'.'],
'HDL0039': ['BROKERAGE',
'HIRING',
'languishes',
'amid',
'market',
'turmoil',
'.'],
'HDL0042': ['THE',
'IRS',
'may',
'taketh',
'what',
'the',
'Labor',
'Department',
'giveth',
'.'],
'HDL0043': ['CORPORATE', 'DOWNSIZING', 'digs', 'deeper', '.'],
'HDL0049': ['Hospital', 'Regulation', 'Sparks', 'Kentucky', 'Feud'],
'HDL0051': ['Related', 'Roommates', 'Trim', 'Hospital', 'Bills'],
'HDL0055': ['In', 'other', 'commodity', 'markets', 'yesterday', ':'],
'HDL0067': ['WHITMAN',
'&',
'RANSOM',
'recruits',
'lawyers',
'from',
'disbanding',
'firm',
':'],
'HDL0068': ['SHORT', 'SKIRTS', 'not', 'welcome', 'in', 'Texas', 'court', ':'],
'HDL0072': ['Rochester', 'Community', 'Savings', 'Bank', '--'],
'HDL0073': ['South',
'Australian',
'Government',
'Finance',
'Authority',
'-LRB-',
'agency',
'-RRB-',
'--'],
'HDL0075': ['EAST',
'GERMANS',
'RALLIED',
'in',
'three',
'cities',
'to',
'demand',
'democratic',
'freedoms',
'.'],
'HDL0079': ['APARTHEID',
'FOES',
'STAGED',
'a',
'massive',
'anti-government',
'rally',
'in',
'South',
'Africa',
'.'],
'HDL0080': ['CONGRESSIONAL',
'LEADERS',
'BACKED',
'Bush',
"'s",
'criticism',
'of',
'Nicaragua',
"'s",
'Ortega',
'.'],
'HDL0081': ['Wives', 'May', 'Not', 'Benefit', 'When', 'Men', 'Do', 'Chores'],
'HDL0083': ['Reagan', 'Era', 'Young', 'Hold', 'Liberal', 'Views'],
'HDL0093': ['Closed', 'End', 'Bond', 'Funds'],
'HDL0095': ['Specialized', 'Equity', 'and', 'Convertible', 'Funds'],
'HDL0100': ['California', 'Thefts', 'Make', 'Travel', 'Agents', 'Jittery'],
'HDL0101': ['Texans', 'Get', 'Reasonable', 'Car', 'Rental', 'Insurance'],
'HDL0102': ['Flight', 'Attendants', 'Lag', 'Before', 'Jets', 'Even', 'Land'],
'HDL0104': ['CRESTMONT',
'FEDERAL',
'SAVINGS',
'&',
'LOAN',
'ASSOCIATION',
'-LRB-',
'Edison',
',',
'N.J.',
'-RRB-',
'--'],
'HDL0119': ['Odd', 'Cars', ',', 'Funny', 'Names'],
'HDL0125': ['Office', 'Market', 'Weakens', 'In', 'Overbuilt', 'Northeast'],
'HDL0127': ['Housing',
'Developers',
'Try',
'Brand',
'-',
'Name',
'Buildings'],
'HDL0129': ['HEALTH',
'CLUBS',
'gear',
'up',
'for',
'a',
'graying',
'clientele',
'.'],
'HDL0130': ['``',
'HOT',
"''",
'TOPAZ',
'sparks',
'regulator',
',',
'jeweler',
'concern',
'over',
'import',
'of',
'irradiated',
'stones',
'.'],
'HDL0131': ['CAPITAL',
'TRAVELS',
'to',
'Europe',
'as',
'1992',
'unification',
'nears',
'.'],
'HDL0133': ['SUSPECT',
'``',
'SALES',
"''",
'ads',
'are',
'challenged',
'by',
'the',
'Better',
'Business',
'Bureau',
'of',
'Metropolitan',
'New',
'York',
'.'],
'HDL0145': ['MURDER',
'THREAT',
'charged',
'in',
'Haas',
'Securities',
'Corp',
'.',
'stock',
'-',
'manipulation',
'trial',
'.'],
'HDL0146': ['TRUSTEE',
'WHO',
'MONITORED',
'settlement',
'payments',
'to',
'Dalkon',
'Shield',
'claimants',
'quits',
'.'],
'HDL0147': ['CHICAGO',
'LAW',
'FIRM',
'recruits',
'American',
'Express',
'Co.',
'vice',
'president',
':'],
'HDL0154': ['From',
'the',
'Sept.',
'30',
'-',
'Oct',
'.',
'4',
'issue',
'of',
'The',
'Economist',
':'],
'HDL0159': ['PENSION', 'AND', 'PROFIT', '-', 'SHARING', 'RULES', ':'],
'HDL0162': ['Yeast',
'Adapted',
'to',
'Make',
'Gene',
'-',
'Spliced',
'Drugs'],
'HDL0166': ['PAY',
'FOR',
'PERFORMANCE',
'hangs',
'mostly',
'on',
'boss',
"'s",
'subjective',
'view',
'.'],
'HDL0167': ['JAPANESE',
'COMPANIES',
'fare',
'best',
'in',
'U.S.',
'when',
'they',
'give',
'Americans',
'more',
'say',
'.'],
'HDL0178': ['DESPITE',
'VICTORIES',
'this',
'year',
',',
'small',
'business',
'fears',
'losing',
'parental',
'-',
'leave',
'war',
'.'],
'HDL0179': ['IN',
'LOS',
'ANGELES',
',',
'more',
'small',
'businesses',
'ponder',
'adopting',
'a',
'child',
'-',
'care',
'policy',
'.'],
'HDL0180': ['NOVEMBER',
'BALLOTS',
'will',
'contain',
'few',
'referendum',
'or',
'initiative',
'issues',
'that',
'especially',
'affect',
'small',
'business',
'.'],
'HDL0182': ['CALIFORNIA',
',',
'A',
'TREND',
'-',
'SETTER',
'in',
'franchising',
'rules',
',',
'stirs',
'a',
'controversy',
'.'],
'HDL0198': ['DALKON',
'SHIELD',
'CLAIMANTS',
'hope',
'to',
'stop',
'reorganization',
'-',
'plan',
'appeal',
'.'],
'HDL0200': ['TED',
"BUNDY'S",
'LAWYERS',
'switch',
'to',
'victims',
"'",
'side',
'in',
'death',
'-',
'sentence',
'case',
'.'],
'HDL0201': ['THE', 'CASE', 'OF', 'THE', 'FAKE', 'DALIS', ':'],
'HDL0202': ['BRISTOL',
'-',
'MYERS',
'SQUIBB',
'Co',
'.',
'-LRB-',
'New',
'York',
'-RRB-',
'--'],
'HDL0206': ['Showing', 'Up', 'in', 'Court', 'Without', 'Being', 'There'],
'MAI0032': ['call', 'them', 'at', '303-832-8160', '.'],
'MAI0058': ['do',
'you',
'think',
'they',
'are',
'cool',
'b/c',
'of',
'the',
'taco',
'bell',
'dog',
'?'],
'MAI0067': ['I',
'was',
'thinking',
'Kenneally',
"'s",
'at',
'around',
'5',
'.'],
'MAI0130': ['Otherwise',
',',
'I',
'will',
'be',
'sending',
'it',
'to',
'Peoples',
'as',
'our',
'final',
'revision',
'by',
'mid',
'morning',
'.'],
'MAI0145': ['Attached',
'please',
'find',
'the',
'latest',
'enovate',
'risk',
'policy',
'.'],
'MAI0153': ['She',
'is',
'very',
'conscientious',
'about',
'what',
'she',
'signs',
',',
'and',
'who',
'initials',
'what',
'.'],
'MAI0166': ['I',
'have',
'a',
'fax',
'machine',
'at',
'home',
',',
'though',
',',
'if',
'you',
'prefer',
'.'],
'MAI0171': ['If',
'not',
',',
'is',
'there',
'someone',
'else',
'that',
'will',
'?'],
'MAI0188': ['Thanks',
'for',
'your',
'prompt',
'attention',
'to',
'this',
'.'],
'MAI0252': ['the', 'time', ':', '10:00', 'AM', '-', '11:00', 'AM', 'CST'],
'MAI0261': ['Is',
'this',
'related',
'to',
'the',
'problems',
'he',
'is',
'having',
'getting',
'around',
'-LRB-',
'circulatory',
'?',
'-RRB-',
'?'],
'MAI0270': ['I',
'will',
'manage',
'client',
'expectations',
'accordingly',
'.'],
'MAI0303': ['the', 'place', ':', 'EB', '3143C'],
'MAI0304': ['the',
'subject',
':',
'Turbine',
'1',
'and',
'Turbine',
'2',
'Purchase',
'Agreement'],
'MAI0329': ['I',
'can',
'recommend',
'some',
'good',
'restaurants',
'since',
'I',
'took',
'Ric',
'there',
'last',
'year',
'for',
'his',
'birthday',
'.'],
'MAI0330': ['We',
'stayed',
'at',
'the',
'Menger',
'and',
'had',
'a',
'great',
'time',
'.'],
'MAI0333': ['This',
'Friday',
'-',
'Michael',
'goes',
'for',
'a',
'visit',
'at',
'St.',
'Francis',
',',
'which',
'may',
'be',
'his',
'new',
'school',
'-LRB-',
'so',
'far',
',',
'so',
'good',
'-RRB-',
'.'],
'MAI0340': ['It',
'is',
'the',
'officer',
"'s",
'meeting',
'for',
'Enterprise',
',',
'and',
'spouses',
'are',
'invited',
'.'],
'MAI0344': ['I',
"'m",
'watching',
'for',
'some',
'good',
'vacation',
'days',
',',
'also',
'...'],
'MAI0350': ['I',
'have',
'a',
'concern',
'that',
'the',
'Enron',
'optionality',
'bug',
'could',
'bite',
'us',
'on',
'the',
'backside',
'with',
'that',
'one',
'.'],
'MAI0404': ['Also',
',',
'we',
'have',
'attached',
'a',
'pdf',
'black',
'-',
'line',
'of',
'the',
'Guarantee',
'vs',
'the',
'form',
'of',
'guarantee',
'in',
'the',
'Turbine',
'Contract',
'.'],
'MAI0406': ['Do',
'not',
'hesitate',
'to',
'call',
'us',
'with',
'any',
'questions',
'.'],
'MAI0419': ['They',
'are',
'beautiful',
'and',
'will',
'add',
'a',
'lot',
'to',
'our',
'collection',
'.'],
'MAI0427': ['They',
'are',
'kind',
'of',
'in',
'rank',
'order',
'but',
'as',
'I',
'stated',
'if',
'I',
'find',
'the',
'piece',
'that',
'I',
'like',
'we',
'will',
'purchase',
'it',
'.'],
'MAI0437': ['Other',
'impressionist',
'or',
'post',
'impressionist',
'lithos'],
'MAI0453': ['Huskers', 'drool', 'over', 'Sooners', '.'],
'MAI0491': ['I',
'just',
'got',
'your',
'email',
'and',
'I',
'certainly',
'concur',
'with',
'Jeff',
'making',
'the',
'call',
'.'],
'MAI0492': ['He',
'has',
'maintained',
'a',
'good',
'relationship',
'with',
'Mulva',
'.'],
'MAI0498': ['I',
'do',
"n't",
'know',
'if',
'there',
'is',
'anything',
'I',
'can',
'do',
'but',
'I',
"'m",
'always',
'willing',
'to',
'help',
'.'],
'MAI0506': ['Thank', 'you', 'for', 'you', 'patience', '.'],
'MAI0511': ['The',
'Dow',
'then',
'sank',
'to',
'631',
'in',
'December',
'of',
"'70",
'.'],
'MAI0555': ['If',
'you',
'have',
'any',
'other',
'questions',
',',
'please',
'let',
'me',
'know',
'.'],
'MAI0567': ['Adobe',
'Acrobat',
'Reader',
'4.0',
'may',
'be',
'downloaded',
'for',
'FREE',
'from',
'www.adobe.com',
'.'],
'MAI0598': ['What', "'s", 'going', 'on', 'dude', '?'],
'MAI0606': ['Let', 's', 'get', 'together', 'soon', '.'],
'MAI0612': ['How', 'is', 'it', 'going', '?'],
'MAI0630': ['-LRB-',
'See',
'attached',
'file',
':',
'Constellation',
'Power',
'-LRB-',
'GISB',
'draft',
'-RRB-',
'.doc',
'-RRB-',
'-LRB-',
'See',
'attached',
'file',
':',
'Sam3102.doc',
'-RRB-'],
'MAI0634': ['I',
'have',
'sent',
'your',
'question',
're',
'on',
'line',
'trading',
'to',
'that',
'area',
'.'],
'MAI0639': ['Jackie',
'Taylor',
'-',
'she',
'is',
'located',
'at',
'Court',
'House',
'Concessionaire',
'and',
'under',
'her',
'name',
'in',
'the',
'directory',
'.'],
'MAI0652': ['Attached', 'is', 'an', 'image', 'of', 'the', 'GISB', '.'],
'MAI0698': ['Please',
'forward',
'a',
'copy',
'of',
'the',
'J.M.',
'Huber',
'Corporation',
'Guaranty',
'to',
'my',
'attention',
'.'],
'MAI0710': ['Would', 'love', 'for', 'you', 'to', 'join', 'us', '.'],
'MAI0723': ['I', 'better', 'pass', 'on', 'the', 'Comets', 'game', '.'],
'MAI0724': ['My',
'weekends',
'seem',
'to',
'be',
'taken',
'up',
'with',
'condo',
'matters',
',',
'house',
'hunting',
'.'],
'MAI0732': ['The',
'game',
'is',
'at',
'12',
':',
'Sat',
'@',
'Compaq',
'Center',
'.'],
'MAI0742': ['I', 'better', 'pass', 'on', 'the', 'Comets', 'gam', '.'],
'MAI0747': ['Let', 'me', 'know', 'if', 'you', 'are', 'interested', '.'],
'MAI0758': ["Monday's",
'starting',
'next',
'week',
'at',
'4',
'???????????????'],
'MAI0759': ['Wednesday', 'does', "n't", 'work', 'for', 'me', '.'],
'MAI0771': ['He',
'is',
'concerned',
'about',
'the',
'allocation',
'amongst',
'categories',
'-',
'in',
'particular',
',',
'Real',
'Time',
'Traders',
'.'],
'TWI0000': ['Having',
'read',
'many',
'D5000',
'previews',
'I',
"'m",
'worried',
'.'],
'TWI0003': ['I',
'have',
'no',
'``',
'but',
"''",
'to',
'add',
'to',
'qualify',
'that',
'sentence',
'with',
'.'],
'TWI0004': ['Really',
'lovely',
'evening',
'spent',
'with',
'people',
'I',
'used',
'to',
'work',
'with',
'.'],
'TWI0005': ['Take',
'note',
'This',
'Life',
'and',
'Red',
'Dwarf',
',',
'reunions',
'do',
"n't",
'have',
'to',
'be',
'shit',
'.'],
'TWI0007': ['I',
'just',
'think',
'he',
'looks',
'like',
'a',
'big',
'baby',
',',
'and',
'ppl',
'USED',
'to',
'call',
'him',
'that',
'.'],
'TWI0010': ['The',
'Canon',
'is',
'definitely',
'the',
'better',
'of',
'the',
'two',
'.'],
'TWI0012': ['Been',
'using',
'Safari',
'4',
'(',
'OSX',
')',
'today',
'and',
'it',
'has',
"n't",
'been',
'going',
'well',
'.'],
'TWI0015': ['Still', 'laughing', 'at', 'man', 'utd', '.'],
'TWI0016': ['new',
'Red',
'Dwarf',
'character',
"'s",
'accent',
'is',
'already',
'annoying',
'me'],
'TWI0017': ['First', 'half', 'of', 'new', 'Red', 'Dwarf', ':', 'Poor', '.'],
'TWI0018': ['Does',
'anyone',
'else',
'think',
'Lloyds',
'TSB',
'went',
'under',
'because',
'of',
'the',
'horrible',
'music',
'on',
'their',
'TV',
'adverts',
'?'],
'TWI0019': ['NYC',
'promoting',
'LGBT',
'summer',
'tourism',
',',
'hopefully',
'prompting',
'Rush',
'Limbaugh',
'to',
'keep',
'his',
'promise',
'to',
'leave',
'the',
'City'],
'TWI0020': ['Breaking',
'news',
':',
'admin',
'official',
'says',
'Chrysler',
'will',
'file',
'for',
'Chapter',
'11',
'bankruptcy',
'.'],
'TWI0023': ['ok',
',',
'let',
"'s",
'be',
'honest',
',',
'is',
'the',
'iphone',
'really',
'that',
'great',
'?'],
'TWI0025': ['Obama',
'Condemns',
'North',
'Korea',
'Launch',
',',
'Calls',
'for',
'Nuclear',
'Free',
'World',
'-',
'voice',
'of',
'America',
':'],
'TWI0027': ['Oh',
'geez',
',',
'another',
'kiddie',
'eddie',
'murphy',
'movie',
'.'],
'TWI0032': ['No', 'broadband', 'and', 'mobile', 'phones', 'banned'],
'TWI0033': ['DM',
':',
'You',
'may',
'forgive',
'him',
',',
'Rihanna',
',',
'but',
'battered',
'women',
'wo',
"n't",
'.'],
'TWI0034': ['Gordon',
'Brown',
'refuses',
'to',
'hand',
'back',
'$',
'3m',
'pension'],
'TWI0039': ['Bono', 'is', 'NOT', 'god', '!'],
'TWI0040': ['Props', 'to', 'Jamba', 'Juice', '.'],
'TWI0042': ['I',
'do',
"n't",
'see',
'how',
'Leno',
'at',
'10pm',
'would',
'be',
'the',
'least',
'bit',
'profitable',
'for',
'NBC',
'.'],
'TWI0043': ['Cheap',
'is',
'all',
'that',
'it',
"'s",
'got',
'going',
'for',
'it',
'.'],
'TWI0044': ['Sky',
'News',
'URL',
'I',
'just',
'quoted',
'was',
'301characters',
'long',
'!',
'!',
'!'],
'TWI0049': ['Fuck',
'the',
'mets',
'and',
'their',
'$',
'100',
'tickets',
'.'],
'TWI0050': ['Rush', 'Limbaugh', 'is', 'not', 'panel', 'material', '.'],
'TWI0051': ['Did',
'you',
'see',
'his',
'interview',
'with',
'Barbara',
'Walters',
'?'],
'TWI0056': ['I',
"'m",
'gald',
'that',
'the',
'Dallas',
'Cowboys',
'dumped',
'that',
'whinner',
'Terrell',
'Owens',
'.'],
'TWI0058': ['Username',
'I',
'have',
'science',
'on',
'my',
'side',
'Urlname',
'Urlname',
'you',
'have',
'Rush',
'Limbaugh',
'and',
'empty',
'rhetoric',
'.'],
'TWI0059': ['Obama',
'puts',
'GM',
',',
'Chrysler',
'on',
'short',
'leash',
'Urlname'],
'TWI0061': ['ACMA',
'doing',
'it',
"'s",
'part',
'to',
'kill',
'of',
'tv',
'.'],
'TWI0062': ['Urlname', 'we', 'liked', 'Underbelly', 'Uncut', '.'],
'TWI0063': ['Does',
'anyone',
'really',
'think',
'a',
'big-screen',
'Kindle',
'can',
'save',
'newspapers',
'?',
':'],
'TWI0064': ['Here',
'comes',
'another',
'possible',
'savior',
'for',
'the',
'de',
'...'],
'TWI0065': ['trying',
'to',
'figure',
'out',
'why',
'rush',
'limbaugh',
"'s",
'head',
'keeps',
'getting',
'bigger',
',',
'it',
"'s",
'usually',
'just',
'his',
'nose',
'.'],
'TWI0067': ['North',
'Korea',
'will',
'never',
'be',
'able',
'to',
'live',
'this',
'one',
'down',
'!'],
'TWI0071': ['73',
'Executives',
'got',
'millions',
'at',
'AIG',
'and',
'11',
'of',
'them',
'do',
"n't",
'even',
'work',
'there',
'any',
'more',
'!'],
'TWI0073': ['BACK', 'TO', 'NAIL', 'BUSINESS', '...'],
'TWI0075': ['Call',
'me',
'crazy',
'but',
'I',
'love',
'the',
'smell',
'of',
'old',
'books',
'f',
'...'],
'TWI0076': ['RT',
'Username',
':',
'``',
'Look',
'for',
'Oracle',
'to',
'do',
'a',
'little',
'pruning',
'before',
'it',
'blows',
'any',
'dough',
'on',
'Sun',
"'s",
'hardware',
'business',
',',
"''",
'Urlname'],
'TWI0079': ['Should',
'be',
'sellers',
'this',
'year',
'-',
'parts',
'r',
'worth',
'far',
'more',
'than',
'the',
'whole',
'.'],
'TWI0080': ['Go', 'with', 'youth', '-', 'enough', 'already'],
'TWI0081': ['Just',
'told',
'Lloyds',
'to',
'go',
'do',
'one',
'their',
'new',
'lending',
'is',
'3.5',
'%',
'over',
'base',
'rate',
'on',
'well',
'secured',
'loans',
'!'],
'TWI0083': ['RP',
"'s",
'response',
'to',
'Obama',
',',
'JUST',
'up',
':',
'Barack',
'Obama',
'is',
'``',
'preaching',
'inflation',
'...',
'economic',
'fascism',
"''",
'Urlname',
'#tlot',
'...'],
'TWI0084': ['Obama',
'notes',
'earlier',
'today',
'to',
'ABC',
'that',
'he',
'is',
'``',
'unaware',
'of',
'tea',
'parties',
'.',
"''"],
'TWI0086': ['RT',
'Username',
'Sky',
'News',
'creates',
'a',
'Twitter',
'corresp',
'.'],
'TWI0088': ['Should',
'be',
'rolling',
'that',
'into',
'regular',
'work',
'rather',
'th',
'...'],
'TWI0091': ['Username',
'first',
'line',
'from',
'ABC',
'news',
':',
'North',
'Korea',
'defiantly',
'launched',
'...'],
'TWI0092': ['Adding',
'the',
'word',
'defiantly',
'perpetrates',
'that',
'is',
'was',
'wrong',
'.'],
'TWI0094': ['Lloyds',
'bonuses',
'should',
'be',
'taxed',
'99',
'p',
'in',
'the',
'pound',
'.'],
'WBL0018': ['Does', 'anybody', 'use', 'it', 'for', 'anything', 'else', '?'],
'WBL0049': ['As',
'recently',
'as',
'last',
'week',
'the',
'official',
'line',
'stated',
'they',
'had',
'no',
'knowledge',
'he',
'had',
'entered',
'the',
'country',
'.'],
'WBL0058': ['-LRB-',
'Ask',
'Terry',
'Nichols',
'about',
'the',
'Philippines',
'.',
'-RRB-'],
'WBL0061': ['And',
'to',
'those',
'who',
'do',
"n't",
'even',
'know',
'their',
'crimes',
',',
'not',
'even',
'that',
'.'],
'WBL0062': ['But', 'will', 'Posada', 'be', 'given', 'up', '?'],
'WBL0065': ['The',
'hope',
'may',
'be',
'that',
'Posada',
'can',
'soon',
'be',
'offered',
'to',
'a',
'post-Chavez',
'Allawi',
'-',
'like',
'puppet',
'in',
'Caracas',
'.'],
'WBL0067': ['But',
'Posada',
"'s",
'nearly',
'80',
'years',
'old',
',',
'and',
'the',
'Venezuelan',
'people',
'will',
'ensure',
'that',
"'s",
'a',
'vain',
'hope',
'.'],
'WBL0071': ['His',
'case',
'threatens',
'the',
'consensus',
'fiction',
'of',
'the',
'"',
'War',
'on',
'Terror',
'.',
'"'],
'WBL0083': ['Wilson',
'was',
'claiming',
'that',
'he',
'had',
'been',
'working',
'for',
'the',
'CIA',
'when',
'he',
'sold',
'the',
'C',
'-',
'4',
'to',
'Quaddaffi',
'.'],
'WBL0119': ['It', 'pretty', 'much', 'covers', 'dating', 'stuff', '.'],
'WBL0120': ['I', "'ll", 'be', 'sure', 'to', 'come', 'back', '.'],
'WBL0129': ['You', 'CAN', 'Do', 'It', '.'],
'WBL0130': ['And',
'It',
"'s",
'Not',
'Hard',
'To',
'Do',
'....',
'IF',
'You',
'Know',
'HOW',
'.'],
'WBL0136': ['And',
'you',
'search',
'in',
'vain',
'to',
'find',
'just',
'one',
'law',
'abiding',
'citizen'],
'WBL0138': ['Afraid',
'I',
'do',
"n't",
'have',
'time',
'today',
'to',
'discuss',
'these',
',',
'but',
'some',
'stories',
'need',
'attention',
':'],
'WBL0141': ['Some',
'200,000',
'guns',
'the',
'US',
'sent',
'to',
'Iraqi',
'security',
'forces',
'may',
'have',
'been',
'smuggled',
'to',
'terrorists',
',',
'it',
'was',
'feared',
'yesterday',
'.'],
'WBL0143': ['But',
'the',
'four',
'planeloads',
'of',
'arms',
'have',
'vanished',
'.'],
'WBL0145': ['But',
'the',
'work',
'was',
'contracted',
'out',
'via',
'a',
'complex',
'web',
'of',
'private',
'arms',
'traders',
'.'],
'WBL0150': ['American',
'defence',
'chiefs',
'hired',
'a',
'US',
'firm',
'to',
'take',
'the',
'guns',
',',
'from',
'the',
'90s',
'Bosnian',
'war',
',',
'to',
'Iraq',
'.'],
'WBL0182': ['The', 'will', 'is', 'another', 'matter', '.'],
'WBL0206': ['No', 'need', 'to', 'worry', '.'],
'WBL0212': ['Then',
',',
'of',
'course',
',',
'there',
'is',
'the',
'evidence',
'the',
'jury',
'did',
'not',
'hear',
'about',
'in',
'the',
'Robinson',
'case',
'...'],
'WBL0228': ['no',
'one',
'was',
'charged',
'for',
'forced',
'marriage',
',',
'only',
'the',
'beating',
'.'],
'WBL0241': ['A',
'country',
'deserves',
'the',
'leaders',
'it',
'has',
',',
'my',
'friends',
'...'],
'WBL0242': ['And',
'MEK',
'--',
'the',
'Iranian',
'-LRB-',
'not',
'Iraqi',
'-RRB-',
'terror',
'group',
'in',
'question',
'--',
'is',
'itself',
'unquestionably',
'a',
'cult',
':'],
'WBL0267': ['"',
'Another',
'generation',
'of',
'heavy',
'metal',
'has',
'taken',
'over',
',',
'and',
'--',
'sorry',
'--',
'it',
'ai',
"n't",
'just',
'about',
'strippers',
'and',
'dope',
'.'],
'WBL0268': ['Okay',
',',
'it',
"'s",
'partly',
'about',
'strippers',
'and',
'dope',
'.'],
'WBL0279': ['You',
'know',
',',
'nature',
'hates',
'a',
'void',
'.',
':-RRB-'],
'WBL0285': ['They',
'were',
'death',
'metal',
'brainwashed',
'fans',
',',
'literally',
'fulfilling',
'the',
'death',
'metal',
'paradigm',
',',
'er',
'...',
'morality',
'.'],
'WBL0288': ['I',
'detect',
'the',
'hissing',
'lisp',
'of',
'the',
'lying',
'serpent',
'in',
'this',
'article',
'.'],
'WBL0290': ['I',
'think',
'the',
'jury',
"'s",
'still',
'out',
'on',
'exactly',
'who',
'did',
'the',
'brainwashing',
'when',
'in',
'regard',
'to',
'the',
'Columbine',
'killers',
'.'],
'WBL0300': ['Imaginary',
'evil',
'is',
'romantic',
'and',
'varied',
';',
'real',
'evil',
'is',
'gloomy',
',',
'monotonous',
',',
'barren',
',',
'boring',
'.'],
'WBL0324': ['During',
'this',
'search',
',',
'Michele',
'Tollis',
'became',
'convinced',
'that',
'satanism',
'had',
'something',
'to',
'do',
'with',
'his',
'son',
"'s",
'disappearance',
'.'],
'WBL0362': ['Why', 'do', "n't", 'you', 'state', 'it', '?'],
'WBL0365': ['John', 'Balance', 'from', 'Coil', '.'],
'WBL0382': ['I',
'was',
'amazed',
'at',
'the',
'spiel',
'they',
'delivered',
'.'],
'WBL0406': ['I',
"'m",
'wary',
'of',
'jumping',
'into',
'this',
'fray',
'without',
'backup',
',',
'that',
"'s",
'for',
'damn',
'sure',
'.',
':-RRB-'],
'WBL0412': ['Similarly',
',',
'is',
'invoking',
'ancient',
'Egyptian',
'gods',
'and',
'goddesses',
'a',
'reaffirmation',
'of',
'an',
'oppressive',
'and',
'hierarchical',
'system',
'?'],
'WBL0419': ['A',
'bit',
'of',
'a',
'knee',
'-',
'jerk',
'reaction',
'from',
'me',
'there',
',',
'and',
'I',
'apologize',
'for',
'my',
'over-generalizations',
'.'],
'WBL0444': ['I', "'m", 'just', 'speculating', 'now', '.'],
'WBL0450': ['It',
"'s",
'on',
'loan',
',',
'by',
'the',
'way',
',',
'from',
'a',
'guy',
'named',
'Joe',
"O'Neill",
'in',
'Midland',
',',
'Texas',
'.'],
'WBL0459': ['That',
"'s",
'a',
'Senate',
'term',
'--',
'particularly',
'on',
'good',
'judges',
'.'],
'WBL0470': ['And',
'I',
'gave',
'it',
'all',
'my',
'heart',
',',
'all',
'my',
'energy',
',',
'based',
'upon',
'principles',
'that',
'did',
'not',
'change',
'once',
'I',
'got',
'into',
'the',
'Oval',
'Office',
'.'],
'WBL0490': ['Russia',
'also',
'announced',
'that',
'it',
'was',
'seeking',
'and',
'building',
'the',
'best',
'nukes',
'the',
'world',
"'s",
'ever',
'seen',
'.'],
'WBL0491': ['President',
'Vladimir',
'Putin',
'said',
'Russia',
'is',
'will',
'have',
'new',
'nuclear',
'weapons',
'that',
'other',
'countries',
'do',
'not',
'and',
'will',
'not',
'have',
'.'],
'WBL0498': ['It',
'is',
'rumored',
'that',
'North',
'Korea',
'has',
'at',
'least',
'a',
'couple',
'nuclear',
'weapons',
'.'],
'WBL0516': ['All',
'this',
'is',
'highly',
'unlikely',
',',
'as',
'with',
'most',
'al',
'-',
'Qaeda',
'crackpot',
'schemes',
'.'],
'WBL0532': ['Response',
':',
'Iraq',
'is',
'actually',
'hostile',
'territory',
'for',
'al',
'-',
'Qaeda',
',',
'and',
'without',
'Iraqi',
'sympathizers',
'it',
'can',
'not',
'succeed',
'there',
'.'],
'WBL0545': ['The',
'clerics',
'demanded',
'talks',
'with',
'local',
'US',
'commanders',
'.'],
'WBL0554': ['The',
'US',
'troops',
'fired',
'into',
'the',
'hostile',
'crowd',
',',
'killing',
'4',
'.'],
'WSJ0058': ['Many',
'have',
'raised',
'cash',
'levels',
',',
'which',
'act',
'as',
'a',
'buffer',
'against',
'steep',
'market',
'declines',
'.'],
'WSJ0067': ['Also',
',',
'persistent',
'redemptions',
'would',
'force',
'some',
'fund',
'managers',
'to',
'dump',
'stocks',
'to',
'raise',
'cash',
'.'],
'WSJ0068': ['But',
'a',
'strong',
'level',
'of',
'investor',
'withdrawals',
'is',
'much',
'more',
'unlikely',
'this',
'time',
'around',
',',
'fund',
'managers',
'said',
'.'],
'WSJ0101': ['She',
'added',
',',
'``',
'If',
'they',
'all',
'were',
'bullish',
',',
'I',
"'d",
'really',
'be',
'upset',
'.',
"''"],
'WSJ0108': ['``', 'The', 'projects', 'are', 'big', '.'],
'WSJ0136': ['``',
'We',
'usually',
'operate',
'in',
'that',
'conservative',
'manner',
'.',
"''"],
'WSJ0150': ['``',
'The',
'studies',
'-LCB-',
'on',
'closing',
'the',
'unit',
'-RCB-',
'could',
"n't",
'be',
'completed',
'until',
'now',
',',
"''",
'he',
'said',
'.'],
'WSJ0166': ['Meanwhile',
',',
'Brazil',
'is',
'expected',
'to',
'increase',
'its',
'allowance',
'from',
'the',
'1.43',
'%',
'share',
'it',
'has',
'had',
'in',
'recent',
'years',
'.'],
'WSJ0167': ['The',
'EC',
'and',
'Japan',
'--',
'the',
'U.S.',
"'s",
'largest',
'steel',
'suppliers',
'--',
'have',
"n't",
'been',
'filling',
'their',
'quotas',
'to',
'the',
'full',
'extent',
'.'],
'WSJ0171': ['But',
'its',
'quota',
'has',
'been',
'as',
'high',
'as',
'6.9',
'%',
'in',
'1984',
'.'],
'WSJ0188': ['``',
'They',
'do',
"n't",
'want',
'to',
'get',
'caught',
'again',
',',
"''",
'says',
'one',
'industry',
'watcher',
'.'],
'WSJ0193': ['The',
'company',
'introduced',
'a',
'superconcentrated',
'Lemon',
'Cheer',
'in',
'Japan',
'after',
'watching',
'the',
'success',
'of',
'Attack',
'.'],
'WSJ0194': ['When',
'Attack',
'hit',
'the',
'shelves',
'in',
'1987',
',',
'P&G',
"'s",
'share',
'of',
'the',
'Japanese',
'market',
'fell',
'to',
'about',
'8',
'%',
'from',
'more',
'than',
'20',
'%',
'.'],
'WSJ0200': ['But',
'the',
'product',
',',
'which',
'was',
"n't",
'as',
'concentrated',
'as',
'the',
'new',
'Cheer',
',',
'bombed',
'in',
'a',
'market',
'test',
'in',
'Denver',
'and',
'was',
'dropped',
'.'],
'WSJ0202': ['But',
'P&G',
'contends',
'the',
'new',
'Cheer',
'is',
'a',
'unique',
'formula',
'that',
'also',
'offers',
'an',
'ingredient',
'that',
'prevents',
'colors',
'from',
'fading',
'.'],
'WSJ0215': ['They',
'expected',
'a',
'new',
'barrage',
'of',
'demands',
'that',
'Japan',
'do',
'something',
'quickly',
'to',
'reduce',
'its',
'trade',
'surplus',
'with',
'the',
'U.S.',
'.'],
'WSJ0241': ['``',
'But',
'I',
'am',
'relieved',
'to',
'see',
'that',
'you',
'are',
'beautiful',
'and',
'gentle',
'and',
'intelligent',
'and',
'a',
'person',
'of',
'integrity',
'.',
"''"],
'WSJ0252': ['The',
'units',
'that',
'filed',
'the',
'suit',
'are',
'Southeastern',
'Newspapers',
'Corp.',
'and',
'Florida',
'Publishing',
'Co',
'.'],
'WSJ0270': ['Beginning',
'in',
'mid-1987',
',',
'prices',
'began',
'accelerating',
'as',
'a',
'growing',
'U.S.',
'economy',
'and',
'the',
'weak',
'dollar',
'spurred',
'demand',
'.'],
'WSJ0277': ['In',
'addition',
',',
'crude',
'oil',
'prices',
'were',
'up',
'some',
'from',
'a',
'year',
'earlier',
',',
'further',
'pressuring',
'profitability',
'.'],
'WSJ0279': ['During',
'the',
'1988',
'second',
'half',
',',
'many',
'companies',
'posted',
'record',
'gasoline',
'and',
'chemical',
'profits',
'.'],
'WSJ0290': ['``',
'We',
"'ve",
'been',
'very',
'disappointed',
'in',
'the',
'performance',
'of',
'natural',
'gas',
'prices',
',',
"''",
'said',
'Mr.',
'Cox',
',',
'Phillips',
"'s",
'president',
'.'],
'WSJ0292': ['Going',
'into',
'the',
'fourth',
'quarter',
',',
'natural',
'gas',
'prices',
'are',
'anywhere',
'from',
'8',
'%',
'to',
'17',
'%',
'lower',
'than',
'a',
'year',
'earlier',
'.'],
'WSJ0300': ['Olivetti',
'reportedly',
'began',
'shipping',
'these',
'tools',
'in',
'1984',
'.'],
'WSJ0303': ['President',
'Bush',
'called',
'his',
'attention',
'to',
'the',
'matter',
'during',
'the',
'Italian',
'leader',
"'s",
'visit',
'here',
'last',
'week',
'.'],
'WSJ0316': ['Officials',
'at',
'Drexel',
'said',
'they',
'had',
"n't",
'seen',
'the',
'suit',
'and',
'thus',
'could',
"n't",
'comment',
'.'],
'WSJ0322': ['However',
',',
'the',
'agreement',
'was',
'canceled',
'in',
'June',
'1984',
'.'],
'WSJ0333': ['And',
'they',
"'re",
'likely',
'to',
'stay',
'that',
'way',
'for',
'months',
'to',
'come',
',',
'analysts',
'say',
'.'],
'WSJ0344': ['Until',
'then',
',',
'limited',
'stocks',
'are',
'likely',
'to',
'keep',
'prices',
'near',
'the',
'$',
'4',
'-',
'a',
'-',
'bushel',
'level',
',',
'analysts',
'say',
'.'],
'WSJ0350': ['Ferruzzi',
'has',
'denied',
'it',
'was',
'trying',
'to',
'manipulate',
'the',
'soybean',
'futures',
'market',
'.'],
'WSJ0419': ['Bozell',
'joins',
'Backer',
'Spielvogel',
'Bates',
'and',
'Ogilvy',
'Group',
'as',
'U.S.',
'agencies',
'with',
'interests',
'in',
'Korean',
'agencies',
'.'],
'WSJ0422': ['The',
'company',
'asked',
'for',
'a',
'15',
'-',
'day',
'extension',
'Sept.',
'30',
',',
'when',
'the',
'financial',
'reports',
'were',
'due',
'.'],
'WSJ0444': ['Now',
',',
'GM',
'appears',
'to',
'be',
'stepping',
'up',
'the',
'pace',
'of',
'its',
'factory',
'consolidation',
'to',
'get',
'in',
'shape',
'for',
'the',
'1990s',
'.'],
'WSJ0453': ['The',
'shutdowns',
'will',
'idle',
'about',
'3,000',
'Canadian',
'assembly',
'workers',
'and',
'about',
'2,500',
'workers',
'in',
'Ohio',
'.'],
'WSJ0458': ['That',
'announcement',
'left',
'union',
'officials',
'in',
'Van',
'Nuys',
'and',
'Oklahoma',
'City',
'uncertain',
'about',
'their',
'futures',
'.'],
'WSJ0470': ['Weatherford',
'said',
'market',
'conditions',
'led',
'to',
'the',
'cancellation',
'of',
'the',
'planned',
'exchange',
'.'],
'WSJ0491': ['``',
'Without',
'official',
'knowledge',
'of',
'sex',
'or',
'death',
',',
'we',
'flirted',
'with',
'both',
',',
"''",
'she',
'writes',
'.'],
'WSJ0492': ['She',
'analyzed',
'families',
'by',
'their',
'sleeping',
'arrangements',
'.'],
'WSJ0511': ['But',
'it',
'does',
"n't",
'take',
'too',
'many',
'lines',
'to',
'figure',
'Harry',
'out',
'.'],
'WSJ0532': ['``',
'This',
'further',
'confuses',
'retailers',
',',
"''",
'she',
'says',
'.'],
'WSJ0544': ['Hardest',
'hit',
'are',
'what',
'he',
'calls',
'``',
'secondary',
"''",
'sites',
'that',
'primarily',
'serve',
'neighborhood',
'residents',
'.'],
'WSJ0555': ['By',
'contrast',
',',
'rentals',
'in',
'the',
'best',
'retail',
'locations',
'in',
'Boston',
',',
'San',
'Francisco',
'and',
'Chicago',
'rarely',
'top',
'$',
'100',
'a',
'square',
'foot',
'.'],
'WSJ0574': ['Justices',
'Brennan',
'and',
'Stevens',
'appear',
'philosophical',
'about',
'it',
';',
'Justices',
'Marshall',
'and',
'Blackmun',
'appear',
'fighting',
'mad',
'.'],
'WSJ0606': ['In',
'these',
'four',
',',
'for',
'instance',
',',
'the',
'RTC',
'is',
'stuck',
'with',
'$',
'4.51',
'billion',
'in',
'bad',
'assets',
'.'],
'WSJ0616': ['NCNB',
'will',
'acquire',
'University',
'Federal',
'Savings',
'Association',
',',
'Houston',
',',
'which',
'had',
'assets',
'of',
'$',
'2.8',
'billion',
'.'],
'WSJ0630': ['Limited',
'Inc.',
',',
'offering',
'of',
'up',
'to',
'$',
'300',
'million',
'of',
'debt',
'securities',
'and',
'warrants',
'.'],
'WSJ0634': ['Trans',
'World',
'Airlines',
'Inc.',
',',
'offering',
'of',
'$',
'150',
'million',
'senior',
'notes',
',',
'via',
'Drexel',
'Burnham',
'.'],
'WSJ0643': ['Besides',
',',
'Time',
'executives',
'think',
'selling',
'a',
'news',
'magazine',
'with',
'a',
'clock',
'radio',
'is',
'tacky',
'.'],
'WSJ0658': ['Both',
'magazines',
'are',
'expected',
'to',
'announce',
'their',
'ad',
'rates',
'and',
'circulation',
'levels',
'for',
'1990',
'within',
'a',
'month',
'.'],
'WSJ0663': ['Its',
'niche',
'as',
'the',
'``',
'network',
'of',
'record',
"''",
'during',
'major',
'crises',
'draws',
'elite',
'audiences',
'around',
'the',
'world',
'.']}
In [11]:
# Read the page recording written by the browser experiment and parse it as
# JSON. The result is a list of dicts, one per recorded screen.
with open('../Pagerecordings/2015-12-10_22-11-52test') as recording_file:
    recording_text = recording_file.read()
flaskdata = json.loads(recording_text)
In [12]:
# Result: screen id -> ['start time:<t>', 'end time:<t>', {'tokens': [...]}]
flask_dict = dict()
# (screen id, timestamp) of every recorded screen, including fixation screens.
all_timestamps = []

# Every other recorded screen is the fixation-cross screen. Its timestamp is
# still needed, because it marks the end of the sentence shown just before it.
for letterscreen in flaskdata:
    # letterscreen is a dict with one entry; the key is a string that holds a
    # Python literal and has to be turned into a Python object first.
    for screencontent in letterscreen:
        screencontent = ast.literal_eval(screencontent)
        timestamp = screencontent['timestamp']
        screenid = screencontent['screen_id']
        all_timestamps.append((screenid, timestamp))

        if 'screen' in screenid:
            # Fixation screen: only its timestamp is of interest.
            continue

        token = list()
        token_sentence = token_dict[screenid]  # tokens of the sentence shown on this screen
        flask_dict[screenid] = ['start time:%s' % timestamp, {'tokens': token}]

        # Copy the boxes recorded for this screen into a list for slicing.
        letter_boxes = list(screencontent['boxes'])

        for i, word in enumerate(token_sentence):
            if not isinstance(word, str):
                # Non-string entries are skipped.
                continue
            token_number = 'token%sword' % i  # use i + 1 to start counting at 1 instead of 0
            # NOTE(review): this slice always starts at box 0, so every token
            # receives the coordinates of the first len(word) boxes of the
            # sentence rather than of its own boxes. An advancing offset is
            # probably intended, but how spaces and bracket tokens such as
            # '-LRB-' map onto boxes is not visible here -- confirm before
            # changing it.
            word_in_sent = letter_boxes[0:len(word)]
            top_coor = word_in_sent[0]['top']
            bottom_coor = word_in_sent[0]['bottom']
            left_coor = word_in_sent[0]['left']
            right_coor = word_in_sent[-1]['right']
            token.append({token_number: [word,
                                         'top_coor:%s' % top_coor,
                                         'bottom_coor:%s' % bottom_coor,
                                         'left_coor:%s' % left_coor,
                                         'right_coor:%s' % right_coor]})

# Insert the end time of each sentence, using the timestamp of the screen
# recorded just after it (the following fixation screen).
for position, (shown_id, shown_at) in enumerate(all_timestamps):
    if shown_id in flask_dict:
        if position + 1 < len(all_timestamps):
            end_time = all_timestamps[position + 1][1]
        else:
            # The recording ends on a sentence screen, so no later timestamp
            # exists. Record None instead of raising IndexError, which keeps
            # the [start, end, tokens] layout intact.
            end_time = None
        flask_dict[shown_id].insert(1, 'end time:%s' % end_time)

# How to read one entry, e.g. for sentence 'MAI0498':
#   flask_dict['MAI0498'][0]            -> start time
#   flask_dict['MAI0498'][1]            -> end time
#   flask_dict['MAI0498'][2]['tokens']  -> tokens
flask_dict.keys()
Out[12]:
dict_keys(['WBL0324', 'HDL0100', 'WSJ0643', 'MAI0498', 'HDL0039', 'TWI0018', 'WBL0450', 'TWI0088', 'TWI0042', 'WSJ0292', 'WBL0268', 'TWI0061', 'HDL0068', 'TWI0010'])
In [82]:
#{Sentid_1: {starttime: ..., endtime: ..., tokens: [{token1word: We, top_coor: ..., bottom_coor: ..., left_coor: ..., right_coor: ...}]},
# Sentid_2: {...} (etc.)
#}
In [ ]:
In [ ]:
Content source: SigridK/browser-reading
Similar notebooks: