parse_data


Exploration of the original SQuAD dataset


In [12]:
import numpy as np
import json

In [13]:
class DotDict(dict):
    """Dictionary whose entries can also be read and written as attributes.

    ``d.name`` and ``d['name']`` refer to the same entry.  Attribute
    assignment is routed into the mapping itself, so values set with
    ``d.name = value`` are visible through ``d.keys()``, ``d['name']`` and
    anything else that treats the object as a plain dict.

    Names that collide with dict methods (``keys``, ``items``, ...) are still
    stored as entries, but attribute access keeps returning the method; use
    item access for those.
    """

    def __getattr__(self, name):
        """Return ``self[name]``; invoked only when normal lookup fails.

        Raises:
            AttributeError: if ``name`` is not a key.  AttributeError (not
                KeyError) is what ``hasattr`` and ``getattr(obj, name,
                default)`` expect from a missing attribute.
        """
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, value):
        """Store ``value`` under the key ``name`` instead of in ``__dict__``."""
        self[name] = value


# Notebook configuration, gathered in one argparse-like object.
args = DotDict()
# Input JSON file; opened and parsed by the loading cell.
args.data = '../../data/train-v1.1.json'
# Output paths -- presumably for the parsed training / validation splits;
# they are not used in the cells visible here (TODO confirm).
args.outfile = '../../data/train_parsed.json'
args.outfile_valid = '../../data/valid_parsed.json'
# Presumably the fraction of the data kept for training (TODO confirm).
args.train_ratio = 0.9

In [14]:
# Parse the whole JSON file named by args.data into memory; the `with` block
# closes the file handle as soon as parsing is done.
with open(args.data, 'r') as f:
    data = json.load(f)

In [16]:
# Top-level keys of the loaded JSON object.  A cell only echoes its last
# expression, so this is kept as the single expression of the cell.
data.keys()


Out[16]:
[u'version', u'data']

In [19]:
# Version string stored under the top-level 'version' key.
data['version']


Out[19]:
u'1.1'

In [20]:
# Keep only the payload stored under the top-level 'data' key.
# The isinstance guard makes this cell safe to re-run: once `data` has been
# rebound to the inner list, indexing it with 'data' again would raise
# TypeError.
if isinstance(data, dict):
    data = data['data']

In [21]:
# Container type of `data` after it was rebound to the 'data' payload.
type(data)


Out[21]:
list

In [26]:
# Number of top-level entries in the payload list.
len(data)


Out[26]:
442

The SQuAD v1.1 training file contains 442 articles (topics) in total.


In [34]:
# Show the titles of the first 100 entries.  Slicing (rather than indexing
# with range(100)) avoids an IndexError when the loaded file has fewer than
# 100 entries; for longer files the printed list is the same.
print([entry['title'] for entry in data[:100]])


[u'University_of_Notre_Dame', u'Beyonc\xe9', u'Montana', u'Genocide', u'Antibiotics', u'Fr\xe9d\xe9ric_Chopin', u'Sino-Tibetan_relations_during_the_Ming_dynasty', u'IPod', u'The_Legend_of_Zelda:_Twilight_Princess', u'Spectre_(2015_film)', u'2008_Sichuan_earthquake', u'New_York_City', u'To_Kill_a_Mockingbird', u'Solar_energy', u'Tajikistan', u'Anthropology', u'Portugal', u'Kanye_West', u'Buddhism', u'American_Idol', u'Dog', u'2008_Summer_Olympics_torch_relay', u'Alfred_North_Whitehead', u'Financial_crisis_of_2007%E2%80%9308', u'Saint_Barth%C3%A9lemy', u'Genome', u'Comprehensive_school', u'Republic_of_the_Congo', u'Prime_minister', u'Institute_of_technology', u'Wayback_Machine', u'Dutch_Republic', u'Symbiosis', u'Canadian_Armed_Forces', u'Cardinal_(Catholicism)', u'Iranian_languages', u'Lighting', u'Separation_of_powers_under_the_United_States_Constitution', u'Architecture', u'Human_Development_Index', u'Southern_Europe', u'BBC_Television', u'Arnold_Schwarzenegger', u'Plymouth', u'Heresy', u'Warsaw_Pact', u'Materialism', u'Space_Race', u'Pub', u'Christian', u'Sony_Music_Entertainment', u'Oklahoma_City', u'Hunter-gatherer', u'United_Nations_Population_Fund', u'Russian_Soviet_Federative_Socialist_Republic', u'Universal_Studios', u'Alexander_Graham_Bell', u'Internet_service_provider', u'Comics', u'Saint_Helena', u'Aspirated_consonant', u'Hydrogen', u'Web_browser', u'Boston', u'BeiDou_Navigation_Satellite_System', u'Canon_law', u'Communications_in_Somalia', u'Catalan_language', u'Estonian_language', u'Paper', u'Arena_Football_League', u'Adult_contemporary_music', u'Matter', u'Westminster_Abbey', u'Nanjing', u'Bern', u'Daylight_saving_time', u'Royal_Institute_of_British_Architects', u'National_Archives_and_Records_Administration', u'Tristan_da_Cunha', u'University_of_Kansas', u'Political_corruption', u'Dialect', u'Classical_music', u'Slavs', u'Southampton', u'Treaty', u'Josip_Broz_Tito', u'Marshall_Islands', u'Szlachta', u'Virgil', u'Alps', u'Gene', u'Guinea-Bissau', 
u'List_of_numbered_streets_in_Manhattan', u'Brain', u'Near_East', u'Zhejiang', u'Ministry_of_Defence_(United_Kingdom)', u'High-definition_television']

In [36]:
# Fields present on a single entry (inspecting the first one).
data[0].keys()


Out[36]:
[u'paragraphs', u'title']

In [40]:
# Number of paragraphs in the first entry.
len(data[0]['paragraphs'])


Out[40]:
55

In [41]:
# Fields present on a single paragraph (first paragraph of the first entry).
data[0]['paragraphs'][0].keys()


Out[41]:
[u'qas', u'context']

In [42]:
# Context text of the first paragraph of the first entry.
data[0]['paragraphs'][0]['context']


Out[42]:
u'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [46]:
# 'qas' list of the first paragraph of entry 303: one record per question,
# each carrying 'answers', 'id' and 'question'.
data[303]['paragraphs'][0]['qas']


Out[46]:
[{u'answers': [{u'answer_start': 36, u'text': u'Asgaya gigageyi'}],
  u'id': u'5727cbc3ff5b5019007d9564',
  u'question': u'What is the Cherokee name for Oklahoma?'},
 {u'answers': [{u'answer_start': 106, u'text': u'Uukuhu\xfawa'}],
  u'id': u'5727cbc3ff5b5019007d9565',
  u'question': u'What is the Pawnee name for Oklahoma?'},
 {u'answers': [{u'answer_start': 125, u'text': u'Gahnawiyo\u02c0geh'}],
  u'id': u'5727cbc3ff5b5019007d9566',
  u'question': u'What is the Cayuga name for Oklahoma?'},
 {u'answers': [{u'answer_start': 239, u'text': u'28th'}],
  u'id': u'5727cbc3ff5b5019007d9567',
  u'question': u'Where does Oklahoma rank by population?'},
 {u'answers': [{u'answer_start': 417, u'text': u'The Sooner State'}],
  u'id': u'5727cbc3ff5b5019007d9568',
  u'question': u"What is Oklahoma's nickname?"}]

In [44]:
# 'qas' list of the first paragraph of entry 10.
data[10]['paragraphs'][0]['qas']


Out[44]:
[{u'answers': [{u'answer_start': 4, u'text': u'2008'}],
  u'id': u'56cdca7862d2951400fa6826',
  u'question': u'In what year did the earthquake in Sichuan occur?'},
 {u'answers': [{u'answer_start': 31,
    u'text': u'the Great Sichuan earthquake'}],
  u'id': u'56cdca7862d2951400fa6827',
  u'question': u'What was the earthquake named?'},
 {u'answers': [{u'answer_start': 206, u'text': u'69,197'}],
  u'id': u'56cdca7862d2951400fa6828',
  u'question': u'How many people were killed as a result?'},
 {u'answers': [{u'answer_start': 4, u'text': u'2008'}],
  u'id': u'56d4f9902ccc5a1400d833c0',
  u'question': u'What year did the Sichuan earthquake take place?'},
 {u'answers': [{u'answer_start': 73, u'text': u'8.0 Ms and 7.9 Mw'}],
  u'id': u'56d4f9902ccc5a1400d833c1',
  u'question': u'What did the quake measure?'},
 {u'answers': [{u'answer_start': 171, u'text': u'May 12'}],
  u'id': u'56d4f9902ccc5a1400d833c2',
  u'question': u'What day did the earthquake occur?'},
 {u'answers': [{u'answer_start': 108,
    u'text': u'02:28:01 PM China Standard Time'}],
  u'id': u'56d4f9902ccc5a1400d833c3',
  u'question': u'What time of the day did the quake happen?'},
 {u'answers': [{u'answer_start': 206, u'text': u'69,197'}],
  u'id': u'56d4f9902ccc5a1400d833c4',
  u'question': u'How many people died?'}]

In [45]:
# 'qas' list of the paragraph at index 3 of entry 100.
data[100]['paragraphs'][3]['qas']


Out[45]:
[{u'answers': [{u'answer_start': 83, u'text': u'construction material'}],
  u'id': u'56f9e9738f12f3190062ffe7',
  u'question': u'What has been the primary purpose of wood for millennia other than fuel?'},
 {u'answers': [{u'answer_start': 116, u'text': u'houses'}],
  u'id': u'56f9e9738f12f3190062ffe8',
  u'question': u'What is often made of wood that provides shelter to people?'},
 {u'answers': [{u'answer_start': 213, u'text': u'tree-ring widths'}],
  u'id': u'56f9e9738f12f3190062ffe9',
  u'question': u'For clues about the climate of a place, we can look at variation in isotopic abundances or what other thing?'},
 {u'answers': [{u'answer_start': 131, u'text': u'weapons'}],
  u'id': u'56f9e9738f12f3190062ffea',
  u'question': u'What could be made out of wood that could be used to fight off attackers?'},
 {u'answers': [{u'answer_start': 140, u'text': u'furniture'}],
  u'id': u'56f9e9738f12f3190062ffeb',
  u'question': u'What category of items often constructed from wood does a chair belong to?'}]

Each article (topic) contains several paragraphs; each paragraph holds a `context` passage and a list of `qas` (question/answer) entries.


In [ ]: