In [1]:
import json
# Location of the augmented SQuAD training file.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
aug_data_path = "/Users/minjoons/data/squad/train-v1.0-aug.json"

# Load the whole file once; the context manager closes the handle as soon as
# parsing finishes, and the explicit encoding avoids relying on the platform default.
with open(aug_data_path, 'r', encoding='utf-8') as aug_data_file:
    aug_data = json.load(aug_data_file)
In [2]:
def compare_answers(data=None):
    """Yield ``(answer_words, text)`` pairs for each located answer.

    For every paragraph, a per-sentence list of node first-fields is built from
    ``para['deps']`` (presumably the token text of each node — TODO confirm),
    and each answer's word span is sliced out of it so it can be compared with
    the raw answer ``text``.

    Args:
        data: Dataset dict with a ``'data'`` list of articles. When omitted,
            the module-level ``aug_data`` is used.

    Yields:
        tuple: ``(answer_words, text)`` where ``answer_words`` is the slice
        ``[word_start[1]:word_stop[1]]`` of sentence ``word_start[0]`` and
        ``text`` is ``answer['text']``.
    """
    if data is None:
        data = aug_data
    for article in data['data']:
        for para in article['paragraphs']:
            # One word list per sentence. A sentence whose dep entry is None
            # gets an empty list so sentence indices stay aligned with 'deps'.
            wordss = []
            for dep in para['deps']:
                if dep is None:
                    wordss.append([])
                else:
                    # Unpack only after the None check; unpacking None raises.
                    nodes, _edges = dep
                    wordss.append([node[0] for node in nodes])
            for qa in para['qas']:
                for answer in qa['answers']:
                    word_start = answer['answer_word_start']
                    word_stop = answer['answer_word_stop']
                    if word_start is None:
                        # No word-level span recorded for this answer; there is
                        # nothing to slice, so skip it instead of crashing.
                        continue
                    # NOTE(review): only the start sentence is sliced, so an
                    # answer whose start and stop sentence indices differ will
                    # not be reproduced in full — confirm whether that matters.
                    answer_words = wordss[word_start[0]][word_start[1]:word_stop[1]]
                    yield answer_words, answer['text']
# Peek at the first four (answer_words, text) pairs produced by the generator.
ca = compare_answers()
for _peek in range(4):
    print(next(ca))
In [11]:
def nodep_counter(data=None):
    """Print every sentence and question whose dependency entry is None.

    Each paragraph sentence with a ``None`` entry in ``para['deps']`` is
    printed with an ``x:`` prefix, and each question whose ``qa['dep']`` is
    ``None`` is printed with a ``q:`` prefix. After the scan, the two totals
    are printed on one line (sentences first, then questions).

    Args:
        data: Dataset dict with a ``'data'`` list of articles. When omitted,
            the module-level ``aug_data`` is used.

    Returns:
        None. Results are reported via ``print`` only.
    """
    if data is None:
        data = aug_data
    x_count = 0
    q_count = 0
    for article in data['data']:
        for para in article['paragraphs']:
            # Sentences and their dep entries are paired positionally.
            for sent, dep in zip(para['sents'], para['deps']):
                if dep is None:
                    print("x:", sent)
                    x_count += 1
            for qa in para['qas']:
                if qa['dep'] is None:
                    print("q:", qa['question'])
                    q_count += 1
    print(x_count, q_count)
# Run the scan with no arguments, so it reads the module-level aug_data.
nodep_counter()
In [4]:
def bad_node_counter(data=None, expected_len=5):
    """Print how many dependency nodes do not have the expected number of fields.

    Every non-None entry of ``para['deps']`` is unpacked as ``(nodes, edges)``
    and each node whose ``len`` differs from ``expected_len`` is counted.
    Entries that are ``None`` are skipped.

    Args:
        data: Dataset dict with a ``'data'`` list of articles. When omitted,
            the module-level ``aug_data`` is used.
        expected_len: Number of fields a well-formed node must have.
            Defaults to 5, the value this check has always used.

    Returns:
        None. The total is reported via ``print`` only.
    """
    if data is None:
        data = aug_data
    count = 0
    for article in data['data']:
        for para in article['paragraphs']:
            for dep in para['deps']:
                if dep is None:
                    continue
                nodes, _edges = dep
                count += sum(1 for node in nodes if len(node) != expected_len)
    print(count)
# Run the node-length check with no arguments, so it reads the module-level aug_data.
bad_node_counter()
In [5]:
def noanswer_counter(data=None):
    """Print how many answers have no recorded word-level start position.

    An answer is counted when its ``'answer_word_start'`` value is ``None``.
    Only the ``'qas'`` / ``'answers'`` entries are inspected; the per-sentence
    word lists are not needed for this count and are no longer built.

    Args:
        data: Dataset dict with a ``'data'`` list of articles. When omitted,
            the module-level ``aug_data`` is used.

    Returns:
        None. The total is reported via ``print`` only.
    """
    if data is None:
        data = aug_data
    count = 0
    for article in data['data']:
        for para in article['paragraphs']:
            for qa in para['qas']:
                for answer in qa['answers']:
                    if answer['answer_word_start'] is None:
                        count += 1
    print(count)
# Run the count with no arguments, so it reads the module-level aug_data.
noanswer_counter()
In [14]:
def mult_sent_answer_counter(data=None):
    """Print how many answers start and stop in different sentences.

    An answer is counted when its ``'answer_word_start'`` is not ``None`` and
    the first element of ``'answer_word_start'`` differs from the first element
    of ``'answer_word_stop'`` (presumably the sentence index — TODO confirm).

    Args:
        data: Dataset dict with a ``'data'`` list of articles. When omitted,
            the module-level ``aug_data`` is used.

    Returns:
        None. The total is reported via ``print`` only.
    """
    if data is None:
        data = aug_data
    count = 0
    for article in data['data']:
        for para in article['paragraphs']:
            for qa in para['qas']:
                for answer in qa['answers']:
                    word_start = answer['answer_word_start']
                    word_stop = answer['answer_word_stop']
                    # Answers without a recorded start are ignored here.
                    if word_start is not None and word_start[0] != word_stop[0]:
                        count += 1
    print(count)
# Run the count with no arguments, so it reads the module-level aug_data.
mult_sent_answer_counter()
In [ ]:
In [ ]:
In [ ]: