In [1]:
import json
# NOTE(review): absolute, machine-specific path; judging by the file name this
# is presumably an augmented SQuAD dev set — adjust to your local copy.
aug_data_path = "/Users/minjoons/data/squad/dev-v1.0-aug.json"
# Use a context manager so the file handle is closed as soon as parsing is done
# (a bare open() leaves it open until garbage collection), and read the JSON
# explicitly as UTF-8 instead of relying on the platform default encoding.
with open(aug_data_path, 'r', encoding='utf-8') as aug_data_file:
    aug_data = json.load(aug_data_file)
In [17]:
def compare_answers():
    """Yield ``(answer_words, text)`` for every answer in the global ``aug_data``.

    For each paragraph, the first element of every node of every entry in
    ``para['deps']`` is collected, one list per entry; an entry that is
    ``None`` contributes an empty list. Each answer's ``answer_word_start`` and
    ``answer_word_stop`` are then used to slice those per-entry word lists so
    the sliced words can be compared with the answer's raw ``text``.

    Yields:
        tuple: ``(answer_words, text)`` where ``answer_words`` is a list sliced
        from the collected words and ``text`` is ``answer['text']``.
    """
    for article in aug_data['data']:
        for para in article['paragraphs']:
            nodess = []
            for dep in para['deps']:
                # Check for None *before* unpacking: unpacking a missing parse
                # raises TypeError, which made the empty-list fallback
                # unreachable.
                if dep is None:
                    nodess.append([])
                else:
                    nodes, _edges = dep
                    nodess.append(nodes)
            wordss = [[node[0] for node in nodes] for nodes in nodess]
            for qa in para['qas']:
                for answer in qa['answers']:
                    text = answer['text']
                    word_start = answer['answer_word_start']
                    word_stop = answer['answer_word_stop']
                    # Only word_start[0] selects the word list, so the answer is
                    # presumably expected to lie within a single sentence —
                    # TODO confirm.
                    # NOTE(review): a None word_start would fail here; verify
                    # against the data before iterating the whole set.
                    answer_words = wordss[word_start[0]][word_start[1]:word_stop[1]]
                    yield answer_words, text
# Peek at the first four (answer_words, text) pairs produced by the generator.
ca = compare_answers()
for _sample in range(4):
    print(next(ca))
In [18]:
def counter():
    """Print how many entries of ``para['deps']`` are ``None`` across ``aug_data``.

    Walks every paragraph of every article in the global ``aug_data`` and
    prints the total once; nothing is returned.
    """
    missing = sum(
        1
        for article in aug_data['data']
        for para in article['paragraphs']
        for dep in para['deps']
        if dep is None
    )
    print(missing)
# Run the count over the loaded dataset; the total is printed by counter().
counter()
In [19]:
def bad_node_counter(expected_len=5):
    """Print how many nodes do not have exactly ``expected_len`` elements.

    Iterates over every non-``None`` entry of ``para['deps']`` in the global
    ``aug_data``, unpacks it as ``(nodes, edges)`` and counts the nodes whose
    ``len`` differs from ``expected_len``. The total is printed once; nothing
    is returned.

    Args:
        expected_len: Number of elements a well-formed node must have.
            Defaults to 5, the value that was previously hard-coded.
    """
    count = 0
    for article in aug_data['data']:
        for para in article['paragraphs']:
            # Only 'deps' is needed here; the former unused para['sents']
            # lookup made the check fail on paragraphs without that key.
            for dep in para['deps']:
                if dep is None:
                    continue
                nodes, _edges = dep
                count += sum(1 for node in nodes if len(node) != expected_len)
    print(count)
# Run the node-length check over the loaded dataset; the total is printed.
bad_node_counter()
In [20]:
def noanswer_counter():
    """Print how many answers have ``answer_word_start`` set to ``None``.

    Walks every answer of every question of every paragraph in the global
    ``aug_data`` and prints the total once; nothing is returned.
    """
    # The per-paragraph word lists that used to be rebuilt here were never
    # read, so only the 'qas' entries are traversed now.
    count = 0
    for article in aug_data['data']:
        for para in article['paragraphs']:
            for qa in para['qas']:
                for answer in qa['answers']:
                    if answer['answer_word_start'] is None:
                        count += 1
    print(count)
# Run the missing-answer-position count over the loaded dataset; the total is printed.
noanswer_counter()
In [22]:
# Total number of 'qas' entries summed over every paragraph of every article.
print(sum(
    len(paragraph['qas'])
    for article in aug_data['data']
    for paragraph in article['paragraphs']
))
In [5]:
import nltk
def _set_span(t, i):
if isinstance(t[0], str):
t.span = (i, i+len(t))
else:
first = True
for c in t:
cur_span = _set_span(c, i)
i = cur_span[1]
if first:
min_ = cur_span[0]
first = False
max_ = cur_span[1]
t.span = (min_, max_)
return t.span
def set_span(t):
    """Annotate ``t`` and its subtrees with ``span`` attributes, starting at 0.

    Args:
        t: An ``nltk.tree.Tree``; anything else fails the assertion.

    Returns:
        The ``(start, end)`` span assigned to ``t`` by ``_set_span``.

    Raises:
        Exception: Whatever ``_set_span`` raised; the offending tree is
            printed first so it can be inspected.
    """
    assert isinstance(t, nltk.tree.Tree)
    try:
        return _set_span(t, 0)
    except Exception:
        # Show the tree that could not be processed, then let the original
        # error propagate. The previous bare ``except:`` + ``exit()`` also
        # trapped KeyboardInterrupt, discarded the traceback and terminated
        # the interpreter/kernel.
        print(t)
        raise
def same_span_counter():
    """Print how many parse strings contain subtrees that share a span.

    Every string in ``para['consts']`` of the global ``aug_data`` is parsed
    with ``nltk.tree.Tree.fromstring`` and annotated via ``set_span``. A tree
    is counted when it has more subtrees than distinct ``span`` values. The
    total is printed once; nothing is returned.
    """
    duplicated = 0
    for article in aug_data['data']:
        for para in article['paragraphs']:
            for const in para['consts']:
                tree = nltk.tree.Tree.fromstring(const)
                set_span(tree)
                subtrees = list(tree.subtrees())
                distinct_spans = {sub.span for sub in subtrees}
                if len(subtrees) > len(distinct_spans):
                    duplicated += 1
    print(duplicated)
# Run the shared-span count over the loaded dataset; the total is printed.
same_span_counter()
In [ ]: