In [1]:
import json

# NOTE: machine-specific absolute path; adjust it to wherever the data file lives locally.
aug_data_path = "/Users/minjoons/data/squad/train-v1.0-aug.json"
# Open through a context manager so the file handle is closed as soon as parsing
# finishes, and read as UTF-8 (the JSON interchange encoding) instead of relying
# on the platform's default encoding.
with open(aug_data_path, 'r', encoding='utf-8') as aug_file:
    aug_data = json.load(aug_file)

In [2]:
def _sentence_words(deps):
    """Return one word list per sentence parse; a ``None`` parse gives ``[]``."""
    wordss = []
    for dep in deps:
        if dep is None:
            # Check for a missing parse BEFORE unpacking; unpacking None raises TypeError.
            wordss.append([])
            continue
        nodes, _edges = dep
        # The first field of every node is the word itself.
        wordss.append([node[0] for node in nodes])
    return wordss


def _span_words(wordss, word_start, word_stop):
    """Collect the words between two (sentence index, word index) positions.

    The stop position is exclusive, matching the slice used for spans that
    stay inside one sentence.
    """
    start_sent, start_word = word_start[0], word_start[1]
    stop_sent, stop_word = word_stop[0], word_stop[1]
    if start_sent == stop_sent:
        return wordss[start_sent][start_word:stop_word]
    # The span crosses sentence boundaries: tail of the first sentence, every
    # sentence strictly in between, then the head of the last sentence.
    words = list(wordss[start_sent][start_word:])
    for sent_idx in range(start_sent + 1, stop_sent):
        words.extend(wordss[sent_idx])
    words.extend(wordss[stop_sent][:stop_word])
    return words


def compare_answers(data=None):
    """Yield ``(answer_words, text)`` for every answer that has word indices.

    ``answer_words`` is rebuilt from the paragraph's dependency-parse nodes
    using the answer's ``answer_word_start`` / ``answer_word_stop`` positions,
    so it can be compared against the annotated answer ``text``.

    Args:
        data: Optional dataset dict with the same layout as ``aug_data``
            (``data -> paragraphs -> deps / qas -> answers``). When omitted,
            the module-level ``aug_data`` is used, as before.

    Yields:
        Tuples ``(list_of_words, answer_text)``. Answers whose start or stop
        position is ``None`` are skipped because there is no span to rebuild.
    """
    if data is None:
        data = aug_data
    for article in data['data']:
        for para in article['paragraphs']:
            wordss = _sentence_words(para['deps'])
            for qa in para['qas']:
                for answer in qa['answers']:
                    word_start = answer['answer_word_start']
                    word_stop = answer['answer_word_stop']
                    if word_start is None or word_stop is None:
                        # No word-level position recorded for this answer.
                        continue
                    yield _span_words(wordss, word_start, word_stop), answer['text']

# Pull the first four (answer_words, text) pairs from the generator and print
# them so the rebuilt word list can be compared by eye with the answer text.
ca = compare_answers()
print(next(ca))
print(next(ca))
print(next(ca))
print(next(ca))


(['Saint', 'Bernadette', 'Soubirous'], 'Saint Bernadette Soubirous')
(['a', 'copper', 'statue', 'of', 'Christ'], 'a copper statue of Christ')
(['the', 'Main', 'Building'], 'the Main Building')
(['a', 'Marian', 'place', 'of', 'prayer', 'and', 'reflection'], 'a Marian place of prayer and reflection')

In [11]:
def nodep_counter():
    """Report sentences and questions that have no dependency parse.

    Scans the module-level ``aug_data``. For each paragraph, every sentence
    whose matching entry in ``deps`` is ``None`` is printed with an ``x:``
    prefix, and every question whose ``dep`` is ``None`` is printed with a
    ``q:`` prefix. The final line printed is the two totals (sentences first,
    then questions).

    Returns:
        None. All results go to stdout.
    """
    missing_sents = 0
    missing_questions = 0
    # Flatten articles into one lazy stream of paragraphs.
    paragraphs = (
        paragraph
        for article in aug_data['data']
        for paragraph in article['paragraphs']
    )
    for paragraph in paragraphs:
        parses = paragraph['deps']
        for sentence, parse in zip(paragraph['sents'], parses):
            if parse is not None:
                continue
            print("x:", sentence)
            missing_sents += 1
        for question in paragraph['qas']:
            if question['dep'] is not None:
                continue
            print("q:", question['question'])
            missing_questions += 1
    print(missing_sents, missing_questions)
# Run the scan: prints each unparsed sentence ("x:") and question ("q:"),
# followed by the two totals.
nodep_counter()


x: .
x: .
x: .
x: .
x: .
x: .
x: .
x: .
q: k
q: j
q: n
q: b
q: v
x: .
x: :208
x: .
x: .
x: .
x: .
x: .
x: .
x: .
x: .
x: .
x: .
x: .
q: dd
q: dd
q: dd
q: dd
q: d
x: .
x: .
x: .
x: .
x: .
x: .
x: .
x: .
x: :411
x: .
x: .
x: .
x: .
x: .
x: .
x: :40
x: .
x: *
x: :14
x: .
x: .
x: .
x: :131
x: .
x: .
x: .
x: .
x: .
x: .
x: .
x: .
x: .
53 10

In [4]:
def bad_node_counter():
    """Print how many dependency-parse nodes do not have exactly five fields.

    Walks every paragraph of the module-level ``aug_data``. Sentences whose
    parse is ``None`` are skipped; every other parse is unpacked as a
    ``(nodes, edges)`` pair and only the nodes are inspected.

    Only the ``deps`` entry of each paragraph is read, so paragraphs are not
    required to carry any other key.

    Returns:
        None. The count is written to stdout.
    """
    count = 0
    for article in aug_data['data']:
        for para in article['paragraphs']:
            for dep in para['deps']:
                if dep is None:
                    continue
                nodes, _edges = dep
                count += sum(1 for node in nodes if len(node) != 5)
    print(count)
# Print the number of parse nodes whose length is not 5.
bad_node_counter()


0

In [5]:
def noanswer_counter():
    """Print how many answers have ``answer_word_start`` set to ``None``.

    Walks every answer of every question in the module-level ``aug_data``.
    Only the ``answer_word_start`` field is needed for the count, so no
    per-sentence word lists are built and no other answer field is read.

    Returns:
        None. The count is written to stdout.
    """
    count = 0
    for article in aug_data['data']:
        for para in article['paragraphs']:
            for qa in para['qas']:
                for answer in qa['answers']:
                    if answer['answer_word_start'] is None:
                        count += 1
    print(count)
# Print the number of answers whose 'answer_word_start' is None.
noanswer_counter()


36

In [14]:
def mult_sent_answer_counter():
    """Print how many answers start and stop in different sentences.

    Walks every answer in the module-level ``aug_data`` and compares the first
    component of ``answer_word_start`` with the first component of
    ``answer_word_stop``. Answers where either position is ``None`` cannot be
    compared and are not counted.

    Returns:
        None. The count is written to stdout.
    """
    count = 0
    for article in aug_data['data']:
        for para in article['paragraphs']:
            for qa in para['qas']:
                for answer in qa['answers']:
                    word_start = answer['answer_word_start']
                    word_stop = answer['answer_word_stop']
                    if word_start is None or word_stop is None:
                        # No position recorded; nothing to compare.
                        continue
                    if word_start[0] != word_stop[0]:
                        count += 1
    print(count)
# Print the number of answers whose start and stop fall in different sentences.
mult_sent_answer_counter()


106

In [ ]:


In [ ]:


In [ ]: