In [1]:
import json

# Path to the JSON file analysed by the cells below (machine-specific absolute path).
aug_data_path = "/Users/minjoons/data/squad/dev-v1.0-aug.json"
# Use a context manager so the file handle is closed as soon as parsing is done,
# and read explicitly as UTF-8 (the standard JSON encoding) instead of relying on
# the platform default.
with open(aug_data_path, 'r', encoding='utf-8') as aug_data_file:
    aug_data = json.load(aug_data_file)

In [17]:
def compare_answers(data=None):
    """Yield ``(answer_words, text)`` pairs for every answer in the dataset.

    For each paragraph, one word list is built per entry of ``para['deps']``
    by taking element 0 of every node in that entry. Each answer's
    ``answer_word_start`` / ``answer_word_stop`` are then used to slice the
    matching words so they can be compared with the raw ``answer['text']``.

    Args:
        data: Dataset dict with a top-level ``'data'`` list of articles.
            Defaults to the notebook-level ``aug_data``.

    Yields:
        tuple: ``(answer_words, text)``. ``answer_words`` is the list of words
        selected by the answer's word span, or ``[]`` when the answer carries
        no word span (start or stop is ``None``). ``text`` is the answer
        string.
    """
    if data is None:
        data = aug_data
    for article in data['data']:
        for para in article['paragraphs']:
            wordss = []
            for dep in para['deps']:
                # A parse entry may be None; test for that *before* unpacking,
                # otherwise the tuple-unpack raises TypeError.
                if dep is None:
                    wordss.append([])
                else:
                    nodes, _edges = dep
                    wordss.append([node[0] for node in nodes])
            for qa in para['qas']:
                for answer in qa['answers']:
                    text = answer['text']
                    word_start = answer['answer_word_start']
                    word_stop = answer['answer_word_stop']
                    if word_start is None or word_stop is None:
                        # No word-level alignment for this answer: still emit
                        # one pair per answer, with an empty word list.
                        yield [], text
                        continue
                    # Index 0 of the start selects the word list, index 1 of
                    # start/stop bounds the slice.
                    # NOTE(review): word_stop[0] is ignored, so an answer that
                    # ends in a different entry than it starts would be cut
                    # incorrectly -- confirm such answers cannot occur.
                    answer_words = wordss[word_start[0]][word_start[1]:word_stop[1]]
                    yield answer_words, text

# Spot check: create the generator and print its first four
# (answer_words, text) pairs.
ca = compare_answers()
print(next(ca))
print(next(ca))
print(next(ca))
print(next(ca))


(['Denver', 'Broncos'], 'Denver Broncos')
(['Denver', 'Broncos'], 'Denver Broncos')
(['Denver', 'Broncos'], 'Denver Broncos ')
(['Carolina', 'Panthers'], 'Carolina Panthers')

In [18]:
def counter():
    """Print how many entries of ``para['deps']`` across ``aug_data`` are ``None``."""
    missing = sum(
        1
        for article in aug_data['data']
        for para in article['paragraphs']
        for dep in para['deps']
        if dep is None
    )
    print(missing)
# Report the number of dependency-parse entries that are None.
counter()


8

In [19]:
def bad_node_counter(expected_len=5, data=None):
    """Print how many dependency nodes do not have ``expected_len`` elements.

    Every non-``None`` entry of ``para['deps']`` is unpacked as
    ``(nodes, edges)`` and each node's length is compared with
    ``expected_len``.

    Args:
        expected_len: Number of elements a well-formed node must have.
            Defaults to 5, the value this check was originally written for.
        data: Dataset dict with a top-level ``'data'`` list of articles.
            Defaults to the notebook-level ``aug_data``.
    """
    if data is None:
        data = aug_data
    count = 0
    for article in data['data']:
        for para in article['paragraphs']:
            for dep in para['deps']:
                if dep is None:
                    # Missing parse: nothing to validate.
                    continue
                nodes, _edges = dep
                count += sum(1 for node in nodes if len(node) != expected_len)
    print(count)
# Report the number of dependency nodes whose length is not 5.
bad_node_counter()


0

In [20]:
def noanswer_counter(data=None):
    """Print how many answers have ``answer_word_start`` set to ``None``.

    Only the answers themselves are inspected; the dependency parses are not
    needed for this count, so they are no longer unpacked per paragraph.

    Args:
        data: Dataset dict with a top-level ``'data'`` list of articles.
            Defaults to the notebook-level ``aug_data``.
    """
    if data is None:
        data = aug_data
    count = sum(
        1
        for article in data['data']
        for para in article['paragraphs']
        for qa in para['qas']
        for answer in qa['answers']
        if answer['answer_word_start'] is None
    )
    print(count)
# Report the number of answers without a word-level start position.
noanswer_counter()


7

In [22]:
# Total number of 'qas' entries over all paragraphs of all articles.
print(sum(
    len(para['qas'])
    for article in aug_data['data']
    for para in article['paragraphs']
))


10600

In [5]:
import nltk

def _set_span(t, i):
    if isinstance(t[0], str):
        t.span = (i, i+len(t))
    else:
        first = True
        for c in t:
            cur_span = _set_span(c, i)
            i = cur_span[1]
            if first:
                min_ = cur_span[0]
                first = False
        max_ = cur_span[1]
        t.span = (min_, max_)
    return t.span


def set_span(t):
    """Annotate ``t`` and all of its subtrees with ``span`` attributes.

    Spans are computed by ``_set_span`` starting from position 0.

    Args:
        t: An ``nltk.tree.Tree`` instance.

    Returns:
        The ``(start, stop)`` span assigned to ``t`` itself.

    Raises:
        AssertionError: If ``t`` is not an ``nltk.tree.Tree``.
        Exception: Whatever ``_set_span`` raised, re-raised unchanged after
            the offending tree has been printed.
    """
    assert isinstance(t, nltk.tree.Tree)
    try:
        return _set_span(t, 0)
    except Exception:
        # Show the tree that could not be processed, then let the original
        # error propagate. The previous bare ``except:`` + ``exit()`` also
        # swallowed KeyboardInterrupt, hid the traceback and shut down the
        # interpreter/kernel instead of reporting the failure.
        print(t)
        raise

def same_span_counter():
    """Print how many trees parsed from ``para['consts']`` contain at least
    two subtrees that share the same span."""
    count = 0
    for article in aug_data['data']:
        for para in article['paragraphs']:
            for const in para['consts']:
                tree = nltk.tree.Tree.fromstring(const)
                set_span(tree)
                # One span per subtree; duplicates collapse in the set.
                spans = [subtree.span for subtree in tree.subtrees()]
                if len(spans) > len(set(spans)):
                    count += 1
    print(count)
# Report the number of parsed trees in which two or more subtrees share a span.
same_span_counter()


10348

In [ ]: