In [1]:
import os
import json
import requests
from load_from_wiki import load_data

In [2]:
def processing_squad_data(dataset):
    """Flatten a SQuAD-style dataset into one record per paragraph.

    Parameters
    ----------
    dataset : dict
        Parsed SQuAD JSON. Only ``dataset['data'][i]['paragraphs'][j]`` with
        its ``'context'`` string and ``'qas'`` list (each item carrying a
        ``'question'`` string) is read.

    Returns
    -------
    list of dict
        One dict per paragraph, in dataset order, with two keys:
        ``'Passages'`` - the paragraph context, lower-cased;
        ``'Question'`` - the paragraph's distinct questions (original casing),
        in order of first appearance.
    """
    raw_data = []
    for article in dataset['data']:
        for paragraph in article['paragraphs']:
            # De-duplicate while keeping first-seen order. Going through a
            # plain set() would make the order depend on string hashing,
            # which differs between interpreter runs, so the generated JSON
            # files would not be reproducible.
            seen = set()
            unique_questions = []
            for qa in paragraph['qas']:
                question = qa['question']
                if question not in seen:
                    seen.add(question)
                    unique_questions.append(question)
            raw_data.append({
                'Passages': paragraph['context'].lower(),
                'Question': unique_questions,
            })
    return raw_data

In [3]:
def combine_squad_dev_train():
    """Write ``../data/squad_data.json`` from the SQuAD v1.1 dev and train files.

    Each source file is parsed and passed through ``processing_squad_data``.
    The dev records come first, followed by the train records, and the
    combined list is dumped as JSON. Returns ``None``.
    """
    combined_records = []
    # Order matters: dev first, then train.
    for source_path in ('../data/dev-v1.1.json', '../data/train-v1.1.json'):
        with open(source_path) as source_file:
            parsed = json.load(source_file)
            combined_records.extend(processing_squad_data(parsed))
    with open('../data/squad_data.json', 'w') as outfile:
        json.dump(combined_records, outfile)

In [1]:
def combine_squad_data():
    """Collapse the SQuAD v1.1 dev and train sets into one record and save it.

    Reads ``../data/dev-v1.1.json`` and ``../data/train-v1.1.json``, runs each
    through ``processing_squad_data`` and writes a one-element JSON list to
    ``../data/combined_squad_data.json``. The single record has two keys:

    ``'Paragraph'``
        Every passage (dev first, then train) concatenated into one string.
    ``'Question'``
        A flat list of all questions, in the same order.

    Returns ``None``; the result is only available through the written file.
    """
    passages = []
    questions = []
    for source_path in ('../data/dev-v1.1.json', '../data/train-v1.1.json'):
        with open(source_path) as data_file:
            dataset = json.load(data_file)
        for record in processing_squad_data(dataset):
            passages.append(record['Passages'])
            questions.extend(record['Question'])

    # NOTE(review): passages are joined with no separator, so the last word of
    # one passage runs straight into the first word of the next. Kept as-is
    # because existing consumers of the file may rely on it - confirm.
    combined_record = {
        'Paragraph': ''.join(passages),
        'Question': questions,
    }
    with open('../data/combined_squad_data.json', 'w') as outfile:
        json.dump([combined_record], outfile)

In [4]:
def merge_file():
    """Fuse ``squad_data.json`` and ``wiki_data.json`` into ``squad_wiki_data.json``.

    Both inputs are read from ``../data/`` and are iterated as sequences of
    records. Passages from the SQuAD records, then from the wiki records, are
    concatenated into a single string; their question lists are flattened in
    the same order. The output file holds a one-element list containing a dict
    with the keys ``'Paragraph'`` and ``'Question'``. Returns ``None``.
    """
    with open('../data/squad_data.json') as squad_file:
        squad_records = json.load(squad_file)
    with open('../data/wiki_data.json') as wiki_file:
        wiki_records = json.load(wiki_file)

    passages, questions = [], []
    # The two sources name their passage field differently:
    # 'Passages' for SQuAD records, 'Passage' for wiki records.
    sources = ((squad_records, 'Passages'), (wiki_records, 'Passage'))
    for records, passage_key in sources:
        for record in records:
            passages.append(record[passage_key])
            questions.extend(record['Question'])

    merged = [{'Paragraph': ''.join(passages), 'Question': questions}]
    with open('../data/squad_wiki_data.json', 'w') as outfile:
        json.dump(merged, outfile)

In [5]:
def _download_if_missing(url, path, message):
    """Fetch ``url`` into ``path`` unless ``path`` already exists; print ``message`` before fetching."""
    if os.path.isfile(path):
        return
    print(message)
    response = requests.get(url)
    # Fail loudly on an HTTP error instead of saving the error page as if it
    # were the dataset: a bogus file would satisfy the isfile() check above on
    # every later run and never be re-downloaded.
    response.raise_for_status()
    with open(path, "wb") as outfile:
        # The body is already fully in memory (the request is not streamed),
        # so write it in one call; iter_content() with its default chunk size
        # would hand it over one byte at a time.
        outfile.write(response.content)


def load_squad_wiki_data():
    """Make sure ``../data/squad_wiki_data.json`` exists, building it if needed.

    Does nothing when the merged file is already present. Otherwise each
    missing prerequisite is produced in turn:

    1. ``train-v1.1.json`` and ``dev-v1.1.json`` are downloaded from the
       SQuAD-explorer site;
    2. ``squad_data.json`` is generated by ``combine_squad_dev_train``;
    3. ``wiki_data.json`` is generated by ``load_data`` (imported from
       ``load_from_wiki``);

    and finally ``merge_file`` writes the merged file.

    Raises
    ------
    requests.HTTPError
        If a dataset download returns an HTTP error status.
    """
    if os.path.isfile("../data/squad_wiki_data.json"):
        return

    _download_if_missing(
        "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json",
        "../data/train-v1.1.json",
        "Loading Squad Training Data",
    )
    _download_if_missing(
        "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json",
        "../data/dev-v1.1.json",
        "Loading Squad Dev Data",
    )

    # Generate squad_data.json from the two raw files if it is not there yet.
    if not os.path.isfile("../data/squad_data.json"):
        print("Combining Squad Data")
        combine_squad_dev_train()

    # Generate wiki_data.json via the wiki loader if it is not there yet.
    if not os.path.isfile("../data/wiki_data.json"):
        print("Loading Wiki Data")
        load_data()

    merge_file()

In [2]:
def get_squad_wiki_data():
    """Return the parsed contents of ``../data/squad_wiki_data.json``.

    Calls ``load_squad_wiki_data`` first, then reads the merged file and
    returns whatever JSON value it holds.
    """
    print("Loading Squad Data")
    load_squad_wiki_data()
    with open("../data/squad_wiki_data.json", "r") as merged_file:
        return json.load(merged_file)

In [3]:
def get_squad_data():
    """Return the parsed contents of ``../data/combined_squad_data.json``.

    ``combine_squad_data`` is invoked on every call, with no check for an
    existing file, before the combined file is read back and returned.
    """
    print("Combining Squad Data")
    combine_squad_data()
    with open("../data/combined_squad_data.json", "r") as combined_file:
        return json.load(combined_file)

In [7]:
# data = get_squad_wiki_data()

In [8]:
# type(data[0]["Question"])

In [ ]:


In [ ]: