In [ ]:
import numpy as np
import json
import time
import argparse
import _pickle as pickle
from os import path
from tqdm import tqdm
import sys
from utils import CoreNLP_path
#from stanford_corenlp_pywrapper import CoreNLP
from gensim.models import KeyedVectors
from tokenizer import CoreNLPTokenizer
import multiprocessing
from multiprocessing import Pool
from multiprocessing.util import Finalize
from functools import partial
Using TensorFlow backend.
In [2]:
def word2vec(word2vec_path):
    """Load word2vec embeddings and return a lookup function for them.

    Args:
        word2vec_path: Path handed to ``KeyedVectors.load_word2vec_format``.

    Returns:
        A function ``get_word_vector(word)`` that returns ``model[word]``,
        or a zero vector of length ``model.vector_size`` when the lookup
        raises ``KeyError``.
    """
    model = KeyedVectors.load_word2vec_format(word2vec_path)

    def get_word_vector(word):
        """Return the embedding for ``word``, or zeros if it is unknown."""
        try:
            return model[word]
        except KeyError:
            # Unknown words map to an all-zero vector instead of failing.
            return np.zeros(model.vector_size)

    return get_word_vector
In [3]:
# Load the parsed SQuAD training samples from disk into `samples`.
print('Reading SQuAD data... ', end='')
# JSON text is UTF-8; state the encoding instead of relying on the locale default.
with open('../../data/train_parsed.json', encoding='utf-8') as fd:
    samples = json.load(fd)
# Finish the progress line started above (the recorded cell output ends in "Done!").
print('Done!')
Reading SQuAD data... Done!
In [4]:
# Create the module-level CoreNLP tokenizer instance.
print('Initiating CoreNLP service connection... ', end='')
# NOTE(review): the keyword below is spelled `annatators`; presumably
# `annotators` was intended. CoreNLPTokenizer is defined outside this
# notebook, so confirm how it treats unknown keywords before renaming.
tokenizer = CoreNLPTokenizer(
    classpath='/home/anatoly/stanford-corenlp-full-2017-06-09/*',
    annatators='pos, ner, lemma',
)
# Finish the progress line started above (the recorded cell output ends in "Done!").
print('Done!')
Initiating CoreNLP service connection... Done!
In [5]:
# Number of worker processes to use: the machine's CPU count when it can be
# determined, otherwise a small fixed fallback.
try:
    cpus = multiprocessing.cpu_count()
except NotImplementedError:
    cpus = 2  # arbitrary default
In [6]:
def chunks(l, n):
    """Yield successive n-sized chunks from l.

    Args:
        l: A sliceable sequence that supports ``len()``.
        n: Chunk size; must be at least 1.

    Yields:
        Consecutive slices ``l[i:i + n]``; the last one may be shorter.

    Raises:
        ValueError: If ``n`` is smaller than 1. Raised when iteration starts,
            because this function is a generator.
    """
    # A negative step would make range() yield nothing, silently dropping
    # every element; reject it with the same exception type range() already
    # raises for a zero step.
    if n < 1:
        raise ValueError('chunk size n must be at least 1, got %r' % (n,))
    for i in range(0, len(l), n):
        yield l[i:i + n]
In [12]:
class Tokenizer(object):
    """Tokenize a list of texts using a pool of worker processes.

    The input is split into chunks and each chunk is passed to `worker`,
    which builds its own CoreNLPTokenizer for that chunk.
    """

    # Default CoreNLP classpath; this is the value that was previously
    # hard-coded inside `worker`.
    DEFAULT_CLASSPATH = '/home/anatoly/stanford-corenlp-full-2017-06-09/*'

    def __init__(self, cpus, classpath=DEFAULT_CLASSPATH):
        """Store the pool configuration.

        Args:
            cpus: Number of worker processes; also used to size the chunks.
            classpath: Classpath passed to every CoreNLPTokenizer created by
                `worker`. Defaults to the previously hard-coded location.
        """
        self.cpus = cpus
        self.classpath = classpath

    def worker(self, arr):
        """Tokenize every item of one chunk with a new CoreNLPTokenizer.

        Args:
            arr: Iterable of items accepted by ``CoreNLPTokenizer.tokenize``.

        Returns:
            A list with one ``tokenize`` result per item, in input order.
        """
        t = CoreNLPTokenizer(classpath=self.classpath)
        return [t.tokenize(sample) for sample in arr]

    def tokenize(self, arr):
        """Tokenize ``arr`` in parallel and return the results in input order.

        Args:
            arr: Sequence of items to tokenize; must support ``len()`` and
                slicing.

        Returns:
            A flat list with one result per input item.
        """
        # Nothing to do: skip starting worker processes altogether.
        if len(arr) == 0:
            return []
        # round() gives 0 when there are fewer than cpus / 2 items, and a
        # zero chunk size is invalid, so clamp it to at least one item.
        chunk_size = max(1, round(len(arr) / self.cpus))
        chunked = chunks(arr, chunk_size)
        # The context manager shuts the pool down once the blocking map()
        # has returned, so worker processes are not leaked.
        with Pool(self.cpus) as p:
            nested_list = p.map(self.worker, chunked)
        return [val for sublist in nested_list for val in sublist]
# Try the parallel tokenizer on the contexts of the first ten samples; the
# call is the cell's last expression, so its result is displayed as output.
t = Tokenizer(4)
t.tokenize([
    record['context']
    for record in samples[:10]
])
[[0, 15],
[15, 16],
[17, 20],
[21, 27],
[28, 31],
[32, 33],
[34, 42],
[43, 52],
[52, 53],
[54, 58],
[59, 62],
[63, 67],
[68, 76],
[76, 78],
[79, 83],
[84, 88],
[89, 91],
[92, 93],
[94, 100],
[101, 107],
[108, 110],
[111, 114],
[115, 121],
[122, 126],
[126, 127],
[128, 139],
[140, 142],
[143, 148],
[149, 151],
[152, 155],
[156, 160],
[161, 169],
[170, 173],
[174, 180],
[181, 183],
[183, 184],
[185, 187],
[188, 189],
[190, 196],
[197, 203],
[204, 206],
[207, 213],
[214, 218],
[219, 223],
[224, 232],
[233, 237],
[238, 241],
[242, 248],
[249, 250],
[250, 256],
[257, 259],
[260, 262],
[263, 268],
[268, 269],
[269, 270],
[271, 275],
[276, 278],
[279, 282],
[283, 287],
[288, 296],
[297, 299],
[300, 303],
[304, 312],
[313, 315],
[316, 319],
[320, 326],
[327, 332],
[332, 333],
[334, 345],
[346, 352],
[353, 356],
[357, 365],
[366, 368],
[369, 372],
[373, 379],
[379, 380],
[381, 382],
[383, 389],
[390, 395],
[396, 398],
[399, 405],
[406, 409],
[410, 420],
[420, 421],
[422, 424],
[425, 427],
[428, 429],
[430, 437],
[438, 440],
[441, 444],
[445, 451],
[452, 454],
[455, 462],
[462, 463],
[464, 470],
[471, 476],
[477, 480],
[481, 487],
[488, 492],
[493, 502],
[503, 511],
[512, 514],
[515, 520],
[521, 531],
[532, 541],
[542, 544],
[545, 549],
[549, 550],
[551, 553],
[554, 557],
[558, 561],
[562, 564],
[565, 568],
[569, 573],
[574, 579],
[580, 581],
[581, 584],
[585, 587],
[588, 589],
[590, 596],
[597, 601],
[602, 606],
[607, 615],
[616, 623],
[624, 625],
[626, 633],
[634, 637],
[638, 641],
[642, 646],
[647, 651],
[651, 652],
[652, 653],
[654, 656],
[657, 658],
[659, 665],
[665, 666],
[667, 673],
[674, 679],
[680, 686],
[687, 689],
[690, 694],
[694, 695]]),
[[0, 15],
[15, 16],
[17, 20],
[21, 27],
[28, 31],
[32, 33],
[34, 42],
[43, 52],
[52, 53],
[54, 58],
[59, 62],
[63, 67],
[68, 76],
[76, 78],
[79, 83],
[84, 88],
[89, 91],
[92, 93],
[94, 100],
[101, 107],
[108, 110],
[111, 114],
[115, 121],
[122, 126],
[126, 127],
[128, 139],
[140, 142],
[143, 148],
[149, 151],
[152, 155],
[156, 160],
[161, 169],
[170, 173],
[174, 180],
[181, 183],
[183, 184],
[185, 187],
[188, 189],
[190, 196],
[197, 203],
[204, 206],
[207, 213],
[214, 218],
[219, 223],
[224, 232],
[233, 237],
[238, 241],
[242, 248],
[249, 250],
[250, 256],
[257, 259],
[260, 262],
[263, 268],
[268, 269],
[269, 270],
[271, 275],
[276, 278],
[279, 282],
[283, 287],
[288, 296],
[297, 299],
[300, 303],
[304, 312],
[313, 315],
[316, 319],
[320, 326],
[327, 332],
[332, 333],
[334, 345],
[346, 352],
[353, 356],
[357, 365],
[366, 368],
[369, 372],
[373, 379],
[379, 380],
[381, 382],
[383, 389],
[390, 395],
[396, 398],
[399, 405],
[406, 409],
[410, 420],
[420, 421],
[422, 424],
[425, 427],
[428, 429],
[430, 437],
[438, 440],
[441, 444],
[445, 451],
[452, 454],
[455, 462],
[462, 463],
[464, 470],
[471, 476],
[477, 480],
[481, 487],
[488, 492],
[493, 502],
[503, 511],
[512, 514],
[515, 520],
[521, 531],
[532, 541],
[542, 544],
[545, 549],
[549, 550],
[551, 553],
[554, 557],
[558, 561],
[562, 564],
[565, 568],
[569, 573],
[574, 579],
[580, 581],
[581, 584],
[585, 587],
[588, 589],
[590, 596],
[597, 601],
[602, 606],
[607, 615],
[616, 623],
[624, 625],
[626, 633],
[634, 637],
[638, 641],
[642, 646],
[647, 651],
[651, 652],
[652, 653],
[654, 656],
[657, 658],
[659, 665],
[665, 666],
[667, 673],
[674, 679],
[680, 686],
[687, 689],
[690, 694],
[694, 695]]),
[[0, 15],
[15, 16],
[17, 20],
[21, 27],
[28, 31],
[32, 33],
[34, 42],
[43, 52],
[52, 53],
[54, 58],
[59, 62],
[63, 67],
[68, 76],
[76, 78],
[79, 83],
[84, 88],
[89, 91],
[92, 93],
[94, 100],
[101, 107],
[108, 110],
[111, 114],
[115, 121],
[122, 126],
[126, 127],
[128, 139],
[140, 142],
[143, 148],
[149, 151],
[152, 155],
[156, 160],
[161, 169],
[170, 173],
[174, 180],
[181, 183],
[183, 184],
[185, 187],
[188, 189],
[190, 196],
[197, 203],
[204, 206],
[207, 213],
[214, 218],
[219, 223],
[224, 232],
[233, 237],
[238, 241],
[242, 248],
[249, 250],
[250, 256],
[257, 259],
[260, 262],
[263, 268],
[268, 269],
[269, 270],
[271, 275],
[276, 278],
[279, 282],
[283, 287],
[288, 296],
[297, 299],
[300, 303],
[304, 312],
[313, 315],
[316, 319],
[320, 326],
[327, 332],
[332, 333],
[334, 345],
[346, 352],
[353, 356],
[357, 365],
[366, 368],
[369, 372],
[373, 379],
[379, 380],
[381, 382],
[383, 389],
[390, 395],
[396, 398],
[399, 405],
[406, 409],
[410, 420],
[420, 421],
[422, 424],
[425, 427],
[428, 429],
[430, 437],
[438, 440],
[441, 444],
[445, 451],
[452, 454],
[455, 462],
[462, 463],
[464, 470],
[471, 476],
[477, 480],
[481, 487],
[488, 492],
[493, 502],
[503, 511],
[512, 514],
[515, 520],
[521, 531],
[532, 541],
[542, 544],
[545, 549],
[549, 550],
[551, 553],
[554, 557],
[558, 561],
[562, 564],
[565, 568],
[569, 573],
[574, 579],
[580, 581],
[581, 584],
[585, 587],
[588, 589],
[590, 596],
[597, 601],
[602, 606],
[607, 615],
[616, 623],
[624, 625],
[626, 633],
[634, 637],
[638, 641],
[642, 646],
[647, 651],
[651, 652],
[652, 653],
[654, 656],
[657, 658],
[659, 665],
[665, 666],
[667, 673],
[674, 679],
[680, 686],
[687, 689],
[690, 694],
[694, 695]]),
[[0, 15],
[15, 16],
[17, 20],
[21, 27],
[28, 31],
[32, 33],
[34, 42],
[43, 52],
[52, 53],
[54, 58],
[59, 62],
[63, 67],
[68, 76],
[76, 78],
[79, 83],
[84, 88],
[89, 91],
[92, 93],
[94, 100],
[101, 107],
[108, 110],
[111, 114],
[115, 121],
[122, 126],
[126, 127],
[128, 139],
[140, 142],
[143, 148],
[149, 151],
[152, 155],
[156, 160],
[161, 169],
[170, 173],
[174, 180],
[181, 183],
[183, 184],
[185, 187],
[188, 189],
[190, 196],
[197, 203],
[204, 206],
[207, 213],
[214, 218],
[219, 223],
[224, 232],
[233, 237],
[238, 241],
[242, 248],
[249, 250],
[250, 256],
[257, 259],
[260, 262],
[263, 268],
[268, 269],
[269, 270],
[271, 275],
[276, 278],
[279, 282],
[283, 287],
[288, 296],
[297, 299],
[300, 303],
[304, 312],
[313, 315],
[316, 319],
[320, 326],
[327, 332],
[332, 333],
[334, 345],
[346, 352],
[353, 356],
[357, 365],
[366, 368],
[369, 372],
[373, 379],
[379, 380],
[381, 382],
[383, 389],
[390, 395],
[396, 398],
[399, 405],
[406, 409],
[410, 420],
[420, 421],
[422, 424],
[425, 427],
[428, 429],
[430, 437],
[438, 440],
[441, 444],
[445, 451],
[452, 454],
[455, 462],
[462, 463],
[464, 470],
[471, 476],
[477, 480],
[481, 487],
[488, 492],
[493, 502],
[503, 511],
[512, 514],
[515, 520],
[521, 531],
[532, 541],
[542, 544],
[545, 549],
[549, 550],
[551, 553],
[554, 557],
[558, 561],
[562, 564],
[565, 568],
[569, 573],
[574, 579],
[580, 581],
[581, 584],
[585, 587],
[588, 589],
[590, 596],
[597, 601],
[602, 606],
[607, 615],
[616, 623],
[624, 625],
[626, 633],
[634, 637],
[638, 641],
[642, 646],
[647, 651],
[651, 652],
[652, 653],
[654, 656],
[657, 658],
[659, 665],
[665, 666],
[667, 673],
[674, 679],
[680, 686],
[687, 689],
[690, 694],
[694, 695]]),
[[0, 15],
[15, 16],
[17, 20],
[21, 27],
[28, 31],
[32, 33],
[34, 42],
[43, 52],
[52, 53],
[54, 58],
[59, 62],
[63, 67],
[68, 76],
[76, 78],
[79, 83],
[84, 88],
[89, 91],
[92, 93],
[94, 100],
[101, 107],
[108, 110],
[111, 114],
[115, 121],
[122, 126],
[126, 127],
[128, 139],
[140, 142],
[143, 148],
[149, 151],
[152, 155],
[156, 160],
[161, 169],
[170, 173],
[174, 180],
[181, 183],
[183, 184],
[185, 187],
[188, 189],
[190, 196],
[197, 203],
[204, 206],
[207, 213],
[214, 218],
[219, 223],
[224, 232],
[233, 237],
[238, 241],
[242, 248],
[249, 250],
[250, 256],
[257, 259],
[260, 262],
[263, 268],
[268, 269],
[269, 270],
[271, 275],
[276, 278],
[279, 282],
[283, 287],
[288, 296],
[297, 299],
[300, 303],
[304, 312],
[313, 315],
[316, 319],
[320, 326],
[327, 332],
[332, 333],
[334, 345],
[346, 352],
[353, 356],
[357, 365],
[366, 368],
[369, 372],
[373, 379],
[379, 380],
[381, 382],
[383, 389],
[390, 395],
[396, 398],
[399, 405],
[406, 409],
[410, 420],
[420, 421],
[422, 424],
[425, 427],
[428, 429],
[430, 437],
[438, 440],
[441, 444],
[445, 451],
[452, 454],
[455, 462],
[462, 463],
[464, 470],
[471, 476],
[477, 480],
[481, 487],
[488, 492],
[493, 502],
[503, 511],
[512, 514],
[515, 520],
[521, 531],
[532, 541],
[542, 544],
[545, 549],
[549, 550],
[551, 553],
[554, 557],
[558, 561],
[562, 564],
[565, 568],
[569, 573],
[574, 579],
[580, 581],
[581, 584],
[585, 587],
[588, 589],
[590, 596],
[597, 601],
[602, 606],
[607, 615],
[616, 623],
[624, 625],
[626, 633],
[634, 637],
[638, 641],
[642, 646],
[647, 651],
[651, 652],
[652, 653],
[654, 656],
[657, 658],
[659, 665],
[665, 666],
[667, 673],
[674, 679],
[680, 686],
[687, 689],
[690, 694],
[694, 695]]),
[[0, 2],
[3, 5],
[6, 10],
[11, 16],
[17, 29],
[29, 30],
[31, 36],
[37, 41],
[41, 43],
[44, 52],
[53, 56],
[57, 58],
[59, 65],
[66, 68],
[69, 73],
[74, 79],
[80, 87],
[87, 88],
[89, 92],
[93, 97],
[98, 109],
[110, 117],
[118, 125],
[126, 131],
[132, 142],
[142, 143],
[144, 148],
[149, 150],
[151, 156],
[157, 160],
[161, 171],
[172, 179],
[179, 180],
[181, 184],
[185, 192],
[193, 202],
[203, 206],
[207, 215],
[215, 216],
[217, 222],
[223, 225],
[226, 227],
[228, 236],
[237, 244],
[245, 247],
[248, 257],
[258, 262],
[262, 263],
[264, 267],
[268, 278],
[279, 287],
[288, 290],
[291, 297],
[298, 303],
[304, 311],
[312, 315],
[316, 322],
[323, 325],
[326, 328],
[329, 332],
[333, 339],
[340, 350],
[351, 361],
[362, 373],
[374, 376],
[377, 380],
[381, 387],
[388, 394],
[394, 395],
[396, 399],
[400, 405],
[406, 414],
[414, 415],
[416, 419],
[420, 427],
[427, 428],
[429, 431],
[432, 440],
[441, 446],
[447, 448],
[449, 453],
[454, 457],
[458, 465],
[466, 468],
[469, 476],
[477, 487],
[488, 491],
[492, 499],
[499, 500],
[501, 504],
[505, 509],
[510, 518],
[519, 521],
[522, 531],
[532, 540],
[540, 541],
[542, 545],
[546, 556],
[557, 561],
[562, 569],
[570, 581],
[582, 591],
[591, 592],
[593, 597],
[598, 601],
[602, 610],
[611, 620],
[621, 626],
[627, 630],
[631, 637],
[638, 647],
[648, 658],
[659, 662],
[663, 668],
[669, 673],
[673, 674],
[675, 678],
[679, 686],
[687, 689],
[690, 698],
[699, 703],
[704, 708],
[709, 714],
[715, 719],
[720, 723],
[724, 729],
[730, 734],
[734, 736],
[737, 744],
[744, 745],
[746, 752],
[753, 763],
[764, 767],
[768, 771],
[772, 776],
[776, 777],
[778, 781],
[782, 790],
[791, 793],
[794, 796],
[797, 808],
[809, 820],
[821, 824],
[825, 829],
[830, 833],
[834, 838],
[839, 840],
[841, 848],
[849, 856],
[857, 859],
[860, 863],
[864, 873],
[874, 883],
[884, 888],
[889, 892],
[893, 903],
[903, 904],
[905, 907],
[908, 912],
[912, 913],
[914, 918],
[919, 923],
[924, 932],
[933, 941],
[942, 946],
[947, 950],
[951, 959],
[960, 965],
[966, 968],
[969, 973],
[974, 975],
[976, 988],
[989, 993],
[993, 994],
[995, 996],
[997, 1004],
[1005, 1014],
[1014, 1015],
[1016, 1022],
[1023, 1028],
[1029, 1032],
[1033, 1042],
[1042, 1043],
[1044, 1052],
[1052, 1053],
[1054, 1056],
[1057, 1061],
[1061, 1062],
[1063, 1067],
[1068, 1073],
[1074, 1082],
[1083, 1091],
[1092, 1096],
[1097, 1100],
[1101, 1106],
[1107, 1113],
[1114, 1115],
[1116, 1123],
[1124, 1128],
[1128, 1129],
[1130, 1133],
[1134, 1146],
[1147, 1152],
[1153, 1158],
[1159, 1164],
[1165, 1169],
[1170, 1174],
[1175, 1185],
[1185, 1186],
[1187, 1194],
[1195, 1200],
[1201, 1203],
[1204, 1213],
[1214, 1216],
[1217, 1222],
[1223, 1225],
[1226, 1229],
[1230, 1238],
[1238, 1239],
[1240, 1247],
[1247, 1248],
[1249, 1252],
[1253, 1258],
[1259, 1262],
[1263, 1274],
[1275, 1277],
[1278, 1281],
[1282, 1290],
[1290, 1291],
[1292, 1299],
[1299, 1300],
[1301, 1303],
[1304, 1310],
[1311, 1315],
[1316, 1318],
[1319, 1332],
[1333, 1340],
[1341, 1344],
[1345, 1354],
[1355, 1362],
[1363, 1371],
[1371, 1372],
[1373, 1379],
[1380, 1388],
[1388, 1389],
[1390, 1394],
[1395, 1398],
[1399, 1404],
[1404, 1405]]),
[[0, 2],
[3, 5],
[6, 10],
[11, 16],
[17, 29],
[29, 30],
[31, 36],
[37, 41],
[41, 43],
[44, 52],
[53, 56],
[57, 58],
[59, 65],
[66, 68],
[69, 73],
[74, 79],
[80, 87],
[87, 88],
[89, 92],
[93, 97],
[98, 109],
[110, 117],
[118, 125],
[126, 131],
[132, 142],
[142, 143],
[144, 148],
[149, 150],
[151, 156],
[157, 160],
[161, 171],
[172, 179],
[179, 180],
[181, 184],
[185, 192],
[193, 202],
[203, 206],
[207, 215],
[215, 216],
[217, 222],
[223, 225],
[226, 227],
[228, 236],
[237, 244],
[245, 247],
[248, 257],
[258, 262],
[262, 263],
[264, 267],
[268, 278],
[279, 287],
[288, 290],
[291, 297],
[298, 303],
[304, 311],
[312, 315],
[316, 322],
[323, 325],
[326, 328],
[329, 332],
[333, 339],
[340, 350],
[351, 361],
[362, 373],
[374, 376],
[377, 380],
[381, 387],
[388, 394],
[394, 395],
[396, 399],
[400, 405],
[406, 414],
[414, 415],
[416, 419],
[420, 427],
[427, 428],
[429, 431],
[432, 440],
[441, 446],
[447, 448],
[449, 453],
[454, 457],
[458, 465],
[466, 468],
[469, 476],
[477, 487],
[488, 491],
[492, 499],
[499, 500],
[501, 504],
[505, 509],
[510, 518],
[519, 521],
[522, 531],
[532, 540],
[540, 541],
[542, 545],
[546, 556],
[557, 561],
[562, 569],
[570, 581],
[582, 591],
[591, 592],
[593, 597],
[598, 601],
[602, 610],
[611, 620],
[621, 626],
[627, 630],
[631, 637],
[638, 647],
[648, 658],
[659, 662],
[663, 668],
[669, 673],
[673, 674],
[675, 678],
[679, 686],
[687, 689],
[690, 698],
[699, 703],
[704, 708],
[709, 714],
[715, 719],
[720, 723],
[724, 729],
[730, 734],
[734, 736],
[737, 744],
[744, 745],
[746, 752],
[753, 763],
[764, 767],
[768, 771],
[772, 776],
[776, 777],
[778, 781],
[782, 790],
[791, 793],
[794, 796],
[797, 808],
[809, 820],
[821, 824],
[825, 829],
[830, 833],
[834, 838],
[839, 840],
[841, 848],
[849, 856],
[857, 859],
[860, 863],
[864, 873],
[874, 883],
[884, 888],
[889, 892],
[893, 903],
[903, 904],
[905, 907],
[908, 912],
[912, 913],
[914, 918],
[919, 923],
[924, 932],
[933, 941],
[942, 946],
[947, 950],
[951, 959],
[960, 965],
[966, 968],
[969, 973],
[974, 975],
[976, 988],
[989, 993],
[993, 994],
[995, 996],
[997, 1004],
[1005, 1014],
[1014, 1015],
[1016, 1022],
[1023, 1028],
[1029, 1032],
[1033, 1042],
[1042, 1043],
[1044, 1052],
[1052, 1053],
[1054, 1056],
[1057, 1061],
[1061, 1062],
[1063, 1067],
[1068, 1073],
[1074, 1082],
[1083, 1091],
[1092, 1096],
[1097, 1100],
[1101, 1106],
[1107, 1113],
[1114, 1115],
[1116, 1123],
[1124, 1128],
[1128, 1129],
[1130, 1133],
[1134, 1146],
[1147, 1152],
[1153, 1158],
[1159, 1164],
[1165, 1169],
[1170, 1174],
[1175, 1185],
[1185, 1186],
[1187, 1194],
[1195, 1200],
[1201, 1203],
[1204, 1213],
[1214, 1216],
[1217, 1222],
[1223, 1225],
[1226, 1229],
[1230, 1238],
[1238, 1239],
[1240, 1247],
[1247, 1248],
[1249, 1252],
[1253, 1258],
[1259, 1262],
[1263, 1274],
[1275, 1277],
[1278, 1281],
[1282, 1290],
[1290, 1291],
[1292, 1299],
[1299, 1300],
[1301, 1303],
[1304, 1310],
[1311, 1315],
[1316, 1318],
[1319, 1332],
[1333, 1340],
[1341, 1344],
[1345, 1354],
[1355, 1362],
[1363, 1371],
[1371, 1372],
[1373, 1379],
[1380, 1388],
[1388, 1389],
[1390, 1394],
[1395, 1398],
[1399, 1404],
[1404, 1405]]),
[[0, 2],
[3, 5],
[6, 10],
[11, 16],
[17, 29],
[29, 30],
[31, 36],
[37, 41],
[41, 43],
[44, 52],
[53, 56],
[57, 58],
[59, 65],
[66, 68],
[69, 73],
[74, 79],
[80, 87],
[87, 88],
[89, 92],
[93, 97],
[98, 109],
[110, 117],
[118, 125],
[126, 131],
[132, 142],
[142, 143],
[144, 148],
[149, 150],
[151, 156],
[157, 160],
[161, 171],
[172, 179],
[179, 180],
[181, 184],
[185, 192],
[193, 202],
[203, 206],
[207, 215],
[215, 216],
[217, 222],
[223, 225],
[226, 227],
[228, 236],
[237, 244],
[245, 247],
[248, 257],
[258, 262],
[262, 263],
[264, 267],
[268, 278],
[279, 287],
[288, 290],
[291, 297],
[298, 303],
[304, 311],
[312, 315],
[316, 322],
[323, 325],
[326, 328],
[329, 332],
[333, 339],
[340, 350],
[351, 361],
[362, 373],
[374, 376],
[377, 380],
[381, 387],
[388, 394],
[394, 395],
[396, 399],
[400, 405],
[406, 414],
[414, 415],
[416, 419],
[420, 427],
[427, 428],
[429, 431],
[432, 440],
[441, 446],
[447, 448],
[449, 453],
[454, 457],
[458, 465],
[466, 468],
[469, 476],
[477, 487],
[488, 491],
[492, 499],
[499, 500],
[501, 504],
[505, 509],
[510, 518],
[519, 521],
[522, 531],
[532, 540],
[540, 541],
[542, 545],
[546, 556],
[557, 561],
[562, 569],
[570, 581],
[582, 591],
[591, 592],
[593, 597],
[598, 601],
[602, 610],
[611, 620],
[621, 626],
[627, 630],
[631, 637],
[638, 647],
[648, 658],
[659, 662],
[663, 668],
[669, 673],
[673, 674],
[675, 678],
[679, 686],
[687, 689],
[690, 698],
[699, 703],
[704, 708],
[709, 714],
[715, 719],
[720, 723],
[724, 729],
[730, 734],
[734, 736],
[737, 744],
[744, 745],
[746, 752],
[753, 763],
[764, 767],
[768, 771],
[772, 776],
[776, 777],
[778, 781],
[782, 790],
[791, 793],
[794, 796],
[797, 808],
[809, 820],
[821, 824],
[825, 829],
[830, 833],
[834, 838],
[839, 840],
[841, 848],
[849, 856],
[857, 859],
[860, 863],
[864, 873],
[874, 883],
[884, 888],
[889, 892],
[893, 903],
[903, 904],
[905, 907],
[908, 912],
[912, 913],
[914, 918],
[919, 923],
[924, 932],
[933, 941],
[942, 946],
[947, 950],
[951, 959],
[960, 965],
[966, 968],
[969, 973],
[974, 975],
[976, 988],
[989, 993],
[993, 994],
[995, 996],
[997, 1004],
[1005, 1014],
[1014, 1015],
[1016, 1022],
[1023, 1028],
[1029, 1032],
[1033, 1042],
[1042, 1043],
[1044, 1052],
[1052, 1053],
[1054, 1056],
[1057, 1061],
[1061, 1062],
[1063, 1067],
[1068, 1073],
[1074, 1082],
[1083, 1091],
[1092, 1096],
[1097, 1100],
[1101, 1106],
[1107, 1113],
[1114, 1115],
[1116, 1123],
[1124, 1128],
[1128, 1129],
[1130, 1133],
[1134, 1146],
[1147, 1152],
[1153, 1158],
[1159, 1164],
[1165, 1169],
[1170, 1174],
[1175, 1185],
[1185, 1186],
[1187, 1194],
[1195, 1200],
[1201, 1203],
[1204, 1213],
[1214, 1216],
[1217, 1222],
[1223, 1225],
[1226, 1229],
[1230, 1238],
[1238, 1239],
[1240, 1247],
[1247, 1248],
[1249, 1252],
[1253, 1258],
[1259, 1262],
[1263, 1274],
[1275, 1277],
[1278, 1281],
[1282, 1290],
[1290, 1291],
[1292, 1299],
[1299, 1300],
[1301, 1303],
[1304, 1310],
[1311, 1315],
[1316, 1318],
[1319, 1332],
[1333, 1340],
[1341, 1344],
[1345, 1354],
[1355, 1362],
[1363, 1371],
[1371, 1372],
[1373, 1379],
[1380, 1388],
[1388, 1389],
[1390, 1394],
[1395, 1398],
[1399, 1404],
[1404, 1405]]),
[[0, 2],
[3, 5],
[6, 10],
[11, 16],
[17, 29],
[29, 30],
[31, 36],
[37, 41],
[41, 43],
[44, 52],
[53, 56],
[57, 58],
[59, 65],
[66, 68],
[69, 73],
[74, 79],
[80, 87],
[87, 88],
[89, 92],
[93, 97],
[98, 109],
[110, 117],
[118, 125],
[126, 131],
[132, 142],
[142, 143],
[144, 148],
[149, 150],
[151, 156],
[157, 160],
[161, 171],
[172, 179],
[179, 180],
[181, 184],
[185, 192],
[193, 202],
[203, 206],
[207, 215],
[215, 216],
[217, 222],
[223, 225],
[226, 227],
[228, 236],
[237, 244],
[245, 247],
[248, 257],
[258, 262],
[262, 263],
[264, 267],
[268, 278],
[279, 287],
[288, 290],
[291, 297],
[298, 303],
[304, 311],
[312, 315],
[316, 322],
[323, 325],
[326, 328],
[329, 332],
[333, 339],
[340, 350],
[351, 361],
[362, 373],
[374, 376],
[377, 380],
[381, 387],
[388, 394],
[394, 395],
[396, 399],
[400, 405],
[406, 414],
[414, 415],
[416, 419],
[420, 427],
[427, 428],
[429, 431],
[432, 440],
[441, 446],
[447, 448],
[449, 453],
[454, 457],
[458, 465],
[466, 468],
[469, 476],
[477, 487],
[488, 491],
[492, 499],
[499, 500],
[501, 504],
[505, 509],
[510, 518],
[519, 521],
[522, 531],
[532, 540],
[540, 541],
[542, 545],
[546, 556],
[557, 561],
[562, 569],
[570, 581],
[582, 591],
[591, 592],
[593, 597],
[598, 601],
[602, 610],
[611, 620],
[621, 626],
[627, 630],
[631, 637],
[638, 647],
[648, 658],
[659, 662],
[663, 668],
[669, 673],
[673, 674],
[675, 678],
[679, 686],
[687, 689],
[690, 698],
[699, 703],
[704, 708],
[709, 714],
[715, 719],
[720, 723],
[724, 729],
[730, 734],
[734, 736],
[737, 744],
[744, 745],
[746, 752],
[753, 763],
[764, 767],
[768, 771],
[772, 776],
[776, 777],
[778, 781],
[782, 790],
[791, 793],
[794, 796],
[797, 808],
[809, 820],
[821, 824],
[825, 829],
[830, 833],
[834, 838],
[839, 840],
[841, 848],
[849, 856],
[857, 859],
[860, 863],
[864, 873],
[874, 883],
[884, 888],
[889, 892],
[893, 903],
[903, 904],
[905, 907],
[908, 912],
[912, 913],
[914, 918],
[919, 923],
[924, 932],
[933, 941],
[942, 946],
[947, 950],
[951, 959],
[960, 965],
[966, 968],
[969, 973],
[974, 975],
[976, 988],
[989, 993],
[993, 994],
[995, 996],
[997, 1004],
[1005, 1014],
[1014, 1015],
[1016, 1022],
[1023, 1028],
[1029, 1032],
[1033, 1042],
[1042, 1043],
[1044, 1052],
[1052, 1053],
[1054, 1056],
[1057, 1061],
[1061, 1062],
[1063, 1067],
[1068, 1073],
[1074, 1082],
[1083, 1091],
[1092, 1096],
[1097, 1100],
[1101, 1106],
[1107, 1113],
[1114, 1115],
[1116, 1123],
[1124, 1128],
[1128, 1129],
[1130, 1133],
[1134, 1146],
[1147, 1152],
[1153, 1158],
[1159, 1164],
[1165, 1169],
[1170, 1174],
[1175, 1185],
[1185, 1186],
[1187, 1194],
[1195, 1200],
[1201, 1203],
[1204, 1213],
[1214, 1216],
[1217, 1222],
[1223, 1225],
[1226, 1229],
[1230, 1238],
[1238, 1239],
[1240, 1247],
[1247, 1248],
[1249, 1252],
[1253, 1258],
[1259, 1262],
[1263, 1274],
[1275, 1277],
[1278, 1281],
[1282, 1290],
[1290, 1291],
[1292, 1299],
[1299, 1300],
[1301, 1303],
[1304, 1310],
[1311, 1315],
[1316, 1318],
[1319, 1332],
[1333, 1340],
[1341, 1344],
[1345, 1354],
[1355, 1362],
[1363, 1371],
[1371, 1372],
[1373, 1379],
[1380, 1388],
[1388, 1389],
[1390, 1394],
[1395, 1398],
[1399, 1404],
[1404, 1405]]),
[[0, 2],
[3, 5],
[6, 10],
[11, 16],
[17, 29],
[29, 30],
[31, 36],
[37, 41],
[41, 43],
[44, 52],
[53, 56],
[57, 58],
[59, 65],
[66, 68],
[69, 73],
[74, 79],
[80, 87],
[87, 88],
[89, 92],
[93, 97],
[98, 109],
[110, 117],
[118, 125],
[126, 131],
[132, 142],
[142, 143],
[144, 148],
[149, 150],
[151, 156],
[157, 160],
[161, 171],
[172, 179],
[179, 180],
[181, 184],
[185, 192],
[193, 202],
[203, 206],
[207, 215],
[215, 216],
[217, 222],
[223, 225],
[226, 227],
[228, 236],
[237, 244],
[245, 247],
[248, 257],
[258, 262],
[262, 263],
[264, 267],
[268, 278],
[279, 287],
[288, 290],
[291, 297],
[298, 303],
[304, 311],
[312, 315],
[316, 322],
[323, 325],
[326, 328],
[329, 332],
[333, 339],
[340, 350],
[351, 361],
[362, 373],
[374, 376],
[377, 380],
[381, 387],
[388, 394],
[394, 395],
[396, 399],
[400, 405],
[406, 414],
[414, 415],
[416, 419],
[420, 427],
[427, 428],
[429, 431],
[432, 440],
[441, 446],
[447, 448],
[449, 453],
[454, 457],
[458, 465],
[466, 468],
[469, 476],
[477, 487],
[488, 491],
[492, 499],
[499, 500],
[501, 504],
[505, 509],
[510, 518],
[519, 521],
[522, 531],
[532, 540],
[540, 541],
[542, 545],
[546, 556],
[557, 561],
[562, 569],
[570, 581],
[582, 591],
[591, 592],
[593, 597],
[598, 601],
[602, 610],
[611, 620],
[621, 626],
[627, 630],
[631, 637],
[638, 647],
[648, 658],
[659, 662],
[663, 668],
[669, 673],
[673, 674],
[675, 678],
[679, 686],
[687, 689],
[690, 698],
[699, 703],
[704, 708],
[709, 714],
[715, 719],
[720, 723],
[724, 729],
[730, 734],
[734, 736],
[737, 744],
[744, 745],
[746, 752],
[753, 763],
[764, 767],
[768, 771],
[772, 776],
[776, 777],
[778, 781],
[782, 790],
[791, 793],
[794, 796],
[797, 808],
[809, 820],
[821, 824],
[825, 829],
[830, 833],
[834, 838],
[839, 840],
[841, 848],
[849, 856],
[857, 859],
[860, 863],
[864, 873],
[874, 883],
[884, 888],
[889, 892],
[893, 903],
[903, 904],
[905, 907],
[908, 912],
[912, 913],
[914, 918],
[919, 923],
[924, 932],
[933, 941],
[942, 946],
[947, 950],
[951, 959],
[960, 965],
[966, 968],
[969, 973],
[974, 975],
[976, 988],
[989, 993],
[993, 994],
[995, 996],
[997, 1004],
[1005, 1014],
[1014, 1015],
[1016, 1022],
[1023, 1028],
[1029, 1032],
[1033, 1042],
[1042, 1043],
[1044, 1052],
[1052, 1053],
[1054, 1056],
[1057, 1061],
[1061, 1062],
[1063, 1067],
[1068, 1073],
[1074, 1082],
[1083, 1091],
[1092, 1096],
[1097, 1100],
[1101, 1106],
[1107, 1113],
[1114, 1115],
[1116, 1123],
[1124, 1128],
[1128, 1129],
[1130, 1133],
[1134, 1146],
[1147, 1152],
[1153, 1158],
[1159, 1164],
[1165, 1169],
[1170, 1174],
[1175, 1185],
[1185, 1186],
[1187, 1194],
[1195, 1200],
[1201, 1203],
[1204, 1213],
[1214, 1216],
[1217, 1222],
[1223, 1225],
[1226, 1229],
[1230, 1238],
[1238, 1239],
[1240, 1247],
[1247, 1248],
[1249, 1252],
[1253, 1258],
[1259, 1262],
[1263, 1274],
[1275, 1277],
[1278, 1281],
[1282, 1290],
[1290, 1291],
[1292, 1299],
[1299, 1300],
[1301, 1303],
[1304, 1310],
[1311, 1315],
[1316, 1318],
[1319, 1332],
[1333, 1340],
[1341, 1344],
[1345, 1354],
[1355, 1362],
[1363, 1371],
[1371, 1372],
[1373, 1379],
[1380, 1388],
[1388, 1389],
[1390, 1394],
[1395, 1398],
[1399, 1404],
[1404, 1405]])]
In [ ]:
# Tokenize the contexts of the first ten samples with the module-level
# `tokenizer`, showing a tqdm progress bar.
# NOTE(review): the message mentions a pool of workers, but this cell calls
# `tokenizer` serially; confirm which one was intended.
print('Tokenizing dataset with CoreNLP using pool of workers')
x = list(map(
    lambda record: tokenizer.tokenize(record['context']),
    tqdm(samples[:10]),
))
In [ ]:
In [ ]:
Content source: AnatoliiPotapov/squad
Similar notebooks: