In [3]:
import sys
sys.path.append('../../')

In [4]:
sys.path


Out[4]:
['',
 '/usr/lib/python35.zip',
 '/usr/lib/python3.5',
 '/usr/lib/python3.5/plat-x86_64-linux-gnu',
 '/usr/lib/python3.5/lib-dynload',
 '/home/anatoly/.local/lib/python3.5/site-packages',
 '/usr/local/lib/python3.5/dist-packages',
 '/home/anatoly/ParlAI',
 '/usr/local/lib/python3.5/dist-packages/numpy-1.13.1-py3.5-linux-x86_64.egg',
 '/usr/lib/python3/dist-packages',
 '/home/anatoly/.local/lib/python3.5/site-packages/IPython/extensions',
 '/home/anatoly/.ipython',
 '../../']

In [5]:
from preprocessing import tokenizer

In [6]:
# from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import _pickle as pickle
import argparse
import json
from collections import Counter
import multiprocessing
from multiprocessing import Pool

import numpy as np
from gensim.models import KeyedVectors
from tqdm import tqdm


Using TensorFlow backend.

In [7]:
def word2vec(word2vec_path):
    """Load word2vec-format embeddings and return a word -> vector lookup.

    Args:
        word2vec_path: path handed to ``KeyedVectors.load_word2vec_format``.

    Returns:
        A function ``get_word_vector(word)`` that returns the embedding stored
        for ``word``, or an all-zero vector of length ``vector_size`` when the
        lookup raises ``KeyError`` (out-of-vocabulary word).
    """
    embeddings = KeyedVectors.load_word2vec_format(word2vec_path)

    def get_word_vector(word):
        # Single exit point: fall back to zeros for unknown words.
        try:
            vector = embeddings[word]
        except KeyError:
            vector = np.zeros(embeddings.vector_size)
        return vector

    return get_word_vector

In [19]:
class FeatureDict(object):
    """Mapping from categorical token features to integer ids.

    Feature names have the form ``'pos=<tag>'`` / ``'ner=<tag>'`` (see
    ``add_data``); callers may register further names through ``add_feature``.
    The mapping is kept in ``self.feature_dict`` and persisted as a pickle at
    ``'../../data/feature_dict.pkl'``.
    """

    def __init__(self):
        """Load the persisted mapping, or start empty if it cannot be loaded."""
        try:
            self.load()
        except Exception:
            # Best effort: a missing or unreadable pickle means "no features yet".
            # Catching Exception rather than using a bare ``except`` lets
            # KeyboardInterrupt / SystemExit propagate.
            self.feature_dict = {}

    def add_data(self, data):
        """Register the POS and NER tags of every token in ``data``.

        Args:
            data: iterable of examples, each with ``'question_tokens'`` and
                ``'context_tokens'`` lists. Each token is indexable, with the
                POS tag at index 3 and the NER tag at index 5 (``None`` tags
                are skipped).
        """
        for example in data:
            for token in example['question_tokens'] + example['context_tokens']:
                if token[3] is not None:
                    self.add_feature('pos=' + token[3])
                # token[4] (the lemma) is deliberately not registered:
                # too many lemma features.
                if token[5] is not None:
                    self.add_feature('ner=' + token[5])

    def add_feature(self, feature):
        """Assign the next free id to ``feature`` unless it is already known."""
        # Membership test, not ``.get()`` truthiness: id 0 is a valid id and must
        # not be treated as "missing" (that re-numbered the first feature on
        # every repeat and produced colliding ids).
        if feature not in self.feature_dict:
            self.feature_dict[feature] = len(self.feature_dict)

    def _to_id(self, feature):
        """Return the id of ``feature``; raises ``KeyError`` if it is unknown."""
        return self.feature_dict[feature]

    def save(self):
        """Pickle the mapping to ``'../../data/feature_dict.pkl'``."""
        with open('../../data/feature_dict.pkl', 'wb') as fd:
            pickle.dump(self.feature_dict, fd)

    def load(self):
        """Replace the mapping with the one pickled at ``'../../data/feature_dict.pkl'``."""
        # SECURITY: unpickling executes arbitrary code; only load files this
        # project wrote itself.
        with open('../../data/feature_dict.pkl', 'rb') as f:
            self.feature_dict = pickle.load(f, encoding='iso-8859-1')

    def renumerate(self):
        """Re-assign contiguous ids ``0..n-1`` following the dict's key iteration order."""
        keys = list(self.feature_dict.keys())
        self.feature_dict = {}
        for key in keys:
            self.feature_dict[key] = len(self.feature_dict)

In [79]:
class Vectorizer(object):
    """Turn a tokenized QA sample into word-vector matrices plus optional per-token features.

    Tokens are indexable records; this class reads the word at index 0, the
    ``[start, end]`` character offsets at index 2, the POS tag at index 3, the
    lemma at index 4 and the NER tag at index 5.

    Args:
        w2v_path: path of the word2vec-format embeddings passed to ``word2vec``.
        extra: when true, ``to_vector`` appends the columns from ``extra_features``.
        use: names of the feature families to compute, tested with ``in``
            (``'pos'``, ``'ner'``, ``'tf'``, ``'wiq'``, and ``'lemma'`` to enable
            lemma matching inside ``wiq``). The default also mentions
            ``'is_question'``, which nothing in this class reads.
        use_qc: pair ``(context, question)`` of flags selecting which side gets
            the extra features.
    """

    def __init__(self, w2v_path, extra = True, use='pos, ner, wiq, tf, is_question', use_qc = (True, False)):
        self.word_vector = word2vec(w2v_path)
        self.dict = FeatureDict()
        self.use = use
        self.extra = extra
        self.use_qc = use_qc

        # Drop the tag families that were not requested. Keys are matched on
        # their 'pos=' / 'ner=' prefix (the form FeatureDict.add_data creates)
        # rather than by substring, so a tag that merely contains 'pos' or
        # 'ner' cannot be removed by accident.
        for family in ('pos', 'ner'):
            if family not in use:
                prefix = family + '='
                for key in list(self.dict.feature_dict.keys()):
                    if key.startswith(prefix):
                        self.dict.feature_dict.pop(key, None)

        # Close the id gaps left by the removals before adding more columns.
        self.dict.renumerate()

        if 'tf' in use:
            self.dict.add_feature('tf')
            self.dict.add_feature('tf_rev')

        if 'wiq' in use:
            self.dict.add_feature('in_question')
            self.dict.add_feature('in_question_uncased')
            self.dict.add_feature('in_question_lemma')

    def extra_features(self, sample):
        """Compute the per-token feature matrices for one sample.

        Args:
            sample: dict with ``'context_tokens'`` and ``'question_tokens'``.

        Returns:
            ``[context_features, question_features]``. Each entry is a
            ``(n_tokens, n_features)`` array, or ``None`` when the matching
            ``use_qc`` flag is false.
        """
        feature_ids = self.dict.feature_dict
        n_features = len(feature_ids)

        def side_tokens(question):
            return sample['question_tokens' if question else 'context_tokens']

        def one_hot(features, prefix, column, question):
            # Set the column of '<prefix>=<tag>' for every token whose tag is known.
            for i, w in enumerate(side_tokens(question)):
                idx = feature_ids.get('%s=%s' % (prefix, w[column]))
                if idx is not None:
                    features[i][idx] = 1.0

        def wiq(features, question=False):
            # "Word in question" flags only make sense for context tokens.
            if question:
                return
            # Compare against question *tokens*; iterating the raw question
            # string would yield single characters.
            q_words_cased = {w[0] for w in sample['question_tokens']}
            q_words_uncased = {w.lower() for w in q_words_cased}
            q_lemma = {w[4] for w in sample['question_tokens']} if 'lemma' in self.use else None

            for i, token in enumerate(sample['context_tokens']):
                if token[0] in q_words_cased:
                    features[i][feature_ids['in_question']] = 1.0
                if token[0].lower() in q_words_uncased:
                    features[i][feature_ids['in_question_uncased']] = 1.0
                # Compare the lemma (index 4), not the whole token record.
                if q_lemma and token[4] in q_lemma:
                    features[i][feature_ids['in_question_lemma']] = 1.0

        def pos(features, question=False):
            one_hot(features, 'pos', 3, question)

        def ner(features, question=False):
            # NER features are stored under 'ner=<tag>', not 'pos=<tag>'.
            one_hot(features, 'ner', 5, question)

        def tf(features, question=False):
            # 'tf': relative frequency of the lower-cased word within its side;
            # 'tf_rev': token count divided by (word count + 1).
            words = [w[0].lower() for w in side_tokens(question)]
            counter = Counter(words)
            total = len(words)
            for i, word in enumerate(words):
                features[i][feature_ids['tf']] = counter[word] * 1.0 / total
                features[i][feature_ids['tf_rev']] = total / (counter[word] + 1.0)

        extractors = (('pos', pos), ('ner', ner), ('tf', tf), ('wiq', wiq))

        def build(question):
            features = np.zeros((len(side_tokens(question)), n_features))
            for name, extractor in extractors:
                if name in self.use:
                    extractor(features, question)
            return features

        context_features = build(False) if self.use_qc[0] else None
        question_features = build(True) if self.use_qc[1] else None

        return [context_features, question_features]

    def to_vector(self, sample, need_answer = True):
        """Vectorize one sample.

        Args:
            sample: dict with ``'context_tokens'`` and ``'question_tokens'``,
                plus ``'answer_start'`` / ``'answer_end'`` character offsets
                when ``need_answer`` is true.
            need_answer: whether to resolve and return the answer span.

        Returns:
            ``[[context_vecs, question_vecs], [answer_start, answer_end]]`` with
            the answer given as context-token indices, or
            ``[context_vecs, question_vecs]`` when ``need_answer`` is false.
            ``None`` when an answer offset falls inside no context token.
        """
        context_vecs = [self.word_vector(token[0]) for token in sample['context_tokens']]
        context_vecs = np.vstack(context_vecs).astype(np.float32)

        question_vecs = [self.word_vector(token[0]) for token in sample['question_tokens']]
        question_vecs = np.vstack(question_vecs).astype(np.float32)

        if self.extra:
            context_extra, question_extra = self.extra_features(sample)
            if self.use_qc[0]:
                context_vecs = np.hstack((context_vecs, context_extra))
            if self.use_qc[1]:
                question_vecs = np.hstack((question_vecs, question_extra))

        if not need_answer:
            return [context_vecs, question_vecs]

        context_char_offsets = [token[2] for token in sample['context_tokens']]
        answer_start, answer_end = sample['answer_start'], sample['answer_end']

        try:
            # Map each character offset to the token whose [start, end) span holds it.
            answer_start = [s <= answer_start < e
                            for s, e in context_char_offsets].index(True)
            answer_end = [s <= answer_end < e
                          for s, e in context_char_offsets].index(True)
        except ValueError:
            # The offset lies between tokens: the sample cannot be aligned.
            return None

        return [[context_vecs, question_vecs], [answer_start, answer_end]]

In [80]:
def chunks(l, n):
    """Generate consecutive slices of ``l``, each holding at most ``n`` items."""
    starts = range(0, len(l), n)
    for begin in starts:
        end = begin + n
        yield l[begin:end]

class Preprocessor(object):
    """Vectorize a list of samples, in worker processes when the list is large.

    Args:
        w2v_path: embeddings path forwarded to ``Vectorizer``.
        use: feature-family selection forwarded to ``Vectorizer``.
        use_qc: ``(context, question)`` flags forwarded to ``Vectorizer``.
        cpus: number of worker processes used for large inputs.
        need_answers: forwarded to ``Vectorizer.to_vector`` as ``need_answer``;
            also selects the layout returned by ``preprocess``.
        extra: forwarded to ``Vectorizer`` as ``extra``. Defaults to ``False``,
            the value that used to be hard-coded in ``worker``.
    """

    # Inputs shorter than this are vectorized in the current process.
    PARALLEL_THRESHOLD = 10000

    def __init__(self, w2v_path, use, use_qc, cpus=4, need_answers=True, extra=False):
        self.cpus = cpus
        self.use = use
        self.w2v_path = w2v_path
        self.use_qc = use_qc
        self.need_answers = need_answers
        self.extra = extra

    def worker(self, arr):
        """Vectorize every sample of ``arr`` with a freshly built ``Vectorizer``.

        Returns a list aligned with ``arr``; entries are ``None`` for samples
        that ``Vectorizer.to_vector`` could not align.
        """
        vectorizer = Vectorizer(w2v_path=self.w2v_path, extra=self.extra,
                                use=self.use, use_qc=self.use_qc)
        return [vectorizer.to_vector(sample, need_answer=self.need_answers)
                for sample in arr]

    def preprocess(self, samples):
        """Vectorize ``samples`` and transpose the result into per-field lists.

        Returns:
            With ``need_answers``:
            ``[[contexts, questions], [answer_starts, answer_ends]]``.
            Without: ``[contexts, questions]``.
            Samples for which the worker returned ``None`` are dropped.
        """
        if len(samples) < self.PARALLEL_THRESHOLD:
            vectors = self.worker(samples)
        else:
            # Ceiling division gives at most ``cpus`` chunks; ``round`` could
            # round down and leave a small extra chunk.
            chunk_size = max(1, -(-len(samples) // self.cpus))
            # The context manager releases the worker processes once ``map``
            # has returned all results.
            with Pool(self.cpus) as pool:
                nested_list = pool.map(self.worker, chunks(samples, chunk_size))
            vectors = [val for sublist in nested_list for val in sublist]

        vectors = [val for val in vectors if val is not None]

        if not self.need_answers:
            # Without answers each item is just [context_vecs, question_vecs].
            return [[item[0] for item in vectors],
                    [item[1] for item in vectors]]

        # Transpose
        data = [[[], []],
                [[], []]]

        for (context, question), (answer_start, answer_end) in vectors:
            data[0][0].append(context)
            data[0][1].append(question)
            data[1][0].append(answer_start)
            data[1][1].append(answer_end)

        return data

In [81]:
%%bash
ls


additional_features.ipynb
multithreading.ipynb

In [82]:
class ConsoleArgs(object):
    """Notebook stand-in for an object holding command-line options.

    Every option is a keyword argument whose default is the value this
    notebook uses, so ``ConsoleArgs()`` behaves as before while individual
    paths can be overridden without editing the class.

    Args:
        word2vec_path: word2vec-format embeddings file.
        outfile: destination pickle for the preprocessed data.
        data: tokenized SQuAD JSON file to read.
        use: feature-family selection handed to the preprocessor.
    """

    def __init__(self,
                 word2vec_path='../../data/word2vec_from_glove_300.vec',
                 outfile='../../data/check.pkl',
                 data='../../data/check_tokens.json',
                 use='pos, ner, wiq, tf'):
        self.word2vec_path = word2vec_path
        self.outfile = outfile
        self.data = data
        self.use = use

args = ConsoleArgs()

In [83]:
# Make sure the output file name carries the pickle extension.
if not args.outfile.endswith('.pkl'):
    args.outfile += '.pkl'

print('Reading SQuAD data... ', end='')
# JSON text is UTF-8 by specification; do not depend on the locale default.
with open(args.data, encoding='utf-8') as fd:
    samples = json.load(fd)
print('Done!')

# Collect the POS / NER tags seen in the data and persist the mapping.
print('Making feature dict... ', end='')
feature_dict = FeatureDict()
feature_dict.add_data(samples)
feature_dict.save()
print('Done!')

try:
    cpus = multiprocessing.cpu_count()
except NotImplementedError:
    cpus = 2  # arbitrary default


Reading SQuAD data... Done!
Making feature dict... Done!

In [84]:
len(samples)


Out[84]:
5196

In [85]:
samples[0]


Out[85]:
{'answer': 'in the late 1990s',
 'answer_end': 285,
 'answer_start': 269,
 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'context_tokens': [['When', 'When ', [0, 4], 'WRB', 'when', 'O'],
  ['did', 'did ', [5, 8], 'VBD', 'do', 'O'],
  ['Beyonce', 'Beyonce ', [9, 16], 'NNP', 'Beyonce', 'PERSON'],
  ['start', 'start ', [17, 22], 'VB', 'start', 'O'],
  ['becoming', 'becoming ', [23, 31], 'VBG', 'become', 'O'],
  ['popular', 'popular', [32, 39], 'JJ', 'popular', 'O'],
  ['?', '?', [39, 40], '.', '?', 'O']],
 'id': '56be85543aeaaa14008c9063',
 'question': 'When did Beyonce start becoming popular?',
 'question_tokens': [['When', 'When ', [0, 4], 'WRB', 'when', 'O'],
  ['did', 'did ', [5, 8], 'VBD', 'do', 'O'],
  ['Beyonce', 'Beyonce ', [9, 16], 'NNP', 'Beyonce', 'PERSON'],
  ['start', 'start ', [17, 22], 'VB', 'start', 'O'],
  ['becoming', 'becoming ', [23, 31], 'VBG', 'become', 'O'],
  ['popular', 'popular', [32, 39], 'JJ', 'popular', 'O'],
  ['?', '?', [39, 40], '.', '?', 'O']],
 'topic': 'Beyoncé'}

In [86]:
# Smoke test: vectorize only the first five samples, with the extra-feature
# flags enabled for both the context and the question side.
print('Processing SQuAD data... ', end='')
prepro = Preprocessor(
    w2v_path=args.word2vec_path,
    use=args.use,
    use_qc=(True, True),
    cpus=cpus
)
data = prepro.preprocess(samples[:5])
print('Done!')


Processing SQuAD data... Done!

In [76]:
data


Out[76]:
[[[array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
            0.        ,  0.        ],
          [ 0.12859   ,  0.25046   , -0.55467999, ..., -0.32962999,
            0.25841999,  0.30136001],
          [ 0.        ,  0.        ,  0.        , ...,  0.        ,
            0.        ,  0.        ],
          ..., 
          [ 0.23217   ,  0.065479  ,  0.66214001, ..., -0.34689999,
           -0.31128001,  0.011083  ],
          [ 0.10346   , -0.12694   ,  0.62326002, ..., -0.21291   ,
           -0.58504999, -0.21844   ],
          [-0.0833    , -0.20896   , -0.043623  , ..., -0.17745   ,
            0.055793  ,  0.80125999]], dtype=float32),
   array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
            0.        ,  0.        ],
          [-0.083992  ,  0.039025  , -0.065901  , ..., -0.35866001,
           -0.043888  , -0.27428001],
          [ 0.12859   ,  0.25046   , -0.55467999, ..., -0.32962999,
            0.25841999,  0.30136001],
          ..., 
          [-0.62721997,  0.11828   ,  0.70923001, ..., -0.23074999,
           -0.15401   , -0.68997997],
          [-0.26144999,  0.33976001, -0.095768  , ..., -0.28389001,
           -0.17033   ,  0.1158    ],
          [-0.0833    , -0.20896   , -0.043623  , ..., -0.17745   ,
            0.055793  ,  0.80125999]], dtype=float32),
   array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
            0.        ,  0.        ],
          [ 0.12859   ,  0.25046   , -0.55467999, ..., -0.32962999,
            0.25841999,  0.30136001],
          [ 0.        ,  0.        ,  0.        , ...,  0.        ,
            0.        ,  0.        ],
          ..., 
          [ 0.36574   , -0.11894   , -0.74641001, ..., -0.37042001,
           -0.04782   ,  0.14322001],
          [ 0.05803   , -0.027674  , -0.23252   , ...,  0.16777   ,
           -0.81129003,  0.093747  ],
          [-0.0833    , -0.20896   , -0.043623  , ..., -0.17745   ,
            0.055793  ,  0.80125999]], dtype=float32),
   array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
            0.        ,  0.        ],
          [-0.20017   ,  0.14302   ,  0.052055  , ...,  0.034939  ,
           -0.12599   ,  0.21863   ],
          [-0.28657001, -0.25597   , -0.17669   , ..., -0.46542999,
           -0.58546001,  0.17404   ],
          ..., 
          [-1.12409997,  0.14786001, -0.16429   , ..., -0.30471   ,
            0.22741   , -0.22732   ],
          [-0.26144999,  0.33976001, -0.095768  , ..., -0.28389001,
           -0.17033   ,  0.1158    ],
          [-0.0833    , -0.20896   , -0.043623  , ..., -0.17745   ,
            0.055793  ,  0.80125999]], dtype=float32),
   array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
            0.        ,  0.        ],
          [-0.22232001,  0.23856001, -0.048047  , ..., -0.32701001,
           -0.053744  , -0.41824001],
          [ 0.20830999, -0.055975  ,  0.32861999, ..., -0.16778   ,
            0.061694  , -0.086137  ],
          ..., 
          [-0.0085603 ,  0.094092  ,  0.33443999, ..., -0.41503999,
           -0.55855   ,  0.12241   ],
          [ 0.48642001,  0.23928   , -0.082238  , ..., -0.12247   ,
           -0.39736   ,  0.047344  ],
          [-0.0833    , -0.20896   , -0.043623  , ..., -0.17745   ,
            0.055793  ,  0.80125999]], dtype=float32)],
  [array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
            0.        ,  0.        ],
          [ 0.12859   ,  0.25046   , -0.55467999, ..., -0.32962999,
            0.25841999,  0.30136001],
          [ 0.        ,  0.        ,  0.        , ...,  0.        ,
            0.        ,  0.        ],
          ..., 
          [ 0.23217   ,  0.065479  ,  0.66214001, ..., -0.34689999,
           -0.31128001,  0.011083  ],
          [ 0.10346   , -0.12694   ,  0.62326002, ..., -0.21291   ,
           -0.58504999, -0.21844   ],
          [-0.0833    , -0.20896   , -0.043623  , ..., -0.17745   ,
            0.055793  ,  0.80125999]], dtype=float32),
   array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
            0.        ,  0.        ],
          [-0.083992  ,  0.039025  , -0.065901  , ..., -0.35866001,
           -0.043888  , -0.27428001],
          [ 0.12859   ,  0.25046   , -0.55467999, ..., -0.32962999,
            0.25841999,  0.30136001],
          ..., 
          [-0.62721997,  0.11828   ,  0.70923001, ..., -0.23074999,
           -0.15401   , -0.68997997],
          [-0.26144999,  0.33976001, -0.095768  , ..., -0.28389001,
           -0.17033   ,  0.1158    ],
          [-0.0833    , -0.20896   , -0.043623  , ..., -0.17745   ,
            0.055793  ,  0.80125999]], dtype=float32),
   array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
            0.        ,  0.        ],
          [ 0.12859   ,  0.25046   , -0.55467999, ..., -0.32962999,
            0.25841999,  0.30136001],
          [ 0.        ,  0.        ,  0.        , ...,  0.        ,
            0.        ,  0.        ],
          ..., 
          [ 0.36574   , -0.11894   , -0.74641001, ..., -0.37042001,
           -0.04782   ,  0.14322001],
          [ 0.05803   , -0.027674  , -0.23252   , ...,  0.16777   ,
           -0.81129003,  0.093747  ],
          [-0.0833    , -0.20896   , -0.043623  , ..., -0.17745   ,
            0.055793  ,  0.80125999]], dtype=float32),
   array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
            0.        ,  0.        ],
          [-0.20017   ,  0.14302   ,  0.052055  , ...,  0.034939  ,
           -0.12599   ,  0.21863   ],
          [-0.28657001, -0.25597   , -0.17669   , ..., -0.46542999,
           -0.58546001,  0.17404   ],
          ..., 
          [-1.12409997,  0.14786001, -0.16429   , ..., -0.30471   ,
            0.22741   , -0.22732   ],
          [-0.26144999,  0.33976001, -0.095768  , ..., -0.28389001,
           -0.17033   ,  0.1158    ],
          [-0.0833    , -0.20896   , -0.043623  , ..., -0.17745   ,
            0.055793  ,  0.80125999]], dtype=float32),
   array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
            0.        ,  0.        ],
          [-0.22232001,  0.23856001, -0.048047  , ..., -0.32701001,
           -0.053744  , -0.41824001],
          [ 0.20830999, -0.055975  ,  0.32861999, ..., -0.16778   ,
            0.061694  , -0.086137  ],
          ..., 
          [-0.0085603 ,  0.094092  ,  0.33443999, ..., -0.41503999,
           -0.55855   ,  0.12241   ],
          [ 0.48642001,  0.23928   , -0.082238  , ..., -0.12247   ,
           -0.39736   ,  0.047344  ],
          [-0.0833    , -0.20896   , -0.043623  , ..., -0.17745   ,
            0.055793  ,  0.80125999]], dtype=float32)]],
 [[269, 207, 526, 166, 276], [285, 225, 529, 179, 285]]]

In [78]:
data[0][0][0].shape


Out[78]:
(7, 300)

In [58]:
# NOTE(review): `nested_list` is only assigned as a local variable inside
# Preprocessor.preprocess; no cell visible in this notebook defines it at
# notebook level. This cell's execution count (58) is also lower than that of
# the current class definitions, so it relies on leftover kernel state and is
# not expected to survive Restart & Run All.
nested_list[2][105][0][1].shape


Out[58]:
(12, 300)

In [ ]: