In [1]:
from utils import *  # assumed to provide pd, np, os and the log object used below
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
import string
import re
import pickle
from tqdm import tqdm
# A custom stoplist
STOPLIST = set(stopwords.words('english') + ["n't", "'s", "'m", "ca"] + list(ENGLISH_STOP_WORDS))
# List of symbols we don't care about
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", "'ve"]
BASE_DIR = '.'
GLOVE_DIR = BASE_DIR + '/data/glove.6B/'
RAW_DATA_FN = BASE_DIR + '/data/train.csv'
MAX_SEQUENCE_LENGTH = 40 # 1000
VALIDATION_SPLIT = 0.2
# Output
OUTPUT_DIR = BASE_DIR + '/data/output'
WORD_INDEX_FN = os.path.join(OUTPUT_DIR, 'word_index.pckl')
EMB_INDEX_FN = os.path.join(OUTPUT_DIR, 'emb_index.pckl')
H5DATA_FN = os.path.join(OUTPUT_DIR, 'data.300.h5')
# Model Output
MODEL_NAME = '/yyy'
OUTPUT_DIR_MODEL = OUTPUT_DIR + MODEL_NAME
WEIGHTS_FN = OUTPUT_DIR_MODEL + '/weights.h5'
MODEL_FN = OUTPUT_DIR_MODEL + '/model.json'
MODEL_INTERRUPTED_FN = OUTPUT_DIR_MODEL + '/model_interrupted.h5'
CSV_FN = OUTPUT_DIR_MODEL + '/log.csv'
PLOT_MODEL_FN = OUTPUT_DIR_MODEL + '/model.png'
PLOT_BASIS_MODEL_FN = OUTPUT_DIR_MODEL + '/basis_model.png'
MONITOR_DISTANCE_FN = OUTPUT_DIR_MODEL + '/monitor_distance.csv'
In [3]:
def restore_sentence(array, ind2word):
    # rebuild a sentence from an index array, skipping the padding index 0
    s = []
    for i in array:
        if i != 0:
            s.append(ind2word[i])
    return " ".join(s)
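A minimal usage sketch (the toy mapping below is hypothetical; in this notebook an ind2word mapping is only obtained later by inverting a word index):
toy_ind2word = {1: 'what', 2: 'is', 3: 'this'}
restore_sentence([1, 2, 3, 0, 0], toy_ind2word)  # -> 'what is this'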
In [7]:
df = pd.read_csv(RAW_DATA_FN, nrows=1000)
#df = pd.read_csv(RAW_DATA_FN, encoding="ISO-8859-1")
In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# merge texts
questions = list(df['question1']) + list(df['question2'])
tfidf = TfidfVectorizer(lowercase=False)
tfidf.fit_transform(questions)
# dict mapping each word to its IDF weight (tfidf.idf_, not a per-document tf-idf score)
word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
Out[8]:
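The next cells look individual tokens up in word2tfidf, so a quick sanity check helps here (the words below are only illustrative; the values depend on the sampled rows):
# missing words fall back to 0.0, mirroring the try/except used in the weighting loop below
for w in ['What', 'the', 'India']:
    print(w, word2tfidf.get(w, 0.0))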
In [18]:
word2tfidf;
In [19]:
# Set up spaCy
#from spacy.en import English
#parser = English()
import spacy
nlp = spacy.load('en')
In [20]:
vecs1 = []
for qu in tqdm(list(df['question1'])):
    doc = nlp(qu)
    mean_vec = np.zeros([len(doc), 300])
    for word in doc:
        # word2vec
        vec = word.vector
        # fetch idf score; words unseen by the TfidfVectorizer get weight 0
        try:
            idf = word2tfidf[str(word)]
        except KeyError:
            idf = 0
        # accumulate the idf-weighted vector (broadcast over all rows of mean_vec)
        mean_vec += vec * idf
    # NOTE: because of the broadcasting above, this yields the idf-weighted *sum*, not a mean
    mean_vec = mean_vec.mean(axis=0)
    vecs1.append(mean_vec)
df['q1_feats'] = list(vecs1)
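As noted in the comment above, the loop effectively produces an idf-weighted sum. For comparison, a sketch of an explicitly weighted mean (idf_weighted_mean is a hypothetical helper, not the variant used elsewhere in this notebook):
def idf_weighted_mean(doc, word2tfidf, dim=300):
    # stack per-token vectors and idf weights, then average with explicit weights
    vecs = np.array([w.vector for w in doc]) if len(doc) else np.zeros((1, dim))
    idfs = np.array([word2tfidf.get(str(w), 0.0) for w in doc]) if len(doc) else np.zeros(1)
    total = idfs.sum()
    return (vecs * idfs[:, None]).sum(axis=0) / total if total > 0 else vecs.mean(axis=0)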
In [4]:
import argparse
parser = argparse.ArgumentParser(description="programpurpose")
parser.add_argument("-s", "--sample", help="run on sample", action="store_true")
args = parser.parse_args(["-s"])
limit = 10
df = pd.read_csv(RAW_DATA_FN, encoding="ISO-8859-1")
y_train = df['is_duplicate'].values
q1, q2 = [], [] # list of text samples
qid = df['id'].values
q1 = df['question1'].values
q2 = df['question2'].fillna("").values  # non-duplicates can be NaN
if args.sample:
    q1 = q1[:limit]
    q2 = q2[:limit]
    df = df[:limit]
texts = np.concatenate([q1, q2], axis=0)
texts.shape
df.head()
Out[4]:
Out[4]:
Out[4]:
In [5]:
# Set up spaCy
#from spacy.en import English
#parser = English()
import spacy
nlp = spacy.load('en')
In [6]:
w2f = {lex.orth_: lex.rank for lex in nlp.vocab}
f2w = {lex.rank: lex.orth_ for lex in nlp.vocab}
In [115]:
nlp.vocab[0].orth_, nlp.vocab[0].rank
Out[115]:
In [116]:
[f2w[i] for i in range(1,19)]
Out[116]:
In [7]:
def get_embeddings(vocab):
    max_rank = max(lex.rank for lex in vocab if lex.has_vector)
    # rank+1 because index 0 is NULL
    vectors = np.zeros((max_rank + 1, vocab.vectors_length), dtype='float32')
    for lex in vocab:
        if lex.has_vector:
            vectors[lex.rank] = lex.vector
    return len(vectors), vectors
#num_words, embedding_matrix = get_embMatrix(word_index, embeddings_index)
num_words, embedding_matrix = get_embeddings(nlp.vocab)
embedding_matrix.shape
Out[7]:
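A quick check that the rows of embedding_matrix line up with spaCy's rank indexing (illustrative; 'what' is just an example word and is assumed to have a vector in the loaded model):
lex = nlp.vocab['what']
if lex.has_vector:
    print(np.allclose(embedding_matrix[lex.rank], lex.vector))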
In [56]:
%%time
# vectorize the questions
vecs1 = [doc.vector for doc in nlp.pipe(df['question1'], n_threads=50)]
vecs1 = np.array(vecs1)
df['q1_feats'] = list(vecs1)
vecs2 = [doc.vector for doc in nlp.pipe(df['question2'], n_threads=50)]
vecs2 = np.array(vecs2)
df['q2_feats'] = list(vecs2)
# save features
pd.to_pickle(df, 'data/1_df.pkl')
In [65]:
df = pd.read_pickle('data/1_df.pkl')
df.head()
df.loc[1, 'question2']
Out[65]:
Out[65]:
In [158]:
def get_featuresSpacy(docs, max_length):
    docs = list(docs)
    word2ind = dict()
    Xs = np.zeros((len(docs), max_length), dtype='int32')
    for i, doc in enumerate(docs):
        j = 0
        for token in doc:
            token = nlp.vocab[token.lower_]
            word2ind[token.orth_] = token.rank
            if token.has_vector and not token.is_punct and not token.is_space:
                Xs[i, j] = token.rank  # frequency rank; 0 means padding or no vector
                j += 1
                if j >= max_length:
                    break
    log.info('Found %s unique tokens.' % len(word2ind))
    with open(WORD_INDEX_FN, 'wb') as f:
        pickle.dump(word2ind, f)
    log.info("Saved", data=WORD_INDEX_FN, vocab=len(word2ind))
    return Xs, word2ind
In [159]:
data, word2ind = get_featuresSpacy(list(nlp.pipe(texts, n_threads=40, batch_size=60000)), MAX_SEQUENCE_LENGTH)
data.shape
Out[159]:
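To eyeball the result, the rank rows can be mapped back to text with restore_sentence from above (a quick check; it assumes every rank stored in data also appears in word2ind, which holds by construction):
ind2word = {rank: word for word, rank in word2ind.items()}
restore_sentence(data[0], ind2word)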
In [146]:
word2ind
Out[146]:
In [105]:
data[0]
Out[105]:
In [69]:
# tokenize the questions
q1_docs = list(nlp.pipe(df['question1'], batch_size=5000, n_threads=8))
#dev_docs = list(nlp.pipe(dev_texts, batch_size=5000, n_threads=3))
with open('q1_docs.pckl', 'wb') as f:
pickle.dump(q1_docs, f)
In [70]:
# tokenize the questions
q2_docs = list(nlp.pipe(df['question2'], batch_size=5000, n_threads=8))
#dev_docs = list(nlp.pipe(dev_texts, batch_size=5000, n_threads=3))
with open('q2_docs.pckl', 'wb') as f:
pickle.dump(q2_docs, f)
In [71]:
len(q1_docs), len(q2_docs)
assert len(q1_docs) == len(q2_docs)
Out[71]:
In [74]:
# get_features: presumably an earlier/utils variant of get_featuresSpacy that returns only the matrix
q1_feats = get_features(q1_docs, 50)
q1_feats.shape
q1_feats[0]
q1_docs[0]
Out[74]:
Out[74]:
Out[74]:
In [76]:
# frequency == rank != index in vocab
w = q1_docs[0][0]
w, w.rank
nlp.vocab['What'].orth
Out[76]:
Out[76]:
In [77]:
# how the integer vocab index (orth id) and the frequency rank relate
nlp.vocab[727].orth_
nlp.vocab[727].rank
Out[77]:
Out[77]:
In [80]:
w2f['What'], f2w[142]
Out[80]:
In [93]:
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
In [147]:
from keras.preprocessing.text import Tokenizer          # may already be provided by utils
from keras.preprocessing.sequence import pad_sequences  # may already be provided by utils

def get_featuresKeras(texts, max_length):
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word2ind = tokenizer.word_index
    #ind2word = {v: k for k, v in word2ind.items()}
    log.info('Found %s unique tokens.' % len(word2ind))
    with open(WORD_INDEX_FN, 'wb') as f:
        pickle.dump(word2ind, f)
    log.info("Saved", data=WORD_INDEX_FN, vocab=len(word2ind))
    # NOTE: maxlen=None pads to the longest sequence, so max_length is currently ignored
    #Xs = pad_sequences(sequences, maxlen=max_length)
    Xs = pad_sequences(sequences, maxlen=None)
    return Xs, word2ind
In [148]:
#data = get_featuresSpacy(list(nlp.pipe(texts, n_threads=40, batch_size=60000)), MAX_SEQUENCE_LENGTH)
data, word2indK = get_featuresKeras(texts, MAX_SEQUENCE_LENGTH)
data.shape
Out[148]:
In [149]:
word2indK
Out[149]:
In [160]:
# compare the vocabularies produced by the Keras and spaCy tokenizations
set(word2ind.keys()) - set(word2indK.keys())
set(word2indK.keys()) - set(word2ind.keys())
Out[160]:
Out[160]:
In [157]:
[s for s in texts if 'oxidee' in s]  # questions that actually contain 'oxidee'
Out[157]:
In [110]:
data[0]
Out[110]:
In [81]:
def index_emb(args):
    """
    1. build index, mapping words in the embeddings set to their embedding vector
    """
    log.info('Indexing word vectors.')
    embeddings_index = {}
    with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    log.info('Found word vectors.', n=len(embeddings_index))
    with open(EMB_INDEX_FN, 'wb') as f:
        pickle.dump(embeddings_index, f)
    log.info("Saved", fn=EMB_INDEX_FN)
    return embeddings_index
In [91]:
# embedding_matrix[0] == 0
# no offset of 1 needed as seen in some examples
def get_embMatrix(word_index, embeddings_index):
    '''
    build the embedding matrix for the given word index from the pre-trained embeddings index
    '''
    log.info('Preparing embedding matrix.')
    num_words = min(MAX_NB_WORDS, len(word_index))
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i >= num_words:  # also guards against an IndexError when len(word_index) < MAX_NB_WORDS
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in the embedding index will be/stay all-zeros
            embedding_matrix[i] = embedding_vector
    return num_words, embedding_matrix
In [83]:
embeddings_index = index_emb(None)
In [87]:
with open(WORD_INDEX_FN, 'rb') as f:
    word2ind = pickle.load(f)
with open(EMB_INDEX_FN, 'rb') as f:
    embeddings_index = pickle.load(f)
len(word2ind), len(embeddings_index)
Out[87]:
In [85]:
i = 0
for k, v in word2ind.items():
    print(k, v)
    i += 1
    if i == 10:
        break
word2ind["paperback'"]
Out[85]:
In [42]:
embeddings_index['hallo']
Out[42]:
In [90]:
num_wordsK, embedding_matrixK = get_embMatrix(word2ind, embeddings_index)
embedding_matrixK.shape
Out[90]:
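As a sketch of how num_wordsK and embedding_matrixK would typically be wired into a Keras model (illustrative only; the actual model built from these weights is defined elsewhere, and input_length follows the MAX_SEQUENCE_LENGTH constant above even though the data here was padded with maxlen=None):
from keras.layers import Embedding

# frozen embedding layer initialised with the pre-trained GloVe weights
embedding_layer = Embedding(num_wordsK,
                            EMBEDDING_DIM,
                            weights=[embedding_matrixK],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)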