Special thanks to Irina Golzmann for help with the technical part.
You will require nltk v3.2 to solve this assignment.
It is really important that the version is 3.2, otherwise the Russian tokenizer might not work.
Install/update
sudo pip install --upgrade nltk==3.2
sudo pip install --upgrade pip
If for some reason you can't or won't switch to nltk v3.2, just make sure that Russian words are tokenized properly with RegexpTokenizer.
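A quick way to check that (a minimal sanity-check sketch; the sample phrase below is arbitrary): the tokenizer should split Cyrillic text into words rather than return an empty list.
In [ ]:
#Sanity check (sketch): with nltk 3.2, \w+ should match Cyrillic word characters.
#An empty result here means the tokenizer is not unicode-aware and nltk needs updating.
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r"\w+")
sample = u"Продам котёнка недорого"  #any Russian phrase will do
print u"|".join(tokenizer.tokenize(sample))
#expected output: Продам|котёнка|недорого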
In [ ]:
low_RAM_mode = True
very_low_RAM = False #If you have <3GB RAM, set BOTH to true
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
Ex-Kaggle competition on prohibited content detection
The competition description can be found here: https://www.kaggle.com/c/avito-prohibited-content
In high-RAM mode we use the full avito_train.tsv; in low-RAM mode, the downsampled avito_train_1kk.tsv (see the loading cell below).
The data comes with different kinds of features (texts, count features, categories) and only one binary target: whether or not the advertisement contains prohibited materials.
In [ ]:
if not low_RAM_mode:
    #a lot of RAM
    df = pd.read_csv("avito_train.tsv",sep='\t')
else:
    #around 4GB RAM
    df = pd.read_csv("avito_train_1kk.tsv",sep='\t')
In [ ]:
print df.shape, df.is_blocked.mean()
df[:5]
In [ ]:
print "Blocked ratio",df.is_blocked.mean()
print "Count:",len(df)
In [ ]:
#downsample
<downsample data so that both classes have approximately equal ratios>
df = <downsampled dataset>
print "Blocked ratio:",df.is_blocked.mean()
print "Count:",len(df)
In [ ]:
assert df.is_blocked.mean() < 0.51
assert df.is_blocked.mean() > 0.49
assert len(df) <= 560000
print "All tests passed"
In [ ]:
#In case your RAM-o-meter is in the red
if very_low_RAM:
    df = df[::2]
In [ ]:
from nltk.tokenize import RegexpTokenizer
from collections import Counter,defaultdict
tokenizer = RegexpTokenizer(r"\w+")
#Dictionary of tokens
token_counts = Counter()
#All texts
all_texts = np.hstack([df.description.values,df.title.values])
#Compute token frequencies
for s in all_texts:
    if type(s) is not str:
        continue
    s = s.decode('utf8').lower()
    tokens = tokenizer.tokenize(s)
    for token in tokens:
        token_counts[token] += 1
In [ ]:
#Word frequency distribution, just for kicks
_=plt.hist(token_counts.values(),range=[0,50],bins=50)
In [ ]:
#Select only the tokens that occurred at least 10 times in the corpus.
#Use token_counts.
min_count = 10
tokens = <tokens from token_counts keys that had at least min_count occurrences throughout the dataset>
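For reference, a one-line filter over token_counts is enough here (sketch):
In [ ]:
#Sketch: keep only tokens seen at least min_count times
tokens = [t for t, count in token_counts.items() if count >= min_count]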
In [ ]:
token_to_id = {t:i+1 for i,t in enumerate(tokens)}
null_token = "NULL"
token_to_id[null_token] = 0
In [ ]:
print "# Tokens:",len(token_to_id)
if len(token_to_id) < 30000:
print "Alarm! It seems like there are too few tokens. Make sure you updated NLTK and applied correct thresholds -- unless you now what you're doing, ofc"
if len(token_to_id) > 1000000:
print "Alarm! Too many tokens. You might have messed up when pruning rare ones -- unless you know what you're doin' ofc"
In [ ]:
def vectorize(strings, token_to_id, max_len=150):
    token_matrix = []
    for s in strings:
        if type(s) is not str:
            token_matrix.append([0]*max_len)
            continue
        s = s.decode('utf8').lower()
        tokens = tokenizer.tokenize(s)
        token_ids = map(lambda token: token_to_id.get(token,0), tokens)[:max_len]
        token_ids += [0]*(max_len - len(token_ids))
        token_matrix.append(token_ids)
    return np.array(token_matrix)
In [ ]:
desc_tokens = vectorize(df.description.values,token_to_id,max_len = 150)
title_tokens = vectorize(df.title.values,token_to_id,max_len = 15)
In [ ]:
print "Размер матрицы:",title_tokens.shape
for title, tokens in zip(df.title.values[:3],title_tokens[:3]):
print title,'->', tokens[:10],'...'
As you can see, our preprocessing is somewhat crude. Let us see whether it is enough for our network.
In [ ]:
#All numeric features
df_numerical_features = df[["phones_cnt","emails_cnt","urls_cnt","price"]]
In [ ]:
#One-hot-encoded category and subcategory
from sklearn.feature_extraction import DictVectorizer
categories = []
data_cat_subcat = df[["category","subcategory"]].values
categories = [A list of dictionaries {"category":category_name, "subcategory":subcategory_name} for each data sample]
vectorizer = DictVectorizer(sparse=False)
cat_one_hot = vectorizer.fit_transform(categories)
cat_one_hot = pd.DataFrame(cat_one_hot,columns=vectorizer.feature_names_)
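The categories list above can be built with a simple comprehension over data_cat_subcat (sketch):
In [ ]:
#Sketch: one {"category", "subcategory"} dict per sample, ready for DictVectorizer
categories = [{"category": cat, "subcategory": subcat}
              for cat, subcat in data_cat_subcat]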
In [ ]:
df_non_text = pd.merge(
    df_numerical_features, cat_one_hot, on=np.arange(len(cat_one_hot))
)
del df_non_text["key_0"]
In [ ]:
#Target variable - whether or not sample contains prohibited material
target = df.is_blocked.values.astype('int32')
#Preprocessed titles
title_tokens = title_tokens.astype('int32')
#Preprocessed tokens
desc_tokens = desc_tokens.astype('int32')
#Non-sequences
df_non_text = df_non_text.astype('float32')
In [ ]:
#Split into training and test set.
#Difficulty selector:
#Easy: split randomly
#Medium: select test set items that have item_ids strictly above those of the training set
#Hard: do whatever you want, but score yourself using kaggle private leaderboard
title_tr,title_ts,desc_tr,desc_ts,nontext_tr,nontext_ts,target_tr,target_ts = <define_these_variables>
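For the easy option, a plain random split is one way to go (a sketch; in older sklearn versions train_test_split lives in sklearn.cross_validation rather than sklearn.model_selection, and test_size=0.25 is an arbitrary choice):
In [ ]:
#Sketch for the "easy" option: random split; outputs follow train_test_split's interleaved convention
from sklearn.model_selection import train_test_split  #sklearn.cross_validation in older versions

(title_tr, title_ts,
 desc_tr, desc_ts,
 nontext_tr, nontext_ts,
 target_tr, target_ts) = train_test_split(title_tokens, desc_tokens,
                                           df_non_text.values, target,
                                           test_size=0.25, random_state=42)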
In [ ]:
save_prepared_data = True   #save
read_prepared_data = False  #load

#but not both at once
assert not (save_prepared_data and read_prepared_data)

if save_prepared_data:
    print "Saving preprocessed data (may take up to 3 minutes)"

    import pickle
    data_tuple = (title_tr,title_ts,desc_tr,desc_ts,nontext_tr,nontext_ts,target_tr,target_ts)

    with open("preprocessed_data.pcl",'wb') as fout:
        pickle.dump(data_tuple,fout)
    with open("token_to_id.pcl",'wb') as fout:
        pickle.dump(token_to_id,fout)

    print "done"

elif read_prepared_data:
    print "Reading saved data..."

    import pickle
    with open("preprocessed_data.pcl",'rb') as fin:
        data_tuple = pickle.load(fin)
    title_tr,title_ts,desc_tr,desc_ts,nontext_tr,nontext_ts,target_tr,target_ts = data_tuple
    with open("token_to_id.pcl",'rb') as fin:
        token_to_id = pickle.load(fin)

    #Re-importing libraries to allow starting the notebook from here
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    %matplotlib inline

    print "done"
Since we have several data sources, our neural network may differ from the ones you are used to working with: it takes three inputs (description tokens, title tokens, and the non-sequence features), which must be blended somehow, e.g. concatenated or added.
In [1]:
#libraries
import lasagne
from theano import tensor as T
import theano
In [ ]:
#3 inputs and a target output
title_token_ids = T.matrix("title_token_ids",dtype='int32')
desc_token_ids = T.matrix("desc_token_ids",dtype='int32')
categories = T.matrix("categories",dtype='float32')
target_y = T.ivector("is_blocked")
In [ ]:
title_inp = lasagne.layers.InputLayer((None,title_tr.shape[1]),input_var=title_token_ids)
descr_inp = lasagne.layers.InputLayer((None,desc_tr.shape[1]),input_var=desc_token_ids)
cat_inp = lasagne.layers.InputLayer((None,nontext_tr.shape[1]), input_var=categories)
In [ ]:
# Descriptions
#word-wise embedding. We recommend starting with around 64 units and increasing it once you are certain everything works.
descr_nn = lasagne.layers.EmbeddingLayer(descr_inp,
                                         input_size=len(token_to_id)+1,
                                         output_size=?)

#reshape from [batch, time, unit] to [batch, unit, time] to allow 1d convolution over time
descr_nn = lasagne.layers.DimshuffleLayer(descr_nn, [0,2,1])

descr_nn = <1D convolution over embedding, maybe several ones in a stack>

#pool over time
descr_nn = lasagne.layers.GlobalPoolLayer(descr_nn,T.max)

#Possible improvements here are adding several parallel convs with different filter sizes or stacking them the usual way:
#1d conv -> 1d max pool -> 1d conv and finally a global pool
# Titles
title_nn = <Process titles somehow (title_inp)>
# Non-sequences
cat_nn = <Process non-sequences(cat_inp)>
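A possible set of branches, just to give a sense of scale (a sketch: the embedding sizes, filter counts and filter_size=3 are arbitrary starting points, not tuned values):
In [ ]:
#Sketch: one conv branch per text input, a dense layer for the non-sequence features

# Descriptions: embedding -> 1d conv -> global max pool
descr_nn = lasagne.layers.EmbeddingLayer(descr_inp,
                                         input_size=len(token_to_id)+1,
                                         output_size=64)
descr_nn = lasagne.layers.DimshuffleLayer(descr_nn, [0,2,1])
descr_nn = lasagne.layers.Conv1DLayer(descr_nn, num_filters=128, filter_size=3)
descr_nn = lasagne.layers.GlobalPoolLayer(descr_nn, T.max)

# Titles: same idea, smaller
title_nn = lasagne.layers.EmbeddingLayer(title_inp,
                                         input_size=len(token_to_id)+1,
                                         output_size=32)
title_nn = lasagne.layers.DimshuffleLayer(title_nn, [0,2,1])
title_nn = lasagne.layers.Conv1DLayer(title_nn, num_filters=64, filter_size=3)
title_nn = lasagne.layers.GlobalPoolLayer(title_nn, T.max)

# Non-sequence features: a plain dense layer
cat_nn = lasagne.layers.DenseLayer(cat_inp, 128)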
In [ ]:
nn = <merge three layers into one (e.g. lasagne.layers.concat) >
nn = lasagne.layers.DenseLayer(nn,your_lucky_number)
nn = lasagne.layers.DropoutLayer(nn,p=maybe_use_me)
nn = lasagne.layers.DenseLayer(nn,1,nonlinearity=lasagne.nonlinearities.linear)
binary = True
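One way to finish the model (sketch; 256 hidden units and p=0.5 dropout are assumptions to tune, and the output stays a single linear unit to match the hinge loss below):
In [ ]:
#Sketch: concatenate the three branches and add a small dense head
nn = lasagne.layers.concat([descr_nn, title_nn, cat_nn])
nn = lasagne.layers.DenseLayer(nn, 256)
nn = lasagne.layers.DropoutLayer(nn, p=0.5)
nn = lasagne.layers.DenseLayer(nn, 1, nonlinearity=lasagne.nonlinearities.linear)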
In [ ]:
#All trainable params
weights = lasagne.layers.get_all_params(nn,trainable=True)
In [ ]:
#Simple NN prediction
prediction = lasagne.layers.get_output(nn)[:,0]
#Hinge loss
loss = lasagne.objectives.binary_hinge_loss(prediction,target_y,delta = what_do_you_think).mean()
In [ ]:
#Weight optimization step
updates = <your favorite optimizer>
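Any optimizer from lasagne.updates will do here; adam with its default hyperparameters is a reasonable first guess (sketch):
In [ ]:
#Sketch: adam with default hyperparameters; adamax, rmsprop or nesterov_momentum work too
updates = lasagne.updates.adam(loss, weights)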
In [ ]:
#deterministic version
det_prediction = lasagne.layers.get_output(nn,deterministic=True)[:,0]
#equivalent loss function
det_loss = <an exercise in copy-pasting and editing>
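The deterministic loss is the same expression computed on det_prediction (sketch; delta=1. is just the default margin, use whatever you chose above):
In [ ]:
#Sketch: same hinge loss, but on the deterministic (no-dropout) prediction
det_loss = lasagne.objectives.binary_hinge_loss(det_prediction, target_y, delta=1.).mean()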
In [ ]:
train_fun = theano.function([desc_token_ids,title_token_ids,categories,target_y],[loss,prediction],updates = updates)
eval_fun = theano.function([desc_token_ids,title_token_ids,categories,target_y],[det_loss,det_prediction])
In [ ]:
#average precision at K
from oracle import APatK, score
In [ ]:
# Our good old minibatch iterator now supports an arbitrary number of arrays (X,y,z)
def iterate_minibatches(*arrays,**kwargs):
    batchsize = kwargs.get("batchsize",100)
    shuffle = kwargs.get("shuffle",True)

    if shuffle:
        indices = np.arange(len(arrays[0]))
        np.random.shuffle(indices)

    for start_idx in range(0, len(arrays[0]) - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield [arr[excerpt] for arr in arrays]
Here an "epoch" is just a fixed number of minibatches, so n_epochs = 10**10 and manual interrupting is still an option.

Tips:
With small minibatches_per_epoch, network quality may jump around 0.5 for several epochs.
AUC is the most stable of all three metrics.
Average Precision at top 2.5% (APatK) is the least stable. If batch_size*minibatches_per_epoch < 10k, it behaves as a uniform random variable.
Plotting the metrics over training time may be a good way to analyze which architectures work better.
Once you are sure your network isn't going to crash, it is worth letting it train for a few hours of an average laptop's time to see its true potential.
In [ ]:
from sklearn.metrics import roc_auc_score, accuracy_score
n_epochs = 100
batch_size = 100
minibatches_per_epoch = 100
for i in range(n_epochs):

    #training
    epoch_y_true = []
    epoch_y_pred = []
    b_c = b_loss = 0

    for j, (b_desc,b_title,b_cat, b_y) in enumerate(
            iterate_minibatches(desc_tr,title_tr,nontext_tr,target_tr,batchsize=batch_size,shuffle=True)):
        if j > minibatches_per_epoch: break
        loss,pred_probas = train_fun(b_desc,b_title,b_cat,b_y)
        b_loss += loss
        b_c += 1
        epoch_y_true.append(b_y)
        epoch_y_pred.append(pred_probas)

    epoch_y_true = np.concatenate(epoch_y_true)
    epoch_y_pred = np.concatenate(epoch_y_pred)

    print "Train:"
    print '\tloss:',b_loss/b_c
    print '\tacc:',accuracy_score(epoch_y_true,epoch_y_pred>0.)
    print '\tauc:',roc_auc_score(epoch_y_true,epoch_y_pred)
    print '\tap@k:',APatK(epoch_y_true,epoch_y_pred,K = int(len(epoch_y_pred)*0.025)+1)

    #evaluation
    epoch_y_true = []
    epoch_y_pred = []
    b_c = b_loss = 0

    for j, (b_desc,b_title,b_cat, b_y) in enumerate(
            iterate_minibatches(desc_ts,title_ts,nontext_ts,target_ts,batchsize=batch_size,shuffle=True)):
        if j > minibatches_per_epoch: break
        loss,pred_probas = eval_fun(b_desc,b_title,b_cat,b_y)
        b_loss += loss
        b_c += 1
        epoch_y_true.append(b_y)
        epoch_y_pred.append(pred_probas)

    epoch_y_true = np.concatenate(epoch_y_true)
    epoch_y_pred = np.concatenate(epoch_y_pred)

    print "Val:"
    print '\tloss:',b_loss/b_c
    print '\tacc:',accuracy_score(epoch_y_true,epoch_y_pred>0.)
    print '\tauc:',roc_auc_score(epoch_y_true,epoch_y_pred)
    print '\tap@k:',APatK(epoch_y_true,epoch_y_pred,K = int(len(epoch_y_pred)*0.025)+1)
In [2]:
print "If you are seeing this, it's time to backup your notebook. No, really, 'tis too easy to mess up everything without noticing. "
In [ ]:
#evaluation
epoch_y_true = []
epoch_y_pred = []
b_c = b_loss = 0

for j, (b_desc,b_title,b_cat, b_y) in enumerate(
        iterate_minibatches(desc_ts,title_ts,nontext_ts,target_ts,batchsize=batch_size,shuffle=True)):
    loss,pred_probas = eval_fun(b_desc,b_title,b_cat,b_y)
    b_loss += loss
    b_c += 1
    epoch_y_true.append(b_y)
    epoch_y_pred.append(pred_probas)

epoch_y_true = np.concatenate(epoch_y_true)
epoch_y_pred = np.concatenate(epoch_y_pred)

final_accuracy = accuracy_score(epoch_y_true,epoch_y_pred>0)
final_auc = roc_auc_score(epoch_y_true,epoch_y_pred)
final_apatk = APatK(epoch_y_true,epoch_y_pred,K = int(len(epoch_y_pred)*0.025)+1)

print "Scores:"
print '\tloss:',b_loss/b_c
print '\tacc:',final_accuracy
print '\tauc:',final_auc
print '\tap@k:',final_apatk
score(final_accuracy,final_auc,final_apatk)
Remember the training, Luke
etc etc etc
If you have a background in text processing, there may be a way to improve the tokenizer, add some lemmatization, and so on.
{How did you shape the monster?}
In [ ]: