In [1]:
#1 tf-idf
#2 tf-idf + nltk.pos_tag
#3 add n-grams
#4 use microsoft nnet
#5 use keras embedding with cnn
#Previous score
# Log_loss - 10.3227065134
# Accuracy - 0.70113037671
In [1]:
def submit(y_pred, test, filename):
    # Write predictions in Kaggle's submission format: test_id, is_duplicate.
    sub = pd.DataFrame()
    sub['test_id'] = test['test_id']
    sub['is_duplicate'] = y_pred
    sub.to_csv(filename, index=False)
def save_sparse_csr(filename, array):
    # Persist a scipy CSR matrix through numpy's .npz container.
    np.savez(filename, data=array.data, indices=array.indices,
             indptr=array.indptr, shape=array.shape)

def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])
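A quick round-trip check of the sparse helpers (a minimal sketch; the /tmp path is only illustrative):
In [ ]:
m = csr_matrix(np.eye(3))
save_sparse_csr('/tmp/csr_check', m)        # np.savez appends the .npz extension
m2 = load_sparse_csr('/tmp/csr_check.npz')
assert (m != m2).nnz == 0                   # no differing entries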
In [15]:
def correct_dataset(dataset):
    # Identical question pairs are duplicates by definition.
    dataset.loc[(dataset['question1'] == dataset['question2']), 'is_duplicate'] = 1
    return dataset

def process_dataset(dataset, correct=False):
    # The flag is named `correct` so it does not shadow correct_dataset above.
    dataset['question1'].fillna(' ', inplace=True)
    dataset['question2'].fillna(' ', inplace=True)
    # delete punctuation
    dataset['question1'] = dataset['question1'].str.replace(r'[^\w\s]', '', regex=True)
    dataset['question2'] = dataset['question2'].str.replace(r'[^\w\s]', '', regex=True)
    # lowercase questions
    dataset['question1'] = dataset['question1'].str.lower()
    dataset['question2'] = dataset['question2'].str.lower()
    # concatenate the two questions into one field
    dataset['union'] = dataset['question1'].str.cat(dataset['question2'], sep=' ')
    if correct:
        return correct_dataset(dataset)
    return dataset

def split_and_rem_stop_words(line):
    # A set gives O(1) membership tests for the stop-word filter.
    cachedStopWords = set(stopwords.words("english"))
    return [word for word in line.split() if word not in cachedStopWords]
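A small smoke test of the preprocessing helpers (a minimal sketch; assumes nltk.download('stopwords') has been run once):
In [ ]:
toy = pd.DataFrame({'question1': ['What is AI?'], 'question2': ['What is AI, really?']})
toy = process_dataset(toy)
print(toy['union'].iloc[0])                            # what is ai what is ai really
print(split_and_rem_stop_words(toy['union'].iloc[0]))  # ['ai', 'ai', 'really']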
In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import nltk
import sklearn as sk
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix  # required by load_sparse_csr
# nltk.download('stopwords')  # one-time download if the corpus is missing
In [4]:
train = pd.read_csv('../datasets/train.csv')
test = pd.read_csv('../datasets/test.csv')
train = process_dataset(train)
test = process_dataset(test)
In [18]:
train.head()
Out[18]:
In [7]:
TF_IDF = TfidfVectorizer(min_df=3, ngram_range=(1, 2), max_df=0.3, stop_words='english')
# Alternative: fit the vocabulary on train and test together.
# TF_IDF = TF_IDF.fit(pd.concat([train['union'], test['union']]).tolist())
# Here the vectorizer is fit on the test corpus only.
TF_IDF = TF_IDF.fit(test['union'].tolist())
X = TF_IDF.transform(train['union'].tolist())
X_test = TF_IDF.transform(test['union'].tolist())
y = train['is_duplicate'].tolist()
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.10, random_state=42)
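A quick look at what the fitted vectorizer produced (a minimal sketch):
In [ ]:
print('vocabulary size: {}'.format(len(TF_IDF.vocabulary_)))
print('avg non-zero features per train row: {:.1f}'.format(X.nnz / float(X.shape[0])))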
In [8]:
X_train.shape
Out[8]:
In [9]:
X_test.shape
Out[9]:
In [10]:
rf = RandomForestClassifier(random_state=17, max_depth=10, n_estimators=100, n_jobs=-1, class_weight='balanced')
In [11]:
rf.fit(X=X_train, y=y_train)
Out[11]:
In [12]:
y_pred = rf.predict(X_val)
y_proba = rf.predict_proba(X_val)[:, 1]
# log_loss expects probabilities; hard 0/1 labels get clipped and
# blow the score up (see the ~10.32 'Previous score' above).
l_loss = log_loss(y_pred=y_proba, y_true=y_val)
acc = accuracy_score(y_pred=y_pred, y_true=y_val)
print ('Log_loss - {}'.format(l_loss))
print ('Accuracy - {}'.format(acc))
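For reference, computing log loss on hard labels reproduces the inflated score noted at the top (a minimal sketch):
In [ ]:
# With 0/1 predictions log_loss clips each wrong label to eps,
# contributing about -log(1e-15) ~ 34.5 per error, hence the ~10.32 above.
print(log_loss(y_true=y_val, y_pred=rf.predict(X_val)))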
In [14]:
rf = RandomForestClassifier(random_state=17, max_depth=10, n_estimators=100, n_jobs=-1, class_weight='balanced')
rf.fit(X=X, y=y)
# Submit duplicate probabilities rather than hard labels, since the
# competition metric is log loss.
y_test = rf.predict_proba(X_test)[:, 1]
In [15]:
submit(y_test, test, '../submissions/1_2_gramm.csv')
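A quick check of the written submission (a minimal sketch):
In [ ]:
pd.read_csv('../submissions/1_2_gramm.csv').head()  # expect columns: test_id, is_duplicate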
In [19]:
save_sparse_csr(array=X, filename='../np_saved/x_train_1_2_gramm')
save_sparse_csr(array=X_test, filename='../np_saved/x_test_1_2_gramm')
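The saved matrices can be restored later without refitting the vectorizer (a minimal sketch; note that np.savez appends the .npz extension):
In [ ]:
X_back = load_sparse_csr('../np_saved/x_train_1_2_gramm.npz')
assert X_back.shape == X.shape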