In [1]:
#1 tf-idf 
#2 tf-idf + nltk.pos_tag
#3 add n-grams
#4 use microsoft nnet
#5 use keras embedding with cnn

#Previous score
# Log_loss - 10.3227065134
# Acc_loss - 0.70113037671

In [1]:
def submit(y_pred, test, filename):
    sub = pd.DataFrame()
    sub['test_id'] = test['test_id']
    sub['is_duplicate'] = y_pred
    sub.to_csv(filename, index=False)

def save_sparse_csr(filename, array):
    np.savez(filename, data=array.data, indices=array.indices,
             indptr=array.indptr, shape=array.shape)

def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])
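
One wrinkle worth noting: np.savez appends a .npz suffix when the filename lacks one, while np.load expects the exact on-disk name. A minimal round-trip sketch (the toy matrix is just for illustration):

import numpy as np
from scipy.sparse import csr_matrix

toy = csr_matrix(np.eye(3))                   # any CSR matrix works
save_sparse_csr('toy_matrix', toy)            # writes 'toy_matrix.npz'
restored = load_sparse_csr('toy_matrix.npz')  # load with the suffix
assert (toy != restored).nnz == 0             # round trip preserves the matrix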

In [15]:
def correct_dataset(dataset):
    # literally identical question pairs must be duplicates
    dataset.loc[(dataset['question1'] == dataset['question2']), 'is_duplicate'] = 1
    return dataset

def process_dataset(dataset, correct=False):
    dataset['question1'].fillna(' ', inplace=True)
    dataset['question2'].fillna(' ', inplace=True)

    # delete punctuation
    dataset['question1'] = dataset['question1'].str.replace(r'[^\w\s]', '', regex=True)
    dataset['question2'] = dataset['question2'].str.replace(r'[^\w\s]', '', regex=True)

    # lowercase questions
    dataset['question1'] = dataset['question1'].str.lower()
    dataset['question2'] = dataset['question2'].str.lower()

    # concatenate both questions into a single text field
    dataset['union'] = dataset['question1'].str.cat(dataset['question2'], sep=' ')

    if correct:
        return correct_dataset(dataset)
    return dataset

def split_and_rem_stop_words(line):
    # tokenize on whitespace and drop English stop words (set lookup is O(1))
    cached_stop_words = set(stopwords.words("english"))
    return [word for word in line.split() if word not in cached_stop_words]

In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import nltk
import sklearn as sk
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix  # used by load_sparse_csr above

In [4]:
train = pd.read_csv('../datasets/train.csv')
test = pd.read_csv('../datasets/test.csv')

train = process_dataset(train)
test = process_dataset(test)

In [18]:
train.head()


Out[18]:
id qid1 qid2 question1 question2 is_duplicate union union_splitted
0 0 1 2 what is the step by step guide to invest in sh... what is the step by step guide to invest in sh... 0 what is the step by step guide to invest in sh... [step, step, guide, invest, share, market, ind...
1 1 3 4 what is the story of kohinoor kohinoor diamond what would happen if the indian government sto... 0 what is the story of kohinoor kohinoor diamond... [story, kohinoor, kohinoor, diamond, would, ha...
2 2 5 6 how can i increase the speed of my internet co... how can internet speed be increased by hacking... 0 how can i increase the speed of my internet co... [increase, speed, internet, connection, using,...
3 3 7 8 why am i mentally very lonely how can i solve it find the remainder when math2324math is divide... 0 why am i mentally very lonely how can i solve ... [mentally, lonely, solve, find, remainder, mat...
4 4 9 10 which one dissolve in water quikly sugar salt ... which fish would survive in salt water 0 which one dissolve in water quikly sugar salt ... [one, dissolve, water, quikly, sugar, salt, me...
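
The union_splitted column in this preview is not produced by any cell shown here; presumably it was built by applying split_and_rem_stop_words to the union column. A hedged reconstruction:

# hypothetical reconstruction of the union_splitted column seen above
train['union_splitted'] = train['union'].apply(split_and_rem_stop_words)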

In [7]:
# unigrams and bigrams; drop terms in fewer than 3 documents or in more than 30% of them
TF_IDF = TfidfVectorizer(min_df=3, ngram_range=(1, 2), max_df=0.3, stop_words='english')
#TF_IDF = TF_IDF.fit(train['union'].append(test['union']).tolist())

# fit the vocabulary on the test corpus only (the larger of the two)
TF_IDF = TF_IDF.fit(test['union'].tolist())

X = TF_IDF.transform(train['union'].tolist())
X_test = TF_IDF.transform(test['union'].tolist())

y = train['is_duplicate'].tolist()
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.10, random_state=42)
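
The commented-out line above hints at fitting the vectorizer on both corpora rather than on the test set alone. A sketch of that variant, using pd.concat since Series.append is deprecated in recent pandas:

# variant: fit the vocabulary on the union of train and test questions
combined = pd.concat([train['union'], test['union']], ignore_index=True)
TF_IDF = TfidfVectorizer(min_df=3, ngram_range=(1, 2), max_df=0.3, stop_words='english')
TF_IDF = TF_IDF.fit(combined.tolist())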

In [8]:
X_train.shape


Out[8]:
(363861, 1677490)

In [9]:
X_test.shape


Out[9]:
(2345796, 1677490)

In [10]:
# shallow, class-balanced forest; depth is capped to keep training tractable on ~1.7M sparse features
rf = RandomForestClassifier(random_state=17, max_depth=10, n_estimators=100, n_jobs=-1, class_weight='balanced')

In [11]:
rf.fit(X=X_train, y=y_train)


Out[11]:
RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=10, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=17, verbose=0, warm_start=False)

In [12]:
y_pred = rf.predict(X_val)
l_loss = log_loss(y_pred=y_pred, y_true=y_val)
acc_loss = accuracy_score(y_pred=y_pred, y_true=y_val)

print('Log_loss - {}'.format(l_loss))
print('Acc_loss - {}'.format(acc_loss))


Log_loss - 10.3414610386
Acc_loss - 0.700586212867
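
The log loss is this large because rf.predict returns hard 0/1 labels, and log_loss penalizes each confidently wrong label heavily; the metric expects probabilities. A sketch of the probability-based evaluation:

# evaluate with class probabilities: log_loss expects P(is_duplicate=1)
y_val_proba = rf.predict_proba(X_val)[:, 1]
print('Log_loss (proba) - {}'.format(log_loss(y_true=y_val, y_pred=y_val_proba)))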

In [14]:
# retrain on the full training set, then predict hard labels for the test set
rf = RandomForestClassifier(random_state=17, max_depth=10, n_estimators=100, n_jobs=-1, class_weight='balanced')
rf.fit(X=X, y=y)
y_test = rf.predict(X_test)

In [15]:
submit(y_test, test, '../submissions/1_2_gramm.csv')
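
Since the score tracked here is log loss, submitting duplicate probabilities instead of hard labels is usually worth trying. A sketch (the _proba filename is made up for illustration):

# variant: submit P(is_duplicate=1) rather than hard 0/1 predictions
y_test_proba = rf.predict_proba(X_test)[:, 1]
submit(y_test_proba, test, '../submissions/1_2_gramm_proba.csv')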

In [19]:
save_sparse_csr(array=X, filename='../np_saved/x_train_1_2_gramm')
save_sparse_csr(array=X_test, filename='../np_saved/x_test_1_2_gramm')
