In [1]:
#1 tf-idf
#2 tf-idf + nltk.pos_tag
#3 add n-grams
#4 use microsoft nnet
#5 use keras embedding with cnn
#Previous score
# Log_loss - 10.3227065134
# Accuracy - 0.70113037671
In [1]:
def submit(y_pred, test, filename):
    # Write predictions in Kaggle's submission format: test_id, is_duplicate.
    sub = pd.DataFrame()
    sub['test_id'] = test['test_id']
    sub['is_duplicate'] = y_pred
    sub.to_csv(filename, index=False)
def save_sparse_csr(filename, array):
    # Persist a scipy CSR matrix through numpy's .npz container.
    np.savez(filename, data=array.data, indices=array.indices,
             indptr=array.indptr, shape=array.shape)

def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])
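A quick round-trip check of the sparse helpers (a minimal sketch; the /tmp path is only illustrative):
In [ ]:
m = csr_matrix(np.eye(3))
save_sparse_csr('/tmp/csr_check', m)        # np.savez appends the .npz extension
m2 = load_sparse_csr('/tmp/csr_check.npz')
assert (m != m2).nnz == 0                   # no differing entries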
In [15]:
def correct_dataset(dataset):
    # Identical question pairs are duplicates by definition.
    dataset.loc[(dataset['question1'] == dataset['question2']), 'is_duplicate'] = 1
    return dataset

def process_dataset(dataset, correct=False):
    # The flag is named `correct` so it does not shadow correct_dataset above.
    dataset['question1'].fillna(' ', inplace=True)
    dataset['question2'].fillna(' ', inplace=True)
    # delete punctuation
    dataset['question1'] = dataset['question1'].str.replace(r'[^\w\s]', '', regex=True)
    dataset['question2'] = dataset['question2'].str.replace(r'[^\w\s]', '', regex=True)
    # lowercase questions
    dataset['question1'] = dataset['question1'].str.lower()
    dataset['question2'] = dataset['question2'].str.lower()
    # concatenate the two questions into one field
    dataset['union'] = dataset['question1'].str.cat(dataset['question2'], sep=' ')
    if correct:
        return correct_dataset(dataset)
    return dataset

def split_and_rem_stop_words(line):
    # A set gives O(1) membership tests for the stop-word filter.
    cachedStopWords = set(stopwords.words("english"))
    return [word for word in line.split() if word not in cachedStopWords]
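A small smoke test of the preprocessing helpers (a minimal sketch; assumes nltk.download('stopwords') has been run once):
In [ ]:
toy = pd.DataFrame({'question1': ['What is AI?'], 'question2': ['What is AI, really?']})
toy = process_dataset(toy)
print(toy['union'].iloc[0])                            # what is ai what is ai really
print(split_and_rem_stop_words(toy['union'].iloc[0]))  # ['ai', 'ai', 'really']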
In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import nltk
import sklearn as sk
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix  # required by load_sparse_csr
# nltk.download('stopwords')  # one-time download if the corpus is missing
In [4]:
train = pd.read_csv('../datasets/train.csv')
test = pd.read_csv('../datasets/test.csv')
train = process_dataset(train)
test = process_dataset(test)
In [18]:
train.head()
Out[18]:
In [7]:
TF_IDF = TfidfVectorizer(min_df=3, ngram_range=(1, 2), max_df=0.3, stop_words='english')
# Alternative: fit the vocabulary on train and test together.
# TF_IDF = TF_IDF.fit(pd.concat([train['union'], test['union']]).tolist())
# Here the vectorizer is fit on the test corpus only.
TF_IDF = TF_IDF.fit(test['union'].tolist())
X = TF_IDF.transform(train['union'].tolist())
X_test = TF_IDF.transform(test['union'].tolist())
y = train['is_duplicate'].tolist()
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.10, random_state=42)
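A quick look at what the fitted vectorizer produced (a minimal sketch):
In [ ]:
print('vocabulary size: {}'.format(len(TF_IDF.vocabulary_)))
print('avg non-zero features per train row: {:.1f}'.format(X.nnz / float(X.shape[0])))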
In [8]:
X_train.shape
Out[8]:
In [9]:
X_test.shape
Out[9]:
In [10]:
rf = RandomForestClassifier(random_state=17, max_depth=10, n_estimators=100, n_jobs=-1, class_weight='balanced')
In [11]:
rf.fit(X=X_train, y=y_train)
Out[11]:
In [12]:
y_pred = rf.predict(X_val)
y_proba = rf.predict_proba(X_val)[:, 1]
# log_loss expects probabilities; hard 0/1 labels get clipped and
# blow the score up (see the ~10.32 'Previous score' above).
l_loss = log_loss(y_pred=y_proba, y_true=y_val)
acc = accuracy_score(y_pred=y_pred, y_true=y_val)
print ('Log_loss - {}'.format(l_loss))
print ('Accuracy - {}'.format(acc))
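For reference, computing log loss on hard labels reproduces the inflated score noted at the top (a minimal sketch):
In [ ]:
# With 0/1 predictions log_loss clips each wrong label to eps,
# contributing about -log(1e-15) ~ 34.5 per error, hence the ~10.32 above.
print(log_loss(y_true=y_val, y_pred=rf.predict(X_val)))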
In [14]:
rf = RandomForestClassifier(random_state=17, max_depth=10, n_estimators=100, n_jobs=-1, class_weight='balanced')
rf.fit(X=X, y=y)
# Submit duplicate probabilities rather than hard labels, since the
# competition metric is log loss.
y_test = rf.predict_proba(X_test)[:, 1]
In [15]:
submit(y_test, test, '../submissions/1_2_gramm.csv')
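A quick check of the written submission (a minimal sketch):
In [ ]:
pd.read_csv('../submissions/1_2_gramm.csv').head()  # expect columns: test_id, is_duplicate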
In [19]:
save_sparse_csr(array=X, filename='../np_saved/x_train_1_2_gramm')
save_sparse_csr(array=X_test, filename='../np_saved/x_test_1_2_gramm')
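The saved matrices can be restored later without refitting the vectorizer (a minimal sketch; note that np.savez appends the .npz extension):
In [ ]:
X_back = load_sparse_csr('../np_saved/x_train_1_2_gramm.npz')
assert X_back.shape == X.shape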