In [4]:
from __future__ import division
import time, os, gc
import numpy as np
import pandas as pd
import scipy
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('whitegrid')
# SEED = 24
# np.random.seed(SEED)
PATH = os.path.expanduser("~") + "/data/quora/"
print os.listdir(PATH)
In [230]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
from string import punctuation
stop_words = stopwords.words('english')
# stop_words = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
# 'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
# 'Is','If','While','This']
text1 = '''When do you $3.44k utilization "&" insteading of "シ"?'''
text = ''.join([c for c in text1 if c not in punctuation])
print text
text = text.split()
text = [w for w in text if not w in stop_words]
text = " ".join(text)
print text
text = text.split()
stemmer = SnowballStemmer('english')
stemmed_words = [stemmer.stem(word.decode("utf-8")) for word in text]
text = " ".join(stemmed_words)
print text
In [123]:
def isEnglish(s):
    try:
        s.decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True
train['en1'] = train['question1'].astype(str).apply(lambda x: isEnglish(x))
train['en2'] = train['question2'].astype(str).apply(lambda x: isEnglish(x))
train.head()
Out[123]:
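A quick check of the ASCII test on its own (hypothetical examples; this relies on Python 2 byte strings, as the rest of the notebook does):
In [ ]:
# isEnglish returns False as soon as a non-ASCII byte is found.
print(isEnglish('How do I learn Python?'))    # True: plain ASCII
print(isEnglish('How do I type "シ"?'))        # False: contains non-ASCII characters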
In [ ]:
In [19]:
import re
from collections import Counter
def words(text): return re.findall(r'\w+', text.lower())
def P(word):
    "Score of `word`: negative frequency rank, so max() prefers more common words."
    # WORDS is assumed to map word -> frequency rank (rank 1 = most frequent);
    # the negated rank stands in for a probability here.
    return - WORDS[word]
def correction(word):
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)
def candidates(word):
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])
def known(words):
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)
def edits1(word):
    "All edits that are one edit away from `word`."
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)
def edits2(word):
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))
correction('adverve')
correction('adverve')
Out[19]:
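WORDS is not defined in this cell; in Norvig's original corrector it is a word-frequency table, and here P scores by negative rank. A minimal sketch of one possible setup, assuming `train` from the cells above (this mapping is an assumption, not part of the original notebook):
In [ ]:
# Hypothetical construction of WORDS: word -> frequency rank (1 = most frequent),
# so that P(word) = -WORDS[word] makes max() prefer common words.
corpus = " ".join(train['question1'].astype(str)) + " " + " ".join(train['question2'].astype(str))
counts = Counter(words(corpus))
WORDS = {w: rank for rank, (w, _) in enumerate(counts.most_common(), start=1)}
print(correction('adverve'))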
In [8]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,StandardScaler
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.feature_extraction.text import TfidfVectorizer
import distance
from nltk.corpus import stopwords
import nltk
SEED = 2048
np.random.seed(SEED)
PATH = os.path.expanduser("~") + "/data/quora/"
train = pd.read_csv(PATH+"train_porter.csv")#, nrows=5000).astype(str)
test = pd.read_csv(PATH+"test_porter.csv")#, nrows=5000).astype(str)
def str_abs_diff_len(str1, str2):
    try:
        a = abs(len(str1) - len(str2))
    except:
        print str1, str2
        a = 0
    return a
def str_len(str1):
    return len(str(str1))
def char_len(str1):
    str1_list = set(str(str1).replace(' ', ''))
    return len(str1_list)
def word_len(str1):
    try:
        str1_list = str1.split(' ')
    except:
        print str1
        str1_list = '1'
    return len(str1_list)
stop_words = stopwords.words('english')
def word_match_share(row):
    q1words = {}
    q2words = {}
    for word in str(row['question1']).lower().split():
        if word not in stop_words:
            q1words[word] = 1
    for word in str(row['question2']).lower().split():
        if word not in stop_words:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    return (len(shared_words_in_q1) + len(shared_words_in_q2)) * 1.0 / (len(q1words) + len(q2words))
print('Generate len')
feats = []
# train['abs_diff_len'] = train.apply(lambda x:str_abs_diff_len(x['question1'],x['question2']),axis=1)
# test['abs_diff_len']= test.apply(lambda x:str_abs_diff_len(x['question1'],x['question2']),axis=1)
# feats.append('abs_diff_len')
# train['R']=train.apply(word_match_share, axis=1, raw=True)
# test['R']=test.apply(word_match_share, axis=1, raw=True)
# feats.append('R')
# train['common_words'] = train.apply(lambda x: len(set(str(x['question1'])
# .lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
# test['common_words'] = test.apply(lambda x: len(set(str(x['question1'])
# .lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
# feats.append('common_words')
for c in ['question1','question2']:
    train['%s_char_len'%c] = train[c].apply(lambda x: char_len(x))
    test['%s_char_len'%c] = test[c].apply(lambda x: char_len(x))
    feats.append('%s_char_len'%c)
    train['%s_str_len'%c] = train[c].apply(lambda x: str_len(x))
    test['%s_str_len'%c] = test[c].apply(lambda x: str_len(x))
    feats.append('%s_str_len'%c)
    train['%s_word_len'%c] = train[c].apply(lambda x: word_len(x))
    test['%s_word_len'%c] = test[c].apply(lambda x: word_len(x))
    feats.append('%s_word_len'%c)
pd.to_pickle(train[feats].values,PATH+"train_len.pkl")
pd.to_pickle(test[feats].values,PATH+"test_len.pkl")
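A quick sanity check of word_match_share on a toy pair (a hypothetical example, not part of the pipeline):
In [ ]:
# The shared non-stopword is 'india'; q1 keeps {'capital', 'india'}, q2 keeps {'india', 'located'},
# so the share is (1 + 1) / (2 + 2) = 0.5.
toy_row = {'question1': 'What is the capital of India', 'question2': 'Where is India located'}
print(word_match_share(toy_row))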
In [10]:
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 20 11:23:59 2017
@author: mariosm
"""
import os
import pandas as pd
from nltk.corpus import stopwords
from collections import Counter
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix,hstack
from sklearn.feature_extraction import text
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,StandardScaler
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from scipy import sparse as ssp
from sklearn.datasets import dump_svmlight_file,load_svmlight_file
from sklearn.utils import resample,shuffle
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import pearsonr
import distance
stop_words = stopwords.words('english')
#stops = set(stopwords.words("english"))
stops = set(["http","www","img","border","home","body","a","about","above","after","again","against","all","am","an",
"and","any","are","aren't","as","at","be","because","been","before","being","below","between","both","but","by","can't",
"cannot","could","couldn't","did","didn't","do","does","doesn't","doing","don't","down","during","each","few","for","from",
"further","had","hadn't","has","hasn't","have","haven't","having","he","he'd","he'll","he's","her","here","here's","hers",
"herself","him","himself","his","how","how's","i","i'd","i'll","i'm","i've","if","in","into","is","isn't","it","it's","its",
"itself","let's","me","more","most","mustn't","my","myself","no","nor","not","of","off","on","once","only","or","other","ought",
"our","ours","ourselves","out","over","own","same","shan't","she","she'd","she'll","she's","should","shouldn't","so","some","such",
"than","that","that's","the","their","theirs","them","themselves","then","there","there's","these","they","they'd","they'll","they're",
"they've","this","those","through","to","too","under","until","up","very","was","wasn't","we","we'd","we'll","we're","we've","were",
"weren't","what","what's","when","when's""where","where's","which","while","who","who's","whom","why","why's","with","won't","would",
"wouldn't","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves" ])
porter = PorterStemmer()
snowball = SnowballStemmer('english')
weights={}
def fromsparsetofile(filename, array, deli1=" ", deli2=":", ytarget=None):
    zsparse = csr_matrix(array)
    indptr = zsparse.indptr
    indices = zsparse.indices
    data = zsparse.data
    print(" data length %d" % (len(data)))
    print(" indices length %d" % (len(indices)))
    print(" indptr length %d" % (len(indptr)))
    f = open(filename, "w")
    counter_row = 0
    for b in range(0, len(indptr) - 1):
        # if there is a target, write it; otherwise write nothing
        if ytarget is not None:
            f.write(str(ytarget[b]) + deli1)
        for k in range(indptr[b], indptr[b + 1]):
            if (k == indptr[b]):
                if np.isnan(data[k]):
                    f.write("%d%s%f" % (indices[k], deli2, -1))
                else:
                    f.write("%d%s%f" % (indices[k], deli2, data[k]))
            else:
                if np.isnan(data[k]):
                    f.write("%s%d%s%f" % (deli1, indices[k], deli2, -1))
                else:
                    f.write("%s%d%s%f" % (deli1, indices[k], deli2, data[k]))
        f.write("\n")
        counter_row += 1
        if counter_row % 10000 == 0:
            print(" row : %d " % (counter_row))
    f.close()
# If a word appears only once, we ignore it completely (likely a typo)
# Epsilon defines a smoothing constant, which makes the effect of extremely rare words smaller
def get_weight(count, eps=5000.0, min_count=2.0):
    if count < min_count:
        return 0.0
    else:
        return 1.0 / (count + eps)
def word_shares(row, wei, stop):
    q1 = set(str(row['question1']).lower().split())
    q1words = q1.difference(stop)
    if len(q1words) == 0:
        return '0:0:0:0:0'
    q2 = set(str(row['question2']).lower().split())
    q2words = q2.difference(stop)
    if len(q2words) == 0:
        return '0:0:0:0:0'
    q1stops = q1.intersection(stop)
    q2stops = q2.intersection(stop)
    shared_words = q1words.intersection(q2words)
    #print(len(shared_words))
    shared_weights = [wei.get(w, 0) for w in shared_words]
    total_weights = [wei.get(w, 0) for w in q1words] + [wei.get(w, 0) for w in q2words]
    R1 = np.sum(shared_weights) / np.sum(total_weights)  # tfidf share
    R2 = float(len(shared_words)) / (float(len(q1words)) + float(len(q2words)))  # count share
    R31 = float(len(q1stops)) / float(len(q1words))  # stops in q1
    R32 = float(len(q2stops)) / float(len(q2words))  # stops in q2
    return '{}:{}:{}:{}:{}'.format(R1, R2, float(len(shared_words)), R31, R32)
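# Note: `weights` above is initialised empty; it is assumed to be filled from word counts
# over both question columns before word_shares() is applied, e.g. (sketch, not original code):
#     train_qs = pd.Series(train['question1'].tolist() + train['question2'].tolist()).astype(str)
#     counts = Counter((" ".join(train_qs)).lower().split())
#     weights = {word: get_weight(count) for word, count in counts.items()}
#     df['word_shares'] = df.apply(lambda r: word_shares(r, weights, stops), axis=1)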
def stem_str(x, stemmer=SnowballStemmer('english')):
    x = text.re.sub("[^a-zA-Z0-9]", " ", x)
    x = (" ").join([stemmer.stem(z) for z in x.split(" ")])
    x = " ".join(x.split())
    return x
def calc_set_intersection(text_a, text_b):
    a = set(text_a.split())
    b = set(text_b.split())
    return len(a.intersection(b)) * 1.0 / len(a)
def str_abs_diff_len(str1, str2):
    return abs(len(str1) - len(str2))
def str_len(str1):
    return len(str(str1))
def char_len(str1):
    str1_list = set(str(str1).replace(' ', ''))
    return len(str1_list)
def word_len(str1):
    str1_list = str1.split(' ')
    return len(str1_list)
def word_match_share(row):
    q1words = {}
    q2words = {}
    for word in str(row['question1']).lower().split():
        if word not in stop_words:
            q1words[word] = 1
    for word in str(row['question2']).lower().split():
        if word not in stop_words:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2)) * 1.0 / (len(q1words) + len(q2words))
    return R
def str_jaccard(str1, str2):
    str1_list = str1.split(" ")
    str2_list = str2.split(" ")
    res = distance.jaccard(str1_list, str2_list)
    return res
# shortest alignment
def str_levenshtein_1(str1, str2):
    #str1_list = str1.split(' ')
    #str2_list = str2.split(' ')
    res = distance.nlevenshtein(str1, str2, method=1)
    return res
# longest alignment
def str_levenshtein_2(str1, str2):
    #str1_list = str1.split(' ')
    #str2_list = str2.split(' ')
    res = distance.nlevenshtein(str1, str2, method=2)
    return res
def str_sorensen(str1, str2):
    str1_list = str1.split(' ')
    str2_list = str2.split(' ')
    res = distance.sorensen(str1_list, str2_list)
    return res
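# The helpers above wrap the `distance` package: str_jaccard/str_sorensen compare token lists,
# str_levenshtein_1/2 compare raw characters; each returns a distance in [0, 1], with 0 meaning
# identical. Illustrative (hypothetical) example:
#     str_jaccard('how do i learn python', 'how can i learn python')  # 1 - 4/6 ≈ 0.33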
def main():
    path = ""  # set your input folder here
    ######## from here on: the qqgeogor example from Kaggle (https://www.kaggle.com/) #######
    # https://github.com/qqgeogor/kaggle_quora_benchmark
    ################### generate_stem.py ##################
    seed = 1024
    np.random.seed(seed)
    path = os.path.expanduser("~") + "/data/quora/"
    # reload to avoid errors
    train = pd.read_csv(path + "train.csv", nrows=250000)
    test = pd.read_csv(path + "test.csv", nrows=250000)
    print('Generate porter')
    train['question1_porter'] = train['question1'].astype(str).apply(lambda x: stem_str(x.lower(), snowball))
    test['question1_porter'] = test['question1'].astype(str).apply(lambda x: stem_str(x.lower(), snowball))
    train['question2_porter'] = train['question2'].astype(str).apply(lambda x: stem_str(x.lower(), snowball))
    test['question2_porter'] = test['question2'].astype(str).apply(lambda x: stem_str(x.lower(), snowball))
    train.to_csv(path + 'train_porter.csv')
    test.to_csv(path + 'test_porter.csv')
    ###################### generate_interaction.py ################
    train = pd.read_csv(path + "train_porter.csv")
    test = pd.read_csv(path + "test_porter.csv")
    test['is_duplicated'] = [-1] * test.shape[0]
    print('Generate intersection')
    train_interaction = train.astype(str).apply(lambda x: calc_set_intersection(x['question1'], x['question2']), axis=1)
    test_interaction = test.astype(str).apply(lambda x: calc_set_intersection(x['question1'], x['question2']), axis=1)
    pd.to_pickle(train_interaction, path + "train_interaction.pkl")
    pd.to_pickle(test_interaction, path + "test_interaction.pkl")
    print('Generate porter intersection')
    train_porter_interaction = train.astype(str).apply(lambda x: calc_set_intersection(x['question1_porter'], x['question2_porter']), axis=1)
    test_porter_interaction = test.astype(str).apply(lambda x: calc_set_intersection(x['question1_porter'], x['question2_porter']), axis=1)
    pd.to_pickle(train_porter_interaction, path + "train_porter_interaction.pkl")
    pd.to_pickle(test_porter_interaction, path + "test_porter_interaction.pkl")
    ###################### generate_tfidf.py ################
    ft = ['question1', 'question2', 'question1_porter', 'question2_porter']
    train = pd.read_csv(path + "train_porter.csv")[ft]
    test = pd.read_csv(path + "test_porter.csv")[ft]
    # test['is_duplicated']=[-1]*test.shape[0]
    data_all = pd.concat([train, test])
    print(data_all.shape)  # sanity check on the concatenated frame
    max_features = None
    ngram_range = (1, 2)
    min_df = 3
    print('Generate tfidf')
    feats = ['question1', 'question2']
    vect_orig = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range, min_df=min_df)
    corpus = []
    for f in feats:
        data_all[f] = data_all[f].astype(str)
        corpus += data_all[f].values.tolist()
    vect_orig.fit(corpus)
    for f in feats:
        tfidfs = vect_orig.transform(data_all[f].values.tolist())
        train_tfidf = tfidfs[:train.shape[0]]
        test_tfidf = tfidfs[train.shape[0]:]
        pd.to_pickle(train_tfidf, path + 'train_%s_tfidf.pkl' % f)
        pd.to_pickle(test_tfidf, path + 'test_%s_tfidf.pkl' % f)
    print('Generate porter tfidf')
    feats = ['question1_porter', 'question2_porter']
    vect_orig = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range, min_df=min_df)
    corpus = []
    for f in feats:
        data_all[f] = data_all[f].astype(str)
        corpus += data_all[f].values.tolist()
    vect_orig.fit(corpus)
    for f in feats:
        tfidfs = vect_orig.transform(data_all[f].values.tolist())
        train_tfidf = tfidfs[:train.shape[0]]
        test_tfidf = tfidfs[train.shape[0]:]
        pd.to_pickle(train_tfidf, path + 'train_%s_tfidf.pkl' % f)
        pd.to_pickle(test_tfidf, path + 'test_%s_tfidf.pkl' % f)
    ##################### generate_len.py #########################
    train = pd.read_csv(path + "train_porter.csv").astype(str)
    test = pd.read_csv(path + "test_porter.csv").astype(str)
    print('Generate len')
    feats = []
    train['abs_diff_len'] = train.apply(lambda x: str_abs_diff_len(x['question1'], x['question2']), axis=1)
    test['abs_diff_len'] = test.apply(lambda x: str_abs_diff_len(x['question1'], x['question2']), axis=1)
    feats.append('abs_diff_len')
    # word_match_share indexes rows by column name, so pass rows as Series (not raw ndarrays)
    train['R'] = train.apply(word_match_share, axis=1)
    test['R'] = test.apply(word_match_share, axis=1)
    feats.append('R')
    train['common_words'] = train.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
    test['common_words'] = test.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
    feats.append('common_words')
    for c in ['question1', 'question2']:
        train['%s_char_len' % c] = train[c].apply(lambda x: char_len(x))
        test['%s_char_len' % c] = test[c].apply(lambda x: char_len(x))
        feats.append('%s_char_len' % c)
        train['%s_str_len' % c] = train[c].apply(lambda x: str_len(x))
        test['%s_str_len' % c] = test[c].apply(lambda x: str_len(x))
        feats.append('%s_str_len' % c)
        train['%s_word_len' % c] = train[c].apply(lambda x: word_len(x))
        test['%s_word_len' % c] = test[c].apply(lambda x: word_len(x))
        feats.append('%s_word_len' % c)
    pd.to_pickle(train[feats].values, path + "train_len.pkl")
    pd.to_pickle(test[feats].values, path + "test_len.pkl")
    ######################### generate_distance.py #################
    train = pd.read_csv(path + "train_porter.csv")
    test = pd.read_csv(path + "test_porter.csv")
    test['is_duplicated'] = [-1] * test.shape[0]
    data_all = pd.concat([train, test])
    print('Generate jaccard')
    train_jaccard = train.astype(str).apply(lambda x: str_jaccard(x['question1'], x['question2']), axis=1)
    test_jaccard = test.astype(str).apply(lambda x: str_jaccard(x['question1'], x['question2']), axis=1)
    pd.to_pickle(train_jaccard, path + "train_jaccard.pkl")
    pd.to_pickle(test_jaccard, path + "test_jaccard.pkl")
    print('Generate porter jaccard')
    train_porter_jaccard = train.astype(str).apply(lambda x: str_jaccard(x['question1_porter'], x['question2_porter']), axis=1)
    test_porter_jaccard = test.astype(str).apply(lambda x: str_jaccard(x['question1_porter'], x['question2_porter']), axis=1)
    pd.to_pickle(train_porter_jaccard, path + "train_porter_jaccard.pkl")
    pd.to_pickle(test_porter_jaccard, path + "test_porter_jaccard.pkl")
    # path=""
    ################### generate_svm_format_tfidf.py #################
    train = pd.read_csv(path + "train_porter.csv")
    train_question1_tfidf = pd.read_pickle(path + 'train_question1_tfidf.pkl')
    test_question1_tfidf = pd.read_pickle(path + 'test_question1_tfidf.pkl')
    train_question2_tfidf = pd.read_pickle(path + 'train_question2_tfidf.pkl')
    test_question2_tfidf = pd.read_pickle(path + 'test_question2_tfidf.pkl')
    #train_question1_porter_tfidf = pd.read_pickle(path+'train_question1_porter_tfidf.pkl')
    #test_question1_porter_tfidf = pd.read_pickle(path+'test_question1_porter_tfidf.pkl')
    #train_question2_porter_tfidf = pd.read_pickle(path+'train_question2_porter_tfidf.pkl')
    #test_question2_porter_tfidf = pd.read_pickle(path+'test_question2_porter_tfidf.pkl')
    # the pickled interaction/jaccard features are pandas Series; take .values before reshaping
    train_interaction = pd.read_pickle(path + 'train_interaction.pkl').values.reshape(-1, 1)
    test_interaction = pd.read_pickle(path + 'test_interaction.pkl').values.reshape(-1, 1)
    train_interaction = np.nan_to_num(train_interaction)
    test_interaction = np.nan_to_num(test_interaction)
    train_porter_interaction = pd.read_pickle(path + 'train_porter_interaction.pkl').values.reshape(-1, 1)
    test_porter_interaction = pd.read_pickle(path + 'test_porter_interaction.pkl').values.reshape(-1, 1)
    train_porter_interaction = np.nan_to_num(train_porter_interaction)
    test_porter_interaction = np.nan_to_num(test_porter_interaction)
    train_jaccard = pd.read_pickle(path + 'train_jaccard.pkl').values.reshape(-1, 1)
    test_jaccard = pd.read_pickle(path + 'test_jaccard.pkl').values.reshape(-1, 1)
    train_jaccard = np.nan_to_num(train_jaccard)
    test_jaccard = np.nan_to_num(test_jaccard)
    train_porter_jaccard = pd.read_pickle(path + 'train_porter_jaccard.pkl').values.reshape(-1, 1)
    test_porter_jaccard = pd.read_pickle(path + 'test_porter_jaccard.pkl').values.reshape(-1, 1)
    train_porter_jaccard = np.nan_to_num(train_porter_jaccard)
    test_porter_jaccard = np.nan_to_num(test_porter_jaccard)
    train_len = pd.read_pickle(path + "train_len.pkl")
    test_len = pd.read_pickle(path + "test_len.pkl")
    train_len = np.nan_to_num(train_len)
    test_len = np.nan_to_num(test_len)
    scaler = MinMaxScaler()
    scaler.fit(np.vstack([train_len, test_len]))
    train_len = scaler.transform(train_len)
    test_len = scaler.transform(test_len)
    X = ssp.hstack([
        train_question1_tfidf,
        train_question2_tfidf,
        train_interaction,
        train_porter_interaction,
        train_jaccard,
        train_porter_jaccard,
        train_len
    ]).tocsr()
    y = train['is_duplicate'].values
    X_t = ssp.hstack([
        test_question1_tfidf,
        test_question2_tfidf,
        test_interaction,
        test_porter_interaction,
        test_jaccard,
        test_porter_jaccard,
        test_len
    ]).tocsr()
    print(X.shape)
    print(X_t.shape)
    fromsparsetofile(path + "x_tfidf.svm", X, deli1=" ", deli2=":", ytarget=y)
    del X
    fromsparsetofile(path + "x_t_tfidf.svm", X_t, deli1=" ", deli2=":", ytarget=None)
    del X_t
    print("done!")
if __name__ == "__main__":
    main()
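The train matrix written above is in libsvm format, so it can be read back with load_svmlight_file (already imported in this cell); a minimal sketch, assuming the file was produced by the run above. The test file is written without a label column, so it will not parse the same way.
In [ ]:
# Round-trip check of the libsvm-format train file (sketch; path assumed as above).
path = os.path.expanduser("~") + "/data/quora/"
X_back, y_back = load_svmlight_file(path + "x_tfidf.svm")
print(X_back.shape)
print(y_back.shape)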
In [13]:
from sklearn.feature_extraction import text
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
SEED = 2048
np.random.seed(SEED)
PATH = os.path.expanduser("~") + "/data/quora/"
train = pd.read_csv(PATH + "train.csv", nrows=50000)
# test = pd.read_csv(PATH + "test.csv")#, nrows=5000)
def stem_str(x1, stemmer=SnowballStemmer('english')):
    x = x1
    try:
        x = text.re.sub("[^a-zA-Z0-9]", " ", x1)
        x = (" ").join([stemmer.stem(z) for z in x.split(" ")])
        x = " ".join(x.split())
    except:
        print x1
    return x
porter = PorterStemmer()
snowball = SnowballStemmer('english')
# print ('Generate porter')
# train['question1_porter'] = train['question1'].astype(str).apply(lambda x: stem_str(x.lower(),snowball))
# test['question1_porter'] = test['question1'].astype(str).apply(lambda x: stem_str(x.lower(),porter))
train['question2_porter'] = train['question2'].astype(str).apply(lambda x: stem_str(x.lower(),porter))
# test['question2_porter'] = test['question2'].astype(str).apply(lambda x: stem_str(x.lower(),porter))
x= 'do banks in dubai offer credit cards on an aed 4000 salary'
x = (" ").join([snowball.stem(z) for z in x.split(" ")])
x = " ".join(x.split())
print x
In [5]:
df= pd.read_csv(PATH+'train.csv')
pos = df[df.is_duplicate==1]
import networkx as nx
g = nx.Graph()
g.add_nodes_from(pos.question1)
g.add_nodes_from(pos.question2)
edges = list(pos[['question1','question2']].to_records(index=False))
g.add_edges_from(edges)
len(set(pos.question1) | set(pos.question2)), g.number_of_nodes()
Out[5]:
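Before drawing individual components, it helps to see how large the connected components of this duplicate graph are; a small sketch using nx.connected_components:
In [ ]:
# Size distribution of connected components: each component groups questions that are
# linked through chains of duplicate pairs.
from collections import Counter
comp_sizes = [len(c) for c in nx.connected_components(g)]
print(Counter(comp_sizes).most_common(10))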
In [14]:
# with block handles a deprecation warning that occurs inside nx.draw_networkx
import warnings
cc = filter(lambda x : (len(x) > 3),
nx.connected_component_subgraphs(g))
# g1 = next(cc)
# g1.nodes()
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    nx.draw_circular(cc[2], with_labels=True, alpha=0.5, font_size=8)
plt.show()
In [19]:
l = []
for i in cc[0]:
    l.append(i)
l
Out[19]:
In [ ]:
In [1]:
"""
Detecting duplicate quora questions
feature engineering
@author: Abhishek Thakur
"""
import cPickle
import pandas as pd
import numpy as np
import gensim
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from tqdm import tqdm
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from nltk import word_tokenize
stop_words = stopwords.words('english')
def wmd(s1, s2):
    s1 = str(s1).lower().split()
    s2 = str(s2).lower().split()
    stop_words = stopwords.words('english')
    s1 = [w for w in s1 if w not in stop_words]
    s2 = [w for w in s2 if w not in stop_words]
    return model.wmdistance(s1, s2)
def norm_wmd(s1, s2):
    s1 = str(s1).lower().split()
    s2 = str(s2).lower().split()
    stop_words = stopwords.words('english')
    s1 = [w for w in s1 if w not in stop_words]
    s2 = [w for w in s2 if w not in stop_words]
    return norm_model.wmdistance(s1, s2)
def sent2vec(s):
    words = str(s).lower().decode('utf-8')
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(model[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())
data = pd.read_csv('data/quora_duplicate_questions.tsv', sep='\t')
data = data.drop(['id', 'qid1', 'qid2'], axis=1)
data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
data['len_q2'] = data.question2.apply(lambda x: len(str(x)))
data['diff_len'] = data.len_q1 - data.len_q2
data['len_char_q1'] = data.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_char_q2'] = data.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split()))
data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split()))
data['common_words'] = data.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
data['fuzz_qratio'] = data.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_ratio'] = data.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_token_set_ratio'] = data.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_token_sort_ratio'] = data.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_token_set_ratio'] = data.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_token_sort_ratio'] = data.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
data['wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)
norm_model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
norm_model.init_sims(replace=True)
data['norm_wmd'] = data.apply(lambda x: norm_wmd(x['question1'], x['question2']), axis=1)
question1_vectors = np.zeros((data.shape[0], 300))
error_count = 0
for i, q in tqdm(enumerate(data.question1.values)):
    question1_vectors[i, :] = sent2vec(q)
question2_vectors = np.zeros((data.shape[0], 300))
for i, q in tqdm(enumerate(data.question2.values)):
    question2_vectors[i, :] = sent2vec(q)
data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
np.nan_to_num(question2_vectors))]
data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
np.nan_to_num(question2_vectors))]
data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
np.nan_to_num(question2_vectors))]
data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
np.nan_to_num(question2_vectors))]
data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
np.nan_to_num(question2_vectors))]
data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
np.nan_to_num(question2_vectors))]
data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
np.nan_to_num(question2_vectors))]
data['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
data['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
data['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
data['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]
cPickle.dump(question1_vectors, open('data/q1_w2v.pkl', 'wb'), -1)
cPickle.dump(question2_vectors, open('data/q2_w2v.pkl', 'wb'), -1)
data.to_csv('data/quora_features.csv', index=False)
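For a feel of the fuzzywuzzy features, a toy pair (hypothetical example; all ratios are on a 0-100 scale, 100 = identical):
In [ ]:
# fuzzywuzzy similarity scores on a made-up question pair.
q1_demo = 'How do I learn Python?'
q2_demo = 'What is the best way to learn Python?'
print(fuzz.QRatio(q1_demo, q2_demo))
print(fuzz.partial_ratio(q1_demo, q2_demo))
print(fuzz.token_set_ratio(q1_demo, q2_demo))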
In [ ]:
all_ques_df["num_of_words"] = all_ques_df["questions"].apply(lambda x : len(str(x).split()))
all_ques_df["num_of_chars"] = all_ques_df["questions"].apply(lambda x : len(str(x)))
In [ ]:
from nltk.corpus import stopwords
from nltk import word_tokenize, ngrams
eng_stopwords = set(stopwords.words('english'))
def get_unigrams(que):
    return [word for word in word_tokenize(que.lower()) if word not in eng_stopwords]
def get_common_unigrams(row):
    # size of the intersection of the two unigram sets (q1 & q2)
    return len(set(row["unigrams_ques1"]).intersection(set(row["unigrams_ques2"])))
def get_common_unigram_ratio(row):
    # common count divided by the size of the union (q1 | q2), guarded against empty unions
    return float(row["unigrams_common_count"]) / max(len(set(row["unigrams_ques1"]).union(set(row["unigrams_ques2"]))), 1)
df["unigrams_ques1"] = df['question1'].apply(lambda x: get_unigrams(str(x)))
df["unigrams_ques2"] = df['question2'].apply(lambda x: get_unigrams(str(x)))
df["unigrams_common_count"] = df.apply(lambda row: get_common_unigrams(row),axis=1)
df["unigrams_common_ratio"] = df.apply(lambda row: get_common_unigram_ratio(row), axis=1)
def get_bigrams(que):
    return [i for i in ngrams(que, 2)]
def get_common_bigrams(row):
    return len(set(row["bigrams_ques1"]).intersection(set(row["bigrams_ques2"])))
def get_common_bigram_ratio(row):
    return float(row["bigrams_common_count"]) / max(len(set(row["bigrams_ques1"]).union(set(row["bigrams_ques2"]))), 1)
df["bigrams_ques1"] = df["unigrams_ques1"].apply(lambda x: get_bigrams(x))
df["bigrams_ques2"] = df["unigrams_ques2"].apply(lambda x: get_bigrams(x))
df["bigrams_common_count"] = df.apply(lambda row: get_common_bigrams(row),axis=1)
df["bigrams_common_ratio"] = df.apply(lambda row: get_common_bigram_ratio(row), axis=1)
In [ ]:
def feature_extraction(row):
    que1 = str(row['question1'])
    que2 = str(row['question2'])
    out_list = []
    # get unigram features #
    unigrams_que1 = [word for word in que1.lower().split() if word not in eng_stopwords]
    unigrams_que2 = [word for word in que2.lower().split() if word not in eng_stopwords]
    common_unigrams_len = len(set(unigrams_que1).intersection(set(unigrams_que2)))
    common_unigrams_ratio = float(common_unigrams_len) / max(len(set(unigrams_que1).union(set(unigrams_que2))), 1)
    out_list.extend([common_unigrams_len, common_unigrams_ratio])
    # get bigram features #
    bigrams_que1 = [i for i in ngrams(unigrams_que1, 2)]
    bigrams_que2 = [i for i in ngrams(unigrams_que2, 2)]
    common_bigrams_len = len(set(bigrams_que1).intersection(set(bigrams_que2)))
    common_bigrams_ratio = float(common_bigrams_len) / max(len(set(bigrams_que1).union(set(bigrams_que2))), 1)
    out_list.extend([common_bigrams_len, common_bigrams_ratio])
    # get trigram features #
    trigrams_que1 = [i for i in ngrams(unigrams_que1, 3)]
    trigrams_que2 = [i for i in ngrams(unigrams_que2, 3)]
    common_trigrams_len = len(set(trigrams_que1).intersection(set(trigrams_que2)))
    common_trigrams_ratio = float(common_trigrams_len) / max(len(set(trigrams_que1).union(set(trigrams_que2))), 1)
    out_list.extend([common_trigrams_len, common_trigrams_ratio])
    return out_list
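feature_extraction returns six values per row (shared count and ratio for unigrams, bigrams and trigrams); a sketch of applying it, with illustrative column names that are not from the original notebook:
In [ ]:
# Apply row-wise and unpack the six returned values into named columns (names are illustrative).
ngram_feats = df.apply(lambda row: pd.Series(feature_extraction(row)), axis=1)
ngram_feats.columns = ['uni_common', 'uni_ratio', 'bi_common', 'bi_ratio', 'tri_common', 'tri_ratio']
df = pd.concat([df, ngram_feats], axis=1)
df[ngram_feats.columns].head()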
In [ ]:
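# Oversampling the non-duplicate rows (they are stacked three times below) pushes the training
# duplicate rate down towards roughly 0.165, which is widely believed to be the duplicate rate
# of the test set in this competition; a common rebalancing trick, stated here as an assumption.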
train_X_dup = train_X[train_y==1]
train_X_non_dup = train_X[train_y==0]
train_X = np.vstack([train_X_non_dup, train_X_dup, train_X_non_dup, train_X_non_dup])
train_y = np.array([0]*train_X_non_dup.shape[0] + [1]*train_X_dup.shape[0] + [0]*train_X_non_dup.shape[0] + [0]*train_X_non_dup.shape[0])
del train_X_dup
del train_X_non_dup
print("Mean target rate : ",train_y.mean())
In [ ]:
In [9]:
df[df['question1']=='How can I transfer all my Google drive and Gmail data to a different account?']
Out[9]:
In [12]:
mapping = {}
df["qmax"] = df.apply( lambda row: max(mapping.setdefault(row["question1"], len(mapping)),
mapping.setdefault(row["question2"], len(mapping))), axis=1)
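mapping.setdefault hands out an incremental id the first time a question is seen, so qmax is the larger of the two ids; a toy illustration on made-up data:
In [ ]:
# Toy illustration: questions seen earlier get smaller ids, so a small qmax hints that
# both questions have appeared before (a frequency-style signal).
toy = pd.DataFrame({'question1': ['A', 'A', 'C'], 'question2': ['B', 'C', 'A']})
m = {}
toy['qmax'] = toy.apply(lambda row: max(m.setdefault(row['question1'], len(m)),
                                        m.setdefault(row['question2'], len(m))), axis=1)
print(toy)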
In [5]:
##################################
### Find the noisy rows where question1 == question2 but is_duplicate == 0
##################################
train = pd.read_csv(PATH+'train.csv')
punctuation='["\'?,\.]' # all of these punctuation characters will be replaced with ''
abbr_dict={
"what's":"what is",
"what're":"what are",
"who's":"who is",
"who're":"who are",
"where's":"where is",
"where're":"where are",
"when's":"when is",
"when're":"when are",
"how's":"how is",
"how're":"how are",
"i'm":"i am",
"we're":"we are",
"you're":"you are",
"they're":"they are",
"it's":"it is",
"he's":"he is",
"she's":"she is",
"that's":"that is",
"there's":"there is",
"there're":"there are",
"i've":"i have",
"we've":"we have",
"you've":"you have",
"they've":"they have",
"who've":"who have",
"would've":"would have",
"not've":"not have",
"i'll":"i will",
"we'll":"we will",
"you'll":"you will",
"he'll":"he will",
"she'll":"she will",
"it'll":"it will",
"they'll":"they will",
"isn't":"is not",
"wasn't":"was not",
"aren't":"are not",
"weren't":"were not",
"can't":"can not",
"couldn't":"could not",
"don't":"do not",
"didn't":"did not",
"shouldn't":"should not",
"wouldn't":"would not",
"doesn't":"does not",
"haven't":"have not",
"hasn't":"has not",
"hadn't":"had not",
"won't":"will not",
punctuation:'',
'\s+':' ', # replace multi space with one single space
}
def process_data(data):
    data.question1 = data.question1.str.lower()  # convert to lower case
    data.question2 = data.question2.str.lower()
    data.question1 = data.question1.astype(str)
    data.question2 = data.question2.astype(str)
    data.replace(abbr_dict, regex=True, inplace=True)
    # display(data.head(2))
    return data
df1 = process_data(train)
df1[(df1['question1']==df1['question2']) & (df1['is_duplicate']==0) ]
Out[5]: