In [1]:
from __future__ import division
import time, os, gc
import numpy as np
import pandas as pd
import scipy
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectPercentile, f_classif
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,StandardScaler
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.feature_extraction import text
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('whitegrid')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

PATH = os.path.expanduser("~") + "/data/quora/"
print os.listdir(PATH)


['GoogleNews-vectors-negative300.bin.gz', 'sample_submission.csv', 'sample_submission.csv .zip', 'test.csv', 'test.csv.zip', 'test_interaction.pkl', 'test_jaccard.pkl', 'test_len.pkl', 'test_porter.csv', 'test_porter_interaction.pkl', 'test_porter_jaccard.pkl', 'test_question1_porter_tfidf.pkl', 'test_question1_tfidf.pkl', 'test_question2_porter_tfidf.pkl', 'test_question2_tfidf.pkl', 'train.csv', 'train.csv.zip', 'train_interaction.pkl', 'train_jaccard.pkl', 'train_len.pkl', 'train_porter.csv', 'train_porter_interaction.pkl', 'train_porter_jaccard.pkl', 'train_question1_porter_tfidf.pkl', 'train_question1_tfidf.pkl', 'train_question2_porter_tfidf.pkl', 'train_question2_tfidf.pkl', 'X_t_tfidf.svm', 'X_test_tfidf.svm', 'X_tfidf.svm', 'X_train_tfidf.svm']

In [2]:
train_orig =  pd.read_csv(PATH+'train.csv', header=0)
test_orig =  pd.read_csv(PATH+'test.csv', header=0)

def stem_str(x,stemmer=SnowballStemmer('english')):
        x = text.re.sub("[^a-zA-Z0-9]"," ", x)
        x = (" ").join([stemmer.stem(z) for z in x.split(" ")])
        x = " ".join(x.split())
        return x
porter = PorterStemmer()
snowball = SnowballStemmer('english')

train_orig['question1'] = train_orig['question1'].astype(str).apply(lambda x:stem_str(x.lower(),snowball))
train_orig['question1'] = train_orig['question1'].astype(str).apply(lambda x:stem_str(x.lower(),snowball))
train_orig['question2'] = train_orig['question2'].astype(str).apply(lambda x:stem_str(x.lower(),snowball))
train_orig['question2'] = train_orig['question2'].astype(str).apply(lambda x:stem_str(x.lower(),snowball))

df1 = train_orig[['question1']].copy()
df2 = train_orig[['question2']].copy()
df1_test = test_orig[['question1']].copy()
df2_test = test_orig[['question2']].copy()

df2.rename(columns = {'question2':'question1'},inplace=True)
df2_test.rename(columns = {'question2':'question1'},inplace=True)

train_questions = df1.append(df2)
train_questions = train_questions.append(df1_test)
# train_questions = df1.append(df1_test)
# train_questions = train_questions.append(df2)
train_questions = train_questions.append(df2_test)
#train_questions.drop_duplicates(subset = ['qid1'],inplace=True)
train_questions.drop_duplicates(subset = ['question1'],inplace=True)

train_questions.reset_index(inplace=True,drop=True)
questions_dict = pd.Series(train_questions.index.values,index=train_questions.question1.values).to_dict()
train_cp = train_orig.copy()
test_cp = test_orig.copy()
train_cp.drop(['qid1','qid2'],axis=1,inplace=True)

test_cp['is_duplicate'] = -1
test_cp.rename(columns={'test_id':'id'},inplace=True)
comb = pd.concat([train_cp,test_cp])

comb['q1_hash'] = comb['question1'].map(questions_dict)
comb['q2_hash'] = comb['question2'].map(questions_dict)

q1_vc = comb.q1_hash.value_counts().to_dict()
q2_vc = comb.q2_hash.value_counts().to_dict()

def try_apply_dict(x,dict_to_apply):
    try:
        return dict_to_apply[x]
    except KeyError:
        return 0
#map to frequency space
comb['q1_freq'] = comb['q1_hash'].map(lambda x: try_apply_dict(x,q1_vc) + try_apply_dict(x,q2_vc))
comb['q2_freq'] = comb['q2_hash'].map(lambda x: try_apply_dict(x,q1_vc) + try_apply_dict(x,q2_vc))
comb['q1_hash_freq'] = comb['q1_hash'].map(lambda x: try_apply_dict(x,q1_vc))
comb['q2_hash_freq'] = comb['q2_hash'].map(lambda x: try_apply_dict(x,q2_vc))

comb['freq_diff'] = (abs(comb['q1_freq'] - comb['q2_freq'])+0.1) / \
               (comb['q1_freq'] * comb['q2_freq'])


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-2-f33d5a92a184> in <module>()
     10 snowball = SnowballStemmer('english')
     11 
---> 12 train_orig['question1'] = train_orig['question1'].astype(str).apply(lambda x:stem_str(x.lower(),snowball))
     13 train_orig['question1'] = train_orig['question1'].astype(str).apply(lambda x:stem_str(x.lower(),snowball))
     14 train_orig['question2'] = train_orig['question2'].astype(str).apply(lambda x:stem_str(x.lower(),snowball))

/usr/local/lib/python2.7/site-packages/pandas/core/series.pyc in apply(self, func, convert_dtype, args, **kwds)
   2235             values = lib.map_infer(values, boxer)
   2236 
-> 2237         mapped = lib.map_infer(values, f, convert=convert_dtype)
   2238         if len(mapped) and isinstance(mapped[0], Series):
   2239             from pandas.core.frame import DataFrame

pandas/src/inference.pyx in pandas.lib.map_infer (pandas/lib.c:63043)()

<ipython-input-2-f33d5a92a184> in <lambda>(x)
     10 snowball = SnowballStemmer('english')
     11 
---> 12 train_orig['question1'] = train_orig['question1'].astype(str).apply(lambda x:stem_str(x.lower(),snowball))
     13 train_orig['question1'] = train_orig['question1'].astype(str).apply(lambda x:stem_str(x.lower(),snowball))
     14 train_orig['question2'] = train_orig['question2'].astype(str).apply(lambda x:stem_str(x.lower(),snowball))

<ipython-input-2-f33d5a92a184> in stem_str(x, stemmer)
      4 def stem_str(x,stemmer=SnowballStemmer('english')):
      5         x = text.re.sub("[^a-zA-Z0-9]"," ", x)
----> 6         x = (" ").join([stemmer.stem(z) for z in x.split(" ")])
      7         x = " ".join(x.split())
      8         return x

/usr/local/lib/python2.7/site-packages/nltk/stem/snowball.pyc in stem(self, word)
    721                     break
    722         else:
--> 723             r1, r2 = self._r1r2_standard(word, self.__vowels)
    724 
    725 

/usr/local/lib/python2.7/site-packages/nltk/stem/snowball.pyc in _r1r2_standard(self, word, vowels)
    230         r1 = ""
    231         r2 = ""
--> 232         for i in range(1, len(word)):
    233             if word[i] not in vowels and word[i-1] in vowels:
    234                 r1 = word[i+1:]

KeyboardInterrupt: 

In [6]:
corr_list = ['id','q1_hash','q2_hash','q1_freq','q2_freq','is_duplicate','freq_diff','q1_hash_freq','q2_hash_freq']
train_comb = comb[comb['is_duplicate'] >= 0][corr_list]
test_comb = comb[comb['is_duplicate'] < 0][corr_list]
train_comb.corr()


Out[6]:
id q1_hash q2_hash q1_freq q2_freq is_duplicate freq_diff q1_hash_freq q2_hash_freq
id 1.000000 0.690308 0.041179 -0.002600 -0.000871 -0.008784 0.001727 -0.002885 -0.001022
q1_hash 0.690308 1.000000 0.282445 -0.359849 -0.243217 -0.207682 0.035621 -0.386223 -0.207151
q2_hash 0.041179 0.282445 1.000000 -0.397311 -0.471671 -0.346925 0.069128 -0.406026 -0.365688
q1_freq -0.002600 -0.359849 -0.397311 1.000000 0.599397 0.343747 -0.166016 0.898113 0.528905
q2_freq -0.000871 -0.243217 -0.471671 0.599397 1.000000 0.265540 -0.099017 0.583196 0.948601
is_duplicate -0.008784 -0.207682 -0.346925 0.343747 0.265540 1.000000 -0.332427 0.334511 0.228869
freq_diff 0.001727 0.035621 0.069128 -0.166016 -0.099017 -0.332427 1.000000 -0.158787 -0.078082
q1_hash_freq -0.002885 -0.386223 -0.406026 0.898113 0.583196 0.334511 -0.158787 1.000000 0.476849
q2_hash_freq -0.001022 -0.207151 -0.365688 0.528905 0.948601 0.228869 -0.078082 0.476849 1.000000

In [33]:
corr_mat = train_comb.corr()
corr_mat.head()
#more frequenct questions are more likely to be duplicates


Out[33]:
id q1_hash q2_hash q1_freq q2_freq is_duplicate
id 1.000000 0.692730 0.286969 -0.001608 -0.000777 -0.008784
q1_hash 0.692730 1.000000 0.492993 -0.341777 -0.202545 -0.206498
q2_hash 0.286969 0.492993 1.000000 -0.392605 -0.466434 -0.349626
q1_freq -0.001608 -0.341777 -0.392605 1.000000 0.494315 0.296621
q2_freq -0.000777 -0.202545 -0.466434 0.494315 1.000000 0.198609

In [29]:
# train_comb['q_hash_diff'] = train_comb['q1_hash'] - train_comb['q2_hash']
train_comb['q_hash_pos'] = train_comb['q1_hash']-train_comb['q2_hash']>0
train_comb['q_hash_pos'] = train_comb['q_hash_pos'].astype(int)
train_comb['q_hash_pos_1'] = train_comb[['q1_freq','q_hash_pos']].apply(lambda x: 1 if x[0]>1 and x[1]>0 else 0, axis=1)
train_comb.corr()


Out[29]:
id q1_hash q2_hash q1_freq q2_freq is_duplicate freq_diff q1_hash_freq q2_hash_freq q_hash_pos q_hash_pos_1
id 1.000000 0.690308 0.041179 -0.002600 -0.000871 -0.008784 0.001727 -0.002885 -0.001022 0.115121 0.071093
q1_hash 0.690308 1.000000 0.282445 -0.359849 -0.243217 -0.207682 0.035621 -0.386223 -0.207151 0.060618 -0.046562
q2_hash 0.041179 0.282445 1.000000 -0.397311 -0.471671 -0.346925 0.069128 -0.406026 -0.365688 -0.583149 -0.472782
q1_freq -0.002600 -0.359849 -0.397311 1.000000 0.599397 0.343747 -0.166016 0.898113 0.528905 0.128596 0.210646
q2_freq -0.000871 -0.243217 -0.471671 0.599397 1.000000 0.265540 -0.099017 0.583196 0.948601 0.292966 0.292321
is_duplicate -0.008784 -0.207682 -0.346925 0.343747 0.265540 1.000000 -0.332427 0.334511 0.228869 0.123509 0.207493
freq_diff 0.001727 0.035621 0.069128 -0.166016 -0.099017 -0.332427 1.000000 -0.158787 -0.078082 0.072430 -0.164259
q1_hash_freq -0.002885 -0.386223 -0.406026 0.898113 0.583196 0.334511 -0.158787 1.000000 0.476849 0.129154 0.210356
q2_hash_freq -0.001022 -0.207151 -0.365688 0.528905 0.948601 0.228869 -0.078082 0.476849 1.000000 0.206482 0.205859
q_hash_pos 0.115121 0.060618 -0.583149 0.128596 0.292966 0.123509 0.072430 0.129154 0.206482 1.000000 0.805124
q_hash_pos_1 0.071093 -0.046562 -0.472782 0.210646 0.292321 0.207493 -0.164259 0.210356 0.205859 0.805124 1.000000

In [59]:
train_comb['is_duplicate'].mean()
pos = train_comb[train_comb['q1_hash']-train_comb['q2_hash']>0]
pos.shape
pos['is_duplicate'].mean()
pos = pos[pos['q1_freq']>1]
pos.shape
pos['is_duplicate'].mean()
pos = train_comb[train_comb['q1_hash']-train_comb['q2_hash']<0]
pos.shape
pos['is_duplicate'].mean()


Out[59]:
0.36919785302629299
Out[59]:
(47463, 8)
Out[59]:
0.53262541347997383
Out[59]:
(32092, 8)
Out[59]:
0.71020815156425277
Out[59]:
(354682, 8)
Out[59]:
0.34458472660016576

In [30]:
list1 = []
list1.append(0)
gpf = train_comb['q2_hash'].values
tag = gpf[0]
for i in range(train_comb.shape[0])[1:]:
    if gpf[i]-tag<0:
        list1.append(gpf[i]-tag)
    if gpf[i]-tag>=0:
        list1.append(gpf[i]-tag)
        tag=gpf[i]

train_comb['q2_change'] = list1

list1 = []
list1.append(0)
gpf = train_comb['q1_hash'].values
tag = gpf[0]
for i in range(train_comb.shape[0])[1:]:
    if gpf[i]-tag<0:
        list1.append(gpf[i]-tag)
    if gpf[i]-tag>=0:
        list1.append(gpf[i]-tag)
        tag=gpf[i]
        
train_comb['q1_change'] = list1

In [121]:
v1 = 0
train_comb[(train_comb['q1_change']<=v1) & (train_comb['q2_change']<=v1)]['is_duplicate'].shape
train_comb[(train_comb['q1_change']<=v1) & (train_comb['q2_change']<=v1)]['is_duplicate'].mean()
train_comb[(train_comb['q1_change']==1)&(train_comb['q2_change']==1)]['is_duplicate'].shape
train_comb[(train_comb['q1_change']==1)&(train_comb['q2_change']==1)]['is_duplicate'].mean()
train_comb[(train_comb['q1_change']!=1)&(train_comb['q2_change']==1)]['is_duplicate'].shape
train_comb[(train_comb['q1_change']!=1)&(train_comb['q2_change']==1)]['is_duplicate'].mean()
train_comb[(train_comb['q1_change']==1)&(train_comb['q2_change']!=1)]['is_duplicate'].shape
train_comb[(train_comb['q1_change']==1)&(train_comb['q2_change']!=1)]['is_duplicate'].mean()


Out[121]:
(79710,)
Out[121]:
0.78024087316522395
Out[121]:
(209993,)
Out[121]:
0.23252679851233135
Out[121]:
(34915,)
Out[121]:
0.15153945295718171
Out[121]:
(79672,)
Out[121]:
0.41357063962245205

In [36]:
train_comb['q1_q2_change_mean'] = (train_comb['q1_change'] + train_comb['q2_change'])/2.0
train_comb['q1_q2_change_min'] = train_comb[['q1_change','q2_change']].apply(lambda x: min(x[0],x[1]),axis=1)
train_comb['q1_q2_change_max'] = train_comb[['q1_change','q2_change']].apply(lambda x: max(x[0],x[1]),axis=1)
train_comb.corr()


Out[36]:
id q1_hash q2_hash q1_freq q2_freq is_duplicate freq_diff q1_hash_freq q2_hash_freq q_hash_pos q_hash_pos_1 q2_change q1_change q1_q2_change_mean q1_q2_change_min q1_q2_change_max
id 1.000000 0.690308 0.041179 -0.002600 -0.000871 -0.008784 0.001727 -0.002885 -0.001022 0.115121 0.071093 -0.020399 -0.356764 -0.040099 -0.027226 -0.281257
q1_hash 0.690308 1.000000 0.282445 -0.359849 -0.243217 -0.207682 0.035621 -0.386223 -0.207151 0.060618 -0.046562 0.239944 0.427433 0.259142 0.247919 0.364492
q2_hash 0.041179 0.282445 1.000000 -0.397311 -0.471671 -0.346925 0.069128 -0.406026 -0.365688 -0.583149 -0.472782 0.998102 0.315344 0.995484 0.997163 0.495795
q1_freq -0.002600 -0.359849 -0.397311 1.000000 0.599397 0.343747 -0.166016 0.898113 0.528905 0.128596 0.210646 -0.397409 -0.464393 -0.415473 -0.400677 -0.519822
q2_freq -0.000871 -0.243217 -0.471671 0.599397 1.000000 0.265540 -0.099017 0.583196 0.948601 0.292966 0.292321 -0.471925 -0.315102 -0.480048 -0.471667 -0.424641
is_duplicate -0.008784 -0.207682 -0.346925 0.343747 0.265540 1.000000 -0.332427 0.334511 0.228869 0.123509 0.207493 -0.346618 -0.259241 -0.354151 -0.345165 -0.369878
freq_diff 0.001727 0.035621 0.069128 -0.166016 -0.099017 -0.332427 1.000000 -0.158787 -0.078082 0.072430 -0.164259 0.069072 0.044340 0.070160 0.058338 0.276045
q1_hash_freq -0.002885 -0.386223 -0.406026 0.898113 0.583196 0.334511 -0.158787 1.000000 0.476849 0.129154 0.210356 -0.406110 -0.498197 -0.425902 -0.410139 -0.544893
q2_hash_freq -0.001022 -0.207151 -0.365688 0.528905 0.948601 0.228869 -0.078082 0.476849 1.000000 0.206482 0.205859 -0.365866 -0.268117 -0.373507 -0.365617 -0.358035
q_hash_pos 0.115121 0.060618 -0.583149 0.128596 0.292966 0.123509 0.072430 0.129154 0.206482 1.000000 0.805124 -0.590683 -0.068443 -0.582470 -0.590505 -0.147689
q_hash_pos_1 0.071093 -0.046562 -0.472782 0.210646 0.292321 0.207493 -0.164259 0.210356 0.205859 0.805124 1.000000 -0.477532 -0.152411 -0.476366 -0.477399 -0.232615
q2_change -0.020399 0.239944 0.998102 -0.397409 -0.471925 -0.346618 0.069072 -0.406110 -0.365866 -0.590683 -0.477532 1.000000 0.337464 0.998590 0.999479 0.513399
q1_change -0.356764 0.427433 0.315344 -0.464393 -0.315102 -0.259241 0.044340 -0.498197 -0.268117 -0.068443 -0.152411 0.337464 1.000000 0.386955 0.356322 0.824397
q1_q2_change_mean -0.040099 0.259142 0.995484 -0.415473 -0.480048 -0.354151 0.070160 -0.425902 -0.373507 -0.582470 -0.476366 0.998590 0.386955 1.000000 0.999143 0.549391
q1_q2_change_min -0.027226 0.247919 0.997163 -0.400677 -0.471667 -0.345165 0.058338 -0.410139 -0.365617 -0.590505 -0.477399 0.999479 0.356322 0.999143 1.000000 0.514343
q1_q2_change_max -0.281257 0.364492 0.495795 -0.519822 -0.424641 -0.369878 0.276045 -0.544893 -0.358035 -0.147689 -0.232615 0.513399 0.824397 0.549391 0.514343 1.000000

In [40]:
v1=0
train_comb['q_change_pair'] = (train_comb['q1_change']<v1) & (train_comb['q2_change']<v1)
train_comb['q_change_pair'] = train_comb['q_change_pair'].astype(int)
train_comb.corr()


Out[40]:
id q1_hash q2_hash q1_freq q2_freq is_duplicate freq_diff q1_hash_freq q2_hash_freq q_hash_pos q_hash_pos_1 q2_change q1_change q1_q2_change_mean q1_q2_change_min q1_q2_change_max q_change_pair
id 1.000000 0.690308 0.041179 -0.002600 -0.000871 -0.008784 0.001727 -0.002885 -0.001022 0.115121 0.071093 -0.020399 -0.356764 -0.040099 -0.027226 -0.281257 0.169849
q1_hash 0.690308 1.000000 0.282445 -0.359849 -0.243217 -0.207682 0.035621 -0.386223 -0.207151 0.060618 -0.046562 0.239944 0.427433 0.259142 0.247919 0.364492 -0.355028
q2_hash 0.041179 0.282445 1.000000 -0.397311 -0.471671 -0.346925 0.069128 -0.406026 -0.365688 -0.583149 -0.472782 0.998102 0.315344 0.995484 0.997163 0.495795 -0.545675
q1_freq -0.002600 -0.359849 -0.397311 1.000000 0.599397 0.343747 -0.166016 0.898113 0.528905 0.128596 0.210646 -0.397409 -0.464393 -0.415473 -0.400677 -0.519822 0.526781
q2_freq -0.000871 -0.243217 -0.471671 0.599397 1.000000 0.265540 -0.099017 0.583196 0.948601 0.292966 0.292321 -0.471925 -0.315102 -0.480048 -0.471667 -0.424641 0.435390
is_duplicate -0.008784 -0.207682 -0.346925 0.343747 0.265540 1.000000 -0.332427 0.334511 0.228869 0.123509 0.207493 -0.346618 -0.259241 -0.354151 -0.345165 -0.369878 0.422098
freq_diff 0.001727 0.035621 0.069128 -0.166016 -0.099017 -0.332427 1.000000 -0.158787 -0.078082 0.072430 -0.164259 0.069072 0.044340 0.070160 0.058338 0.276045 -0.323348
q1_hash_freq -0.002885 -0.386223 -0.406026 0.898113 0.583196 0.334511 -0.158787 1.000000 0.476849 0.129154 0.210356 -0.406110 -0.498197 -0.425902 -0.410139 -0.544893 0.545906
q2_hash_freq -0.001022 -0.207151 -0.365688 0.528905 0.948601 0.228869 -0.078082 0.476849 1.000000 0.206482 0.205859 -0.365866 -0.268117 -0.373507 -0.365617 -0.358035 0.373949
q_hash_pos 0.115121 0.060618 -0.583149 0.128596 0.292966 0.123509 0.072430 0.129154 0.206482 1.000000 0.805124 -0.590683 -0.068443 -0.582470 -0.590505 -0.147689 0.213000
q_hash_pos_1 0.071093 -0.046562 -0.472782 0.210646 0.292321 0.207493 -0.164259 0.210356 0.205859 0.805124 1.000000 -0.477532 -0.152411 -0.476366 -0.477399 -0.232615 0.323326
q2_change -0.020399 0.239944 0.998102 -0.397409 -0.471925 -0.346618 0.069072 -0.406110 -0.365866 -0.590683 -0.477532 1.000000 0.337464 0.998590 0.999479 0.513399 -0.556578
q1_change -0.356764 0.427433 0.315344 -0.464393 -0.315102 -0.259241 0.044340 -0.498197 -0.268117 -0.068443 -0.152411 0.337464 1.000000 0.386955 0.356322 0.824397 -0.677780
q1_q2_change_mean -0.040099 0.259142 0.995484 -0.415473 -0.480048 -0.354151 0.070160 -0.425902 -0.373507 -0.582470 -0.476366 0.998590 0.386955 1.000000 0.999143 0.549391 -0.583421
q1_q2_change_min -0.027226 0.247919 0.997163 -0.400677 -0.471667 -0.345165 0.058338 -0.410139 -0.365617 -0.590505 -0.477399 0.999479 0.356322 0.999143 1.000000 0.514343 -0.558467
q1_q2_change_max -0.281257 0.364492 0.495795 -0.519822 -0.424641 -0.369878 0.276045 -0.544893 -0.358035 -0.147689 -0.232615 0.513399 0.824397 0.549391 0.514343 1.000000 -0.814297
q_change_pair 0.169849 -0.355028 -0.545675 0.526781 0.435390 0.422098 -0.323348 0.545906 0.373949 0.213000 0.323326 -0.556578 -0.677780 -0.583421 -0.558467 -0.814297 1.000000

In [47]:
train_comb[train_comb['q_change_pair']==1].shape
train_comb[train_comb['q_hash_pos_1']==1].shape
train_comb[(train_comb['q_change_pair']==0) & (train_comb['q_hash_pos_1']==1)].shape
train_comb[(train_comb['q_change_pair']==0) & (train_comb['q_hash_pos_1']==1)]['is_duplicate'].mean()


Out[47]:
(79709, 17)
Out[47]:
(32092, 17)
Out[47]:
(11706, 17)
Out[47]:
0.5579190158892875

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [150]:
df= pd.read_csv(PATH+'train.csv')
pos = df
v1 = 'Snapchat'
record = pos[(pos['question1'].str.contains(v1)) | (pos['question2'].str.contains(v1))]
record


Out[150]:
id qid1 qid2 question1 question2 is_duplicate
1056 1056 2106 2107 I search for someone who is definitely on Snap... On Snapchat, someone blocked me, but still sho... 0
1869 1869 3721 3722 How do u get olds messages from snapchat? What should I do to hack Snapchat messages rem... 0
2115 2115 4209 4210 Why is Snapchat currently more or less success... How is Snapchat valued more than Twitter? 0
2333 2333 4639 4640 Why can I not add my friend back on Snapchat? If you delete someone off snapchat how do you ... 0
3158 3158 6261 6262 I forgot my password and the email address I u... How do I delete an account on instagram if I c... 0
3499 3499 6933 6934 Is Snapchat purposefully leaking false acquisi... How can I create and host an Our Story / Live ... 0
3621 3621 7174 7175 How can I increase my Snapchat score instantly? Is Snapchat dead? 0
4816 4816 9508 6261 How do I address formally two persons in an em... I forgot my password and the email address I u... 0
5124 5124 10097 10098 How do you delete saved Snapchat messages that... How do you delete messages on Snapchat? 0
5248 5248 10333 10334 What happens when I block and unblock someone ... On Snapchat, when you block somebody, then unb... 0
5855 5855 11498 11499 If it shows up as pending on Snapchat did they... Today “Mumbai” is Live on Snapchat. What shoul... 0
5869 5869 11524 11525 When I take pictures and send them in SnapChat... I deleted my Snapchat memories after I backed ... 0
6211 6211 12177 12178 Why can I not see a friends Snapchat score any... Why can't I delete my messages on Snapchat? 0
7689 7689 15008 15009 How do you delete a private message which fail... Is there any way to delete Snapchat saved mess... 0
7725 7725 15080 15081 On Snapchat, how do I know if someone deleted ... On Snapchat, how do I know someone is still fo... 0
9016 9016 15008 17546 How do you delete a private message which fail... How can I delete "pending" messages in snapchat? 0
9444 9444 18346 18347 How can you tell if someone blocked you from v... Can someone view my story on Snapchat if they'... 0
9491 9491 18434 18435 Is there a way to block someone's Snapchat sto... Is there any way to view the person's profile ... 0
9703 9703 18844 18845 Why won't Snapchat let me sign in? Which is the best broadband service in Vishal ... 0
10032 10032 18435 19474 Is there any way to view the person's profile ... I added someone on snapchat and as far as I'm ... 0
10409 10409 10097 20167 How do you delete saved Snapchat messages that... Why can't I unsave messages on Snapchat? 0
11046 11046 21361 21362 Someone deleted me from Snapchat but I can sti... If I deleted someone on snapchat, and then mad... 0
11050 11050 21368 21369 Can Instagram stories kill Snapchat from the s... Will Instagram Stories outdo Snapchat? 1
11834 11834 22837 22838 Is there a way to unsend Snapchats that haven'... Why can’t I send a chat on Snapchat? 0
11839 11839 22847 22848 My dear Quorans, Do we still need sales or mar... In the future will we still see startups like ... 0
11894 11894 22950 22951 How can I make my Snapchat score increase faster? How does score increase on Snapchat? 1
12050 12050 23246 23247 What does a red heart emoji next to somebody's... What do the different colors of hearts in emoj... 0
13383 13383 10333 25707 What happens when I block and unblock someone ... If you block someone on WhatsApp what happen t... 0
14437 14437 27653 27654 How is Snapchat using so much storage on my iP... What technology stack does Snapchat use? 0
14758 14758 28244 28245 On Snapchat, why does someone I deleted from f... When you delete someone off Snapchat do they a... 0
... ... ... ... ... ... ...
388551 388551 15081 120999 On Snapchat, how do I know someone is still fo... Someone deleted me on Snapchat so I deleted th... 0
388566 388566 85620 228882 On Snapchat, if I block someone and they saved... How can I see if someone has saved your messag... 0
389350 389350 521838 93535 How long will it take me to learn programming ... How long would it take to learn programming we... 0
389608 389608 52188 69314 If somebody adds you on Snapchat, why can't yo... On Snapchat, I deleted someone. Can they re-ad... 0
390410 390410 61020 10097 How do I retrieve deleted Snapchat messages? How do you delete saved Snapchat messages that... 0
390732 390732 49343 206784 How can I find out my child's Snapchat password? How can I hack my snapchat password? 1
391610 391610 15080 524207 On Snapchat, how do I know if someone deleted ... How do you know if someone delete you on snapc... 1
391745 391745 193638 405559 How do I delete save message from everyone on ... How do I delete a conversation from snapchat? 1
391994 391994 76559 231771 How do I delete my Snapchat conversations in b... If you delete your snapchat does it delete the... 0
392468 392468 40744 170140 Does Snapchat send screenshot notifications fo... Does Snapchat tell the user who screenshotted ... 0
393066 393066 525816 49344 What's the site to go on if you forget both em... How do I get someone's Snapchat password? 0
393349 393349 56907 279523 On Snapchat, what happens when you block someone? On Snapchat, when someone blocks you, can they... 0
394030 394030 167955 526888 Which Typeface / Font does Snapchat use? What font is on Snapchat for iPhone 6? 0
394877 394877 28245 61020 When you delete someone off Snapchat do they a... How do I retrieve deleted Snapchat messages? 0
394958 394958 306499 85131 If you logout of Snapchat, will your messages ... How do I delete sent pictures on chat for Snap... 0
395069 395069 528013 528014 Someone deleted me on snapchat, so I deleted h... My ex blocked me on Snapchat, then appeared on... 0
395937 395937 85132 322215 Can I see my deleted Snapchat history? How do I recover deleted snapchats? 0
395996 395996 166093 18434 Why can I not see someone's story on snapchat? Is there a way to block someone's Snapchat sto... 0
396726 396726 529808 347375 What is Snapchat's strategy? What is Snapchat? 0
396924 396924 61020 10098 How do I retrieve deleted Snapchat messages? How do you delete messages on Snapchat? 0
397703 397703 480398 62994 If I unfriend someone on Snapchat can I still ... If I unfriend someone on Snapchat can they sti... 1
398993 398993 532234 532235 How would you evaluate Snapchat's RSU offer ag... I am trying to estimate the value of 1500 RSU ... 0
398996 398996 532240 532241 Historical gun confiscations? How effective are the game mechanics in Snapch... 0
399365 399365 532624 532625 Which database is used by snapchat? Why would someone use Instagram Stories over S... 0
400213 400213 533554 49344 How do I get into someone's Snapchat account? How do I get someone's Snapchat password? 0
401474 401474 12178 31473 Why can't I delete my messages on Snapchat? On Snapchat, if I remove someone as a friend, ... 0
401533 401533 534932 534933 What is the salary for new grads starting at S... What is the salary for new grads starting at A... 0
401620 401620 79563 6261 How can I figure out my Snapchat password and ... I forgot my password and the email address I u... 0
402384 402384 335452 535860 What are the Snapchat usernames of celebrities? How can I change my Snapchat username? 0
402487 402487 121599 238385 How do I see old snapchat conversations? Why does Snapchat automatically delete the his... 0

782 rows × 6 columns


In [149]:
list1 = ["What's the best way to learn faster?",
 'How do I learn quickly?',
 'How can l learn faster?',
 'How can I learn faster?',
 'How should I learn faster?',
 'How do I learn with a minimal amount of time?',
 'How do you learn the most in the shortest time?',
 'How can you learn fast?']
# df= pd.read_csv(PATH+'train.csv')
record = df[(df['question1'].isin(list1)) | (df['question2'].isin(list1))]
record[record['is_duplicate']==0]


Out[149]:
id qid1 qid2 question1 question2 is_duplicate
93883 93883 51852 156879 What's the best way to learn faster? How can I learn to speak faster? 0
188448 188448 286982 43764 How do I learn anything fast? How can you learn fast? 0
367493 367493 43764 230277 How can you learn fast? What is the fastest, and the most efficient wa... 0

In [163]:
df1= pd.read_csv(PATH+'train.csv',nrows=50000)
pos = df1[df1.is_duplicate==0]
g = nx.Graph()
g.add_nodes_from(pos.question1)
g.add_nodes_from(pos.question2)
edges = list(pos[['question1','question2']].to_records(index=False))
g.add_edges_from(edges)
cc = filter(lambda x : (len(x) > 2), 
            nx.connected_component_subgraphs(g))
dict0 = {}
for i in range(len(cc)):
    dict0[i] = cc[i].nodes()
    
pos = df1[df1.is_duplicate==1]
g = nx.Graph()
g.add_nodes_from(pos.question1)
g.add_nodes_from(pos.question2)
edges = list(pos[['question1','question2']].to_records(index=False))
g.add_edges_from(edges)
cc = filter(lambda x : (len(x) > 2), 
            nx.connected_component_subgraphs(g))
dict1 = {}
for i in range(len(cc)):
    dict1[i] = cc[i].nodes()

In [178]:
cnt=0
for key,value in dict0.iteritems():
    for v1 in value:
#     record = test[(test['question1'].isin(v1)) | (test['question2'].isin(v1))]
        result = [value1 for key,value1 in dict1.iteritems() if v1 in value1]
        if len(result)>0:
    #         nx.draw_circular(cc[key], with_labels=True, alpha=0.5, font_size=12)
    #         plt.show()
            print v1
            print  value
            result
            cnt+=1
        if cnt>1:
            break


What is the best way to deal with social anxiety disorder?
["What's it like to have social anxiety disorder?", 'How common is social anxiety disorder?', 'What is the best way to deal with social anxiety disorder?']
Out[178]:
[['What are different ways to deal with social anxiety?',
  'How do I deal with social anxiety disorder?',
  "What's the best advice you could give to a person suffering from social anxiety disorder?",
  'How should you deal with social anxiety?',
  'How do I deal with my social anxiety?',
  'What is the best way to deal with social anxiety disorder?']]
Why does temperature decrease with increase in altitude?
['What is the maximum altitude for a drone?', 'Why is it generally colder at higher elevations?', 'Why does temperature decrease with increase in altitude?']
Out[178]:
[['Why does temperature decrease when altitude increases?',
  'Why does the air temperature decrease with an increase in height?',
  'Why does temperature decrease with increase in altitude?']]
How do I open a private Instagram account?
['How do I open a private Instagram account?', 'How many Oriya girls wear mini skirts?', 'How do I recover a deleted Instagram name?', "Is it possible to view someone's private Instagram account?", 'Is there a way to view a private Instagram?', 'Does viewing your own Instagram video count as a view?', 'Why are Instagram filters free?', "How can you look at someone's private Instagram account without following them?", "How do I see followers on someone's private Instagram?", "Why did my crush followed my cousin's private account on instagram, what does this mean?", 'Why did Symbian fail?', 'If I link my Instagram account to my Facebook account, will my friends from Facebook be able to see the photos I post even if my Instagram account is private?', 'Can you view pictures on Instagram without an account?']
Out[178]:
[['How do I open a private Instagram account?',
  'How do I look at the followers of a private instagram account?',
  "How do I look at someone's Instagram when it's private?",
  'How can I view a private Instagram?',
  "How do I look at photos on an Instagram account if it's private?",
  'How do I see a private Instagram account?',
  "How can I see someone's private instagram account?",
  'Can I view a private Instagram?',
  'Can I see a private Instagram?',
  "How do I view someones's private instagram pictures?"]]
Which is the best place to visit in Goa?
['Which is the best place to visit in Goa?', 'Any hippie places to visit in Goa?', 'What are the 10 best things that can be done in Goa?']
Out[178]:
[['We are Planning to visit Goa for three days,which are the best places to visit?',
  'What are the best places to visit in Goa?',
  'What are some of the best local places to visit in Goa?',
  'Which are best places to visit in GOA during vacations?',
  'Which are the best places in Goa to visit alone?',
  'What places should one visit in Goa?',
  'What are all the best places to visit in goa?',
  'What are the places in Goa to visit?',
  'Which is the best place to visit in Goa with Friends?',
  'What are the best places to visit in Goa in 2 days?',
  'Which is the best place to visit in Goa?']]
How do I get started with Android application development?
['How do I get started with Android application development?', 'How do I make an Android application using Python?', 'What should I do to make an Android app?']
Out[178]:
[['How do I begin with android application development?',
  'How do I get started with Android application development?',
  'What is the best way to get started with learning Android development?',
  'How do I start with Android development?']]
How do you get a girl to like you?
['How do you get a girl to like you?', 'How do I get over a girl I like?', 'How do I get over a girl that I like?', 'What you can do to Get a Boy to like You?']
Out[178]:
[["How do I get a girl's attention?",
  'How do you get a girl to like you?',
  'There is a girl that I like. How do I get her to like me?',
  'What are some ways to get a girlfriend?',
  'What is the best way to get a girl to like you?']]
What is the best way to start learning hacking?
['What is the best way to start learning hacking?', 'How does one become a hacker?', 'How do I find a hacker?']
Out[178]:
[['What is the best way to learn how to hack (whitehat)?',
  'What are the best way to learn hacking?',
  'What is the best way to learn hacking in short time?',
  'How do I start hack with no knowledge?',
  'What is the best way to start learning hacking?',
  'How should I learn hacking by myself?',
  'How can I learn to hack seriously?',
  'How does a person learn how to hack?',
  'What is the best possible way for learning hacking?',
  'Which is the best way to learn hacking?',
  'Which is the best way to learn hacking just as a hobby?',
  'How does one learn how to hack?',
  'How can I learn hacking for security purposes?',
  'What is the best way to learn white hat hacking?']]
What is best way to make money online?
['What is best way to make money online?', 'What is the hardest way to make money online?', 'What is best way to ask for money online?']
Out[178]:
[['What are ways I can make money online?',
  'What are ways of earning money online?',
  'What should I do to earn money online?',
  'How can I earn money on internet?',
  'What is make money online?',
  'What should I do to make money online in India?',
  'What is the easiest way to earn money from online?',
  'What are some easy ways to make done extra money online?',
  'How can I earn money easily online?',
  'How do I earn money from the Internet?',
  'How can one make money online?',
  'How can I realistically make money online?',
  'How do I really make money online?',
  'How do I make money from home?',
  'How can we earn money online without investment?',
  'How do you earn money from internet?',
  'How can I earn money part time online?',
  'How can I earn money online easily?',
  "I'm 18. How can I make money online?",
  'What is the best way for making money online?',
  "What's the easiest way to make money online?",
  'What are the easiest ways to make good money using the Internet?',
  'What are the best ways to earn money from home?',
  'How can I make money online quickly and easily?',
  'How should I earn money online working from home?',
  'Am not starting big? How can I make $1000 per month online?',
  'How can I make money online consistently?',
  'How can I earn from online?',
  'What are the easiest ways to earn money online?',
  'How can I earn money online, seriously?',
  'What is the easiest way to earn money using internet?',
  'What is an easy way make money online?',
  'What are ways to make money online at home?',
  'Can I earn money online?',
  'What are the various ways through which one can earn money online?',
  'How can I earn money online from home only?',
  'How do we make money online?',
  'How could I make money online?',
  'How can I start to make money online?',
  'What is the easiest way to make a little money online?',
  'How do I earn money online?',
  'What is best way to make money online?',
  'How can I make money online for job?',
  'What is the easy way to make money online?',
  'How can we earn money online in india?',
  'What is a way to make money online?',
  'How can I earn money online?',
  'What are the best ways to make money online?',
  'What are some of the best ways of earning money by working at home?',
  'How do you make money online?',
  'Is there any easy way to make money online?',
  'What are the easy ways to earn money online?',
  'How do you make easy money online?',
  'How can i make money online easily?',
  'How do I earn more money through internet/online?',
  'Can I make money online?',
  'How does one earn money online without an investment from home?']]
How can I increase the traffic on my website?
['How can I increase the traffic on my website?', 'How can I increase traffic on buymarijuanaonline.store?', 'I am a blogger. I want to get huge traffic on my blog. What should I do?']
Out[178]:
[['How can I increase traffic to a story blog?',
  'How can I increase the traffic to my website?',
  'What is the best way to drive traffic to a website?',
  'How Do I get traffic on website?',
  'How do I increase traffic on my site?',
  'How can I get traffic in my website?',
  'How do I get more traffic on my website?',
  'How can I increase traffic to my website using social media?',
  'How can I increase traffic on my blog?',
  'How can I Increase the traffic of my blog?',
  'What is the best way to get free traffic to my website?',
  'How to increase my website Traffic?',
  'How can I increase the traffic on a site?',
  'How do I increase organic traffic to website?',
  'What is the best way to increase traffic for a new blog?',
  'How can I build traffic for my website?',
  'How can I increase traffic to my site and what are some suggestions on how to get more of it?',
  'How do I  increase traffic on my site?',
  'How can I get traffic on website?',
  'How can I get traffic for my website?',
  'What is the best way to get traffic on your website?',
  'How can I increase the traffic on my blog (www.midnightexpressions.wordpress.com)?',
  'How can I increase the traffic on my website? Jeenkart.com',
  'What are the best way to increase website traffic organically?',
  'How do i get traffic for website?',
  'How can I increase the traffic to a website?',
  'How do I get more traffic to my site?',
  'How can I increase traffic to my websites by Facebook?',
  'How can I increase website traffic?',
  'How do I flow traffic to my website?',
  'How can I increase the traffic on my website without investing?',
  'What is the increase organic traffic of websites?',
  'How traffic increased for websites through backlinks?',
  'How can I increase the traffic on my website?',
  'How can I increase traffic very soon on my blog?',
  'How can I increase a website traffic?']]
What's your new year resolution for 2017?
["What's your new year resolution for 2017?", "Can you suggest some good new year resolution's that anyone can implement for 2017 ?", "What's your new year resolution do or not to do?"]
Out[178]:
[["What will be your New Year's resolution for 2017?",
  'What are your new year resolutions for 2017?',
  "What are some of your New Year's resolutions for 2017?",
  'What is your new year resolution, short term and long term goal for 2017?',
  "What is your New Year's resolution for 2017?",
  'What are your resolutions for 2017? And why?',
  "What's your New Year resolutions for 2017 and what will you do to accomplish your goal?",
  "What's are your resolutions for 2017?",
  "What's your New Year's resolution for 2017?",
  'What is your resolution for 2017?',
  "What are your New Year's resolutions for 2017?",
  'Do you have any New Years resolutions for 2017?',
  'What is your new year resolution?',
  "What's your resolutions for 2017?",
  'What is your resolution for this year 2017?',
  'What would be your New Year resolutions for 2017?',
  'What will be your new year resolution for 2017 and your plan of execution?',
  'What can be my new year resolution for 2017?',
  'What is your new year resolution for 2017 or goal for 2017?',
  'What are some of the best New Years resolutions for 2017?',
  'What are your 2017 resolutions?',
  'What are your New Year\xe2\x80\x99s resolutions?',
  "What is your creative New Year's resolution for 2017?",
  "What's your 2017 new year resolution?",
  'What should be my resolution for 2017?',
  "What's your new year resolution for 2017?",
  'What Is your New year resolutions in 2017?',
  'What is your New Year resolution?',
  'What are your New Years resolutions for 2017?',
  'What are your New Year resolutions for the upcoming year 2017?',
  "What is your New Year's resolutions for 2017?",
  'What is/are your New Year resolutions for 2017?',
  'What are some new year resolutions for 2017?',
  'What is your 2017 New Year\xe2\x80\x99s resolution?',
  'What is your New Year Resolution for 2017?',
  'What are some meaningful new year resolutions for 2017?',
  'What is your New Year\xe2\x80\x99s Resolution(s) for 2017?',
  "What's your New Year 2017 resolution?",
  'What are your new year resolutions\xe2\x80\x992017?',
  'What are the resolutions you are going to take for the upcoming New year 2017?',
  "What's your new year 2017 resolution to improve your daily life routine?",
  "What are your New Year's resolutions?",
  'What will be your 2017 resolution?']]
What app allows you to listen to music without WiFi or Internet?
['What app allows you to listen to music without WiFi or Internet?', 'Which is the best no WiFi music app for the iPhone?', 'How do you see a saved Wi-Fi password on Android without root privileges?', 'What is a downloadable app or device which allows you to listen to audio academic lessons without wifi?']
Out[178]:
[['What app allows you to listen to music without WiFi or Internet?',
  'What music app is free without wifi connection?',
  'Are there any music apps that I can listen to without needing an Internet connection?',
  'What app for music without wifi for iPod?',
  'What are some free music apps to download whereby you can download music in the app itself and listen to the music when offline?']]
Where can I find delicious cupcakes at Gold Coast?
['Where can I find delicious cupcakes at Gold Coast?', 'Where can I found different types of gluten free vegan cupcakes in Gold Coast?', 'Where can I get best and freshest ingredients on cupcakes in Gold Coast?']
Out[178]:
[['Where can I get an unique taste for cupcakes in Gold Coast?',
  'Where can I get best flavors, designs and decorations for cupcakes at Gold Coast?',
  'Where can I buy best quality customized cupcakes in Gold Coast?',
  'Where can I find delicious cupcakes at Gold Coast?',
  'Where can I found different cupcake flavors in Gold Coast?',
  'Where can I buy special flavor cupcake at Gold Coast?',
  'Where can I buy very incredible and most amazing cupcakes in Gold Coast?',
  'Where can I get good quality cupcakes and a lot of different flavor in Gold Coast?',
  'Where can I found different flavours for cupcakes at Gold Coast?',
  'Where can I get highest quality, tastiest cupcakes across the Gold Coast?',
  'Where can I buy best quality gourmet cupcakes in Gold Coast?',
  'Where can I get wonderful flavors on cupcakes in Gold Coast?',
  'Where can I find the best quality cupcakes in Gold Coast?']]
What are the most annoying types of questions on Quora?
['What are the most annoying types of questions on Quora?', 'What are the most annoying fitness questions that need to die on Quora?', 'What is the silliest question on Quora?']
Out[178]:
[['What are some dumb questions ever asked on Quora?',
  'What are the most annoying questions that you come across in Quora?',
  'What are the most annoying questions you see on Quora?',
  'What are the dumbest questions ever asked on Quora?',
  'What are the most annoying types of questions on Quora?',
  'What is the most stupid question asked on Quora?',
  'What are the most annoying questions that you feel ridiculous in Quora?',
  'What is the stupidest question asked on Quora?']]
How do I reset my Gmail password when I don't remember my recovery information?
["How do I reset my Gmail password when I don't remember my recovery information?", 'How do I reset my password for old Gmail account?', 'How do I recover password for Gmail password without security questions?', "How do I recover my Gmail password when I don't remember my recovery mail ID?", 'How do I recover my forgotten Gmail password?', 'How can I reset/change my password for a different Gmail account from my new account?', 'Can I reset my Gmail password without having to change my username?', 'How do I recover my Gmail password with a recovery mobile number and a Gmail address?', 'How do I recover my Gmail account without the recovery email?', "I have a Gmail address and can't remember the password. The only pictures of my son I own are on it. How do I go about getting into my account?", 'How can you recover an gmail account without any information?']
Out[178]:
[["How do you rest your rescue password if you don't remember your answers to the sequrity questions?",
  "How do I reset my Gmail password when I don't remember my recovery information?",
  'How do I reset my Gmail password when I forgotten it?',
  'How do I reset my gmail password when they are not highlighting my recovery email option?',
  "How do I reset my Gmail password when I don't have access to my recovery information?",
  "I was suddenly logged off Gmail. I can't remember my Gmail password and just realized the recovery email is no longer alive. What can I do?",
  'How can I add a recovery phone number to my Gmail account without password to my account?',
  "I can't remember my Gmail password or my recovery email. How can I recover my e-mail?",
  'How can I recover my Gmail forgot my password and recovery no?',
  "I forgot my Gmail password and I can't answer the Gmail recovery questions. What can I do?",
  "How can I reset my Gmail password if I don't remember my recovery Email and current password?",
  'I lost my password with my Gmail account. How do I reset it without the account recovery info?',
  "With a forgotten Gmail password, how do you find an old Gmail password when you don't remember the recovery information?",
  "How do I recover my lost Gmail password if I don't have the same number and don't remember the recovery email?",
  'How do I reset my password to Gmail without my recovery information?',
  "How do I gain access to my gmail when I don't have access to the phone number or recovery email?",
  "How can I reset my Gmail password when I don't remember my recovery information?"]]
Which is the easiest programming language to learn?
['Which is the easiest programming language to learn?', 'Which is the best programming language for beginners?', 'Which language should I start with to learn coding?']
Out[178]:
[['Which is the best easiest programming language?',
  'Which is the easiest programming language to learn?',
  'Which is the easiest programming language to master?']]
What are the habits of highly successful people?
['What are the habits of highly successful people?', 'What are good eating habits of successful people?', 'How could I develop the habits of highly successful people?']
Out[178]:
[['What are the habits of highly successful people?',
  'What are habits of successful people?',
  'What are some positive habits successful people practice on a daily basis?']]
What are the best movies of all time?
['What are the best movies of all time?', 'What are the must watch movies to see before you die?', 'What movie can you watch all the time and never get tired of watching?', 'Which Austin Powers movie is the best?']
Out[178]:
[['What are the best movies of all time?',
  'Which is best movie in history?',
  'Which according to you is the best movie of all time? Select only one choice.']]
How could I get rich?
['How could I get rich?', 'What are the best ways to become rich quickly?', 'How do you get rich overnight?']
Out[178]:
[['How could I get rich?',
  'How can I get rich soon?',
  'What are some ways to get rich?']]
Which is better: an arranged marriage or a love marriage?
['Which is better: an arranged marriage or a love marriage?', 'What kind of marriages last longer? Love or arrange?', 'What are the differences between a love marriage and an arranged marriage?']
Out[178]:
[['Which is better: an arranged marriage or a love marriage?',
  'Which is better - love or an arranged marriage?',
  'Which is better - an arranged marriage or a love marriage?']]
In your opinion, who won the first Trump–Clinton U.S. Presidential debate?
['In your opinion, who won the first Trump\xe2\x80\x93Clinton U.S. Presidential debate?', 'Is Hillary Clinton right that Donald Trump has refused to pay workers?', 'Should first generation Americans be afraid of Trump?', 'Why are you voting for Donald Trump over Hillary Clinton?']
Out[178]:
[['Republicans: What are your opinions on the first Trump-Clinton Presidential debate?',
  'Who won the first presidential debate September 2016?',
  'Who won the first 2016 debate?',
  'Who won the first Clinton-Trump debate? And why?',
  'In your opinion, who won the first Trump\xe2\x80\x93Clinton U.S. Presidential debate?',
  'Was Donald Trump trumped on the first Presidential debate?',
  'Who do you think won the first presidential debate between Hillary Clinton and Donald Trump and why?',
  'Who won the First Presidential Debate of 2016?',
  'Who won the debate between Hillary Clinton and Donald Trump on 9/26/2016?',
  'Who won the first debate in your opinion Hillary Clinton or Donald Trump?',
  'Who won the September 26, 2016 presidential debate?',
  'Who won the debate Hillary or Trump?']]
How do people earn money from YouTube?
['How do people earn money from YouTube?', 'Is it worth to earn money from youtube?', 'Is there any need of money to upload videos on YouTube?']
Out[178]:
[['How can I make money fast from Youtube?',
  'How can I earn money using YouTube?',
  'How winning money from YouTube?',
  'How do people earn money through YouTube in India?',
  'How do I make money with YouTube?',
  'How can I earn money from YouTube?',
  'How do people earn money from YouTube?',
  'What are some ways to make money from YouTube?',
  'How can I make money from YouTube?',
  'Can I make money by uploading videos on YouTube (if I have subscribers)?',
  'How can I earn money in YouTube?',
  'How do I make money through YouTube?',
  'How can I make money on YouTube?',
  'How can i earn through youtube?']]
How can guys last longer during sex?
['How can guys last longer during sex?', 'How do I control my premature ejaculation?', 'How do I last long during banging your girl?']
Out[178]:
[['How can guys last longer during sex?',
  'How do men last longer in bed?',
  'How can I last for a longer time during sex?']]
How will demonetization affect India?
['How will demonetization affect India?', 'How long will it take for the economy to become normal again since demonetization?', 'Does the demonetization affect the normal people?']
Out[178]:
[['How will demonetization affect India?',
  'How is demonetization affecting people of India?',
  'What are the effects of demonetization in India?']]
What is the best photo editing software or app?
['What is the best photo editing software or app?', 'What is the best photo editing app for Android mobile?', 'Which is the best photo editing app for Android and iPhone?']
Out[178]:
[['Which is the best photo editing software?',
  'What is the best software for photo editing?',
  'What is the best photo editing software or app?',
  'What is the best photo editing application and software available online or offline?',
  'Which are best apps for photo edit?',
  'What is the best tool for photo editing?']]
What are some ways to lose weight fast?
['What are some ways to lose weight fast?', 'I have to reduce my weight by 15 kgs. What is the healthy rate at which I should aim to reduce per month? 21yr old-female-66kg-5.2ft?', 'What are the best ways to lose weight fast?', 'How can I lose weight fast and never gain it again?']
Out[178]:
[['How do I lose fats and excessive weight from body?',
  'How should I lose weight?',
  'Could you please give some weight loss advice for me and my husband?',
  'What are the best way of loose the weight?',
  'How do I lose 38 pounds in a year?',
  'What are the best simple ways to loose weight?',
  'How do I lose weight without stopping?',
  'How can I lose 25 kg?',
  'How can I efficiently lose weight?',
  'How do I lose weight from 70 to 50?',
  'How can I lose weight loss?',
  'How do I actually go about losing weight?',
  'What is the most effective everlasting method of losing weight?',
  'What is the best method of losing weight?',
  'What should you do if you want to lose a lot of weight?',
  'The best way for weight loss?',
  'How do lose weight with healthy way?',
  "What's the best, most effective tips for losing weight?",
  'What are the best was to lose weight?',
  'What is the best way to reduce body weight?',
  "I'm fat. How do I lose weight?",
  'How do I lose my weight from 58 to 50 kgs?',
  'How can I lose weight slowly and naturally?',
  "What's the best plan to lose weight?",
  'How do I lose weight ayurvedically?',
  'What can I do to loose 20-30kg?',
  'What are the ways of losing weight?',
  'How do I actually lose weight?',
  'How do I lose 15 kilos?',
  'What is the best way to lose weight and not gain it back?',
  'How can I make a plan to lose 12-15 pounds in 2-3 months?',
  'How can I lose an extreme amount of weight?',
  'What are the best ways to lose weight? What is the best diet plan?',
  'How should I loose weight?',
  'What can I do to lose 20 pounds?',
  'How do I suck it up and lose weight?',
  'What are the best things to do when working on losing weight?',
  'How do I lose 20-30 kg?',
  'I am ugly and fat, how to lose weight?',
  'How do I lose 45 pounds the easiest way if I have cravings?',
  'How do I get rid of excessive weight?',
  'How can I lose weight safely?',
  'Can you offer me any advice on how to lose weight?',
  'How can I lose weight effectively?',
  'How do I lose weight?',
  'How can I lose post marriage weight?',
  'How can I lose weight at age 55?',
  'What is the fastest possible way to lose weight?',
  'How do I lose weight without quitting?',
  'Which are the best ways to lose weight?',
  "I love food and have a big appetite. I'm also quite busy. What tips can you give me to lose weight?",
  'What are some good ways to lose weight?',
  "I'm overweight. How can I begin to lose weight?",
  'How can you lose weight fast in a healthy way?',
  'What is the best way to be in a calorie deficit and lose weight successfully?',
  'What should I do to reduce weight?',
  'How can I lose 10 Kilos?',
  'How should one change their diet to lose weight?',
  'How can I slowly lose weight?',
  'How can I really start losing weight?',
  'What would be a realistic plan to lose weight?',
  'What are some ways to lose weight fast?',
  'What is the best guide to lose unwanted pounds?',
  'How can I lose 4kg weight?',
  'Where do I find a simple to understand solution on how to lose weight?',
  'How do I lose 30 pounds?',
  'How do i lose weight?',
  'What are the best ways to lose weight, especially around your core?',
  'What are the best ways to lose weight?',
  "I'm 12 and at 60 kg and about 144 cm how do I lose weight?",
  'What is the easiest way to loose weight?']]
How do I know the AO code and AO type for a PAN card application?
['How do I know the AO code and AO type for a PAN card application?', 'What is a pan card?', 'What is PAN?']
Out[178]:
[['I have to apply for a new pan card, How should I find my AO number?',
  'How do I know the AO code and AO type for a PAN card application?',
  'How do I select my AO code for new Pan Card application?',
  'What AO code should NRIs or OCIs use when applying for a PAN card?',
  'What is AO code for a student who is applying for PAN card but does not have any source of income?']]
How do I get tickets for The Kapil Sharma Show ?
['How do I get tickets for The Kapil Sharma Show ?', 'How do I participate in The Kapil Sharma Show as audience member?', 'Does Kapil Sharma pay celebrities to come to his show?', "When is Kapil Sharma's next show?"]
Out[178]:
[['How can I watch The Kapil Sharma Show live in Mumbai?',
  'How do I get tickets for The Kapil Sharma Show ?',
  'How can I get an entry in The Kapil Sharma Show?']]
What do you people think of Mr. Arvind Kejriwal and his AAP?
['What do you people think of Mr. Arvind Kejriwal and his AAP?', 'What Anna Hazare think about PM Modi?', 'Why did Kumar Vishwas lie to people about what was written in the suicide letter of Gajendra in the AAP rally?']
Out[178]:
[['What do you people think of Mr. Arvind Kejriwal and his AAP?',
  'May 2016: What do Delhi people think about Kejriwal, are YOU really satisfied with Governance and his attitude to PM Modi?',
  'What Delhi people think about kejriwal?']]
How can you tell if you're a narcissist?
["How can you tell if you're a narcissist?", 'How can I tell if I\xe2\x80\x99m a narcissist?', 'How can I identify a narcissist?', 'How do you tell a narcissist they are narcissist?']
Out[178]:
[["How can you tell if you're a narcissist?",
  'How can you tell if you are a narcissist?',
  'How can I identify a narcissist?']]
What is the best phone to buy below 15k?
['What is the best phone to buy below 15k?', 'Why is cellphone called as cellphone?', 'Which is the best mobile phone to buy below 2k?']
Out[178]:
[['Can you suggest a best budget phone below 15k?',
  'Which is best mobile under 15000?',
  'Which is the best phone under \xe2\x82\xb915000?',
  'Which mobile phone should I buy under Rs.15000?',
  'Which phone would be the best for \xe2\x82\xb915,000?',
  'Which smartphone would be best under 15000? (2016)',
  'Which mobile is better under 15k?',
  'Which phone is best under 15k?',
  'What is the best phone I can get for below 15k?',
  'Which are best mobile phones to buy under 15000?',
  'What are the good options for mobile phones under 15000?',
  'Which is the best phone below 15000?',
  'Which phone is best to buy under 15k?',
  'Which phone should I buy under 15k?',
  'Which is the best phone under 15000 Rs.?',
  'What is the best phone to buy below 15k?',
  'Which phone should I buy under INR 15K?',
  'What is the best phone I can buy under the price of 15000?',
  'What are some good smartphones under 15k?',
  'What phone should I buy under Rs 15000?',
  'Which is the best mobile below 15000?']]

In [126]:
train_or =  pd.read_csv(PATH+'train.csv', header=0)
test_or =  pd.read_csv(PATH+'test.csv', header=0)

In [88]:
train =  pd.read_csv(PATH+'train.csv', header=0)
test =  pd.read_csv(PATH+'test.csv', header=0)

def stem_str(x,stemmer=SnowballStemmer('english')):
        x = text.re.sub("[^a-zA-Z0-9]"," ", x)
        x = (" ").join([stemmer.stem(z) for z in x.split(" ")])
        x = " ".join(x.split())
        return x
porter = PorterStemmer()
snowball = SnowballStemmer('english')

train['question1'] = train['question1'].astype(str).apply(lambda x:stem_str(x.lower(),snowball))
test['question1'] = test['question1'].astype(str).apply(lambda x:stem_str(x.lower(),snowball))
train['question2'] = train['question2'].astype(str).apply(lambda x:stem_str(x.lower(),snowball))
test['question2'] = test['question2'].astype(str).apply(lambda x:stem_str(x.lower(),snowball))

In [89]:
# train= pd.read_csv(PATH+'train.csv')
pos = train[train.is_duplicate==1]

import networkx as nx

g = nx.Graph()
g.add_nodes_from(pos.question1)
g.add_nodes_from(pos.question2)
edges = list(pos[['question1','question2']].to_records(index=False))
g.add_edges_from(edges)
len(set(pos.question1) | set(pos.question2)), g.number_of_nodes()


Out[89]:
(147698, 147698)

In [131]:
import warnings
from random import randint
cc = filter(lambda x : (len(x) >9), 
            nx.connected_component_subgraphs(g))
print len(cc)
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    for i in range(len(cc)):
        i = randint(0,len(cc))
        if len(cc[i])>9:
            cc[i].nodes()
            text = cc[i].nodes()
            nx.draw(cc[i], with_labels=True, alpha=0.5, font_size=12)
            plt.show()
            break


506
Out[131]:
[u'how can i earn from onlin',
 u'what are the easiest way to make good money use the internet',
 u'how can i make money onlin easili',
 u'what is an easi way make money onlin',
 u'how should i earn money onlin work from home',
 u'how can i earn money onlin easili',
 u'what is a way to make money onlin',
 u'what are the various way through which one can earn money onlin',
 u'how do i earn money from the internet',
 u'how do you make money onlin',
 u'what should i do to earn money onlin',
 u'how can i earn money onlin from home onli',
 u'what are way of earn money onlin',
 u'what are way to make money onlin at home',
 u'how can i earn money easili onlin',
 u'how do i realli make money onlin',
 u'what are the best way to earn money from home',
 u'how do you make easi money onlin',
 u'what is the easi way to make money onlin',
 u'is there ani easi way to make money onlin',
 u'am not start big how can i make 1000 per month onlin',
 u'i m 18 how can i make money onlin',
 u'what are some of the best way of earn money by work at home',
 u'how doe one earn money onlin without an invest from home',
 u'what is the easiest way to earn money from onlin',
 u'can i make money onlin',
 u'how do i earn more money through internet onlin',
 u'how can we earn money onlin without invest',
 u'can i earn money onlin',
 u'how can i start to make money onlin',
 u'what are some easi way to make done extra money onlin',
 u'how can we earn money onlin in india',
 u'how can i start make money use internet',
 u'what should i do to make money onlin in india',
 u'what is make money onlin',
 u'how do we make money onlin',
 u'how can i earn money on internet',
 u'how can i make money onlin quick and easili',
 u'how could i make money onlin',
 u'how can i earn money onlin',
 u'what are the easiest way to earn money onlin',
 u'what are the easi way to earn money onlin',
 u'what is the easiest way to make a littl money onlin',
 u'how can i make money onlin consist',
 u'how can i earn money part time onlin',
 u'how can one make money onlin',
 u'how can i realist make money onlin',
 u'what s the easiest way to make money onlin',
 u'how do i earn money onlin',
 u'how do you earn money from internet',
 u'what are the best way to make money onlin',
 u'how can i make money onlin for job',
 u'how can i earn money onlin serious',
 u'what is the easiest way to earn money use internet',
 u'what are way i can make money onlin',
 u'how do i make money from home',
 u'what is best way to make money onlin',
 u'what is the best way for make money onlin']

In [132]:
from collections import Counter

text1 = " ".join(text) 
c = Counter(text1.split())
c.most_common()[:10]
topic = [i[0] for i in c.most_common()[:3]]


Out[132]:
[(u'money', 56),
 (u'onlin', 49),
 (u'how', 32),
 (u'make', 32),
 (u'i', 30),
 (u'earn', 26),
 (u'can', 24),
 (u'what', 23),
 (u'way', 21),
 (u'to', 18)]

In [134]:
from gensim.summarization import keywords
from gensim.summarization import summarize

text1 = " ".join(text) 
print text1
# text1 = "have been captured by a race of. heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix. As a rebel against the machines, Neo must return to the Matrix and confront the agents: super-powerful "
# print text1
# summarize(text1, split=True)
topic = keywords(text1, split=True,words=3)
print topic


how can i earn from onlin what are the easiest way to make good money use the internet how can i make money onlin easili what is an easi way make money onlin how should i earn money onlin work from home how can i earn money onlin easili what is a way to make money onlin what are the various way through which one can earn money onlin how do i earn money from the internet how do you make money onlin what should i do to earn money onlin how can i earn money onlin from home onli what are way of earn money onlin what are way to make money onlin at home how can i earn money easili onlin how do i realli make money onlin what are the best way to earn money from home how do you make easi money onlin what is the easi way to make money onlin is there ani easi way to make money onlin am not start big how can i make 1000 per month onlin i m 18 how can i make money onlin what are some of the best way of earn money by work at home how doe one earn money onlin without an invest from home what is the easiest way to earn money from onlin can i make money onlin how do i earn more money through internet onlin how can we earn money onlin without invest can i earn money onlin how can i start to make money onlin what are some easi way to make done extra money onlin how can we earn money onlin in india how can i start make money use internet what should i do to make money onlin in india what is make money onlin how do we make money onlin how can i earn money on internet how can i make money onlin quick and easili how could i make money onlin how can i earn money onlin what are the easiest way to earn money onlin what are the easi way to earn money onlin what is the easiest way to make a littl money onlin how can i make money onlin consist how can i earn money part time onlin how can one make money onlin how can i realist make money onlin what s the easiest way to make money onlin how do i earn money onlin how do you earn money from internet what are the best way to make money onlin how can i make money onlin for job how can i earn money onlin serious what is the easiest way to earn money use internet what are way i can make money onlin how do i make money from home what is best way to make money onlin what is the best way for make money onlin
[u'onlin', u'money', u'way']

In [133]:
def multi_words(words,list_):
    if all(word in str(words) for word in list_):
        return 1
    else: return 0
topic = [_i.encode('utf-8') for _i in topic]
print topic
# topic=['Ancient','test']
for qst in ['question1','question2']:
    q1 = train[train[qst].apply(lambda x: multi_words(x,topic))==1]
    q1.shape
    q1['is_duplicate'].mean()

test[test['question1'].apply(lambda x: multi_words(x,topic))==1].shape
test[test['question2'].apply(lambda x: multi_words(x,topic))==1].shape
train_or[train['question1'].apply(lambda x: multi_words(x,topic))==1].head()
test_or[test['question1'].apply(lambda x: multi_words(x,topic))==1].head()


['money', 'onlin', 'how']
Out[133]:
(693, 6)
Out[133]:
0.82106782106782106
Out[133]:
(651, 6)
Out[133]:
0.86021505376344087
Out[133]:
(3431, 3)
Out[133]:
(3459, 3)
Out[133]:
id qid1 qid2 question1 question2 is_duplicate
284 284 568 569 How can I make money online with free of cost? How do I to make money online? 1
1284 1284 2560 2561 How can I make money online in India? What's the easiest way to make money online? 0
2462 2462 4893 4894 How can i make money online easily? How do I quickly and easily make money online ... 0
3076 3076 6099 6100 I'm 18. How can I make money online? What is the easiest way to earn money from onl... 1
3112 3112 6171 6100 How can we earn money online without investment? What is the easiest way to earn money from onl... 1
Out[133]:
test_id question1 question2
1339 1339 How can a 16 year boy in india earn donald onl... I'm a 15 number year old teen. What ways are t...
1760 1760 How do learn you make money online? What is the easy way people earn money online?
1875 1875 How can I earn money him online? How do you" you earn money through internet?
3739 3739 How played do I make money online? What should I do to british money online?
4200 4200 How can I earn money online without investment... I am working in one of the software product-ba...

In [ ]:


In [261]:
def multi_words(words,list_):
    if all(word in str(words[0]) for word in list_):
        return 1
    elif all(word in str(words[1]) for word in list_):
        return 1
    else: return 0

topic=['Microsoft','acquisition']
train[(train[['question1','question2']].apply(lambda x: multi_words(x,topic), 
                                              axis=1)==1)]['is_duplicate'].mean()
test[(test[['question1','question2']].apply(lambda x: multi_words(x,topic), 
                                              axis=1)==1)].shape


Out[261]:
0.7857142857142857
Out[261]:
(159, 3)

In [231]:
topic= "Microsoft"
train[train['question1'].str.contains(topic, na=False)]['is_duplicate'].mean()
train[train['question2'].str.contains(topic, na=False)]['is_duplicate'].mean()
train[(train['question1'].str.contains(topic, na=False))  |
      (train['question2'].str.contains(topic, na=False))]['is_duplicate'].mean()
train[(train['question1'].str.contains(topic, na=False))  &
      (train['question2'].str.contains(topic, na=False))]['is_duplicate'].mean()
train[(train['question1'].str.contains(topic, na=False))  |
      (train['question2'].str.contains(topic, na=False))].shape#['is_duplicate'].mean()
test[(test['question1'].str.contains(topic, na=False)) | 
     (test['question2'].str.contains(topic, na=False))].shape
test[(test['question1'].str.contains(topic, na=False)) & 
     (test['question2'].str.contains(topic, na=False))].shape
train[(train['question1'].str.contains(topic, na=False))  &
      (train['question2'].str.contains(topic, na=False))]


Out[231]:
0.23516949152542374
Out[231]:
0.22016460905349794
Out[231]:
0.18077474892395984
Out[231]:
0.35249042145593867
Out[231]:
(697, 6)
Out[231]:
(5598, 3)
Out[231]:
(1684, 3)
Out[231]:
id qid1 qid2 question1 question2 is_duplicate
980 980 1955 1956 What can I do to get a job at Microsoft? How do I get job in Google or Microsoft? 1
1194 1194 2380 2381 Is it worth it to work as an application suppo... What is the salary for a Software Engineering ... 0
2739 2739 5439 5440 What is better, Microsoft or Apple? Which is better: Microsoft or Apple? 1
3255 3255 6452 6453 How many stocks does Microsoft India give for ... Is Director and Principal same level in Micros... 0
4069 4069 8054 8055 Is LinkedIn a good acquisition for Microsoft? How does the LinkedIn acquisition help Microso... 1
6184 6184 12123 12124 Do you have to know what's taught in a compute... Should I do an MTech from IIT in order to impr... 0
10538 10538 20403 20404 Which company will fall first: Google, Apple, ... Which is the better company to work for as a p... 0
10948 10948 21178 21179 What are the best programs for drawing on a Mi... Is Microsoft surface pro 4 worth buying? 0
11181 11181 21613 21614 What are Microsoft's best and worst acquisitions? What has been Microsoft's worst acquisition? 1
14538 14538 27833 27834 What is the difference between SQL and Microso... Difference between Microsoft SQL Server vs Ora... 0
14907 14907 28519 28520 How can we get Microsoft Customer service? How do we get Microsoft Customer Service? 1
14923 14923 28548 12124 What do IT companies like Accenture, Cognizant... Should I do an MTech from IIT in order to impr... 0
16334 16334 31144 31145 Can Microsoft surface pro be used for transcri... Where can I buy a Microsoft Surface Pro 3 in S... 0
18365 18365 34802 34803 How do I change direction from ltr to rtl (not... How can I put Microsoft Office on a Mac? 0
19996 19996 37760 37761 How can I remove IRM protection from Microsoft... How can I remove IRM protection from Microsoft... 0
22197 22197 41696 41697 What's the best way to learn Microsoft Office ... How can I learn Microsoft office? 1
23105 23105 43318 43319 What are the consequences for the stakeholders... How is the future of the Windows Phone after t... 0
24165 24165 45185 45186 Is it true that, once you are in a big company... What is the way to get a job at big companies ... 0
24291 24291 20403 45405 Which company will fall first: Google, Apple, ... How will our lives be affected if Google, Micr... 0
25563 25563 47636 47637 How do you circle a number in Microsoft Word? How can you circle a word in Microsoft word? 1
26933 26933 50066 50067 How can you fix Microsoft Word if it won't open? Why did my footnote go to the next page in Mic... 0
27132 27132 50426 50427 Do software companies Amazon, Google, Microsof... How many problems one should be able to solve ... 0
28411 28411 52687 52688 How was Microsoft word coded? How can Microsoft Word be improved? 0
28786 28786 53338 53339 What is your review of Microsoft Surface Pro 3? What is your review of Microsoft Surface Pro 4? 0
33078 33078 60819 60820 What are the differences between Microsoft Exc... In hiring an executive assistant, can I assume... 0
33887 33887 20403 62186 Which company will fall first: Google, Apple, ... Does Google have plans to compete with Microso... 0
34115 34115 45186 62567 What is the way to get a job at big companies ... Can I get a job or internship at Google, Micro... 0
35192 35192 64346 1955 How can I work in Microsoft? What can I do to get a job at Microsoft? 1
35256 35256 64457 64458 Does Microsoft own Google? Why doesn't Microsoft own Google? 0
36350 36350 31650 66291 I am being offered a job as a Solutions Archit... I have a job offer from Microsoft that I have ... 0
... ... ... ... ... ... ...
355849 355849 211036 485094 How can I enable fullscreen mode in Microsoft ... Why is Microsoft not fixing the fullscreen mod... 0
359524 359524 489198 97500 What are some amazing facts about giants like ... What are some amazing facts about Google/Micro... 0
361783 361783 491649 491650 How do I get a dynamic table in Microsoft Excel? What are pivot tables in Microsoft Excel? 0
362553 362553 60819 492468 What are the differences between Microsoft Exc... Is there any website or blog to teach Microsof... 0
363026 363026 492968 492969 How do I update my Microsoft account payment i... How do I updates my Microsoft account? 0
363504 363504 493491 493492 What are the levels of data scientists at Micr... How common are 40 hours/week data scientist jo... 0
364033 364033 152238 339541 Will Microsoft ever make Windows open source? Will Microsoft open source Windows? 1
364078 364078 172865 494117 How can I learn Microsoft Excel by myself? What is the best way to learn Microsoft Excel? 0
367965 367965 498306 498307 Why is Microsoft incapable of creating a prope... Why can't Microsoft make a decent browser? 1
368113 368113 498458 498459 What's the most recent version of Microsoft Of... Can I get Microsoft Office to work on Windows 7? 0
369216 369216 499628 499629 Is there a replacement for Microsoft Office Ac... Do Microsoft employees use Google at the office? 0
369811 369811 500287 500288 How does Microsoft benefit from LinkedIn acqui... Why did Microsoft buy LinkedIn for $26.2 billi... 1
370308 370308 223958 64346 How can I land a job at Microsoft? How can I work in Microsoft? 1
371588 371588 502223 21614 What has been Microsoft's most successful acqu... What has been Microsoft's worst acquisition? 1
372638 372638 79022 1955 How can I get a job in Microsoft? What can I do to get a job at Microsoft? 1
375792 375792 506858 506859 Is it true that Google chrome uses most amount... Is it true that your laptop's battery will run... 1
377024 377024 508249 508250 What is "program management" at Microsoft? What does a program manager do at Microsoft? 1
377481 377481 508781 508782 How can an electronics and communications engi... Do big companies like google, Facebook, Micros... 0
379270 379270 510776 510777 Is there a free download for Microsoft Windows 7? How can I download Microsoft Windows 7 for free? 1
386239 386239 518426 518427 Can I install Android apps in Microsoft lumia ... In terms of audio quality, display & camera, s... 0
386887 386887 519158 519159 How do I remove virus from Microsoft Edge brow... How do I remove virus named "evotracker" from ... 1
388278 388278 520655 520656 What are the packages and benefits offered by ... What is the general CGPA cut-off for appearing... 0
389068 389068 83729 521521 How do you put a squared symbol in Microsoft W... How do you type the symbol x-bar in Microsoft ... 0
390763 390763 523305 523306 What kind of projects get big tech companies' ... Are open source contributions less impressive ... 0
391030 391030 523594 523595 How many users does Microsoft Excel have? What do you need to know to call yourself a po... 0
392934 392934 525677 525678 Which search engine do Microsoft employees use... Why is Microsoft investing more time on Bing w... 0
397727 397727 83729 460405 How do you put a squared symbol in Microsoft W... How do I get word count in Microsoft Word 2003? 0
401630 401630 535044 535045 What is it like to work at Microsoft Research? Does Microsoft Research take interns? What is ... 1
402705 402705 536217 536218 If I install Microsoft PowerPoint 2012 to my P... If I install Microsoft PowerPoint 2012 to my P... 1
403853 403853 537469 537470 What is Microsoft Surface Studio? What do you think about the Microsoft Surface ... 1

261 rows × 6 columns


In [ ]:


In [161]:
dict1 = {}
for i in range(len(cc)):
    dict1[i] = cc[i].nodes()

from multiprocessing import Pool
num_partitions = 24 #number of partitions to split dataframe
num_cores = 24 #number of cores on your machine

def parallelize_dataframe(df, func):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

def network_size(search_value1):
    result = [len(value) for key,value in dict1.iteritems() if search_value1 in value]
    if len(result)>0:
        return result[0]
    return int(2)

def multiply_columns(df):
    df['netsize1'] = df['question1'].apply(lambda x: network_size(x))    
    return df
    
iris = parallelize_dataframe(df1.sample(n=100000), multiply_columns)

In [174]:
iris[['is_duplicate','netsize1']].corr()


Out[174]:
is_duplicate netsize1
is_duplicate 1.000000 0.083557
netsize1 0.083557 1.000000

In [59]:
train_comb['netsize'] = iris['netsize1']

In [69]:
train_comb['net2freq'] = train_comb['netsize'] / (train_comb['q1_freq']+train_comb['q2_freq'])
train_comb.corr()


Out[69]:
id q1_hash q2_hash q1_freq q2_freq is_duplicate freq_diff q1_hash_freq q2_hash_freq q_hash_pos q_hash_pos_1 q2_change q1_change q1_q2_change_mean q1_q2_change_min q1_q2_change_max q_change_pair netsize net2freq
id 1.000000 0.690308 0.041179 -0.002600 -0.000871 -0.008784 0.001727 -0.002885 -0.001022 0.115121 0.071093 -0.020399 -0.356764 -0.040099 -0.027226 -0.281257 0.169849 0.000735 0.001416
q1_hash 0.690308 1.000000 0.282445 -0.359849 -0.243217 -0.207682 0.035621 -0.386223 -0.207151 0.060618 -0.046562 0.239944 0.427433 0.259142 0.247919 0.364492 -0.355028 -0.126377 -0.043419
q2_hash 0.041179 0.282445 1.000000 -0.397311 -0.471671 -0.346925 0.069128 -0.406026 -0.365688 -0.583149 -0.472782 0.998102 0.315344 0.995484 0.997163 0.495795 -0.545675 -0.205244 -0.078206
q1_freq -0.002600 -0.359849 -0.397311 1.000000 0.599397 0.343747 -0.166016 0.898113 0.528905 0.128596 0.210646 -0.397409 -0.464393 -0.415473 -0.400677 -0.519822 0.526781 0.217691 0.028268
q2_freq -0.000871 -0.243217 -0.471671 0.599397 1.000000 0.265540 -0.099017 0.583196 0.948601 0.292966 0.292321 -0.471925 -0.315102 -0.480048 -0.471667 -0.424641 0.435390 0.250374 0.021752
is_duplicate -0.008784 -0.207682 -0.346925 0.343747 0.265540 1.000000 -0.332427 0.334511 0.228869 0.123509 0.207493 -0.346618 -0.259241 -0.354151 -0.345165 -0.369878 0.422098 0.085704 0.004848
freq_diff 0.001727 0.035621 0.069128 -0.166016 -0.099017 -0.332427 1.000000 -0.158787 -0.078082 0.072430 -0.164259 0.069072 0.044340 0.070160 0.058338 0.276045 -0.323348 0.024984 0.085937
q1_hash_freq -0.002885 -0.386223 -0.406026 0.898113 0.583196 0.334511 -0.158787 1.000000 0.476849 0.129154 0.210356 -0.406110 -0.498197 -0.425902 -0.410139 -0.544893 0.545906 0.199510 0.028500
q2_hash_freq -0.001022 -0.207151 -0.365688 0.528905 0.948601 0.228869 -0.078082 0.476849 1.000000 0.206482 0.205859 -0.365866 -0.268117 -0.373507 -0.365617 -0.358035 0.373949 0.236189 0.017627
q_hash_pos 0.115121 0.060618 -0.583149 0.128596 0.292966 0.123509 0.072430 0.129154 0.206482 1.000000 0.805124 -0.590683 -0.068443 -0.582470 -0.590505 -0.147689 0.213000 0.124839 0.053049
q_hash_pos_1 0.071093 -0.046562 -0.472782 0.210646 0.292321 0.207493 -0.164259 0.210356 0.205859 0.805124 1.000000 -0.477532 -0.152411 -0.476366 -0.477399 -0.232615 0.323326 0.125530 0.040480
q2_change -0.020399 0.239944 0.998102 -0.397409 -0.471925 -0.346618 0.069072 -0.406110 -0.365866 -0.590683 -0.477532 1.000000 0.337464 0.998590 0.999479 0.513399 -0.556578 -0.205419 -0.078339
q1_change -0.356764 0.427433 0.315344 -0.464393 -0.315102 -0.259241 0.044340 -0.498197 -0.268117 -0.068443 -0.152411 0.337464 1.000000 0.386955 0.356322 0.824397 -0.677780 -0.165039 -0.058009
q1_q2_change_mean -0.040099 0.259142 0.995484 -0.415473 -0.480048 -0.354151 0.070160 -0.425902 -0.373507 -0.582470 -0.476366 0.998590 0.386955 1.000000 0.999143 0.549391 -0.583421 -0.210527 -0.080009
q1_q2_change_min -0.027226 0.247919 0.997163 -0.400677 -0.471667 -0.345165 0.058338 -0.410139 -0.365617 -0.590505 -0.477399 0.999479 0.356322 0.999143 1.000000 0.514343 -0.558467 -0.207436 -0.080218
q1_q2_change_max -0.281257 0.364492 0.495795 -0.519822 -0.424641 -0.369878 0.276045 -0.544893 -0.358035 -0.147689 -0.232615 0.513399 0.824397 0.549391 0.514343 1.000000 -0.814297 -0.174425 -0.038350
q_change_pair 0.169849 -0.355028 -0.545675 0.526781 0.435390 0.422098 -0.323348 0.545906 0.373949 0.213000 0.323326 -0.556578 -0.677780 -0.583421 -0.558467 -0.814297 1.000000 0.193357 0.053250
netsize 0.000735 -0.126377 -0.205244 0.217691 0.250374 0.085704 0.024984 0.199510 0.236189 0.124839 0.125530 -0.205419 -0.165039 -0.210527 -0.207436 -0.174425 0.193357 1.000000 0.751904
net2freq 0.001416 -0.043419 -0.078206 0.028268 0.021752 0.004848 0.085937 0.028500 0.017627 0.053049 0.040480 -0.078339 -0.058009 -0.080009 -0.080218 -0.038350 0.053250 0.751904 1.000000

In [209]:
for i in range(100):
    a=comb.sample()
    if a['q1_freq'].values>3:
        a = int(a['q2_hash'].values)
        if train_comb[(train_comb['q1_hash']==a)|(train_comb['q2_hash']==a)].shape[0]>1:
#         test_comb[(test_comb['q1_hash']==a)|(test_comb['q2_hash']==a)].shape
            train_comb[(train_comb['q1_hash']==a)|(train_comb['q2_hash']==a)]
            break


Out[209]:
id q1_hash q2_hash q1_freq q2_freq is_duplicate freq_diff q1_hash_freq q2_hash_freq q_hash_pos q_hash_pos_1 q2_change q1_change q1_q2_change_mean q1_q2_change_min q1_q2_change_max q_change_pair netsize net2freq
28327 28327 26907 2520383 14 2 0 0.432143 4 2 0 0 1 1 1.0 1 1 0 10201 637.562500
95021 95021 82881 26907 3 14 1 0.264286 3 10 1 1 -2537436 1 -1268717.5 -2537436 1 0 10201 600.058824
124452 124452 105625 26907 1 14 0 0.935714 1 10 1 0 -2556000 1 -1277999.5 -2556000 1 0 10201 680.066667
136699 136699 20653 26907 6 14 1 0.096429 4 10 0 0 -2563610 -94170 -1328890.0 -2563610 -94170 1 10201 510.050000
158792 158792 131039 26907 2 14 0 0.432143 2 10 1 1 -2577158 1 -1288578.5 -2577158 1 0 10201 637.562500
170187 170187 10259 26907 5 14 1 0.130000 4 10 0 0 -2584149 -128978 -1356563.5 -2584149 -128978 1 10201 536.894737
201534 201534 26907 2625604 14 3 1 0.264286 4 3 0 0 -4262 -134304 -69283.0 -134304 -4262 1 10201 600.058824
236164 236164 3895 26907 4 14 0 0.180357 4 10 0 0 -2623603 -180988 -1402295.5 -2623603 -180988 1 10201 566.722222
240858 240858 188003 26907 3 14 1 0.264286 1 10 1 1 -2626348 1 -1313173.5 -2626348 1 0 10201 600.058824
291500 291500 220905 26907 1 14 0 0.935714 1 10 1 0 -2655646 1 -1327822.5 -2655646 1 0 10201 680.066667
324156 324156 45596 26907 5 14 1 0.130000 4 10 1 1 -2674083 -195735 -1434909.0 -2674083 -195735 1 10201 536.894737
338058 338058 156205 26907 6 14 1 0.096429 2 10 1 1 -2681823 -93608 -1387715.5 -2681823 -93608 1 10201 510.050000
348057 348057 26907 2714267 14 1 0 0.935714 4 1 0 0 1 -228960 -114479.5 -228960 1 0 10201 680.066667
395057 395057 26907 74557 14 20 0 0.021786 4 11 0 0 -2665910 -257307 -1461608.5 -2665910 -257307 1 10201 300.029412

In [139]:
comb[(comb['q1_hash']==2513495)|(comb['q2_hash']==2513495)]


Out[139]:
id question1 question2 is_duplicate q1_hash q2_hash q1_freq q2_freq q1_hash_freq q2_hash_freq freq_diff
18353 18353 how is the word calumni use in a sentenc how is the word wist use in a sentenc 0 17715 2513495 7 15 4 15 0.077143
35104 35104 how is the word subcontin use in a sentenc how is the word wist use in a sentenc 0 32934 2513495 4 15 4 15 0.185000
74153 74153 how is the word potent use in a sentenc how is the word wist use in a sentenc 0 35287 2513495 3 15 3 15 0.268889
82459 82459 how is the word impedi use in a sentenc how is the word wist use in a sentenc 0 72867 2513495 5 15 4 15 0.134667
82752 82752 how is the word aver use in a sentenc how is the word wist use in a sentenc 0 73102 2513495 3 15 3 15 0.268889
88220 88220 how is the word anticip use in a sentenc how is the word wist use in a sentenc 0 21015 2513495 5 15 5 15 0.134667
90920 90920 how is the word zealot use in a sentenc how is the word wist use in a sentenc 0 33978 2513495 6 15 6 15 0.101111
173957 173957 how is the word viscous use in a sentenc how is the word wist use in a sentenc 0 141954 2513495 2 15 2 15 0.436667
179175 179175 how is the word prejud use in a sentenc how is the word wist use in a sentenc 0 145655 2513495 1 15 1 15 0.940000
224713 224713 how is the word merci use in a sentenc how is the word wist use in a sentenc 0 98914 2513495 3 15 3 15 0.268889
261096 261096 how is the word motif use in a sentenc how is the word wist use in a sentenc 0 50848 2513495 5 15 5 15 0.134667
343621 343621 how is the word patho use in a sentenc how is the word wist use in a sentenc 0 122259 2513495 4 15 4 15 0.185000
372920 372920 how is the word omin use in a sentenc how is the word wist use in a sentenc 0 133678 2513495 4 15 3 15 0.185000
378272 378272 how is the word rescind use in a sentenc how is the word wist use in a sentenc 0 78968 2513495 8 15 8 15 0.059167
386989 386989 how is the word coloni use in a sentenc how is the word wist use in a sentenc 0 63281 2513495 4 15 3 15 0.185000

In [140]:
train_comb[train_comb['q2_hash']==2513495]


Out[140]:
id q1_hash q2_hash q1_freq q2_freq is_duplicate freq_diff q1_hash_freq q2_hash_freq q_hash_pos q_hash_pos_1 q2_change q1_change q1_q2_change_mean q1_q2_change_min q1_q2_change_max q_change_pair netsize net2freq
18353 18353 17715 2513495 7 15 0 0.077143 4 15 0 0 1 1 1.0 1 1 0 255 11.590909
35104 35104 32934 2513495 4 15 0 0.185000 4 15 0 0 -11447 1 -5723.0 -11447 1 0 255 13.421053
74153 74153 35287 2513495 3 15 0 0.268889 3 15 0 0 -37360 -30800 -34080.0 -37360 -30800 1 255 14.166667
82459 82459 72867 2513495 5 15 0 0.134667 4 15 0 0 -42755 1 -21377.0 -42755 1 0 255 12.750000
82752 82752 73102 2513495 3 15 0 0.268889 3 15 0 0 -42937 1 -21468.0 -42937 1 0 255 14.166667
88220 88220 21015 2513495 5 15 0 0.134667 5 15 0 0 -46498 -56440 -51469.0 -56440 -46498 1 255 12.750000
90920 90920 33978 2513495 6 15 0 0.101111 6 15 0 0 -48249 -45646 -46947.5 -48249 -45646 1 255 12.142857
173957 173957 141954 2513495 2 15 0 0.436667 2 15 0 0 -99896 1 -49947.5 -99896 1 0 255 15.000000
179175 179175 145655 2513495 1 15 0 0.940000 1 15 0 0 -103027 1 -51513.0 -103027 1 0 255 15.937500
224713 224713 98914 2513495 3 15 0 0.268889 3 15 0 0 -130213 -78235 -104224.0 -130213 -78235 1 255 14.166667
261096 261096 50848 2513495 5 15 0 0.134667 5 15 0 0 -151540 -150523 -151031.5 -151540 -150523 1 255 12.750000
343621 343621 122259 2513495 4 15 0 0.185000 4 15 0 0 -198353 -130963 -164658.0 -198353 -130963 1 255 13.421053
372920 372920 133678 2513495 4 15 0 0.185000 3 15 0 0 -214495 -137158 -175826.5 -214495 -137158 1 255 13.421053
378272 378272 78968 2513495 8 15 0 0.059167 8 15 0 0 -217532 -195153 -206342.5 -217532 -195153 1 255 11.086957
386989 386989 63281 2513495 4 15 0 0.185000 3 15 0 0 -222436 -216161 -219298.5 -222436 -216161 1 255 13.421053

In [213]:
_ngram_str_map = {
    1: "Unigram",
    2: "Bigram",
    3: "Trigram",
    4: "Fourgram",
    5: "Fivegram",
    12: "UBgram",
    123: "UBTgram",
}
_ngram_str_map[123]


Out[213]:
'UBTgram'

In [ ]:


In [ ]:
def _unigrams(words):
    """
        Input: a list of words, e.g., ["I", "am", "Denny"]
        Output: a list of unigram
    """
    assert type(words) == list
    return words


def _bigrams(words, join_string, skip=0):
    """
       Input: a list of words, e.g., ["I", "am", "Denny"]
       Output: a list of bigram, e.g., ["I_am", "am_Denny"]
       I use _ as join_string for this example.
    """
    assert type(words) == list
    L = len(words)
    if L > 1:
        lst = []
        for i in range(L-1):
            for k in range(1,skip+2):
                if i+k < L:
                    lst.append( join_string.join([words[i], words[i+k]]) )
    else:
        # set it as unigram
        lst = _unigrams(words)
    return lst


def _trigrams(words, join_string, skip=0):
    """
       Input: a list of words, e.g., ["I", "am", "Denny"]
       Output: a list of trigram, e.g., ["I_am_Denny"]
       I use _ as join_string for this example.
    """
    assert type(words) == list
    L = len(words)
    if L > 2:
        lst = []
        for i in range(L-2):
            for k1 in range(1,skip+2):
                for k2 in range(1,skip+2):
                    if i+k1 < L and i+k1+k2 < L:
                        lst.append( join_string.join([words[i], words[i+k1], words[i+k1+k2]]) )
    else:
        # set it as bigram
        lst = _bigrams(words, join_string, skip)
    return lst

def _ngrams(words, ngram, join_string=" "):
    """wrapper for ngram"""
    if ngram == 1:
        return _unigrams(words)
    elif ngram == 2:
        return _bigrams(words, join_string)
    elif ngram == 3:
        return _trigrams(words, join_string)
    elif ngram == 12:
        unigram = _unigrams(words)
        bigram = [x for x in _bigrams(words, join_string) if len(x.split(join_string)) == 2]
        return unigram + bigram
    elif ngram == 123:
        unigram = _unigrams(words)
        bigram = [x for x in _bigrams(words, join_string) if len(x.split(join_string)) == 2]
        trigram = [x for x in _trigrams(words, join_string) if len(x.split(join_string)) == 3]
        return unigram + bigram + trigram
    
def _tokenize(text, token_pattern=" "):
    # token_pattern = r"(?u)\b\w\w+\b"
    # token_pattern = r"\w{1,}"
    # token_pattern = r"\w+"
    # token_pattern = r"[\w']+"
    if token_pattern == " ":
        # just split the text into tokens
        return text.split(" ")
    else:
        token_pattern = re.compile(token_pattern, flags = re.UNICODE | re.LOCALE)
        group = token_pattern.findall(text)
        return group
    
def _try_divide(x, y, val=0.0):
    """try to divide two numbers"""
    if y != 0.0:
        val = float(x) / y
    return val
    
    
def UniqueRatio_Ngram:
    obs_tokens = _tokenize(obs, token_pattern)
    obs_ngrams = _ngrams(obs_tokens, 12)
    return _try_divide(len(set(obs_ngrams)), len(obs_ngrams))

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: