In [2]:
from __future__ import division
import time, os, gc
import numpy as np
import pandas as pd
import scipy
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectPercentile, f_classif
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,StandardScaler
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.feature_extraction import text
import config

PATH = config.RAW_PATH

In [2]:
train_orig =  pd.read_csv(PATH+'train.csv', header=0)#.sample(n=10000)
test_orig =  pd.read_csv(PATH+'test.csv', header=0)#.sample(n=10000)

# NOTE: an optional Snowball/Porter stemming pass over both question columns
# was tried here and left disabled in the original notebook.

# Stack every question (train+test, question1+question2) into a single column
# so each distinct question text gets one stable integer id.
df1 = train_orig[['question1']].copy()
df2 = train_orig[['question2']].copy()
df1_test = test_orig[['question1']].copy()
df2_test = test_orig[['question2']].copy()

df2.rename(columns = {'question2':'question1'},inplace=True)
df2_test.rename(columns = {'question2':'question1'},inplace=True)
# DataFrame.append was removed in pandas 2.0; pd.concat in the same order
# reproduces the original row ordering exactly.
train_questions = pd.concat([df1, df2, df1_test, df2_test])
train_questions.drop_duplicates(subset = ['question1'],inplace=True)

train_questions.reset_index(inplace=True,drop=True)
# question text -> integer id, in first-occurrence order
questions_dict = pd.Series(train_questions.index.values,
                           index=train_questions.question1.values).to_dict()
train_cp = train_orig.copy()
test_cp = test_orig.copy()
train_cp.drop(['qid1','qid2'],axis=1,inplace=True)

test_cp['is_duplicate'] = -1  # sentinel label so train/test can be split back apart later
test_cp.rename(columns={'test_id':'id'},inplace=True)
comb = pd.concat([train_cp,test_cp])

# Map each question to its integer id ("hash") and count how often each id occurs.
comb['q1_hash'] = comb['question1'].map(questions_dict)
comb['q2_hash'] = comb['question2'].map(questions_dict)

q1_vc = comb.q1_hash.value_counts().to_dict()
q2_vc = comb.q2_hash.value_counts().to_dict()

def try_apply_dict(x, dict_to_apply):
    """Look up ``x`` in ``dict_to_apply``; a missing key counts as 0."""
    return dict_to_apply.get(x, 0)
# Map each question id to its total occurrence count across train+test
# (appearances as question1 plus appearances as question2).
comb['q1_freq'] = comb['q1_hash'].map(lambda x: try_apply_dict(x,q1_vc) + try_apply_dict(x,q2_vc))
comb['q2_freq'] = comb['q2_hash'].map(lambda x: try_apply_dict(x,q1_vc) + try_apply_dict(x,q2_vc))
# comb['q1_hash_freq'] = comb['q1_hash'].map(lambda x: try_apply_dict(x,q1_vc))
# comb['q2_hash_freq'] = comb['q2_hash'].map(lambda x: try_apply_dict(x,q2_vc))

# Relative frequency gap; the +0.1 keeps the numerator non-zero when the two
# frequencies are equal, so the feature still varies with the product below.
comb['freq_diff'] = (abs(comb['q1_freq'] - comb['q2_freq'])+0.1) / (comb['q1_freq'] * comb['q2_freq'])

In [3]:
# 1 when q1's first-occurrence id is greater than q2's, else 0.
comb['q_hash_pos'] = (comb['q1_hash'] - comb['q2_hash'] > 0).astype(int)

def _delta_from_running_max(values):
    """For each element, its difference to the running maximum seen so far.

    First element maps to 0. The running maximum only advances when the
    difference is >= 0 — this mirrors the original two near-identical loops,
    which both appended the delta and updated the tag only on >= 0.
    """
    deltas = [0]
    tag = values[0]
    for v in values[1:]:
        deltas.append(v - tag)
        if v - tag >= 0:
            tag = v
    return deltas

comb['q2_change'] = _delta_from_running_max(comb['q2_hash'].values)
comb['q1_change'] = _delta_from_running_max(comb['q1_hash'].values)

# comb['q1_q2_change_mean'] = (comb['q1_change'] + comb['q2_change'])/2.0
# Row-wise max; DataFrame.max(axis=1) replaces the per-row apply(lambda).
comb['q1_q2_change_max'] = comb[['q1_change','q2_change']].max(axis=1)

Q_CHANGE = 0
# 1 when BOTH questions appeared before the current running maximum
# (i.e. both are "old" questions), else 0.
comb['q_change_pair'] = ((comb['q1_change'] < Q_CHANGE) &
                         (comb['q2_change'] < Q_CHANGE)).astype(int)
# comb['q_change_x'] = comb['q1_change'] * comb['q2_change']
# comb['q_change_x'] = comb['q1_change'] * comb['q2_change']

In [4]:
from collections import defaultdict

# train_orig =  pd.read_csv(config.RAW_PATH+'train.csv', header=0)
# test_orig =  pd.read_csv(config.RAW_PATH+'test.csv', header=0)

# All question pairs from train+test in one frame.
# reset_index(drop=True): the original passed drop='index', which only works
# because a non-empty string is truthy — drop expects a bool.
ques = pd.concat([train_orig[['question1', 'question2']],
                  test_orig[['question1', 'question2']]], axis=0).reset_index(drop=True)
ques.shape

# question text -> set of questions it was ever paired with (its "neighbours")
q_dict = defaultdict(set)
# zip over the columns is equivalent to positional indexing on the
# RangeIndex (just reset above) and avoids per-element .loc lookups.
for q1, q2 in zip(ques['question1'], ques['question2']):
    q_dict[q1].add(q2)
    q_dict[q2].add(q1)
        
def q1_q2_intersect(row):
    """Number of questions that co-occur with BOTH q1 and q2 in the global q_dict."""
    common = set(q_dict[row['question1']]) & set(q_dict[row['question2']])
    return len(common)

def q1_q2_intersect_ratio(row):
    """Jaccard similarity of the neighbour sets of q1 and q2 (from global q_dict).

    Returns 0. when both neighbour sets are empty — the original divided by
    len(union) unguarded, which raises ZeroDivisionError for a pair of
    questions unseen in q_dict (defaultdict silently yields empty sets).
    """
    s1 = set(q_dict[row['question1']])
    s2 = set(q_dict[row['question2']])
    union = s1 | s2
    if not union:
        return 0.
    return len(s1 & s2) / len(union)

# raw=True removed: with raw=True pandas passes numpy arrays to the function,
# so row['question1'] fails on modern pandas (old versions silently fell back
# to the non-raw path when the function raised).
comb['q1_q2_intersect'] = comb.apply(q1_q2_intersect, axis=1)
# test_orig['q1_q2_intersect'] = test_orig.apply(q1_q2_intersect, axis=1)
comb['q1_q2_intersect_ratio'] = comb.apply(q1_q2_intersect_ratio, axis=1)
# test_orig['q1_q2_intersect_ratio'] = test_orig.apply(q1_q2_intersect_ratio, axis=1)

In [5]:
from nltk.corpus import stopwords

# English stopwords used to filter tokens in word_match_share below.
stops = set(stopwords.words("english"))
def word_match_share(q1, q2, stops=None):
    """Fraction of non-stopword tokens shared between two questions.

    Tokens are produced by lower-casing and whitespace-splitting str(q).
    Returns (shared-in-q1 + shared-in-q2) / (|q1 tokens| + |q2 tokens|),
    or 0. when either side has no tokens left after stopword removal.

    Fix: the original default stops=None raised TypeError on
    ``word not in stops``; None now means "no stopword filtering".
    """
    if stops is None:
        stops = set()
    q1words = {w: 1 for w in str(q1).lower().split() if w not in stops}
    q2words = {w: 1 for w in str(q2).lower().split() if w not in stops}
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0.
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

# question text -> {neighbour question -> word_match_share score}
# (replaces q_dict built above; each pair's score is stored symmetrically)
q_dict = defaultdict(dict)
# zip over the columns matches positional indexing on ques' RangeIndex
# and is much faster than per-element Series lookups.
for q1, q2 in zip(ques['question1'], ques['question2']):
    wm = word_match_share(q1, q2, stops=stops)
    q_dict[q1][q2] = wm
    q_dict[q2][q1] = wm

In [6]:
def q1_q2_wm_ratio(row):
    """Weighted overlap of the neighbourhoods of q1 and q2.

    Weights are the word_match_share scores stored in the global q_dict:
    sum of weights of shared neighbours (counted from both sides) divided
    by the sum of all weights on both sides. Returns 0. when there is no
    shared neighbour or the total weight is zero.
    """
    neighbors1 = q_dict[row['question1']]
    neighbors2 = q_dict[row['question2']]
    shared = set(neighbors1.keys()) & set(neighbors2.keys())
    if not shared:
        return 0.
    inter_wm = 0.
    total_wm = 0.
    for mapping in (neighbors1, neighbors2):
        for q, wm in mapping.items():
            total_wm += wm
            if q in shared:
                inter_wm += wm
    if total_wm == 0.:
        return 0.
    return inter_wm / total_wm


# raw=True removed: it makes pandas pass numpy arrays, breaking the
# row['question1'] lookups inside q1_q2_wm_ratio on modern pandas.
comb['q1_q2_wm_ratio'] = comb.apply(q1_q2_wm_ratio, axis=1)

In [7]:
# Correlation matrix over the labelled (train) rows only — test rows carry the
# is_duplicate = -1 sentinel. Used to eyeball which features track the label.
# train_comb = comb[comb['is_duplicate'] >= 0]#[corr_list]
# test_comb = comb[comb['is_duplicate'] < 0]#[corr_list]
comb[comb['is_duplicate'] >= 0].corr()


Out[7]:
id is_duplicate q1_hash q2_hash q1_freq q2_freq freq_diff q_hash_pos q2_change q1_change q1_q2_change_max q_change_pair q1_q2_intersect q1_q2_intersect_ratio q1_q2_wm_ratio
id 1.000000 -0.008784 0.692730 0.286969 -0.001608 -0.000777 0.001553 0.115864 -0.181267 -0.355739 -0.280031 0.169770 -0.003347 -0.006791 -0.006519
is_duplicate -0.008784 1.000000 -0.206498 -0.349626 0.296621 0.198609 -0.337501 0.123114 -0.354887 -0.258640 -0.369709 0.421856 0.412979 0.609256 0.641026
q1_hash 0.692730 -0.206498 1.000000 0.492993 -0.341777 -0.202545 0.093927 0.062609 0.175287 0.425403 0.362024 -0.352406 -0.316926 -0.339755 -0.344282
q2_hash 0.286969 -0.349626 0.492993 1.000000 -0.392605 -0.466434 0.165365 -0.611694 0.889947 0.280395 0.496426 -0.542355 -0.464619 -0.526013 -0.540884
q1_freq -0.001608 0.296621 -0.341777 -0.392605 1.000000 0.494315 -0.169080 0.120163 -0.402370 -0.443739 -0.477415 0.485402 0.789134 0.490514 0.487466
q2_freq -0.000777 0.198609 -0.202545 -0.466434 0.494315 1.000000 -0.100777 0.277154 -0.478562 -0.263241 -0.359293 0.370622 0.591368 0.358951 0.355246
freq_diff 0.001553 -0.337501 0.093927 0.165365 -0.169080 -0.100777 1.000000 0.001560 0.169113 0.120852 0.301276 -0.353560 -0.298225 -0.428979 -0.444353
q_hash_pos 0.115864 0.123114 0.062609 -0.611694 0.120163 0.277154 0.001560 1.000000 -0.683733 -0.067113 -0.146369 0.212361 0.162014 0.190019 0.199713
q2_change -0.181267 -0.354887 0.175287 0.889947 -0.402370 -0.478562 0.169113 -0.683733 1.000000 0.456725 0.642652 -0.638353 -0.475486 -0.536898 -0.552301
q1_change -0.355739 -0.258640 0.425403 0.280395 -0.443739 -0.263241 0.120852 -0.067113 0.456725 1.000000 0.822861 -0.676821 -0.409180 -0.434685 -0.440957
q1_q2_change_max -0.280031 -0.369709 0.362024 0.496426 -0.477415 -0.359293 0.301276 -0.146369 0.642652 0.822861 1.000000 -0.814393 -0.524993 -0.558387 -0.564957
q_change_pair 0.169770 0.421856 -0.352406 -0.542355 0.485402 0.370622 -0.353560 0.212361 -0.638353 -0.676821 -0.814393 1.000000 0.527872 0.621898 0.638566
q1_q2_intersect -0.003347 0.412979 -0.316926 -0.464619 0.789134 0.591368 -0.298225 0.162014 -0.475486 -0.409180 -0.524993 0.527872 1.000000 0.722593 0.684574
q1_q2_intersect_ratio -0.006791 0.609256 -0.339755 -0.526013 0.490514 0.358951 -0.428979 0.190019 -0.536898 -0.434685 -0.558387 0.621898 0.722593 1.000000 0.976114
q1_q2_wm_ratio -0.006519 0.641026 -0.344282 -0.540884 0.487466 0.355246 -0.444353 0.199713 -0.552301 -0.440957 -0.564957 0.638566 0.684574 0.976114 1.000000

In [8]:
# Persist all engineered "magic" features for the downstream modelling notebooks.
comb.to_csv(config.FEAT_PATH+'magic_feature.csv',index=False)

In [ ]:


In [ ]:


In [4]:
# encoding: utf-8
# NOTE: the original cell ran `reload(sys); sys.setdefaultencoding('utf8')`,
# a Python 2-only hack that does not exist in Python 3 (strings are already
# unicode there). Removed so the cell runs on a modern kernel.
import sys

import pandas as pd
import hashlib
import gc

# Raw question pairs; NaN questions become "" so .encode() below never fails.
df_train = pd.read_csv(config.RAW_PATH+'train.csv').fillna("")
df_test = pd.read_csv(config.RAW_PATH+'test.csv').fillna("")


# Generating a graph of Questions and their neighbors
def generate_qid_graph_table(row):
    """Register the question pair in the global qid_graph adjacency list.

    Nodes are md5 hex digests of the question text; each call appends each
    question's hash to the other's neighbour list (undirected edge).
    """
    key1 = hashlib.md5(row["question1"].encode('utf-8')).hexdigest()
    key2 = hashlib.md5(row["question2"].encode('utf-8')).hexdigest()
    for a, b in ((key1, key2), (key2, key1)):
        qid_graph.setdefault(a, []).append(b)


# Build the question adjacency list over train+test.
# DataFrame.apply is used purely for its side effect on qid_graph;
# the returned frame is discarded.
qid_graph = {}
print('Apply to train...')
df_train.apply(generate_qid_graph_table, axis=1)
print('Apply to test...')
df_test.apply(generate_qid_graph_table, axis=1)


def pagerank():
    MAX_ITER = 20
    d = 0.85

    # Initializing -- every node gets a uniform value!
    pagerank_dict = {i: 1 / len(qid_graph) for i in qid_graph}
    num_nodes = len(pagerank_dict)

    for iter in range(0, MAX_ITER):

        for node in qid_graph:
            local_pr = 0

            for neighbor in qid_graph[node]:
                local_pr += pagerank_dict[neighbor] / len(qid_graph[neighbor])

            pagerank_dict[node] = (1 - d) / num_nodes + d * local_pr

    return pagerank_dict

print('Main PR generator...')
pagerank_dict = pagerank()

def get_pagerank_value(row):
    """Look up the PageRank scores of both questions of a pair.

    Keys into the global pagerank_dict via md5 of the question text;
    returns a Series with columns q1_pr and q2_pr.
    """
    digest1 = hashlib.md5(row["question1"].encode('utf-8')).hexdigest()
    digest2 = hashlib.md5(row["question2"].encode('utf-8')).hexdigest()
    return pd.Series({
        "q1_pr": pagerank_dict[digest1],
        "q2_pr": pagerank_dict[digest2]
    })

# Compute per-pair PageRank features for train, free memory, then do test.
print('Apply to train...')
pagerank_feats_train = df_train.apply(get_pagerank_value, axis=1)
print('Writing train...')
# pagerank_feats_train.to_csv("pagerank_train.csv", index=False)
del df_train  # free memory before processing the (much larger) test set
gc.collect()
print('Apply to test...')
pagerank_feats_test = df_test.apply(get_pagerank_value, axis=1)
print('Writing test...')
# pagerank_feats_test.to_csv("pagerank_test.csv", index=False)

In [15]:
# Stack train and test PageRank features into one frame (row-aligned with the
# other feature files). Renamed from `train` — it holds train AND test rows.
pagerank_all = pd.concat([pagerank_feats_train, pagerank_feats_test], axis=0).reset_index(drop=True)
pagerank_all.to_csv(config.FEAT_PATH+'pagerank.csv', index=False)

In [4]:
from tqdm import tqdm
import re

train_orig =  pd.read_csv(config.RAW_PATH+'train.csv', header=0)#.sample(n=1000)
test_orig =  pd.read_csv(config.RAW_PATH+'test.csv', header=0)#.sample(n=1000)
df = pd.concat([train_orig[['question1', 'question2']], 
                test_orig[['question1', 'question2']]], axis=0).reset_index(drop=True)
locations = pd.read_csv(config.FEAT_PATH+"cities.csv")
countries = set(locations['Country'].dropna(inplace=False).values.tolist())
all_places = countries
# Escape each place name (names may contain regex metacharacters) and try
# longer names first: re alternation is first-match, not longest-match, so an
# alphabetical join would truncate "Nigeria" to a match on "Niger".
regex = "|".join(sorted((re.escape(p) for p in set(all_places)), key=len, reverse=True))



# Compile the alternation once — recompiling inside the 2.7M-row loop is the
# dominant avoidable cost of this cell.
place_pattern = re.compile(regex, flags=re.IGNORECASE)

results = []
for index, row in tqdm(df.iterrows()):
    q1 = str(row['question1'])
    q2 = str(row['question2'])

    # place mentions found in each question, lower-cased for comparison
    q1_matches = [m.lower() for m in place_pattern.findall(q1)] if q1 else []
    q2_matches = [m.lower() for m in place_pattern.findall(q2)] if q2 else []

    q1_set = set(q1_matches)
    q2_set = set(q2_matches)

    rr = {}
    rr['z_q1_place_num'] = len(q1_matches)
    rr['z_q1_has_place'] = len(q1_matches) > 0

    rr['z_q2_place_num'] = len(q2_matches)
    rr['z_q2_has_place'] = len(q2_matches) > 0

    rr['z_place_match_num'] = len(q1_set.intersection(q2_set))
    rr['z_place_match'] = rr['z_place_match_num'] > 0

    # asymmetric: places in q1 that q2 does not mention
    rr['z_place_mismatch_num'] = len(q1_set.difference(q2_set))
    rr['z_place_mismatch'] = rr['z_place_mismatch_num'] > 0

    results.append(rr)

loc_df = pd.DataFrame.from_dict(results)
#out_df.to_csv("../features/{}_place_matches.csv".format(dataset), index=False, header=True)
# out_df.to_csv("{}_place_matches.csv".format(dataset))


2750086it [16:42, 2744.08it/s]

In [5]:
# Join the hash/frequency features, PageRank features and place-match features
# column-wise; all three are row-aligned over the concatenated train+test rows.
magic_feature = pd.read_csv(config.FEAT_PATH+'magic_feature.csv')
pagerank = pd.read_csv(config.FEAT_PATH+'pagerank.csv')

magic_feature = pd.concat([magic_feature, pagerank, loc_df], axis=1)
# cast the boolean place flags to 0/1 integers for the downstream model
magic_feature['z_place_match']=magic_feature['z_place_match'].astype(int)
magic_feature['z_place_mismatch']=magic_feature['z_place_mismatch'].astype(int)
magic_feature['z_q1_has_place']=magic_feature['z_q1_has_place'].astype(int)
magic_feature['z_q2_has_place']=magic_feature['z_q2_has_place'].astype(int)

In [39]:
# Reload the raw question pairs and the pre-computed k-core table
# (question_kcores.csv maps lower-cased question text -> k-core number of the
# question graph; presumably computed in a separate notebook — not shown here).
train_orig =  pd.read_csv(config.RAW_PATH+'train.csv', header=0)#.sample(n=1000)
test_orig =  pd.read_csv(config.RAW_PATH+'test.csv', header=0)#.sample(n=1000)
df = pd.concat([train_orig[['question1', 'question2']], 
                test_orig[['question1', 'question2']]], axis=0).reset_index(drop=True)

question_kcores = pd.read_csv(config.FEAT_PATH+'question_kcores.csv')
question_kcores_dict = dict(zip(question_kcores.question, question_kcores.kcores))

def kcores(x, mapping=None):
    """K-core number of a question (lower-cased lookup); -1 when unseen.

    mapping: dict question -> kcore; defaults to the global
    question_kcores_dict so existing call sites are unchanged.
    Replaces the original bare `except: return -1` (which silently
    swallowed every error) with an explicit dict.get default; callers
    always pass str(...), so x.lower() cannot raise here.
    """
    if mapping is None:
        mapping = question_kcores_dict
    return mapping.get(x.lower(), -1)
    
# Series.map replaces the single-column DataFrame.apply(axis=1) — same result
# (str() handles NaN questions exactly as before), far fewer per-row objects.
df['q1_kcores'] = df['question1'].map(lambda q: kcores(str(q)))
df['q2_kcores'] = df['question2'].map(lambda q: kcores(str(q)))
magic_feature[['q1_kcores','q2_kcores']] = df[['q1_kcores','q2_kcores']]

In [36]:


In [73]:
############ cliques networks ###########
# Build a question graph from the first 400k train rows and, for every
# labelled pair, record the size of a maximal clique containing it.

%matplotlib inline
import networkx as nx
import pandas as pd
from itertools import combinations
import config

G = nx.Graph()
df = pd.read_csv(config.RAW_PATH+'train.csv',nrows=400000).fillna("")

# every labelled pair becomes an undirected edge between question texts
edges = [tuple(x) for x in df[['question1', 'question2']].values]
G.add_edges_from(edges)

# (q1, q2) -> is_duplicate label, keyed in the original column order
map_label = dict(((x[0], x[1]), x[2]) for x in df[['question1', 'question2', 'is_duplicate']].values)
map_clique_size = {}
# cliques sorted by size ascending: when a pair belongs to several maximal
# cliques, later (larger) cliques overwrite, so the LARGEST clique size wins
cliques = sorted(list(nx.find_cliques(G)), key=lambda x: len(x))
for cli in cliques:
    for q1, q2 in combinations(cli, 2):
        if (q1, q2) in map_label:
            map_clique_size[q1, q2] = len(cli)
        elif (q2, q1) in map_label:
            map_clique_size[q2, q1] = len(cli)

# -1 marks pairs never seen inside any clique of the (truncated) graph
df['clique_size'] = df.apply(lambda row: map_clique_size.get((row['question1'], row['question2']), -1), axis=1)

In [74]:
# Inspect pairs that belong to a clique of size > 2 (i.e. genuinely clustered questions).
df[df['clique_size']>2]


Out[74]:
id qid1 qid2 question1 question2 is_duplicate clique_size
12 12 25 26 What can make Physics easy to learn? How can you make physics easy to learn? 1 3
13 13 27 28 What was your first sexual experience like? What was your first sexual experience? 1 3
14 14 29 30 What are the laws to change your status from a... What are the laws to change your status from a... 0 3
15 15 31 32 What would a Trump presidency mean for current... How will a Trump presidency affect the student... 1 7
18 18 37 38 Why are so many Quora users posting questions ... Why do people ask Quora questions which can be... 1 14
22 22 45 46 What are the questions should not ask on Quora? Which question should I ask on Quora? 0 3
29 29 59 60 How should I prepare for CA final law? How one should know that he/she completely pre... 1 9
31 31 63 64 What are some special cares for someone with a... How can I keep my nose from getting stuffy at ... 1 3
38 38 77 78 How do we prepare for UPSC? How do I prepare for civil service? 1 6
48 48 97 98 What are some examples of products that can be... What are some of the products made from crude ... 1 3
49 49 99 100 How do I make friends. How to make friends ? 1 3
58 58 117 118 I was suddenly logged off Gmail. I can't remem... I can't remember my Gmail password or my recov... 1 7
62 62 125 126 How is the new Harry Potter book 'Harry Potter... How bad is the new book by J.K Rowling? 1 3
66 66 133 134 What is the best book ever made? What is the most important book you have ever ... 1 4
67 67 135 136 Can we ever store energy produced in lightning? Is it possible to store the energy of lightning? 1 4
72 72 145 146 How I can speak English fluently? How can I learn to speak English fluently? 1 19
73 73 147 148 How helpful is QuickBooks' auto data recovery ... What is the quickbooks customer support phone ... 1 5
74 74 149 150 Who is the richest gambler of all time and how... Who is the richest gambler of all time and how... 1 3
79 79 159 160 What is purpose of life? What's the purpose of life? What is life actua... 1 20
85 85 171 172 How can I increase my height after 21 also? Can height increase after 25? 1 13
86 86 173 174 What were the major effects of the cambodia ea... What were the major effects of the cambodia ea... 1 22
88 88 177 178 Which is the best gaming laptop under 60k INR? Which is the best gaming laptop under Rs 60000? 1 11
92 92 185 186 What are some of the best romantic movies in E... What is the best romantic movie you have ever ... 1 10
93 93 187 188 What causes a nightmare? What causes nightmares that seem real? 1 3
100 100 201 202 Will there really be any war between India and... Will there be a nuclear war between India and ... 1 12
113 113 227 228 What do you think China food? How do you think of Chinese food? 1 4
118 118 237 238 What are some mind-blowing computer tools that... What are some mind-blowing technologies that e... 0 3
120 120 241 242 Why my question was marked as needing imrovement? How can I ask a question without getting marke... 1 4
123 123 247 248 What is the greatest mystery in the universe? What is the greatest mystery of all time? 0 3
125 125 251 252 Why does China block sanctions at the UN again... Why does China support Masood Azhar? 1 4
... ... ... ... ... ... ... ...
399860 399860 6424 115673 How do you know if a person is lying? How do I know that a person is lying to me? 1 3
399862 399862 9350 59124 How do I improve my English with creative writ... How can I upgrade my English Writing skills? 1 14
399866 399866 29454 238380 How do I increase my breast size? How can I increase size of breast? 1 3
399867 399867 61005 128041 How long would it take me to learn Java? How long does it take to learn java and how di... 1 4
399880 399880 489 56778 If dark energy is created with expansion can i... If vacuum gravitational and dark energy are cr... 1 12
399890 399890 13216 6275 What do you think about RBI's new move of bann... What do you think of the decision by the India... 1 11
399891 399891 369519 141079 What should be done when one feels lonely? What should one do when one feels lonely? 1 4
399896 399896 67546 68125 Should people over 87 not be allowed to vote? Should people over 94 not be allowed to vote? 1 13
399900 399900 4629 22122 What are the best digital marketing courses fo... What is the best digital marketing course onli... 1 11
399901 399901 118213 36478 Will Donald Trump end up in some conflict of i... Will Donald Trump end up in some conflict of i... 1 4
399914 399914 30820 415556 How can someone learn to become an interaction... How should I learn interaction design? Where s... 1 3
399916 399916 354054 133020 What are some quick and painless ways to kill ... What is the most painless and peaceful way to ... 1 6
399930 399930 189036 138328 What are the main imports and exports of Venez... What are the main imports and exports of Venez... 0 4
399932 399932 40526 36872 How do I forget someone whom l love so much? How do I forget someone you were deeply in loved? 1 9
399933 399933 25909 123240 How do you block someone on Quora? How do you block people on Quora? 1 9
399934 399934 153379 11099 How do I get The Kapil Sharma Show's tickets o... How do I get tickets for The Kapil Sharma Show ? 1 4
399936 399936 316892 344198 What's the best coffee? Which is the best coffee? 1 3
399938 399938 4661 61683 What is that one decision that changed your li... What is that one incident in your life that co... 1 8
399941 399941 131205 50850 Is it good to invest in PNB housing finance is... Should I invest in PNB Housing Finance IPO ? 1 3
399943 399943 122006 96191 What are the latest developments in IT industry? What are the latest technologies and developme... 1 3
399956 399956 122282 70466 What is latest technologies? What is the latest technology in manufacturing? 0 3
399960 399960 86156 264174 How did corn for grain become a cash crop in L... How did hay become a cash crop in Louisiana? W... 0 3
399967 399967 19307 9396 Why Modi is putting a ban on 500 and 1000 notes? Why did Modi scrap Rs 500 & Rs 1000 notes? And... 1 8
399972 399972 25851 185894 What the purpose of life on earth? What's our purpose of life in this world? 1 5
399973 399973 197647 131213 Where does the water from the Great Lakes come... Where does the water from the Great Lakes come... 0 3
399980 399980 178548 231588 How do I add or change my Quora profile picture? How do you change your profile picture? 1 8
399986 399986 34642 251420 What made Facebook different than the other so... How was Facebook better than the already exist... 1 4
399987 399987 54431 5294 How is being gay or lesbian less moral than di... Is Run Ze Cao's falsification of Einstein's re... 0 3
399990 399990 152363 299311 What are some scientific or psychological theo... Is there a scientific explanation for homosexu... 1 5
399991 399991 38490 20771 What was the best day of your life? What happe... What was the best day of your life? 1 12

99590 rows × 7 columns


In [77]:
import networkx as nx
import random

# Visualise one randomly chosen clique with at least 6 questions.
# NOTE(review): random is not seeded — a different clique is drawn each run.
cliques = nx.find_cliques(G)
cliques = [clq for clq in cliques if len(clq) >= 6]

h = G.subgraph([n for n in random.choice(cliques)])
nx.draw(h, with_labels=True, alpha=0.8, font_size=11)
# break



In [ ]: