Exploratory Data Analysis



In [1]:

    
import os
import zipfile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display # Allows the use of display() for DataFrames
from sklearn.metrics import log_loss
%matplotlib inline



In [2]:

    
# Settings for extracting the zipped data files.
# The destination can be overridden with the QUORA_DATA_DIR environment variable so the
# notebook is not tied to a single machine; the fallback is the original location.
dest_dir = os.environ.get('QUORA_DATA_DIR', 'C:\\Users\\bencxs\\workspace\\quora-question-pairs')
# Archives to extract; these relative names are resolved against the current working directory.
source_filename = ['train.csv.zip','test.csv.zip','sample_submission.csv.zip']

def unzip(source_filename, dest_dir):
    """Extract every zip archive named in ``source_filename`` into ``dest_dir``.

    Args:
        source_filename: An iterable of zip archive paths, or a single path
            given as a string.
        dest_dir: Directory that receives the extracted members.

    Prints one progress line before and one after each archive. Exceptions
    raised by ``zipfile`` (for example for a missing archive) are not caught.
    """
    # A bare string is itself iterable; without this guard a single path would
    # be walked character by character.
    if isinstance(source_filename, str):
        source_filename = [source_filename]

    for source in source_filename:
        print("Extracting " + source)
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(source) as zf:
            zf.extractall(dest_dir)
        print("Extracted " + source)
        
# Run the extraction for the configured archives and destination.
unzip(source_filename=source_filename, dest_dir=dest_dir)









    



Extracting train.csv.zip
Extracted train.csv.zip
Extracting test.csv.zip
Extracted test.csv.zip
Extracting sample_submission.csv.zip
Extracted sample_submission.csv.zip



In [3]:

    
# Read the training set from the directory the archives were extracted into,
# rather than relying on the working directory happening to be that directory.
train_csv_path = os.path.join(dest_dir, 'train.csv')
df_train = pd.read_csv(train_csv_path)

# First rows, then summary statistics of the numeric columns.
display(df_train.head())
display(df_train.describe())









    






  
    
      
      id
      qid1
      qid2
      question1
      question2
      is_duplicate
    
  
  
    
      0
      0
      1
      2
      What is the step by step guide to invest in sh...
      What is the step by step guide to invest in sh...
      0
    
    
      1
      1
      3
      4
      What is the story of Kohinoor (Koh-i-Noor) Dia...
      What would happen if the Indian government sto...
      0
    
    
      2
      2
      5
      6
      How can I increase the speed of my internet co...
      How can Internet speed be increased by hacking...
      0
    
    
      3
      3
      7
      8
      Why am I mentally very lonely? How can I solve...
      Find the remainder when [math]23^{24}[/math] i...
      0
    
    
      4
      4
      9
      10
      Which one dissolve in water quikly sugar, salt...
      Which fish would survive in salt water?
      0
    
  








    






  
    
      
      id
      qid1
      qid2
      is_duplicate
    
  
  
    
      count
      404290.000000
      404290.000000
      404290.000000
      404290.000000
    
    
      mean
      202144.500000
      217243.942418
      220955.655337
      0.369198
    
    
      std
      116708.614502
      157751.700002
      159903.182629
      0.482588
    
    
      min
      0.000000
      1.000000
      2.000000
      0.000000
    
    
      25%
      101072.250000
      74437.500000
      74727.000000
      0.000000
    
    
      50%
      202144.500000
      192182.000000
      197052.000000
      0.000000
    
    
      75%
      303216.750000
      346573.500000
      354692.500000
      1.000000
    
    
      max
      404289.000000
      537932.000000
      537933.000000
      1.000000



In [5]:

    
# Bar chart of how many rows carry each value of the `is_duplicate` label.
sns.set(style="whitegrid")
ax = sns.countplot(x="is_duplicate", data=df_train)
# Title and axis labels so the figure can be read on its own.
ax.set(
    title="Label counts in the training set",
    xlabel="is_duplicate",
    ylabel="Number of question pairs",
);



In [19]:

    
# Pool both id columns so each question id is counted once per appearance,
# regardless of which side of a pair it is on.
qids = pd.concat([df_train['qid1'], df_train['qid2']], ignore_index=True)

plt.figure(figsize=(12, 5))
# value_counts() gives the number of appearances per question id; the histogram
# then shows how many ids share each appearance count.
plt.hist(qids.value_counts(), bins=50)
# `nonposy` was renamed to `nonpositive` in Matplotlib 3.3 and the old keyword
# was later removed; 'clip' keeps empty bins drawable on the log axis.
plt.yscale('log', nonpositive='clip')
plt.title('Log-Histogram of question appearance counts')
plt.xlabel('Number of occurences of question')
plt.ylabel('Number of questions')
plt.show()



In [26]:

    
# About 37% of the training labels are duplicates (see the mean of is_duplicate above).
# Scoring that constant prediction against label sets with a varying share of
# positives shows the log loss to expect for each possible share, which can be used
# to estimate the share of duplicates in the test set.
n_samples = 1000
p = [0.37] * n_samples

# r is the number of positive labels out of n_samples. r = 0 and r = n_samples are
# skipped: presumably because log_loss needs both classes present in y_true unless
# `labels` is passed -- confirm against the scikit-learn docs.
positive_counts = np.arange(1, n_samples)
l = []
for r in positive_counts:
    y = [1] * r + [0] * (n_samples - r)
    l.append(log_loss(y, p))

# Share of positives in percent, one x value per computed loss.
pct_positive = positive_counts * 100.0 / n_samples

# plot() takes its data positionally; passing x=/y= as keywords draws no line.
plt.plot(pct_positive, l)
plt.title('Log loss of a constant 0.37 prediction')
plt.xlabel('Share of positive labels (%)')
plt.ylabel('Log loss')
plt.show()



In [ ]:

	id	qid1	qid2	question1	question2
0	0	1	2	What is the step by step guide to invest in sh...	What is the step by step guide to invest in sh...
1	1	3	4	What is the story of Kohinoor (Koh-i-Noor) Dia...	What would happen if the Indian government sto...
2	2	5	6	How can I increase the speed of my internet co...	How can Internet speed be increased by hacking...
3	3	7	8	Why am I mentally very lonely? How can I solve...	Find the remainder when [math]23^{24}[/math] i...
4	4	9	10	Which one dissolve in water quikly sugar, salt...	Which fish would survive in salt water?

	id	qid1	qid2	is_duplicate
count	404290.000000	404290.000000	404290.000000	404290.000000
mean	202144.500000	217243.942418	220955.655337	0.369198
std	116708.614502	157751.700002	159903.182629	0.482588
min	0.000000	1.000000	2.000000	0.000000
25%	101072.250000	74437.500000	74727.000000	0.000000
50%	202144.500000	192182.000000	197052.000000	0.000000
75%	303216.750000	346573.500000	354692.500000	1.000000
max	404289.000000	537932.000000	537933.000000	1.000000