In [1]:
import numpy as np
import pandas as pd
import zipfile
from IPython.display import display # Allows the use of display() for DataFrames
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [2]:
# Extract all zipped files
dest_dir = 'C:\\Users\\bencxs\\workspace\\quora-question-pairs'
source_filename = ['train.csv.zip', 'test.csv.zip', 'sample_submission.csv.zip']

def unzip(source_filename, dest_dir):
    """Unpack every archive listed in ``source_filename`` into ``dest_dir``.

    Parameters
    ----------
    source_filename : list of str
        Zip archive paths, resolved relative to the current working directory.
    dest_dir : str
        Target directory for the extracted contents.
    """
    for archive in source_filename:
        print("Extracting " + archive)
        with zipfile.ZipFile(archive) as zf:
            zf.extractall(dest_dir)
        print("Extracted " + archive)

unzip(source_filename, dest_dir)
In [3]:
# Read training set
df_train = pd.read_csv('train.csv')

# Peek at the first rows and the numeric summary in one cell.
display(df_train.head(), df_train.describe())
In [5]:
# Class balance of the target: duplicate vs. non-duplicate question pairs.
sns.set(style="whitegrid")
ax = sns.countplot(data=df_train, x="is_duplicate")
In [19]:
# How often does each question (by qid) appear across the training pairs?
# Pool both qid columns into a single Series and histogram the per-qid counts.
qids = pd.Series(df_train['qid1'].tolist() + df_train['qid2'].tolist())

plt.figure(figsize=(12, 5))
plt.hist(qids.value_counts(), bins=50)
# 'nonposy' was removed in matplotlib 3.3; 'nonpositive' is the supported kwarg.
plt.yscale('log', nonpositive='clip')
plt.title('Log-Histogram of question appearance counts')
plt.xlabel('Number of occurrences of question')
plt.ylabel('Number of questions')
plt.show()
In [26]:
# Since % of duplicate labels are 37% in the training set, we can estimate the %
# in the test set by scanning which true positive rate minimizes the log loss of
# a constant 0.37 prediction.
from sklearn.metrics import log_loss

N = 1000
const_pred = [0.37] * N  # constant prediction equal to the train duplicate rate
losses = []
for n_pos in range(1, N):
    # Hypothetical ground truth with n_pos positives out of N samples.
    y_true = [1] * n_pos + [0] * (N - n_pos)
    losses.append(log_loss(y_true, const_pred))

# x-axis: positive-label percentage (0.1% .. 99.9%), one point per loss value.
# Note: pyplot.plot takes x/y positionally — passing them as keywords forwards
# them to Line2D and fails; the original also had 1001 x-points for 999 losses.
plt.plot(np.arange(1, N) / N * 100, losses)
plt.xlabel('% positive labels in hypothetical test set')
plt.ylabel('log loss of constant 0.37 prediction')
plt.show()
In [ ]: