In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
In [2]:
# Load the raw Quora question-pairs training data and preview the first rows.
train_path = '../data/raw/train.csv'
df = pd.read_csv(train_path)
df.head()
Out[2]:
Ok, so we're getting a pretty simple input format: a row ID, question IDs 1 and 2, the texts for questions 1 and 2, and a marker indicating whether the pair is a duplicate. According to the Kaggle competition, question1 and question2 contain the full text of the questions. So let's see if this is really the full text or just a title by looking at the longest sample we have.
I am wondering whether this list is fully connected between all questions or just consists of random pairs, and whether some of those questions appear in the data multiple times.
In [3]:
# Find the longest question text to judge whether the columns hold full text
# or only short titles.
# The "float values" the raw CSV produces are NaN entries in the question
# columns; apply(str) would silently turn them into the literal string "nan",
# so drop them explicitly instead.
questions = pd.concat([df['question1'], df['question2']]).dropna()
df_combined = pd.DataFrame({'question': questions.astype(str)})
# Vectorized string length is faster and clearer than .apply(len).
df_combined['text_length'] = df_combined['question'].str.len()
df_combined.sort_values(by='text_length', ascending=False).iloc[0]['question']
Out[3]:
From my experience with Quora, this is indeed a full-text question. I am just wondering whether there is no short title for this question. As far as I know, each question has a short title (and some additionally have a long description like this one).
In [4]:
# How often does each question ID occur across both ID columns?
# Repeated IDs mean the same question participates in several pairs.
question_ids = pd.concat([df['qid1'], df['qid2']])
df_combined = pd.Series(question_ids)
id_counts = df_combined.value_counts()
id_counts.sort_values(ascending=False).head()
Out[4]:
Yes, some of them are there multiple times, but not too often. Let's see if the IDs really match the texts.
In [5]:
# Same check on the question texts: do identical titles repeat, and do the
# repetition counts line up with the ID counts above?
questions = pd.concat([df['question1'], df['question2']])
df_combined = pd.Series(questions)
title_counts = df_combined.value_counts()
title_counts.sort_values(ascending=False).head()
Out[5]:
We see that there seem to be some questions with different IDs but the same title. If we're lucky, those match each other and appear in the data set as duplicates.
In [6]:
# Is the most-repeated title ever paired with itself in a single row?
question_title = 'What are the best ways to lose weight?'
same_title_both_sides = (df['question1'] == question_title) & (df['question2'] == question_title)
df[same_title_both_sides]
Out[6]:
Unfortunately, they are not. So let's at least verify the ID of the second question with this title to make sure that there is nothing wrong with our counting code.
In [7]:
# Total occurrences of this title per question ID, summed across both sides
# of the pair. .loc avoids chained indexing; fill_value=0 keeps IDs that
# appear on only one side.
ids1 = df.loc[df['question1'] == question_title, 'qid1'].value_counts()
ids2 = df.loc[df['question2'] == question_title, 'qid2'].value_counts()
ids1.add(ids2, fill_value=0)
Out[7]:
The question IDs are fine, there are four questions with the same title, but only one of them occurs in a lot of matches in this duplicate list.
Let's finally check how many unique questions and how many samples we got.
In [8]:
# Count unique questions vs. total sample rows.
# Use fresh names: the earlier cells bound `questions` to a Series of texts;
# reusing that name for an int is a hidden-state hazard on partial re-runs.
# .nunique() is the idiomatic form of len(.unique()).
n_questions = pd.concat([df['qid1'], df['qid2']]).nunique()
n_samples = len(df)
print('%d questions and %d samples' % (n_questions, n_samples))
There are two questions per sample, so there will be a lot of questions which only occur a single time in the whole data set.
Let's see how often they decided that two questions are duplicates in this dataset and how often not. This is important to make sure that our model will not be biased by the data it has seen (e.g. get an accuracy score of 99% by just betting "no", just because 99% of the training data is "no").
In [9]:
sns.countplot(df['is_duplicate']);
About 250,000 samples from the data are not considered duplicates and about 150,000 samples of the data are considered duplicates. That's not exactly uniform, but not totally biased either. We will have to keep this in mind, but it won't totally break our estimations if we forget to keep a very close eye onto it.