In [1]:
# First, let's import the necessary modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from IPython.display import display, HTML
pd.set_option('display.max_columns', 500)
In [2]:
#Specifying the Data Path
cwd = os.getcwd()
file_path = os.path.join(cwd, 'cleaned_speed_dating.csv')
In [3]:
df = pd.read_csv(file_path)
Split the dataset into training and test sets
In [4]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2, random_state=42)
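If the yes/no decisions are imbalanced, a stratified split keeps the class proportions the same in both sets. A minimal sketch (the names train_s and test_s are just for illustration):
In [ ]:
# Stratify on the decision column so that the proportion of dec == 1
# is the same in the training and test sets.
train_s, test_s = train_test_split(df, test_size=0.2, random_state=42,
                                   stratify=df['dec'])
print('train yes-rate: {:.3f}, test yes-rate: {:.3f}'.format(
    train_s['dec'].mean(), test_s['dec'].mean()))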
Split the dataset by gender
In [5]:
female_df = df.loc[df['gender'] == 0]
male_df = df.loc[df['gender'] == 1]
In [6]:
female_train, female_test = train_test_split(female_df, test_size=0.2, random_state=42)
male_train, male_test = train_test_split(male_df, test_size=0.2, random_state=42)
In [7]:
from sklearn import linear_model
lr = linear_model.LogisticRegression()
In [8]:
# Do logistic regression using only one variable
predictors = ['attr_partner']
lr_model = lr.fit(train[predictors].values, train['dec'].values)
print('training set performance is {}'.format(lr_model.score(train[predictors].values, train['dec'].values)))
print('test set performance is {}'.format(lr_model.score(test[predictors].values, test['dec'].values)))
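With one variable, logistic regression predicts "yes" exactly when the linear score coef * x + intercept crosses zero, i.e. at x = -intercept / coef, so you can read the implied cutoff straight off the fitted model:
In [ ]:
# P(dec=1) = 0.5 exactly where coef * attr_partner + intercept = 0,
# i.e. at attr_partner = -intercept / coef.
implied_cutoff = -lr_model.intercept_[0] / lr_model.coef_[0][0]
print('implied attr_partner cutoff: {:.2f}'.format(implied_cutoff))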
In fact, using only one variable is no different from picking a cutoff based on one of the previous graphs and making naive predictions from it. Let's see what other variables we can use.
In [9]:
list(df.columns[1:-1])
Out[9]:
In [10]:
predictors = ['age', 'date', 'int_corr', 'samerace',
              'sinc_partner', 'intel_partner', 'fun_partner',
              'amb_partner', 'shar_partner', 'prob']
lr_model = lr.fit(train[predictors].values, train['dec'].values)
print('training set performance is {}'.format(lr_model.score(train[predictors].values, train['dec'].values)))
print('test set performance is {}'.format(lr_model.score(test[predictors].values, test['dec'].values)))
In [11]:
# You can also inspect the coefficients to gauge how important each factor is
# (only roughly, since the predictors are on different scales)
print(lr_model.coef_)
But it does seem that attractiveness is more indicative than anything else. Next, try combining attr_partner with these predictors and repeating the same procedure, as in the sketch below. Does the performance improve? What can you infer from the coefficients this time?
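As a starting point, here is a minimal sketch that adds attr_partner to the earlier predictors and standardizes the features first, so that the coefficient magnitudes become roughly comparable (the pipeline and predictor list are illustrative, not the only choice):
In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Combine attr_partner with the earlier predictors and scale each feature
# to zero mean / unit variance, so coefficient sizes are comparable.
all_preds = ['attr_partner', 'age', 'date', 'int_corr', 'samerace',
             'sinc_partner', 'intel_partner', 'fun_partner',
             'amb_partner', 'shar_partner', 'prob']
pipe = make_pipeline(StandardScaler(), linear_model.LogisticRegression())
pipe.fit(train[all_preds].values, train['dec'].values)
print('test set performance is {}'.format(
    pipe.score(test[all_preds].values, test['dec'].values)))
coefs = pipe.named_steps['logisticregression'].coef_[0]
for name, c in sorted(zip(all_preds, coefs), key=lambda t: -abs(t[1])):
    print('{:>15s}: {:+.3f}'.format(name, c))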
In [ ]:
Try including the other variables you think are important and repeat the same steps. Observe what happens to the training and test performance as you add more and more predictors; one way to watch this is sketched below.
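A sketch that grows the predictor list one variable at a time and records the scores (the order of the candidates below is an arbitrary choice, not a recommendation):
In [ ]:
# Grow the predictor list one variable at a time and track performance.
candidates = ['attr_partner', 'fun_partner', 'shar_partner', 'prob',
              'sinc_partner', 'intel_partner', 'amb_partner',
              'int_corr', 'samerace', 'age', 'date']
used = []
for col in candidates:
    used.append(col)
    m = lr.fit(train[used].values, train['dec'].values)
    print('{:2d} predictors: train {:.3f}, test {:.3f}'.format(
        len(used),
        m.score(train[used].values, train['dec'].values),
        m.score(test[used].values, test['dec'].values)))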
In [ ]:
Based on the graphs from the EDA, it seems that males and females make their decisions quite differently. Try repeating the above with female_df and male_df and see whether the results improve; a per-gender sketch follows.
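A minimal per-gender sketch, fitting one logistic regression per gender with the same predictor list used above:
In [ ]:
# Fit a separate logistic regression for each gender (same predictors).
for name, (tr, te) in [('female', (female_train, female_test)),
                       ('male', (male_train, male_test))]:
    m = linear_model.LogisticRegression().fit(tr[predictors].values,
                                              tr['dec'].values)
    print('{} test set performance is {:.3f}'.format(
        name, m.score(te[predictors].values, te['dec'].values)))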
In order to know how good our prediction performance is, we should at least compare it to the performance of some naive baselines.
In [12]:
# What if I just look at the training set and guess the most common decision (the majority class)?
no_female_train = female_train.query('dec == 0')
print('Proportion of rejection by female in training set is {}'\
.format(no_female_train.shape[0]/female_train.shape[0]))
no_female_test = female_test.query('dec == 0')
print('Proportion of rejection by female in test set is {}'\
.format(no_female_test.shape[0]/female_test.shape[0]))
In [13]:
# What if I just look at the training set and guess the most common decision (the majority class)?
no_male_train = male_train.query('dec == 0')
print('Proportion of rejection by male in training set is {}'\
.format(no_male_train.shape[0]/male_train.shape[0]))
no_male_test = male_test.query('dec == 0')
print('Proportion of rejection by male in test set is {}'\
.format(no_male_test.shape[0]/male_test.shape[0]))
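scikit-learn packages this majority-class guess as DummyClassifier, which makes a convenient baseline; a minimal sketch:
In [ ]:
from sklearn.dummy import DummyClassifier

# DummyClassifier ignores the features and always predicts the class
# that was most frequent in the training set.
for name, (tr, te) in [('female', (female_train, female_test)),
                       ('male', (male_train, male_test))]:
    dummy = DummyClassifier(strategy='most_frequent')
    dummy.fit(tr[['attr_partner']].values, tr['dec'].values)
    print('{} majority-class baseline: {:.3f}'.format(
        name, dummy.score(te[['attr_partner']].values, te['dec'].values)))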
In [14]:
# What if I simply apply a cutoff on attr_partner and base my decision on that? (refer to graphs plotted in the EDA)
male_test = male_test.copy()  # copy to avoid pandas' SettingWithCopyWarning
male_test['attr_cut_predict'] = (male_test['attr_partner'] >= 7)
print('male test set performance is {}'\
.format((male_test['attr_cut_predict'] == male_test['dec']).sum()/male_test.shape[0]))
In [15]:
female_test = female_test.copy()  # copy to avoid pandas' SettingWithCopyWarning
female_test['attr_cut_predict'] = (female_test['attr_partner'] >= 8)
print('female test set performance is {}'\
.format((female_test['attr_cut_predict'] == female_test['dec']).sum()/female_test.shape[0]))
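The cutoffs 7 and 8 above come from eyeballing the EDA graphs; you can also sweep the threshold on the training set and keep the best one. A sketch for the female data (the range 1..10 assumes ratings on a 1-10 scale):
In [ ]:
# Choose the attr_partner cutoff that maximizes training accuracy,
# then evaluate that single cutoff on the test set.
best_cut = max(range(1, 11), key=lambda c: (
    (female_train['attr_partner'] >= c) == female_train['dec']).mean())
acc = ((female_test['attr_partner'] >= best_cut) == female_test['dec']).mean()
print('best female cutoff {} gives test accuracy {:.3f}'.format(best_cut, acc))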
A tree model can capture combinations (interactions) of factors, which a plain logistic regression model cannot.
In [16]:
from sklearn.tree import DecisionTreeClassifier
# min_impurity_split was deprecated in scikit-learn 0.19 and removed in 1.0;
# min_impurity_decrease is the current way to limit splits by impurity
# (the 0.01 here is a stand-in threshold, not equivalent to the original 0.3).
dt = DecisionTreeClassifier(min_impurity_decrease=0.01)
In [17]:
predictors = ['age', 'date', 'int_corr', 'samerace',
              'sinc_partner', 'intel_partner', 'fun_partner',
              'amb_partner', 'shar_partner', 'prob',
              'attr_partner', 'attr_want', 'sinc_want',
              'intel_want', 'fun_want', 'amb_want', 'shar_want']
dt_model = dt.fit(train[predictors].values, train['dec'].values)
print('training set performance is {}'.format(dt_model.score(train[predictors].values, train['dec'].values)))
print('test set performance is {}'.format(dt_model.score(test[predictors].values, test['dec'].values)))
In [18]:
dt_model.feature_importances_
Out[18]:
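The raw importance array is hard to read on its own; pairing each value with its predictor name makes it interpretable:
In [ ]:
# Pair each importance with its predictor name and sort descending.
importances = pd.Series(dt_model.feature_importances_, index=predictors)
print(importances.sort_values(ascending=False))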
In [19]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
In [20]:
predictors = ['age', 'date', 'int_corr', 'samerace',
              'sinc_partner', 'intel_partner', 'fun_partner',
              'amb_partner', 'shar_partner', 'prob',
              'attr_partner', 'attr_want', 'sinc_want',
              'intel_want', 'fun_want', 'amb_want', 'shar_want']
rf_model = rf.fit(train[predictors].values, train['dec'].values)
print('training set performance is {}'.format(rf_model.score(train[predictors].values, train['dec'].values)))
print('test set performance is {}'.format(rf_model.score(test[predictors].values, test['dec'].values)))
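A single train/test split can be noisy, especially for a high-variance model like a random forest; cross-validating on the training set gives a steadier estimate. A minimal sketch:
In [ ]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the training set.
scores = cross_val_score(RandomForestClassifier(random_state=42),
                         train[predictors].values, train['dec'].values, cv=5)
print('cross-validated accuracy: {:.3f} +/- {:.3f}'.format(
    scores.mean(), scores.std()))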
In [21]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(max_depth=7)
In [22]:
predictors = ['age', 'date', 'int_corr', 'samerace',
              'sinc_partner', 'intel_partner', 'fun_partner',
              'amb_partner', 'shar_partner', 'prob',
              'attr_partner', 'attr_want', 'sinc_want',
              'intel_want', 'fun_want', 'amb_want', 'shar_want']
gb_model = gb.fit(train[predictors].values, train['dec'].values)
print('training set performance is {}'.format(gb_model.score(train[predictors].values, train['dec'].values)))
print('test set performance is {}'.format(gb_model.score(test[predictors].values, test['dec'].values)))
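To wrap up, a quick side-by-side of the tree-based models on the same held-out test set (lr_model above was fit on a shorter predictor list, so it is left out; this assumes all the cells above have been run):
In [ ]:
# Compare the fitted tree-based models on the same test set.
for name, model in [('decision tree', dt_model),
                    ('random forest', rf_model),
                    ('gradient boosting', gb_model)]:
    print('{:>20s}: {:.3f}'.format(
        name, model.score(test[predictors].values, test['dec'].values)))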