In [1]:
#First let's import the necessary modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import os
from IPython.display import display, HTML

pd.set_option('display.max_columns', 500)

In [2]:
#Specifying the Data Path

cwd = os.getcwd()
file_path = os.path.join(cwd, 'cleaned_speed_dating.csv')

In [3]:
df=pd.read_csv(file_path)

Split the dataset into training and test sets


In [4]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size = 0.2, random_state=42)

Split the dataset by gender


In [5]:
female_df = df.loc[df['gender'] == 0]
male_df = df.loc[df['gender'] == 1]

In [6]:
female_train, female_test = train_test_split(female_df, test_size = 0.2, random_state=42)
male_train, male_test = train_test_split(male_df, test_size = 0.2, random_state=42)

Logistic Regression


In [7]:
from sklearn import linear_model
lr = linear_model.LogisticRegression()

In [8]:
# Do logistic regression using only one variable
predictors = ['attr_partner']
lr_model = lr.fit(train[predictors].values, train['dec'].values)
print('training set performance is {}'.format(lr_model.score(train[predictors].values, train['dec'].values)))
print('test set performance is {}'.format(lr_model.score(test[predictors].values, test['dec'].values)))


training set performance is 0.727696032781
test set performance is 0.729709605361

In fact, if you only use one variable, that is no different than just doing a cut off based on one of the previous graphs and make naive predictions based on that. Let's see what are the other variables that we can use.


In [9]:
list(df.columns[1:-1])


Out[9]:
['gender',
 'age',
 'date',
 'sports',
 'tvsports',
 'exercise',
 'dining',
 'museums',
 'art',
 'hiking',
 'gaming',
 'clubbing',
 'reading',
 'tv',
 'theater',
 'movies',
 'concerts',
 'music',
 'shopping',
 'yoga',
 'attr_want',
 'sinc_want',
 'intel_want',
 'fun_want',
 'amb_want',
 'shar_want',
 'attr_self',
 'sinc_self',
 'fun_self',
 'intel_self',
 'amb_self',
 'pid',
 'age_partner',
 'int_corr',
 'samerace',
 'attr_partner',
 'sinc_partner',
 'intel_partner',
 'fun_partner',
 'amb_partner',
 'shar_partner',
 'prob']

In [10]:
predictors = ['age','date','int_corr',
 'samerace',
 'sinc_partner',
 'intel_partner',
 'fun_partner',
 'amb_partner',
 'shar_partner','prob']
lr_model = lr.fit(train[predictors].values, train['dec'].values)
print('training set performance is {}'.format(lr_model.score(train[predictors].values, train['dec'].values)))
print('test set performance is {}'.format(lr_model.score(test[predictors].values, test['dec'].values)))


training set performance is 0.713540696592
test set performance is 0.707371556217

In [11]:
# You can also see how important each of those factors is (sort of)
print(lr_model.coef_)


[[-0.01254919 -0.01858074  0.03288736  0.06893665 -0.07068834  0.04977025
   0.42423327 -0.15730692  0.258164    0.15273089]]

But it does seems like attractiveness is more indicative than anything else. Next you can try to combine them and repeat the same procedure. Is there an improvement to the performance? What can you infer from the coefficients this time?


In [ ]:

Try to include also those variables that you think are important and repeat the same step again. Observe what happen to the performances when you add more and more predictors.


In [ ]:

Based on the graphs on EDA, it seems that male and female make their decisions quite differently. Try to repeat the above with female_df and male_df and see if the results improve.

Benchmark

In order to know how good our prediction performance is, we should at least compare it to the performances of some naive algorithms.


In [12]:
# what if I just look at the training set and guess the most popular decisions?
no_female_train = female_train.query('dec == 0')
print('Proportion of rejection by female in training set is {}'\
      .format(no_female_train.shape[0]/female_train.shape[0]))

no_female_test = female_test.query('dec == 0')
print('Proportion of rejection by female in test set is {}'\
      .format(no_female_test.shape[0]/female_test.shape[0]))


Proportion of rejection by female in training set is 0
Proportion of rejection by female in test set is 0

In [13]:
# what if I just look at the training set and guess the most popular decisions?
no_male_train = male_train.query('dec == 0')
print('Proportion of rejection by male in training set is {}'\
      .format(no_male_train.shape[0]/male_train.shape[0]))

no_male_test = male_test.query('dec == 0')
print('Proportion of rejection by male in test set is {}'\
      .format(no_male_test.shape[0]/male_test.shape[0]))


Proportion of rejection by male in training set is 0
Proportion of rejection by male in test set is 0

In [14]:
# what if I simply do a cut off at attr_partner and base my decision on that? (refer to graphs plotted in EDA)
male_test['attr_cut_predict'] = (male_test['attr_partner']>=7)
print('male test set performance is {}'\
      .format((male_test['attr_cut_predict'] == male_test['dec']).sum()/male_test.shape[0]))


male test set performance is 0
/home/paul/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app

In [15]:
female_test['attr_cut_predict'] = (female_test['attr_partner']>=8)
print('female test set performance is {}'\
      .format((female_test['attr_cut_predict'] == female_test['dec']).sum()/female_test.shape[0]))


female test set performance is 0
/home/paul/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

Tree

Tree model allows combination of factors (as opposed to logistic regression model).


In [16]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(min_impurity_split=0.3)

In [17]:
predictors = ['age','date','int_corr',
 'samerace',
 'sinc_partner',
 'intel_partner',
 'fun_partner',
 'amb_partner',
 'shar_partner','prob', 'attr_partner','attr_want',
 'sinc_want',
 'intel_want',
 'fun_want',
 'amb_want',
 'shar_want']
dt_model = dt.fit(train[predictors].values, train['dec'].values)
print('training set performance is {}'.format(dt_model.score(train[predictors].values, train['dec'].values)))
print('test set performance is {}'.format(dt_model.score(test[predictors].values, test['dec'].values)))


training set performance is 0.879307133544
test set performance is 0.731198808637
/home/paul/anaconda2/lib/python2.7/site-packages/sklearn/tree/tree.py:282: DeprecationWarning: The min_impurity_split parameter is deprecated and will be removed in version 0.21. Use the min_impurity_decrease parameter instead.
  DeprecationWarning)

In [18]:
dt_model.feature_importances_


Out[18]:
array([ 0.02580046,  0.01579462,  0.04281306,  0.00661296,  0.01798519,
        0.01638643,  0.03296896,  0.0242345 ,  0.10993459,  0.04845516,
        0.42419231,  0.05331946,  0.04078902,  0.05114997,  0.03566275,
        0.02324129,  0.03065928])

Ensemble Methods


In [19]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [20]:
predictors = ['age','date','int_corr',
 'samerace',
 'sinc_partner',
 'intel_partner',
 'fun_partner',
 'amb_partner',
 'shar_partner','prob', 'attr_partner','attr_want',
 'sinc_want',
 'intel_want',
 'fun_want',
 'amb_want',
 'shar_want']
rf_model = rf.fit(train[predictors].values, train['dec'].values)
print('training set performance is {}'.format(rf_model.score(train[predictors].values, train['dec'].values)))
print('test set performance is {}'.format(rf_model.score(test[predictors].values, test['dec'].values)))


training set performance is 0.991246042093
test set performance is 0.751303052867

In [21]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(max_depth=7)

In [22]:
predictors = ['age','date','int_corr',
 'samerace',
 'sinc_partner',
 'intel_partner',
 'fun_partner',
 'amb_partner',
 'shar_partner','prob', 'attr_partner','attr_want',
 'sinc_want',
 'intel_want',
 'fun_want',
 'amb_want',
 'shar_want']
gb_model = gb.fit(train[predictors].values, train['dec'].values)
print('training set performance is {}'.format(gb_model.score(train[predictors].values, train['dec'].values)))
print('test set performance is {}'.format(gb_model.score(test[predictors].values, test['dec'].values)))


training set performance is 0.982305829763
test set performance is 0.793000744602