In [44]:
import pandas as pd
import numpy as np
import sklearn
import subprocess
import warnings
pd.set_option('display.max_columns', None)

In [45]:
source_path = "/home/dolounet/dev/workshops/"

In [46]:
# coding: ISO-8859-1

Import data


In [47]:
raw_dataset = pd.read_csv(source_path + "Speed_Dating_Data.csv", encoding="ISO-8859-1")

Data exploration

Shape, types, distributions, categories, and potential missing values


In [48]:
raw_dataset.head(2)


Out[48]:
iid id gender idg condtn wave round position positin1 order partner pid match int_corr samerace age_o race_o pf_o_att pf_o_sin pf_o_int pf_o_fun pf_o_amb pf_o_sha dec_o attr_o sinc_o intel_o fun_o amb_o shar_o like_o prob_o met_o age field field_cd undergra mn_sat tuition race imprace imprelig from zipcode income goal date go_out career career_c sports tvsports exercise dining museums art hiking gaming clubbing reading tv theater movies concerts music shopping yoga exphappy expnum attr1_1 sinc1_1 intel1_1 fun1_1 amb1_1 shar1_1 attr4_1 sinc4_1 intel4_1 fun4_1 amb4_1 shar4_1 attr2_1 sinc2_1 intel2_1 fun2_1 amb2_1 shar2_1 attr3_1 sinc3_1 fun3_1 intel3_1 amb3_1 attr5_1 sinc5_1 intel5_1 fun5_1 amb5_1 dec attr sinc intel fun amb shar like prob met match_es attr1_s sinc1_s intel1_s fun1_s amb1_s shar1_s attr3_s sinc3_s intel3_s fun3_s amb3_s satis_2 length numdat_2 attr7_2 sinc7_2 intel7_2 fun7_2 amb7_2 shar7_2 attr1_2 sinc1_2 intel1_2 fun1_2 amb1_2 shar1_2 attr4_2 sinc4_2 intel4_2 fun4_2 amb4_2 shar4_2 attr2_2 sinc2_2 intel2_2 fun2_2 amb2_2 shar2_2 attr3_2 sinc3_2 intel3_2 fun3_2 amb3_2 attr5_2 sinc5_2 intel5_2 fun5_2 amb5_2 you_call them_cal date_3 numdat_3 num_in_3 attr1_3 sinc1_3 intel1_3 fun1_3 amb1_3 shar1_3 attr7_3 sinc7_3 intel7_3 fun7_3 amb7_3 shar7_3 attr4_3 sinc4_3 intel4_3 fun4_3 amb4_3 shar4_3 attr2_3 sinc2_3 intel2_3 fun2_3 amb2_3 shar2_3 attr3_3 sinc3_3 intel3_3 fun3_3 amb3_3 attr5_3 sinc5_3 intel5_3 fun5_3 amb5_3
0 1 1.0 0 1 1 1 10 7 NaN 4 1 11.0 0 0.14 0 27.0 2.0 35.0 20.0 20.0 20.0 0.0 5.0 0 6.0 8.0 8.0 8.0 8.0 6.0 7.0 4.0 2.0 21.0 Law 1.0 NaN NaN NaN 4.0 2.0 4.0 Chicago 60,521 69,487.00 2.0 7.0 1.0 lawyer NaN 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 3.0 2.0 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN 35.0 20.0 15.0 20.0 5.0 5.0 6.0 8.0 8.0 8.0 7.0 NaN NaN NaN NaN NaN 1 6.0 9.0 7.0 7.0 6.0 5.0 7.0 6.0 2.0 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 2.0 1.0 NaN NaN NaN NaN NaN NaN 19.44 16.67 13.89 22.22 11.11 16.67 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 7.0 8.0 7.0 6.0 NaN NaN NaN NaN NaN 1.0 1.0 0.0 NaN NaN 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.0 7.0 7.0 7.0 7.0 NaN NaN NaN NaN NaN
1 1 1.0 0 1 1 1 10 7 NaN 3 2 12.0 0 0.54 0 22.0 2.0 60.0 0.0 0.0 40.0 0.0 0.0 0 7.0 8.0 10.0 7.0 7.0 5.0 8.0 4.0 2.0 21.0 Law 1.0 NaN NaN NaN 4.0 2.0 4.0 Chicago 60,521 69,487.00 2.0 7.0 1.0 lawyer NaN 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 3.0 2.0 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN 35.0 20.0 15.0 20.0 5.0 5.0 6.0 8.0 8.0 8.0 7.0 NaN NaN NaN NaN NaN 1 7.0 8.0 7.0 8.0 5.0 6.0 7.0 5.0 1.0 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 2.0 1.0 NaN NaN NaN NaN NaN NaN 19.44 16.67 13.89 22.22 11.11 16.67 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 7.0 8.0 7.0 6.0 NaN NaN NaN NaN NaN 1.0 1.0 0.0 NaN NaN 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.0 7.0 7.0 7.0 7.0 NaN NaN NaN NaN NaN

In [49]:
raw_dataset_copy = raw_dataset.copy()  # plain assignment would only alias the frame, not copy it

In [50]:
columns_by_types = raw_dataset.columns.to_series().groupby(raw_dataset.dtypes).groups

In [51]:
raw_dataset.dtypes.value_counts()


Out[51]:
float64    174
int64       13
object       8
dtype: int64

In [52]:
raw_dataset.isnull().sum().head(3)


Out[52]:
iid       0
id        1
gender    0
dtype: int64

In [53]:
summary = raw_dataset.describe() #.transpose()
#print (summary.head())

In [54]:
#raw_dataset.groupby("gender").agg({"iid": pd.Series.nunique})
raw_dataset.groupby('gender').iid.nunique()


Out[54]:
gender
0    274
1    277
Name: iid, dtype: int64

In [55]:
raw_dataset.groupby('career').iid.nunique().sort_values(ascending=False).head(5)


Out[55]:
career
Finance          13
professor        12
Lawyer           11
Professor        10
Social Worker     9
Name: iid, dtype: int64
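
The top careers mix capitalization ("professor" vs "Professor"), so these raw counts split what is likely the same category. A minimal sketch that normalizes case and whitespace before counting (career_clean is an illustrative name):

In [ ]:
# Normalize case/whitespace so "professor" and "Professor" are counted together
career_clean = raw_dataset["career"].str.strip().str.lower()
(raw_dataset.assign(career_clean=career_clean)
            .groupby("career_clean").iid.nunique()
            .sort_values(ascending=False).head(5))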

In [56]:
raw_dataset.groupby(["gender","match"]).iid.nunique()


Out[56]:
gender  match
0       0        274
        1        221
1       0        277
        1        231
Name: iid, dtype: int64

Data processing


In [58]:
from love_matcher.refactored.main_cut import MainClass, RawSetProcessing, Trainer, FeatureEngineering, TuneParameters
local_path = source_path
local_filename = "Speed_Dating_Data.csv"
main_class = MainClass(workspace=local_path)
raw_dataframe = main_class.read_dataframe()
my_variables_selection = ["iid", "pid", "match","gender","date","go_out","sports","tvsports","exercise","dining",
                          "museums","art","hiking","gaming","clubbing","reading","tv","theater","movies",
                          "concerts","music","shopping","yoga"]

In [59]:
raw_set = RawSetProcessing(features=my_variables_selection, dataframe=raw_dataframe)
dataset_df = raw_set.combiner_pipeline()

In [60]:
dataset_df.head(2)


Out[60]:
iid pid match gender date go_out sports tvsports exercise dining museums art hiking gaming clubbing reading tv theater movies concerts music shopping yoga
0 1 11.0 0 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0
1 1 12.0 0 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0

In [61]:
# Number of unique participants
dataset_df.iid.nunique()


Out[61]:
543

In [62]:
dataset_df.shape


Out[62]:
(8271, 23)
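
RawSetProcessing comes from the love_matcher package, so combiner_pipeline's internals are not shown here. Given that the result has exactly the 23 selected columns, a rough pandas equivalent might look as follows (the drop_duplicates/dropna steps are assumptions):

In [ ]:
# Hypothetical equivalent of RawSetProcessing.combiner_pipeline:
# keep only the selected variables, then drop duplicate and incomplete rows
dataset_sketch = (raw_dataframe[my_variables_selection]
                  .drop_duplicates()
                  .dropna())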

Feature engineering


In [63]:
suffix_me = "_me"
suffix_partner = "_partner"
my_label = "match_perc"

In [64]:
feat_eng = FeatureEngineering(suffix_me, suffix_partner, my_label)
feat_engineered_df = feat_eng.get_partner_features(dataset_df)

In [65]:
feat_engineered_df.head(2)


Out[65]:
iid_me pid match gender_me date_me go_out_me sports_me tvsports_me exercise_me dining_me museums_me art_me hiking_me gaming_me clubbing_me reading_me tv_me theater_me movies_me concerts_me music_me shopping_me yoga_me iid_partner gender_partner date_partner go_out_partner sports_partner tvsports_partner exercise_partner dining_partner museums_partner art_partner hiking_partner gaming_partner clubbing_partner reading_partner tv_partner theater_partner movies_partner concerts_partner music_partner shopping_partner yoga_partner
0 1 11.0 0 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 11 1 5.0 4.0 8.0 7.0 2.0 6.0 7.0 5.0 5.0 5.0 4.0 9.0 2.0 4.0 8.0 7.0 8.0 5.0 1.0
1 2 11.0 0 0 5.0 1.0 3.0 2.0 7.0 10.0 8.0 6.0 3.0 5.0 8.0 10.0 1.0 9.0 8.0 7.0 8.0 3.0 1.0 11 1 5.0 4.0 8.0 7.0 2.0 6.0 7.0 5.0 5.0 5.0 4.0 9.0 2.0 4.0 8.0 7.0 8.0 5.0 1.0
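
get_partner_features is likewise a love_matcher method. Judging from the columns above, it joins each participant's interest ratings with those of the partner identified by pid; a minimal self-merge sketch under that assumption:

In [ ]:
# Hypothetical equivalent: pair each row with its partner's ratings by
# self-merging on pid == iid, suffixing the two sides "_me" / "_partner"
me = dataset_df.add_suffix(suffix_me).rename(
    columns={"pid" + suffix_me: "pid", "match" + suffix_me: "match"})
partner = dataset_df.drop(["pid", "match"], axis=1).add_suffix(suffix_partner)
merged_sketch = me.merge(partner, left_on="pid", right_on="iid" + suffix_partner)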

In [66]:
feat_engineered_df.groupby("match").iid_me.count()


Out[66]:
match
0    6826
1    1348
Name: iid_me, dtype: int64

Modelling

This model aims to predict whether a pair matches, based on each person's interests (a majority-class baseline check follows the variable list below).

Variables:

  • gender
  • date: In general, how frequently do you go on dates?
  • go_out: How often do you go out (not necessarily on dates)?
  • sports: Playing sports/athletics
  • tvsports: Watching sports
  • exercise: Body building/exercising
  • dining: Dining out
  • museums: Museums/galleries
  • art: Art
  • hiking: Hiking/camping
  • gaming: Gaming
  • clubbing: Dancing/clubbing
  • reading: Reading
  • tv: Watching TV
  • theater: Theater
  • movies: Movies
  • concerts: Going to concerts
  • music: Music
  • shopping: Shopping
  • yoga: Yoga/meditation
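
Out[66] above shows 1,348 matched rows against 6,826 unmatched, so any classifier must beat the majority-class baseline of roughly 0.835 accuracy. A quick check:

In [ ]:
# Class shares; always predicting "no match" already yields ~0.835 accuracy
feat_engineered_df["match"].value_counts(normalize=True)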

In [67]:
import sklearn
print(sklearn.__version__)


0.18.1

In [68]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import subprocess

Variables selection


In [69]:
features = ["iid", "gender", "date", "go_out", "sports", "tvsports", "exercise", "dining", "museums", "art",
            "hiking", "gaming", "clubbing", "reading", "tv", "theater", "movies", "concerts", "music",
            "shopping", "yoga"]
label = "match"

In [70]:
# Add a suffix to each feature name for both sides of the pair (me / partner)
def process_features_names(features, suffix_1, suffix_2):
    features_me = [feat + suffix_1 for feat in features]
    features_partner = [feat + suffix_2 for feat in features]
    features_all = features_me + features_partner
    return features_all

features_model = process_features_names(features, suffix_me, suffix_partner)

In [71]:
explanatory = feat_engineered_df[features_model]
explained = feat_engineered_df[label]

In [72]:
explanatory[explanatory["iid_me"] == 1].head(5)


Out[72]:
iid_me gender_me date_me go_out_me sports_me tvsports_me exercise_me dining_me museums_me art_me hiking_me gaming_me clubbing_me reading_me tv_me theater_me movies_me concerts_me music_me shopping_me yoga_me iid_partner gender_partner date_partner go_out_partner sports_partner tvsports_partner exercise_partner dining_partner museums_partner art_partner hiking_partner gaming_partner clubbing_partner reading_partner tv_partner theater_partner movies_partner concerts_partner music_partner shopping_partner yoga_partner
0 1 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 11 1 5.0 4.0 8.0 7.0 2.0 6.0 7.0 5.0 5.0 5.0 4.0 9.0 2.0 4.0 8.0 7.0 8.0 5.0 1.0
10 1 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 12 1 1.0 1.0 9.0 7.0 9.0 8.0 7.0 6.0 3.0 3.0 5.0 6.0 6.0 4.0 7.0 7.0 9.0 5.0 5.0
20 1 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 13 1 7.0 1.0 7.0 8.0 2.0 9.0 5.0 6.0 4.0 7.0 7.0 6.0 8.0 10.0 8.0 9.0 9.0 8.0 1.0
30 1 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 14 1 4.0 1.0 10.0 6.0 8.0 8.0 3.0 3.0 10.0 8.0 8.0 6.0 7.0 3.0 10.0 6.0 8.0 6.0 1.0
40 1 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 15 1 4.0 1.0 9.0 7.0 9.0 7.0 4.0 3.0 6.0 7.0 9.0 8.0 6.0 9.0 9.0 6.0 7.0 2.0 1.0

Tuning


In [73]:
from sklearn import ensemble

In [74]:
warnings.filterwarnings("ignore")

In [75]:
# Parameters for Random Forest

parameters = [
  {'max_depth': [8,10,12,14,16,18], 
   'min_samples_split': [10,15,20,25,30], 
   'min_samples_leaf': [10,15,20,25,30]
  }
]
scores = ['precision', 'recall']
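
TuneParameters is part of the love_matcher package, so its internals are not shown. Assuming it essentially wraps the GridSearchCV imported earlier (the report below looks like standard GridSearchCV plus classification_report output, and the support of 4087 suggests a 50/50 train/test split), a direct sketch would be roughly:

In [ ]:
# Hypothetical equivalent of TuneParameters.combiner_pipeline for the
# 'precision' score (the wrapper presumably loops over both scores)
x_tr, x_te, y_tr, y_te = train_test_split(explanatory, explained,
                                          test_size=0.5, random_state=0)
grid = GridSearchCV(ensemble.RandomForestClassifier(n_estimators=5),
                    parameters, scoring='precision')
grid.fit(x_tr, y_tr)
print(grid.best_params_)
print(classification_report(y_te, grid.predict(x_te)))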

In [76]:
RFModel = ensemble.RandomForestClassifier(n_estimators=5, oob_score=False)

In [82]:
tune = TuneParameters(explanatory, explained, RFModel, parameters, scores)
best_parameters = tune.combiner_pipeline()
x_train, x_test, y_train, y_test  = tune.create_train_test_splits()


# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'min_samples_leaf': 10, 'min_samples_split': 15, 'max_depth': 10}

             precision    recall  f1-score   support

          0       0.84      1.00      0.91      3413
          1       0.71      0.01      0.03       674

avg / total       0.82      0.84      0.77      4087


Training


In [83]:
estimator_RFC = ensemble.RandomForestClassifier()

In [84]:
best_parameters


Out[84]:
{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_split': 1e-07,
 'min_samples_leaf': 10,
 'min_samples_split': 15,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 5,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [87]:
train = Trainer(x_train, y_train, x_test, y_test, best_parameters)
estimator, score_train, score_test = train.combiner_pipeline()
print(estimator, score_train, score_test)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=10,
            min_samples_split=15, min_weight_fraction_leaf=0.0,
            n_estimators=5, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False) 0.842671886469 0.834352826034

Evaluate


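Two things stand out above: the test accuracy (0.834) is almost exactly the majority-class rate (6826 / 8174 ≈ 0.835), and the tuning report gave a recall of only 0.01 for class 1, i.e. the model predicts "no match" nearly everywhere. A minimal evaluation sketch for the fitted estimator on the held-out split from the cells above:

In [ ]:
from sklearn.metrics import confusion_matrix

# Per-class metrics and confusion matrix on the test set
y_pred = estimator.predict(x_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))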