In [1]:
import pandas as pd
import numpy as np
import sklearn
import subprocess
import warnings
pd.set_option('display.max_columns', None)

In [2]:
source_path = "/Users/sandrapietrowska/Documents/Trainings/luigi/data_source/"

In [3]:
# coding: ISO-8859-1

Import data


In [4]:
raw_dataset = pd.read_csv(source_path + "Speed_Dating_Data.csv", encoding="ISO-8859-1")

Data exploration

Shape, types, distribution, modalities and potential missing values
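
A compact way to look at most of these at once (a hedged sketch; the numbered cells below then examine each aspect in turn):

In [ ]:
# Overall shape and the columns with the most missing values
print(raw_dataset.shape)
print(raw_dataset.isnull().sum().sort_values(ascending=False).head(10))

# Modalities (distinct values) of the text columns
for col in raw_dataset.select_dtypes(include=["object"]).columns:
    print(col, raw_dataset[col].nunique())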


In [5]:
raw_dataset.head(2)


Out[5]:
iid id gender idg condtn wave round position positin1 order partner pid match int_corr samerace age_o race_o pf_o_att pf_o_sin pf_o_int pf_o_fun pf_o_amb pf_o_sha dec_o attr_o sinc_o intel_o fun_o amb_o shar_o like_o prob_o met_o age field field_cd undergra mn_sat tuition race imprace imprelig from zipcode income goal date go_out career career_c sports tvsports exercise dining museums art hiking gaming clubbing reading tv theater movies concerts music shopping yoga exphappy expnum attr1_1 sinc1_1 intel1_1 fun1_1 amb1_1 shar1_1 attr4_1 sinc4_1 intel4_1 fun4_1 amb4_1 shar4_1 attr2_1 sinc2_1 intel2_1 fun2_1 amb2_1 shar2_1 attr3_1 sinc3_1 fun3_1 intel3_1 amb3_1 attr5_1 sinc5_1 intel5_1 fun5_1 amb5_1 dec attr sinc intel fun amb shar like prob met match_es attr1_s sinc1_s intel1_s fun1_s amb1_s shar1_s attr3_s sinc3_s intel3_s fun3_s amb3_s satis_2 length numdat_2 attr7_2 sinc7_2 intel7_2 fun7_2 amb7_2 shar7_2 attr1_2 sinc1_2 intel1_2 fun1_2 amb1_2 shar1_2 attr4_2 sinc4_2 intel4_2 fun4_2 amb4_2 shar4_2 attr2_2 sinc2_2 intel2_2 fun2_2 amb2_2 shar2_2 attr3_2 sinc3_2 intel3_2 fun3_2 amb3_2 attr5_2 sinc5_2 intel5_2 fun5_2 amb5_2 you_call them_cal date_3 numdat_3 num_in_3 attr1_3 sinc1_3 intel1_3 fun1_3 amb1_3 shar1_3 attr7_3 sinc7_3 intel7_3 fun7_3 amb7_3 shar7_3 attr4_3 sinc4_3 intel4_3 fun4_3 amb4_3 shar4_3 attr2_3 sinc2_3 intel2_3 fun2_3 amb2_3 shar2_3 attr3_3 sinc3_3 intel3_3 fun3_3 amb3_3 attr5_3 sinc5_3 intel5_3 fun5_3 amb5_3
0 1 1.0 0 1 1 1 10 7 NaN 4 1 11.0 0 0.14 0 27.0 2.0 35.0 20.0 20.0 20.0 0.0 5.0 0 6.0 8.0 8.0 8.0 8.0 6.0 7.0 4.0 2.0 21.0 Law 1.0 NaN NaN NaN 4.0 2.0 4.0 Chicago 60,521 69,487.00 2.0 7.0 1.0 lawyer NaN 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 3.0 2.0 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN 35.0 20.0 15.0 20.0 5.0 5.0 6.0 8.0 8.0 8.0 7.0 NaN NaN NaN NaN NaN 1 6.0 9.0 7.0 7.0 6.0 5.0 7.0 6.0 2.0 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 2.0 1.0 NaN NaN NaN NaN NaN NaN 19.44 16.67 13.89 22.22 11.11 16.67 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 7.0 8.0 7.0 6.0 NaN NaN NaN NaN NaN 1.0 1.0 0.0 NaN NaN 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.0 7.0 7.0 7.0 7.0 NaN NaN NaN NaN NaN
1 1 1.0 0 1 1 1 10 7 NaN 3 2 12.0 0 0.54 0 22.0 2.0 60.0 0.0 0.0 40.0 0.0 0.0 0 7.0 8.0 10.0 7.0 7.0 5.0 8.0 4.0 2.0 21.0 Law 1.0 NaN NaN NaN 4.0 2.0 4.0 Chicago 60,521 69,487.00 2.0 7.0 1.0 lawyer NaN 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 3.0 2.0 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN 35.0 20.0 15.0 20.0 5.0 5.0 6.0 8.0 8.0 8.0 7.0 NaN NaN NaN NaN NaN 1 7.0 8.0 7.0 8.0 5.0 6.0 7.0 5.0 1.0 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 2.0 1.0 NaN NaN NaN NaN NaN NaN 19.44 16.67 13.89 22.22 11.11 16.67 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 7.0 8.0 7.0 6.0 NaN NaN NaN NaN NaN 1.0 1.0 0.0 NaN NaN 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.0 7.0 7.0 7.0 7.0 NaN NaN NaN NaN NaN

In [6]:
raw_dataset_copy = raw_dataset.copy()  # .copy() keeps the backup independent; plain assignment only adds another reference

In [7]:
columns_by_types = raw_dataset.columns.to_series().groupby(raw_dataset.dtypes).groups

In [8]:
raw_dataset.dtypes.value_counts()


Out[8]:
float64    174
int64       13
object       8
dtype: int64

In [9]:
raw_dataset.isnull().sum().head(3)


Out[9]:
iid       0
id        1
gender    0
dtype: int64

In [10]:
summary = raw_dataset.describe() #.transpose()
#print (summary.head())

In [11]:
#raw_dataset.groupby("gender").agg({"iid": pd.Series.nunique})
raw_dataset.groupby('gender').iid.nunique()


Out[11]:
gender
0    274
1    277
Name: iid, dtype: int64

In [12]:
raw_dataset.groupby('career').iid.nunique().sort_values(ascending=False).head(5)


Out[12]:
career
Finance          13
professor        12
Lawyer           11
Professor        10
Social Worker     9
Name: iid, dtype: int64

In [13]:
raw_dataset.groupby(["gender","match"]).iid.nunique()


Out[13]:
gender  match
0       0        274
        1        221
1       0        277
        1        231
Name: iid, dtype: int64

Data processing


In [14]:
local_path = "/Users/sandrapietrowska/Documents/Trainings/luigi/data_source/"
local_filename = "Speed_Dating_Data.csv"
my_variables_selection = ["iid", "pid", "match","gender","date","go_out","sports","tvsports","exercise","dining",
                          "museums","art","hiking","gaming","clubbing","reading","tv","theater","movies",
                          "concerts","music","shopping","yoga"]

In [15]:
class RawSetProcessing():
    """
    This class aims to load and clean the dataset.
    """
    def __init__(self, source_path, filename, features):
        self.source_path = source_path
        self.filename = filename
        self.features = features

    # Load data
    def load_data(self):
        raw_dataset_df = pd.read_csv(self.source_path + self.filename, encoding="ISO-8859-1")
        return raw_dataset_df

    # Select variables to process and include in the model
    def subset_features(self, df):
        sel_vars_df = df[self.features]
        return sel_vars_df

    # Remove ids with missing values
    @staticmethod
    def remove_ids_with_missing_values(df):
        sel_vars_filled_df = df.dropna()
        return sel_vars_filled_df

    @staticmethod
    def drop_duplicated_values(df):
        df = df.drop_duplicates()
        return df

    # Combine processing stages
    def combiner_pipeline(self):
        raw_dataset = self.load_data()
        subset_df = self.subset_features(raw_dataset)
        subset_no_dup_df = self.drop_duplicated_values(subset_df)
        subset_filled_df = self.remove_ids_with_missing_values(subset_no_dup_df)
        return subset_filled_df

In [16]:
raw_set = RawSetProcessing(local_path, local_filename, my_variables_selection)
dataset_df = raw_set.combiner_pipeline()

In [17]:
dataset_df.head(2)


Out[17]:
iid pid match gender date go_out sports tvsports exercise dining museums art hiking gaming clubbing reading tv theater movies concerts music shopping yoga
0 1 11.0 0 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0
1 1 12.0 0 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0

In [18]:
# Number of unique participants
dataset_df.iid.nunique()


Out[18]:
543

In [19]:
dataset_df.shape


Out[19]:
(8271, 23)

Feature engineering


In [20]:
suffix_me = "_me"
suffix_partner = "_partner"
my_label = "match_perc"

In [21]:
class FeatureEngineering():
    """
    This class builds the model features: it pairs each participant with the
    interests of their partner and derives the match-rate variables.
    """
    def __init__(self, suffix_1, suffix_2, label):
        self.suffix_1 = suffix_1
        self.suffix_2 = suffix_2
        self.label = label

    def get_partner_features(self, df, ignore_vars=True):
        df_partner = df.copy()
        if ignore_vars is True:
            df_partner = df_partner.drop(['pid', 'match'], axis=1).drop_duplicates()
        else:
            df_partner = df_partner.copy()
        # Join each row with its partner's row (pid -> iid) and suffix the columns
        merged_datasets = df.merge(df_partner, how="inner", left_on="pid", right_on="iid",
                                   suffixes=(self.suffix_1, self.suffix_2))
        return merged_datasets

    def add_success_failure_match(self, df):
        df['total_match'] = df['match'].groupby(df['iid']).transform('sum')
        df['total_dates'] = df['match'].groupby(df['iid']).transform('count')
        df['total_nomatch'] = df['total_dates'] - df['total_match']
        df['match_perc'] = df['total_match'] / df['total_dates']
        return df

    def label_to_categories(self, df):
        df['match_success'] = pd.cut(df[self.label], bins=(0, 0.2, 1), include_lowest=True)
        return df

    @staticmethod
    def aggregate_data(df):
        # Drop the pairing columns and deduplicate to one row per participant
        model_set = df.drop(["pid", "match"], axis=1)
        model_set = model_set.drop_duplicates()
        return model_set

    # Combine engineering stages
    def combiner_pipeline(self, df):
        add_match_feat_df = self.add_success_failure_match(df)
        labels_df = self.label_to_categories(add_match_feat_df)
        model_set = self.aggregate_data(labels_df)
        return model_set

In [22]:
feat_eng = FeatureEngineering(suffix_me, suffix_partner, my_label)
#feat_engineered_model_1_df = feat_eng.combiner_pipeline(dataset_df)
#feat_engineered_model_1_df.head(2)

In [23]:
feat_engineered_df = feat_eng.get_partner_features(dataset_df)
feat_engineered_df.head(2)


Out[23]:
iid_me pid match gender_me date_me go_out_me sports_me tvsports_me exercise_me dining_me museums_me art_me hiking_me gaming_me clubbing_me reading_me tv_me theater_me movies_me concerts_me music_me shopping_me yoga_me iid_partner gender_partner date_partner go_out_partner sports_partner tvsports_partner exercise_partner dining_partner museums_partner art_partner hiking_partner gaming_partner clubbing_partner reading_partner tv_partner theater_partner movies_partner concerts_partner music_partner shopping_partner yoga_partner
0 1 11.0 0 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 11 1 5.0 4.0 8.0 7.0 2.0 6.0 7.0 5.0 5.0 5.0 4.0 9.0 2.0 4.0 8.0 7.0 8.0 5.0 1.0
1 2 11.0 0 0 5.0 1.0 3.0 2.0 7.0 10.0 8.0 6.0 3.0 5.0 8.0 10.0 1.0 9.0 8.0 7.0 8.0 3.0 1.0 11 1 5.0 4.0 8.0 7.0 2.0 6.0 7.0 5.0 5.0 5.0 4.0 9.0 2.0 4.0 8.0 7.0 8.0 5.0 1.0

In [24]:
feat_engineered_df.groupby("match").iid_me.count()


Out[24]:
match
0    6826
1    1348
Name: iid_me, dtype: int64

Modelling

This model aims to predict whether a date ends in a match, based on the interests of both people; a quick sanity check of how these interests relate to the outcome follows the variable list below.

Variables:

  • gender
  • date (In general, how frequently do you go on dates?)
  • go_out (How often do you go out (not necessarily on dates)?)
  • sports: Playing sports/ athletics
  • tvsports: Watching sports
  • exercise: Body building/exercising
  • dining: Dining out
  • museums: Museums/galleries
  • art: Art
  • hiking: Hiking/camping
  • gaming: Gaming
  • clubbing: Dancing/clubbing
  • reading: Reading
  • tv: Watching TV
  • theater: Theater
  • movies: Movies
  • concerts: Going to concerts
  • music: Music
  • shopping: Shopping
  • yoga: Yoga/meditation
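
As a quick, informal check that these interests carry signal for the label, their average ratings can be compared between matched and unmatched dates (a sketch on the feat_engineered_df built above; the columns picked here are only illustrative):

In [ ]:
# Mean interest ratings split by match outcome (0 = no match, 1 = match)
feat_engineered_df.groupby("match")[["sports_me", "reading_me", "clubbing_partner"]].mean()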

In [25]:
import sklearn
print (sklearn.__version__)


0.18.1

In [26]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import subprocess

Variables selection


In [27]:
#features = list(["gender","age_o","race_o","goal","samerace","imprace","imprelig","date","go_out","career_c"])
features = list(['iid',"gender","date","go_out","sports","tvsports","exercise","dining","museums","art",
                 "hiking","gaming","clubbing","reading","tv","theater","movies","concerts","music",
                 "shopping","yoga"])
label = "match"

In [28]:
#add suffix to each element of list
def process_features_names(features, suffix_1, suffix_2):
    features_me = [feat + suffix_1 for feat in features]
    features_partner = [feat + suffix_2 for feat in features]
    features_all = features_me + features_partner
    return features_all

features_model = process_features_names(features, suffix_me, suffix_partner)

In [29]:
explanatory = feat_engineered_df[features_model]
explained = feat_engineered_df[label]

In [30]:
explanatory[explanatory["iid_me"] == 1].head(5)


Out[30]:
iid_me gender_me date_me go_out_me sports_me tvsports_me exercise_me dining_me museums_me art_me hiking_me gaming_me clubbing_me reading_me tv_me theater_me movies_me concerts_me music_me shopping_me yoga_me iid_partner gender_partner date_partner go_out_partner sports_partner tvsports_partner exercise_partner dining_partner museums_partner art_partner hiking_partner gaming_partner clubbing_partner reading_partner tv_partner theater_partner movies_partner concerts_partner music_partner shopping_partner yoga_partner
0 1 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 11 1 5.0 4.0 8.0 7.0 2.0 6.0 7.0 5.0 5.0 5.0 4.0 9.0 2.0 4.0 8.0 7.0 8.0 5.0 1.0
10 1 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 12 1 1.0 1.0 9.0 7.0 9.0 8.0 7.0 6.0 3.0 3.0 5.0 6.0 6.0 4.0 7.0 7.0 9.0 5.0 5.0
20 1 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 13 1 7.0 1.0 7.0 8.0 2.0 9.0 5.0 6.0 4.0 7.0 7.0 6.0 8.0 10.0 8.0 9.0 9.0 8.0 1.0
30 1 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 14 1 4.0 1.0 10.0 6.0 8.0 8.0 3.0 3.0 10.0 8.0 8.0 6.0 7.0 3.0 10.0 6.0 8.0 6.0 1.0
40 1 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 15 1 4.0 1.0 9.0 7.0 9.0 7.0 4.0 3.0 6.0 7.0 9.0 8.0 6.0 9.0 9.0 6.0 7.0 2.0 1.0

In [ ]:

Tuning


In [31]:
from sklearn import ensemble

In [32]:
warnings.filterwarnings("ignore")

In [33]:
# Parameters for Random Forest

parameters = [
  {'max_depth': [8,10,12,14,16,18], 
   'min_samples_split': [10,15,20,25,30], 
   'min_samples_leaf': [10,15,20,25,30]
  }
]
scores = ['precision', 'recall']

In [34]:
RFModel = ensemble.RandomForestClassifier(n_estimators=5, oob_score=False)

In [41]:
class TuneParameters():
    def __init__(self, explanatory_vars, explained_var, estimator, parameters, scores):
        self.explanatory_vars = explanatory_vars
        self.explained_var = explained_var
        self.estimator = estimator
        self.parameters = parameters
        self.scores = scores
    
    def create_train_test_splits(self):
        X_train, X_test, y_train, y_test = train_test_split(self.explanatory_vars, self.explained_var, 
                                                            test_size = 0.5, random_state = 0,
                                                            stratify = self.explained_var)
        return X_train, X_test, y_train, y_test 
    
    def tuning_parameters(self, trainset_x, testset_x, trainset_y, testset_y):
        for score in self.scores:
            print("# Tuning hyper-parameters for %s" % score)
            print("")

            grid_rfc = GridSearchCV(self.estimator, self.parameters,n_jobs=100, cv=10, refit=True,
                               scoring='%s_macro' % score)
            grid_rfc.fit(trainset_x, trainset_y)

            print("Best parameters set found on development set:")
            print("")
            print(grid_rfc.best_params_)
            print("")
            y_true, y_pred = testset_y, grid_rfc.predict(testset_x)
            print(classification_report(y_true, y_pred))
            print("")

            # NB: returning inside the loop stops after the first metric,
            # so only scores[0] (here 'precision') is actually used for tuning.
            best_parameters = grid_rfc.best_estimator_.get_params()
            return best_parameters
        
    def combiner_pipeline(self):
        X_train, X_test, y_train, y_test = self.create_train_test_splits()
        best_params = self.tuning_parameters(X_train, X_test, y_train, y_test)
        return best_params

In [42]:
tune = TuneParameters(explanatory, explained, RFModel, parameters, scores)
best_parameters = tune.combiner_pipeline()
X_train, X_test, y_train, y_test  = tune.create_train_test_splits()


# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'max_depth': 12, 'min_samples_split': 30, 'min_samples_leaf': 10}

             precision    recall  f1-score   support

          0       0.84      1.00      0.91      3413
          1       0.69      0.02      0.03       674

avg / total       0.81      0.84      0.77      4087


Training


In [43]:
estimator_RFC = ensemble.RandomForestClassifier()

In [44]:
best_parameters


Out[44]:
{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 12,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_split': 1e-07,
 'min_samples_leaf': 10,
 'min_samples_split': 30,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 5,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [45]:
class Trainer():
    def __init__(self, X_train, y_train, X_test, y_test, best_params):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.estimator = None
        self.best_params = best_params

    def build_best_estimator(self):
        params = self.best_params
        model = ensemble.RandomForestClassifier(**params)
        self.estimator = model.fit(self.X_train,self.y_train)
        return self.estimator

    def score_estimator_train(self):
        return self.estimator.score(self.X_train,self.y_train)

    def score_estimator_test(self):
        return self.estimator.score(self.X_test,self.y_test)
    
    def combiner_pipeline(self):
        self.estimator = self.build_best_estimator()
        score_train = self.score_estimator_train()
        score_test = self.score_estimator_test()
        return self.estimator, score_train, score_test

In [46]:
train = Trainer(X_train, y_train, X_test, y_test, best_parameters)
estimator, score_train, score_test = train.combiner_pipeline()
print (estimator, score_train, score_test)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=12, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=10,
            min_samples_split=30, min_weight_fraction_leaf=0.0,
            n_estimators=5, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False) 0.841448495229 0.832884756545

Evaluate
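
A minimal sketch of an evaluation cell, reusing the fitted estimator and the X_test/y_test split from the training step above (a hypothetical cell, not part of the original run):

In [ ]:
from sklearn.metrics import classification_report, confusion_matrix

# Predictions of the tuned random forest on the held-out half of the data
y_pred = estimator.predict(X_test)

# Rows are true classes (0 = no match, 1 = match), columns are predicted classes
print(confusion_matrix(y_test, y_pred))

# Per-class precision, recall and F1 on the test split
print(classification_report(y_test, y_pred))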


In [ ]:

Test


In [38]:
import unittest
from pandas.util.testing import assert_frame_equal

There is an odd issue: the code does not work when the helper is called via self.XX (I also tried self.assertEqual), so get_partner_features is redefined below as a standalone function.


In [39]:
def get_partner_features(df, suffix_1, suffix_2, ignore_vars=True):
    df_partner = df.copy()
    if ignore_vars is True:
        df_partner = df_partner.drop(['pid', 'match'], axis=1).drop_duplicates()
    else:
        df_partner = df_partner.copy()
    merged_datasets = df.merge(df_partner, how="inner", left_on="pid", right_on="iid",
                               suffixes=(suffix_1, suffix_2))
    return merged_datasets

In [46]:
class FeatureEngineeringTest(unittest.TestCase):
    def test_get_partner_features(self):
        """

        :return:
        """
        # Given
        raw_data_a = {
            'iid': ['1', '2', '3', '4', '5','6'],
            'first_name': ['Sue', 'Maria', 'Sandra', 'Bill', 'Brian','Bruce'],
            'sport':['foot','run','volley','basket','swim','tv'],
            'pid': ['4', '5', '6', '1', '2','3'],}
        
        df_a = pd.DataFrame(raw_data_a, columns = ['iid', 'first_name', 'sport','pid'])
        
        expected_data = {
            'iid_me': ['1', '2', '3', '4', '5','6'],
            'first_name_me': ['Sue', 'Maria', 'Sandra', 'Bill', 'Brian','Bruce'],
            'sport_me': ['foot','run','volley','basket','swim','tv'],
            'pid_me': ['4', '5', '6', '1', '2','3'],
            'iid_partner': ['4', '5', '6', '1', '2','3'],
            'first_name_partner': ['Bill', 'Brian','Bruce','Sue', 'Maria', 'Sandra'],
            'sport_partner': ['basket','swim','tv','foot','run','volley'],
            'pid_partner':['1', '2', '3', '4', '5','6']}
        
        expected_output_values = pd.DataFrame(expected_data, 
                                              columns = ['iid_me','first_name_me','sport_me','pid_me',
                                                         'iid_partner','first_name_partner','sport_partner',
                                                         'pid_partner'])

        # When

        output_values = get_partner_features(df_a, "_me","_partner",ignore_vars=False)

        # Then

        assert_frame_equal(output_values, expected_output_values)

In [47]:
suite = unittest.TestLoader().loadTestsFromTestCase(FeatureEngineeringTest)
unittest.TextTestRunner(verbosity=2).run(suite)


test_get_partner_features (__main__.FeatureEngineeringTest) ... ok

----------------------------------------------------------------------
Ran 1 test in 0.009s

OK
Out[47]:
<unittest.runner.TextTestResult run=1 errors=0 failures=0>

In [ ]: