In [75]:
import os
import subprocess
import warnings

import numpy as np
import pandas as pd
import sklearn
pd.set_option('display.max_columns', None)

In [21]:
source_path = "/Users/sandrapietrowska/Documents/Trainings/luigi/data_source/"

In [47]:
# coding: ISO-8859-1

Import data


In [29]:
raw_dataset = pd.read_csv(source_path + "Speed_Dating_Data.csv",encoding = "ISO-8859-1")

Data exploration

Shape, types, distribution, modalities and potential missing values


In [30]:
raw_dataset.head(3)


Out[30]:
iid id gender idg condtn wave round position positin1 order partner pid match int_corr samerace age_o race_o pf_o_att pf_o_sin pf_o_int pf_o_fun pf_o_amb pf_o_sha dec_o attr_o sinc_o intel_o fun_o amb_o shar_o like_o prob_o met_o age field field_cd undergra mn_sat tuition race imprace imprelig from zipcode income goal date go_out career career_c sports tvsports exercise dining museums art hiking gaming clubbing reading tv theater movies concerts music shopping yoga exphappy expnum attr1_1 sinc1_1 intel1_1 fun1_1 amb1_1 shar1_1 attr4_1 sinc4_1 intel4_1 fun4_1 amb4_1 shar4_1 attr2_1 sinc2_1 intel2_1 fun2_1 amb2_1 shar2_1 attr3_1 sinc3_1 fun3_1 intel3_1 amb3_1 attr5_1 sinc5_1 intel5_1 fun5_1 amb5_1 dec attr sinc intel fun amb shar like prob met match_es attr1_s sinc1_s intel1_s fun1_s amb1_s shar1_s attr3_s sinc3_s intel3_s fun3_s amb3_s satis_2 length numdat_2 attr7_2 sinc7_2 intel7_2 fun7_2 amb7_2 shar7_2 attr1_2 sinc1_2 intel1_2 fun1_2 amb1_2 shar1_2 attr4_2 sinc4_2 intel4_2 fun4_2 amb4_2 shar4_2 attr2_2 sinc2_2 intel2_2 fun2_2 amb2_2 shar2_2 attr3_2 sinc3_2 intel3_2 fun3_2 amb3_2 attr5_2 sinc5_2 intel5_2 fun5_2 amb5_2 you_call them_cal date_3 numdat_3 num_in_3 attr1_3 sinc1_3 intel1_3 fun1_3 amb1_3 shar1_3 attr7_3 sinc7_3 intel7_3 fun7_3 amb7_3 shar7_3 attr4_3 sinc4_3 intel4_3 fun4_3 amb4_3 shar4_3 attr2_3 sinc2_3 intel2_3 fun2_3 amb2_3 shar2_3 attr3_3 sinc3_3 intel3_3 fun3_3 amb3_3 attr5_3 sinc5_3 intel5_3 fun5_3 amb5_3
0 1 1.0 0 1 1 1 10 7 NaN 4 1 11.0 0 0.14 0 27.0 2.0 35.0 20.0 20.0 20.0 0.0 5.0 0 6.0 8.0 8.0 8.0 8.0 6.0 7.0 4.0 2.0 21.0 Law 1.0 NaN NaN NaN 4.0 2.0 4.0 Chicago 60,521 69,487.00 2.0 7.0 1.0 lawyer NaN 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 3.0 2.0 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN 35.0 20.0 15.0 20.0 5.0 5.0 6.0 8.0 8.0 8.0 7.0 NaN NaN NaN NaN NaN 1 6.0 9.0 7.0 7.0 6.0 5.0 7.0 6.0 2.0 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 2.0 1.0 NaN NaN NaN NaN NaN NaN 19.44 16.67 13.89 22.22 11.11 16.67 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 7.0 8.0 7.0 6.0 NaN NaN NaN NaN NaN 1.0 1.0 0.0 NaN NaN 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.0 7.0 7.0 7.0 7.0 NaN NaN NaN NaN NaN
1 1 1.0 0 1 1 1 10 7 NaN 3 2 12.0 0 0.54 0 22.0 2.0 60.0 0.0 0.0 40.0 0.0 0.0 0 7.0 8.0 10.0 7.0 7.0 5.0 8.0 4.0 2.0 21.0 Law 1.0 NaN NaN NaN 4.0 2.0 4.0 Chicago 60,521 69,487.00 2.0 7.0 1.0 lawyer NaN 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 3.0 2.0 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN 35.0 20.0 15.0 20.0 5.0 5.0 6.0 8.0 8.0 8.0 7.0 NaN NaN NaN NaN NaN 1 7.0 8.0 7.0 8.0 5.0 6.0 7.0 5.0 1.0 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 2.0 1.0 NaN NaN NaN NaN NaN NaN 19.44 16.67 13.89 22.22 11.11 16.67 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 7.0 8.0 7.0 6.0 NaN NaN NaN NaN NaN 1.0 1.0 0.0 NaN NaN 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.0 7.0 7.0 7.0 7.0 NaN NaN NaN NaN NaN
2 1 1.0 0 1 1 1 10 7 NaN 10 3 13.0 1 0.16 1 22.0 4.0 19.0 18.0 19.0 18.0 14.0 12.0 1 10.0 10.0 10.0 10.0 10.0 10.0 10.0 10.0 1.0 21.0 Law 1.0 NaN NaN NaN 4.0 2.0 4.0 Chicago 60,521 69,487.00 2.0 7.0 1.0 lawyer NaN 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 3.0 2.0 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN 35.0 20.0 15.0 20.0 5.0 5.0 6.0 8.0 8.0 8.0 7.0 NaN NaN NaN NaN NaN 1 5.0 8.0 9.0 8.0 5.0 7.0 7.0 NaN 1.0 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 2.0 1.0 NaN NaN NaN NaN NaN NaN 19.44 16.67 13.89 22.22 11.11 16.67 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 7.0 8.0 7.0 6.0 NaN NaN NaN NaN NaN 1.0 1.0 0.0 NaN NaN 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.0 7.0 7.0 7.0 7.0 NaN NaN NaN NaN NaN

In [31]:
# Take an explicit copy: plain assignment would only alias the same DataFrame,
# so any later in-place edit of raw_dataset_copy would silently mutate raw_dataset.
raw_dataset_copy = raw_dataset.copy()

In [32]:
check1 = raw_dataset_copy[raw_dataset_copy["iid"] == 1]
check1_sel = check1[["iid", "pid", "match","gender","date","go_out","sports","tvsports","exercise","dining",
                          "museums","art","hiking","gaming","clubbing","reading","tv","theater","movies",
                          "concerts","music","shopping","yoga"]]

In [33]:
check1_sel.drop_duplicates().head(20)


Out[33]:
iid pid match gender date go_out sports tvsports exercise dining museums art hiking gaming clubbing reading tv theater movies concerts music shopping yoga
0 1 11.0 0 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0
1 1 12.0 0 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0
2 1 13.0 1 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0
3 1 14.0 1 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0
4 1 15.0 1 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0
5 1 16.0 0 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0
6 1 17.0 0 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0
7 1 18.0 0 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0
8 1 19.0 1 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0
9 1 20.0 0 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0

In [34]:
#merged_datasets = raw_dataset.merge(raw_dataset_copy, left_on="pid", right_on="iid")
#merged_datasets[["iid_x","gender_x","pid_y","gender_y"]].head(5)
#same_gender = merged_datasets[merged_datasets["gender_x"] == merged_datasets["gender_y"]]
#same_gender.head()

In [35]:
columns_by_types = raw_dataset.columns.to_series().groupby(raw_dataset.dtypes).groups

In [36]:
raw_dataset.dtypes.value_counts()


Out[36]:
float64    174
int64       13
object       8
dtype: int64

In [37]:
raw_dataset.isnull().sum().head(3)


Out[37]:
iid       0
id        1
gender    0
dtype: int64

In [85]:
summary = raw_dataset.describe() #.transpose()
print (summary.head())


               iid           id       gender          idg       condtn  \
count  8378.000000  8377.000000  8378.000000  8378.000000  8378.000000   
mean    283.675937     8.960248     0.500597    17.327166     1.828837   
std     158.583367     5.491329     0.500029    10.940735     0.376673   
min       1.000000     1.000000     0.000000     1.000000     1.000000   
25%     154.000000     4.000000     0.000000     8.000000     2.000000   

              wave        round     position     positin1        order  \
count  8378.000000  8378.000000  8378.000000  6532.000000  8378.000000   
mean     11.350919    16.872046     9.042731     9.295775     8.927668   
std       5.995903     4.358458     5.514939     5.650199     5.477009   
min       1.000000     5.000000     1.000000     1.000000     1.000000   
25%       7.000000    14.000000     4.000000     4.000000     4.000000   

           partner          pid        match     int_corr     samerace  \
count  8378.000000  8368.000000  8378.000000  8220.000000  8378.000000   
mean      8.963595   283.863767     0.164717     0.196010     0.395799   
std       5.491068   158.584899     0.370947     0.303539     0.489051   
min       1.000000     1.000000     0.000000    -0.830000     0.000000   
25%       4.000000   154.000000     0.000000    -0.020000     0.000000   

             age_o       race_o     pf_o_att     pf_o_sin     pf_o_int  \
count  8274.000000  8305.000000  8289.000000  8289.000000  8289.000000   
mean     26.364999     2.756653    22.495347    17.396867    20.270759   
std       3.563648     1.230689    12.569802     7.044003     6.782895   
min      18.000000     1.000000     0.000000     0.000000     0.000000   
25%      24.000000     2.000000    15.000000    15.000000    17.390000   

          pf_o_fun     pf_o_amb     pf_o_sha        dec_o       attr_o  \
count  8280.000000  8271.000000  8249.000000  8378.000000  8166.000000   
mean     17.459714    10.685375    11.845930     0.419551     6.190411   
std       6.085526     6.126544     6.362746     0.493515     1.950305   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%      15.000000     5.000000     9.520000     0.000000     5.000000   

            sinc_o      intel_o        fun_o        amb_o       shar_o  \
count  8091.000000  8072.000000  8018.000000  7656.000000  7302.000000   
mean      7.175256     7.369301     6.400599     6.778409     5.474870   
std       1.740575     1.550501     1.954078     1.794080     2.156163   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       6.000000     6.000000     5.000000     6.000000     4.000000   

            like_o       prob_o        met_o          age     field_cd  \
count  8128.000000  8060.000000  7993.000000  8283.000000  8296.000000   
mean      6.134498     5.208251     1.960215    26.358928     7.662488   
std       1.841258     2.129354     0.245925     3.566763     3.758935   
min       0.000000     0.000000     1.000000    18.000000     1.000000   
25%       5.000000     4.000000     2.000000    24.000000     5.000000   

              race      imprace     imprelig         goal         date  \
count  8315.000000  8299.000000  8299.000000  8299.000000  8281.000000   
mean      2.757186     3.784793     3.651645     2.122063     5.006762   
std       1.230905     2.845708     2.805237     1.407181     1.444531   
min       1.000000     0.000000     1.000000     1.000000     1.000000   
25%       2.000000     1.000000     1.000000     1.000000     4.000000   

            go_out     career_c       sports     tvsports     exercise  \
count  8299.000000  8240.000000  8299.000000  8299.000000  8299.000000   
mean      2.158091     5.277791     6.425232     4.575491     6.245813   
std       1.105246     3.309520     2.619024     2.801874     2.418858   
min       1.000000     1.000000     1.000000     1.000000     1.000000   
25%       1.000000     2.000000     4.000000     2.000000     5.000000   

            dining      museums          art       hiking       gaming  \
count  8299.000000  8299.000000  8299.000000  8299.000000  8299.000000   
mean      7.783829     6.985781     6.714544     5.737077     3.881191   
std       1.754868     2.052232     2.263407     2.570207     2.620507   
min       1.000000     0.000000     0.000000     0.000000     0.000000   
25%       7.000000     6.000000     5.000000     4.000000     2.000000   

          clubbing      reading           tv      theater       movies  \
count  8299.000000  8299.000000  8299.000000  8299.000000  8299.000000   
mean      5.745993     7.678515     5.304133     6.776118     7.919629   
std       2.502218     2.006565     2.529135     2.235152     1.700927   
min       0.000000     1.000000     1.000000     0.000000     0.000000   
25%       4.000000     7.000000     3.000000     5.000000     7.000000   

          concerts        music     shopping         yoga     exphappy  \
count  8299.000000  8299.000000  8299.000000  8299.000000  8277.000000   
mean      6.825401     7.851066     5.631281     4.339197     5.534131   
std       2.156283     1.791827     2.608913     2.717612     1.734059   
min       0.000000     1.000000     1.000000     0.000000     1.000000   
25%       5.000000     7.000000     4.000000     2.000000     5.000000   

            expnum      attr1_1      sinc1_1     intel1_1       fun1_1  \
count  1800.000000  8299.000000  8299.000000  8299.000000  8289.000000   
mean      5.570556    22.514632    17.396389    20.265613    17.457043   
std       4.762569    12.587674     7.046700     6.783003     6.085239   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       2.000000    15.000000    15.000000    17.390000    15.000000   

            amb1_1      shar1_1      attr4_1      sinc4_1     intel4_1  \
count  8279.000000  8257.000000  6489.000000  6489.000000  6489.000000   
mean     10.682539    11.845111    26.394360    11.071506    12.636308   
std       6.124888     6.362154    16.297045     6.659233     6.717476   
min       0.000000     0.000000     5.000000     0.000000     0.000000   
25%       5.000000     9.520000    10.000000     6.000000     8.000000   

            fun4_1       amb4_1      shar4_1      attr2_1      sinc2_1  \
count  6489.000000  6489.000000  6467.000000  8299.000000  8299.000000   
mean     15.566805     9.780089    11.014845    30.362192    13.273691   
std       7.328256     6.998428     6.060150    16.249937     6.976775   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%      10.000000     5.000000     7.000000    20.000000    10.000000   

          intel2_1       fun2_1       amb2_1      shar2_1      attr3_1  \
count  8299.000000  8299.000000  8289.000000  8289.000000  8273.000000   
mean     14.416891    18.422620    11.744499    11.854817     7.084733   
std       6.263304     6.577929     6.886532     6.167314     1.395783   
min       0.000000     0.000000     0.000000     0.000000     2.000000   
25%      10.000000    15.000000     6.000000    10.000000     6.000000   

           sinc3_1       fun3_1     intel3_1       amb3_1      attr5_1  \
count  8273.000000  8273.000000  8273.000000  8273.000000  4906.000000   
mean      8.294935     7.704460     8.403965     7.578388     6.941908   
std       1.407460     1.564321     1.076608     1.778315     1.498653   
min       2.000000     2.000000     3.000000     2.000000     2.000000   
25%       8.000000     7.000000     8.000000     7.000000     6.000000   

           sinc5_1     intel5_1       fun5_1       amb5_1          dec  \
count  4906.000000  4906.000000  4906.000000  4906.000000  8378.000000   
mean      7.927232     8.284346     7.426213     7.617611     0.419909   
std       1.627054     1.283657     1.779129     1.773094     0.493573   
min       1.000000     3.000000     2.000000     1.000000     0.000000   
25%       7.000000     8.000000     6.000000     7.000000     0.000000   

              attr         sinc        intel          fun          amb  \
count  8176.000000  8101.000000  8082.000000  8028.000000  7666.000000   
mean      6.189995     7.175164     7.368597     6.400598     6.777524   
std       1.950169     1.740315     1.550453     1.953702     1.794055   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       5.000000     6.000000     6.000000     5.000000     6.000000   

              shar         like         prob          met     match_es  \
count  7311.000000  8138.000000  8069.000000  8003.000000  7205.000000   
mean      5.474559     6.134087     5.207523     0.948769     3.207814   
std       2.156363     1.841285     2.129565     0.989889     2.444813   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       4.000000     5.000000     4.000000     0.000000     2.000000   

           attr1_s      sinc1_s     intel1_s       fun1_s       amb1_s  \
count  4096.000000  4096.000000  4096.000000  4096.000000  4096.000000   
mean     20.791624    15.434255    17.243708    15.260869    11.144619   
std      12.968524     6.915322     6.596420     5.356969     5.514028   
min       3.000000     0.000000     0.000000     1.000000     0.000000   
25%      14.810000    10.000000    10.000000    10.000000     7.000000   

           shar1_s     attr3_s      sinc3_s     intel3_s       fun3_s  \
count  4096.000000  4000.00000  4000.000000  4000.000000  4000.000000   
mean     12.457925     7.21125     8.082000     8.257750     7.692500   
std       5.921789     1.41545     1.455741     1.179317     1.626839   
min       0.000000     3.00000     1.000000     4.000000     3.000000   
25%       9.000000     7.00000     7.000000     8.000000     7.000000   

            amb3_s      satis_2       length     numdat_2      attr7_2  \
count  4000.000000  7463.000000  7463.000000  7433.000000  1984.000000   
mean      7.589250     5.711510     1.843495     2.338087    32.819556   
std       1.793136     1.820764     0.975662     0.631240    17.155270   
min       2.000000     1.000000     1.000000     1.000000    10.000000   
25%       7.000000     5.000000     1.000000     2.000000    20.000000   

           sinc7_2     intel7_2       fun7_2       amb7_2      shar7_2  \
count  1955.000000  1984.000000  1984.000000  1955.000000  1974.000000   
mean     13.529923    15.293851    18.868448     7.286957    12.156028   
std       7.977482     7.292868     8.535963     6.125187     8.241906   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%      10.000000    10.000000    10.000000     0.000000     5.000000   

           attr1_2      sinc1_2     intel1_2       fun1_2       amb1_2  \
count  7445.000000  7463.000000  7463.000000  7463.000000  7463.000000   
mean     26.217194    15.865084    17.813755    17.654765     9.913436   
std      14.388694     6.658494     6.535894     6.129746     5.675550   
min       5.000000     0.000000     0.000000     0.000000     0.000000   
25%      16.670000    10.000000    15.000000    15.000000     5.000000   

           shar1_2      attr4_2      sinc4_2     intel4_2       fun4_2  \
count  7463.000000  5775.000000  5775.000000  5775.000000  5775.000000   
mean     12.760263    26.806234    11.929177    12.103030    15.163810   
std       6.651547    16.402836     6.401556     5.990607     7.290107   
min       0.000000     6.000000     0.000000     0.000000     0.000000   
25%      10.000000    10.000000     8.000000     8.000000     9.000000   

            amb4_2      shar4_2      attr2_2     sinc2_2     intel2_2  \
count  5775.000000  5775.000000  5775.000000  5775.00000  5775.000000   
mean      9.342511    11.320866    29.344369    13.89823    13.958265   
std       5.856329     6.296155    14.551171     6.17169     5.398621   
min       0.000000     0.000000     0.000000     0.00000     0.000000   
25%       5.000000     7.000000    19.150000    10.00000    10.000000   

            fun2_2       amb2_2      shar2_2      attr3_2      sinc3_2  \
count  5775.000000  5775.000000  5775.000000  7463.000000  7463.000000   
mean     17.967233    11.909735    12.887976     7.125285     7.931529   
std       6.100307     6.313281     5.615691     1.371390     1.503236   
min       0.000000     0.000000     0.000000     2.000000     2.000000   
25%      15.000000    10.000000    10.000000     7.000000     7.000000   

          intel3_2       fun3_2       amb3_2      attr5_2      sinc5_2  \
count  7463.000000  7463.000000  7463.000000  4377.000000  4377.000000   
mean      8.238912     7.602171     7.486802     6.827964     7.394106   
std       1.180280     1.548200     1.744634     1.411096     1.588145   
min       4.000000     1.000000     2.000000     2.000000     2.000000   
25%       8.000000     7.000000     7.000000     6.000000     6.000000   

          intel5_2       fun5_2       amb5_2     you_call     them_cal  \
count  4377.000000  4377.000000  4377.000000  3974.000000  3974.000000   
mean      7.838702     7.279415     7.332191     0.780825     0.981631   
std       1.280936     1.647478     1.521854     1.611694     1.382139   
min       2.000000     2.000000     2.000000     0.000000     0.000000   
25%       7.000000     6.000000     6.000000     0.000000     0.000000   

            date_3     numdat_3    num_in_3      attr1_3      sinc1_3  \
count  3974.000000  1496.000000  668.000000  3974.000000  3974.000000   
mean      0.376950     1.230615    0.934132    24.384524    16.588583   
std       0.484683     1.294557    0.753902    13.712120     7.471537   
min       0.000000     0.000000    0.000000     0.000000     0.000000   
25%       0.000000     1.000000    1.000000    15.220000    10.000000   

          intel1_3       fun1_3       amb1_3      shar1_3      attr7_3  \
count  3974.000000  3974.000000  3974.000000  3974.000000  2016.000000   
mean     19.411346    16.233415    10.898075    12.699142    31.330357   
std       6.124502     5.163777     5.900697     6.557041    17.551540   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%      16.670000    14.810000     5.000000    10.000000    20.000000   

           sinc7_3     intel7_3       fun7_3       amb7_3      shar7_3  \
count  2016.000000  2016.000000  2016.000000  2016.000000  2016.000000   
mean     15.654266    16.679563    16.418155     7.823909    12.207837   
std       9.336288     7.880088     7.231325     6.100502     8.615985   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%      10.000000    10.000000    10.000000     0.000000     5.000000   

           attr4_3      sinc4_3     intel4_3       fun4_3       amb4_3  \
count  2959.000000  2959.000000  2959.000000  2959.000000  2959.000000   
mean     25.610341    10.751267    11.524839    14.276783     9.207503   
std      17.477134     5.740351     6.004222     6.927869     6.385852   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%      10.000000     7.000000     7.000000     9.000000     5.000000   

           shar4_3      attr2_3      sinc2_3     intel2_3       fun2_3  \
count  2959.000000  2959.000000  2959.000000  2959.000000  2959.000000   
mean     11.253802    24.970936    10.923285    11.952687    14.959108   
std       6.516178    17.007669     6.226283     7.010650     7.935509   
min       0.000000     5.000000     0.000000     0.000000     0.000000   
25%       7.000000    10.000000     7.000000     7.000000     9.000000   

            amb2_3      shar2_3      attr3_3      sinc3_3     intel3_3  \
count  2959.000000  2016.000000  3974.000000  3974.000000  3974.000000   
mean      9.526191    11.966270     7.240312     8.093357     8.388777   
std       6.403117     7.012067     1.576596     1.610309     1.459094   
min       0.000000     0.000000     2.000000     2.000000     3.000000   
25%       6.000000     5.000000     7.000000     7.000000     8.000000   

            fun3_3       amb3_3      attr5_3      sinc5_3     intel5_3  \
count  3974.000000  3974.000000  2016.000000  2016.000000  2016.000000   
mean      7.658782     7.391545     6.810020     7.615079     7.932540   
std       1.744670     1.961417     1.507341     1.504551     1.340868   
min       2.000000     1.000000     2.000000     2.000000     4.000000   
25%       7.000000     6.000000     6.000000     7.000000     7.000000   

            fun5_3       amb5_3  
count  2016.000000  2016.000000  
mean      7.155258     7.048611  
std       1.672787     1.717988  
min       1.000000     1.000000  
25%       6.000000     6.000000  

In [86]:
#raw_dataset.groupby("gender").agg({"iid": pd.Series.nunique})
raw_dataset.groupby('gender').iid.nunique()


Out[86]:
gender
0    274
1    277
Name: iid, dtype: int64

In [87]:
raw_dataset.groupby('career').iid.nunique().sort_values(ascending=False).head(5)


Out[87]:
career
Finance          13
professor        12
Lawyer           11
Professor        10
Social Worker     9
Name: iid, dtype: int64

In [88]:
raw_dataset.groupby(["gender","match"]).iid.nunique()


Out[88]:
gender  match
0       0        274
        1        221
1       0        277
        1        231
Name: iid, dtype: int64

Data processing


In [89]:
local_path = "/Users/sandrapietrowska/Documents/Trainings/luigi/data_source/"
local_filename = "Speed_Dating_Data.csv"
my_variables_selection = ["iid", "pid", "match","gender","date","go_out","sports","tvsports","exercise","dining",
                          "museums","art","hiking","gaming","clubbing","reading","tv","theater","movies",
                          "concerts","music","shopping","yoga"]

In [90]:
class RawSetProcessing(object):
    """
    Load the raw speed-dating CSV and clean it: keep only the selected
    feature columns, drop exact duplicate rows, then remove rows that
    still contain missing values.
    """

    def __init__(self, source_path, filename, features):
        """
        :param source_path: directory containing the data file
        :param filename: name of the CSV file to load
        :param features: list of column names to keep for the model
        """
        self.source_path = source_path
        self.filename = filename
        self.features = features

    def load_data(self):
        """Read the raw CSV; the source data is Latin-1 (ISO-8859-1) encoded."""
        # os.path.join is robust to a missing trailing separator in source_path,
        # unlike plain string concatenation.
        full_path = os.path.join(self.source_path, self.filename)
        raw_dataset_df = pd.read_csv(full_path, encoding="ISO-8859-1")
        return raw_dataset_df

    def subset_features(self, df):
        """Return df restricted to the selected feature columns."""
        sel_vars_df = df[self.features]
        return sel_vars_df

    @staticmethod
    def remove_ids_with_missing_values(df):
        """Drop every row containing at least one missing value."""
        sel_vars_filled_df = df.dropna()
        return sel_vars_filled_df

    @staticmethod
    def drop_duplicated_values(df):
        """Drop exact duplicate rows."""
        df = df.drop_duplicates()
        return df

    def combiner_pipeline(self):
        """Run the full pipeline: load -> subset -> de-duplicate -> drop NA."""
        raw_dataset = self.load_data()
        subset_df = self.subset_features(raw_dataset)
        subset_no_dup_df = self.drop_duplicated_values(subset_df)
        subset_filled_df = self.remove_ids_with_missing_values(subset_no_dup_df)
        return subset_filled_df

In [91]:
raw_set = RawSetProcessing(local_path, local_filename, my_variables_selection)
dataset_df = raw_set.combiner_pipeline()

In [92]:
dataset_df.head(3)


Out[92]:
iid pid match gender date go_out sports tvsports exercise dining museums art hiking gaming clubbing reading tv theater movies concerts music shopping yoga
0 1 11.0 0 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0
1 1 12.0 0 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0
2 1 13.0 1 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0

In [93]:
# Number of unique participants
dataset_df.iid.nunique()


Out[93]:
543

In [94]:
dataset_df.shape


Out[94]:
(8271, 23)

Feature engineering


In [95]:
suffix_me = "_me"
suffix_partner = "_partner"

In [96]:
def get_partner_features(df, suffix_1, suffix_2, ignore_vars=True):
    """
    Self-join the dataset so each row carries both the participant's and the
    partner's profile features.

    :param df: cleaned dataset with one row per (iid, pid) pair
    :param suffix_1: suffix appended to the participant's columns (e.g. "_me")
    :param suffix_2: suffix appended to the partner's columns (e.g. "_partner")
    :param ignore_vars: when truthy, drop the pair-specific columns
        ('pid', 'match') from the partner side before merging so only
        profile features are duplicated
    :return: merged DataFrame; overlapping columns get the two suffixes
    """
    df_partner = df.copy()
    if ignore_vars:
        # Keyword form: the positional `axis` argument to drop() was
        # deprecated in pandas 1.1 and removed in pandas 2.0.
        df_partner = df_partner.drop(columns=["pid", "match"]).drop_duplicates()
    merged_datasets = df.merge(
        df_partner,
        how="inner",
        left_on="pid",
        right_on="iid",
        suffixes=(suffix_1, suffix_2),
    )
    return merged_datasets

In [97]:
feat_eng_df = get_partner_features(dataset_df,suffix_me,suffix_partner)
feat_eng_df.head(3)


Out[97]:
iid_me pid match gender_me date_me go_out_me sports_me tvsports_me exercise_me dining_me museums_me art_me hiking_me gaming_me clubbing_me reading_me tv_me theater_me movies_me concerts_me music_me shopping_me yoga_me iid_partner gender_partner date_partner go_out_partner sports_partner tvsports_partner exercise_partner dining_partner museums_partner art_partner hiking_partner gaming_partner clubbing_partner reading_partner tv_partner theater_partner movies_partner concerts_partner music_partner shopping_partner yoga_partner
0 1 11.0 0 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 11 1 5.0 4.0 8.0 7.0 2.0 6.0 7.0 5.0 5.0 5.0 4.0 9.0 2.0 4.0 8.0 7.0 8.0 5.0 1.0
1 2 11.0 0 0 5.0 1.0 3.0 2.0 7.0 10.0 8.0 6.0 3.0 5.0 8.0 10.0 1.0 9.0 8.0 7.0 8.0 3.0 1.0 11 1 5.0 4.0 8.0 7.0 2.0 6.0 7.0 5.0 5.0 5.0 4.0 9.0 2.0 4.0 8.0 7.0 8.0 5.0 1.0
2 3 11.0 0 0 3.0 1.0 3.0 8.0 7.0 8.0 5.0 5.0 8.0 4.0 5.0 7.0 8.0 7.0 7.0 7.0 5.0 8.0 7.0 11 1 5.0 4.0 8.0 7.0 2.0 6.0 7.0 5.0 5.0 5.0 4.0 9.0 2.0 4.0 8.0 7.0 8.0 5.0 1.0

Modelling

This model aims to answer the question: what is the interest profile of the people who got the most matches?

Variables:

  • gender
  • date (In general, how frequently do you go on dates?)
  • go out (How often do you go out, not necessarily on dates?)
  • sports: Playing sports/ athletics
  • tvsports: Watching sports
  • exercise: Body building/exercising
  • dining: Dining out
  • museums: Museums/galleries
  • art: Art
  • hiking: Hiking/camping
  • gaming: Gaming
  • clubbing: Dancing/clubbing
  • reading: Reading
  • tv: Watching TV
  • theater: Theater
  • movies: Movies
  • concerts: Going to concerts
  • music: Music
  • shopping: Shopping
  • yoga: Yoga/meditation

In [98]:
import sklearn
print (sklearn.__version__)


0.18.1

In [99]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import subprocess

Variables selection


In [100]:
#features = list(["gender","age_o","race_o","goal","samerace","imprace","imprelig","date","go_out","career_c"])
features = list(["gender","date","go_out","sports","tvsports","exercise","dining","museums","art",
                 "hiking","gaming","clubbing","reading","tv","theater","movies","concerts","music",
                 "shopping","yoga"])
label = "match"

In [101]:
#add suffix to each element of list
def process_features_names(features, suffix_1, suffix_2):
    """Duplicate each feature name with the two partner suffixes.

    Each name in ``features`` appears twice in the result: first with
    ``suffix_1`` (the respondent's column), then with ``suffix_2``
    (the partner's column). Prints and returns the combined list.
    """
    suffixed = [name + suffix_1 for name in features]
    suffixed += [name + suffix_2 for name in features]
    print (suffixed)
    return suffixed

# Build the full model feature list. suffix_me / suffix_partner are defined in
# an earlier cell — presumably "_me" and "_partner", matching the printed
# output below; confirm against the feature-engineering section.
features_model = process_features_names(features, suffix_me, suffix_partner)


['gender_me', 'date_me', 'go_out_me', 'sports_me', 'tvsports_me', 'exercise_me', 'dining_me', 'museums_me', 'art_me', 'hiking_me', 'gaming_me', 'clubbing_me', 'reading_me', 'tv_me', 'theater_me', 'movies_me', 'concerts_me', 'music_me', 'shopping_me', 'yoga_me', 'gender_partner', 'date_partner', 'go_out_partner', 'sports_partner', 'tvsports_partner', 'exercise_partner', 'dining_partner', 'museums_partner', 'art_partner', 'hiking_partner', 'gaming_partner', 'clubbing_partner', 'reading_partner', 'tv_partner', 'theater_partner', 'movies_partner', 'concerts_partner', 'music_partner', 'shopping_partner', 'yoga_partner']

In [102]:
# Sanity-check the engineered dataframe built earlier in the notebook.
feat_eng_df.head(5)


Out[102]:
iid_me pid match gender_me date_me go_out_me sports_me tvsports_me exercise_me dining_me museums_me art_me hiking_me gaming_me clubbing_me reading_me tv_me theater_me movies_me concerts_me music_me shopping_me yoga_me iid_partner gender_partner date_partner go_out_partner sports_partner tvsports_partner exercise_partner dining_partner museums_partner art_partner hiking_partner gaming_partner clubbing_partner reading_partner tv_partner theater_partner movies_partner concerts_partner music_partner shopping_partner yoga_partner
0 1 11.0 0 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 11 1 5.0 4.0 8.0 7.0 2.0 6.0 7.0 5.0 5.0 5.0 4.0 9.0 2.0 4.0 8.0 7.0 8.0 5.0 1.0
1 2 11.0 0 0 5.0 1.0 3.0 2.0 7.0 10.0 8.0 6.0 3.0 5.0 8.0 10.0 1.0 9.0 8.0 7.0 8.0 3.0 1.0 11 1 5.0 4.0 8.0 7.0 2.0 6.0 7.0 5.0 5.0 5.0 4.0 9.0 2.0 4.0 8.0 7.0 8.0 5.0 1.0
2 3 11.0 0 0 3.0 1.0 3.0 8.0 7.0 8.0 5.0 5.0 8.0 4.0 5.0 7.0 8.0 7.0 7.0 7.0 5.0 8.0 7.0 11 1 5.0 4.0 8.0 7.0 2.0 6.0 7.0 5.0 5.0 5.0 4.0 9.0 2.0 4.0 8.0 7.0 8.0 5.0 1.0
3 4 11.0 0 0 5.0 1.0 1.0 1.0 6.0 7.0 6.0 7.0 7.0 5.0 7.0 7.0 7.0 9.0 7.0 8.0 7.0 1.0 8.0 11 1 5.0 4.0 8.0 7.0 2.0 6.0 7.0 5.0 5.0 5.0 4.0 9.0 2.0 4.0 8.0 7.0 8.0 5.0 1.0
4 5 11.0 0 0 4.0 1.0 7.0 4.0 7.0 7.0 6.0 8.0 6.0 6.0 8.0 6.0 8.0 6.0 6.0 3.0 7.0 8.0 3.0 11 1 5.0 4.0 8.0 7.0 2.0 6.0 7.0 5.0 5.0 5.0 4.0 9.0 2.0 4.0 8.0 7.0 8.0 5.0 1.0

In [103]:
# Design matrix (X): interest/lifestyle features for both partners.
explanatory = feat_eng_df[features_model]
# Target vector (y): whether the date resulted in a match.
explained = feat_eng_df[label]

Decision Tree


In [104]:
# Shallow tree (depth 4) with leaf/split minimums to limit overfitting.
clf = tree.DecisionTreeClassifier(min_samples_split=20,min_samples_leaf=10,max_depth=4)
# NOTE(review): fitted on the full dataset (no train/test split here) — fine
# for the visualisation below, but not for estimating performance.
clf = clf.fit(explanatory, explained)

In [105]:
# Download http://www.graphviz.org/

# Export the fitted tree and render it to PDF.
# Requires Graphviz (http://www.graphviz.org/) for the `dot` binary.
#
# Fixes vs. the original cell:
#  * class_names was the string "match", which sklearn iterates character by
#    character (classes would be labelled 'm' and 'a'); pass a list with one
#    label per class (0 -> no match, 1 -> match) instead.
#  * subprocess args '-o' 'data.pdf' were adjacent string literals, silently
#    concatenated to '-odata.pdf'; make them two separate arguments.
with open("data.dot", 'w') as f:
    tree.export_graphviz(clf, out_file=f, feature_names=features_model,
                         class_names=["no_match", "match"])

# Return code 0 means the PDF was written successfully.
subprocess.call(['dot', '-Tpdf', 'data.dot', '-o', 'data.pdf'])


Out[105]:
0

Tuning Parameters


In [70]:
# Hold out 30% of the data for evaluation (test_size=0.3, not an equal split);
# random_state is pinned so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(explanatory, explained, test_size=0.3, random_state=0)

In [71]:
# Hyper-parameter grid explored by the decision-tree search.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [4, 6, 10, 12, 14],
    'min_samples_split': [10, 20, 30],
    'min_samples_leaf': [10, 15, 20],
}
parameters = [param_grid]

# Metrics optimised in separate GridSearchCV runs below.
scores = ['precision', 'recall']

In [72]:
# Base estimator for the grid search (parameters supplied by the grid).
dtc = tree.DecisionTreeClassifier()
# 5-fold CV over the grid with 3 parallel jobs; refit=True retrains the best
# configuration on the whole training set after the search.
clf = GridSearchCV(dtc, parameters,n_jobs=3, cv=5, refit=True)

In [74]:
# Silence warnings globally so the grid-search report stays readable.
# NOTE(review): this hides all warnings for the rest of the session.
warnings.filterwarnings("ignore")

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # One search per metric, optimising its macro-averaged variant.
    clf = GridSearchCV(dtc, parameters, cv=5, scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()

    # Evaluate the refitted best estimator on the held-out 30%.
    y_true = y_test
    y_pred = clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()


# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'min_samples_leaf': 10, 'min_samples_split': 10, 'max_depth': 4, 'criterion': 'gini'}

             precision    recall  f1-score   support

          0       0.84      0.99      0.91      2064
          1       0.27      0.01      0.02       389

avg / total       0.75      0.84      0.77      2453


# Tuning hyper-parameters for recall

Best parameters set found on development set:

{'min_samples_leaf': 10, 'min_samples_split': 10, 'max_depth': 14, 'criterion': 'gini'}

             precision    recall  f1-score   support

          0       0.85      0.92      0.89      2064
          1       0.28      0.16      0.20       389

avg / total       0.76      0.80      0.78      2453



In [106]:
# Final model with hand-picked hyper-parameters.
# NOTE(review): criterion="entropy" although both grid searches above selected
# 'gini' — confirm this deviation is intentional.
best_param_dtc = tree.DecisionTreeClassifier(criterion="entropy",min_samples_split=10,min_samples_leaf=10,max_depth=14)
# Fitted on the full dataset (no held-out split at this stage).
best_param_dtc = best_param_dtc.fit(explanatory, explained)

In [107]:
# Per-feature importance, in the same order as features_model; values sum to 1.
best_param_dtc.feature_importances_


Out[107]:
array([ 0.00538282,  0.02441344,  0.02342661,  0.02108348,  0.04005844,
        0.029787  ,  0.01949407,  0.01747603,  0.01774458,  0.01792501,
        0.03965957,  0.02793697,  0.02792654,  0.02825926,  0.02041224,
        0.01717185,  0.02546885,  0.01670987,  0.02317958,  0.02938759,
        0.        ,  0.0308793 ,  0.02539205,  0.0416509 ,  0.04748682,
        0.02656185,  0.01361627,  0.01351116,  0.02030704,  0.02668936,
        0.02990962,  0.03737748,  0.02734437,  0.03347446,  0.03162642,
        0.01682956,  0.02250987,  0.01302588,  0.03421447,  0.03468931])

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [108]:
# Give the opaque partner columns descriptive names.
# Fix: avoid `inplace=True` — reassignment produces the same frame but keeps
# the cell idempotent and chain-friendly (pandas best practice).
raw_dataset = raw_dataset.rename(columns={"age_o": "age_of_partner",
                                          "race_o": "race_of_partner"})

In [ ]:


In [ ]:

Test


In [163]:
import unittest
# Fix: pandas.util.testing is deprecated (removed in pandas >= 2.0);
# the public location of assert_frame_equal is pandas.testing.
from pandas.testing import assert_frame_equal

Note: oddly, the `self.*` assertion helpers (e.g. `self.assertEqual`) did not work here, so `assert_frame_equal` is called directly instead.


In [185]:
class FeatureEngineeringTest(unittest.TestCase):
    def test_get_partner_features(self):
        """get_partner_features joins each row with its partner's row.

        Every input column should appear twice in the output: once with
        the "_me" suffix (the respondent) and once with the "_partner"
        suffix (the row whose iid equals the respondent's pid).
        """
        # Given: six people, each paired with another person via `pid`.
        people = pd.DataFrame(
            {
                'iid': ['1', '2', '3', '4', '5', '6'],
                'first_name': ['Sue', 'Maria', 'Sandra', 'Bill', 'Brian', 'Bruce'],
                'sport': ['foot', 'run', 'volley', 'basket', 'swim', 'tv'],
                'pid': ['4', '5', '6', '1', '2', '3'],
            },
            columns=['iid', 'first_name', 'sport', 'pid'],
        )

        expected = pd.DataFrame(
            {
                'iid_me': ['1', '2', '3', '4', '5', '6'],
                'first_name_me': ['Sue', 'Maria', 'Sandra', 'Bill', 'Brian', 'Bruce'],
                'sport_me': ['foot', 'run', 'volley', 'basket', 'swim', 'tv'],
                'pid_me': ['4', '5', '6', '1', '2', '3'],
                'iid_partner': ['4', '5', '6', '1', '2', '3'],
                'first_name_partner': ['Bill', 'Brian', 'Bruce', 'Sue', 'Maria', 'Sandra'],
                'sport_partner': ['basket', 'swim', 'tv', 'foot', 'run', 'volley'],
                'pid_partner': ['1', '2', '3', '4', '5', '6'],
            },
            columns=['iid_me', 'first_name_me', 'sport_me', 'pid_me',
                     'iid_partner', 'first_name_partner', 'sport_partner', 'pid_partner'],
        )

        # When
        actual = get_partner_features(people, "_me", "_partner", ignore_vars=False)

        # Then
        assert_frame_equal(actual, expected)

In [186]:
# Run the test case defined above; verbosity=2 prints one line per test.
suite = unittest.TestLoader().loadTestsFromTestCase(FeatureEngineeringTest)
unittest.TextTestRunner(verbosity=2).run(suite)


test_get_partner_features (__main__.FeatureEngineeringTest) ... ok

----------------------------------------------------------------------
Ran 1 test in 0.008s

OK
Out[186]:
<unittest.runner.TextTestResult run=1 errors=0 failures=0>

In [ ]: