In [75]:

    
import pandas as pd
import numpy as np
import sklearn
import subprocess
import warnings
pd.set_option('display.max_columns', None)



In [21]:

    
source_path = "/Users/sandrapietrowska/Documents/Trainings/luigi/data_source/"



In [47]:

    
# coding: ISO-8859-1

Import data



In [29]:

    
raw_dataset = pd.read_csv(source_path + "Speed_Dating_Data.csv",encoding = "ISO-8859-1")

Data exploration

Shape, types, distribution, modalities and potential missing values



In [30]:

    
raw_dataset.head(3)









    Out[30]:






  
    
      
      iid
      id
      gender
      idg
      condtn
      wave
      round
      position
      positin1
      order
      partner
      pid
      match
      int_corr
      samerace
      age_o
      race_o
      pf_o_att
      pf_o_sin
      pf_o_int
      pf_o_fun
      pf_o_amb
      pf_o_sha
      dec_o
      attr_o
      sinc_o
      intel_o
      fun_o
      amb_o
      shar_o
      like_o
      prob_o
      met_o
      age
      field
      field_cd
      undergra
      mn_sat
      tuition
      race
      imprace
      imprelig
      from
      zipcode
      income
      goal
      date
      go_out
      career
      career_c
      sports
      tvsports
      exercise
      dining
      museums
      art
      hiking
      gaming
      clubbing
      reading
      tv
      theater
      movies
      concerts
      music
      shopping
      yoga
      exphappy
      expnum
      attr1_1
      sinc1_1
      intel1_1
      fun1_1
      amb1_1
      shar1_1
      attr4_1
      sinc4_1
      intel4_1
      fun4_1
      amb4_1
      shar4_1
      attr2_1
      sinc2_1
      intel2_1
      fun2_1
      amb2_1
      shar2_1
      attr3_1
      sinc3_1
      fun3_1
      intel3_1
      amb3_1
      attr5_1
      sinc5_1
      intel5_1
      fun5_1
      amb5_1
      dec
      attr
      sinc
      intel
      fun
      amb
      shar
      like
      prob
      met
      match_es
      attr1_s
      sinc1_s
      intel1_s
      fun1_s
      amb1_s
      shar1_s
      attr3_s
      sinc3_s
      intel3_s
      fun3_s
      amb3_s
      satis_2
      length
      numdat_2
      attr7_2
      sinc7_2
      intel7_2
      fun7_2
      amb7_2
      shar7_2
      attr1_2
      sinc1_2
      intel1_2
      fun1_2
      amb1_2
      shar1_2
      attr4_2
      sinc4_2
      intel4_2
      fun4_2
      amb4_2
      shar4_2
      attr2_2
      sinc2_2
      intel2_2
      fun2_2
      amb2_2
      shar2_2
      attr3_2
      sinc3_2
      intel3_2
      fun3_2
      amb3_2
      attr5_2
      sinc5_2
      intel5_2
      fun5_2
      amb5_2
      you_call
      them_cal
      date_3
      numdat_3
      num_in_3
      attr1_3
      sinc1_3
      intel1_3
      fun1_3
      amb1_3
      shar1_3
      attr7_3
      sinc7_3
      intel7_3
      fun7_3
      amb7_3
      shar7_3
      attr4_3
      sinc4_3
      intel4_3
      fun4_3
      amb4_3
      shar4_3
      attr2_3
      sinc2_3
      intel2_3
      fun2_3
      amb2_3
      shar2_3
      attr3_3
      sinc3_3
      intel3_3
      fun3_3
      amb3_3
      attr5_3
      sinc5_3
      intel5_3
      fun5_3
      amb5_3
    
  
  
    
      0
      1
      1.0
      0
      1
      1
      1
      10
      7
      NaN
      4
      1
      11.0
      0
      0.14
      0
      27.0
      2.0
      35.0
      20.0
      20.0
      20.0
      0.0
      5.0
      0
      6.0
      8.0
      8.0
      8.0
      8.0
      6.0
      7.0
      4.0
      2.0
      21.0
      Law
      1.0
      NaN
      NaN
      NaN
      4.0
      2.0
      4.0
      Chicago
      60,521
      69,487.00
      2.0
      7.0
      1.0
      lawyer
      NaN
      9.0
      2.0
      8.0
      9.0
      1.0
      1.0
      5.0
      1.0
      5.0
      6.0
      9.0
      1.0
      10.0
      10.0
      9.0
      8.0
      1.0
      3.0
      2.0
      15.0
      20.0
      20.0
      15.0
      15.0
      15.0
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      35.0
      20.0
      15.0
      20.0
      5.0
      5.0
      6.0
      8.0
      8.0
      8.0
      7.0
      NaN
      NaN
      NaN
      NaN
      NaN
      1
      6.0
      9.0
      7.0
      7.0
      6.0
      5.0
      7.0
      6.0
      2.0
      4.0
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      6.0
      2.0
      1.0
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      19.44
      16.67
      13.89
      22.22
      11.11
      16.67
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      6.0
      7.0
      8.0
      7.0
      6.0
      NaN
      NaN
      NaN
      NaN
      NaN
      1.0
      1.0
      0.0
      NaN
      NaN
      15.0
      20.0
      20.0
      15.0
      15.0
      15.0
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      5.0
      7.0
      7.0
      7.0
      7.0
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      1
      1
      1.0
      0
      1
      1
      1
      10
      7
      NaN
      3
      2
      12.0
      0
      0.54
      0
      22.0
      2.0
      60.0
      0.0
      0.0
      40.0
      0.0
      0.0
      0
      7.0
      8.0
      10.0
      7.0
      7.0
      5.0
      8.0
      4.0
      2.0
      21.0
      Law
      1.0
      NaN
      NaN
      NaN
      4.0
      2.0
      4.0
      Chicago
      60,521
      69,487.00
      2.0
      7.0
      1.0
      lawyer
      NaN
      9.0
      2.0
      8.0
      9.0
      1.0
      1.0
      5.0
      1.0
      5.0
      6.0
      9.0
      1.0
      10.0
      10.0
      9.0
      8.0
      1.0
      3.0
      2.0
      15.0
      20.0
      20.0
      15.0
      15.0
      15.0
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      35.0
      20.0
      15.0
      20.0
      5.0
      5.0
      6.0
      8.0
      8.0
      8.0
      7.0
      NaN
      NaN
      NaN
      NaN
      NaN
      1
      7.0
      8.0
      7.0
      8.0
      5.0
      6.0
      7.0
      5.0
      1.0
      4.0
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      6.0
      2.0
      1.0
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      19.44
      16.67
      13.89
      22.22
      11.11
      16.67
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      6.0
      7.0
      8.0
      7.0
      6.0
      NaN
      NaN
      NaN
      NaN
      NaN
      1.0
      1.0
      0.0
      NaN
      NaN
      15.0
      20.0
      20.0
      15.0
      15.0
      15.0
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      5.0
      7.0
      7.0
      7.0
      7.0
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      2
      1
      1.0
      0
      1
      1
      1
      10
      7
      NaN
      10
      3
      13.0
      1
      0.16
      1
      22.0
      4.0
      19.0
      18.0
      19.0
      18.0
      14.0
      12.0
      1
      10.0
      10.0
      10.0
      10.0
      10.0
      10.0
      10.0
      10.0
      1.0
      21.0
      Law
      1.0
      NaN
      NaN
      NaN
      4.0
      2.0
      4.0
      Chicago
      60,521
      69,487.00
      2.0
      7.0
      1.0
      lawyer
      NaN
      9.0
      2.0
      8.0
      9.0
      1.0
      1.0
      5.0
      1.0
      5.0
      6.0
      9.0
      1.0
      10.0
      10.0
      9.0
      8.0
      1.0
      3.0
      2.0
      15.0
      20.0
      20.0
      15.0
      15.0
      15.0
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      35.0
      20.0
      15.0
      20.0
      5.0
      5.0
      6.0
      8.0
      8.0
      8.0
      7.0
      NaN
      NaN
      NaN
      NaN
      NaN
      1
      5.0
      8.0
      9.0
      8.0
      5.0
      7.0
      7.0
      NaN
      1.0
      4.0
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      6.0
      2.0
      1.0
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      19.44
      16.67
      13.89
      22.22
      11.11
      16.67
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      6.0
      7.0
      8.0
      7.0
      6.0
      NaN
      NaN
      NaN
      NaN
      NaN
      1.0
      1.0
      0.0
      NaN
      NaN
      15.0
      20.0
      20.0
      15.0
      15.0
      15.0
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      5.0
      7.0
      7.0
      7.0
      7.0
      NaN
      NaN
      NaN
      NaN
      NaN



In [31]:

    
raw_dataset_copy = raw_dataset



In [32]:

    
check1 = raw_dataset_copy[raw_dataset_copy["iid"] == 1]
check1_sel = check1[["iid", "pid", "match","gender","date","go_out","sports","tvsports","exercise","dining",
                          "museums","art","hiking","gaming","clubbing","reading","tv","theater","movies",
                          "concerts","music","shopping","yoga"]]



In [33]:

    
check1_sel.drop_duplicates().head(20)



In [34]:

    
#merged_datasets = raw_dataset.merge(raw_dataset_copy, left_on="pid", right_on="iid")
#merged_datasets[["iid_x","gender_x","pid_y","gender_y"]].head(5)
#same_gender = merged_datasets[merged_datasets["gender_x"] == merged_datasets["gender_y"]]
#same_gender.head()



In [35]:

    
columns_by_types = raw_dataset.columns.to_series().groupby(raw_dataset.dtypes).groups



In [36]:

    
raw_dataset.dtypes.value_counts()









    Out[36]:





float64    174
int64       13
object       8
dtype: int64



In [37]:

    
raw_dataset.isnull().sum().head(3)









    Out[37]:





iid       0
id        1
gender    0
dtype: int64



In [85]:

    
summary = raw_dataset.describe() #.transpose()
print (summary.head())









    



               iid           id       gender          idg       condtn  \
count  8378.000000  8377.000000  8378.000000  8378.000000  8378.000000   
mean    283.675937     8.960248     0.500597    17.327166     1.828837   
std     158.583367     5.491329     0.500029    10.940735     0.376673   
min       1.000000     1.000000     0.000000     1.000000     1.000000   
25%     154.000000     4.000000     0.000000     8.000000     2.000000   

              wave        round     position     positin1        order  \
count  8378.000000  8378.000000  8378.000000  6532.000000  8378.000000   
mean     11.350919    16.872046     9.042731     9.295775     8.927668   
std       5.995903     4.358458     5.514939     5.650199     5.477009   
min       1.000000     5.000000     1.000000     1.000000     1.000000   
25%       7.000000    14.000000     4.000000     4.000000     4.000000   

           partner          pid        match     int_corr     samerace  \
count  8378.000000  8368.000000  8378.000000  8220.000000  8378.000000   
mean      8.963595   283.863767     0.164717     0.196010     0.395799   
std       5.491068   158.584899     0.370947     0.303539     0.489051   
min       1.000000     1.000000     0.000000    -0.830000     0.000000   
25%       4.000000   154.000000     0.000000    -0.020000     0.000000   

             age_o       race_o     pf_o_att     pf_o_sin     pf_o_int  \
count  8274.000000  8305.000000  8289.000000  8289.000000  8289.000000   
mean     26.364999     2.756653    22.495347    17.396867    20.270759   
std       3.563648     1.230689    12.569802     7.044003     6.782895   
min      18.000000     1.000000     0.000000     0.000000     0.000000   
25%      24.000000     2.000000    15.000000    15.000000    17.390000   

          pf_o_fun     pf_o_amb     pf_o_sha        dec_o       attr_o  \
count  8280.000000  8271.000000  8249.000000  8378.000000  8166.000000   
mean     17.459714    10.685375    11.845930     0.419551     6.190411   
std       6.085526     6.126544     6.362746     0.493515     1.950305   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%      15.000000     5.000000     9.520000     0.000000     5.000000   

            sinc_o      intel_o        fun_o        amb_o       shar_o  \
count  8091.000000  8072.000000  8018.000000  7656.000000  7302.000000   
mean      7.175256     7.369301     6.400599     6.778409     5.474870   
std       1.740575     1.550501     1.954078     1.794080     2.156163   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       6.000000     6.000000     5.000000     6.000000     4.000000   

            like_o       prob_o        met_o          age     field_cd  \
count  8128.000000  8060.000000  7993.000000  8283.000000  8296.000000   
mean      6.134498     5.208251     1.960215    26.358928     7.662488   
std       1.841258     2.129354     0.245925     3.566763     3.758935   
min       0.000000     0.000000     1.000000    18.000000     1.000000   
25%       5.000000     4.000000     2.000000    24.000000     5.000000   

              race      imprace     imprelig         goal         date  \
count  8315.000000  8299.000000  8299.000000  8299.000000  8281.000000   
mean      2.757186     3.784793     3.651645     2.122063     5.006762   
std       1.230905     2.845708     2.805237     1.407181     1.444531   
min       1.000000     0.000000     1.000000     1.000000     1.000000   
25%       2.000000     1.000000     1.000000     1.000000     4.000000   

            go_out     career_c       sports     tvsports     exercise  \
count  8299.000000  8240.000000  8299.000000  8299.000000  8299.000000   
mean      2.158091     5.277791     6.425232     4.575491     6.245813   
std       1.105246     3.309520     2.619024     2.801874     2.418858   
min       1.000000     1.000000     1.000000     1.000000     1.000000   
25%       1.000000     2.000000     4.000000     2.000000     5.000000   

            dining      museums          art       hiking       gaming  \
count  8299.000000  8299.000000  8299.000000  8299.000000  8299.000000   
mean      7.783829     6.985781     6.714544     5.737077     3.881191   
std       1.754868     2.052232     2.263407     2.570207     2.620507   
min       1.000000     0.000000     0.000000     0.000000     0.000000   
25%       7.000000     6.000000     5.000000     4.000000     2.000000   

          clubbing      reading           tv      theater       movies  \
count  8299.000000  8299.000000  8299.000000  8299.000000  8299.000000   
mean      5.745993     7.678515     5.304133     6.776118     7.919629   
std       2.502218     2.006565     2.529135     2.235152     1.700927   
min       0.000000     1.000000     1.000000     0.000000     0.000000   
25%       4.000000     7.000000     3.000000     5.000000     7.000000   

          concerts        music     shopping         yoga     exphappy  \
count  8299.000000  8299.000000  8299.000000  8299.000000  8277.000000   
mean      6.825401     7.851066     5.631281     4.339197     5.534131   
std       2.156283     1.791827     2.608913     2.717612     1.734059   
min       0.000000     1.000000     1.000000     0.000000     1.000000   
25%       5.000000     7.000000     4.000000     2.000000     5.000000   

            expnum      attr1_1      sinc1_1     intel1_1       fun1_1  \
count  1800.000000  8299.000000  8299.000000  8299.000000  8289.000000   
mean      5.570556    22.514632    17.396389    20.265613    17.457043   
std       4.762569    12.587674     7.046700     6.783003     6.085239   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       2.000000    15.000000    15.000000    17.390000    15.000000   

            amb1_1      shar1_1      attr4_1      sinc4_1     intel4_1  \
count  8279.000000  8257.000000  6489.000000  6489.000000  6489.000000   
mean     10.682539    11.845111    26.394360    11.071506    12.636308   
std       6.124888     6.362154    16.297045     6.659233     6.717476   
min       0.000000     0.000000     5.000000     0.000000     0.000000   
25%       5.000000     9.520000    10.000000     6.000000     8.000000   

            fun4_1       amb4_1      shar4_1      attr2_1      sinc2_1  \
count  6489.000000  6489.000000  6467.000000  8299.000000  8299.000000   
mean     15.566805     9.780089    11.014845    30.362192    13.273691   
std       7.328256     6.998428     6.060150    16.249937     6.976775   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%      10.000000     5.000000     7.000000    20.000000    10.000000   

          intel2_1       fun2_1       amb2_1      shar2_1      attr3_1  \
count  8299.000000  8299.000000  8289.000000  8289.000000  8273.000000   
mean     14.416891    18.422620    11.744499    11.854817     7.084733   
std       6.263304     6.577929     6.886532     6.167314     1.395783   
min       0.000000     0.000000     0.000000     0.000000     2.000000   
25%      10.000000    15.000000     6.000000    10.000000     6.000000   

           sinc3_1       fun3_1     intel3_1       amb3_1      attr5_1  \
count  8273.000000  8273.000000  8273.000000  8273.000000  4906.000000   
mean      8.294935     7.704460     8.403965     7.578388     6.941908   
std       1.407460     1.564321     1.076608     1.778315     1.498653   
min       2.000000     2.000000     3.000000     2.000000     2.000000   
25%       8.000000     7.000000     8.000000     7.000000     6.000000   

           sinc5_1     intel5_1       fun5_1       amb5_1          dec  \
count  4906.000000  4906.000000  4906.000000  4906.000000  8378.000000   
mean      7.927232     8.284346     7.426213     7.617611     0.419909   
std       1.627054     1.283657     1.779129     1.773094     0.493573   
min       1.000000     3.000000     2.000000     1.000000     0.000000   
25%       7.000000     8.000000     6.000000     7.000000     0.000000   

              attr         sinc        intel          fun          amb  \
count  8176.000000  8101.000000  8082.000000  8028.000000  7666.000000   
mean      6.189995     7.175164     7.368597     6.400598     6.777524   
std       1.950169     1.740315     1.550453     1.953702     1.794055   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       5.000000     6.000000     6.000000     5.000000     6.000000   

              shar         like         prob          met     match_es  \
count  7311.000000  8138.000000  8069.000000  8003.000000  7205.000000   
mean      5.474559     6.134087     5.207523     0.948769     3.207814   
std       2.156363     1.841285     2.129565     0.989889     2.444813   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       4.000000     5.000000     4.000000     0.000000     2.000000   

           attr1_s      sinc1_s     intel1_s       fun1_s       amb1_s  \
count  4096.000000  4096.000000  4096.000000  4096.000000  4096.000000   
mean     20.791624    15.434255    17.243708    15.260869    11.144619   
std      12.968524     6.915322     6.596420     5.356969     5.514028   
min       3.000000     0.000000     0.000000     1.000000     0.000000   
25%      14.810000    10.000000    10.000000    10.000000     7.000000   

           shar1_s     attr3_s      sinc3_s     intel3_s       fun3_s  \
count  4096.000000  4000.00000  4000.000000  4000.000000  4000.000000   
mean     12.457925     7.21125     8.082000     8.257750     7.692500   
std       5.921789     1.41545     1.455741     1.179317     1.626839   
min       0.000000     3.00000     1.000000     4.000000     3.000000   
25%       9.000000     7.00000     7.000000     8.000000     7.000000   

            amb3_s      satis_2       length     numdat_2      attr7_2  \
count  4000.000000  7463.000000  7463.000000  7433.000000  1984.000000   
mean      7.589250     5.711510     1.843495     2.338087    32.819556   
std       1.793136     1.820764     0.975662     0.631240    17.155270   
min       2.000000     1.000000     1.000000     1.000000    10.000000   
25%       7.000000     5.000000     1.000000     2.000000    20.000000   

           sinc7_2     intel7_2       fun7_2       amb7_2      shar7_2  \
count  1955.000000  1984.000000  1984.000000  1955.000000  1974.000000   
mean     13.529923    15.293851    18.868448     7.286957    12.156028   
std       7.977482     7.292868     8.535963     6.125187     8.241906   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%      10.000000    10.000000    10.000000     0.000000     5.000000   

           attr1_2      sinc1_2     intel1_2       fun1_2       amb1_2  \
count  7445.000000  7463.000000  7463.000000  7463.000000  7463.000000   
mean     26.217194    15.865084    17.813755    17.654765     9.913436   
std      14.388694     6.658494     6.535894     6.129746     5.675550   
min       5.000000     0.000000     0.000000     0.000000     0.000000   
25%      16.670000    10.000000    15.000000    15.000000     5.000000   

           shar1_2      attr4_2      sinc4_2     intel4_2       fun4_2  \
count  7463.000000  5775.000000  5775.000000  5775.000000  5775.000000   
mean     12.760263    26.806234    11.929177    12.103030    15.163810   
std       6.651547    16.402836     6.401556     5.990607     7.290107   
min       0.000000     6.000000     0.000000     0.000000     0.000000   
25%      10.000000    10.000000     8.000000     8.000000     9.000000   

            amb4_2      shar4_2      attr2_2     sinc2_2     intel2_2  \
count  5775.000000  5775.000000  5775.000000  5775.00000  5775.000000   
mean      9.342511    11.320866    29.344369    13.89823    13.958265   
std       5.856329     6.296155    14.551171     6.17169     5.398621   
min       0.000000     0.000000     0.000000     0.00000     0.000000   
25%       5.000000     7.000000    19.150000    10.00000    10.000000   

            fun2_2       amb2_2      shar2_2      attr3_2      sinc3_2  \
count  5775.000000  5775.000000  5775.000000  7463.000000  7463.000000   
mean     17.967233    11.909735    12.887976     7.125285     7.931529   
std       6.100307     6.313281     5.615691     1.371390     1.503236   
min       0.000000     0.000000     0.000000     2.000000     2.000000   
25%      15.000000    10.000000    10.000000     7.000000     7.000000   

          intel3_2       fun3_2       amb3_2      attr5_2      sinc5_2  \
count  7463.000000  7463.000000  7463.000000  4377.000000  4377.000000   
mean      8.238912     7.602171     7.486802     6.827964     7.394106   
std       1.180280     1.548200     1.744634     1.411096     1.588145   
min       4.000000     1.000000     2.000000     2.000000     2.000000   
25%       8.000000     7.000000     7.000000     6.000000     6.000000   

          intel5_2       fun5_2       amb5_2     you_call     them_cal  \
count  4377.000000  4377.000000  4377.000000  3974.000000  3974.000000   
mean      7.838702     7.279415     7.332191     0.780825     0.981631   
std       1.280936     1.647478     1.521854     1.611694     1.382139   
min       2.000000     2.000000     2.000000     0.000000     0.000000   
25%       7.000000     6.000000     6.000000     0.000000     0.000000   

            date_3     numdat_3    num_in_3      attr1_3      sinc1_3  \
count  3974.000000  1496.000000  668.000000  3974.000000  3974.000000   
mean      0.376950     1.230615    0.934132    24.384524    16.588583   
std       0.484683     1.294557    0.753902    13.712120     7.471537   
min       0.000000     0.000000    0.000000     0.000000     0.000000   
25%       0.000000     1.000000    1.000000    15.220000    10.000000   

          intel1_3       fun1_3       amb1_3      shar1_3      attr7_3  \
count  3974.000000  3974.000000  3974.000000  3974.000000  2016.000000   
mean     19.411346    16.233415    10.898075    12.699142    31.330357   
std       6.124502     5.163777     5.900697     6.557041    17.551540   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%      16.670000    14.810000     5.000000    10.000000    20.000000   

           sinc7_3     intel7_3       fun7_3       amb7_3      shar7_3  \
count  2016.000000  2016.000000  2016.000000  2016.000000  2016.000000   
mean     15.654266    16.679563    16.418155     7.823909    12.207837   
std       9.336288     7.880088     7.231325     6.100502     8.615985   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%      10.000000    10.000000    10.000000     0.000000     5.000000   

           attr4_3      sinc4_3     intel4_3       fun4_3       amb4_3  \
count  2959.000000  2959.000000  2959.000000  2959.000000  2959.000000   
mean     25.610341    10.751267    11.524839    14.276783     9.207503   
std      17.477134     5.740351     6.004222     6.927869     6.385852   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%      10.000000     7.000000     7.000000     9.000000     5.000000   

           shar4_3      attr2_3      sinc2_3     intel2_3       fun2_3  \
count  2959.000000  2959.000000  2959.000000  2959.000000  2959.000000   
mean     11.253802    24.970936    10.923285    11.952687    14.959108   
std       6.516178    17.007669     6.226283     7.010650     7.935509   
min       0.000000     5.000000     0.000000     0.000000     0.000000   
25%       7.000000    10.000000     7.000000     7.000000     9.000000   

            amb2_3      shar2_3      attr3_3      sinc3_3     intel3_3  \
count  2959.000000  2016.000000  3974.000000  3974.000000  3974.000000   
mean      9.526191    11.966270     7.240312     8.093357     8.388777   
std       6.403117     7.012067     1.576596     1.610309     1.459094   
min       0.000000     0.000000     2.000000     2.000000     3.000000   
25%       6.000000     5.000000     7.000000     7.000000     8.000000   

            fun3_3       amb3_3      attr5_3      sinc5_3     intel5_3  \
count  3974.000000  3974.000000  2016.000000  2016.000000  2016.000000   
mean      7.658782     7.391545     6.810020     7.615079     7.932540   
std       1.744670     1.961417     1.507341     1.504551     1.340868   
min       2.000000     1.000000     2.000000     2.000000     4.000000   
25%       7.000000     6.000000     6.000000     7.000000     7.000000   

            fun5_3       amb5_3  
count  2016.000000  2016.000000  
mean      7.155258     7.048611  
std       1.672787     1.717988  
min       1.000000     1.000000  
25%       6.000000     6.000000



In [86]:

    
#raw_dataset.groupby("gender").agg({"iid": pd.Series.nunique})
raw_dataset.groupby('gender').iid.nunique()









    Out[86]:





gender
0    274
1    277
Name: iid, dtype: int64



In [87]:

    
raw_dataset.groupby('career').iid.nunique().sort_values(ascending=False).head(5)









    Out[87]:





career
Finance          13
professor        12
Lawyer           11
Professor        10
Social Worker     9
Name: iid, dtype: int64



In [88]:

    
raw_dataset.groupby(["gender","match"]).iid.nunique()









    Out[88]:





gender  match
0       0        274
        1        221
1       0        277
        1        231
Name: iid, dtype: int64

Data processing



In [89]:

    
local_path = "/Users/sandrapietrowska/Documents/Trainings/luigi/data_source/"
local_filename = "Speed_Dating_Data.csv"
my_variables_selection = ["iid", "pid", "match","gender","date","go_out","sports","tvsports","exercise","dining",
                          "museums","art","hiking","gaming","clubbing","reading","tv","theater","movies",
                          "concerts","music","shopping","yoga"]



In [90]:

    
class RawSetProcessing(object):
    """
    This class aims to load and clean the dataset.
    """
    def __init__(self,source_path,filename,features):
        self.source_path = source_path
        self.filename = filename
        self.features = features
    
    # Load data
    def load_data(self):
        raw_dataset_df = pd.read_csv(self.source_path + self.filename,encoding = "ISO-8859-1")
        return raw_dataset_df
    
    # Select variables to process and include in the model
    def subset_features(self, df):
        sel_vars_df = df[self.features]
        return sel_vars_df
    
    @staticmethod
    # Remove ids with missing values
    def remove_ids_with_missing_values(df):
        sel_vars_filled_df = df.dropna()
        return sel_vars_filled_df
    
    @staticmethod
    def drop_duplicated_values(df):
        df = df.drop_duplicates()
        return df
    
    # Combine processing stages
    def combiner_pipeline(self):
        raw_dataset = self.load_data()
        subset_df = self.subset_features(raw_dataset)
        subset_no_dup_df = self.drop_duplicated_values(subset_df)
        subset_filled_df = self.remove_ids_with_missing_values(subset_no_dup_df)
        return subset_filled_df



In [91]:

    
raw_set = RawSetProcessing(local_path, local_filename, my_variables_selection)
dataset_df = raw_set.combiner_pipeline()



In [92]:

    
dataset_df.head(3)



In [93]:

    
# Number of unique participants
dataset_df.iid.nunique()









    Out[93]:





543



In [94]:

    
dataset_df.shape









    Out[94]:





(8271, 23)

Feature engineering



In [95]:

    
suffix_me = "_me"
suffix_partner = "_partner"



In [96]:

    
def get_partner_features(df, suffix_1, suffix_2, ignore_vars=True):
    #print df[df["iid"] == 1]
    df_partner = df.copy()
    if ignore_vars is True:
        df_partner = df_partner.drop(['pid','match'], 1).drop_duplicates()
    else:
        df_partner = df_partner.copy()
    #print df_partner.shape
    merged_datasets = df.merge(df_partner, how = "inner",left_on="pid", right_on="iid",suffixes=(suffix_1,suffix_2))
    #print merged_datasets[merged_datasets["iid_me"] == 1]
    return merged_datasets



In [97]:

    
feat_eng_df = get_partner_features(dataset_df,suffix_me,suffix_partner)
feat_eng_df.head(3)









    Out[97]:






  
    
      
      iid_me
      pid
      match
      gender_me
      date_me
      go_out_me
      sports_me
      tvsports_me
      exercise_me
      dining_me
      museums_me
      art_me
      hiking_me
      gaming_me
      clubbing_me
      reading_me
      tv_me
      theater_me
      movies_me
      concerts_me
      music_me
      shopping_me
      yoga_me
      iid_partner
      gender_partner
      date_partner
      go_out_partner
      sports_partner
      tvsports_partner
      exercise_partner
      dining_partner
      museums_partner
      art_partner
      hiking_partner
      gaming_partner
      clubbing_partner
      reading_partner
      tv_partner
      theater_partner
      movies_partner
      concerts_partner
      music_partner
      shopping_partner
      yoga_partner
    
  
  
    
      0
      1
      11.0
      0
      0
      7.0
      1.0
      9.0
      2.0
      8.0
      9.0
      1.0
      1.0
      5.0
      1.0
      5.0
      6.0
      9.0
      1.0
      10.0
      10.0
      9.0
      8.0
      1.0
      11
      1
      5.0
      4.0
      8.0
      7.0
      2.0
      6.0
      7.0
      5.0
      5.0
      5.0
      4.0
      9.0
      2.0
      4.0
      8.0
      7.0
      8.0
      5.0
      1.0
    
    
      1
      2
      11.0
      0
      0
      5.0
      1.0
      3.0
      2.0
      7.0
      10.0
      8.0
      6.0
      3.0
      5.0
      8.0
      10.0
      1.0
      9.0
      8.0
      7.0
      8.0
      3.0
      1.0
      11
      1
      5.0
      4.0
      8.0
      7.0
      2.0
      6.0
      7.0
      5.0
      5.0
      5.0
      4.0
      9.0
      2.0
      4.0
      8.0
      7.0
      8.0
      5.0
      1.0
    
    
      2
      3
      11.0
      0
      0
      3.0
      1.0
      3.0
      8.0
      7.0
      8.0
      5.0
      5.0
      8.0
      4.0
      5.0
      7.0
      8.0
      7.0
      7.0
      7.0
      5.0
      8.0
      7.0
      11
      1
      5.0
      4.0
      8.0
      7.0
      2.0
      6.0
      7.0
      5.0
      5.0
      5.0
      4.0
      9.0
      2.0
      4.0
      8.0
      7.0
      8.0
      5.0
      1.0

Modelling

This model aims to answer the questions what is the profile of the persons regarding interests that got the most matches.

Variables:

gender
date (In general, how frequently do you go on dates?)
go out (How often do you go out (not necessarily on dates)?
sports: Playing sports/ athletics
tvsports: Watching sports
excersice: Body building/exercising
dining: Dining out
museums: Museums/galleries
art: Art
hiking: Hiking/camping
gaming: Gaming
clubbing: Dancing/clubbing
reading: Reading
tv: Watching TV
theater: Theater
movies: Movies
concerts: Going to concerts
music: Music
shopping: Shopping
yoga: Yoga/meditation



In [98]:

    
import sklearn
print (sklearn.__version__)



In [99]:

    
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import subprocess

Variables selection



In [100]:

    
#features = list(["gender","age_o","race_o","goal","samerace","imprace","imprelig","date","go_out","career_c"])
features = list(["gender","date","go_out","sports","tvsports","exercise","dining","museums","art",
                 "hiking","gaming","clubbing","reading","tv","theater","movies","concerts","music",
                 "shopping","yoga"])
label = "match"



In [101]:

    
#add suffix to each element of list
def process_features_names(features, suffix_1, suffix_2):
    features_me = [feat + suffix_1 for feat in features]
    features_partner = [feat + suffix_2 for feat in features]
    features_all = features_me + features_partner
    print (features_all)
    return features_all

features_model = process_features_names(features, suffix_me, suffix_partner)









    



['gender_me', 'date_me', 'go_out_me', 'sports_me', 'tvsports_me', 'exercise_me', 'dining_me', 'museums_me', 'art_me', 'hiking_me', 'gaming_me', 'clubbing_me', 'reading_me', 'tv_me', 'theater_me', 'movies_me', 'concerts_me', 'music_me', 'shopping_me', 'yoga_me', 'gender_partner', 'date_partner', 'go_out_partner', 'sports_partner', 'tvsports_partner', 'exercise_partner', 'dining_partner', 'museums_partner', 'art_partner', 'hiking_partner', 'gaming_partner', 'clubbing_partner', 'reading_partner', 'tv_partner', 'theater_partner', 'movies_partner', 'concerts_partner', 'music_partner', 'shopping_partner', 'yoga_partner']



In [102]:

    
feat_eng_df.head(5)









    Out[102]:






  
    
      
      iid_me
      pid
      match
      gender_me
      date_me
      go_out_me
      sports_me
      tvsports_me
      exercise_me
      dining_me
      museums_me
      art_me
      hiking_me
      gaming_me
      clubbing_me
      reading_me
      tv_me
      theater_me
      movies_me
      concerts_me
      music_me
      shopping_me
      yoga_me
      iid_partner
      gender_partner
      date_partner
      go_out_partner
      sports_partner
      tvsports_partner
      exercise_partner
      dining_partner
      museums_partner
      art_partner
      hiking_partner
      gaming_partner
      clubbing_partner
      reading_partner
      tv_partner
      theater_partner
      movies_partner
      concerts_partner
      music_partner
      shopping_partner
      yoga_partner
    
  
  
    
      0
      1
      11.0
      0
      0
      7.0
      1.0
      9.0
      2.0
      8.0
      9.0
      1.0
      1.0
      5.0
      1.0
      5.0
      6.0
      9.0
      1.0
      10.0
      10.0
      9.0
      8.0
      1.0
      11
      1
      5.0
      4.0
      8.0
      7.0
      2.0
      6.0
      7.0
      5.0
      5.0
      5.0
      4.0
      9.0
      2.0
      4.0
      8.0
      7.0
      8.0
      5.0
      1.0
    
    
      1
      2
      11.0
      0
      0
      5.0
      1.0
      3.0
      2.0
      7.0
      10.0
      8.0
      6.0
      3.0
      5.0
      8.0
      10.0
      1.0
      9.0
      8.0
      7.0
      8.0
      3.0
      1.0
      11
      1
      5.0
      4.0
      8.0
      7.0
      2.0
      6.0
      7.0
      5.0
      5.0
      5.0
      4.0
      9.0
      2.0
      4.0
      8.0
      7.0
      8.0
      5.0
      1.0
    
    
      2
      3
      11.0
      0
      0
      3.0
      1.0
      3.0
      8.0
      7.0
      8.0
      5.0
      5.0
      8.0
      4.0
      5.0
      7.0
      8.0
      7.0
      7.0
      7.0
      5.0
      8.0
      7.0
      11
      1
      5.0
      4.0
      8.0
      7.0
      2.0
      6.0
      7.0
      5.0
      5.0
      5.0
      4.0
      9.0
      2.0
      4.0
      8.0
      7.0
      8.0
      5.0
      1.0
    
    
      3
      4
      11.0
      0
      0
      5.0
      1.0
      1.0
      1.0
      6.0
      7.0
      6.0
      7.0
      7.0
      5.0
      7.0
      7.0
      7.0
      9.0
      7.0
      8.0
      7.0
      1.0
      8.0
      11
      1
      5.0
      4.0
      8.0
      7.0
      2.0
      6.0
      7.0
      5.0
      5.0
      5.0
      4.0
      9.0
      2.0
      4.0
      8.0
      7.0
      8.0
      5.0
      1.0
    
    
      4
      5
      11.0
      0
      0
      4.0
      1.0
      7.0
      4.0
      7.0
      7.0
      6.0
      8.0
      6.0
      6.0
      8.0
      6.0
      8.0
      6.0
      6.0
      3.0
      7.0
      8.0
      3.0
      11
      1
      5.0
      4.0
      8.0
      7.0
      2.0
      6.0
      7.0
      5.0
      5.0
      5.0
      4.0
      9.0
      2.0
      4.0
      8.0
      7.0
      8.0
      5.0
      1.0



In [103]:

    
explanatory = feat_eng_df[features_model]
explained = feat_eng_df[label]

Decision Tree



In [104]:

    
clf = tree.DecisionTreeClassifier(min_samples_split=20,min_samples_leaf=10,max_depth=4)
clf = clf.fit(explanatory, explained)



In [105]:

    
# Download http://www.graphviz.org/

with open("data.dot", 'w') as f:
    f = tree.export_graphviz(clf, out_file=f, feature_names= features_model, class_names="match")
    
subprocess.call(['dot', '-Tpdf', 'data.dot', '-o' 'data.pdf'])









    Out[105]:





0

Tuning Parameters



In [70]:

    
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(explanatory, explained, test_size=0.3, random_state=0)



In [71]:

    
parameters = [
  {'criterion': ['gini','entropy'], 'max_depth': [4,6,10,12,14], 
   'min_samples_split': [10,20,30], 'min_samples_leaf': [10,15,20]
  }
]

scores = ['precision', 'recall']



In [72]:

    
dtc = tree.DecisionTreeClassifier()
clf = GridSearchCV(dtc, parameters,n_jobs=3, cv=5, refit=True)



In [74]:

    
warnings.filterwarnings("ignore")

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print("")

    clf = GridSearchCV(dtc, parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print("")
    print(clf.best_params_)
    print("")
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print("")









    



# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'min_samples_leaf': 10, 'min_samples_split': 10, 'max_depth': 4, 'criterion': 'gini'}

             precision    recall  f1-score   support

          0       0.84      0.99      0.91      2064
          1       0.27      0.01      0.02       389

avg / total       0.75      0.84      0.77      2453


# Tuning hyper-parameters for recall

Best parameters set found on development set:

{'min_samples_leaf': 10, 'min_samples_split': 10, 'max_depth': 14, 'criterion': 'gini'}

             precision    recall  f1-score   support

          0       0.85      0.92      0.89      2064
          1       0.28      0.16      0.20       389

avg / total       0.76      0.80      0.78      2453



In [106]:

    
best_param_dtc = tree.DecisionTreeClassifier(criterion="entropy",min_samples_split=10,min_samples_leaf=10,max_depth=14)
best_param_dtc = best_param_dtc.fit(explanatory, explained)



In [107]:

    
best_param_dtc.feature_importances_









    Out[107]:





array([ 0.00538282,  0.02441344,  0.02342661,  0.02108348,  0.04005844,
        0.029787  ,  0.01949407,  0.01747603,  0.01774458,  0.01792501,
        0.03965957,  0.02793697,  0.02792654,  0.02825926,  0.02041224,
        0.01717185,  0.02546885,  0.01670987,  0.02317958,  0.02938759,
        0.        ,  0.0308793 ,  0.02539205,  0.0416509 ,  0.04748682,
        0.02656185,  0.01361627,  0.01351116,  0.02030704,  0.02668936,
        0.02990962,  0.03737748,  0.02734437,  0.03347446,  0.03162642,
        0.01682956,  0.02250987,  0.01302588,  0.03421447,  0.03468931])



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [108]:

    
raw_dataset.rename(columns={"age_o":"age_of_partner","race_o":"race_of_partner"},inplace=True)



In [ ]:



In [ ]:

Test



In [163]:

    
import unittest
from pandas.util.testing import assert_frame_equal

There is a weird thing, with self.XX the code does not work. I tried self.assertEqual



In [185]:

    
class FeatureEngineeringTest(unittest.TestCase):
    def test_get_partner_features(self):
        """

        :return:
        """
        # Given
        raw_data_a = {
        'iid': ['1', '2', '3', '4', '5','6'],
        'first_name': ['Sue', 'Maria', 'Sandra', 'Bill', 'Brian','Bruce'],
        'sport':['foot','run','volley','basket','swim','tv'],
        'pid': ['4', '5', '6', '1', '2','3'],}
        
        df_a = pd.DataFrame(raw_data_a, columns = ['iid', 'first_name', 'sport','pid'])
        
        expected_output_values = pd.DataFrame({
                'iid_me': ['1', '2', '3', '4', '5','6'],
                'first_name_me': ['Sue', 'Maria', 'Sandra', 'Bill', 'Brian','Bruce'],
                'sport_me': ['foot','run','volley','basket','swim','tv'],
                'pid_me': ['4', '5', '6', '1', '2','3'],
                'iid_partner': ['4', '5', '6', '1', '2','3'],
                'first_name_partner': ['Bill', 'Brian','Bruce','Sue', 'Maria', 'Sandra'],
                'sport_partner': ['basket','swim','tv','foot','run','volley'],
                'pid_partner':['1', '2', '3', '4', '5','6']
            }, columns = ['iid_me','first_name_me','sport_me','pid_me',
                          'iid_partner','first_name_partner','sport_partner','pid_partner'])

        # When

        output_values = get_partner_features(df_a, "_me","_partner",ignore_vars=False)

        # Then

        assert_frame_equal(output_values, expected_output_values)



In [186]:

    
suite = unittest.TestLoader().loadTestsFromTestCase(FeatureEngineeringTest)
unittest.TextTestRunner(verbosity=2).run(suite)









    



test_get_partner_features (__main__.FeatureEngineeringTest) ... ok

----------------------------------------------------------------------
Ran 1 test in 0.008s

OK






    Out[186]:





<unittest.runner.TextTestResult run=1 errors=0 failures=0>



In [ ]:

	iid	id	idg	condtn	wave	round	position	positin1	order	partner	pid	match	int_corr	samerace	age_o	race_o	pf_o_att	pf_o_sin	pf_o_int	pf_o_fun	pf_o_amb	pf_o_sha	dec_o	attr_o	sinc_o	intel_o	fun_o	amb_o	shar_o	like_o	prob_o	met_o	age	field	field_cd	undergra	mn_sat	tuition	race	imprace	imprelig	from	zipcode	income	goal	date	go_out	career	career_c	sports	tvsports	exercise	dining	museums	art	hiking	gaming	clubbing	reading	tv	theater	movies	concerts	music	shopping	yoga	exphappy	expnum	attr1_1	sinc1_1	intel1_1	fun1_1	amb1_1	shar1_1	attr4_1	sinc4_1	intel4_1	fun4_1	amb4_1	shar4_1	attr2_1	sinc2_1	intel2_1	fun2_1	amb2_1	shar2_1	attr3_1	sinc3_1	fun3_1	intel3_1	amb3_1	attr5_1	sinc5_1	intel5_1	fun5_1	amb5_1	dec	attr	sinc	intel	fun	amb	shar	like	prob	met	match_es	attr1_s	sinc1_s	intel1_s	fun1_s	amb1_s	shar1_s	attr3_s	sinc3_s	intel3_s	fun3_s	amb3_s	satis_2	length	numdat_2	attr7_2	sinc7_2	intel7_2	fun7_2	amb7_2	shar7_2	attr1_2	sinc1_2	intel1_2	fun1_2	amb1_2	shar1_2	attr4_2	sinc4_2	intel4_2	fun4_2	amb4_2	shar4_2	attr2_2	sinc2_2	intel2_2	fun2_2	amb2_2	shar2_2	attr3_2	sinc3_2	intel3_2	fun3_2	amb3_2	attr5_2	sinc5_2	intel5_2	fun5_2	amb5_2	you_call	them_cal	numdat_3	num_in_3	attr1_3	sinc1_3	intel1_3	fun1_3	amb1_3	shar1_3	attr7_3	sinc7_3	intel7_3	fun7_3	amb7_3	shar7_3	attr4_3	sinc4_3	intel4_3	fun4_3	amb4_3	shar4_3	attr2_3	sinc2_3	intel2_3	fun2_3	amb2_3	shar2_3	attr3_3	sinc3_3	intel3_3	fun3_3	amb3_3	attr5_3	sinc5_3	intel5_3	fun5_3	amb5_3
0	1	1.0	1	1	1	10	7	NaN	4	1	11.0	0	0.14	0	27.0	2.0	35.0	20.0	20.0	20.0	0.0	5.0	0	6.0	8.0	8.0	8.0	8.0	6.0	7.0	4.0	2.0	21.0	Law	1.0	NaN	NaN	NaN	4.0	2.0	4.0	Chicago	60,521	69,487.00	2.0	7.0	1.0	lawyer	NaN	9.0	2.0	8.0	9.0	1.0	1.0	5.0	1.0	5.0	6.0	9.0	1.0	10.0	10.0	9.0	8.0	1.0	3.0	2.0	15.0	20.0	20.0	15.0	15.0	15.0	NaN	NaN	NaN	NaN	NaN	NaN	35.0	20.0	15.0	20.0	5.0	5.0	6.0	8.0	8.0	8.0	7.0	NaN	NaN	NaN	NaN	NaN	1	6.0	9.0	7.0	7.0	6.0	5.0	7.0	6.0	2.0	4.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	6.0	2.0	1.0	NaN	NaN	NaN	NaN	NaN	NaN	19.44	16.67	13.89	22.22	11.11	16.67	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	6.0	7.0	8.0	7.0	6.0	NaN	NaN	NaN	NaN	NaN	1.0	1.0	NaN	NaN	15.0	20.0	20.0	15.0	15.0	15.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	5.0	7.0	7.0	7.0	7.0	NaN	NaN	NaN	NaN	NaN
1	1	1.0	1	1	1	10	7	NaN	3	2	12.0	0	0.54	0	22.0	2.0	60.0	0.0	0.0	40.0	0.0	0.0	0	7.0	8.0	10.0	7.0	7.0	5.0	8.0	4.0	2.0	21.0	Law	1.0	NaN	NaN	NaN	4.0	2.0	4.0	Chicago	60,521	69,487.00	2.0	7.0	1.0	lawyer	NaN	9.0	2.0	8.0	9.0	1.0	1.0	5.0	1.0	5.0	6.0	9.0	1.0	10.0	10.0	9.0	8.0	1.0	3.0	2.0	15.0	20.0	20.0	15.0	15.0	15.0	NaN	NaN	NaN	NaN	NaN	NaN	35.0	20.0	15.0	20.0	5.0	5.0	6.0	8.0	8.0	8.0	7.0	NaN	NaN	NaN	NaN	NaN	1	7.0	8.0	7.0	8.0	5.0	6.0	7.0	5.0	1.0	4.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	6.0	2.0	1.0	NaN	NaN	NaN	NaN	NaN	NaN	19.44	16.67	13.89	22.22	11.11	16.67	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	6.0	7.0	8.0	7.0	6.0	NaN	NaN	NaN	NaN	NaN	1.0	1.0	NaN	NaN	15.0	20.0	20.0	15.0	15.0	15.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	5.0	7.0	7.0	7.0	7.0	NaN	NaN	NaN	NaN	NaN
2	1	1.0	1	1	1	10	7	NaN	10	3	13.0	1	0.16	1	22.0	4.0	19.0	18.0	19.0	18.0	14.0	12.0	1	10.0	10.0	10.0	10.0	10.0	10.0	10.0	10.0	1.0	21.0	Law	1.0	NaN	NaN	NaN	4.0	2.0	4.0	Chicago	60,521	69,487.00	2.0	7.0	1.0	lawyer	NaN	9.0	2.0	8.0	9.0	1.0	1.0	5.0	1.0	5.0	6.0	9.0	1.0	10.0	10.0	9.0	8.0	1.0	3.0	2.0	15.0	20.0	20.0	15.0	15.0	15.0	NaN	NaN	NaN	NaN	NaN	NaN	35.0	20.0	15.0	20.0	5.0	5.0	6.0	8.0	8.0	8.0	7.0	NaN	NaN	NaN	NaN	NaN	1	5.0	8.0	9.0	8.0	5.0	7.0	7.0	NaN	1.0	4.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	6.0	2.0	1.0	NaN	NaN	NaN	NaN	NaN	NaN	19.44	16.67	13.89	22.22	11.11	16.67	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	6.0	7.0	8.0	7.0	6.0	NaN	NaN	NaN	NaN	NaN	1.0	1.0	NaN	NaN	15.0	20.0	20.0	15.0	15.0	15.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	5.0	7.0	7.0	7.0	7.0	NaN	NaN	NaN	NaN	NaN