In [1]:
import os

import numpy as np
import pandas as pd
# Never truncate wide frames: render every column when a DataFrame is shown.
pd.options.display.max_columns = None

In [2]:
# Directory holding the input CSV files. Set the SPEED_DATING_DATA_DIR
# environment variable to point at your own copy of the data; when it is unset
# the original author's local directory is used, so existing runs keep working.
# os.path.join(..., "") guarantees the trailing separator that later cells rely
# on when they build file names with `source_path + "<file>"`.
source_path = os.path.join(
    os.environ.get(
        "SPEED_DATING_DATA_DIR",
        "/Users/sandrapietrowska/Documents/Trainings/luigi/data_source/",
    ),
    "",
)

Import data


In [3]:
# Load the speed-dating CSV from the configured data directory.
# NOTE(review): `field`, `undergra`, `mn_sat`, `tuition`, `from`, `zipcode`,
# `income` and `career` come back as object dtype; at least `income` holds
# values such as "69,487.00", so consider `thousands=","` if numeric columns
# are wanted - confirm before changing.
raw_dataset = pd.read_csv(filepath_or_buffer=source_path + "Speed_Dating_Data.csv")

In [4]:
# Dimensions of the loaded frame as (rows, columns).
raw_dataset.shape


Out[4]:
(8378, 195)

Data exploration

Shape, column types, distributions, category levels (modalities), and potential missing values


In [4]:
# Preview the first three rows whose `iid` equals 11.
raw_dataset.loc[raw_dataset["iid"] == 11].head(3)


Out[4]:
iid id gender idg condtn wave round position positin1 order partner pid match int_corr samerace age_o race_o pf_o_att pf_o_sin pf_o_int pf_o_fun pf_o_amb pf_o_sha dec_o attr_o sinc_o intel_o fun_o amb_o shar_o like_o prob_o met_o age field field_cd undergra mn_sat tuition race imprace imprelig from zipcode income goal date go_out career career_c sports tvsports exercise dining museums art hiking gaming clubbing reading tv theater movies concerts music shopping yoga exphappy expnum attr1_1 sinc1_1 intel1_1 fun1_1 amb1_1 shar1_1 attr4_1 sinc4_1 intel4_1 fun4_1 amb4_1 shar4_1 attr2_1 sinc2_1 intel2_1 fun2_1 amb2_1 shar2_1 attr3_1 sinc3_1 fun3_1 intel3_1 amb3_1 attr5_1 sinc5_1 intel5_1 fun5_1 amb5_1 dec attr sinc intel fun amb shar like prob met match_es attr1_s sinc1_s intel1_s fun1_s amb1_s shar1_s attr3_s sinc3_s intel3_s fun3_s amb3_s satis_2 length numdat_2 attr7_2 sinc7_2 intel7_2 fun7_2 amb7_2 shar7_2 attr1_2 sinc1_2 intel1_2 fun1_2 amb1_2 shar1_2 attr4_2 sinc4_2 intel4_2 fun4_2 amb4_2 shar4_2 attr2_2 sinc2_2 intel2_2 fun2_2 amb2_2 shar2_2 attr3_2 sinc3_2 intel3_2 fun3_2 amb3_2 attr5_2 sinc5_2 intel5_2 fun5_2 amb5_2 you_call them_cal date_3 numdat_3 num_in_3 attr1_3 sinc1_3 intel1_3 fun1_3 amb1_3 shar1_3 attr7_3 sinc7_3 intel7_3 fun7_3 amb7_3 shar7_3 attr4_3 sinc4_3 intel4_3 fun4_3 amb4_3 shar4_3 attr2_3 sinc2_3 intel2_3 fun2_3 amb2_3 shar2_3 attr3_3 sinc3_3 intel3_3 fun3_3 amb3_3 attr5_3 sinc5_3 intel5_3 fun5_3 amb5_3
100 11 1.0 1 2 1 1 10 7 NaN 4 1 1.0 0 0.14 0 21.0 4.0 15.0 20.0 20.0 15.0 15.0 15.0 1 6.0 9.0 7.0 7.0 6.0 5.0 7.0 6.0 2.0 27.0 Finance 8.0 NaN NaN NaN 2.0 7.0 3.0 Argentina 0 NaN 1.0 5.0 4.0 Academia, Research, Banking, Life 2.0 8.0 7.0 2.0 6.0 7.0 5.0 5.0 5.0 4.0 9.0 2.0 4.0 8.0 7.0 8.0 5.0 1.0 7.0 3.0 35.0 20.0 20.0 20.0 0.0 5.0 NaN NaN NaN NaN NaN NaN 25.0 5.0 20.0 20.0 25.0 5.0 8.0 9.0 7.0 8.0 5.0 NaN NaN NaN NaN NaN 0 6.0 8.0 8.0 8.0 8.0 6.0 7.0 4.0 2.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 7.0 2.0 NaN NaN NaN NaN NaN NaN NaN 19.51 17.07 17.07 17.07 12.2 17.07 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 8.0 8.0 8.0 6.0 5.0 NaN NaN NaN NaN NaN 0.0 0.0 0.0 NaN NaN 35.0 25.0 15.0 15.0 0.0 10.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 7.0 8.0 7.0 7.0 5.0 NaN NaN NaN NaN NaN
101 11 1.0 1 2 1 1 10 3 NaN 10 2 2.0 0 0.29 1 24.0 2.0 45.0 5.0 25.0 20.0 0.0 5.0 0 5.0 7.0 8.0 4.0 6.0 3.0 6.0 4.0 2.0 27.0 Finance 8.0 NaN NaN NaN 2.0 7.0 3.0 Argentina 0 NaN 1.0 5.0 4.0 Academia, Research, Banking, Life 2.0 8.0 7.0 2.0 6.0 7.0 5.0 5.0 5.0 4.0 9.0 2.0 4.0 8.0 7.0 8.0 5.0 1.0 7.0 3.0 35.0 20.0 20.0 20.0 0.0 5.0 NaN NaN NaN NaN NaN NaN 25.0 5.0 20.0 20.0 25.0 5.0 8.0 9.0 7.0 8.0 5.0 NaN NaN NaN NaN NaN 0 8.0 7.0 6.0 9.0 7.0 4.0 7.0 2.0 2.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 7.0 2.0 NaN NaN NaN NaN NaN NaN NaN 19.51 17.07 17.07 17.07 12.2 17.07 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 8.0 8.0 8.0 6.0 5.0 NaN NaN NaN NaN NaN 0.0 0.0 0.0 NaN NaN 35.0 25.0 15.0 15.0 0.0 10.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 7.0 8.0 7.0 7.0 5.0 NaN NaN NaN NaN NaN
102 11 1.0 1 2 1 1 10 9 NaN 6 3 3.0 0 -0.24 1 25.0 2.0 35.0 10.0 35.0 10.0 10.0 0.0 0 7.0 9.0 10.0 7.0 8.0 9.0 8.0 7.0 1.0 27.0 Finance 8.0 NaN NaN NaN 2.0 7.0 3.0 Argentina 0 NaN 1.0 5.0 4.0 Academia, Research, Banking, Life 2.0 8.0 7.0 2.0 6.0 7.0 5.0 5.0 5.0 4.0 9.0 2.0 4.0 8.0 7.0 8.0 5.0 1.0 7.0 3.0 35.0 20.0 20.0 20.0 0.0 5.0 NaN NaN NaN NaN NaN NaN 25.0 5.0 20.0 20.0 25.0 5.0 8.0 9.0 7.0 8.0 5.0 NaN NaN NaN NaN NaN 0 7.0 8.0 6.0 5.0 8.0 4.0 5.0 2.0 1.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 7.0 2.0 NaN NaN NaN NaN NaN NaN NaN 19.51 17.07 17.07 17.07 12.2 17.07 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 8.0 8.0 8.0 6.0 5.0 NaN NaN NaN NaN NaN 0.0 0.0 0.0 NaN NaN 35.0 25.0 15.0 15.0 0.0 10.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 7.0 8.0 7.0 7.0 5.0 NaN NaN NaN NaN NaN

In [24]:
# Take a real, independent copy of the frame for the right-hand side of the
# self-merge. A plain assignment would only bind a second name to the same
# DataFrame object, so any later in-place change made through one name would
# silently show up under the other.
raw_dataset_copy = raw_dataset.copy()

In [34]:
# Self-join: `pid` on the left is matched against `iid` on the right, so each
# left row is paired with the rows of the person it names as partner.
# Overlapping column names receive pandas' default `_x` (left) and `_y`
# (right) suffixes.
# NOTE(review): `iid` repeats across rows, so every left row matches all rows
# of its partner (many-to-many fan-out) - confirm this is intended.
merged_datasets = pd.merge(
    raw_dataset,
    raw_dataset_copy,
    how="inner",
    left_on="pid",
    right_on="iid",
)

In [36]:
# Preview the first three rows of the self-joined frame.
merged_datasets.iloc[:3]


Out[36]:
iid_x id_x gender_x idg_x condtn_x wave_x round_x position_x positin1_x order_x partner_x pid_x match_x int_corr_x samerace_x age_o_x race_o_x pf_o_att_x pf_o_sin_x pf_o_int_x pf_o_fun_x pf_o_amb_x pf_o_sha_x dec_o_x attr_o_x sinc_o_x intel_o_x fun_o_x amb_o_x shar_o_x like_o_x prob_o_x met_o_x age_x field_x field_cd_x undergra_x mn_sat_x tuition_x race_x imprace_x imprelig_x from_x zipcode_x income_x goal_x date_x go_out_x career_x career_c_x sports_x tvsports_x exercise_x dining_x museums_x art_x hiking_x gaming_x clubbing_x reading_x tv_x theater_x movies_x concerts_x music_x shopping_x yoga_x exphappy_x expnum_x attr1_1_x sinc1_1_x intel1_1_x fun1_1_x amb1_1_x shar1_1_x attr4_1_x sinc4_1_x intel4_1_x fun4_1_x amb4_1_x shar4_1_x attr2_1_x sinc2_1_x intel2_1_x fun2_1_x amb2_1_x shar2_1_x attr3_1_x sinc3_1_x fun3_1_x intel3_1_x amb3_1_x attr5_1_x sinc5_1_x intel5_1_x fun5_1_x amb5_1_x dec_x attr_x sinc_x intel_x fun_x amb_x shar_x like_x prob_x met_x match_es_x attr1_s_x sinc1_s_x intel1_s_x fun1_s_x amb1_s_x shar1_s_x attr3_s_x sinc3_s_x intel3_s_x fun3_s_x amb3_s_x satis_2_x length_x numdat_2_x attr7_2_x sinc7_2_x intel7_2_x fun7_2_x amb7_2_x shar7_2_x attr1_2_x sinc1_2_x intel1_2_x fun1_2_x amb1_2_x shar1_2_x attr4_2_x sinc4_2_x intel4_2_x fun4_2_x amb4_2_x shar4_2_x attr2_2_x sinc2_2_x intel2_2_x fun2_2_x amb2_2_x shar2_2_x attr3_2_x sinc3_2_x intel3_2_x fun3_2_x amb3_2_x attr5_2_x sinc5_2_x intel5_2_x fun5_2_x amb5_2_x you_call_x them_cal_x date_3_x numdat_3_x num_in_3_x attr1_3_x sinc1_3_x intel1_3_x fun1_3_x amb1_3_x shar1_3_x attr7_3_x sinc7_3_x intel7_3_x fun7_3_x amb7_3_x shar7_3_x attr4_3_x sinc4_3_x intel4_3_x fun4_3_x amb4_3_x shar4_3_x attr2_3_x sinc2_3_x intel2_3_x fun2_3_x amb2_3_x shar2_3_x attr3_3_x sinc3_3_x intel3_3_x fun3_3_x amb3_3_x attr5_3_x sinc5_3_x intel5_3_x fun5_3_x amb5_3_x iid_y id_y gender_y idg_y condtn_y wave_y round_y position_y positin1_y order_y partner_y pid_y match_y int_corr_y samerace_y age_o_y race_o_y pf_o_att_y 
pf_o_sin_y pf_o_int_y pf_o_fun_y pf_o_amb_y pf_o_sha_y dec_o_y attr_o_y sinc_o_y intel_o_y fun_o_y amb_o_y shar_o_y like_o_y prob_o_y met_o_y age_y field_y field_cd_y undergra_y mn_sat_y tuition_y race_y imprace_y imprelig_y from_y zipcode_y income_y goal_y date_y go_out_y career_y career_c_y sports_y tvsports_y exercise_y dining_y museums_y art_y hiking_y gaming_y clubbing_y reading_y tv_y theater_y movies_y concerts_y music_y shopping_y yoga_y exphappy_y expnum_y attr1_1_y sinc1_1_y intel1_1_y fun1_1_y amb1_1_y shar1_1_y attr4_1_y sinc4_1_y intel4_1_y fun4_1_y amb4_1_y shar4_1_y attr2_1_y sinc2_1_y intel2_1_y fun2_1_y amb2_1_y shar2_1_y attr3_1_y sinc3_1_y fun3_1_y intel3_1_y amb3_1_y attr5_1_y sinc5_1_y intel5_1_y fun5_1_y amb5_1_y dec_y attr_y sinc_y intel_y fun_y amb_y shar_y like_y prob_y met_y match_es_y attr1_s_y sinc1_s_y intel1_s_y fun1_s_y amb1_s_y shar1_s_y attr3_s_y sinc3_s_y intel3_s_y fun3_s_y amb3_s_y satis_2_y length_y numdat_2_y attr7_2_y sinc7_2_y intel7_2_y fun7_2_y amb7_2_y shar7_2_y attr1_2_y sinc1_2_y intel1_2_y fun1_2_y amb1_2_y shar1_2_y attr4_2_y sinc4_2_y intel4_2_y fun4_2_y amb4_2_y shar4_2_y attr2_2_y sinc2_2_y intel2_2_y fun2_2_y amb2_2_y shar2_2_y attr3_2_y sinc3_2_y intel3_2_y fun3_2_y amb3_2_y attr5_2_y sinc5_2_y intel5_2_y fun5_2_y amb5_2_y you_call_y them_cal_y date_3_y numdat_3_y num_in_3_y attr1_3_y sinc1_3_y intel1_3_y fun1_3_y amb1_3_y shar1_3_y attr7_3_y sinc7_3_y intel7_3_y fun7_3_y amb7_3_y shar7_3_y attr4_3_y sinc4_3_y intel4_3_y fun4_3_y amb4_3_y shar4_3_y attr2_3_y sinc2_3_y intel2_3_y fun2_3_y amb2_3_y shar2_3_y attr3_3_y sinc3_3_y intel3_3_y fun3_3_y amb3_3_y attr5_3_y sinc5_3_y intel5_3_y fun5_3_y amb5_3_y
0 1 1.0 0 1 1 1 10 7 NaN 4 1 11.0 0 0.14 0 27.0 2.0 35.0 20.0 20.0 20.0 0.0 5.0 0 6.0 8.0 8.0 8.0 8.0 6.0 7.0 4.0 2.0 21.0 Law 1.0 NaN NaN NaN 4.0 2.0 4.0 Chicago 60,521 69,487.00 2.0 7.0 1.0 lawyer NaN 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 3.0 2.0 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN 35.0 20.0 15.0 20.0 5.0 5.0 6.0 8.0 8.0 8.0 7.0 NaN NaN NaN NaN NaN 1 6.0 9.0 7.0 7.0 6.0 5.0 7.0 6.0 2.0 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 2.0 1.0 NaN NaN NaN NaN NaN NaN 19.44 16.67 13.89 22.22 11.11 16.67 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 7.0 8.0 7.0 6.0 NaN NaN NaN NaN NaN 1.0 1.0 0.0 NaN NaN 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.0 7.0 7.0 7.0 7.0 NaN NaN NaN NaN NaN 11 1.0 1 2 1 1 10 7 NaN 4 1 1.0 0 0.14 0 21.0 4.0 15.0 20.0 20.0 15.0 15.0 15.0 1 6.0 9.0 7.0 7.0 6.0 5.0 7.0 6.0 2.0 27.0 Finance 8.0 NaN NaN NaN 2.0 7.0 3.0 Argentina 0 NaN 1.0 5.0 4.0 Academia, Research, Banking, Life 2.0 8.0 7.0 2.0 6.0 7.0 5.0 5.0 5.0 4.0 9.0 2.0 4.0 8.0 7.0 8.0 5.0 1.0 7.0 3.0 35.0 20.0 20.0 20.0 0.0 5.0 NaN NaN NaN NaN NaN NaN 25.0 5.0 20.0 20.0 25.0 5.0 8.0 9.0 7.0 8.0 5.0 NaN NaN NaN NaN NaN 0 6.0 8.0 8.0 8.0 8.0 6.0 7.0 4.0 2.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 7.0 2.0 NaN NaN NaN NaN NaN NaN NaN 19.51 17.07 17.07 17.07 12.2 17.07 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 8.0 8.0 8.0 6.0 5.0 NaN NaN NaN NaN NaN 0.0 0.0 0.0 NaN NaN 35.0 25.0 15.0 15.0 0.0 10.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 7.0 8.0 7.0 7.0 5.0 NaN NaN NaN NaN NaN
1 1 1.0 0 1 1 1 10 7 NaN 4 1 11.0 0 0.14 0 27.0 2.0 35.0 20.0 20.0 20.0 0.0 5.0 0 6.0 8.0 8.0 8.0 8.0 6.0 7.0 4.0 2.0 21.0 Law 1.0 NaN NaN NaN 4.0 2.0 4.0 Chicago 60,521 69,487.00 2.0 7.0 1.0 lawyer NaN 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 3.0 2.0 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN 35.0 20.0 15.0 20.0 5.0 5.0 6.0 8.0 8.0 8.0 7.0 NaN NaN NaN NaN NaN 1 6.0 9.0 7.0 7.0 6.0 5.0 7.0 6.0 2.0 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 2.0 1.0 NaN NaN NaN NaN NaN NaN 19.44 16.67 13.89 22.22 11.11 16.67 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 7.0 8.0 7.0 6.0 NaN NaN NaN NaN NaN 1.0 1.0 0.0 NaN NaN 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.0 7.0 7.0 7.0 7.0 NaN NaN NaN NaN NaN 11 1.0 1 2 1 1 10 3 NaN 10 2 2.0 0 0.29 1 24.0 2.0 45.0 5.0 25.0 20.0 0.0 5.0 0 5.0 7.0 8.0 4.0 6.0 3.0 6.0 4.0 2.0 27.0 Finance 8.0 NaN NaN NaN 2.0 7.0 3.0 Argentina 0 NaN 1.0 5.0 4.0 Academia, Research, Banking, Life 2.0 8.0 7.0 2.0 6.0 7.0 5.0 5.0 5.0 4.0 9.0 2.0 4.0 8.0 7.0 8.0 5.0 1.0 7.0 3.0 35.0 20.0 20.0 20.0 0.0 5.0 NaN NaN NaN NaN NaN NaN 25.0 5.0 20.0 20.0 25.0 5.0 8.0 9.0 7.0 8.0 5.0 NaN NaN NaN NaN NaN 0 8.0 7.0 6.0 9.0 7.0 4.0 7.0 2.0 2.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 7.0 2.0 NaN NaN NaN NaN NaN NaN NaN 19.51 17.07 17.07 17.07 12.2 17.07 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 8.0 8.0 8.0 6.0 5.0 NaN NaN NaN NaN NaN 0.0 0.0 0.0 NaN NaN 35.0 25.0 15.0 15.0 0.0 10.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 7.0 8.0 7.0 7.0 5.0 NaN NaN NaN NaN NaN
2 1 1.0 0 1 1 1 10 7 NaN 4 1 11.0 0 0.14 0 27.0 2.0 35.0 20.0 20.0 20.0 0.0 5.0 0 6.0 8.0 8.0 8.0 8.0 6.0 7.0 4.0 2.0 21.0 Law 1.0 NaN NaN NaN 4.0 2.0 4.0 Chicago 60,521 69,487.00 2.0 7.0 1.0 lawyer NaN 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 3.0 2.0 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN 35.0 20.0 15.0 20.0 5.0 5.0 6.0 8.0 8.0 8.0 7.0 NaN NaN NaN NaN NaN 1 6.0 9.0 7.0 7.0 6.0 5.0 7.0 6.0 2.0 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 2.0 1.0 NaN NaN NaN NaN NaN NaN 19.44 16.67 13.89 22.22 11.11 16.67 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 7.0 8.0 7.0 6.0 NaN NaN NaN NaN NaN 1.0 1.0 0.0 NaN NaN 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.0 7.0 7.0 7.0 7.0 NaN NaN NaN NaN NaN 11 1.0 1 2 1 1 10 9 NaN 6 3 3.0 0 -0.24 1 25.0 2.0 35.0 10.0 35.0 10.0 10.0 0.0 0 7.0 9.0 10.0 7.0 8.0 9.0 8.0 7.0 1.0 27.0 Finance 8.0 NaN NaN NaN 2.0 7.0 3.0 Argentina 0 NaN 1.0 5.0 4.0 Academia, Research, Banking, Life 2.0 8.0 7.0 2.0 6.0 7.0 5.0 5.0 5.0 4.0 9.0 2.0 4.0 8.0 7.0 8.0 5.0 1.0 7.0 3.0 35.0 20.0 20.0 20.0 0.0 5.0 NaN NaN NaN NaN NaN NaN 25.0 5.0 20.0 20.0 25.0 5.0 8.0 9.0 7.0 8.0 5.0 NaN NaN NaN NaN NaN 0 7.0 8.0 6.0 5.0 8.0 4.0 5.0 2.0 1.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 7.0 2.0 NaN NaN NaN NaN NaN NaN NaN 19.51 17.07 17.07 17.07 12.2 17.07 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 8.0 8.0 8.0 6.0 5.0 NaN NaN NaN NaN NaN 0.0 0.0 0.0 NaN NaN 35.0 25.0 15.0 15.0 0.0 10.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 7.0 8.0 7.0 7.0 5.0 NaN NaN NaN NaN NaN

In [37]:
# Side-by-side look at the id/gender columns from both halves of the join.
# NOTE(review): `pid_y` is the `pid` column of the right-hand rows, not the
# right-hand person's own id (`iid_y`) - confirm which one was meant.
merged_datasets.loc[:, ["iid_x", "gender_x", "pid_y", "gender_y"]].head(5)


Out[37]:
iid_x gender_x pid_y gender_y
0 1 0 1.0 1
1 1 0 2.0 1
2 1 0 3.0 1
3 1 0 4.0 1
4 1 0 5.0 1

In [39]:
# Keep only merged rows where both sides carry the same `gender` value,
# then preview the result.
same_gender = merged_datasets.loc[
    merged_datasets["gender_x"].eq(merged_datasets["gender_y"])
]
same_gender.head()


Out[39]:
iid_x id_x gender_x idg_x condtn_x wave_x round_x position_x positin1_x order_x partner_x pid_x match_x int_corr_x samerace_x age_o_x race_o_x pf_o_att_x pf_o_sin_x pf_o_int_x pf_o_fun_x pf_o_amb_x pf_o_sha_x dec_o_x attr_o_x sinc_o_x intel_o_x fun_o_x amb_o_x shar_o_x like_o_x prob_o_x met_o_x age_x field_x field_cd_x undergra_x mn_sat_x tuition_x race_x imprace_x imprelig_x from_x zipcode_x income_x goal_x date_x go_out_x career_x career_c_x sports_x tvsports_x exercise_x dining_x museums_x art_x hiking_x gaming_x clubbing_x reading_x tv_x theater_x movies_x concerts_x music_x shopping_x yoga_x exphappy_x expnum_x attr1_1_x sinc1_1_x intel1_1_x fun1_1_x amb1_1_x shar1_1_x attr4_1_x sinc4_1_x intel4_1_x fun4_1_x amb4_1_x shar4_1_x attr2_1_x sinc2_1_x intel2_1_x fun2_1_x amb2_1_x shar2_1_x attr3_1_x sinc3_1_x fun3_1_x intel3_1_x amb3_1_x attr5_1_x sinc5_1_x intel5_1_x fun5_1_x amb5_1_x dec_x attr_x sinc_x intel_x fun_x amb_x shar_x like_x prob_x met_x match_es_x attr1_s_x sinc1_s_x intel1_s_x fun1_s_x amb1_s_x shar1_s_x attr3_s_x sinc3_s_x intel3_s_x fun3_s_x amb3_s_x satis_2_x length_x numdat_2_x attr7_2_x sinc7_2_x intel7_2_x fun7_2_x amb7_2_x shar7_2_x attr1_2_x sinc1_2_x intel1_2_x fun1_2_x amb1_2_x shar1_2_x attr4_2_x sinc4_2_x intel4_2_x fun4_2_x amb4_2_x shar4_2_x attr2_2_x sinc2_2_x intel2_2_x fun2_2_x amb2_2_x shar2_2_x attr3_2_x sinc3_2_x intel3_2_x fun3_2_x amb3_2_x attr5_2_x sinc5_2_x intel5_2_x fun5_2_x amb5_2_x you_call_x them_cal_x date_3_x numdat_3_x num_in_3_x attr1_3_x sinc1_3_x intel1_3_x fun1_3_x amb1_3_x shar1_3_x attr7_3_x sinc7_3_x intel7_3_x fun7_3_x amb7_3_x shar7_3_x attr4_3_x sinc4_3_x intel4_3_x fun4_3_x amb4_3_x shar4_3_x attr2_3_x sinc2_3_x intel2_3_x fun2_3_x amb2_3_x shar2_3_x attr3_3_x sinc3_3_x intel3_3_x fun3_3_x amb3_3_x attr5_3_x sinc5_3_x intel5_3_x fun5_3_x amb5_3_x iid_y id_y gender_y idg_y condtn_y wave_y round_y position_y positin1_y order_y partner_y pid_y match_y int_corr_y samerace_y age_o_y race_o_y pf_o_att_y 
pf_o_sin_y pf_o_int_y pf_o_fun_y pf_o_amb_y pf_o_sha_y dec_o_y attr_o_y sinc_o_y intel_o_y fun_o_y amb_o_y shar_o_y like_o_y prob_o_y met_o_y age_y field_y field_cd_y undergra_y mn_sat_y tuition_y race_y imprace_y imprelig_y from_y zipcode_y income_y goal_y date_y go_out_y career_y career_c_y sports_y tvsports_y exercise_y dining_y museums_y art_y hiking_y gaming_y clubbing_y reading_y tv_y theater_y movies_y concerts_y music_y shopping_y yoga_y exphappy_y expnum_y attr1_1_y sinc1_1_y intel1_1_y fun1_1_y amb1_1_y shar1_1_y attr4_1_y sinc4_1_y intel4_1_y fun4_1_y amb4_1_y shar4_1_y attr2_1_y sinc2_1_y intel2_1_y fun2_1_y amb2_1_y shar2_1_y attr3_1_y sinc3_1_y fun3_1_y intel3_1_y amb3_1_y attr5_1_y sinc5_1_y intel5_1_y fun5_1_y amb5_1_y dec_y attr_y sinc_y intel_y fun_y amb_y shar_y like_y prob_y met_y match_es_y attr1_s_y sinc1_s_y intel1_s_y fun1_s_y amb1_s_y shar1_s_y attr3_s_y sinc3_s_y intel3_s_y fun3_s_y amb3_s_y satis_2_y length_y numdat_2_y attr7_2_y sinc7_2_y intel7_2_y fun7_2_y amb7_2_y shar7_2_y attr1_2_y sinc1_2_y intel1_2_y fun1_2_y amb1_2_y shar1_2_y attr4_2_y sinc4_2_y intel4_2_y fun4_2_y amb4_2_y shar4_2_y attr2_2_y sinc2_2_y intel2_2_y fun2_2_y amb2_2_y shar2_2_y attr3_2_y sinc3_2_y intel3_2_y fun3_2_y amb3_2_y attr5_2_y sinc5_2_y intel5_2_y fun5_2_y amb5_2_y you_call_y them_cal_y date_3_y numdat_3_y num_in_3_y attr1_3_y sinc1_3_y intel1_3_y fun1_3_y amb1_3_y shar1_3_y attr7_3_y sinc7_3_y intel7_3_y fun7_3_y amb7_3_y shar7_3_y attr4_3_y sinc4_3_y intel4_3_y fun4_3_y amb4_3_y shar4_3_y attr2_3_y sinc2_3_y intel2_3_y fun2_3_y amb2_3_y shar2_3_y attr3_3_y sinc3_3_y intel3_3_y fun3_3_y amb3_3_y attr5_3_y sinc5_3_y intel5_3_y fun5_3_y amb5_3_y

In [5]:
# Map each dtype present in the frame to the labels of the columns that use it.
columns_by_types = (
    raw_dataset.columns
    .to_series()
    .groupby(by=raw_dataset.dtypes)
    .groups
)

In [6]:
# Show the dtype -> column labels mapping.
columns_by_types


Out[6]:
{dtype('int64'): ['iid',
  'gender',
  'idg',
  'condtn',
  'wave',
  'round',
  'position',
  'order',
  'partner',
  'match',
  'samerace',
  'dec_o',
  'dec'],
 dtype('float64'): ['id',
  'positin1',
  'pid',
  'int_corr',
  'age_o',
  'race_o',
  'pf_o_att',
  'pf_o_sin',
  'pf_o_int',
  'pf_o_fun',
  'pf_o_amb',
  'pf_o_sha',
  'attr_o',
  'sinc_o',
  'intel_o',
  'fun_o',
  'amb_o',
  'shar_o',
  'like_o',
  'prob_o',
  'met_o',
  'age',
  'field_cd',
  'race',
  'imprace',
  'imprelig',
  'goal',
  'date',
  'go_out',
  'career_c',
  'sports',
  'tvsports',
  'exercise',
  'dining',
  'museums',
  'art',
  'hiking',
  'gaming',
  'clubbing',
  'reading',
  'tv',
  'theater',
  'movies',
  'concerts',
  'music',
  'shopping',
  'yoga',
  'exphappy',
  'expnum',
  'attr1_1',
  'sinc1_1',
  'intel1_1',
  'fun1_1',
  'amb1_1',
  'shar1_1',
  'attr4_1',
  'sinc4_1',
  'intel4_1',
  'fun4_1',
  'amb4_1',
  'shar4_1',
  'attr2_1',
  'sinc2_1',
  'intel2_1',
  'fun2_1',
  'amb2_1',
  'shar2_1',
  'attr3_1',
  'sinc3_1',
  'fun3_1',
  'intel3_1',
  'amb3_1',
  'attr5_1',
  'sinc5_1',
  'intel5_1',
  'fun5_1',
  'amb5_1',
  'attr',
  'sinc',
  'intel',
  'fun',
  'amb',
  'shar',
  'like',
  'prob',
  'met',
  'match_es',
  'attr1_s',
  'sinc1_s',
  'intel1_s',
  'fun1_s',
  'amb1_s',
  'shar1_s',
  'attr3_s',
  'sinc3_s',
  'intel3_s',
  'fun3_s',
  'amb3_s',
  'satis_2',
  'length',
  'numdat_2',
  'attr7_2',
  'sinc7_2',
  'intel7_2',
  'fun7_2',
  'amb7_2',
  'shar7_2',
  'attr1_2',
  'sinc1_2',
  'intel1_2',
  'fun1_2',
  'amb1_2',
  'shar1_2',
  'attr4_2',
  'sinc4_2',
  'intel4_2',
  'fun4_2',
  'amb4_2',
  'shar4_2',
  'attr2_2',
  'sinc2_2',
  'intel2_2',
  'fun2_2',
  'amb2_2',
  'shar2_2',
  'attr3_2',
  'sinc3_2',
  'intel3_2',
  'fun3_2',
  'amb3_2',
  'attr5_2',
  'sinc5_2',
  'intel5_2',
  'fun5_2',
  'amb5_2',
  'you_call',
  'them_cal',
  'date_3',
  'numdat_3',
  'num_in_3',
  'attr1_3',
  'sinc1_3',
  'intel1_3',
  'fun1_3',
  'amb1_3',
  'shar1_3',
  'attr7_3',
  'sinc7_3',
  'intel7_3',
  'fun7_3',
  'amb7_3',
  'shar7_3',
  'attr4_3',
  'sinc4_3',
  'intel4_3',
  'fun4_3',
  'amb4_3',
  'shar4_3',
  'attr2_3',
  'sinc2_3',
  'intel2_3',
  'fun2_3',
  'amb2_3',
  'shar2_3',
  'attr3_3',
  'sinc3_3',
  'intel3_3',
  'fun3_3',
  'amb3_3',
  'attr5_3',
  'sinc5_3',
  'intel5_3',
  'fun5_3',
  'amb5_3'],
 dtype('O'): ['field',
  'undergra',
  'mn_sat',
  'tuition',
  'from',
  'zipcode',
  'income',
  'career']}

In [7]:
# Count how many columns there are of each dtype.
raw_dataset.dtypes.value_counts()


Out[7]:
float64    174
int64       13
object       8
dtype: int64

In [19]:
# Number of missing values per column, limited to the first 50 columns.
raw_dataset.isnull().sum(axis=0).head(n=50)


Out[19]:
iid            0
id             1
gender         0
idg            0
condtn         0
wave           0
round          0
position       0
positin1    1846
order          0
partner        0
pid           10
match          0
int_corr     158
samerace       0
age_o        104
race_o        73
pf_o_att      89
pf_o_sin      89
pf_o_int      89
pf_o_fun      98
pf_o_amb     107
pf_o_sha     129
dec_o          0
attr_o       212
sinc_o       287
intel_o      306
fun_o        360
amb_o        722
shar_o      1076
like_o       250
prob_o       318
met_o        385
age           95
field         63
field_cd      82
undergra    3464
mn_sat      5245
tuition     4795
race          63
imprace       79
imprelig      79
from          79
zipcode     1064
income      4099
goal          79
date          97
go_out        79
career        89
career_c     138
dtype: int64

In [9]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the raw frame.
summary = raw_dataset.describe()
# Parenthesised call form: runs unchanged on Python 2 and Python 3, whereas
# the bare `print summary` statement is a SyntaxError on Python 3.
print(summary)


               iid           id       gender          idg       condtn  \
count  8378.000000  8377.000000  8378.000000  8378.000000  8378.000000   
mean    283.675937     8.960248     0.500597    17.327166     1.828837   
std     158.583367     5.491329     0.500029    10.940735     0.376673   
min       1.000000     1.000000     0.000000     1.000000     1.000000   
25%     154.000000     4.000000     0.000000     8.000000     2.000000   
50%     281.000000     8.000000     1.000000    16.000000     2.000000   
75%     407.000000    13.000000     1.000000    26.000000     2.000000   
max     552.000000    22.000000     1.000000    44.000000     2.000000   

              wave        round     position     positin1        order  \
count  8378.000000  8378.000000  8378.000000  6532.000000  8378.000000   
mean     11.350919    16.872046     9.042731     9.295775     8.927668   
std       5.995903     4.358458     5.514939     5.650199     5.477009   
min       1.000000     5.000000     1.000000     1.000000     1.000000   
25%       7.000000    14.000000     4.000000     4.000000     4.000000   
50%      11.000000    18.000000     8.000000     9.000000     8.000000   
75%      15.000000    20.000000    13.000000    14.000000    13.000000   
max      21.000000    22.000000    22.000000    22.000000    22.000000   

           partner          pid        match     int_corr     samerace  \
count  8378.000000  8368.000000  8378.000000  8220.000000  8378.000000   
mean      8.963595   283.863767     0.164717     0.196010     0.395799   
std       5.491068   158.584899     0.370947     0.303539     0.489051   
min       1.000000     1.000000     0.000000    -0.830000     0.000000   
25%       4.000000   154.000000     0.000000    -0.020000     0.000000   
50%       8.000000   281.000000     0.000000     0.210000     0.000000   
75%      13.000000   408.000000     0.000000     0.430000     1.000000   
max      22.000000   552.000000     1.000000     0.910000     1.000000   

             age_o       race_o     pf_o_att     pf_o_sin     pf_o_int  \
count  8274.000000  8305.000000  8289.000000  8289.000000  8289.000000   
mean     26.364999     2.756653    22.495347    17.396867    20.270759   
std       3.563648     1.230689    12.569802     7.044003     6.782895   
min      18.000000     1.000000     0.000000     0.000000     0.000000   
25%      24.000000     2.000000    15.000000    15.000000    17.390000   
50%      26.000000     2.000000    20.000000    18.370000    20.000000   
75%      28.000000     4.000000    25.000000    20.000000    23.810000   
max      55.000000     6.000000   100.000000    60.000000    50.000000   

          pf_o_fun     pf_o_amb     pf_o_sha        dec_o       attr_o  \
count  8280.000000  8271.000000  8249.000000  8378.000000  8166.000000   
mean     17.459714    10.685375    11.845930     0.419551     6.190411   
std       6.085526     6.126544     6.362746     0.493515     1.950305   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%      15.000000     5.000000     9.520000     0.000000     5.000000   
50%      18.000000    10.000000    10.640000     0.000000     6.000000   
75%      20.000000    15.000000    16.000000     1.000000     8.000000   
max      50.000000    53.000000    30.000000     1.000000    10.500000   

            sinc_o      intel_o        fun_o        amb_o       shar_o  \
count  8091.000000  8072.000000  8018.000000  7656.000000  7302.000000   
mean      7.175256     7.369301     6.400599     6.778409     5.474870   
std       1.740575     1.550501     1.954078     1.794080     2.156163   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       6.000000     6.000000     5.000000     6.000000     4.000000   
50%       7.000000     7.000000     7.000000     7.000000     6.000000   
75%       8.000000     8.000000     8.000000     8.000000     7.000000   
max      10.000000    10.000000    11.000000    10.000000    10.000000   

            like_o       prob_o        met_o          age     field_cd  \
count  8128.000000  8060.000000  7993.000000  8283.000000  8296.000000   
mean      6.134498     5.208251     1.960215    26.358928     7.662488   
std       1.841258     2.129354     0.245925     3.566763     3.758935   
min       0.000000     0.000000     1.000000    18.000000     1.000000   
25%       5.000000     4.000000     2.000000    24.000000     5.000000   
50%       6.000000     5.000000     2.000000    26.000000     8.000000   
75%       7.000000     7.000000     2.000000    28.000000    10.000000   
max      10.000000    10.000000     8.000000    55.000000    18.000000   

              race      imprace     imprelig         goal         date  \
count  8315.000000  8299.000000  8299.000000  8299.000000  8281.000000   
mean      2.757186     3.784793     3.651645     2.122063     5.006762   
std       1.230905     2.845708     2.805237     1.407181     1.444531   
min       1.000000     0.000000     1.000000     1.000000     1.000000   
25%       2.000000     1.000000     1.000000     1.000000     4.000000   
50%       2.000000     3.000000     3.000000     2.000000     5.000000   
75%       4.000000     6.000000     6.000000     2.000000     6.000000   
max       6.000000    10.000000    10.000000     6.000000     7.000000   

            go_out     career_c       sports     tvsports     exercise  \
count  8299.000000  8240.000000  8299.000000  8299.000000  8299.000000   
mean      2.158091     5.277791     6.425232     4.575491     6.245813   
std       1.105246     3.309520     2.619024     2.801874     2.418858   
min       1.000000     1.000000     1.000000     1.000000     1.000000   
25%       1.000000     2.000000     4.000000     2.000000     5.000000   
50%       2.000000     6.000000     7.000000     4.000000     6.000000   
75%       3.000000     7.000000     9.000000     7.000000     8.000000   
max       7.000000    17.000000    10.000000    10.000000    10.000000   

            dining      museums          art       hiking       gaming  \
count  8299.000000  8299.000000  8299.000000  8299.000000  8299.000000   
mean      7.783829     6.985781     6.714544     5.737077     3.881191   
std       1.754868     2.052232     2.263407     2.570207     2.620507   
min       1.000000     0.000000     0.000000     0.000000     0.000000   
25%       7.000000     6.000000     5.000000     4.000000     2.000000   
50%       8.000000     7.000000     7.000000     6.000000     3.000000   
75%       9.000000     9.000000     8.000000     8.000000     6.000000   
max      10.000000    10.000000    10.000000    10.000000    14.000000   

          clubbing      reading           tv      theater       movies  \
count  8299.000000  8299.000000  8299.000000  8299.000000  8299.000000   
mean      5.745993     7.678515     5.304133     6.776118     7.919629   
std       2.502218     2.006565     2.529135     2.235152     1.700927   
min       0.000000     1.000000     1.000000     0.000000     0.000000   
25%       4.000000     7.000000     3.000000     5.000000     7.000000   
50%       6.000000     8.000000     6.000000     7.000000     8.000000   
75%       8.000000     9.000000     7.000000     9.000000     9.000000   
max      10.000000    13.000000    10.000000    10.000000    10.000000   

          concerts        music     shopping         yoga     exphappy  \
count  8299.000000  8299.000000  8299.000000  8299.000000  8277.000000   
mean      6.825401     7.851066     5.631281     4.339197     5.534131   
std       2.156283     1.791827     2.608913     2.717612     1.734059   
min       0.000000     1.000000     1.000000     0.000000     1.000000   
25%       5.000000     7.000000     4.000000     2.000000     5.000000   
50%       7.000000     8.000000     6.000000     4.000000     6.000000   
75%       8.000000     9.000000     8.000000     7.000000     7.000000   
max      10.000000    10.000000    10.000000    10.000000    10.000000   

            expnum      attr1_1      sinc1_1     intel1_1       fun1_1  \
count  1800.000000  8299.000000  8299.000000  8299.000000  8289.000000   
mean      5.570556    22.514632    17.396389    20.265613    17.457043   
std       4.762569    12.587674     7.046700     6.783003     6.085239   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       2.000000    15.000000    15.000000    17.390000    15.000000   
50%       4.000000    20.000000    18.180000    20.000000    18.000000   
75%       8.000000    25.000000    20.000000    23.810000    20.000000   
max      20.000000   100.000000    60.000000    50.000000    50.000000   

            amb1_1      shar1_1      attr4_1      sinc4_1     intel4_1  \
count  8279.000000  8257.000000  6489.000000  6489.000000  6489.000000   
mean     10.682539    11.845111    26.394360    11.071506    12.636308   
std       6.124888     6.362154    16.297045     6.659233     6.717476   
min       0.000000     0.000000     5.000000     0.000000     0.000000   
25%       5.000000     9.520000    10.000000     6.000000     8.000000   
50%      10.000000    10.640000    25.000000    10.000000    10.000000   
75%      15.000000    16.000000    35.000000    15.000000    16.000000   
max      53.000000    30.000000    95.000000    35.000000    35.000000   

            fun4_1       amb4_1      shar4_1      attr2_1      sinc2_1  \
count  6489.000000  6489.000000  6467.000000  8299.000000  8299.000000   
mean     15.566805     9.780089    11.014845    30.362192    13.273691   
std       7.328256     6.998428     6.060150    16.249937     6.976775   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%      10.000000     5.000000     7.000000    20.000000    10.000000   
50%      15.000000    10.000000    10.000000    25.000000    15.000000   
75%      20.000000    15.000000    15.000000    40.000000    18.750000   
max      45.000000    50.000000    40.000000   100.000000    50.000000   

          intel2_1       fun2_1       amb2_1      shar2_1      attr3_1  \
count  8299.000000  8299.000000  8289.000000  8289.000000  8273.000000   
mean     14.416891    18.422620    11.744499    11.854817     7.084733   
std       6.263304     6.577929     6.886532     6.167314     1.395783   
min       0.000000     0.000000     0.000000     0.000000     2.000000   
25%      10.000000    15.000000     6.000000    10.000000     6.000000   
50%      15.000000    20.000000    10.000000    10.000000     7.000000   
75%      20.000000    20.000000    15.000000    15.630000     8.000000   
max      40.000000    50.000000    50.000000    30.000000    10.000000   

           sinc3_1       fun3_1     intel3_1       amb3_1      attr5_1  \
count  8273.000000  8273.000000  8273.000000  8273.000000  4906.000000   
mean      8.294935     7.704460     8.403965     7.578388     6.941908   
std       1.407460     1.564321     1.076608     1.778315     1.498653   
min       2.000000     2.000000     3.000000     2.000000     2.000000   
25%       8.000000     7.000000     8.000000     7.000000     6.000000   
50%       8.000000     8.000000     8.000000     8.000000     7.000000   
75%       9.000000     9.000000     9.000000     9.000000     8.000000   
max      10.000000    10.000000    10.000000    10.000000    10.000000   

           sinc5_1     intel5_1       fun5_1       amb5_1          dec  \
count  4906.000000  4906.000000  4906.000000  4906.000000  8378.000000   
mean      7.927232     8.284346     7.426213     7.617611     0.419909   
std       1.627054     1.283657     1.779129     1.773094     0.493573   
min       1.000000     3.000000     2.000000     1.000000     0.000000   
25%       7.000000     8.000000     6.000000     7.000000     0.000000   
50%       8.000000     8.000000     8.000000     8.000000     0.000000   
75%       9.000000     9.000000     9.000000     9.000000     1.000000   
max      10.000000    10.000000    10.000000    10.000000     1.000000   

              attr         sinc        intel          fun          amb  \
count  8176.000000  8101.000000  8082.000000  8028.000000  7666.000000   
mean      6.189995     7.175164     7.368597     6.400598     6.777524   
std       1.950169     1.740315     1.550453     1.953702     1.794055   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       5.000000     6.000000     6.000000     5.000000     6.000000   
50%       6.000000     7.000000     7.000000     7.000000     7.000000   
75%       8.000000     8.000000     8.000000     8.000000     8.000000   
max      10.000000    10.000000    10.000000    10.000000    10.000000   

              shar         like         prob          met     match_es  \
count  7311.000000  8138.000000  8069.000000  8003.000000  7205.000000   
mean      5.474559     6.134087     5.207523     0.948769     3.207814   
std       2.156363     1.841285     2.129565     0.989889     2.444813   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       4.000000     5.000000     4.000000     0.000000     2.000000   
50%       6.000000     6.000000     5.000000     0.000000     3.000000   
75%       7.000000     7.000000     7.000000     2.000000     4.000000   
max      10.000000    10.000000    10.000000     8.000000    18.000000   

           attr1_s      sinc1_s     intel1_s       fun1_s       amb1_s  \
count  4096.000000  4096.000000  4096.000000  4096.000000  4096.000000   
mean     20.791624    15.434255    17.243708    15.260869    11.144619   
std      12.968524     6.915322     6.596420     5.356969     5.514028   
min       3.000000     0.000000     0.000000     1.000000     0.000000   
25%      14.810000    10.000000    10.000000    10.000000     7.000000   
50%      17.650000    15.790000    18.420000    15.910000    10.000000   
75%      25.000000    20.000000    20.000000    20.000000    15.000000   
max      95.000000    50.000000    40.000000    40.000000    23.810000   

           shar1_s     attr3_s      sinc3_s     intel3_s       fun3_s  \
count  4096.000000  4000.00000  4000.000000  4000.000000  4000.000000   
mean     12.457925     7.21125     8.082000     8.257750     7.692500   
std       5.921789     1.41545     1.455741     1.179317     1.626839   
min       0.000000     3.00000     1.000000     4.000000     3.000000   
25%       9.000000     7.00000     7.000000     8.000000     7.000000   
50%      12.500000     7.00000     8.000000     8.000000     8.000000   
75%      16.280000     8.00000     9.000000     9.000000     9.000000   
max      30.000000    10.00000    10.000000    10.000000    10.000000   

            amb3_s      satis_2       length     numdat_2      attr7_2  \
count  4000.000000  7463.000000  7463.000000  7433.000000  1984.000000   
mean      7.589250     5.711510     1.843495     2.338087    32.819556   
std       1.793136     1.820764     0.975662     0.631240    17.155270   
min       2.000000     1.000000     1.000000     1.000000    10.000000   
25%       7.000000     5.000000     1.000000     2.000000    20.000000   
50%       8.000000     6.000000     1.000000     2.000000    30.000000   
75%       9.000000     7.000000     3.000000     3.000000    40.000000   
max      10.000000    10.000000     3.000000     3.000000    80.000000   

           sinc7_2     intel7_2       fun7_2       amb7_2      shar7_2  \
count  1955.000000  1984.000000  1984.000000  1955.000000  1974.000000   
mean     13.529923    15.293851    18.868448     7.286957    12.156028   
std       7.977482     7.292868     8.535963     6.125187     8.241906   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%      10.000000    10.000000    10.000000     0.000000     5.000000   
50%      10.000000    15.000000    20.000000     5.000000    10.000000   
75%      20.000000    20.000000    24.000000    10.000000    20.000000   
max      40.000000    50.000000    50.000000    20.000000    40.000000   

           attr1_2      sinc1_2     intel1_2       fun1_2       amb1_2  \
count  7445.000000  7463.000000  7463.000000  7463.000000  7463.000000   
mean     26.217194    15.865084    17.813755    17.654765     9.913436   
std      14.388694     6.658494     6.535894     6.129746     5.675550   
min       5.000000     0.000000     0.000000     0.000000     0.000000   
25%      16.670000    10.000000    15.000000    15.000000     5.000000   
50%      20.000000    16.670000    19.050000    18.370000    10.000000   
75%      30.000000    20.000000    20.000000    20.000000    15.000000   
max      85.000000    50.000000    40.000000    50.000000    22.220000   

           shar1_2      attr4_2      sinc4_2     intel4_2       fun4_2  \
count  7463.000000  5775.000000  5775.000000  5775.000000  5775.000000   
mean     12.760263    26.806234    11.929177    12.103030    15.163810   
std       6.651547    16.402836     6.401556     5.990607     7.290107   
min       0.000000     6.000000     0.000000     0.000000     0.000000   
25%      10.000000    10.000000     8.000000     8.000000     9.000000   
50%      13.000000    25.000000    10.000000    10.000000    15.000000   
75%      16.670000    40.000000    15.000000    15.000000    20.000000   
max      35.000000   100.000000    35.000000    40.000000    50.000000   

            amb4_2      shar4_2      attr2_2     sinc2_2     intel2_2  \
count  5775.000000  5775.000000  5775.000000  5775.00000  5775.000000   
mean      9.342511    11.320866    29.344369    13.89823    13.958265   
std       5.856329     6.296155    14.551171     6.17169     5.398621   
min       0.000000     0.000000     0.000000     0.00000     0.000000   
25%       5.000000     7.000000    19.150000    10.00000    10.000000   
50%      10.000000    10.000000    25.000000    15.00000    15.000000   
75%      10.000000    15.000000    38.460000    19.23000    17.390000   
max      35.000000    40.000000    85.000000    40.00000    30.770000   

            fun2_2       amb2_2      shar2_2      attr3_2      sinc3_2  \
count  5775.000000  5775.000000  5775.000000  7463.000000  7463.000000   
mean     17.967233    11.909735    12.887976     7.125285     7.931529   
std       6.100307     6.313281     5.615691     1.371390     1.503236   
min       0.000000     0.000000     0.000000     2.000000     2.000000   
25%      15.000000    10.000000    10.000000     7.000000     7.000000   
50%      18.520000    10.000000    13.950000     7.000000     8.000000   
75%      20.000000    15.090000    16.515000     8.000000     9.000000   
max      40.000000    50.000000    30.000000    10.000000    10.000000   

          intel3_2       fun3_2       amb3_2      attr5_2      sinc5_2  \
count  7463.000000  7463.000000  7463.000000  4377.000000  4377.000000   
mean      8.238912     7.602171     7.486802     6.827964     7.394106   
std       1.180280     1.548200     1.744634     1.411096     1.588145   
min       4.000000     1.000000     2.000000     2.000000     2.000000   
25%       8.000000     7.000000     7.000000     6.000000     6.000000   
50%       8.000000     8.000000     8.000000     7.000000     8.000000   
75%       9.000000     9.000000     9.000000     8.000000     8.000000   
max      10.000000    10.000000    10.000000    10.000000    10.000000   

          intel5_2       fun5_2       amb5_2     you_call     them_cal  \
count  4377.000000  4377.000000  4377.000000  3974.000000  3974.000000   
mean      7.838702     7.279415     7.332191     0.780825     0.981631   
std       1.280936     1.647478     1.521854     1.611694     1.382139   
min       2.000000     2.000000     2.000000     0.000000     0.000000   
25%       7.000000     6.000000     6.000000     0.000000     0.000000   
50%       8.000000     7.000000     7.000000     0.000000     1.000000   
75%       9.000000     8.000000     8.000000     1.000000     1.000000   
max      10.000000    10.000000    10.000000    21.000000     9.000000   

            date_3     numdat_3    num_in_3      attr1_3      sinc1_3  \
count  3974.000000  1496.000000  668.000000  3974.000000  3974.000000   
mean      0.376950     1.230615    0.934132    24.384524    16.588583   
std       0.484683     1.294557    0.753902    13.712120     7.471537   
min       0.000000     0.000000    0.000000     0.000000     0.000000   
25%       0.000000     1.000000    1.000000    15.220000    10.000000   
50%       0.000000     1.000000    1.000000    20.000000    16.670000   
75%       1.000000     1.000000    1.000000    30.000000    20.000000   
max       1.000000     9.000000    4.000000    80.000000    65.000000   

          intel1_3       fun1_3       amb1_3      shar1_3      attr7_3  \
count  3974.000000  3974.000000  3974.000000  3974.000000  2016.000000   
mean     19.411346    16.233415    10.898075    12.699142    31.330357   
std       6.124502     5.163777     5.900697     6.557041    17.551540   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%      16.670000    14.810000     5.000000    10.000000    20.000000   
50%      20.000000    16.330000    10.000000    14.290000    25.000000   
75%      20.000000    20.000000    15.000000    16.670000    40.000000   
max      45.000000    30.000000    30.000000    55.000000    80.000000   

           sinc7_3     intel7_3       fun7_3       amb7_3      shar7_3  \
count  2016.000000  2016.000000  2016.000000  2016.000000  2016.000000   
mean     15.654266    16.679563    16.418155     7.823909    12.207837   
std       9.336288     7.880088     7.231325     6.100502     8.615985   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%      10.000000    10.000000    10.000000     0.000000     5.000000   
50%      15.000000    18.000000    17.000000    10.000000    10.000000   
75%      20.000000    20.000000    20.000000    10.000000    20.000000   
max      60.000000    45.000000    40.000000    30.000000    55.000000   

           attr4_3      sinc4_3     intel4_3       fun4_3       amb4_3  \
count  2959.000000  2959.000000  2959.000000  2959.000000  2959.000000   
mean     25.610341    10.751267    11.524839    14.276783     9.207503   
std      17.477134     5.740351     6.004222     6.927869     6.385852   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%      10.000000     7.000000     7.000000     9.000000     5.000000   
50%      20.000000    10.000000    10.000000    12.000000     9.000000   
75%      37.000000    15.000000    15.000000    20.000000    10.000000   
max      80.000000    40.000000    30.000000    30.000000    40.000000   

           shar4_3      attr2_3      sinc2_3     intel2_3       fun2_3  \
count  2959.000000  2959.000000  2959.000000  2959.000000  2959.000000   
mean     11.253802    24.970936    10.923285    11.952687    14.959108   
std       6.516178    17.007669     6.226283     7.010650     7.935509   
min       0.000000     5.000000     0.000000     0.000000     0.000000   
25%       7.000000    10.000000     7.000000     7.000000     9.000000   
50%      10.000000    20.000000    10.000000    10.000000    15.000000   
75%      15.000000    35.000000    15.000000    15.000000    20.000000   
max      45.000000    80.000000    50.000000    60.000000    40.000000   

            amb2_3      shar2_3      attr3_3      sinc3_3     intel3_3  \
count  2959.000000  2016.000000  3974.000000  3974.000000  3974.000000   
mean      9.526191    11.966270     7.240312     8.093357     8.388777   
std       6.403117     7.012067     1.576596     1.610309     1.459094   
min       0.000000     0.000000     2.000000     2.000000     3.000000   
25%       6.000000     5.000000     7.000000     7.000000     8.000000   
50%      10.000000    10.000000     7.000000     8.000000     8.000000   
75%      10.000000    15.000000     8.000000     9.000000     9.000000   
max      50.000000    45.000000    12.000000    12.000000    12.000000   

            fun3_3       amb3_3      attr5_3      sinc5_3     intel5_3  \
count  3974.000000  3974.000000  2016.000000  2016.000000  2016.000000   
mean      7.658782     7.391545     6.810020     7.615079     7.932540   
std       1.744670     1.961417     1.507341     1.504551     1.340868   
min       2.000000     1.000000     2.000000     2.000000     4.000000   
25%       7.000000     6.000000     6.000000     7.000000     7.000000   
50%       8.000000     8.000000     7.000000     8.000000     8.000000   
75%       9.000000     9.000000     8.000000     9.000000     9.000000   
max      12.000000    12.000000    10.000000    10.000000    10.000000   

            fun5_3       amb5_3  
count  2016.000000  2016.000000  
mean      7.155258     7.048611  
std       1.672787     1.717988  
min       1.000000     1.000000  
25%       6.000000     6.000000  
50%       7.000000     7.000000  
75%       8.000000     8.000000  
max      10.000000    10.000000  

In [13]:
# How many rows fall under each `match` value (non-null `iid` entries per group).
(
    raw_dataset
    .groupby("match")["iid"]
    .count()
)


Out[13]:
match
0    6998
1    1380
Name: iid, dtype: int64

In [14]:
# Number of distinct `iid` values for each raw `career` label, largest first.
# The labels are used exactly as entered, so spellings that differ only in
# case (e.g. "Lawyer" and "lawyer") show up as separate groups.
(
    raw_dataset
    .groupby("career")["iid"]
    .nunique()
    .sort_values(ascending=False)
)


Out[14]:
career
Finance                                                            13
professor                                                          12
Lawyer                                                             11
Professor                                                          10
Social Worker                                                       9
Consulting                                                          8
lawyer                                                              7
Business                                                            7
Law                                                                 7
Academic                                                            6
Investment Banking                                                  6
Writer                                                              5
Scientist                                                           5
Management Consulting                                               5
law                                                                 5
Teacher                                                             4
Social Work                                                         4
research                                                            4
Clinical Psychologist                                               4
undecided                                                           4
Epidemiologist                                                      4
Entrepreneur                                                        4
Actress                                                             3
consulting                                                          3
academia                                                            3
Researcher                                                          3
School Psychologist                                                 3
teacher                                                             3
investment banking                                                  3
scientist                                                           3
                                                                   ..
academician                                                         1
academics                                                           1
academics or journalism                                             1
assistant master of the universe (otherwise it's too much work)     1
attorney?                                                           1
banker / academia                                                   1
biology industry                                                    1
boxing champ                                                        1
ceo                                                                 1
Writer/Editor                                                       1
Work in an investment bank                                          1
Work at the UN                                                      1
Trade Specialist                                                    1
Speech Language Pathologist                                         1
Speech Pathologist                                                  1
TBA                                                                 1
TEACHING                                                            1
Teacher/Professor                                                   1
To create early childhood intervention programs                     1
To go into Finance                                                  1
Trading                                                             1
What a question!                                                    1
UN Civil Servant                                                    1
University President                                                1
University Professor                                                1
Urban Planner                                                       1
Venture Capital/Consulting/Government                               1
WRITING                                                             1
Wall Street Economist                                               1
?                                                                   1
Name: iid, dtype: int64

In [18]:
# Number of distinct `iid` values for every (gender, match) combination.
# An `iid` with rows under both `match` values is counted once in each group.
(
    raw_dataset
    .groupby(["gender", "match"])["iid"]
    .nunique()
)


Out[18]:
gender  match
0       0        274
        1        221
1       0        277
        1        231
Name: iid, dtype: int64

Rename columns


In [9]:
# Give the partner's age/race columns self-explanatory names.
# The result is rebound to `raw_dataset` rather than using `inplace=True`, so the
# cell does not mutate the existing frame object; later cells still find the
# renamed columns under the same variable name. Re-running the cell is harmless
# because `rename` ignores keys that are no longer present (errors="ignore" default).
raw_dataset = raw_dataset.rename(
    columns={
        "age_o": "age_of_partner",
        "race_o": "race_of_partner",
    }
)

In [ ]: