In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
In [2]:
source_path = "/Users/sandrapietrowska/Documents/Trainings/luigi/data_source/"
In [3]:
raw_dataset = pd.read_csv(source_path + "Speed_Dating_Data.csv")
In [4]:
raw_dataset.shape
Out[4]:
(8378, 195)
In [4]:
raw_dataset[raw_dataset["iid"] == 11].head(3)
Out[4]:
iid
id
gender
idg
condtn
wave
round
position
positin1
order
partner
pid
match
int_corr
samerace
age_o
race_o
pf_o_att
pf_o_sin
pf_o_int
pf_o_fun
pf_o_amb
pf_o_sha
dec_o
attr_o
sinc_o
intel_o
fun_o
amb_o
shar_o
like_o
prob_o
met_o
age
field
field_cd
undergra
mn_sat
tuition
race
imprace
imprelig
from
zipcode
income
goal
date
go_out
career
career_c
sports
tvsports
exercise
dining
museums
art
hiking
gaming
clubbing
reading
tv
theater
movies
concerts
music
shopping
yoga
exphappy
expnum
attr1_1
sinc1_1
intel1_1
fun1_1
amb1_1
shar1_1
attr4_1
sinc4_1
intel4_1
fun4_1
amb4_1
shar4_1
attr2_1
sinc2_1
intel2_1
fun2_1
amb2_1
shar2_1
attr3_1
sinc3_1
fun3_1
intel3_1
amb3_1
attr5_1
sinc5_1
intel5_1
fun5_1
amb5_1
dec
attr
sinc
intel
fun
amb
shar
like
prob
met
match_es
attr1_s
sinc1_s
intel1_s
fun1_s
amb1_s
shar1_s
attr3_s
sinc3_s
intel3_s
fun3_s
amb3_s
satis_2
length
numdat_2
attr7_2
sinc7_2
intel7_2
fun7_2
amb7_2
shar7_2
attr1_2
sinc1_2
intel1_2
fun1_2
amb1_2
shar1_2
attr4_2
sinc4_2
intel4_2
fun4_2
amb4_2
shar4_2
attr2_2
sinc2_2
intel2_2
fun2_2
amb2_2
shar2_2
attr3_2
sinc3_2
intel3_2
fun3_2
amb3_2
attr5_2
sinc5_2
intel5_2
fun5_2
amb5_2
you_call
them_cal
date_3
numdat_3
num_in_3
attr1_3
sinc1_3
intel1_3
fun1_3
amb1_3
shar1_3
attr7_3
sinc7_3
intel7_3
fun7_3
amb7_3
shar7_3
attr4_3
sinc4_3
intel4_3
fun4_3
amb4_3
shar4_3
attr2_3
sinc2_3
intel2_3
fun2_3
amb2_3
shar2_3
attr3_3
sinc3_3
intel3_3
fun3_3
amb3_3
attr5_3
sinc5_3
intel5_3
fun5_3
amb5_3
100
11
1.0
1
2
1
1
10
7
NaN
4
1
1.0
0
0.14
0
21.0
4.0
15.0
20.0
20.0
15.0
15.0
15.0
1
6.0
9.0
7.0
7.0
6.0
5.0
7.0
6.0
2.0
27.0
Finance
8.0
NaN
NaN
NaN
2.0
7.0
3.0
Argentina
0
NaN
1.0
5.0
4.0
Academia, Research, Banking, Life
2.0
8.0
7.0
2.0
6.0
7.0
5.0
5.0
5.0
4.0
9.0
2.0
4.0
8.0
7.0
8.0
5.0
1.0
7.0
3.0
35.0
20.0
20.0
20.0
0.0
5.0
NaN
NaN
NaN
NaN
NaN
NaN
25.0
5.0
20.0
20.0
25.0
5.0
8.0
9.0
7.0
8.0
5.0
NaN
NaN
NaN
NaN
NaN
0
6.0
8.0
8.0
8.0
8.0
6.0
7.0
4.0
2.0
0.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
7.0
2.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
19.51
17.07
17.07
17.07
12.2
17.07
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
8.0
8.0
8.0
6.0
5.0
NaN
NaN
NaN
NaN
NaN
0.0
0.0
0.0
NaN
NaN
35.0
25.0
15.0
15.0
0.0
10.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
7.0
8.0
7.0
7.0
5.0
NaN
NaN
NaN
NaN
NaN
101
11
1.0
1
2
1
1
10
3
NaN
10
2
2.0
0
0.29
1
24.0
2.0
45.0
5.0
25.0
20.0
0.0
5.0
0
5.0
7.0
8.0
4.0
6.0
3.0
6.0
4.0
2.0
27.0
Finance
8.0
NaN
NaN
NaN
2.0
7.0
3.0
Argentina
0
NaN
1.0
5.0
4.0
Academia, Research, Banking, Life
2.0
8.0
7.0
2.0
6.0
7.0
5.0
5.0
5.0
4.0
9.0
2.0
4.0
8.0
7.0
8.0
5.0
1.0
7.0
3.0
35.0
20.0
20.0
20.0
0.0
5.0
NaN
NaN
NaN
NaN
NaN
NaN
25.0
5.0
20.0
20.0
25.0
5.0
8.0
9.0
7.0
8.0
5.0
NaN
NaN
NaN
NaN
NaN
0
8.0
7.0
6.0
9.0
7.0
4.0
7.0
2.0
2.0
0.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
7.0
2.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
19.51
17.07
17.07
17.07
12.2
17.07
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
8.0
8.0
8.0
6.0
5.0
NaN
NaN
NaN
NaN
NaN
0.0
0.0
0.0
NaN
NaN
35.0
25.0
15.0
15.0
0.0
10.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
7.0
8.0
7.0
7.0
5.0
NaN
NaN
NaN
NaN
NaN
102
11
1.0
1
2
1
1
10
9
NaN
6
3
3.0
0
-0.24
1
25.0
2.0
35.0
10.0
35.0
10.0
10.0
0.0
0
7.0
9.0
10.0
7.0
8.0
9.0
8.0
7.0
1.0
27.0
Finance
8.0
NaN
NaN
NaN
2.0
7.0
3.0
Argentina
0
NaN
1.0
5.0
4.0
Academia, Research, Banking, Life
2.0
8.0
7.0
2.0
6.0
7.0
5.0
5.0
5.0
4.0
9.0
2.0
4.0
8.0
7.0
8.0
5.0
1.0
7.0
3.0
35.0
20.0
20.0
20.0
0.0
5.0
NaN
NaN
NaN
NaN
NaN
NaN
25.0
5.0
20.0
20.0
25.0
5.0
8.0
9.0
7.0
8.0
5.0
NaN
NaN
NaN
NaN
NaN
0
7.0
8.0
6.0
5.0
8.0
4.0
5.0
2.0
1.0
0.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
7.0
2.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
19.51
17.07
17.07
17.07
12.2
17.07
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
8.0
8.0
8.0
6.0
5.0
NaN
NaN
NaN
NaN
NaN
0.0
0.0
0.0
NaN
NaN
35.0
25.0
15.0
15.0
0.0
10.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
7.0
8.0
7.0
7.0
5.0
NaN
NaN
NaN
NaN
NaN
In [24]:
raw_dataset_copy = raw_dataset
In [34]:
merged_datasets = raw_dataset.merge(raw_dataset_copy, left_on="pid", right_on="iid")
In [36]:
merged_datasets.head(3)
Out[36]:
iid_x
id_x
gender_x
idg_x
condtn_x
wave_x
round_x
position_x
positin1_x
order_x
partner_x
pid_x
match_x
int_corr_x
samerace_x
age_o_x
race_o_x
pf_o_att_x
pf_o_sin_x
pf_o_int_x
pf_o_fun_x
pf_o_amb_x
pf_o_sha_x
dec_o_x
attr_o_x
sinc_o_x
intel_o_x
fun_o_x
amb_o_x
shar_o_x
like_o_x
prob_o_x
met_o_x
age_x
field_x
field_cd_x
undergra_x
mn_sat_x
tuition_x
race_x
imprace_x
imprelig_x
from_x
zipcode_x
income_x
goal_x
date_x
go_out_x
career_x
career_c_x
sports_x
tvsports_x
exercise_x
dining_x
museums_x
art_x
hiking_x
gaming_x
clubbing_x
reading_x
tv_x
theater_x
movies_x
concerts_x
music_x
shopping_x
yoga_x
exphappy_x
expnum_x
attr1_1_x
sinc1_1_x
intel1_1_x
fun1_1_x
amb1_1_x
shar1_1_x
attr4_1_x
sinc4_1_x
intel4_1_x
fun4_1_x
amb4_1_x
shar4_1_x
attr2_1_x
sinc2_1_x
intel2_1_x
fun2_1_x
amb2_1_x
shar2_1_x
attr3_1_x
sinc3_1_x
fun3_1_x
intel3_1_x
amb3_1_x
attr5_1_x
sinc5_1_x
intel5_1_x
fun5_1_x
amb5_1_x
dec_x
attr_x
sinc_x
intel_x
fun_x
amb_x
shar_x
like_x
prob_x
met_x
match_es_x
attr1_s_x
sinc1_s_x
intel1_s_x
fun1_s_x
amb1_s_x
shar1_s_x
attr3_s_x
sinc3_s_x
intel3_s_x
fun3_s_x
amb3_s_x
satis_2_x
length_x
numdat_2_x
attr7_2_x
sinc7_2_x
intel7_2_x
fun7_2_x
amb7_2_x
shar7_2_x
attr1_2_x
sinc1_2_x
intel1_2_x
fun1_2_x
amb1_2_x
shar1_2_x
attr4_2_x
sinc4_2_x
intel4_2_x
fun4_2_x
amb4_2_x
shar4_2_x
attr2_2_x
sinc2_2_x
intel2_2_x
fun2_2_x
amb2_2_x
shar2_2_x
attr3_2_x
sinc3_2_x
intel3_2_x
fun3_2_x
amb3_2_x
attr5_2_x
sinc5_2_x
intel5_2_x
fun5_2_x
amb5_2_x
you_call_x
them_cal_x
date_3_x
numdat_3_x
num_in_3_x
attr1_3_x
sinc1_3_x
intel1_3_x
fun1_3_x
amb1_3_x
shar1_3_x
attr7_3_x
sinc7_3_x
intel7_3_x
fun7_3_x
amb7_3_x
shar7_3_x
attr4_3_x
sinc4_3_x
intel4_3_x
fun4_3_x
amb4_3_x
shar4_3_x
attr2_3_x
sinc2_3_x
intel2_3_x
fun2_3_x
amb2_3_x
shar2_3_x
attr3_3_x
sinc3_3_x
intel3_3_x
fun3_3_x
amb3_3_x
attr5_3_x
sinc5_3_x
intel5_3_x
fun5_3_x
amb5_3_x
iid_y
id_y
gender_y
idg_y
condtn_y
wave_y
round_y
position_y
positin1_y
order_y
partner_y
pid_y
match_y
int_corr_y
samerace_y
age_o_y
race_o_y
pf_o_att_y
pf_o_sin_y
pf_o_int_y
pf_o_fun_y
pf_o_amb_y
pf_o_sha_y
dec_o_y
attr_o_y
sinc_o_y
intel_o_y
fun_o_y
amb_o_y
shar_o_y
like_o_y
prob_o_y
met_o_y
age_y
field_y
field_cd_y
undergra_y
mn_sat_y
tuition_y
race_y
imprace_y
imprelig_y
from_y
zipcode_y
income_y
goal_y
date_y
go_out_y
career_y
career_c_y
sports_y
tvsports_y
exercise_y
dining_y
museums_y
art_y
hiking_y
gaming_y
clubbing_y
reading_y
tv_y
theater_y
movies_y
concerts_y
music_y
shopping_y
yoga_y
exphappy_y
expnum_y
attr1_1_y
sinc1_1_y
intel1_1_y
fun1_1_y
amb1_1_y
shar1_1_y
attr4_1_y
sinc4_1_y
intel4_1_y
fun4_1_y
amb4_1_y
shar4_1_y
attr2_1_y
sinc2_1_y
intel2_1_y
fun2_1_y
amb2_1_y
shar2_1_y
attr3_1_y
sinc3_1_y
fun3_1_y
intel3_1_y
amb3_1_y
attr5_1_y
sinc5_1_y
intel5_1_y
fun5_1_y
amb5_1_y
dec_y
attr_y
sinc_y
intel_y
fun_y
amb_y
shar_y
like_y
prob_y
met_y
match_es_y
attr1_s_y
sinc1_s_y
intel1_s_y
fun1_s_y
amb1_s_y
shar1_s_y
attr3_s_y
sinc3_s_y
intel3_s_y
fun3_s_y
amb3_s_y
satis_2_y
length_y
numdat_2_y
attr7_2_y
sinc7_2_y
intel7_2_y
fun7_2_y
amb7_2_y
shar7_2_y
attr1_2_y
sinc1_2_y
intel1_2_y
fun1_2_y
amb1_2_y
shar1_2_y
attr4_2_y
sinc4_2_y
intel4_2_y
fun4_2_y
amb4_2_y
shar4_2_y
attr2_2_y
sinc2_2_y
intel2_2_y
fun2_2_y
amb2_2_y
shar2_2_y
attr3_2_y
sinc3_2_y
intel3_2_y
fun3_2_y
amb3_2_y
attr5_2_y
sinc5_2_y
intel5_2_y
fun5_2_y
amb5_2_y
you_call_y
them_cal_y
date_3_y
numdat_3_y
num_in_3_y
attr1_3_y
sinc1_3_y
intel1_3_y
fun1_3_y
amb1_3_y
shar1_3_y
attr7_3_y
sinc7_3_y
intel7_3_y
fun7_3_y
amb7_3_y
shar7_3_y
attr4_3_y
sinc4_3_y
intel4_3_y
fun4_3_y
amb4_3_y
shar4_3_y
attr2_3_y
sinc2_3_y
intel2_3_y
fun2_3_y
amb2_3_y
shar2_3_y
attr3_3_y
sinc3_3_y
intel3_3_y
fun3_3_y
amb3_3_y
attr5_3_y
sinc5_3_y
intel5_3_y
fun5_3_y
amb5_3_y
0
1
1.0
0
1
1
1
10
7
NaN
4
1
11.0
0
0.14
0
27.0
2.0
35.0
20.0
20.0
20.0
0.0
5.0
0
6.0
8.0
8.0
8.0
8.0
6.0
7.0
4.0
2.0
21.0
Law
1.0
NaN
NaN
NaN
4.0
2.0
4.0
Chicago
60,521
69,487.00
2.0
7.0
1.0
lawyer
NaN
9.0
2.0
8.0
9.0
1.0
1.0
5.0
1.0
5.0
6.0
9.0
1.0
10.0
10.0
9.0
8.0
1.0
3.0
2.0
15.0
20.0
20.0
15.0
15.0
15.0
NaN
NaN
NaN
NaN
NaN
NaN
35.0
20.0
15.0
20.0
5.0
5.0
6.0
8.0
8.0
8.0
7.0
NaN
NaN
NaN
NaN
NaN
1
6.0
9.0
7.0
7.0
6.0
5.0
7.0
6.0
2.0
4.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
6.0
2.0
1.0
NaN
NaN
NaN
NaN
NaN
NaN
19.44
16.67
13.89
22.22
11.11
16.67
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
6.0
7.0
8.0
7.0
6.0
NaN
NaN
NaN
NaN
NaN
1.0
1.0
0.0
NaN
NaN
15.0
20.0
20.0
15.0
15.0
15.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
5.0
7.0
7.0
7.0
7.0
NaN
NaN
NaN
NaN
NaN
11
1.0
1
2
1
1
10
7
NaN
4
1
1.0
0
0.14
0
21.0
4.0
15.0
20.0
20.0
15.0
15.0
15.0
1
6.0
9.0
7.0
7.0
6.0
5.0
7.0
6.0
2.0
27.0
Finance
8.0
NaN
NaN
NaN
2.0
7.0
3.0
Argentina
0
NaN
1.0
5.0
4.0
Academia, Research, Banking, Life
2.0
8.0
7.0
2.0
6.0
7.0
5.0
5.0
5.0
4.0
9.0
2.0
4.0
8.0
7.0
8.0
5.0
1.0
7.0
3.0
35.0
20.0
20.0
20.0
0.0
5.0
NaN
NaN
NaN
NaN
NaN
NaN
25.0
5.0
20.0
20.0
25.0
5.0
8.0
9.0
7.0
8.0
5.0
NaN
NaN
NaN
NaN
NaN
0
6.0
8.0
8.0
8.0
8.0
6.0
7.0
4.0
2.0
0.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
7.0
2.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
19.51
17.07
17.07
17.07
12.2
17.07
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
8.0
8.0
8.0
6.0
5.0
NaN
NaN
NaN
NaN
NaN
0.0
0.0
0.0
NaN
NaN
35.0
25.0
15.0
15.0
0.0
10.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
7.0
8.0
7.0
7.0
5.0
NaN
NaN
NaN
NaN
NaN
1
1
1.0
0
1
1
1
10
7
NaN
4
1
11.0
0
0.14
0
27.0
2.0
35.0
20.0
20.0
20.0
0.0
5.0
0
6.0
8.0
8.0
8.0
8.0
6.0
7.0
4.0
2.0
21.0
Law
1.0
NaN
NaN
NaN
4.0
2.0
4.0
Chicago
60,521
69,487.00
2.0
7.0
1.0
lawyer
NaN
9.0
2.0
8.0
9.0
1.0
1.0
5.0
1.0
5.0
6.0
9.0
1.0
10.0
10.0
9.0
8.0
1.0
3.0
2.0
15.0
20.0
20.0
15.0
15.0
15.0
NaN
NaN
NaN
NaN
NaN
NaN
35.0
20.0
15.0
20.0
5.0
5.0
6.0
8.0
8.0
8.0
7.0
NaN
NaN
NaN
NaN
NaN
1
6.0
9.0
7.0
7.0
6.0
5.0
7.0
6.0
2.0
4.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
6.0
2.0
1.0
NaN
NaN
NaN
NaN
NaN
NaN
19.44
16.67
13.89
22.22
11.11
16.67
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
6.0
7.0
8.0
7.0
6.0
NaN
NaN
NaN
NaN
NaN
1.0
1.0
0.0
NaN
NaN
15.0
20.0
20.0
15.0
15.0
15.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
5.0
7.0
7.0
7.0
7.0
NaN
NaN
NaN
NaN
NaN
11
1.0
1
2
1
1
10
3
NaN
10
2
2.0
0
0.29
1
24.0
2.0
45.0
5.0
25.0
20.0
0.0
5.0
0
5.0
7.0
8.0
4.0
6.0
3.0
6.0
4.0
2.0
27.0
Finance
8.0
NaN
NaN
NaN
2.0
7.0
3.0
Argentina
0
NaN
1.0
5.0
4.0
Academia, Research, Banking, Life
2.0
8.0
7.0
2.0
6.0
7.0
5.0
5.0
5.0
4.0
9.0
2.0
4.0
8.0
7.0
8.0
5.0
1.0
7.0
3.0
35.0
20.0
20.0
20.0
0.0
5.0
NaN
NaN
NaN
NaN
NaN
NaN
25.0
5.0
20.0
20.0
25.0
5.0
8.0
9.0
7.0
8.0
5.0
NaN
NaN
NaN
NaN
NaN
0
8.0
7.0
6.0
9.0
7.0
4.0
7.0
2.0
2.0
0.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
7.0
2.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
19.51
17.07
17.07
17.07
12.2
17.07
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
8.0
8.0
8.0
6.0
5.0
NaN
NaN
NaN
NaN
NaN
0.0
0.0
0.0
NaN
NaN
35.0
25.0
15.0
15.0
0.0
10.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
7.0
8.0
7.0
7.0
5.0
NaN
NaN
NaN
NaN
NaN
2
1
1.0
0
1
1
1
10
7
NaN
4
1
11.0
0
0.14
0
27.0
2.0
35.0
20.0
20.0
20.0
0.0
5.0
0
6.0
8.0
8.0
8.0
8.0
6.0
7.0
4.0
2.0
21.0
Law
1.0
NaN
NaN
NaN
4.0
2.0
4.0
Chicago
60,521
69,487.00
2.0
7.0
1.0
lawyer
NaN
9.0
2.0
8.0
9.0
1.0
1.0
5.0
1.0
5.0
6.0
9.0
1.0
10.0
10.0
9.0
8.0
1.0
3.0
2.0
15.0
20.0
20.0
15.0
15.0
15.0
NaN
NaN
NaN
NaN
NaN
NaN
35.0
20.0
15.0
20.0
5.0
5.0
6.0
8.0
8.0
8.0
7.0
NaN
NaN
NaN
NaN
NaN
1
6.0
9.0
7.0
7.0
6.0
5.0
7.0
6.0
2.0
4.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
6.0
2.0
1.0
NaN
NaN
NaN
NaN
NaN
NaN
19.44
16.67
13.89
22.22
11.11
16.67
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
6.0
7.0
8.0
7.0
6.0
NaN
NaN
NaN
NaN
NaN
1.0
1.0
0.0
NaN
NaN
15.0
20.0
20.0
15.0
15.0
15.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
5.0
7.0
7.0
7.0
7.0
NaN
NaN
NaN
NaN
NaN
11
1.0
1
2
1
1
10
9
NaN
6
3
3.0
0
-0.24
1
25.0
2.0
35.0
10.0
35.0
10.0
10.0
0.0
0
7.0
9.0
10.0
7.0
8.0
9.0
8.0
7.0
1.0
27.0
Finance
8.0
NaN
NaN
NaN
2.0
7.0
3.0
Argentina
0
NaN
1.0
5.0
4.0
Academia, Research, Banking, Life
2.0
8.0
7.0
2.0
6.0
7.0
5.0
5.0
5.0
4.0
9.0
2.0
4.0
8.0
7.0
8.0
5.0
1.0
7.0
3.0
35.0
20.0
20.0
20.0
0.0
5.0
NaN
NaN
NaN
NaN
NaN
NaN
25.0
5.0
20.0
20.0
25.0
5.0
8.0
9.0
7.0
8.0
5.0
NaN
NaN
NaN
NaN
NaN
0
7.0
8.0
6.0
5.0
8.0
4.0
5.0
2.0
1.0
0.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
7.0
2.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
19.51
17.07
17.07
17.07
12.2
17.07
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
8.0
8.0
8.0
6.0
5.0
NaN
NaN
NaN
NaN
NaN
0.0
0.0
0.0
NaN
NaN
35.0
25.0
15.0
15.0
0.0
10.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
7.0
8.0
7.0
7.0
5.0
NaN
NaN
NaN
NaN
NaN
In [37]:
merged_datasets[["iid_x","gender_x","pid_y","gender_y"]].head(5)
Out[37]:
iid_x
gender_x
pid_y
gender_y
0
1
0
1.0
1
1
1
0
2.0
1
2
1
0
3.0
1
3
1
0
4.0
1
4
1
0
5.0
1
In [39]:
same_gender = merged_datasets[merged_datasets["gender_x"] == merged_datasets["gender_y"]]
same_gender.head()
Out[39]:
iid_x
id_x
gender_x
idg_x
condtn_x
wave_x
round_x
position_x
positin1_x
order_x
partner_x
pid_x
match_x
int_corr_x
samerace_x
age_o_x
race_o_x
pf_o_att_x
pf_o_sin_x
pf_o_int_x
pf_o_fun_x
pf_o_amb_x
pf_o_sha_x
dec_o_x
attr_o_x
sinc_o_x
intel_o_x
fun_o_x
amb_o_x
shar_o_x
like_o_x
prob_o_x
met_o_x
age_x
field_x
field_cd_x
undergra_x
mn_sat_x
tuition_x
race_x
imprace_x
imprelig_x
from_x
zipcode_x
income_x
goal_x
date_x
go_out_x
career_x
career_c_x
sports_x
tvsports_x
exercise_x
dining_x
museums_x
art_x
hiking_x
gaming_x
clubbing_x
reading_x
tv_x
theater_x
movies_x
concerts_x
music_x
shopping_x
yoga_x
exphappy_x
expnum_x
attr1_1_x
sinc1_1_x
intel1_1_x
fun1_1_x
amb1_1_x
shar1_1_x
attr4_1_x
sinc4_1_x
intel4_1_x
fun4_1_x
amb4_1_x
shar4_1_x
attr2_1_x
sinc2_1_x
intel2_1_x
fun2_1_x
amb2_1_x
shar2_1_x
attr3_1_x
sinc3_1_x
fun3_1_x
intel3_1_x
amb3_1_x
attr5_1_x
sinc5_1_x
intel5_1_x
fun5_1_x
amb5_1_x
dec_x
attr_x
sinc_x
intel_x
fun_x
amb_x
shar_x
like_x
prob_x
met_x
match_es_x
attr1_s_x
sinc1_s_x
intel1_s_x
fun1_s_x
amb1_s_x
shar1_s_x
attr3_s_x
sinc3_s_x
intel3_s_x
fun3_s_x
amb3_s_x
satis_2_x
length_x
numdat_2_x
attr7_2_x
sinc7_2_x
intel7_2_x
fun7_2_x
amb7_2_x
shar7_2_x
attr1_2_x
sinc1_2_x
intel1_2_x
fun1_2_x
amb1_2_x
shar1_2_x
attr4_2_x
sinc4_2_x
intel4_2_x
fun4_2_x
amb4_2_x
shar4_2_x
attr2_2_x
sinc2_2_x
intel2_2_x
fun2_2_x
amb2_2_x
shar2_2_x
attr3_2_x
sinc3_2_x
intel3_2_x
fun3_2_x
amb3_2_x
attr5_2_x
sinc5_2_x
intel5_2_x
fun5_2_x
amb5_2_x
you_call_x
them_cal_x
date_3_x
numdat_3_x
num_in_3_x
attr1_3_x
sinc1_3_x
intel1_3_x
fun1_3_x
amb1_3_x
shar1_3_x
attr7_3_x
sinc7_3_x
intel7_3_x
fun7_3_x
amb7_3_x
shar7_3_x
attr4_3_x
sinc4_3_x
intel4_3_x
fun4_3_x
amb4_3_x
shar4_3_x
attr2_3_x
sinc2_3_x
intel2_3_x
fun2_3_x
amb2_3_x
shar2_3_x
attr3_3_x
sinc3_3_x
intel3_3_x
fun3_3_x
amb3_3_x
attr5_3_x
sinc5_3_x
intel5_3_x
fun5_3_x
amb5_3_x
iid_y
id_y
gender_y
idg_y
condtn_y
wave_y
round_y
position_y
positin1_y
order_y
partner_y
pid_y
match_y
int_corr_y
samerace_y
age_o_y
race_o_y
pf_o_att_y
pf_o_sin_y
pf_o_int_y
pf_o_fun_y
pf_o_amb_y
pf_o_sha_y
dec_o_y
attr_o_y
sinc_o_y
intel_o_y
fun_o_y
amb_o_y
shar_o_y
like_o_y
prob_o_y
met_o_y
age_y
field_y
field_cd_y
undergra_y
mn_sat_y
tuition_y
race_y
imprace_y
imprelig_y
from_y
zipcode_y
income_y
goal_y
date_y
go_out_y
career_y
career_c_y
sports_y
tvsports_y
exercise_y
dining_y
museums_y
art_y
hiking_y
gaming_y
clubbing_y
reading_y
tv_y
theater_y
movies_y
concerts_y
music_y
shopping_y
yoga_y
exphappy_y
expnum_y
attr1_1_y
sinc1_1_y
intel1_1_y
fun1_1_y
amb1_1_y
shar1_1_y
attr4_1_y
sinc4_1_y
intel4_1_y
fun4_1_y
amb4_1_y
shar4_1_y
attr2_1_y
sinc2_1_y
intel2_1_y
fun2_1_y
amb2_1_y
shar2_1_y
attr3_1_y
sinc3_1_y
fun3_1_y
intel3_1_y
amb3_1_y
attr5_1_y
sinc5_1_y
intel5_1_y
fun5_1_y
amb5_1_y
dec_y
attr_y
sinc_y
intel_y
fun_y
amb_y
shar_y
like_y
prob_y
met_y
match_es_y
attr1_s_y
sinc1_s_y
intel1_s_y
fun1_s_y
amb1_s_y
shar1_s_y
attr3_s_y
sinc3_s_y
intel3_s_y
fun3_s_y
amb3_s_y
satis_2_y
length_y
numdat_2_y
attr7_2_y
sinc7_2_y
intel7_2_y
fun7_2_y
amb7_2_y
shar7_2_y
attr1_2_y
sinc1_2_y
intel1_2_y
fun1_2_y
amb1_2_y
shar1_2_y
attr4_2_y
sinc4_2_y
intel4_2_y
fun4_2_y
amb4_2_y
shar4_2_y
attr2_2_y
sinc2_2_y
intel2_2_y
fun2_2_y
amb2_2_y
shar2_2_y
attr3_2_y
sinc3_2_y
intel3_2_y
fun3_2_y
amb3_2_y
attr5_2_y
sinc5_2_y
intel5_2_y
fun5_2_y
amb5_2_y
you_call_y
them_cal_y
date_3_y
numdat_3_y
num_in_3_y
attr1_3_y
sinc1_3_y
intel1_3_y
fun1_3_y
amb1_3_y
shar1_3_y
attr7_3_y
sinc7_3_y
intel7_3_y
fun7_3_y
amb7_3_y
shar7_3_y
attr4_3_y
sinc4_3_y
intel4_3_y
fun4_3_y
amb4_3_y
shar4_3_y
attr2_3_y
sinc2_3_y
intel2_3_y
fun2_3_y
amb2_3_y
shar2_3_y
attr3_3_y
sinc3_3_y
intel3_3_y
fun3_3_y
amb3_3_y
attr5_3_y
sinc5_3_y
intel5_3_y
fun5_3_y
amb5_3_y
In [5]:
columns_by_types = raw_dataset.columns.to_series().groupby(raw_dataset.dtypes).groups
In [6]:
columns_by_types
Out[6]:
{dtype('int64'): ['iid',
'gender',
'idg',
'condtn',
'wave',
'round',
'position',
'order',
'partner',
'match',
'samerace',
'dec_o',
'dec'],
dtype('float64'): ['id',
'positin1',
'pid',
'int_corr',
'age_o',
'race_o',
'pf_o_att',
'pf_o_sin',
'pf_o_int',
'pf_o_fun',
'pf_o_amb',
'pf_o_sha',
'attr_o',
'sinc_o',
'intel_o',
'fun_o',
'amb_o',
'shar_o',
'like_o',
'prob_o',
'met_o',
'age',
'field_cd',
'race',
'imprace',
'imprelig',
'goal',
'date',
'go_out',
'career_c',
'sports',
'tvsports',
'exercise',
'dining',
'museums',
'art',
'hiking',
'gaming',
'clubbing',
'reading',
'tv',
'theater',
'movies',
'concerts',
'music',
'shopping',
'yoga',
'exphappy',
'expnum',
'attr1_1',
'sinc1_1',
'intel1_1',
'fun1_1',
'amb1_1',
'shar1_1',
'attr4_1',
'sinc4_1',
'intel4_1',
'fun4_1',
'amb4_1',
'shar4_1',
'attr2_1',
'sinc2_1',
'intel2_1',
'fun2_1',
'amb2_1',
'shar2_1',
'attr3_1',
'sinc3_1',
'fun3_1',
'intel3_1',
'amb3_1',
'attr5_1',
'sinc5_1',
'intel5_1',
'fun5_1',
'amb5_1',
'attr',
'sinc',
'intel',
'fun',
'amb',
'shar',
'like',
'prob',
'met',
'match_es',
'attr1_s',
'sinc1_s',
'intel1_s',
'fun1_s',
'amb1_s',
'shar1_s',
'attr3_s',
'sinc3_s',
'intel3_s',
'fun3_s',
'amb3_s',
'satis_2',
'length',
'numdat_2',
'attr7_2',
'sinc7_2',
'intel7_2',
'fun7_2',
'amb7_2',
'shar7_2',
'attr1_2',
'sinc1_2',
'intel1_2',
'fun1_2',
'amb1_2',
'shar1_2',
'attr4_2',
'sinc4_2',
'intel4_2',
'fun4_2',
'amb4_2',
'shar4_2',
'attr2_2',
'sinc2_2',
'intel2_2',
'fun2_2',
'amb2_2',
'shar2_2',
'attr3_2',
'sinc3_2',
'intel3_2',
'fun3_2',
'amb3_2',
'attr5_2',
'sinc5_2',
'intel5_2',
'fun5_2',
'amb5_2',
'you_call',
'them_cal',
'date_3',
'numdat_3',
'num_in_3',
'attr1_3',
'sinc1_3',
'intel1_3',
'fun1_3',
'amb1_3',
'shar1_3',
'attr7_3',
'sinc7_3',
'intel7_3',
'fun7_3',
'amb7_3',
'shar7_3',
'attr4_3',
'sinc4_3',
'intel4_3',
'fun4_3',
'amb4_3',
'shar4_3',
'attr2_3',
'sinc2_3',
'intel2_3',
'fun2_3',
'amb2_3',
'shar2_3',
'attr3_3',
'sinc3_3',
'intel3_3',
'fun3_3',
'amb3_3',
'attr5_3',
'sinc5_3',
'intel5_3',
'fun5_3',
'amb5_3'],
dtype('O'): ['field',
'undergra',
'mn_sat',
'tuition',
'from',
'zipcode',
'income',
'career']}
In [7]:
raw_dataset.dtypes.value_counts()
Out[7]:
float64 174
int64 13
object 8
dtype: int64
In [19]:
raw_dataset.isnull().sum().head(50)
Out[19]:
iid 0
id 1
gender 0
idg 0
condtn 0
wave 0
round 0
position 0
positin1 1846
order 0
partner 0
pid 10
match 0
int_corr 158
samerace 0
age_o 104
race_o 73
pf_o_att 89
pf_o_sin 89
pf_o_int 89
pf_o_fun 98
pf_o_amb 107
pf_o_sha 129
dec_o 0
attr_o 212
sinc_o 287
intel_o 306
fun_o 360
amb_o 722
shar_o 1076
like_o 250
prob_o 318
met_o 385
age 95
field 63
field_cd 82
undergra 3464
mn_sat 5245
tuition 4795
race 63
imprace 79
imprelig 79
from 79
zipcode 1064
income 4099
goal 79
date 97
go_out 79
career 89
career_c 138
dtype: int64
In [9]:
summary = raw_dataset.describe() #.transpose()
#summary.head(30)
print summary
iid id gender idg condtn \
count 8378.000000 8377.000000 8378.000000 8378.000000 8378.000000
mean 283.675937 8.960248 0.500597 17.327166 1.828837
std 158.583367 5.491329 0.500029 10.940735 0.376673
min 1.000000 1.000000 0.000000 1.000000 1.000000
25% 154.000000 4.000000 0.000000 8.000000 2.000000
50% 281.000000 8.000000 1.000000 16.000000 2.000000
75% 407.000000 13.000000 1.000000 26.000000 2.000000
max 552.000000 22.000000 1.000000 44.000000 2.000000
wave round position positin1 order \
count 8378.000000 8378.000000 8378.000000 6532.000000 8378.000000
mean 11.350919 16.872046 9.042731 9.295775 8.927668
std 5.995903 4.358458 5.514939 5.650199 5.477009
min 1.000000 5.000000 1.000000 1.000000 1.000000
25% 7.000000 14.000000 4.000000 4.000000 4.000000
50% 11.000000 18.000000 8.000000 9.000000 8.000000
75% 15.000000 20.000000 13.000000 14.000000 13.000000
max 21.000000 22.000000 22.000000 22.000000 22.000000
partner pid match int_corr samerace \
count 8378.000000 8368.000000 8378.000000 8220.000000 8378.000000
mean 8.963595 283.863767 0.164717 0.196010 0.395799
std 5.491068 158.584899 0.370947 0.303539 0.489051
min 1.000000 1.000000 0.000000 -0.830000 0.000000
25% 4.000000 154.000000 0.000000 -0.020000 0.000000
50% 8.000000 281.000000 0.000000 0.210000 0.000000
75% 13.000000 408.000000 0.000000 0.430000 1.000000
max 22.000000 552.000000 1.000000 0.910000 1.000000
age_o race_o pf_o_att pf_o_sin pf_o_int \
count 8274.000000 8305.000000 8289.000000 8289.000000 8289.000000
mean 26.364999 2.756653 22.495347 17.396867 20.270759
std 3.563648 1.230689 12.569802 7.044003 6.782895
min 18.000000 1.000000 0.000000 0.000000 0.000000
25% 24.000000 2.000000 15.000000 15.000000 17.390000
50% 26.000000 2.000000 20.000000 18.370000 20.000000
75% 28.000000 4.000000 25.000000 20.000000 23.810000
max 55.000000 6.000000 100.000000 60.000000 50.000000
pf_o_fun pf_o_amb pf_o_sha dec_o attr_o \
count 8280.000000 8271.000000 8249.000000 8378.000000 8166.000000
mean 17.459714 10.685375 11.845930 0.419551 6.190411
std 6.085526 6.126544 6.362746 0.493515 1.950305
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 15.000000 5.000000 9.520000 0.000000 5.000000
50% 18.000000 10.000000 10.640000 0.000000 6.000000
75% 20.000000 15.000000 16.000000 1.000000 8.000000
max 50.000000 53.000000 30.000000 1.000000 10.500000
sinc_o intel_o fun_o amb_o shar_o \
count 8091.000000 8072.000000 8018.000000 7656.000000 7302.000000
mean 7.175256 7.369301 6.400599 6.778409 5.474870
std 1.740575 1.550501 1.954078 1.794080 2.156163
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 6.000000 6.000000 5.000000 6.000000 4.000000
50% 7.000000 7.000000 7.000000 7.000000 6.000000
75% 8.000000 8.000000 8.000000 8.000000 7.000000
max 10.000000 10.000000 11.000000 10.000000 10.000000
like_o prob_o met_o age field_cd \
count 8128.000000 8060.000000 7993.000000 8283.000000 8296.000000
mean 6.134498 5.208251 1.960215 26.358928 7.662488
std 1.841258 2.129354 0.245925 3.566763 3.758935
min 0.000000 0.000000 1.000000 18.000000 1.000000
25% 5.000000 4.000000 2.000000 24.000000 5.000000
50% 6.000000 5.000000 2.000000 26.000000 8.000000
75% 7.000000 7.000000 2.000000 28.000000 10.000000
max 10.000000 10.000000 8.000000 55.000000 18.000000
race imprace imprelig goal date \
count 8315.000000 8299.000000 8299.000000 8299.000000 8281.000000
mean 2.757186 3.784793 3.651645 2.122063 5.006762
std 1.230905 2.845708 2.805237 1.407181 1.444531
min 1.000000 0.000000 1.000000 1.000000 1.000000
25% 2.000000 1.000000 1.000000 1.000000 4.000000
50% 2.000000 3.000000 3.000000 2.000000 5.000000
75% 4.000000 6.000000 6.000000 2.000000 6.000000
max 6.000000 10.000000 10.000000 6.000000 7.000000
go_out career_c sports tvsports exercise \
count 8299.000000 8240.000000 8299.000000 8299.000000 8299.000000
mean 2.158091 5.277791 6.425232 4.575491 6.245813
std 1.105246 3.309520 2.619024 2.801874 2.418858
min 1.000000 1.000000 1.000000 1.000000 1.000000
25% 1.000000 2.000000 4.000000 2.000000 5.000000
50% 2.000000 6.000000 7.000000 4.000000 6.000000
75% 3.000000 7.000000 9.000000 7.000000 8.000000
max 7.000000 17.000000 10.000000 10.000000 10.000000
dining museums art hiking gaming \
count 8299.000000 8299.000000 8299.000000 8299.000000 8299.000000
mean 7.783829 6.985781 6.714544 5.737077 3.881191
std 1.754868 2.052232 2.263407 2.570207 2.620507
min 1.000000 0.000000 0.000000 0.000000 0.000000
25% 7.000000 6.000000 5.000000 4.000000 2.000000
50% 8.000000 7.000000 7.000000 6.000000 3.000000
75% 9.000000 9.000000 8.000000 8.000000 6.000000
max 10.000000 10.000000 10.000000 10.000000 14.000000
clubbing reading tv theater movies \
count 8299.000000 8299.000000 8299.000000 8299.000000 8299.000000
mean 5.745993 7.678515 5.304133 6.776118 7.919629
std 2.502218 2.006565 2.529135 2.235152 1.700927
min 0.000000 1.000000 1.000000 0.000000 0.000000
25% 4.000000 7.000000 3.000000 5.000000 7.000000
50% 6.000000 8.000000 6.000000 7.000000 8.000000
75% 8.000000 9.000000 7.000000 9.000000 9.000000
max 10.000000 13.000000 10.000000 10.000000 10.000000
concerts music shopping yoga exphappy \
count 8299.000000 8299.000000 8299.000000 8299.000000 8277.000000
mean 6.825401 7.851066 5.631281 4.339197 5.534131
std 2.156283 1.791827 2.608913 2.717612 1.734059
min 0.000000 1.000000 1.000000 0.000000 1.000000
25% 5.000000 7.000000 4.000000 2.000000 5.000000
50% 7.000000 8.000000 6.000000 4.000000 6.000000
75% 8.000000 9.000000 8.000000 7.000000 7.000000
max 10.000000 10.000000 10.000000 10.000000 10.000000
expnum attr1_1 sinc1_1 intel1_1 fun1_1 \
count 1800.000000 8299.000000 8299.000000 8299.000000 8289.000000
mean 5.570556 22.514632 17.396389 20.265613 17.457043
std 4.762569 12.587674 7.046700 6.783003 6.085239
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 2.000000 15.000000 15.000000 17.390000 15.000000
50% 4.000000 20.000000 18.180000 20.000000 18.000000
75% 8.000000 25.000000 20.000000 23.810000 20.000000
max 20.000000 100.000000 60.000000 50.000000 50.000000
amb1_1 shar1_1 attr4_1 sinc4_1 intel4_1 \
count 8279.000000 8257.000000 6489.000000 6489.000000 6489.000000
mean 10.682539 11.845111 26.394360 11.071506 12.636308
std 6.124888 6.362154 16.297045 6.659233 6.717476
min 0.000000 0.000000 5.000000 0.000000 0.000000
25% 5.000000 9.520000 10.000000 6.000000 8.000000
50% 10.000000 10.640000 25.000000 10.000000 10.000000
75% 15.000000 16.000000 35.000000 15.000000 16.000000
max 53.000000 30.000000 95.000000 35.000000 35.000000
fun4_1 amb4_1 shar4_1 attr2_1 sinc2_1 \
count 6489.000000 6489.000000 6467.000000 8299.000000 8299.000000
mean 15.566805 9.780089 11.014845 30.362192 13.273691
std 7.328256 6.998428 6.060150 16.249937 6.976775
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 10.000000 5.000000 7.000000 20.000000 10.000000
50% 15.000000 10.000000 10.000000 25.000000 15.000000
75% 20.000000 15.000000 15.000000 40.000000 18.750000
max 45.000000 50.000000 40.000000 100.000000 50.000000
intel2_1 fun2_1 amb2_1 shar2_1 attr3_1 \
count 8299.000000 8299.000000 8289.000000 8289.000000 8273.000000
mean 14.416891 18.422620 11.744499 11.854817 7.084733
std 6.263304 6.577929 6.886532 6.167314 1.395783
min 0.000000 0.000000 0.000000 0.000000 2.000000
25% 10.000000 15.000000 6.000000 10.000000 6.000000
50% 15.000000 20.000000 10.000000 10.000000 7.000000
75% 20.000000 20.000000 15.000000 15.630000 8.000000
max 40.000000 50.000000 50.000000 30.000000 10.000000
sinc3_1 fun3_1 intel3_1 amb3_1 attr5_1 \
count 8273.000000 8273.000000 8273.000000 8273.000000 4906.000000
mean 8.294935 7.704460 8.403965 7.578388 6.941908
std 1.407460 1.564321 1.076608 1.778315 1.498653
min 2.000000 2.000000 3.000000 2.000000 2.000000
25% 8.000000 7.000000 8.000000 7.000000 6.000000
50% 8.000000 8.000000 8.000000 8.000000 7.000000
75% 9.000000 9.000000 9.000000 9.000000 8.000000
max 10.000000 10.000000 10.000000 10.000000 10.000000
sinc5_1 intel5_1 fun5_1 amb5_1 dec \
count 4906.000000 4906.000000 4906.000000 4906.000000 8378.000000
mean 7.927232 8.284346 7.426213 7.617611 0.419909
std 1.627054 1.283657 1.779129 1.773094 0.493573
min 1.000000 3.000000 2.000000 1.000000 0.000000
25% 7.000000 8.000000 6.000000 7.000000 0.000000
50% 8.000000 8.000000 8.000000 8.000000 0.000000
75% 9.000000 9.000000 9.000000 9.000000 1.000000
max 10.000000 10.000000 10.000000 10.000000 1.000000
attr sinc intel fun amb \
count 8176.000000 8101.000000 8082.000000 8028.000000 7666.000000
mean 6.189995 7.175164 7.368597 6.400598 6.777524
std 1.950169 1.740315 1.550453 1.953702 1.794055
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 5.000000 6.000000 6.000000 5.000000 6.000000
50% 6.000000 7.000000 7.000000 7.000000 7.000000
75% 8.000000 8.000000 8.000000 8.000000 8.000000
max 10.000000 10.000000 10.000000 10.000000 10.000000
shar like prob met match_es \
count 7311.000000 8138.000000 8069.000000 8003.000000 7205.000000
mean 5.474559 6.134087 5.207523 0.948769 3.207814
std 2.156363 1.841285 2.129565 0.989889 2.444813
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 4.000000 5.000000 4.000000 0.000000 2.000000
50% 6.000000 6.000000 5.000000 0.000000 3.000000
75% 7.000000 7.000000 7.000000 2.000000 4.000000
max 10.000000 10.000000 10.000000 8.000000 18.000000
attr1_s sinc1_s intel1_s fun1_s amb1_s \
count 4096.000000 4096.000000 4096.000000 4096.000000 4096.000000
mean 20.791624 15.434255 17.243708 15.260869 11.144619
std 12.968524 6.915322 6.596420 5.356969 5.514028
min 3.000000 0.000000 0.000000 1.000000 0.000000
25% 14.810000 10.000000 10.000000 10.000000 7.000000
50% 17.650000 15.790000 18.420000 15.910000 10.000000
75% 25.000000 20.000000 20.000000 20.000000 15.000000
max 95.000000 50.000000 40.000000 40.000000 23.810000
shar1_s attr3_s sinc3_s intel3_s fun3_s \
count 4096.000000 4000.00000 4000.000000 4000.000000 4000.000000
mean 12.457925 7.21125 8.082000 8.257750 7.692500
std 5.921789 1.41545 1.455741 1.179317 1.626839
min 0.000000 3.00000 1.000000 4.000000 3.000000
25% 9.000000 7.00000 7.000000 8.000000 7.000000
50% 12.500000 7.00000 8.000000 8.000000 8.000000
75% 16.280000 8.00000 9.000000 9.000000 9.000000
max 30.000000 10.00000 10.000000 10.000000 10.000000
amb3_s satis_2 length numdat_2 attr7_2 \
count 4000.000000 7463.000000 7463.000000 7433.000000 1984.000000
mean 7.589250 5.711510 1.843495 2.338087 32.819556
std 1.793136 1.820764 0.975662 0.631240 17.155270
min 2.000000 1.000000 1.000000 1.000000 10.000000
25% 7.000000 5.000000 1.000000 2.000000 20.000000
50% 8.000000 6.000000 1.000000 2.000000 30.000000
75% 9.000000 7.000000 3.000000 3.000000 40.000000
max 10.000000 10.000000 3.000000 3.000000 80.000000
sinc7_2 intel7_2 fun7_2 amb7_2 shar7_2 \
count 1955.000000 1984.000000 1984.000000 1955.000000 1974.000000
mean 13.529923 15.293851 18.868448 7.286957 12.156028
std 7.977482 7.292868 8.535963 6.125187 8.241906
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 10.000000 10.000000 10.000000 0.000000 5.000000
50% 10.000000 15.000000 20.000000 5.000000 10.000000
75% 20.000000 20.000000 24.000000 10.000000 20.000000
max 40.000000 50.000000 50.000000 20.000000 40.000000
attr1_2 sinc1_2 intel1_2 fun1_2 amb1_2 \
count 7445.000000 7463.000000 7463.000000 7463.000000 7463.000000
mean 26.217194 15.865084 17.813755 17.654765 9.913436
std 14.388694 6.658494 6.535894 6.129746 5.675550
min 5.000000 0.000000 0.000000 0.000000 0.000000
25% 16.670000 10.000000 15.000000 15.000000 5.000000
50% 20.000000 16.670000 19.050000 18.370000 10.000000
75% 30.000000 20.000000 20.000000 20.000000 15.000000
max 85.000000 50.000000 40.000000 50.000000 22.220000
shar1_2 attr4_2 sinc4_2 intel4_2 fun4_2 \
count 7463.000000 5775.000000 5775.000000 5775.000000 5775.000000
mean 12.760263 26.806234 11.929177 12.103030 15.163810
std 6.651547 16.402836 6.401556 5.990607 7.290107
min 0.000000 6.000000 0.000000 0.000000 0.000000
25% 10.000000 10.000000 8.000000 8.000000 9.000000
50% 13.000000 25.000000 10.000000 10.000000 15.000000
75% 16.670000 40.000000 15.000000 15.000000 20.000000
max 35.000000 100.000000 35.000000 40.000000 50.000000
amb4_2 shar4_2 attr2_2 sinc2_2 intel2_2 \
count 5775.000000 5775.000000 5775.000000 5775.00000 5775.000000
mean 9.342511 11.320866 29.344369 13.89823 13.958265
std 5.856329 6.296155 14.551171 6.17169 5.398621
min 0.000000 0.000000 0.000000 0.00000 0.000000
25% 5.000000 7.000000 19.150000 10.00000 10.000000
50% 10.000000 10.000000 25.000000 15.00000 15.000000
75% 10.000000 15.000000 38.460000 19.23000 17.390000
max 35.000000 40.000000 85.000000 40.00000 30.770000
fun2_2 amb2_2 shar2_2 attr3_2 sinc3_2 \
count 5775.000000 5775.000000 5775.000000 7463.000000 7463.000000
mean 17.967233 11.909735 12.887976 7.125285 7.931529
std 6.100307 6.313281 5.615691 1.371390 1.503236
min 0.000000 0.000000 0.000000 2.000000 2.000000
25% 15.000000 10.000000 10.000000 7.000000 7.000000
50% 18.520000 10.000000 13.950000 7.000000 8.000000
75% 20.000000 15.090000 16.515000 8.000000 9.000000
max 40.000000 50.000000 30.000000 10.000000 10.000000
intel3_2 fun3_2 amb3_2 attr5_2 sinc5_2 \
count 7463.000000 7463.000000 7463.000000 4377.000000 4377.000000
mean 8.238912 7.602171 7.486802 6.827964 7.394106
std 1.180280 1.548200 1.744634 1.411096 1.588145
min 4.000000 1.000000 2.000000 2.000000 2.000000
25% 8.000000 7.000000 7.000000 6.000000 6.000000
50% 8.000000 8.000000 8.000000 7.000000 8.000000
75% 9.000000 9.000000 9.000000 8.000000 8.000000
max 10.000000 10.000000 10.000000 10.000000 10.000000
intel5_2 fun5_2 amb5_2 you_call them_cal \
count 4377.000000 4377.000000 4377.000000 3974.000000 3974.000000
mean 7.838702 7.279415 7.332191 0.780825 0.981631
std 1.280936 1.647478 1.521854 1.611694 1.382139
min 2.000000 2.000000 2.000000 0.000000 0.000000
25% 7.000000 6.000000 6.000000 0.000000 0.000000
50% 8.000000 7.000000 7.000000 0.000000 1.000000
75% 9.000000 8.000000 8.000000 1.000000 1.000000
max 10.000000 10.000000 10.000000 21.000000 9.000000
date_3 numdat_3 num_in_3 attr1_3 sinc1_3 \
count 3974.000000 1496.000000 668.000000 3974.000000 3974.000000
mean 0.376950 1.230615 0.934132 24.384524 16.588583
std 0.484683 1.294557 0.753902 13.712120 7.471537
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 1.000000 1.000000 15.220000 10.000000
50% 0.000000 1.000000 1.000000 20.000000 16.670000
75% 1.000000 1.000000 1.000000 30.000000 20.000000
max 1.000000 9.000000 4.000000 80.000000 65.000000
intel1_3 fun1_3 amb1_3 shar1_3 attr7_3 \
count 3974.000000 3974.000000 3974.000000 3974.000000 2016.000000
mean 19.411346 16.233415 10.898075 12.699142 31.330357
std 6.124502 5.163777 5.900697 6.557041 17.551540
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 16.670000 14.810000 5.000000 10.000000 20.000000
50% 20.000000 16.330000 10.000000 14.290000 25.000000
75% 20.000000 20.000000 15.000000 16.670000 40.000000
max 45.000000 30.000000 30.000000 55.000000 80.000000
sinc7_3 intel7_3 fun7_3 amb7_3 shar7_3 \
count 2016.000000 2016.000000 2016.000000 2016.000000 2016.000000
mean 15.654266 16.679563 16.418155 7.823909 12.207837
std 9.336288 7.880088 7.231325 6.100502 8.615985
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 10.000000 10.000000 10.000000 0.000000 5.000000
50% 15.000000 18.000000 17.000000 10.000000 10.000000
75% 20.000000 20.000000 20.000000 10.000000 20.000000
max 60.000000 45.000000 40.000000 30.000000 55.000000
attr4_3 sinc4_3 intel4_3 fun4_3 amb4_3 \
count 2959.000000 2959.000000 2959.000000 2959.000000 2959.000000
mean 25.610341 10.751267 11.524839 14.276783 9.207503
std 17.477134 5.740351 6.004222 6.927869 6.385852
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 10.000000 7.000000 7.000000 9.000000 5.000000
50% 20.000000 10.000000 10.000000 12.000000 9.000000
75% 37.000000 15.000000 15.000000 20.000000 10.000000
max 80.000000 40.000000 30.000000 30.000000 40.000000
shar4_3 attr2_3 sinc2_3 intel2_3 fun2_3 \
count 2959.000000 2959.000000 2959.000000 2959.000000 2959.000000
mean 11.253802 24.970936 10.923285 11.952687 14.959108
std 6.516178 17.007669 6.226283 7.010650 7.935509
min 0.000000 5.000000 0.000000 0.000000 0.000000
25% 7.000000 10.000000 7.000000 7.000000 9.000000
50% 10.000000 20.000000 10.000000 10.000000 15.000000
75% 15.000000 35.000000 15.000000 15.000000 20.000000
max 45.000000 80.000000 50.000000 60.000000 40.000000
amb2_3 shar2_3 attr3_3 sinc3_3 intel3_3 \
count 2959.000000 2016.000000 3974.000000 3974.000000 3974.000000
mean 9.526191 11.966270 7.240312 8.093357 8.388777
std 6.403117 7.012067 1.576596 1.610309 1.459094
min 0.000000 0.000000 2.000000 2.000000 3.000000
25% 6.000000 5.000000 7.000000 7.000000 8.000000
50% 10.000000 10.000000 7.000000 8.000000 8.000000
75% 10.000000 15.000000 8.000000 9.000000 9.000000
max 50.000000 45.000000 12.000000 12.000000 12.000000
fun3_3 amb3_3 attr5_3 sinc5_3 intel5_3 \
count 3974.000000 3974.000000 2016.000000 2016.000000 2016.000000
mean 7.658782 7.391545 6.810020 7.615079 7.932540
std 1.744670 1.961417 1.507341 1.504551 1.340868
min 2.000000 1.000000 2.000000 2.000000 4.000000
25% 7.000000 6.000000 6.000000 7.000000 7.000000
50% 8.000000 8.000000 7.000000 8.000000 8.000000
75% 9.000000 9.000000 8.000000 9.000000 9.000000
max 12.000000 12.000000 10.000000 10.000000 10.000000
fun5_3 amb5_3
count 2016.000000 2016.000000
mean 7.155258 7.048611
std 1.672787 1.717988
min 1.000000 1.000000
25% 6.000000 6.000000
50% 7.000000 7.000000
75% 8.000000 8.000000
max 10.000000 10.000000
In [13]:
#raw_dataset.groupby("gender").agg({"iid": pd.Series.nunique})
raw_dataset.groupby('match').iid.count()
Out[13]:
match
0 6998
1 1380
Name: iid, dtype: int64
In [14]:
raw_dataset.groupby('career').iid.nunique().sort_values(ascending=False)
Out[14]:
career
Finance 13
professor 12
Lawyer 11
Professor 10
Social Worker 9
Consulting 8
lawyer 7
Business 7
Law 7
Academic 6
Investment Banking 6
Writer 5
Scientist 5
Management Consulting 5
law 5
Teacher 4
Social Work 4
research 4
Clinical Psychologist 4
undecided 4
Epidemiologist 4
Entrepreneur 4
Actress 3
consulting 3
academia 3
Researcher 3
School Psychologist 3
teacher 3
investment banking 3
scientist 3
..
academician 1
academics 1
academics or journalism 1
assistant master of the universe (otherwise it's too much work) 1
attorney? 1
banker / academia 1
biology industry 1
boxing champ 1
ceo 1
Writer/Editor 1
Work in an investment bank 1
Work at the UN 1
Trade Specialist 1
Speech Language Pathologist 1
Speech Pathologist 1
TBA 1
TEACHING 1
Teacher/Professor 1
To create early childhood intervention programs 1
To go into Finance 1
Trading 1
What a question! 1
UN Civil Servant 1
University President 1
University Professor 1
Urban Planner 1
Venture Capital/Consulting/Government 1
WRITING 1
Wall Street Economist 1
? 1
Name: iid, dtype: int64
In [18]:
raw_dataset.groupby(["gender","match"]).iid.nunique()
Out[18]:
gender match
0 0 274
1 221
1 0 277
1 231
Name: iid, dtype: int64
In [9]:
raw_dataset.rename(columns={"age_o":"age_of_partner","race_o":"race_of_partner"},inplace=True)
In [ ]:
Content source: xebia-france/luigi-airflow
Similar notebooks: