In [1]:
import pandas as pd
import numpy as np
import sklearn
import subprocess
import warnings
pd.set_option('display.max_columns', None)

In [2]:
source_path = "/Users/sandrapietrowska/Documents/Trainings/luigi/data_source/"

In [3]:
# Note: the source CSV is encoded in ISO-8859-1 (see the read_csv calls below)

Import data


In [4]:
raw_dataset = pd.read_csv(source_path + "Speed_Dating_Data.csv", encoding="ISO-8859-1")

Data exploration

Shape, types, distribution, modalities and potential missing values
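As a starting point, the raw dimensions can be checked directly (a minimal sketch, not part of the original run):

In [ ]:
# rows x columns of the raw file
raw_dataset.shape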


In [5]:
raw_dataset.head(2)


Out[5]:
iid id gender idg condtn wave round position positin1 order partner pid match int_corr samerace age_o race_o pf_o_att pf_o_sin pf_o_int pf_o_fun pf_o_amb pf_o_sha dec_o attr_o sinc_o intel_o fun_o amb_o shar_o like_o prob_o met_o age field field_cd undergra mn_sat tuition race imprace imprelig from zipcode income goal date go_out career career_c sports tvsports exercise dining museums art hiking gaming clubbing reading tv theater movies concerts music shopping yoga exphappy expnum attr1_1 sinc1_1 intel1_1 fun1_1 amb1_1 shar1_1 attr4_1 sinc4_1 intel4_1 fun4_1 amb4_1 shar4_1 attr2_1 sinc2_1 intel2_1 fun2_1 amb2_1 shar2_1 attr3_1 sinc3_1 fun3_1 intel3_1 amb3_1 attr5_1 sinc5_1 intel5_1 fun5_1 amb5_1 dec attr sinc intel fun amb shar like prob met match_es attr1_s sinc1_s intel1_s fun1_s amb1_s shar1_s attr3_s sinc3_s intel3_s fun3_s amb3_s satis_2 length numdat_2 attr7_2 sinc7_2 intel7_2 fun7_2 amb7_2 shar7_2 attr1_2 sinc1_2 intel1_2 fun1_2 amb1_2 shar1_2 attr4_2 sinc4_2 intel4_2 fun4_2 amb4_2 shar4_2 attr2_2 sinc2_2 intel2_2 fun2_2 amb2_2 shar2_2 attr3_2 sinc3_2 intel3_2 fun3_2 amb3_2 attr5_2 sinc5_2 intel5_2 fun5_2 amb5_2 you_call them_cal date_3 numdat_3 num_in_3 attr1_3 sinc1_3 intel1_3 fun1_3 amb1_3 shar1_3 attr7_3 sinc7_3 intel7_3 fun7_3 amb7_3 shar7_3 attr4_3 sinc4_3 intel4_3 fun4_3 amb4_3 shar4_3 attr2_3 sinc2_3 intel2_3 fun2_3 amb2_3 shar2_3 attr3_3 sinc3_3 intel3_3 fun3_3 amb3_3 attr5_3 sinc5_3 intel5_3 fun5_3 amb5_3
0 1 1.0 0 1 1 1 10 7 NaN 4 1 11.0 0 0.14 0 27.0 2.0 35.0 20.0 20.0 20.0 0.0 5.0 0 6.0 8.0 8.0 8.0 8.0 6.0 7.0 4.0 2.0 21.0 Law 1.0 NaN NaN NaN 4.0 2.0 4.0 Chicago 60,521 69,487.00 2.0 7.0 1.0 lawyer NaN 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 3.0 2.0 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN 35.0 20.0 15.0 20.0 5.0 5.0 6.0 8.0 8.0 8.0 7.0 NaN NaN NaN NaN NaN 1 6.0 9.0 7.0 7.0 6.0 5.0 7.0 6.0 2.0 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 2.0 1.0 NaN NaN NaN NaN NaN NaN 19.44 16.67 13.89 22.22 11.11 16.67 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 7.0 8.0 7.0 6.0 NaN NaN NaN NaN NaN 1.0 1.0 0.0 NaN NaN 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.0 7.0 7.0 7.0 7.0 NaN NaN NaN NaN NaN
1 1 1.0 0 1 1 1 10 7 NaN 3 2 12.0 0 0.54 0 22.0 2.0 60.0 0.0 0.0 40.0 0.0 0.0 0 7.0 8.0 10.0 7.0 7.0 5.0 8.0 4.0 2.0 21.0 Law 1.0 NaN NaN NaN 4.0 2.0 4.0 Chicago 60,521 69,487.00 2.0 7.0 1.0 lawyer NaN 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 3.0 2.0 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN 35.0 20.0 15.0 20.0 5.0 5.0 6.0 8.0 8.0 8.0 7.0 NaN NaN NaN NaN NaN 1 7.0 8.0 7.0 8.0 5.0 6.0 7.0 5.0 1.0 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 2.0 1.0 NaN NaN NaN NaN NaN NaN 19.44 16.67 13.89 22.22 11.11 16.67 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 7.0 8.0 7.0 6.0 NaN NaN NaN NaN NaN 1.0 1.0 0.0 NaN NaN 15.0 20.0 20.0 15.0 15.0 15.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.0 7.0 7.0 7.0 7.0 NaN NaN NaN NaN NaN

In [6]:
# Take an explicit copy so exploratory changes cannot mutate raw_dataset
raw_dataset_copy = raw_dataset.copy()

In [7]:
check1 = raw_dataset_copy[raw_dataset_copy["iid"] == 1]
check1_sel = check1[["iid", "pid", "match","gender","date","go_out","sports","tvsports","exercise","dining",
                          "museums","art","hiking","gaming","clubbing","reading","tv","theater","movies",
                          "concerts","music","shopping","yoga"]]

In [8]:
check1_sel.drop_duplicates().head(2)


Out[8]:
iid pid match gender date go_out sports tvsports exercise dining museums art hiking gaming clubbing reading tv theater movies concerts music shopping yoga
0 1 11.0 0 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0
1 1 12.0 0 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0

In [9]:
#merged_datasets = raw_dataset.merge(raw_dataset_copy, left_on="pid", right_on="iid")
#merged_datasets[["iid_x","gender_x","pid_y","gender_y"]].head(5)
#same_gender = merged_datasets[merged_datasets["gender_x"] == merged_datasets["gender_y"]]
#same_gender.head()

In [10]:
columns_by_types = raw_dataset.columns.to_series().groupby(raw_dataset.dtypes).groups
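The resulting mapping can be used to list the columns of a given dtype (a minimal sketch, not part of the original run):

In [ ]:
# e.g. the columns stored as plain objects (strings)
columns_by_types[np.dtype('object')]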

In [11]:
raw_dataset.dtypes.value_counts()


Out[11]:
float64    174
int64       13
object       8
dtype: int64

In [12]:
raw_dataset.isnull().sum().head(3)


Out[12]:
iid       0
id        1
gender    0
dtype: int64

In [13]:
summary = raw_dataset.describe() #.transpose()
#print (summary.head())

In [14]:
#raw_dataset.groupby("gender").agg({"iid": pd.Series.nunique})
raw_dataset.groupby('gender').iid.nunique()


Out[14]:
gender
0    274
1    277
Name: iid, dtype: int64

In [15]:
raw_dataset.groupby('career').iid.nunique().sort_values(ascending=False).head(5)


Out[15]:
career
Finance          13
professor        12
Lawyer           11
Professor        10
Social Worker     9
Name: iid, dtype: int64

In [16]:
raw_dataset.groupby(["gender","match"]).iid.nunique()


Out[16]:
gender  match
0       0        274
        1        221
1       0        277
        1        231
Name: iid, dtype: int64

Data processing


In [17]:
local_path = "/Users/sandrapietrowska/Documents/Trainings/luigi/data_source/"
local_filename = "Speed_Dating_Data.csv"
my_variables_selection = ["iid", "pid", "match","gender","date","go_out","sports","tvsports","exercise","dining",
                          "museums","art","hiking","gaming","clubbing","reading","tv","theater","movies",
                          "concerts","music","shopping","yoga"]

In [18]:
class RawSetProcessing(object):
    """
    This class aims to load and clean the dataset.
    """
    def __init__(self,source_path,filename,features):
        self.source_path = source_path
        self.filename = filename
        self.features = features
    
    # Load data
    def load_data(self):
        raw_dataset_df = pd.read_csv(self.source_path + self.filename, encoding="ISO-8859-1")
        return raw_dataset_df
    
    # Select variables to process and include in the model
    def subset_features(self, df):
        sel_vars_df = df[self.features]
        return sel_vars_df
    
    # Remove ids with missing values
    @staticmethod
    def remove_ids_with_missing_values(df):
        sel_vars_filled_df = df.dropna()
        return sel_vars_filled_df
    
    @staticmethod
    def drop_duplicated_values(df):
        df = df.drop_duplicates()
        return df
    
    # Combine processing stages
    def combiner_pipeline(self):
        raw_dataset = self.load_data()
        subset_df = self.subset_features(raw_dataset)
        subset_no_dup_df = self.drop_duplicated_values(subset_df)
        subset_filled_df = self.remove_ids_with_missing_values(subset_no_dup_df)
        return subset_filled_df

In [19]:
raw_set = RawSetProcessing(local_path, local_filename, my_variables_selection)
dataset_df = raw_set.combiner_pipeline()

In [20]:
dataset_df.head(2)


Out[20]:
iid pid match gender date go_out sports tvsports exercise dining museums art hiking gaming clubbing reading tv theater movies concerts music shopping yoga
0 1 11.0 0 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0
1 1 12.0 0 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0

In [21]:
# Number of unique participants
dataset_df.iid.nunique()


Out[21]:
543

In [22]:
dataset_df.shape


Out[22]:
(8271, 23)

Feature engineering


In [46]:
suffix_me = "_me"
suffix_partner = "_partner"
my_label = "match_perc"

In [47]:
class FeatureEngineering(object):
    """
    This class engineers the model features: it joins each participant's
    features with the partner's, computes match counts and builds the label.
    """
    def __init__(self,suffix_1, suffix_2, label):
        self.suffix_1 = suffix_1
        self.suffix_2 = suffix_2
        self.label = label
    
    def get_partner_features(self, df, ignore_vars=True):
        df_partner = df.copy()
        if ignore_vars:
            df_partner = df_partner.drop(['pid', 'match'], axis=1).drop_duplicates()
        merged_datasets = df.merge(df_partner, how="inner", left_on="pid", right_on="iid",
                                   suffixes=(self.suffix_1, self.suffix_2))
        return merged_datasets
    
    def add_success_failure_match(self, df):
        df['total_match'] = df['match'].groupby(df['iid']).transform('sum')
        df['total_dates'] = df['match'].groupby(df['iid']).transform('count')
        df['total_nomatch'] = df['total_dates'] - df['total_match']
        df['match_perc'] = df['total_match'] / df['total_dates']
        return df
    
    def label_to_categories(self, df):
        df['match_success'] = pd.cut(df[self.label], bins=(0,0.1,1), include_lowest=True)
        return df
    
    @staticmethod
    def aggregate_data(df):
        model_set = df.drop(["pid", "match"], axis=1)
        model_set = model_set.drop_duplicates()
        return model_set
    
    # Combine engineering stages
    def combiner_pipeline(self, df):
        add_match_feat_df = self.add_success_failure_match(df)
        labels_df = self.label_to_categories(add_match_feat_df)
        model_set = self.aggregate_data(labels_df)
        return model_set
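For reference on the binning above, a minimal sketch (not from the original notebook) of how pd.cut assigns the label: match rates at or below 0.1 fall into the low bin, everything above into the high bin.

In [ ]:
# 0.0 and 0.1 map to [0, 0.1]; 0.4 maps to (0.1, 1]
pd.cut(pd.Series([0.0, 0.1, 0.4]), bins=(0, 0.1, 1), include_lowest=True)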

In [48]:
feat_eng = FeatureEngineering(suffix_me, suffix_partner, my_label)
feat_engineered_df = feat_eng.combiner_pipeline(dataset_df)

feat_engineered_df.head(5)


Out[48]:
iid gender date go_out sports tvsports exercise dining museums art hiking gaming clubbing reading tv theater movies concerts music shopping yoga total_match total_dates total_nomatch match_perc match_success
0 1 0 7.0 1.0 9.0 2.0 8.0 9.0 1.0 1.0 5.0 1.0 5.0 6.0 9.0 1.0 10.0 10.0 9.0 8.0 1.0 4 10 6 0.4 (0.1, 1]
10 2 0 5.0 1.0 3.0 2.0 7.0 10.0 8.0 6.0 3.0 5.0 8.0 10.0 1.0 9.0 8.0 7.0 8.0 3.0 1.0 2 10 8 0.2 (0.1, 1]
20 3 0 3.0 1.0 3.0 8.0 7.0 8.0 5.0 5.0 8.0 4.0 5.0 7.0 8.0 7.0 7.0 7.0 5.0 8.0 7.0 0 10 10 0.0 [0, 0.1]
30 4 0 5.0 1.0 1.0 1.0 6.0 7.0 6.0 7.0 7.0 5.0 7.0 7.0 7.0 9.0 7.0 8.0 7.0 1.0 8.0 2 10 8 0.2 (0.1, 1]
40 5 0 4.0 1.0 7.0 4.0 7.0 7.0 6.0 8.0 6.0 6.0 8.0 6.0 8.0 6.0 6.0 3.0 7.0 8.0 3.0 2 10 8 0.2 (0.1, 1]

Modelling

This model aims to predict a participant's match success from his or her interests; a quick check of the label's class balance follows the variables list below.

Variables:

  • gender
  • date (In general, how frequently do you go on dates?)
  • go_out (How often do you go out, not necessarily on dates?)
  • sports: Playing sports/ athletics
  • tvsports: Watching sports
  • exercise: Body building/exercising
  • dining: Dining out
  • museums: Museums/galleries
  • art: Art
  • hiking: Hiking/camping
  • gaming: Gaming
  • clubbing: Dancing/clubbing
  • reading: Reading
  • tv: Watching TV
  • theater: Theater
  • movies: Movies
  • concerts: Going to concerts
  • music: Music
  • shopping: Shopping
  • yoga: Yoga/meditation
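As mentioned above, a quick look at the class balance of the label before fitting (a sketch; the counts are not from the original run):

In [ ]:
# distribution of participants across the two match_success bins
feat_engineered_df["match_success"].value_counts()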

In [27]:
import sklearn
print(sklearn.__version__)


0.18.1

In [28]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import subprocess

Variables selection


In [29]:
#features = list(["gender","age_o","race_o","goal","samerace","imprace","imprelig","date","go_out","career_c"])
features = ["gender","date","go_out","sports","tvsports","exercise","dining","museums","art",
            "hiking","gaming","clubbing","reading","tv","theater","movies","concerts","music",
            "shopping","yoga"]
label = "match_success"

In [30]:
#add suffix to each element of list
def process_features_names(features, suffix_1, suffix_2):
    features_me = [feat + suffix_1 for feat in features]
    features_partner = [feat + suffix_2 for feat in features]
    features_all = features_me + features_partner
    return features_all

#features_model = process_features_names(features, suffix_me, suffix_partner)
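For illustration, the helper returns the _me-suffixed names followed by the _partner-suffixed ones (a hypothetical call, not part of the original run):

In [ ]:
# returns ['sports_me', 'tvsports_me', 'sports_partner', 'tvsports_partner']
process_features_names(["sports", "tvsports"], "_me", "_partner")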

In [33]:
explanatory = feat_engineered_df[features]
explained = feat_engineered_df[label]

Decision Tree


In [34]:
clf = tree.DecisionTreeClassifier(min_samples_split=20,min_samples_leaf=10,max_depth=4)
clf = clf.fit(explanatory, explained)
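Before visualizing or tuning the tree, a quick generalization check (a sketch; cross_val_score is not part of the original run and the score will vary):

In [ ]:
from sklearn.model_selection import cross_val_score
# mean accuracy over a 5-fold cross-validation
cross_val_score(clf, explanatory, explained, cv=5).mean()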

In [35]:
# Download http://www.graphviz.org/

with open("data.dot", 'w') as f:
    f = tree.export_graphviz(clf, out_file=f, feature_names=features,
                             class_names=[str(c) for c in clf.classes_])

subprocess.call(['dot', '-Tpdf', 'data.dot', '-o', 'data.pdf'])


Out[35]:
0
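The 0 returned by subprocess.call is dot's exit code: the command completed successfully and wrote data.pdf.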

Tuning Parameters


In [193]:
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(explanatory, explained, test_size=0.5, random_state=0)

In [194]:
parameters = [
  {'criterion': ['gini','entropy'], 'max_depth': [4,6,10,12,14], 
   'min_samples_split': [10,20,30], 'min_samples_leaf': [10,15,20]
  }
]

scores = ['precision', 'recall']

In [195]:
dtc = tree.DecisionTreeClassifier()
clf = GridSearchCV(dtc, parameters,n_jobs=3, cv=5, refit=True)
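With refit=True, calling fit runs the full search and retrains the best model; a minimal sketch (not part of the original run) of inspecting the winner:

In [ ]:
# exhaustive search over the 2 x 5 x 3 x 3 = 90 parameter combinations, 5-fold CV each
clf.fit(X_train, y_train)
print(clf.best_params_)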

In [204]:
#warnings.filterwarnings("ignore")

#for score in scores:
#    print("# Tuning hyper-parameters for %s" % score)
#    print("")

#    clf = GridSearchCV(dtc, parameters, cv=5,
#                       scoring='%s_macro' % score)
#    clf.fit(X_train, y_train)

#    print("Best parameters set found on development set:")
#    print("")
#    print(clf.best_params_)
#    print("")
#    y_true, y_pred = y_test, clf.predict(X_test)
#    print(classification_report(y_true, y_pred))
#    print("")

In [199]:
best_param_dtc = tree.DecisionTreeClassifier(criterion="entropy",min_samples_split=10,min_samples_leaf=10,max_depth=14)
best_param_dtc = best_param_dtc.fit(explanatory, explained)

In [200]:
best_param_dtc.feature_importances_


Out[200]:
array([ 0.        ,  0.        ,  0.05548941,  0.09484015,  0.02814278,
        0.        ,  0.        ,  0.        ,  0.09036398,  0.10293257,
        0.07271205,  0.08113683,  0.06304781,  0.07858956,  0.        ,
        0.06668844,  0.09284968,  0.08549502,  0.03896343,  0.04874829])
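The bare array is hard to read on its own; pairing it with the feature names ranks the strongest drivers (a sketch, not part of the original output):

In [ ]:
# features sorted by importance, highest first
sorted(zip(features, best_param_dtc.feature_importances_),
       key=lambda pair: pair[1], reverse=True)[:5]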

Test


In [36]:
import unittest
from pandas.util.testing import assert_frame_equal

There is an odd issue: the code does not work when the function is accessed through instance attributes (self.xx), and self.assertEqual was also tried without success, so get_partner_features is redefined below as a standalone function and assert_frame_equal is used for the comparison.


In [42]:
def get_partner_features(df, suffix_1, suffix_2, ignore_vars=True):
    df_partner = df.copy()
    if ignore_vars:
        df_partner = df_partner.drop(['pid', 'match'], axis=1).drop_duplicates()
    merged_datasets = df.merge(df_partner, how="inner", left_on="pid", right_on="iid",
                               suffixes=(suffix_1, suffix_2))
    return merged_datasets

In [43]:
class FeatureEngineeringTest(unittest.TestCase):
    def test_get_partner_features(self):
        """

        :return:
        """
        # Given
        raw_data_a = {
        'iid': ['1', '2', '3', '4', '5','6'],
        'first_name': ['Sue', 'Maria', 'Sandra', 'Bill', 'Brian','Bruce'],
        'sport':['foot','run','volley','basket','swim','tv'],
        'pid': ['4', '5', '6', '1', '2','3'],}
        
        df_a = pd.DataFrame(raw_data_a, columns = ['iid', 'first_name', 'sport','pid'])
        
        expected_output_values = pd.DataFrame({
                'iid_me': ['1', '2', '3', '4', '5','6'],
                'first_name_me': ['Sue', 'Maria', 'Sandra', 'Bill', 'Brian','Bruce'],
                'sport_me': ['foot','run','volley','basket','swim','tv'],
                'pid_me': ['4', '5', '6', '1', '2','3'],
                'iid_partner': ['4', '5', '6', '1', '2','3'],
                'first_name_partner': ['Bill', 'Brian','Bruce','Sue', 'Maria', 'Sandra'],
                'sport_partner': ['basket','swim','tv','foot','run','volley'],
                'pid_partner':['1', '2', '3', '4', '5','6']
            }, columns = ['iid_me','first_name_me','sport_me','pid_me',
                          'iid_partner','first_name_partner','sport_partner','pid_partner'])

        # When

        output_values = get_partner_features(df_a, "_me","_partner",ignore_vars=False)

        # Then

        assert_frame_equal(output_values, expected_output_values)

In [44]:
suite = unittest.TestLoader().loadTestsFromTestCase(FeatureEngineeringTest)
unittest.TextTestRunner(verbosity=2).run(suite)


test_get_partner_features (__main__.FeatureEngineeringTest) ... ok

----------------------------------------------------------------------
Ran 1 test in 0.008s

OK
Out[44]:
<unittest.runner.TextTestResult run=1 errors=0 failures=0>
