In [1]:
import pandas as pd
import numpy as np
import sklearn
pd.set_option('display.max_columns', None)
In [2]:
source_path = "/Users/sandrapietrowska/Documents/Trainings/luigi/data_source/"
In [3]:
raw_dataset = pd.read_csv(source_path + "Speed_Dating_Data.csv")
In [4]:
raw_dataset.head(3)
Out[4]:
In [5]:
# Work on an explicit copy so the raw frame stays untouched
raw_dataset_copy = raw_dataset.copy()
In [6]:
check1 = raw_dataset_copy[raw_dataset_copy["iid"] == 1]
check1_sel = check1[["iid", "pid", "match", "gender", "date", "go_out", "sports", "tvsports",
                     "exercise", "dining", "museums", "art", "hiking", "gaming", "clubbing",
                     "reading", "tv", "theater", "movies", "concerts", "music", "shopping", "yoga"]]
In [7]:
check1_sel.drop_duplicates().head(20)
Out[7]:
In [8]:
#merged_datasets = raw_dataset.merge(raw_dataset_copy, left_on="pid", right_on="iid")
#merged_datasets[["iid_x","gender_x","pid_y","gender_y"]].head(5)
#same_gender = merged_datasets[merged_datasets["gender_x"] == merged_datasets["gender_y"]]
#same_gender.head()
In [9]:
columns_by_types = raw_dataset.columns.to_series().groupby(raw_dataset.dtypes).groups
In [10]:
raw_dataset.dtypes.value_counts()
Out[10]:
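The `columns_by_types` mapping built above can be used to pull the column names behind each of these counts; a minimal sketch (assuming, as the counts suggest, that the file has float64 columns):
In [ ]:
# Peek at the first few float64 column names; the mapping's keys are numpy dtypes
list(columns_by_types[np.dtype('float64')])[:5]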
In [11]:
raw_dataset.isnull().sum().head(3)
Out[11]:
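Only the first three columns are shown above; to see where missing values actually concentrate, the same counts can be sorted (a quick sketch):
In [ ]:
# Columns with the most missing values first
raw_dataset.isnull().sum().sort_values(ascending=False).head(10)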
In [12]:
summary = raw_dataset.describe()  #.transpose()
print(summary)
In [13]:
#raw_dataset.groupby("gender").agg({"iid": pd.Series.nunique})
raw_dataset.groupby('gender').iid.nunique()
Out[13]:
In [14]:
raw_dataset.groupby('career').iid.nunique().sort_values(ascending=False).head(5)
Out[14]:
In [15]:
raw_dataset.groupby(["gender","match"]).iid.nunique()
Out[15]:
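Since `match` is a 0/1 flag, its mean gives the match rate directly; a quick sketch comparing the two genders:
In [ ]:
# Share of rows that ended in a match, per gender
raw_dataset.groupby('gender').match.mean()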
In [16]:
local_path = "/Users/sandrapietrowska/Documents/Trainings/luigi/data_source/"
local_filename = "Speed_Dating_Data.csv"
my_variables_selection = ["iid", "pid", "match", "gender", "date", "go_out", "sports", "tvsports",
                          "exercise", "dining", "museums", "art", "hiking", "gaming", "clubbing",
                          "reading", "tv", "theater", "movies", "concerts", "music", "shopping", "yoga"]
In [17]:
class RawSetProcessing(object):
    """
    This class aims to load and clean the dataset.
    """

    def __init__(self, source_path, filename, features):
        self.source_path = source_path
        self.filename = filename
        self.features = features

    # Load data
    def load_data(self):
        raw_dataset_df = pd.read_csv(self.source_path + self.filename)
        return raw_dataset_df

    # Select variables to process and include in the model
    def subset_features(self, df):
        sel_vars_df = df[self.features]
        return sel_vars_df

    # Remove ids with missing values
    @staticmethod
    def remove_ids_with_missing_values(df):
        sel_vars_filled_df = df.dropna()
        return sel_vars_filled_df

    @staticmethod
    def drop_duplicated_values(df):
        df = df.drop_duplicates()
        return df

    # Combine processing stages
    def combiner_pipeline(self):
        raw_dataset = self.load_data()
        subset_df = self.subset_features(raw_dataset)
        subset_no_dup_df = self.drop_duplicated_values(subset_df)
        subset_filled_df = self.remove_ids_with_missing_values(subset_no_dup_df)
        return subset_filled_df
In [18]:
raw_set = RawSetProcessing(local_path, local_filename, my_variables_selection)
dataset_df = raw_set.combiner_pipeline()
In [19]:
dataset_df.head(3)
Out[19]:
In [20]:
# Number of unique participants
dataset_df.iid.nunique()
Out[20]:
In [21]:
dataset_df.shape
Out[21]:
In [22]:
suffix_me = "_me"
suffix_partner = "_partner"
In [23]:
def get_partner_features(df, suffix_1, suffix_2, ignore_vars=True):
    df_partner = df.copy()
    if ignore_vars:
        # Drop the target and partner id so the merge does not duplicate them
        df_partner = df_partner.drop(["pid", "match"], axis=1).drop_duplicates()
    # Join each participant's row with the partner's features
    merged_datasets = df.merge(df_partner, how="inner", left_on="pid", right_on="iid",
                               suffixes=(suffix_1, suffix_2))
    return merged_datasets
In [24]:
feat_eng_df = get_partner_features(dataset_df, suffix_me, suffix_partner)
feat_eng_df.head(3)
Out[24]:
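A quick sanity check on the merge: every selected feature should now appear twice, once per suffix. A minimal sketch:
In [ ]:
# Expect one *_me and one *_partner version of each interest column
[col for col in feat_eng_df.columns if col.endswith(suffix_partner)][:5]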
This model aims to answer the question: what is the interest profile of the participants who get the most matches?
Variables: gender, dating and going-out frequency, and the interest ratings, each appearing once for the participant (`_me`) and once for the partner (`_partner`).
In [25]:
import sklearn
print(sklearn.__version__)
In [26]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import subprocess
In [27]:
#features = list(["gender","age_o","race_o","goal","samerace","imprace","imprelig","date","go_out","career_c"])
features = ["gender", "date", "go_out", "sports", "tvsports", "exercise", "dining", "museums",
            "art", "hiking", "gaming", "clubbing", "reading", "tv", "theater", "movies",
            "concerts", "music", "shopping", "yoga"]
label = "match"
In [28]:
# Add a suffix to each element of the feature list, once per side of the merge
def process_features_names(features, suffix_1, suffix_2):
    features_me = [feat + suffix_1 for feat in features]
    features_partner = [feat + suffix_2 for feat in features]
    features_all = features_me + features_partner
    print(features_all)
    return features_all

features_model = process_features_names(features, suffix_me, suffix_partner)
In [29]:
feat_eng_df.head(5)
Out[29]:
In [30]:
explanatory = feat_eng_df[features_model]
explained = feat_eng_df[label]
In [31]:
clf = tree.DecisionTreeClassifier(min_samples_split=20, min_samples_leaf=10, max_depth=4)
clf = clf.fit(explanatory, explained)
In [32]:
# Requires Graphviz (http://www.graphviz.org/) for the `dot` command
with open("data.dot", 'w') as f:
    tree.export_graphviz(clf, out_file=f, feature_names=features_model,
                         class_names=["no match", "match"])
subprocess.call(['dot', '-Tpdf', 'data.dot', '-o', 'data.pdf'])
Out[32]:
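If Graphviz is not available, newer scikit-learn versions (0.21+) can draw the tree directly with matplotlib; a minimal sketch, assuming matplotlib is installed:
In [ ]:
import matplotlib.pyplot as plt

# Same fitted tree, rendered without the external `dot` binary
fig, ax = plt.subplots(figsize=(16, 8))
tree.plot_tree(clf, feature_names=features_model, class_names=["no match", "match"],
               filled=True, ax=ax)
plt.show()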
In [33]:
# Split the dataset into a 70% training set and a 30% test set
X_train, X_test, y_train, y_test = train_test_split(explanatory, explained, test_size=0.3, random_state=0)
In [34]:
parameters = [
    {'criterion': ['gini', 'entropy'], 'max_depth': [4, 6, 10, 12, 14],
     'min_samples_split': [10, 20, 30], 'min_samples_leaf': [10, 15, 20]}
]
scores = ['precision', 'recall']
In [35]:
dtc = tree.DecisionTreeClassifier()
clf = GridSearchCV(dtc, parameters, n_jobs=3, cv=5, refit=True)
In [36]:
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print("")
    clf = GridSearchCV(dtc, parameters, cv=5, scoring='%s_macro' % score)
    clf.fit(X_train, y_train)
    print("Best parameters set found on development set:")
    print("")
    print(clf.best_params_)
    print("")
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print("")
In [37]:
best_param_dtc = tree.DecisionTreeClassifier(criterion="entropy", min_samples_split=10, min_samples_leaf=10, max_depth=14)
best_param_dtc = best_param_dtc.fit(explanatory, explained)
In [38]:
best_param_dtc.feature_importances_
Out[38]:
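The raw importances array is hard to read on its own; pairing it with `features_model` and sorting makes the ranking explicit (a minimal sketch):
In [ ]:
# Features ranked by importance in the final tree
sorted(zip(features_model, best_param_dtc.feature_importances_),
       key=lambda pair: pair[1], reverse=True)[:10]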
In [9]:
# Optional: clearer names for the partner attributes used in the commented-out feature list above
raw_dataset.rename(columns={"age_o": "age_of_partner", "race_o": "race_of_partner"}, inplace=True)
In [39]:
import unittest
In [40]:
class FeatureEngineeringTest(unittest.TestCase):

    def test_get_partner_features(self):
        # Given
        raw_data_a = {
            'iid': ['1', '2', '3', '4', '5', '6'],
            'first_name': ['Sue', 'Maria', 'Sandra', 'Bill', 'Brian', 'Bruce'],
            'sport': ['foot', 'run', 'volley', 'basket', 'swim', 'tv'],
            'pid': ['4', '5', '6', '1', '2', '3'],
        }
        df_a = pd.DataFrame(raw_data_a, columns=['iid', 'first_name', 'sport', 'pid'])
        expected_output_values = pd.DataFrame({
            'iid_me': ['1', '2', '3', '4', '5', '6'],
            'first_name_me': ['Sue', 'Maria', 'Sandra', 'Bill', 'Brian', 'Bruce'],
            'sport_me': ['foot', 'run', 'volley', 'basket', 'swim', 'tv'],
            'pid_me': ['4', '5', '6', '1', '2', '3'],
            'iid_partner': ['4', '5', '6', '1', '2', '3'],
            'first_name_partner': ['Bill', 'Brian', 'Bruce', 'Sue', 'Maria', 'Sandra'],
            'sport_partner': ['basket', 'swim', 'tv', 'foot', 'run', 'volley'],
            'pid_partner': ['1', '2', '3', '4', '5', '6'],
        }, columns=['iid_me', 'first_name_me', 'sport_me', 'pid_me',
                    'iid_partner', 'first_name_partner', 'sport_partner', 'pid_partner'])
        # When
        output_values = get_partner_features(df_a, "_me", "_partner", ignore_vars=False)
        # Then: iterating a DataFrame yields its column names, so this checks
        # that both frames expose the same set of columns
        self.assertCountEqual(output_values, expected_output_values)  # assertItemsEqual on Python 2
In [41]:
suite = unittest.TestLoader().loadTestsFromTestCase(FeatureEngineeringTest)
unittest.TextTestRunner(verbosity=2).run(suite)
Out[41]: