In [1]:
import pandas as pd
import numpy as np
import sklearn
import subprocess
import warnings
pd.set_option('display.max_columns', None)
In [2]:
source_path = "/Users/sandrapietrowska/Documents/Trainings/luigi/data_source/"
In [3]:
# The source file is encoded in ISO-8859-1 (Latin-1), not UTF-8
In [4]:
raw_dataset = pd.read_csv(source_path + "Speed_Dating_Data.csv", encoding="ISO-8859-1")
In [5]:
raw_dataset.head(2)
Out[5]:
In [6]:
raw_dataset_copy = raw_dataset.copy()  # .copy() so changes to the copy cannot mutate the original
In [7]:
check1 = raw_dataset_copy[raw_dataset_copy["iid"] == 1]
check1_sel = check1[["iid", "pid", "match", "gender", "date", "go_out", "sports", "tvsports",
                     "exercise", "dining", "museums", "art", "hiking", "gaming", "clubbing",
                     "reading", "tv", "theater", "movies", "concerts", "music", "shopping", "yoga"]]
In [8]:
check1_sel.drop_duplicates().head(2)
Out[8]:
In [9]:
#merged_datasets = raw_dataset.merge(raw_dataset_copy, left_on="pid", right_on="iid")
#merged_datasets[["iid_x","gender_x","pid_y","gender_y"]].head(5)
#same_gender = merged_datasets[merged_datasets["gender_x"] == merged_datasets["gender_y"]]
#same_gender.head()
In [10]:
# Group column names by dtype to get an overview of the schema
columns_by_types = raw_dataset.columns.to_series().groupby(raw_dataset.dtypes).groups
In [11]:
raw_dataset.dtypes.value_counts()
Out[11]:
In [12]:
raw_dataset.isnull().sum().head(3)
Out[12]:
In [13]:
summary = raw_dataset.describe() #.transpose()
#print (summary.head())
In [14]:
#raw_dataset.groupby("gender").agg({"iid": pd.Series.nunique})
raw_dataset.groupby('gender').iid.nunique()
Out[14]:
In [15]:
raw_dataset.groupby('career').iid.nunique().sort_values(ascending=False).head(5)
Out[15]:
In [16]:
raw_dataset.groupby(["gender","match"]).iid.nunique()
Out[16]:
In [17]:
local_path = "/Users/sandrapietrowska/Documents/Trainings/luigi/data_source/"
local_filename = "Speed_Dating_Data.csv"
my_variables_selection = ["iid", "pid", "match", "gender", "date", "go_out", "sports", "tvsports",
                          "exercise", "dining", "museums", "art", "hiking", "gaming", "clubbing",
                          "reading", "tv", "theater", "movies", "concerts", "music", "shopping", "yoga"]
In [18]:
class RawSetProcessing(object):
    """
    This class aims to load and clean the dataset.
    """
    def __init__(self, source_path, filename, features):
        self.source_path = source_path
        self.filename = filename
        self.features = features

    # Load data
    def load_data(self):
        raw_dataset_df = pd.read_csv(self.source_path + self.filename, encoding="ISO-8859-1")
        return raw_dataset_df

    # Select variables to process and include in the model
    def subset_features(self, df):
        sel_vars_df = df[self.features]
        return sel_vars_df

    # Remove ids with missing values
    @staticmethod
    def remove_ids_with_missing_values(df):
        sel_vars_filled_df = df.dropna()
        return sel_vars_filled_df

    # Drop exact duplicate rows
    @staticmethod
    def drop_duplicated_values(df):
        df = df.drop_duplicates()
        return df

    # Combine processing stages
    def combiner_pipeline(self):
        raw_dataset = self.load_data()
        subset_df = self.subset_features(raw_dataset)
        subset_no_dup_df = self.drop_duplicated_values(subset_df)
        subset_filled_df = self.remove_ids_with_missing_values(subset_no_dup_df)
        return subset_filled_df
In [19]:
raw_set = RawSetProcessing(local_path, local_filename, my_variables_selection)
dataset_df = raw_set.combiner_pipeline()
In [20]:
dataset_df.head(2)
Out[20]:
In [21]:
# Number of unique participants
dataset_df.iid.nunique()
Out[21]:
In [22]:
dataset_df.shape
Out[22]:
In [46]:
suffix_me = "_me"
suffix_partner = "_partner"
my_label = "match_perc"
In [47]:
class FeatureEngineering(object):
    """
    This class builds the modelling features and the label from the cleaned dataset.
    """
    def __init__(self, suffix_1, suffix_2, label):
        self.suffix_1 = suffix_1
        self.suffix_2 = suffix_2
        self.label = label

    # Join every participant row to the partner's row via pid -> iid
    def get_partner_features(self, df, ignore_vars=True):
        df_partner = df.copy()
        if ignore_vars is True:
            df_partner = df_partner.drop(["pid", "match"], axis=1).drop_duplicates()
        else:
            df_partner = df_partner.copy()
        merged_datasets = df.merge(df_partner, how="inner", left_on="pid", right_on="iid",
                                   suffixes=(self.suffix_1, self.suffix_2))
        return merged_datasets

    # Per participant: number of matches, number of dates, and the match rate
    def add_success_failure_match(self, df):
        df["total_match"] = df["match"].groupby(df["iid"]).transform("sum")
        df["total_dates"] = df["match"].groupby(df["iid"]).transform("count")
        df["total_nomatch"] = df["total_dates"] - df["total_match"]
        df["match_perc"] = df["total_match"] / df["total_dates"]
        return df

    # Bin the continuous match rate into two categories
    def label_to_categories(self, df):
        df["match_success"] = pd.cut(df[self.label], bins=(0, 0.1, 1), include_lowest=True)
        return df

    # Collapse to one row per participant once the per-date columns are dropped
    @staticmethod
    def aggregate_data(df):
        model_set = df.drop(["pid", "match"], axis=1)
        model_set = model_set.drop_duplicates()
        return model_set

    # Combine engineering stages
    def combiner_pipeline(self, df):
        add_match_feat_df = self.add_success_failure_match(df)
        labels_df = self.label_to_categories(add_match_feat_df)
        model_set = self.aggregate_data(labels_df)
        return model_set
In [48]:
feat_eng = FeatureEngineering(suffix_me, suffix_partner, my_label)
feat_engineered_df = feat_eng.combiner_pipeline(dataset_df)
feat_engineered_df.head(5)
Out[48]:
This model aims to predict a participant's match rate from their interests.
Variables: gender, date, go_out, and the interest ratings (sports, tvsports, exercise, dining, museums, art, hiking, gaming, clubbing, reading, tv, theater, movies, concerts, music, shopping, yoga), as listed in the features cell below.
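Before modelling, a quick sanity check of the label's class balance (a one-liner sketch reusing feat_engineered_df from above):
feat_engineered_df["match_success"].value_counts()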
In [27]:
import sklearn
print(sklearn.__version__)
In [28]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import subprocess
In [29]:
#features = ["gender", "age_o", "race_o", "goal", "samerace", "imprace", "imprelig", "date", "go_out", "career_c"]
features = ["gender", "date", "go_out", "sports", "tvsports", "exercise", "dining", "museums", "art",
            "hiking", "gaming", "clubbing", "reading", "tv", "theater", "movies", "concerts", "music",
            "shopping", "yoga"]
label = "match_success"
In [30]:
# Add a suffix to each element of the features list, one suffix per side of the date
def process_features_names(features, suffix_1, suffix_2):
    features_me = [feat + suffix_1 for feat in features]
    features_partner = [feat + suffix_2 for feat in features]
    features_all = features_me + features_partner
    return features_all
#features_model = process_features_names(features, suffix_me, suffix_partner)
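A quick illustration of the helper on a two-feature list (not part of the pipeline run, just to show the naming scheme):
process_features_names(["sports", "yoga"], suffix_me, suffix_partner)
# -> ['sports_me', 'yoga_me', 'sports_partner', 'yoga_partner']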
In [33]:
explanatory = feat_engineered_df[features]
explained = feat_engineered_df[label]
In [34]:
clf = tree.DecisionTreeClassifier(min_samples_split=20, min_samples_leaf=10, max_depth=4)
clf = clf.fit(explanatory, explained)
In [35]:
# Requires Graphviz: http://www.graphviz.org/
with open("data.dot", "w") as f:
    tree.export_graphviz(clf, out_file=f, feature_names=features,
                         class_names=[str(c) for c in clf.classes_])
subprocess.call(['dot', '-Tpdf', 'data.dot', '-o', 'data.pdf'])
Out[35]:
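If Graphviz is not available, the same tree can be dumped as plain text; a minimal sketch, assuming scikit-learn >= 0.21 is installed:
from sklearn.tree import export_text  # available from scikit-learn 0.21 onwards
print(export_text(clf, feature_names=features))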
In [193]:
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(explanatory, explained, test_size=0.5, random_state=0)
In [194]:
parameters = [
    {'criterion': ['gini', 'entropy'], 'max_depth': [4, 6, 10, 12, 14],
     'min_samples_split': [10, 20, 30], 'min_samples_leaf': [10, 15, 20]}
]
scores = ['precision', 'recall']
In [195]:
dtc = tree.DecisionTreeClassifier()
clf = GridSearchCV(dtc, parameters, n_jobs=3, cv=5, refit=True)
In [204]:
#warnings.filterwarnings("ignore")
#for score in scores:
# print("# Tuning hyper-parameters for %s" % score)
# print("")
# clf = GridSearchCV(dtc, parameters, cv=5,
# scoring='%s_macro' % score)
# clf.fit(X_train, y_train)
# print("Best parameters set found on development set:")
# print("")
# print(clf.best_params_)
# print("")
# y_true, y_pred = y_test, clf.predict(X_test)
# print(classification_report(y_true, y_pred))
# print("")
In [199]:
best_param_dtc = tree.DecisionTreeClassifier(criterion="entropy", min_samples_split=10, min_samples_leaf=10, max_depth=14)
best_param_dtc = best_param_dtc.fit(explanatory, explained)
In [200]:
best_param_dtc.feature_importances_
Out[200]:
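The raw array is hard to read on its own; pairing it with the feature names makes the ranking explicit:
pd.Series(best_param_dtc.feature_importances_, index=features).sort_values(ascending=False).head(10)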
In [36]:
import unittest
from pandas.util.testing import assert_frame_equal
There is an oddity here: calling the code through self.XX inside the test case does not work, so the function under test is redefined at module level below. I also tried self.assertEqual without success.
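If the class-based version were preferred, one alternative sketch (untested here) would be to instantiate the class inside the test and call the bound method on the df_a fixture:
feat_eng = FeatureEngineering("_me", "_partner", "match_perc")
output_values = feat_eng.get_partner_features(df_a, ignore_vars=False)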
In [42]:
def get_partner_features(df, suffix_1, suffix_2, ignore_vars=True):
    df_partner = df.copy()
    if ignore_vars is True:
        df_partner = df_partner.drop(["pid", "match"], axis=1).drop_duplicates()
    else:
        df_partner = df_partner.copy()
    merged_datasets = df.merge(df_partner, how="inner", left_on="pid", right_on="iid",
                               suffixes=(suffix_1, suffix_2))
    return merged_datasets
In [43]:
class FeatureEngineeringTest(unittest.TestCase):

    def test_get_partner_features(self):
        """
        Each participant row should be joined to the matching partner row.
        """
        # Given
        raw_data_a = {
            'iid': ['1', '2', '3', '4', '5', '6'],
            'first_name': ['Sue', 'Maria', 'Sandra', 'Bill', 'Brian', 'Bruce'],
            'sport': ['foot', 'run', 'volley', 'basket', 'swim', 'tv'],
            'pid': ['4', '5', '6', '1', '2', '3'],
        }
        df_a = pd.DataFrame(raw_data_a, columns=['iid', 'first_name', 'sport', 'pid'])
        expected_output_values = pd.DataFrame({
            'iid_me': ['1', '2', '3', '4', '5', '6'],
            'first_name_me': ['Sue', 'Maria', 'Sandra', 'Bill', 'Brian', 'Bruce'],
            'sport_me': ['foot', 'run', 'volley', 'basket', 'swim', 'tv'],
            'pid_me': ['4', '5', '6', '1', '2', '3'],
            'iid_partner': ['4', '5', '6', '1', '2', '3'],
            'first_name_partner': ['Bill', 'Brian', 'Bruce', 'Sue', 'Maria', 'Sandra'],
            'sport_partner': ['basket', 'swim', 'tv', 'foot', 'run', 'volley'],
            'pid_partner': ['1', '2', '3', '4', '5', '6']
        }, columns=['iid_me', 'first_name_me', 'sport_me', 'pid_me',
                    'iid_partner', 'first_name_partner', 'sport_partner', 'pid_partner'])
        # When
        output_values = get_partner_features(df_a, "_me", "_partner", ignore_vars=False)
        # Then
        assert_frame_equal(output_values, expected_output_values)
In [44]:
suite = unittest.TestLoader().loadTestsFromTestCase(FeatureEngineeringTest)
unittest.TextTestRunner(verbosity=2).run(suite)
Out[44]: