In [1]:
import pandas as pd
import numpy as np
import sklearn
import subprocess
import warnings
pd.set_option('display.max_columns', None)
In [2]:
source_path = "/Users/sandrapietrowska/Documents/Trainings/luigi/data_source/"
In [3]:
# coding: ISO-8859-1
In [4]:
raw_dataset = pd.read_csv(source_path + "Speed_Dating_Data.csv",encoding = "ISO-8859-1")
In [5]:
raw_dataset.head(2)
Out[5]:
In [6]:
raw_dataset_copy = raw_dataset.copy()  # explicit copy; plain assignment would only alias the same DataFrame
In [7]:
columns_by_types = raw_dataset.columns.to_series().groupby(raw_dataset.dtypes).groups
In [8]:
raw_dataset.dtypes.value_counts()
Out[8]:
In [9]:
raw_dataset.isnull().sum().head(3)
Out[9]:
In [10]:
summary = raw_dataset.describe() #.transpose()
#print (summary.head())
In [11]:
#raw_dataset.groupby("gender").agg({"iid": pd.Series.nunique})
raw_dataset.groupby('gender').iid.nunique()
Out[11]:
In [12]:
raw_dataset.groupby('career').iid.nunique().sort_values(ascending=False).head(5)
Out[12]:
In [13]:
raw_dataset.groupby(["gender","match"]).iid.nunique()
Out[13]:
In [14]:
local_path = "/Users/sandrapietrowska/Documents/Trainings/luigi/data_source/"
local_filename = "Speed_Dating_Data.csv"
my_variables_selection = ["iid", "pid", "match","gender","date","go_out","sports","tvsports","exercise","dining",
"museums","art","hiking","gaming","clubbing","reading","tv","theater","movies",
"concerts","music","shopping","yoga"]
In [15]:
class RawSetProcessing():
"""
This class aims to load and clean the dataset.
"""
def __init__(self,source_path,filename,features):
self.source_path = source_path
self.filename = filename
self.features = features
# Load data
def load_data(self):
raw_dataset_df = pd.read_csv(self.source_path + self.filename,encoding = "ISO-8859-1")
return raw_dataset_df
# Select variables to process and include in the model
def subset_features(self, df):
sel_vars_df = df[self.features]
return sel_vars_df
    # Remove ids with missing values
    @staticmethod
    def remove_ids_with_missing_values(df):
sel_vars_filled_df = df.dropna()
return sel_vars_filled_df
@staticmethod
def drop_duplicated_values(df):
df = df.drop_duplicates()
return df
# Combine processing stages
def combiner_pipeline(self):
raw_dataset = self.load_data()
subset_df = self.subset_features(raw_dataset)
subset_no_dup_df = self.drop_duplicated_values(subset_df)
subset_filled_df = self.remove_ids_with_missing_values(subset_no_dup_df)
return subset_filled_df
In [16]:
raw_set = RawSetProcessing(local_path, local_filename, my_variables_selection)
dataset_df = raw_set.combiner_pipeline()
In [17]:
dataset_df.head(2)
Out[17]:
In [18]:
# Number of unique participants
dataset_df.iid.nunique()
Out[18]:
In [19]:
dataset_df.shape
Out[19]:
In [20]:
suffix_me = "_me"
suffix_partner = "_partner"
my_label = "match_perc"
In [21]:
class FeatureEngineering():
"""
    This class engineers the model features: it pairs each participant's attributes with the partner's and derives match-related variables.
"""
def __init__(self,suffix_1, suffix_2, label):
self.suffix_1 = suffix_1
self.suffix_2 = suffix_2
self.label = label
def get_partner_features(self, df, ignore_vars=True):
df_partner = df.copy()
if ignore_vars is True:
            df_partner = df_partner.drop(['pid', 'match'], axis=1).drop_duplicates()
else:
df_partner = df_partner.copy()
merged_datasets = df.merge(df_partner, how = "inner",left_on="pid", right_on="iid",suffixes=(self.suffix_1,self.suffix_2))
return merged_datasets
def add_success_failure_match(self, df):
df['total_match'] = df['match'].groupby(df['iid']).transform('sum')
df['total_dates'] = df['match'].groupby(df['iid']).transform('count')
df['total_nomatch'] = df['total_dates'] - df['total_match']
df['match_perc'] = df['total_match'] / df['total_dates']
return df
def label_to_categories(self, df):
df['match_success'] = pd.cut(df[self.label], bins=(0,0.2,1), include_lowest=True)
return df
    @staticmethod
    def aggregate_data(df):
        model_set = df.drop(["pid", "match"], axis=1)
        model_set = model_set.drop_duplicates()
        return model_set
# Combine engineering stages
def combiner_pipeline(self, df):
add_match_feat_df = self.add_success_failure_match(df)
labels_df = self.label_to_categories(df)
model_set = self.aggregate_data(labels_df)
return model_set
In [22]:
feat_eng = FeatureEngineering(suffix_me, suffix_partner, my_label)
#feat_engineered_model_1_df = feat_eng.combiner_pipeline(dataset_df)
#feat_engineered_model_1_df.head(2)
In [23]:
feat_engineered_df = feat_eng.get_partner_features(dataset_df)
feat_engineered_df.head(2)
Out[23]:
In [24]:
feat_engineered_df.groupby("match").iid_me.count()
Out[24]:
This model aims to predict a match based on the interests of the two people.
Variables: gender, dating habits (date, go_out) and the interest ratings listed in the features cell below, once for the participant (suffix _me) and once for the partner (suffix _partner).
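As a rough illustration (hypothetical values; the real columns come from the suffixed feature list built below), one observation pairs the interest ratings of the two people, and the label is the match flag of that date:
In [ ]:
# Hypothetical example of a single paired observation (made-up values, for illustration only)
example_row = {
    "sports_me": 7, "movies_me": 9, "yoga_me": 2,                  # interests of the participant
    "sports_partner": 5, "movies_partner": 8, "yoga_partner": 6,   # interests of the partner
    "match": 1                                                     # label: 1 if both said yes
}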
In [25]:
import sklearn
print (sklearn.__version__)
In [26]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import subprocess
In [27]:
#features = list(["gender","age_o","race_o","goal","samerace","imprace","imprelig","date","go_out","career_c"])
features = list(['iid',"gender","date","go_out","sports","tvsports","exercise","dining","museums","art",
"hiking","gaming","clubbing","reading","tv","theater","movies","concerts","music",
"shopping","yoga"])
label = "match"
In [28]:
#add suffix to each element of list
def process_features_names(features, suffix_1, suffix_2):
features_me = [feat + suffix_1 for feat in features]
features_partner = [feat + suffix_2 for feat in features]
features_all = features_me + features_partner
return features_all
features_model = process_features_names(features, suffix_me, suffix_partner)
In [29]:
explanatory = feat_engineered_df[features_model]
explained = feat_engineered_df[label]
In [30]:
explanatory[explanatory["iid_me"] == 1].head(5)
Out[30]:
In [ ]:
In [31]:
from sklearn import ensemble
In [32]:
warnings.filterwarnings("ignore")
In [33]:
# Parameters for Random Forest
parameters = [
{'max_depth': [8,10,12,14,16,18],
'min_samples_split': [10,15,20,25,30],
'min_samples_leaf': [10,15,20,25,30]
}
]
scores = ['precision', 'recall']
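Note that this grid contains 6 × 5 × 5 = 150 parameter combinations; with cv=10 and two scoring metrics, GridSearchCV fits the forest roughly 150 × 10 × 2 = 3,000 times (plus one refit of the best model per metric), which is why the search takes a while even with only 5 trees per forest.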
In [34]:
RFModel = ensemble.RandomForestClassifier(n_estimators=5, oob_score=False)
In [41]:
class TuneParameters():
def __init__(self, explanatory_vars, explained_var, estimator, parameters, scores):
self.explanatory_vars = explanatory_vars
self.explained_var = explained_var
self.estimator = estimator
self.parameters = parameters
self.scores = scores
def create_train_test_splits(self):
X_train, X_test, y_train, y_test = train_test_split(self.explanatory_vars, self.explained_var,
test_size = 0.5, random_state = 0,
stratify = self.explained_var)
return X_train, X_test, y_train, y_test
def tuning_parameters(self, trainset_x, testset_x, trainset_y, testset_y):
for score in self.scores:
print("# Tuning hyper-parameters for %s" % score)
print("")
grid_rfc = GridSearchCV(self.estimator, self.parameters,n_jobs=100, cv=10, refit=True,
scoring='%s_macro' % score)
grid_rfc.fit(trainset_x, trainset_y)
print("Best parameters set found on development set:")
print("")
print(grid_rfc.best_params_)
print("")
y_true, y_pred = testset_y, grid_rfc.predict(testset_x)
print(classification_report(y_true, y_pred))
print("")
best_parameters = grid_rfc.best_estimator_.get_params()
return best_parameters
def combiner_pipeline(self):
X_train, X_test, y_train, y_test = self.create_train_test_splits()
best_params = self.tuning_parameters(X_train, X_test, y_train, y_test)
return best_params
In [42]:
tune = TuneParameters(explanatory, explained, RFModel, parameters, scores)
best_parameters = tune.combiner_pipeline()
X_train, X_test, y_train, y_test = tune.create_train_test_splits()
In [43]:
estimator_RFC = ensemble.RandomForestClassifier()
In [44]:
best_parameters
Out[44]:
In [45]:
class Trainer():
    def __init__(self, X_train, y_train, X_test, y_test, best_params):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.estimator = None
        self.best_params = best_params
def build_best_estimator(self):
params = self.best_params
model = ensemble.RandomForestClassifier(**params)
self.estimator = model.fit(self.X_train,self.y_train)
return self.estimator
def score_estimator_train(self):
return self.estimator.score(self.X_train,self.y_train)
def score_estimator_test(self):
return self.estimator.score(self.X_test,self.y_test)
def combiner_pipeline(self):
self.estimator = self.build_best_estimator()
score_train = self.score_estimator_train()
score_test = self.score_estimator_test()
return self.estimator, score_train, score_test
In [46]:
train = Trainer(X_train, y_train, X_test, y_test, best_parameters)
estimator, score_train, score_test = train.combiner_pipeline()
print (estimator, score_train, score_test)
In [ ]:
In [38]:
import unittest
from pandas.testing import assert_frame_equal
There is an odd issue: the code does not work when the helper is kept as a method and accessed through self.xx; I also tried self.assertEqual without success, so the function is redefined at module level below.
In [39]:
def get_partner_features(df, suffix_1, suffix_2, ignore_vars=True):
df_partner = df.copy()
if ignore_vars is True:
        df_partner = df_partner.drop(['pid', 'match'], axis=1).drop_duplicates()
else:
df_partner = df_partner.copy()
merged_datasets = df.merge(df_partner, how = "inner",left_on="pid", right_on="iid",suffixes=(suffix_1,suffix_2))
return merged_datasets
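A possible alternative to copying the function out of the class (an untested sketch, assuming the FeatureEngineering class defined earlier is in scope and df_a is the small DataFrame built in the test below) is to instantiate the class inside the test and call the method directly:
In [ ]:
# Untested sketch: exercise the class method directly instead of a module-level copy
feat_eng_under_test = FeatureEngineering("_me", "_partner", "match_perc")
output_values = feat_eng_under_test.get_partner_features(df_a, ignore_vars=False)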
In [46]:
class FeatureEngineeringTest(unittest.TestCase):
def test_get_partner_features(self):
"""
:return:
"""
# Given
raw_data_a = {
'iid': ['1', '2', '3', '4', '5','6'],
'first_name': ['Sue', 'Maria', 'Sandra', 'Bill', 'Brian','Bruce'],
'sport':['foot','run','volley','basket','swim','tv'],
'pid': ['4', '5', '6', '1', '2','3'],}
df_a = pd.DataFrame(raw_data_a, columns = ['iid', 'first_name', 'sport','pid'])
expected_data = {
'iid_me': ['1', '2', '3', '4', '5','6'],
'first_name_me': ['Sue', 'Maria', 'Sandra', 'Bill', 'Brian','Bruce'],
'sport_me': ['foot','run','volley','basket','swim','tv'],
'pid_me': ['4', '5', '6', '1', '2','3'],
'iid_partner': ['4', '5', '6', '1', '2','3'],
'first_name_partner': ['Bill', 'Brian','Bruce','Sue', 'Maria', 'Sandra'],
'sport_partner': ['basket','swim','tv','foot','run','volley'],
'pid_partner':['1', '2', '3', '4', '5','6']}
expected_output_values = pd.DataFrame(expected_data,
columns = ['iid_me','first_name_me','sport_me','pid_me',
'iid_partner','first_name_partner','sport_partner',
'pid_partner'])
# When
output_values = get_partner_features(df_a, "_me","_partner",ignore_vars=False)
# Then
assert_frame_equal(output_values, expected_output_values)
In [47]:
suite = unittest.TestLoader().loadTestsFromTestCase(FeatureEngineeringTest)
unittest.TextTestRunner(verbosity=2).run(suite)
Out[47]:
In [ ]: