In [44]:
import pandas as pd
import numpy as np
import sklearn
import subprocess
import warnings
pd.set_option('display.max_columns', None)
In [45]:
source_path = "/home/dolounet/dev/workshops/"
In [46]:
# The raw CSV is encoded in ISO-8859-1, hence the explicit encoding argument in read_csv below
In [47]:
raw_dataset = pd.read_csv(source_path + "Speed_Dating_Data.csv", encoding="ISO-8859-1")
In [48]:
raw_dataset.head(2)
Out[48]:
In [49]:
raw_dataset_copy = raw_dataset.copy()  # .copy() gives an independent copy; plain assignment would only alias the same DataFrame
In [50]:
columns_by_types = raw_dataset.columns.to_series().groupby(raw_dataset.dtypes).groups
In [51]:
raw_dataset.dtypes.value_counts()
Out[51]:
In [52]:
raw_dataset.isnull().sum().head(3)
Out[52]:
In [53]:
summary = raw_dataset.describe() #.transpose()
#print (summary.head())
In [54]:
#raw_dataset.groupby("gender").agg({"iid": pd.Series.nunique})
raw_dataset.groupby('gender').iid.nunique()
Out[54]:
In [55]:
raw_dataset.groupby('career').iid.nunique().sort_values(ascending=False).head(5)
Out[55]:
In [56]:
raw_dataset.groupby(["gender","match"]).iid.nunique()
Out[56]:
In [58]:
from love_matcher.refactored.main_cut import MainClass, RawSetProcessing, Trainer, FeatureEngineering, TuneParameters
local_path = source_path
local_filename = "Speed_Dating_Data.csv"
main_class = MainClass(workspace=local_path)
raw_dataframe = main_class.read_dataframe()
my_variables_selection = ["iid", "pid", "match","gender","date","go_out","sports","tvsports","exercise","dining",
"museums","art","hiking","gaming","clubbing","reading","tv","theater","movies",
"concerts","music","shopping","yoga"]
In [59]:
raw_set = RawSetProcessing(features=my_variables_selection, dataframe=raw_dataframe)
dataset_df = raw_set.combiner_pipeline()
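RawSetProcessing comes from the companion love_matcher package, whose source is not shown in this notebook. As a rough, hedged sketch only (not the package's actual code), its combiner_pipeline presumably amounts to a plain-pandas cleanup along these lines, assuming it subsets the selected columns and discards incomplete and duplicated rows:
In [ ]:
# Illustrative sketch only -- the real logic lives in love_matcher's RawSetProcessing.
def combiner_pipeline_sketch(dataframe, features):
    subset_df = dataframe[features]      # keep only the selected variables
    clean_df = subset_df.dropna()        # drop rows with missing values (assumption)
    return clean_df.drop_duplicates()    # drop exact duplicates (assumption)

# dataset_df_sketch = combiner_pipeline_sketch(raw_dataframe, my_variables_selection)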
In [60]:
dataset_df.head(2)
Out[60]:
In [61]:
# Number of unique participants
dataset_df.iid.nunique()
Out[61]:
In [62]:
dataset_df.shape
Out[62]:
In [63]:
suffix_me = "_me"
suffix_partner = "_partner"
my_label = "match_perc"
In [64]:
feat_eng = FeatureEngineering(suffix_me, suffix_partner, my_label)
feat_engineered_df = feat_eng.get_partner_features(dataset_df)
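FeatureEngineering.get_partner_features is also part of love_matcher and its code is not reproduced here. Judging by the _me / _partner suffixes and the iid / pid identifiers, the step is presumably a self-join that puts each participant's interests next to their partner's. A hedged plain-pandas sketch of that idea:
In [ ]:
# Illustrative sketch only -- not the actual love_matcher implementation.
# Assumption: my row's pid (partner id) is matched against the partner's own iid,
# and overlapping columns are disambiguated with the _me / _partner suffixes.
def get_partner_features_sketch(df, suffix_me="_me", suffix_partner="_partner"):
    return df.merge(df, left_on="pid", right_on="iid",
                    suffixes=(suffix_me, suffix_partner))

# Note: the real pipeline evidently also keeps a single unsuffixed "match" label,
# which this sketch does not reproduce.
# feat_engineered_sketch = get_partner_features_sketch(dataset_df)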
In [65]:
feat_engineered_df.head(2)
Out[65]:
In [66]:
feat_engineered_df.groupby("match").iid_me.count()
Out[66]:
This model aims to predict whether two participants match, based on each person's interests.
Variables: the activity and interest ratings selected in the features list below, used once for the participant (_me) and once for the partner (_partner).
In [67]:
import sklearn
print(sklearn.__version__)
In [68]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import subprocess
In [69]:
features = ["iid", "gender", "date", "go_out", "sports", "tvsports", "exercise", "dining", "museums", "art",
            "hiking", "gaming", "clubbing", "reading", "tv", "theater", "movies", "concerts", "music",
            "shopping", "yoga"]
label = "match"
In [70]:
# Add the "_me" and "_partner" suffixes to every feature name and return both lists concatenated
def process_features_names(features, suffix_1, suffix_2):
    features_me = [feat + suffix_1 for feat in features]
    features_partner = [feat + suffix_2 for feat in features]
    features_all = features_me + features_partner
    return features_all
features_model = process_features_names(features, suffix_me, suffix_partner)
In [71]:
explanatory = feat_engineered_df[features_model]
explained = feat_engineered_df[label]
In [72]:
explanatory[explanatory["iid_me"] == 1].head(5)
Out[72]:
In [73]:
from sklearn import ensemble
In [74]:
warnings.filterwarnings("ignore")
In [75]:
# Parameters for Random Forest
parameters = [
{'max_depth': [8,10,12,14,16,18],
'min_samples_split': [10,15,20,25,30],
'min_samples_leaf': [10,15,20,25,30]
}
]
scores = ['precision', 'recall']
In [76]:
RFModel = ensemble.RandomForestClassifier(n_estimators=5, oob_score=False)
In [82]:
tune = TuneParameters(explanatory, explained, RFModel, parameters, scores)
best_parameters = tune.combiner_pipeline()
x_train, x_test, y_train, y_test = tune.create_train_test_splits()
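TuneParameters is another love_matcher wrapper. Based on the GridSearchCV and train_test_split imports above and the parameter grid and score list it receives, the tuning step presumably looks roughly like the following plain-sklearn sketch (the split ratio, cv value and default scoring are assumptions, not the package's settings):
In [ ]:
# Illustrative sketch only -- assumptions about what TuneParameters wraps.
from sklearn.model_selection import GridSearchCV, train_test_split

def tune_sketch(explanatory, explained, estimator, param_grid, scoring="precision"):
    # Hold out a test split, then grid-search the training split with cross-validation.
    x_tr, x_te, y_tr, y_te = train_test_split(explanatory, explained,
                                              test_size=0.3, random_state=42)
    grid = GridSearchCV(estimator, param_grid, scoring=scoring, cv=3)
    grid.fit(x_tr, y_tr)
    return grid.best_params_, (x_tr, x_te, y_tr, y_te)

# best_params_sketch, splits_sketch = tune_sketch(explanatory, explained, RFModel, parameters)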
In [83]:
estimator_RFC = ensemble.RandomForestClassifier()  # not referenced again in this notebook
In [84]:
best_parameters
Out[84]:
In [87]:
train = Trainer(x_train, y_train, x_test, y_test, best_parameters)
estimator, score_train, score_test = train.combiner_pipeline()
print(estimator, score_train, score_test)
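Trainer.combiner_pipeline returns the fitted estimator together with its train and test scores. Without the package source, a hedged plain-sklearn equivalent of this final step might look like this (using accuracy via score() and unpacking best_parameters as keyword arguments are assumptions):
In [ ]:
# Illustrative sketch only -- assumptions about what Trainer does with the tuned parameters.
from sklearn import ensemble

def train_sketch(x_train, y_train, x_test, y_test, best_parameters):
    model = ensemble.RandomForestClassifier(**best_parameters)
    model.fit(x_train, y_train)
    score_train = model.score(x_train, y_train)   # mean accuracy on the training split
    score_test = model.score(x_test, y_test)      # mean accuracy on the held-out split
    return model, score_train, score_test

# estimator_sketch, tr_sketch, te_sketch = train_sketch(x_train, y_train, x_test, y_test, best_parameters)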