In [1]:
import pandas as pd
import numpy as np
import sklearn
pd.set_option('display.max_columns', None)
In [2]:
source_path = "/Users/sandrapietrowska/Documents/Trainings/luigi/data_source/"
In [3]:
raw_dataset = pd.read_csv(source_path + "Speed_Dating_Data.csv")
In [4]:
raw_dataset.head(3)
Out[4]:
In [5]:
raw_dataset_copy = raw_dataset.copy()  # .copy() so the original frame is not aliased
In [6]:
#merged_datasets = raw_dataset.merge(raw_dataset_copy, left_on="pid", right_on="iid")
#merged_datasets[["iid_x","gender_x","pid_y","gender_y"]].head(5)
#same_gender = merged_datasets[merged_datasets["gender_x"] == merged_datasets["gender_y"]]
#same_gender.head()
In [7]:
columns_by_types = raw_dataset.columns.to_series().groupby(raw_dataset.dtypes).groups
In [8]:
raw_dataset.dtypes.value_counts()
Out[8]:
In [9]:
raw_dataset.isnull().sum().head(3)
Out[9]:
In [10]:
summary = raw_dataset.describe() #.transpose()
print(summary)
In [11]:
#raw_dataset.groupby("gender").agg({"iid": pd.Series.nunique})
raw_dataset.groupby('gender').iid.nunique()
Out[11]:
In [12]:
raw_dataset.groupby('career').iid.nunique().sort_values(ascending=False).head(5)
Out[12]:
In [13]:
raw_dataset.groupby(["gender","match"]).iid.nunique()
Out[13]:
In [14]:
local_path = "/Users/sandrapietrowska/Documents/Trainings/luigi/data_source/"
local_filename = "Speed_Dating_Data.csv"
my_variables_selection = ["iid", "pid", "match", "gender", "date", "go_out", "sports", "tvsports",
                          "exercise", "dining", "museums", "art", "hiking", "gaming", "clubbing",
                          "reading", "tv", "theater", "movies", "concerts", "music", "shopping", "yoga"]
In [37]:
class RawSetProcessing(object):
    """
    This class aims to load and clean the dataset.
    """
    def __init__(self, source_path, filename, features):
        self.source_path = source_path
        self.filename = filename
        self.features = features

    # Load data
    def load_data(self):
        raw_dataset_df = pd.read_csv(self.source_path + self.filename)
        return raw_dataset_df

    # Select variables to process and include in the model
    def subset_features(self, df):
        sel_vars_df = df[self.features]
        return sel_vars_df

    # Remove ids with missing values
    @staticmethod
    def remove_ids_with_missing_values(df):
        sel_vars_filled_df = df.dropna()
        return sel_vars_filled_df

    @staticmethod
    def drop_duplicated_values(df):
        df = df.drop_duplicates()
        return df

    # Combine processing stages
    def combiner_pipeline(self):
        raw_dataset = self.load_data()
        subset_df = self.subset_features(raw_dataset)
        subset_no_dup_df = self.drop_duplicated_values(subset_df)
        subset_filled_df = self.remove_ids_with_missing_values(subset_no_dup_df)
        return subset_filled_df
In [38]:
raw_set = RawSetProcessing(local_path, local_filename, my_variables_selection)
dataset_df = raw_set.combiner_pipeline()
In [39]:
dataset_df.head(3)
Out[39]:
In [41]:
# Number of unique participants
dataset_df.iid.nunique()
Out[41]:
In [42]:
dataset_df.shape
Out[42]:
In [43]:
def get_partner_features(df):
    # Drop the partner-specific columns, then self-merge on pid == iid so each
    # row pairs a participant (suffix _me) with their date partner (suffix _partner).
    df_partner = df.copy()
    df_partner = df_partner.drop(['pid', 'match'], axis=1).drop_duplicates()
    merged_datasets = df.merge(df_partner, how="inner", left_on="pid", right_on="iid",
                               suffixes=('_me', '_partner'))
    return merged_datasets
In [46]:
feat_eng_df = get_partner_features(dataset_df)
feat_eng_df.head(3)
Out[46]:
This model aims to answer the question: what is the interest profile of the participants who get the most matches?
Variables: the interest ratings (sports, tvsports, exercise, dining, museums, art, hiking, gaming, clubbing, reading, tv, theater, movies, concerts, music, shopping, yoga) plus gender and dating habits (date, go_out), for both the participant and the partner.
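A quick way to eyeball that profile before modelling is to compare mean interest ratings between matched and unmatched rows. A minimal sketch, assuming dataset_df produced by the pipeline above (the column names come from my_variables_selection):
# Average interest ratings for matched vs. unmatched rows
interest_cols = ["sports", "tvsports", "exercise", "dining", "museums", "art",
                 "hiking", "gaming", "clubbing", "reading", "tv", "theater",
                 "movies", "concerts", "music", "shopping", "yoga"]
dataset_df.groupby("match")[interest_cols].mean().transpose()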
In [20]:
import sklearn
print(sklearn.__version__)
In [21]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import subprocess
In [61]:
#features = list(["gender","age_o","race_o","goal","samerace","imprace","imprelig","date","go_out","career_c"])
features = ["gender", "date", "go_out", "sports", "tvsports", "exercise", "dining", "museums", "art",
            "hiking", "gaming", "clubbing", "reading", "tv", "theater", "movies", "concerts", "music",
            "shopping", "yoga"]
suffix_me = "_me"
suffix_partner = "_partner"
In [62]:
# Add a suffix to each element of the features list, once per partner role
def process_features_names(features, suffix_1, suffix_2):
    features_me = [feat + suffix_1 for feat in features]
    features_partner = [feat + suffix_2 for feat in features]
    features_all = features_me + features_partner
    return features_all

features_model = process_features_names(features, suffix_me, suffix_partner)
In [63]:
label = "match"  # target variable: whether the date resulted in a match
explanatory = feat_eng_df[features_model]
explained = feat_eng_df[label]
In [24]:
clf = tree.DecisionTreeClassifier(min_samples_split=20, min_samples_leaf=10, max_depth=4)
clf = clf.fit(explanatory, explained)
In [25]:
# Requires Graphviz (http://www.graphviz.org/) so the `dot` command is available
with open("data.dot", 'w') as f:
    tree.export_graphviz(clf, out_file=f, feature_names=features_model,
                         class_names=["no match", "match"])  # assumes class order 0, 1
import subprocess
subprocess.call(['dot', '-Tpdf', 'data.dot', '-o', 'data.pdf'])
Out[25]:
In [30]:
# Split the dataset into a 70% train set and a 30% test set
X_train, X_test, y_train, y_test = train_test_split(explanatory, explained, test_size=0.3, random_state=0)
In [31]:
parameters = [
    {'criterion': ['gini', 'entropy'], 'max_depth': [4, 6, 10, 12, 14],
     'min_samples_split': [10, 20, 30], 'min_samples_leaf': [10, 15, 20]}
]
scores = ['precision', 'recall']
In [32]:
dtc = tree.DecisionTreeClassifier()
clf = GridSearchCV(dtc, parameters, n_jobs=3, cv=5, refit=True)
In [64]:
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print("")
    clf = GridSearchCV(dtc, parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)
    print("Best parameters set found on development set:")
    print("")
    print(clf.best_params_)
    print("")
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print("")
In [69]:
best_param_dtc = tree.DecisionTreeClassifier(criterion="entropy", min_samples_split=10, min_samples_leaf=10, max_depth=14)
best_param_dtc = best_param_dtc.fit(explanatory, explained)
In [71]:
best_param_dtc.feature_importances_
Out[71]:
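The raw importances array is hard to read on its own; pairing each value with its feature name makes the ranking explicit. A minimal sketch using the features_model list defined earlier:
# Map each importance back to its feature name, highest first
importances = pd.Series(best_param_dtc.feature_importances_, index=features_model)
importances.sort_values(ascending=False).head(10)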
In [9]:
# Rename partner columns to clearer names (exploratory; these columns are not in the model selection)
raw_dataset.rename(columns={"age_o": "age_of_partner", "race_o": "race_of_partner"}, inplace=True)
In [302]:
# Toy example of the self-merge-with-suffixes pattern used in get_partner_features
raw_data = {
    'subject_id': ['14', '15', '16', '17', '18'],
    'first_name': ['Sue', 'Maria', 'Sandra', 'Kate', 'Aurelie'],
    'last_name': ['Bonder', 'Black', 'Balwner', 'Brice', 'Btisan'],
    'pid': ['4', '5', '6', '7', '8'],
}
df_a = pd.DataFrame(raw_data, columns=['subject_id', 'first_name', 'last_name', 'pid'])
df_a
raw_data = {
    'subject_id': ['4', '5', '6', '7', '8'],
    'first_name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
    'last_name': ['Bonder', 'Black', 'Balwner', 'Brice', 'Btisan'],
    'pid': ['14', '15', '16', '17', '18'],
}
df_b = pd.DataFrame(raw_data, columns=['subject_id', 'first_name', 'last_name', 'pid'])
df_b
df_a.merge(df_b, left_on='pid', right_on='subject_id', how='outer', suffixes=('_me', '_partner'))
Out[302]: