In [14]:
# Import the libraries we will be using
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
# from dstools import data_tools  # only needed to plot decision-tree surfaces
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = 10, 8
In [13]:
df = pd.read_pickle('../data_processeing/Yelp_Cuisine_Chinese.pkl')
print(df.shape)
df.head(2)
Out[13]:
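Because label 0 is later treated as "unlabeled", it helps to look at the class balance up front. A minimal check, assuming the label column is `cuisine_Chinese` as used below:
In [ ]:
# Distribution of the cuisine label
# (0 is treated as "unlabeled" in the split further down)
df['cuisine_Chinese'].value_counts()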
In [29]:
#df.review_count.unique()
In [15]:
df2 = df.copy()
In [31]:
df_att = df2.loc[:, 'AcceptsInsurance':'WiFi']  # label-based slice is inclusive of 'WiFi'
att_type = {}
col_b = []  # attributes whose values are purely boolean
col_m = []  # attributes with multi-valued (non-boolean) levels
for i in range(df_att.shape[1]):
    temp = df_att.groupby(df_att.iloc[:, i]).size().keys()
    print(temp.name, ': ', temp.values)
    if set(temp.values).issubset({False, True}):
        att_type[temp.name] = 'b'
        col_b.append(temp.name)
    else:
        att_type[temp.name] = 'm'
        col_m.append(temp.name)
print(len(col_b), len(col_m))
col_all = col_b + col_m
In [35]:
### Join the label with the expanded "attributes" and "review_count"
df_att_b = df2.loc[:,col_b].join(df[['review_count','cuisine_Chinese']])
df_att_b.head(1)
Out[35]:
In [36]:
df_att_m = df2.loc[:,col_m].join(df[['review_count','cuisine_Chinese']])
df_att_m.head(1)
Out[36]:
In [37]:
df_att_all = df_att.join(df[['review_count','cuisine_Chinese']])
df_att_all.head(1)
Out[37]:
In [38]:
df_att_all.shape
Out[38]:
In [39]:
df_att_all_filled = df_att_all.fillna(value='none')  # make missing values an explicit 'none' category for get_dummies
df_att_all_filled.head(3)
Out[39]:
In [45]:
## Transform the attribute columns into binary (dummy) variables
df_chin_bi = pd.get_dummies(df_att_all_filled,
                            prefix=col_all, prefix_sep='_', dummy_na=False,
                            columns=col_all, sparse=False, drop_first=False)
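A quick way to confirm the dummy expansion worked as intended is to list the columns generated for a single attribute; `WiFi` is used here purely as an example:
In [ ]:
# Columns created for one attribute after get_dummies
df_chin_bi.filter(like='WiFi_').columns.tolist()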
In [56]:
## Reorder columns so the label and review_count come first
df_chin_bi = df_chin_bi.loc[:,['cuisine_Chinese','review_count'] + list(df_chin_bi.columns.values[2:])]
In [57]:
df_chin_bi.head(2)
Out[57]:
In [58]:
df_chin_bi.shape
Out[58]:
In [59]:
# Split into training (labeled) and test (unlabeled) sets; label 0 means unlabeled
df_chin_train = df_chin_bi[df_chin_bi['cuisine_Chinese'] != 0]
df_chin_test = df_chin_bi[df_chin_bi['cuisine_Chinese'] == 0]
In [61]:
# Penalty and C-value tuning grids
penalties = ['l1', 'l2']
# 20 log-spaced C values from exp(-40) to exp(20)
C_values = sorted([np.exp(a * 5) for a in np.linspace(-8, 4, 20)])
C_values
Out[61]:
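The same grid can be written with a single `np.exp` over a linspace; a purely illustrative equivalence check:
In [ ]:
# Equivalent construction of the C grid, shown for clarity
C_values_alt = np.exp(np.linspace(-40, 20, 20))
np.allclose(C_values, C_values_alt)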
In [ ]:
# dummy variable
In [78]:
X = df_chin_train.iloc[:, 1:]  # all feature columns (label excluded)
Y = df_chin_train['cuisine_Chinese'].apply(lambda n: n - 1)  # shift labels down by 1 so they start at 0
In [79]:
Y.unique()
Out[79]:
In [80]:
# Train/validation split with ratio 0.7
# LR GridSearch 1
X_train, X_vali, Y_train, Y_vali = train_test_split(
    X, Y, train_size=0.7, random_state=90)  # fix random_state for reproducible samples
# tune hyperparameters over the penalty/C grid
tuned_parameters_LR = {'C': C_values,
                       'penalty': penalties}
lr = LogisticRegression()
gr_lr = GridSearchCV(lr, param_grid=tuned_parameters_LR, cv=5)
gr_lr.fit(X_train, Y_train)
gr_lr_auc_scores = cross_val_score(gr_lr, X, Y, scoring="roc_auc", cv=5)
print("Logistic regression training size (0.7): Mean AUC %.4f\n" % np.mean(gr_lr_auc_scores))
print("Best parameters set found:")
print(gr_lr.best_params_)
print("Grid search score on the validation set:")
print(gr_lr.score(X_vali, Y_vali))
means = gr_lr.cv_results_['mean_test_score']
stds = gr_lr.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, gr_lr.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
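The same `cv_results_` can also be viewed as a sortable table, which is often easier to scan than the printed loop above; a convenience sketch:
In [ ]:
# Grid-search results as a DataFrame, best first
cv_df = pd.DataFrame(gr_lr.cv_results_)
cv_df[['param_C', 'param_penalty', 'mean_test_score', 'std_test_score']].sort_values(
    'mean_test_score', ascending=False).head(10)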
In [81]:
p_opt = gr_lr.best_params_['penalty']  # access by key; dict value order is not reliable
c_opt = gr_lr.best_params_['C']
In [91]:
models = []
labels = []
# refit logistic regression with the tuned hyperparameters
lr_opt = LogisticRegression(C=c_opt, penalty=p_opt, random_state=99)
lr_opt.fit(X_train, Y_train)
models.append(lr_opt)
labels.append('Logistic Regression')
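If the selected penalty is `l1`, many coefficients are driven to zero, so the largest surviving weights hint at the most informative attributes. A quick, purely exploratory look, assuming `X` is still the full feature DataFrame:
In [ ]:
# Coefficients with the largest absolute values (exploratory)
coefs = pd.Series(lr_opt.coef_[0], index=X.columns)
coefs.reindex(coefs.abs().sort_values(ascending=False).index)[:10]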
In [92]:
lr_opt
Out[92]:
In [120]:
plt.rcParams['figure.figsize'] = 6, 6  # smaller figure size for the ROC plots
#### Plot one model - train & validation sets ####
def plot_ROC(model, Y_train, X_train, Y_vali, X_vali, label):
    # AUC is computed from predicted probabilities, with y_true as the first argument
    print("AUC on the %s train data = %.5f" %
          (label, metrics.roc_auc_score(Y_train, model.predict_proba(X_train)[:, 1])))
    print("AUC on the %s validation data = %.5f\n" %
          (label, metrics.roc_auc_score(Y_vali, model.predict_proba(X_vali)[:, 1])))
    # fpr, tpr, thresholds = metrics.roc_curve(Y_train, model.predict_proba(X_train)[:, 1])
    # plt.plot(fpr, tpr, label='{} Train set'.format(label))
    fpr, tpr, thresholds = metrics.roc_curve(Y_vali, model.predict_proba(X_vali)[:, 1])
    plt.plot(fpr, tpr, label='{} Validation set'.format(label))
    plt.xlabel("false-positive rate", size=18)
    plt.ylabel("true-positive rate", size=18)
    plt.title("ROC Curve of {} Model".format(label), size=20)
    plt.legend(loc='best')
### More than one model - validation sets ###
def plot_ROCs(models, Y_trains, X_trains, Y_valis, X_valis, labels):
    for model, Y_train, X_train, Y_vali, X_vali, label in \
            zip(models, Y_trains, X_trains, Y_valis, X_valis, labels):
        print("AUC on the %s train data = %.5f" %
              (label, metrics.roc_auc_score(Y_train, model.predict_proba(X_train)[:, 1])))
        print("AUC on the %s validation data = %.5f\n" %
              (label, metrics.roc_auc_score(Y_vali, model.predict_proba(X_vali)[:, 1])))
        fpr, tpr, thresholds = metrics.roc_curve(Y_vali, model.predict_proba(X_vali)[:, 1])
        plt.plot(fpr, tpr, label=label)
    plt.xlabel("false-positive rate", size=18)
    plt.ylabel("true-positive rate", size=18)
    plt.title("ROC Curves for models and transformed X sets", size=20)
    plt.legend(loc='best')
In [121]:
plot_ROC(lr_opt, Y_train, X_train, Y_vali, X_vali, label='LR')
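Finally, the tuned model can score the unlabeled businesses that were split off earlier. A minimal sketch, assuming `df_chin_test` keeps the same column layout as the training frame:
In [ ]:
# Score the unlabeled rows with the tuned model (illustrative)
X_unlabeled = df_chin_test.iloc[:, 1:]  # drop the (all-zero) label column
chinese_prob = lr_opt.predict_proba(X_unlabeled)[:, 1]
chinese_prob[:10]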
In [ ]: