In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import gc
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc,recall_score,precision_score,accuracy_score
from sklearn.metrics import average_precision_score
In [ ]:
# Reading in the data
probs = pd.read_parquet('./probs.parquet', engine='pyarrow')
normals = pd.read_parquet('./normals.parquet', engine='pyarrow')
In [ ]:
pident = set(probs.ident.unique())
nident = set(normals.ident.unique())
probs_coding = {}
norms_coding = {}
i = 0
for ident in list(pident):
    probs_coding[ident] = i
    i += 1
for ident in list(nident):
    norms_coding[ident] = i
    i += 1
probs['id'] = probs.ident.apply(lambda x: probs_coding[x])
normals['id'] = normals.ident.apply(lambda x: norms_coding[x])
In [ ]:
df = pd.concat([probs, normals])
In [ ]:
del probs, normals
gc.collect()
In [ ]:
df.event_id = df.event_id.astype('str')
In [ ]:
# save unix time as an index to keep the order of events
df['index'] = df.start.copy()
df.start = df.start.astype('M8[s]')
In [ ]:
# Generating timedeltas
def group_timedelta(group):
    group.sort_values('index', inplace=True)
    group['timedelta'] = group.start - group.start.shift()
    group.timedelta.fillna(0, inplace=True)
    return group

def make_timedelta(df):
    df = df.groupby('id', as_index=False).apply(group_timedelta)
    df = df.reset_index().iloc[:, 2:]
    # note: .dt.seconds keeps only the seconds component of the Timedelta (gaps of a day or
    # more wrap around); .dt.total_seconds() would preserve the full duration
    df.timedelta = df.timedelta.dt.seconds
    return df
df = make_timedelta(df)
In [ ]:
df.timedelta = pd.cut(df.timedelta, [0,1,5,60,300,1800,3600,3600*5,3600*24], right=False)
df.timedelta = df.timedelta.astype(str)
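In [ ]:
# Optional sanity check (this cell is not part of the original pipeline): how the bin edges
# above map a few example gaps in seconds to interval labels; gaps of 24 hours or more fall
# outside the last bin and become 'nan'
pd.cut(pd.Series([0, 3, 90, 7200, 100000]), [0,1,5,60,300,1800,3600,3600*5,3600*24], right=False).astype(str)
# expected: '[0, 1)', '[1, 5)', '[60, 300)', '[3600, 18000)', 'nan'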
In [ ]:
df['weekday'] = df.start.dt.weekday_name
df['hour'] = df.start.dt.hour.astype(str)
In [ ]:
# Adding the device info
phone_models = pd.read_msgpack('./phone_models.msgpack')
device_info = phone_models.loc[:,phone_models.columns.drop(['phone_id', 'allocation_date', 'storage_code', 'handsetmodel_id', 'phone_cnt', 'g3', 'g4', 'android'])]
device_info.rename(columns={'seadme_tyyp': 'network_type'}, inplace=True)
df = df.merge(device_info, on='TAC')
df.drop(labels=['TAC'], axis=1, inplace=True)
df.nfc = df.nfc.astype(str).apply(lambda x: None if x == 'nan' else x)
In [ ]:
cats = ['event_result', 'cause_code', 'sub_cause_code', 'mecontext', 'event_id', 'network_type', 'phone_type', 'Manufacturer', 'Model', 'os', 'category', 'nfc', 'sim_type', 'screen_size', 'timedelta', 'weekday', 'hour']
non_cats = ['index', 'id', 'ident', 'start', 'probs']
def encode_columns(df):
return pd.concat([df[non_cats], pd.get_dummies(df[cats], cats, dummy_na = True)], axis=1)
df = encode_columns(df)
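In [ ]:
# Illustration only (not part of the original flow): get_dummies with a prefix list and
# dummy_na=True yields one indicator column per category plus a '<prefix>_nan' column,
# which is where names like 'sim_type_nan' and 'screen_size_nan' used later come from
pd.get_dummies(pd.DataFrame({'sim_type': ['mini', None]}), ['sim_type'], dummy_na=True)
# -> columns: 'sim_type_mini', 'sim_type_nan'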
In [ ]:
# Generating random sets of ids for the training and testing set - these need to be kept separate in both the
# non-aggregated and aggregated datasets
import random
probs_indexes = random.sample(range(500), 100)
normals_indexes = random.sample(range(500,5500), 1000)
test_indexes = probs_indexes + normals_indexes
In [ ]:
def partition(id, test=test_indexes):
    if id in test:
        return 1
    else:
        return 0
In [ ]:
df['test'] = df['id'].apply(partition)
In [ ]:
df_train = df.loc[df['test'] == 0]
df_test = df.loc[df['test'] == 1]
del df_train['test']
del df_test['test']
In [ ]:
# The following steps should also be applied to the competition dataset, but with the 'probs' column removed
In [ ]:
# grouping the variables by information source
non_cats = ['index', 'id', 'ident', 'start', 'probs'] # - remove 'probs' from the list for the competition dataset
identical = ['id', 'ident', 'probs'] # - remove 'probs' from the list for the competition dataset
event_result = [x for x in df.columns.tolist() if x.startswith('event_result')]
cause_code = [x for x in df.columns.tolist() if x.startswith('cause_code')]
sub_cause_code = [x for x in df.columns.tolist() if x.startswith('sub_cause_code')]
event_id = [x for x in df.columns.tolist() if x.startswith('event_id')]
network_type = [x for x in df.columns.tolist() if x.startswith('network_type')]
phone_type = [x for x in df.columns.tolist() if x.startswith('phone_type')]
Manufacturer = [x for x in df.columns.tolist() if x.startswith('Manufacturer')]
Model = [x for x in df.columns.tolist() if x.startswith('Model')]
os = [x for x in df.columns.tolist() if x.startswith('os')]
category = [x for x in df.columns.tolist() if x.startswith('category')]
nfc = [x for x in df.columns.tolist() if x.startswith('nfc')]
sim_type = [x for x in df.columns.tolist() if x.startswith('sim_type')]
screen_size = [x for x in df.columns.tolist() if x.startswith('screen_size')]
timedelta = [x for x in df.columns.tolist() if x.startswith('timedelta')]
weekday = [x for x in df.columns.tolist() if x.startswith('weekday')]
hour = [x for x in df.columns.tolist() if x.startswith('hour')]
history = [x for x in df.columns.tolist() if x.startswith('history')]
In [ ]:
def join_lists(list_of_lists):
    list3 = []
    if type(list_of_lists) is list:
        for element in list_of_lists:
            list3.extend(element)
    else:
        list3.extend(list_of_lists)
    return list(set(list3))
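In [ ]:
# Quick usage check (illustrative, not part of the original flow): join_lists flattens a list
# of lists and removes duplicates; because it goes through set(), the output order is arbitrary
join_lists([['a', 'b'], ['b', 'c']])
# -> ['a', 'b', 'c'] in some order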
In [ ]:
# determining which variables will be summed and which ones will be averaged
identical_info = join_lists([identical, weekday])
event_info = join_lists([event_result, cause_code, sub_cause_code, event_id])
device_info = join_lists([network_type, phone_type, Manufacturer, Model, os, category, nfc, sim_type, screen_size])
time_info = join_lists([timedelta, weekday, hour])
summable = join_lists([event_info, timedelta, hour])
meanable = join_lists([event_info, device_info, timedelta, hour])
summable.append('id')
meanable.append('id')
In [ ]:
# aggregating the data
summed = df[summable].groupby('id').sum()
summed.columns = ['sum_' + x for x in summed.columns]
meaned = df[meanable].groupby('id').mean()
meaned.columns = ['mean_' + x for x in meaned.columns]
df_daily = pd.concat([df[identical_info].groupby('id').mean(), summed, meaned], axis=1)
In [ ]:
# Saving the data for later use:
# df_daily.to_csv('df_competition_historical_data.txt', sep = '\t')
In [ ]:
df_test_agg = df_daily.loc[test_indexes,:]
df_train_agg = df_daily.loc[~df_daily.index.isin(df_test_agg.index)]
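In [ ]:
# Assumption (this cell is not in the original notebook): the next cell references
# X_train_subsample and y_train_subsample, which are never defined above. A plausible
# sketch is a random subsample of the non-aggregated training rows; the sample size and
# random_state here are placeholders
train_subsample = df_train.sample(n=min(len(df_train), 500000), random_state=0)
y_train_subsample = train_subsample['probs']
X_train_subsample = train_subsample.drop(columns=non_cats)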
In [ ]:
# Feature selection with the ANOVA F-test
from sklearn.feature_selection import f_classif
f_values, f_p_values = f_classif(np.array(X_train_subsample),np.array(y_train_subsample))
selected_features = X_train_subsample.columns[f_values > 5000]
In [ ]:
y_train = df_train['probs']
y_test = df_test['probs']
id_train = df_train['id']
id_test = df_test['id']
X_train = df_train[selected_features]
X_test = df_test[selected_features]
In [ ]:
# a simple random forest classifier for this large dataset:
model = RandomForestClassifier(n_estimators = 100, max_depth = 3, n_jobs = 16, verbose = 10)
model.fit(X_train, y_train)
check = model.predict_proba(X_test)
train = model.predict_proba(X_train)
In [ ]:
# aggregating the predictions
result_train = pd.DataFrame(data=[])
result_train['id']= id_train
result_train['pred'] = train[:,1]
train_mean = result_train.groupby('id').mean()
train_mean.columns = ['pred_mean']
train_max = result_train.groupby('id').max()
train_max.columns = ['pred_max']
# ...and for the testing set
result_test = pd.DataFrame(data=[])
result_test['id']= id_test
result_test['pred'] = check[:,1]
test_mean = result_test.groupby('id').mean()
test_mean.columns = ['pred_mean']
test_max = result_test.groupby('id').max()
test_max.columns = ['pred_max']
In [ ]:
# adding single event predictions to the training data:
data = df_train_agg.merge(train_mean, on='id')
data = data.merge(train_max, on='id')
data.columns = [x.replace('[', '').replace(']', '') for x in data.columns]
# ... and the test data:
test = df_test_agg.merge(test_mean, on='id')
test = test.merge(test_max, on='id')
test.columns = [x.replace('[', '').replace(']', '') for x in test.columns]
In [ ]:
# Re-selecting features for the final single event classifier with the ANOVA F-test
from sklearn.feature_selection import f_classif
f_values, f_p_values = f_classif(np.array(X_train),np.array(y_train))
selected_features = X_train.columns[f_values > 65]
selected_features
In [ ]:
# The variables that will be used for the modeling:
cols = ['sum_sub_cause_code_NO_VALUE',
'sum_sub_cause_code_SIGNALING_INTERFERENCE',
'sum_sub_cause_code_SUBSCRIPTION_CANCELLED',
'sum_cause_code_UNKNOWN_CAUSE_CODE_PROT_TYPE_0',
'sum_sub_cause_code_SGSN_CONTEXT_REQUEST_RECEIVED',
'sum_event_id_13',
'sum_event_id_2',
'sum_event_result_IGNORE',
'sum_event_id_15',
'sum_sub_cause_code_GGSN_RESPONDED_WITH_REJECT_CAUSE_CONTEXT_NON_EXISTENT',
'sum_sub_cause_code_QOS_CHANGED_BY_GGSN_DURING_RAU_OR_SGSN_OR_BSS_MODIFICATION_PROCEDURE',
'sum_cause_code_REACTIVATION_REQUIRED',
'sum_sub_cause_code_TIMEOUT_PAGING',
'sum_event_id_8',
'sum_sub_cause_code_NO_RESPONSE_FROM_MS_DURING_SGSN_INITIATED_MODIFICATION',
'mean_sub_cause_code_SIGNALING_INTERFERENCE',
'mean_event_id_4',
'mean_event_id_6',
'mean_event_id_2',
'mean_timedelta_0, 1)',
'mean_event_id_5',
'mean_event_id_8',
'mean_cause_code_UNKNOWN_CAUSE_CODE_PROT_TYPE_0',
'mean_event_id_13',
'mean_sub_cause_code_QOS_CHANGED_BY_GGSN_DURING_RAU_OR_SGSN_OR_BSS_MODIFICATION_PROCEDURE',
'mean_event_id_12',
'mean_cause_code_REACTIVATION_REQUIRED',
'mean_event_result_ABORT',
'mean_sub_cause_code_NO_VALUE',
'mean_timedelta_60, 300)',
'mean_sub_cause_code_SGSN_CONTEXT_REQUEST_RECEIVED',
'mean_sub_cause_code_SUCCESS',
'mean_cause_code_NOCAUSECODE',
'mean_sub_cause_code_DETACH_TRIGGERED_PDN_DISCONNECTION',
'mean_event_result_IGNORE',
'mean_sub_cause_code_NO_RESPONSE_FROM_MS_DURING_SGSN_INITIATED_MODIFICATION',
'pred_mean',
'pred_max',
'sum_hour_22',
'sum_hour_14',
'mean_hour_13',
'weekday_Friday',
'mean_hour_21',
'mean_hour_16',
'weekday_Saturday',
'mean_hour_23']
In [ ]:
y_train = data['probs']
y_test = test['probs']
X_train = data[cols]
X_test = test[cols]
In [ ]:
# Oversampling the minority 'probs' = True class for training
sm = SMOTE(ratio={True: int(4 * y_train.sum())}, k_neighbors=5, random_state=np.random.randint(567))
X_train, y_train = sm.fit_sample(X_train, y_train)
X_train = pd.DataFrame(data=X_train, columns=cols)
In [ ]:
# tuning the hyperparameters
parameter_ranges = {'max_depth': [3,5,10],
'learning_rate': [0.01, 0.05, 0.1],
'n_estimators': [50, 100, 500]}
model = xgb.XGBClassifier()
model_tuned = GridSearchCV(model, parameter_ranges, scoring = 'roc_auc', verbose = 10, n_jobs = 16)
model_tuned.fit(X_train, y_train)
model_xgb = model_tuned.best_estimator_
print(model_tuned.best_score_)
print(model_tuned.best_params_)
In [ ]:
# tuning the hyperparameters
parameter_ranges = {'max_depth': [10,15,20],
'n_estimators': [100, 500, 1000]}
model = RandomForestClassifier()
model_tuned = GridSearchCV(model, parameter_ranges, scoring = 'roc_auc', verbose = 10, n_jobs = 16)
model_tuned.fit(X_train, y_train)
model_rf = model_tuned.best_estimator_
print(model_tuned.best_score_)
print(model_tuned.best_params_)
In [ ]:
# Making the predictions for the test set:
predictions_xgb = model_xgb.predict_proba(X_test)
predictions_rf = model_rf.predict_proba(X_test)
In [ ]:
# Taking the max probability for class 'probs' = True
predictions_final = [np.max([x,y]) for x,y in zip(predictions_xgb[:,1], predictions_rf[:,1])]
predictions_final_binary = [1 if x >= 0.52 else 0 for x in predictions_final]
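In [ ]:
# Sketch (not in the original notebook): the 0.52 cut-off above is fixed by hand. One way to
# pick a threshold is from the precision-recall trade-off on held-out data; precision_recall_curve
# is a standard sklearn.metrics function but is not used elsewhere in this notebook
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_test, predictions_final)
# e.g. the smallest threshold that still reaches 0.5 precision (if any)
candidates = thresholds[precisions[:-1] >= 0.5]
print(candidates.min() if len(candidates) else 'no threshold reaches 0.5 precision')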
In [ ]:
#area under the precision-recall curve
score = average_precision_score(y_test, predictions_final)
print('area under the precision-recall curve: {:.6f}'.format(score))
check2 = predictions_final_binary
score = precision_score(y_test, check2)
print('precision score: {:.6f}'.format(score))
score = recall_score(y_test, check2)
print('recall score: {:.6f}'.format(score))
score = accuracy_score(y_test, check2)
print('accuracy score: {:.6f}'.format(score))
fpr, tpr, _ = roc_curve(y_test, predictions_final)
roc_auc = auc(fpr, tpr)
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([-0.02, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
plt.legend(loc="lower right")
plt.show()
In [ ]:
# Training the single event classifier:
X_full = df[selected_features]
y_full = df['probs']
model = model.fit(X_full, y_full)
In [ ]:
# Training the aggregated data classifiers on all ids (the aggregated train and test frames
# both already include the single event predictions and the cleaned column names)
full_agg = pd.concat([data, test])
X_full = full_agg[cols]
y_full = full_agg['probs']
model_xgb = model_xgb.fit(X_full, y_full)
model_rf = model_rf.fit(X_full, y_full)
In [ ]:
# Saving the models for later use:
joblib.dump(model, './final_models/nonagg_full_model.pkl')
joblib.dump(model_xgb, './final_models/model_xgb.pkl')
joblib.dump(model_rf, './final_models/model_rf.pkl')
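In [ ]:
# Optional sketch (not in the original notebook): saving the feature lists alongside the models
# makes it easier to reproduce the exact column sets and ordering when scoring new data, instead
# of re-typing the column names in the scoring session; file names here are placeholders
joblib.dump(list(selected_features), './final_models/selected_features.pkl')
joblib.dump(cols, './final_models/agg_feature_cols.pkl')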
In [61]:
competition = pd.read_parquet('./competition_data.parquet', engine='pyarrow')
In [26]:
# This file was created by running the same preprocessing and aggregation code that was used
# for the training data (the steps up through the "aggregating the data" cell), with the 'probs' column removed
competition_agg = pd.read_csv('./df_competition_historical_data.txt', sep = '\t')
In [27]:
# Loading in the saved models:
model = joblib.load('./final_models/nonagg_full_model.pkl')
model_xgb = joblib.load('./final_models/model_xgb.pkl')
model_rf = joblib.load('./final_models/model_rf.pkl')
In [28]:
nident = set(competition.ident.unique())
norms_coding = {}
i = 0
for ident in list(nident):
    norms_coding[ident] = i
    i += 1
competition['id'] = competition.ident.apply(lambda x: norms_coding[x])
In [29]:
competition.event_id = competition.event_id.astype('str')
In [30]:
competition['index'] = competition.start.copy()
competition.start = competition.start.astype('M8[s]')
In [31]:
def group_timedelta(group):
    group.sort_values('index', inplace=True)
    group['timedelta'] = group.start - group.start.shift()
    group.timedelta.fillna(0, inplace=True)
    return group

def make_timedelta(df):
    df = df.groupby('id', as_index=False).apply(group_timedelta)
    df = df.reset_index().iloc[:, 2:]
    df.timedelta = df.timedelta.dt.seconds
    return df
competition = make_timedelta(competition)
In [32]:
competition.timedelta = pd.cut(competition.timedelta, [0,1,5,60,300,1800,3600,3600*5,3600*24], right=False)
competition.timedelta = competition.timedelta.astype(str)
competition['weekday'] = competition.start.dt.weekday_name
competition['hour'] = competition.start.dt.hour.astype(str)
In [33]:
phone_models = pd.read_msgpack('./phone_models.msgpack')
device_info = phone_models.loc[:,phone_models.columns.drop(['phone_id', 'allocation_date', 'storage_code', 'handsetmodel_id', 'phone_cnt', 'g3', 'g4', 'android'])]
device_info.rename(columns={'seadme_tyyp': 'network_type'}, inplace=True)
competition = competition.merge(device_info, on='TAC')
competition.drop(labels=['TAC'], axis=1, inplace=True)
competition.nfc = competition.nfc.astype(str).apply(lambda x: None if x == 'nan' else x)
In [34]:
cats = ['event_result', 'cause_code', 'sub_cause_code', 'mecontext', 'event_id', 'network_type', 'phone_type', 'Manufacturer', 'Model', 'os', 'category', 'nfc', 'sim_type', 'screen_size', 'timedelta', 'weekday', 'hour']
non_cats = ['index', 'id', 'ident', 'start']
def encode_columns(df):
return pd.concat([df[non_cats], pd.get_dummies(df[cats], cats, dummy_na = True)], axis=1)
competition = encode_columns(competition)
In [37]:
# this phone model apparently does not appear in the competition dataset, so a column of zeros
# is added to match the feature columns the single event model was trained on
competition['Model_GQ3030'] = 0
In [41]:
data = competition[['id', 'cause_code_EPS_SERVICES_NOT_ALLOWED_IN_THIS_PLMN',
'cause_code_NOCAUSECODE', 'cause_code_REACTIVATION_REQUIRED',
'cause_code_SUCCESSFUL_HANDOVER',
'cause_code_UNKNOWN_CAUSE_CODE_PROT_TYPE_0',
'sub_cause_code_ALL_PACKET_ORIENTED_SERVICES_BARRED',
'sub_cause_code_NO_VALUE',
'sub_cause_code_QOS_CHANGED_BY_GGSN_DURING_RAU_OR_SGSN_OR_BSS_MODIFICATION_PROCEDURE',
'sub_cause_code_SUCCESS', 'event_id_0', 'event_id_1', 'event_id_13',
'event_id_15', 'event_id_2', 'event_id_4', 'event_id_5', 'event_id_6',
'event_id_7', 'event_id_8', 'network_type_3G', 'network_type_4G',
'phone_type_Nutitelefon', 'phone_type_[No Data]', 'Manufacturer_Apple',
'Manufacturer_Samsung', 'Manufacturer_Teised', 'Manufacturer_ZTE',
'Model_Alcatel Onetouch Pop 8S', 'Model_Apple iPhone 7 (A1778)',
'Model_Apple iPhone X 64GB', 'Model_BV7000', 'Model_CAT S41',
'Model_CAT S60', 'Model_Coolpad Modena', 'Model_E303', 'Model_GQ3030',
'Model_HUAWEI Y6 Pro', 'Model_Huawei P9 Lite', 'Model_LG G4 Stylus',
'Model_Lenovo A6000', 'Model_Lenovo Z90-7', 'Model_Maya-L41',
'Model_MediaPad S7-303u', 'Model_Nokia LUMIA 635',
'Model_Original One, Original Pure, Original Shock, Kindo',
'Model_PM-1023-BV', 'Model_SM-G800F Galaxy S5 mini',
'Model_Samsung Galaxy Alpha 32GB', 'Model_Sony Xperia XA',
'Model_ZTE Blade V6', 'Model_iPhone 6 (A1586)', 'Model_m3 note',
'os_Android', 'os_Windows', 'os_iOS', 'nfc_0.0', 'nfc_1.0',
'sim_type_mini', 'sim_type_nan', 'screen_size_4.7', 'screen_size_5.0',
'screen_size_5.0000', 'screen_size_5.2000', 'screen_size_7.0',
'screen_size_nan']]
In [45]:
predictions = model.predict_proba(data.drop(['id'], axis = 1))
In [46]:
# aggregating the single event predictions
result = pd.DataFrame(data=[])
result['id']= data['id']
result['pred'] = predictions[:,1]
result_mean = result.groupby('id').mean()
result_mean.columns = ['pred_mean']
result_max = result.groupby('id').max()
result_max.columns = ['pred_max']
In [55]:
# creating the id column so I can easily merge
result_mean['id'] = result_mean.index
result_max['id'] = result_max.index
In [59]:
data = competition_agg.merge(result_mean, on='id')
data = data.merge(result_max, on='id')
data.columns = [x.replace('[', '').replace(']', '') for x in data.columns]
In [75]:
# selecting the variables to be used with the aggregated data models
data = data[['sum_sub_cause_code_NO_VALUE',
'sum_sub_cause_code_SIGNALING_INTERFERENCE',
'sum_sub_cause_code_SUBSCRIPTION_CANCELLED',
'sum_cause_code_UNKNOWN_CAUSE_CODE_PROT_TYPE_0',
'sum_sub_cause_code_SGSN_CONTEXT_REQUEST_RECEIVED',
'sum_event_id_13',
'sum_event_id_2',
'sum_event_result_IGNORE',
'sum_event_id_15',
'sum_sub_cause_code_GGSN_RESPONDED_WITH_REJECT_CAUSE_CONTEXT_NON_EXISTENT',
'sum_sub_cause_code_QOS_CHANGED_BY_GGSN_DURING_RAU_OR_SGSN_OR_BSS_MODIFICATION_PROCEDURE',
'sum_cause_code_REACTIVATION_REQUIRED',
'sum_sub_cause_code_TIMEOUT_PAGING',
'sum_event_id_8',
'sum_sub_cause_code_NO_RESPONSE_FROM_MS_DURING_SGSN_INITIATED_MODIFICATION',
'mean_sub_cause_code_SIGNALING_INTERFERENCE',
'mean_event_id_4',
'mean_event_id_6',
'mean_event_id_2',
'mean_timedelta_0, 1)',
'mean_event_id_5',
'mean_event_id_8',
'mean_cause_code_UNKNOWN_CAUSE_CODE_PROT_TYPE_0',
'mean_event_id_13',
'mean_sub_cause_code_QOS_CHANGED_BY_GGSN_DURING_RAU_OR_SGSN_OR_BSS_MODIFICATION_PROCEDURE',
'mean_event_id_12',
'mean_cause_code_REACTIVATION_REQUIRED',
'mean_event_result_ABORT',
'mean_sub_cause_code_NO_VALUE',
'mean_timedelta_60, 300)',
'mean_sub_cause_code_SGSN_CONTEXT_REQUEST_RECEIVED',
'mean_sub_cause_code_SUCCESS',
'mean_cause_code_NOCAUSECODE',
'mean_sub_cause_code_DETACH_TRIGGERED_PDN_DISCONNECTION',
'mean_event_result_IGNORE',
'mean_sub_cause_code_NO_RESPONSE_FROM_MS_DURING_SGSN_INITIATED_MODIFICATION',
'pred_mean',
'pred_max',
'sum_hour_22',
'sum_hour_14',
'mean_hour_13',
'weekday_Friday',
'mean_hour_21',
'mean_hour_16',
'weekday_Saturday',
'mean_hour_23']]
In [77]:
# predicting probabilities:
predictions_xgb = model_xgb.predict_proba(data)
predictions_rf = model_rf.predict_proba(data)
In [79]:
# creating lists of probabilities and final predictions
predictions_final = [np.max([x,y]) for x,y in zip(predictions_xgb[:,1], predictions_rf[:,1])]
predictions_final_binary = [True if x >= 0.52 else False for x in predictions_final]
In [82]:
# joining the ids and predictions into a single dataframe; the numeric id codes are mapped back
# to the original 'ident' values so that the merge with the competition data below works
id_to_ident = {v: k for k, v in norms_coding.items()}
predictions = pd.DataFrame(data=[])
predictions['ident'] = competition_agg['id'].map(id_to_ident)
predictions['probs'] = predictions_final_binary
predictions['probs_probability'] = predictions_final
In [85]:
# checking whether the share of predicted problem cases is roughly the same as in the training dataset (about 10%)
predictions['probs'].mean()
Out[85]:
In [86]:
# adding the predictions to the competition dataset
competition_results = competition.merge(predictions, on = 'ident')
In [88]:
competition_results['probs'].mean()
Out[88]:
In [89]:
competition_results['probs_probability'].mean()
Out[89]:
In [95]:
# saving the results
competition_results.to_parquet('./competition_results.parquet')
predictions.to_csv('./results.csv', sep=',')
In [ ]: