Project repository:
Author:
CNAM tutors:
In [1]:
import sys #only needed to determine Python version number
import os
import glob
import re
In [279]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
SERIALIZATION IMPORTS
In [3]:
import pickle as pk
In [4]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score, accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
#http://scikit-learn.org/0.19/modules/generated/sklearn.metrics.precision_recall_fscore_support.html#sklearn.metrics.precision_recall_fscore_support
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import precision_recall_curve, average_precision_score
In [5]:
import matplotlib  # only needed to determine Matplotlib version number
from matplotlib import pyplot as plt
# Enable inline plotting
%matplotlib inline
In [6]:
#https://stackoverflow.com/questions/11707586/python-pandas-how-to-widen-output-display-to-see-more-columns
#pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
In [280]:
print('Python version ' + sys.version)
print('Pandas version ' + pd.__version__)
print('Matplotlib version ' + matplotlib.__version__)
print('Sklearn version ' + sk.__version__)
In [8]:
print(os.name)
In [9]:
windows_path = 'C:\\PFD\\Data\\git\\_PRV\\CNAM-UASB03\\donnees'
linux_path= '/home/user/Bureau/CNAM/git/CNAM-UASB03/donnees'
In [10]:
if os.name == 'nt':
    os.chdir(windows_path)
    intermediaires_path = os.path.join(windows_path, 'intermediaires\\')
else:
    os.chdir(linux_path)
    intermediaires_path = os.path.join(linux_path, 'intermediaires/')
In [11]:
print(intermediaires_path)
In [12]:
import warnings
warnings.filterwarnings('ignore')
In [13]:
from jupyter_core.paths import jupyter_data_dir
print(jupyter_data_dir())
In [14]:
%%time
#CPU times: user 5.41 s, sys: 500 ms, total: 5.91 s
#Wall time: 7.13 s
#index_col=0,
meteo_cata_DF = pd.read_csv(
    os.path.join(intermediaires_path, "meteo_stations_catastrophes_alt.csv"),
    encoding='UTF-8', sep=',', skip_blank_lines=True, header=0,
    skipinitialspace=True, parse_dates=[1], infer_datetime_format=True,
    dayfirst=False)
In [15]:
meteo_cata_DF.info()
In [16]:
meteo_cata_DF.date.head()
Out[16]:
In [17]:
meteo_cata_DF.head()
Out[17]:
In [18]:
meteo_cata_DF.iscatastrophe.value_counts().plot(kind='bar')
Out[18]:
In [19]:
meteo_cata_ko_DF = meteo_cata_DF[meteo_cata_DF.iscatastrophe == False]
meteo_cata_ok_DF = meteo_cata_DF[meteo_cata_DF.iscatastrophe == True]
row_nb_pos, _ = meteo_cata_ok_DF.shape
row_nb_neg, _ = meteo_cata_ko_DF.shape
print("neg:", row_nb_neg, row_nb_neg / (row_nb_pos + row_nb_neg) * 100)
print("pos:", row_nb_pos, row_nb_pos / (row_nb_pos + row_nb_neg) * 100)
Choice: rebalance the classes by upsampling the minority (catastrophe) class to the size of the majority class.
In [20]:
meteo_cata_dwn_pos_DF = resample(meteo_cata_ok_DF,
                                 replace=True,           # sample with replacement
                                 n_samples=row_nb_neg,   # to match majority class
                                 random_state=123)       # reproducible results
In [21]:
meteo_cata_eq_DF=pd.concat([meteo_cata_ko_DF,meteo_cata_dwn_pos_DF])
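An alternative rebalancing, sketched below as an unexecuted cell (an assumption, not part of the original run): downsample the majority (non-catastrophe) class to the positive count, which avoids duplicated rows at the cost of discarding data.
In [ ]:
# Hypothetical alternative (not executed here): downsample the majority
# class instead of upsampling the minority with replacement.
meteo_cata_dwn_neg_DF = resample(meteo_cata_ko_DF,
                                 replace=False,          # no duplicates
                                 n_samples=row_nb_pos,   # match minority count
                                 random_state=123)
meteo_cata_eq_alt_DF = pd.concat([meteo_cata_dwn_neg_DF, meteo_cata_ok_DF])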
In [22]:
meteo_cata_eq_DF.info()
In [23]:
meteo_cata_eq_DF.iscatastrophe.value_counts()
Out[23]:
In [24]:
Y = meteo_cata_eq_DF.iscatastrophe
X = meteo_cata_eq_DF.drop(['iscatastrophe', 'date', 'numer_sta'], axis=1)
In [25]:
Y.isna().sum()
Out[25]:
In [26]:
X.isna().sum()
Out[26]:
In [258]:
# backfill the 8 missing values found above
X.fillna(method="bfill", inplace=True)
In [28]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2, random_state=42)
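A variant worth noting (an assumption, not used in the original split): passing stratify=Y keeps the class proportions identical in the train and test sets.
In [ ]:
# Hypothetical stratified split (assumption): preserves the class ratio
# of Y in both subsets.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y)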
In [29]:
def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
In [30]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
In [31]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
    plt.xlabel("Threshold", fontsize=16)
    plt.legend(loc="upper left", fontsize=16)
    plt.ylim([0, 1])
In [32]:
def plot_precision_vs_recall(precisions, recalls):
    plt.plot(recalls, precisions, "b-", linewidth=2)
    plt.xlabel("Recall", fontsize=16)
    plt.ylabel("Precision", fontsize=16)
    plt.axis([0, 1, 0, 1])
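Since the same four evaluation steps (confusion matrix, ROC AUC, ROC plot, precision/recall/F-score) are repeated below for every classifier, here is a hypothetical helper that bundles them; a sketch, not part of the original notebook.
In [ ]:
def evaluate_model(y_true, y_pred):
    # Hypothetical convenience wrapper (assumption) around the metrics
    # already imported above; mirrors the per-model cells below.
    print(confusion_matrix(y_true, y_pred))
    print("ROC AUC:", roc_auc_score(y_true, y_pred))
    print(precision_recall_fscore_support(y_true, y_pred, average='binary'))
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    plot_roc_curve(fpr, tpr)
    plt.show()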
In [33]:
#precision_recall_curve(modele_ada,probas_pred=1)
In [34]:
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
In [35]:
# random forests
modele_rf = RandomForestClassifier(n_estimators=100, random_state=42)
# boosting ensemble methods
modele_ada = AdaBoostClassifier(n_estimators=100, random_state=42)
modele_gbm = GradientBoostingClassifier(n_estimators=100, random_state=42)
In [36]:
%%time
# with upsampling
#CPU times: user 37min 9s, sys: 6.21 s, total: 37min 15s
#Wall time: 37min 16s
modele_rf.fit(X_train, Y_train)
Out[36]:
In [37]:
%%time
modele_ada.fit(X_train, Y_train)
Out[37]:
In [38]:
%%time
modele_gbm.fit(X_train, Y_train)
Out[38]:
In [39]:
modele_rf_pred=modele_rf.predict(X_test)
In [40]:
modele_ada_pred=modele_ada.predict(X_test)
In [41]:
modele_gbm_pred=modele_gbm.predict(X_test)
In [42]:
modele_pred = modele_rf_pred
modele = modele_rf
In [43]:
confusion_matrix(Y_test,modele_pred)
Out[43]:
In [44]:
roc_auc_score(Y_test, modele_pred)
Out[44]:
In [45]:
fpr, tpr, thresholds = roc_curve(Y_test, modele_pred)
plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
plt.show()
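Note that roc_curve is fed hard 0/1 predictions here, which yields a three-point curve. A smoother, more informative curve could be obtained from class probabilities, as in this sketch (an assumption, not run in the original):
In [ ]:
# Hypothetical variant (assumption): score with predicted probabilities
# of the positive class rather than hard labels.
probas_rf = modele_rf.predict_proba(X_test)[:, 1]
fpr_p, tpr_p, thresholds_p = roc_curve(Y_test, probas_rf)
plt.figure(figsize=(8, 6))
plot_roc_curve(fpr_p, tpr_p)
plt.show()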
In [46]:
precision_recall_fscore_support(Y_test, modele_pred,average='binary')
Out[46]:
In [47]:
%%time
modele_score_acc=cross_val_score(modele, X, Y, cv=10, scoring="precision",n_jobs=-1)
display_scores(modele_score_acc)
In [48]:
%%time
modele_score_acc=cross_val_score(modele, X, Y, cv=10, scoring="recall",n_jobs=-1)
display_scores(modele_score_acc)
In [49]:
%%time
modele_score_acc=cross_val_score(modele, X, Y, cv=10, scoring="f1",n_jobs=-1)
display_scores(modele_score_acc)
In [50]:
%%time
modele_score_roc=cross_val_score(modele, X, Y, cv=10, scoring="roc_auc",n_jobs=-1)
display_scores(modele_score_roc)
In [51]:
modele_pred = modele_ada_pred
modele=modele_ada
In [52]:
confusion_matrix(Y_test,modele_pred)
Out[52]:
In [53]:
roc_auc_score(Y_test, modele_pred)
Out[53]:
In [54]:
fpr, tpr, thresholds = roc_curve(Y_test, modele_pred)
plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
plt.show()
In [55]:
precision_recall_fscore_support(Y_test, modele_pred,average='binary')
Out[55]:
In [56]:
%%time
modele_score_acc=cross_val_score(modele,X, Y, cv=10, scoring="precision",n_jobs=-1)
display_scores(modele_score_acc)
In [57]:
%%time
modele_score_acc=cross_val_score(modele, X, Y, cv=10, scoring="recall",n_jobs=-1)
display_scores(modele_score_acc)
In [58]:
%%time
modele_score_acc=cross_val_score(modele, X, Y,cv=10, scoring="f1",n_jobs=-1)
display_scores(modele_score_acc)
In [59]:
%%time
modele_score_roc=cross_val_score(modele,X, Y, cv=10, scoring="roc_auc",n_jobs=-1)
display_scores(modele_score_roc)
In [60]:
modele_pred = modele_gbm_pred
modele=modele_gbm
In [61]:
confusion_matrix(Y_test,modele_pred)
Out[61]:
In [62]:
roc_auc_score(Y_test, modele_pred)
Out[62]:
In [63]:
fpr, tpr, thresholds = roc_curve(Y_test, modele_pred)
plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
plt.show()
In [64]:
precision_recall_fscore_support(Y_test, modele_pred,average='binary')
Out[64]:
In [65]:
%%time
modele_score_acc=cross_val_score(modele, X, Y, cv=10, scoring="precision",n_jobs=-1)
display_scores(modele_score_acc)
In [66]:
%%time
modele_score_acc=cross_val_score(modele, X, Y, cv=10, scoring="recall",n_jobs=-1)
display_scores(modele_score_acc)
In [67]:
%%time
modele_score_acc=cross_val_score(modele, X, Y, cv=10, scoring="f1",n_jobs=-1)
display_scores(modele_score_acc)
In [68]:
%%time
modele_score_roc=cross_val_score(modele, X, Y, cv=10, scoring="roc_auc",n_jobs=-1)
display_scores(modele_score_roc)
In [69]:
modele_rf.feature_importances_
Out[69]:
In [70]:
pd.DataFrame(modele_rf.feature_importances_,index=X_train.columns,columns=["Importance"]).head()
Out[70]:
In [71]:
pd.DataFrame(modele_gbm.feature_importances_,index=X_train.columns,columns=["Importance"]).head()
Out[71]:
In [72]:
pd.DataFrame(modele_ada.feature_importances_,index=X_train.columns,columns=["Importance"]).head()
Out[72]:
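The .head() views above show the importances in column order; a sorted bar plot (a sketch, not in the original) makes the ranking easier to read:
In [ ]:
# Hypothetical visualization (assumption): sort and plot the random-forest
# feature importances.
importances = pd.Series(modele_rf.feature_importances_, index=X_train.columns)
importances.sort_values(ascending=False).plot(kind='bar', figsize=(10, 4))
plt.ylabel("Importance")
plt.show()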
In [73]:
from sklearn.naive_bayes import GaussianNB
modele_gnb = GaussianNB()
In [74]:
gmodel = modele_gnb.fit(X_train,Y_train)
modeleNB_pred = gmodel.predict(X_test)
In [75]:
X_test.shape[0]
Out[75]:
In [76]:
Y_test_v=Y_test.values
Y_test_v=Y_test_v.ravel()
Y_test_v
Out[76]:
In [77]:
modeleNB_pred
Out[77]:
In [78]:
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
.format(
X_test.shape[0],
(Y_test_v != modeleNB_pred).sum(),
100*(1-(Y_test_v != modeleNB_pred).sum()/X_test.shape[0])))
In [79]:
modele_pred = modeleNB_pred
modele = modele_gnb
In [80]:
confusion_matrix(Y_test,modele_pred)
Out[80]:
In [81]:
roc_auc_score(Y_test, modele_pred)
Out[81]:
In [82]:
fpr, tpr, thresholds = roc_curve(Y_test, modele_pred)
plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
plt.show()
In [83]:
precision_recall_fscore_support(Y_test, modele_pred,average='binary')
Out[83]:
In [84]:
%%time
modele_score_acc=cross_val_score(modele, X, Y, cv=10, scoring="precision",n_jobs=-1)
display_scores(modele_score_acc)
In [85]:
%%time
modele_score_acc=cross_val_score(modele, X, Y,cv=10, scoring="recall",n_jobs=-1)
display_scores(modele_score_acc)
In [86]:
%%time
modele_score_acc=cross_val_score(modele, X, Y, cv=10, scoring="f1",n_jobs=-1)
display_scores(modele_score_acc)
In [87]:
%%time
modele_score_roc=cross_val_score(modele, X, Y, cv=10, scoring="roc_auc",n_jobs=-1)
display_scores(modele_score_roc)
In [88]:
from sklearn.linear_model import LogisticRegression
In [89]:
# Set regularization parameter
for C in (100, 1, 0.01):
    # turn down tolerance for short training time
    modele_l1_LR = LogisticRegression(C=C, penalty='l1', tol=0.01)
    modele_l2_LR = LogisticRegression(C=C, penalty='l2', tol=0.01)
    modele_l1_LR.fit(X_train, Y_train)
    modele_l2_LR.fit(X_train, Y_train)
    coef_l1_LR = modele_l1_LR.coef_.ravel()
    coef_l2_LR = modele_l2_LR.coef_.ravel()
    # coef_l1_LR contains zeros due to the L1 sparsity-inducing norm
    sparsity_l1_LR = np.mean(coef_l1_LR == 0) * 100
    sparsity_l2_LR = np.mean(coef_l2_LR == 0) * 100
    print("C=%.2f" % C)
    print("Sparsity with L1 penalty: %.2f%%" % sparsity_l1_LR)
    print("score with L1 penalty: %.4f" % modele_l1_LR.score(X_test, Y_test))
    print("Sparsity with L2 penalty: %.2f%%" % sparsity_l2_LR)
    print("score with L2 penalty: %.4f" % modele_l2_LR.score(X_test, Y_test))
In [90]:
modele_l1_LR_pred = modele_l1_LR.predict(X_test)
modele_l2_LR_pred= modele_l2_LR.predict(X_test)
In [91]:
modele_pred = modele_l1_LR_pred
#modele_pred = modele_l2_LR_pred
modele = modele_l1_LR
#modele = modele_l2_LR
In [92]:
confusion_matrix(Y_test,modele_pred)
Out[92]:
In [93]:
roc_auc_score(Y_test, modele_pred)
Out[93]:
In [94]:
fpr, tpr, thresholds = roc_curve(Y_test, modele_pred)
plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
plt.show()
In [95]:
precision_recall_fscore_support(Y_test, modele_pred,average='binary')
Out[95]:
In [96]:
%%time
modele_score_acc=cross_val_score(modele, X, Y, cv=10, scoring="precision",n_jobs=-1)
display_scores(modele_score_acc)
In [97]:
%%time
modele_score_acc=cross_val_score(modele, X, Y, cv=10, scoring="recall",n_jobs=-1)
display_scores(modele_score_acc)
In [98]:
%%time
modele_score_acc=cross_val_score(modele, X, Y, cv=10, scoring="f1",n_jobs=-1)
display_scores(modele_score_acc)
In [99]:
%%time
modele_score_roc=cross_val_score(modele, X, Y, cv=10, scoring="roc_auc",n_jobs=-1)
display_scores(modele_score_roc)
In [100]:
from sklearn.neighbors import KNeighborsClassifier
modele_knn=KNeighborsClassifier()
modele_knn.fit(X_train, Y_train)
modeleKnn_pred=modele_knn.predict(X_test)
In [101]:
modele=modele_knn
modele_pred = modeleKnn_pred
In [102]:
confusion_matrix(Y_test,modele_pred)
Out[102]:
In [103]:
roc_auc_score(Y_test, modele_pred)
Out[103]:
In [104]:
fpr, tpr, thresholds = roc_curve(Y_test, modele_pred)
plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
plt.show()
In [105]:
precision_recall_fscore_support(Y_test, modele_pred,average='binary')
Out[105]:
In [106]:
%%time
modele_score_acc=cross_val_score(modele, X, Y, cv=10, scoring="precision",n_jobs=-1)
display_scores(modele_score_acc)
In [107]:
%%time
modele_score_acc=cross_val_score(modele, X, Y, cv=10, scoring="recall",n_jobs=-1)
display_scores(modele_score_acc)
In [108]:
%%time
modele_score_acc=cross_val_score(modele, X, Y, cv=10, scoring="f1",n_jobs=-1)
display_scores(modele_score_acc)
In [109]:
%%time
modele_score_roc=cross_val_score(modele, X, Y, cv=10, scoring="roc_auc",n_jobs=-1)
display_scores(modele_score_roc)
In [140]:
# classification algorithm
from sklearn.svm import SVC
In [141]:
modele_SVM = SVC()
In [142]:
%%time
#CPU times: user 20h 57min 44s, sys: 12.1 s, total: 20h 57min 56s
#Wall time: 21h 35min 25s
modele_SVM.fit(X_train, Y_train)
Out[142]:
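Given the ~21 h wall time recorded above, a linear SVM is a plausible faster alternative to try (an assumption, not part of the original run):
In [ ]:
# Hypothetical speed-up (assumption): LinearSVC scales far better with
# sample count than the kernelized SVC; accuracy would need re-checking.
from sklearn.svm import LinearSVC
modele_SVM_lin = LinearSVC(random_state=42)
modele_SVM_lin.fit(X_train, Y_train)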
In [146]:
with open("SVM_model.pk", 'wb') as pickle_file:
pk.dump(all, pickle_file)
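The natural counterpart (a sketch, not in the original) is reloading the serialized model in a later session:
In [ ]:
# Hypothetical reload (assumption): restore the pickled SVM without
# refitting.
with open("SVM_model.pk", 'rb') as pickle_file:
    modele_SVM_loaded = pk.load(pickle_file)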
In [147]:
modele_SVM_test=modele_SVM.predict(X_test)
In [148]:
modele_pred = modele_SVM_test
modele = modele_SVM
In [149]:
confusion_matrix(Y_test,modele_pred)
Out[149]:
In [150]:
roc_auc_score(Y_test, modele_pred)
Out[150]:
In [151]:
fpr, tpr, thresholds = roc_curve(Y_test, modele_pred)
plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
plt.show()
In [152]:
precision_recall_fscore_support(Y_test, modele_pred,average='binary')
Out[152]:
In [153]:
%%time
modele_score_acc=cross_val_score(modele, X_train, Y_train, cv=3, scoring="precision",n_jobs=-1)
display_scores(modele_score_acc)
In [154]:
%%time
modele_score_acc=cross_val_score(modele, X, Y, cv=3, scoring="recall",n_jobs=-1)
display_scores(modele_score_acc)
In [155]:
%%time
modele_score_acc=cross_val_score(modele, X, Y, cv=3, scoring="f1",n_jobs=-1)
display_scores(modele_score_acc)
In [156]:
%%time
modele_score_roc=cross_val_score(modele, X, Y, cv=3, scoring="roc_auc",n_jobs=-1)
display_scores(modele_score_roc)
In [157]:
# percentage of correctly classified samples
# caution: accuracy can be misleading
accuracy_score(Y_test,modele_SVM_test)
Out[157]:
In [158]:
precision_score(Y_test,modele_SVM_test)
Out[158]:
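Since the comment above flags accuracy as potentially misleading, a quick per-class check (a sketch using the recall_score already imported) can complement it:
In [ ]:
# Hypothetical per-class view (assumption): recall for each class,
# using the boolean labels as pos_label.
print("Recall (True):", recall_score(Y_test, modele_SVM_test, pos_label=True))
print("Recall (False):", recall_score(Y_test, modele_SVM_test, pos_label=False))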
In [232]:
from sklearn.model_selection import GridSearchCV
In [285]:
parameters = {
    'n_estimators': [1, 500],
    'criterion': ['gini', 'entropy'],
}
In [234]:
# number of CV folds
nb_cv=5
In [286]:
modele_RF_GSCV = GridSearchCV(modele_rf, parameters,cv=nb_cv,n_jobs=-1)
In [287]:
%%time
modele_RF_GSCV.fit(X_train,Y_train)
Out[287]:
In [288]:
modele_RF_GSCV.best_estimator_
Out[288]:
In [289]:
modele_RF_GSCV.score(X_test,Y_test)
Out[289]:
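Beyond best_estimator_, the full grid can be inspected (a sketch, not in the original) via cv_results_:
In [ ]:
# Hypothetical inspection (assumption): all grid combinations ranked by
# mean cross-validated test score.
pd.DataFrame(modele_RF_GSCV.cv_results_)[
    ['params', 'mean_test_score', 'std_test_score']
].sort_values('mean_test_score', ascending=False)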
In [290]:
modele_RF_GSCV_pred=modele_RF_GSCV.predict(X_test)
In [291]:
modele_pred = modele_RF_GSCV_pred
modele=modele_RF_GSCV
In [292]:
confusion_matrix(Y_test,modele_pred)
Out[292]:
In [293]:
roc_auc_score(Y_test, modele_pred)
Out[293]:
In [294]:
fpr, tpr, thresholds = roc_curve(Y_test, modele_pred)
plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
plt.show()
In [295]:
precision_recall_fscore_support(Y_test, modele_pred,average='binary')
Out[295]:
In [296]:
%%time
#CPU times: user 1.6 s, sys: 543 ms, total: 2.14 s
#Wall time: 30min 30s
modele_score_acc=cross_val_score(modele, X, Y, cv=nb_cv, scoring="precision",n_jobs=-1)
display_scores(modele_score_acc)
In [271]:
%%time
modele_score_acc=cross_val_score(modele, X, Y, cv=nb_cv, scoring="recall",n_jobs=-1)
display_scores(modele_score_acc)
In [272]:
%%time
modele_score_acc=cross_val_score(modele, X, Y, cv=nb_cv, scoring="f1",n_jobs=-1)
display_scores(modele_score_acc)
In [273]:
%%time
modele_score_roc=cross_val_score(modele, X, Y, cv=nb_cv, scoring="roc_auc",n_jobs=-1)
display_scores(modele_score_roc)