In [204]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn import metrics
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression ,Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import ExtraTreesClassifier , GradientBoostingClassifier , RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold , StratifiedShuffleSplit , cross_val_score
from sklearn.naive_bayes import MultinomialNB , BernoulliNB
from sklearn.svm import LinearSVC , SVC
from sklearn.neighbors import KNeighborsClassifier
def getScoreClf(DF, Y, clf):
    """Cross-validate a clone of ``clf`` and return the per-fold ROC-AUC scores.

    Parameters
    ----------
    DF : feature matrix handed to ``cross_val_score``.
    Y : target vector handed to ``cross_val_score``.
    clf : estimator; it is cloned first, so the caller's object is not fitted here.

    Returns
    -------
    Whatever ``cross_val_score`` returns for ``scoring='roc_auc'`` with ``cv=2``
    (callers in this notebook take ``.mean()`` of it).
    """
    estimator = clone(clf)
    scores = cross_val_score(estimator, DF, Y, scoring='roc_auc', cv=2, n_jobs=-1)
    return scores
def stackingBuildProbaMatrix(Clfs, Y):
    """Build a stacking table: one probability column per model plus the target.

    Parameters
    ----------
    Clfs : iterable of indexable entries ``(name, fitted_model, features)``;
        ``fitted_model.predict_proba(features)`` is called for each entry.
    Y : values stored in the final ``'Action'`` column.

    Returns
    -------
    pandas.DataFrame with one column per entry (named ``name``, holding the
    second column of ``predict_proba``) followed by ``'Action'``.
    """
    frame = pd.DataFrame()
    for entry in Clfs:
        label, model, features = entry[0], entry[1], entry[2]
        # Keep only column 1 of the probability matrix.
        frame[label] = model.predict_proba(features)[:, 1]
    frame['Action'] = Y
    return frame
def stackingBuildProbaMatrixSend(Clfs):
    """Build a stacking table of model probabilities without a target column.

    Parameters
    ----------
    Clfs : iterable of indexable entries ``(name, fitted_model, features)``;
        ``fitted_model.predict_proba(features)`` is called for each entry.

    Returns
    -------
    pandas.DataFrame with one column per entry, named ``name`` and holding the
    second column of ``predict_proba``.
    """
    frame = pd.DataFrame()
    for entry in Clfs:
        label, model, features = entry[0], entry[1], entry[2]
        # Keep only column 1 of the probability matrix.
        frame[label] = model.predict_proba(features)[:, 1]
    return frame
def getChanges(df, Y, column, clf, seuil):
    """Return a copy of ``df`` in which rare values of ``column`` are merged.

    Every value of ``column`` that occurs at most ``seuil`` times is replaced
    by the least frequent value of that column (the first one in
    ``value_counts()`` order when several share the minimum count).

    Parameters
    ----------
    df : pandas.DataFrame; it is not modified.
    Y : unused, kept so existing callers keep working.
    column : name of the column to recode.
    clf : unused, kept so existing callers keep working.
    seuil : count threshold ("seuil" is French for threshold); values whose
        count is ``<= seuil`` are replaced.

    Returns
    -------
    pandas.DataFrame: the recoded copy. The previous version returned the
    untouched input ``df``, so the recoding was silently thrown away.
    """
    DF = df.copy()
    counts = DF[column].value_counts()
    if counts.empty:
        # Nothing to recode (empty frame or an all-NaN column).
        return DF
    Replace = counts.idxmin()
    rare_values = counts.index[counts.values <= seuil]
    # The mask is computed once, before any replacement, as the original loop
    # did by comparing against the unmodified input.
    DF.loc[DF[column].isin(rare_values), column] = Replace
    return DF
In [191]:
# Read the train / test feature tables and the target column.
DF = pd.read_csv('De.csv')
DG = pd.read_csv('TEST.csv')
TRAIN, TEST = pd.DataFrame(DF), pd.DataFrame(DG)
Y = pd.read_csv('tr.csv').ACTION.values

# One-hot encode train and test together, on their string representation,
# then split the encoded matrix back by row count.
train_rows = len(TRAIN)
all_data = np.vstack((TRAIN, TEST))
X_all = OneHotEncoder().fit_transform(all_data.astype(str))
X_train_all, X_test_all = X_all[:train_rows, :], X_all[train_rows:, :]
In [9]:
# Single stratified shuffle split of Y with 20% held out
# (the next cell builds the same splitter again).
Sss = StratifiedShuffleSplit(
    Y,
    1,
    test_size=0.2,
    random_state=0,
)
In [16]:
# Hold out 20% of the encoded training rows for validating the stack.
# NOTE(review): this rebinds TRAIN and TEST, which the loading cell bound to
# the frames read from 'De.csv' and 'TEST.csv'; later cells that use
# TRAIN/TEST see these slices when the notebook runs top to bottom.
Sss = StratifiedShuffleSplit(Y, 1, test_size=0.2, random_state=0)
for train, test in Sss:
    TRAIN, YTRAIN = X_train_all[train, :], Y[train]
    TEST, YTEST = X_train_all[test, :], Y[test]
In [119]:
# First-level models of the stack (fitted on TRAIN / YTRAIN in a later cell).
Clf = LogisticRegression(
    C=0.8,
    tol=1,
    penalty='l2',
    fit_intercept=False,
    class_weight='balanced',
    max_iter=10000,
    multi_class='ovr',
    solver='lbfgs',
    random_state=2,
)
ABC = ExtraTreesClassifier(
    n_estimators=200,
    min_samples_split=5,
    min_samples_leaf=0.01,
    criterion='entropy',
    random_state=1,
    n_jobs=-1,
)
NA = MultinomialNB(alpha=0.04)
# NOTE(review): this rebinds the imported class name `SVC` to an estimator
# instance, so any later `SVC(...)` call, or a re-run of this cell, fails
# until the import cell is executed again.
SVC = SVC(probability=True, random_state=1, C=9, gamma=0.5)
In [121]:
# Probability column (index 1 of predict_proba) of each first-level model on TRAIN.
# NOTE(review): this cell is placed before the cell that calls .fit() on these
# models, and that cell rebuilds Korona anyway; it only works if the models
# were already fitted in the running kernel.
Korona = pd.DataFrame()
for _label, _model in (
    ('LogisticRegression', Clf),
    ('ExtraTreesClassifier', ABC),
    ('MultinomialNB', NA),
    ('SVC', SVC),
):
    Korona[_label] = _model.predict_proba(TRAIN)[:, 1]
In [120]:
# Fit every first-level model on the training part of the split.
for _model in (Clf, ABC, NA, SVC):
    _model.fit(TRAIN, YTRAIN)

# Stacking table on the same TRAIN rows: one probability column per model
# (index 1 of predict_proba) plus the target.
Korona = pd.DataFrame()
for _label, _model in (
    ('LogisticRegression', Clf),
    ('ExtraTreesClassifier', ABC),
    ('MultinomialNB', NA),
    ('SVC', SVC),
):
    Korona[_label] = _model.predict_proba(TRAIN)[:, 1]
Korona['Action'] = YTRAIN
Out[120]:
In [174]:
# Second-level logistic regression, scored with getScoreClf on the stacking
# table without its 'SVC' column.
Reg3 = LogisticRegression(
    C=200,
    intercept_scaling=100,
    class_weight="balanced",
    random_state=1,
)
_meta_features = Korona.drop(['Action', 'SVC'], axis=1)
print(getScoreClf(_meta_features, Korona.Action, Reg3).mean())
In [127]:
# Same stacking columns as Korona, computed on the TEST part of the split.
Korona1 = pd.DataFrame()
for _label, _model in (
    ('LogisticRegression', Clf),
    ('ExtraTreesClassifier', ABC),
    ('MultinomialNB', NA),
    ('SVC', SVC),
):
    Korona1[_label] = _model.predict_proba(TEST)[:, 1]
In [198]:
# Random forest trained on the CSV training table (not the one-hot matrix)
# after getChanges has been applied to the columns listed in Regle.
clf4 = RandomForestClassifier(max_depth = 22 , min_samples_leaf = 9 , min_samples_split = 19 ,n_estimators = 97, random_state=5)
# [column name, count threshold] pairs handed to getChanges.
Regle = [['ROLE_CODE' , 5] , ['ROLE_ROLLUP_2' , 1 ] ,['ROLE_FAMILY' , 4 ] ]
# Start from DF rather than TRAIN: the split cell rebinds TRAIN to a slice of
# the one-hot matrix, which getChanges cannot index by column name. DF still
# holds the frame read from 'De.csv' (or the output of a previous run of this
# cell).
DF = DF.copy()
for r in Regle:
    DF = getChanges(DF , Y , r[0], clf4 , r[1])
clf4.fit(DF , Y)
Out[198]:
In [275]:
# Extra per-row model outputs read from CSV; later cells copy their
# 'XT', 'RF', 'ABCn', 'ABC' and 'Voting' columns into the stacking tables.
DT = pd.read_csv('TR_vot_num.csv')
DT2 = pd.read_csv('TS_vot_num.csv')
In [276]:
# Preview the first rows of DT2.
DT2.head()
Out[276]:
In [133]:
## Finishing the training: refit the first-level models on the full training set
In [178]:
# Refit the first-level models on the full one-hot training matrix.
# An earlier cell executes `SVC = SVC(...)`, which rebinds the class name to
# an estimator instance; calling `SVC(...)` again would then raise
# "'SVC' object is not callable". Import the class under an alias instead.
from sklearn.svm import SVC as SVCClassifier

Clf = LogisticRegression(C = 0.8 , tol = 1, penalty = 'l2' , fit_intercept = False , class_weight='balanced' , max_iter=10000 , multi_class = 'ovr' , solver= 'lbfgs' , random_state=2)
ABC = ExtraTreesClassifier(n_estimators= 200 , min_samples_split = 5,min_samples_leaf=0.01, criterion = 'entropy',random_state=1 , n_jobs = -1 )
NA = MultinomialNB(alpha= 0.04 )
SV = SVCClassifier(probability=True, random_state=1, C= 9, gamma= 0.5)
###############################
Clf.fit(X_train_all ,Y)
ABC.fit(X_train_all ,Y)
NA.fit(X_train_all ,Y)
SV.fit(X_train_all , Y)
Out[178]:
In [179]:
########################
# Stacking table for the final model, one row per training row:
# probability columns of the refitted first-level models, the random forest
# clf4, and columns copied from DT.
Korona2 = pd.DataFrame()
Korona2['LogisticRegression'] = Clf.predict_proba(X_train_all)[:,1]
Korona2['ExtraTreesClassifier'] = ABC.predict_proba(X_train_all)[:,1]
Korona2['MultinomialNB'] = NA.predict_proba(X_train_all)[:,1]
Korona2['SVC'] = SV.predict_proba(X_train_all)[:,1]
Korona2['RandomForestClassifier'] = clf4.predict_proba(DF)[:,1]
# DT['GB'] was tried for this column first but scored low on the public
# board, so DT['XT'] is used instead.
Korona2['GradiantBoostingClassifier'] = DT['XT']
Korona2['RF'] = DT['RF']
Korona2['ABCn'] = DT['ABCn']
Korona2['ABC'] = DT['ABC']
# The target belongs on Korona2 (the final fit drops 'Action' from it) and
# must be the full-length Y; the previous line wrote the split-sized YTRAIN
# into the unrelated Korona table.
Korona2['Action'] = Y
In [239]:
# Preview the first rows of the final training stacking table.
Korona2.head()
Out[239]:
In [237]:
# NOTE(review): `D21` is not defined anywhere in the visible notebook; this
# cell presumably relies on a variable left over from a deleted cell and will
# raise NameError on a fresh kernel. Confirm, then define or remove it.
D21.head()
Out[237]:
In [278]:
# Add the 'Voting' column of DT to the training stacking table.
Korona2['Voting'] = DT['Voting']
In [226]:
In [231]:
In [215]:
# Preview the training stacking table again (same call as the earlier preview cell).
Korona2.head()
Out[215]:
In [307]:
# Final second-level model: logistic regression fitted on every Korona2
# column except the 'Action' target.
RegFinal = LogisticRegression(
    C=6,
    intercept_scaling=100,
    class_weight="balanced",
    random_state=1,
)
RegFinal.fit(Korona2.drop(['Action'], axis=1), Y)
Out[307]:
In [183]:
# Stacking table for the test set, built column-for-column like Korona2.
Korona3 = pd.DataFrame()
Korona3['LogisticRegression'] = Clf.predict_proba(X_test_all)[:,1]
Korona3['ExtraTreesClassifier'] = ABC.predict_proba(X_test_all)[:,1]
Korona3['MultinomialNB'] = NA.predict_proba(X_test_all)[:,1]
Korona3['SVC'] = SV.predict_proba(X_test_all)[:,1]
# Use DG (the frame read from 'TEST.csv') rather than TEST: the split cell
# rebinds TEST to a validation slice of the one-hot training matrix, which is
# neither the test set nor the column layout clf4 was fitted on.
Korona3['RandomForestClassifier'] = clf4.predict_proba(DG)[:,1]
Korona3['GradiantBoostingClassifier'] = DT2['XT']
Korona3['RF'] = DT2['RF']
Korona3['ABCn'] = DT2['ABCn']
Korona3['ABC'] = DT2['ABC']
In [200]:
# Recompute the random-forest column on DG (the frame read from 'TEST.csv').
# TEST is rebound by the split cell to a validation slice of the one-hot
# training matrix, so it is not the test set when cells run in order.
Korona3['RandomForestClassifier'] = clf4.predict_proba(DG)[:,1]
In [218]:
# Re-assign this column from DT2['XT'] (the Korona3 construction cell already sets it).
Korona3['GradiantBoostingClassifier'] = DT2['XT']
In [223]:
# Re-assign the 'RF' column from DT2 (the Korona3 construction cell already sets it).
Korona3['RF'] = DT2['RF']
In [228]:
# Re-assign the 'ABCn' column from DT2 (the Korona3 construction cell already sets it).
Korona3['ABCn'] = DT2['ABCn']
In [233]:
# Re-assign the 'ABC' column from DT2 (the Korona3 construction cell already sets it).
Korona3['ABC'] = DT2['ABC']
In [279]:
# Add the 'Voting' column of DT2 to the test stacking table.
Korona3['Voting'] = DT2['Voting']
In [310]:
# Persist both stacking tables without the index column.
for _frame, _path in ((Korona2, 'TRAIN2.csv'), (Korona3, 'TEST2.csv')):
    _frame.to_csv(_path, index=False)
In [308]:
# Score the test stacking table with exactly the columns RegFinal is fitted
# on (every Korona2 column except the 'Action' target), in the same order.
# The previous code dropped an 'ANOUR_PREDICTION' column that no cell in this
# notebook creates, which raises on a fresh run; selecting the training
# columns also ignores such an extra column if it happens to be present.
feature_columns = [col for col in Korona2.columns if col != 'Action']
YFINAL = RegFinal.predict_proba(Korona3[feature_columns])[:,1]
In [309]:
# Fill the sample submission's 'Action' column with the final probabilities
# and write the submission file.
DR = pd.read_csv('sampleSubmission.csv').assign(Action=YFINAL)
DR.to_csv('f9.csv', index=False)
In [ ]: