In [204]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn import metrics
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression ,Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import  ExtraTreesClassifier , GradientBoostingClassifier , RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold , StratifiedShuffleSplit , cross_val_score
from sklearn.naive_bayes import MultinomialNB , BernoulliNB
from sklearn.svm import LinearSVC , SVC
from sklearn.neighbors import KNeighborsClassifier
def getScoreClf(DF , Y , clf ):
    clf = clone(clf)
    return cross_val_score(clf , DF , Y , scoring='roc_auc' ,  cv = 2 , n_jobs=-1)
def stackingBuildProbaMatrix(Clfs , Y ):
    empty = pd.DataFrame()
    for clf in Clfs:
        Clf = clf[1]
        DF = clf[2]
        name = clf[0]
        ET = Clf.predict_proba(DF)[:,1]
        empty[name]=ET
        #clf['clf'].fit(clf['X'],Y)
        #empty[clf[0]]=clf[1].predict_proba(clf[2])
    empty['Action'] = Y
    return empty
def stackingBuildProbaMatrixSend(Clfs ):
    empty = pd.DataFrame()
    for clf in Clfs:
        Clf = clf[1]
        DF = clf[2]
        name = clf[0]
        ET = Clf.predict_proba(DF)[:,1]
        empty[name]=ET
        #clf['clf'].fit(clf['X'],Y)
        #empty[clf[0]]=clf[1].predict_proba(clf[2])
    #empty['Action'] = Y
    return empty
def getChanges(df , Y, column , clf , seuil ):
    DF = df.copy()
    R = DF[column].value_counts()
    Ind = list(R.index)
    R = list(R)
    Replace = Ind[R.index(min(R))]
    for i in range(0 , len(R)):
        if R[i] <= seuil :
            DF.loc[df[column] == Ind[i] , column] = Replace
    return  df

In [191]:
DF = pd.read_csv('De.csv')
TRAIN = pd.DataFrame(DF)
DG = pd.read_csv('TEST.csv')
TEST= pd.DataFrame(DG)
Y = pd.read_csv('tr.csv')
Y=pd.DataFrame(Y)
Y=Y.ACTION.values
all_data = np.vstack((TRAIN, TEST))
train_rows = len(TRAIN)

X_all = OneHotEncoder().fit_transform(all_data.astype(str))
X_train_all = X_all[:train_rows,:]
X_test_all = X_all[train_rows:,:]

In [9]:
Sss = StratifiedShuffleSplit(Y, 1, test_size=0.2, random_state=0)

In [16]:
Sss = StratifiedShuffleSplit(Y, 1, test_size=0.2, random_state=0)
for train , test in Sss:
    TRAIN , TEST , YTRAIN , YTEST = X_train_all[train,:],X_train_all[test,:] , Y[train] , Y[test]

In [119]:
Clf = LogisticRegression(C =  0.8 , tol = 1,   penalty = 'l2' , fit_intercept = False , class_weight='balanced' , max_iter=10000 , multi_class = 'ovr' , solver= 'lbfgs' , random_state=2)
ABC = ExtraTreesClassifier(n_estimators= 200 , min_samples_split = 5,min_samples_leaf=0.01, criterion = 'entropy',random_state=1 , n_jobs = -1 )
NA = MultinomialNB(alpha= 0.04 )
SVC = SVC(probability=True, random_state=1, C= 9, gamma= 0.5)

In [121]:
Korona = pd.DataFrame()
Korona['LogisticRegression'] = Clf.predict_proba(TRAIN)[:,1]
Korona['ExtraTreesClassifier'] = ABC.predict_proba(TRAIN)[:,1]
Korona['MultinomialNB'] = NA.predict_proba(TRAIN)[:,1]
Korona['SVC'] = SVC.predict_proba(TRAIN)[:,1]

In [120]:
Clf.fit(TRAIN ,  YTRAIN)
ABC.fit(TRAIN ,  YTRAIN)
NA.fit(TRAIN ,  YTRAIN)
SVC.fit(TRAIN ,  YTRAIN)
########################
Korona = pd.DataFrame()
Korona['LogisticRegression'] = Clf.predict_proba(TRAIN)[:,1]
Korona['ExtraTreesClassifier'] = ABC.predict_proba(TRAIN)[:,1]
Korona['MultinomialNB'] = NA.predict_proba(TRAIN)[:,1]
Korona['SVC'] = SVC.predict_proba(TRAIN)[:,1]
Korona['Action'] = YTRAIN
########################


Out[120]:
SVC(C=9, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.5, kernel='rbf',
  max_iter=-1, probability=True, random_state=1, shrinking=True, tol=0.001,
  verbose=False)

In [174]:
Reg3 = LogisticRegression(C= 200 , intercept_scaling = 100, class_weight ="balanced" , random_state=1)
print(getScoreClf(Korona.drop(['Action','SVC'] , axis = 1) , Korona.Action , Reg3).mean())


0.999923765206

In [127]:
Korona1 = pd.DataFrame()
Korona1['LogisticRegression'] = Clf.predict_proba(TEST)[:,1]
Korona1['ExtraTreesClassifier'] = ABC.predict_proba(TEST)[:,1]
Korona1['MultinomialNB'] = NA.predict_proba(TEST)[:,1]
Korona1['SVC'] = SVC.predict_proba(TEST)[:,1]
##Korona['Action'] = YTRAIN

In [198]:
clf4 = RandomForestClassifier(max_depth = 22 , min_samples_leaf = 9 , min_samples_split = 19 ,n_estimators = 97, random_state=5)
Regle = [['ROLE_CODE' ,  5] , ['ROLE_ROLLUP_2' , 1 ] ,['ROLE_FAMILY' , 4 ] ]
DF = TRAIN.copy()
for r in Regle:
    DF = getChanges(DF , Y , r[0], clf4 , r[1])
clf4.fit(DF , Y)


Out[198]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=22, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=9, min_samples_split=19,
            min_weight_fraction_leaf=0.0, n_estimators=97, n_jobs=1,
            oob_score=False, random_state=5, verbose=0, warm_start=False)

In [275]:
DT = pd.read_csv('TR_vot_num.csv' )
DT = pd.DataFrame(DT)
DT.head()

DT2 = pd.read_csv('TS_vot_num.csv' )
DT2 = pd.DataFrame(DT2)
#DT2.head()

In [276]:
DT2.head()


Out[276]:
Unnamed: 0 Voting
0 0 0.838805
1 1 0.975624
2 2 0.989321
3 3 0.988575
4 4 0.989956

In [133]:
## Finishing With THE TRAINING ::

In [178]:
Clf = LogisticRegression(C =  0.8 , tol = 1,   penalty = 'l2' , fit_intercept = False , class_weight='balanced' , max_iter=10000 , multi_class = 'ovr' , solver= 'lbfgs' , random_state=2)
ABC = ExtraTreesClassifier(n_estimators= 200 , min_samples_split = 5,min_samples_leaf=0.01, criterion = 'entropy',random_state=1 , n_jobs = -1 )
NA = MultinomialNB(alpha= 0.04 )
SV = SVC(probability=True, random_state=1, C= 9, gamma= 0.5)
###############################
Clf.fit(X_train_all ,Y)
ABC.fit(X_train_all ,Y)
NA.fit(X_train_all  ,Y)
SV.fit(X_train_all , Y)


Out[178]:
SVC(C=9, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.5, kernel='rbf',
  max_iter=-1, probability=True, random_state=1, shrinking=True, tol=0.001,
  verbose=False)

In [179]:
########################
Korona2 = pd.DataFrame()
Korona2['LogisticRegression'] = Clf.predict_proba(X_train_all)[:,1]
Korona2['ExtraTreesClassifier'] = ABC.predict_proba(X_train_all)[:,1]
Korona2['MultinomialNB'] = NA.predict_proba(X_train_all)[:,1]
Korona2['SVC'] = SV.predict_proba(X_train_all)[:,1]
Korona2['RandomForestClassifier'] = clf4.predict_proba(DF)[:,1]
#Korona2['GradiantBoostingClassifier'] = DT['GB'] GB --> LOW SCORE ON PUBLIC BOARD
Korona2['GradiantBoostingClassifier'] = DT['XT']
Korona2['RandomForestClassifier'] = clf4.predict_proba(DF)[:,1]
Korona2['RF'] = DT['RF']
Korona2['ABCn'] = DT['ABCn']
Korona2['ABC'] = DT['ABC']
Korona['Action'] = YTRAIN

In [239]:
Korona2.head()


Out[239]:
LogisticRegression ExtraTreesClassifier MultinomialNB SVC Action RandomForestClassifier GradiantBoostingClassifier RF ABCn ABC
0 0.990381 1.000000 0.999962 0.980334 1 0.994530 0.998567 0.997622 0.984622 0.928572
1 0.936808 0.997500 0.999947 0.980363 1 0.963492 0.966326 0.955832 0.963648 0.887020
2 0.750225 0.982500 0.978939 0.980318 1 0.916217 0.988525 0.908988 0.897310 0.799879
3 0.977326 1.000000 0.999972 0.980618 1 0.994417 0.987551 0.980117 0.975305 0.890264
4 0.719636 0.998333 0.976973 0.980353 1 0.961037 0.947103 0.975461 0.952147 0.869679

In [237]:
D21.head()


Out[237]:
Unnamed: 0 LR NB SVC XT KN
0 0 0.994146 0.999830 0.979237 1.000000 NaN
1 1 0.992071 0.999988 0.976685 0.998684 NaN
2 2 0.975762 0.998207 0.962786 0.993421 NaN
3 3 0.993063 0.999543 0.983093 1.000000 NaN
4 4 0.985266 0.999927 0.976292 1.000000 NaN

In [278]:
Korona2['Voting'] = DT['Voting']

In [226]:


In [231]:


In [215]:
Korona2.head()


Out[215]:
LogisticRegression ExtraTreesClassifier MultinomialNB SVC Action RandomForestClassifier GradiantBoostingClassifier
0 0.990381 1.000000 0.999962 0.980334 1 0.994530 0.999992
1 0.936808 0.997500 0.999947 0.980363 1 0.963492 0.999962
2 0.750225 0.982500 0.978939 0.980318 1 0.916217 0.999970
3 0.977326 1.000000 0.999972 0.980618 1 0.994417 0.999871
4 0.719636 0.998333 0.976973 0.980353 1 0.961037 0.999998

In [307]:
RegFinal = LogisticRegression(C=6, intercept_scaling = 100, class_weight ="balanced" , random_state=1)
RegFinal.fit(Korona2.drop(['Action'] , axis =1) , Y)


Out[307]:
LogisticRegression(C=6, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=100, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=1,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [183]:
Korona3 = pd.DataFrame()
Korona3['LogisticRegression'] = Clf.predict_proba(X_test_all)[:,1]
Korona3['ExtraTreesClassifier'] = ABC.predict_proba(X_test_all)[:,1]
Korona3['MultinomialNB'] = NA.predict_proba(X_test_all)[:,1]
Korona3['SVC'] = SV.predict_proba(X_test_all)[:,1]
Korona3['RandomForestClassifier'] = clf4.predict_proba(TEST)[:,1]
Korona3['GradiantBoostingClassifier'] = DT2['XT']
Korona3['RF'] = DT2['RF']
Korona3['ABCn'] = DT2['ABCn']
Korona3['ABC'] = DT2['ABC']

In [200]:
Korona3['RandomForestClassifier'] = clf4.predict_proba(TEST)[:,1]

In [218]:
Korona3['GradiantBoostingClassifier'] = DT2['XT']

In [223]:
Korona3['RF'] = DT2['RF']

In [228]:
Korona3['ABCn'] = DT2['ABCn']

In [233]:
Korona3['ABC'] = DT2['ABC']

In [279]:
Korona3['Voting'] = DT2['Voting']

In [310]:
Korona2.to_csv('TRAIN2.csv' , index = False)
Korona3.to_csv('TEST2.csv' , index = False)

In [308]:
YFINAL = RegFinal.predict_proba(Korona3.drop(['ANOUR_PREDICTION'] , axis = 1  ))[:,1]

In [309]:
DR = pd.read_csv('sampleSubmission.csv')
DR =pd.DataFrame(DR)
DR['Action']=YFINAL
DR.to_csv('f9.csv' , index = False)

In [ ]: