In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
%matplotlib inline
In [10]:
# Load the feature table: a semicolon-separated CSV whose first column
# is used as the DataFrame index.
df_dados = pd.read_csv(
    '../data/features.csv',
    sep=';',
    index_col=0,
)
# Preview the first rows to sanity-check the parsed columns.
df_dados.head()
Out[10]:
nu_CPFCNPJ
qtdAditivosPorCPFCNPJ
qtdContratos
vl_TotalContrato
vl_Contrato
participacoes
valor_total_pregao
valor_total_tomada
valor_total_convite
participacoes_pregao
participacoes_tomada
participacoes_convite
label
label_pred
confianca
0
10140642000140
26
8
415668.77
71257.0
5
0.0
0.0
71257.0
0.0
0.0
5.0
0
0
1.0
1
10142026000128
19
3
67887.20
64587.2
2
0.0
0.0
64587.2
0.0
0.0
2.0
0
0
1.0
2
10158142000135
11
1
80900.00
80900.0
1
0.0
0.0
80900.0
0.0
0.0
1.0
0
0
1.0
3
10171194000141
4
1
45285.00
45285.0
1
0.0
0.0
45285.0
0.0
0.0
1.0
0
0
1.0
4
10175041000172
45
4
144250.00
133450.0
5
31050.0
0.0
102400.0
1.0
0.0
4.0
0
0
1.0
In [3]:
# Build the feature matrix and the target vector.
#
# 'nu_CPFCNPJ' is an identifier and 'label' is the target, so neither is a
# feature. 'label_pred' and 'confianca' are produced further down in this
# notebook and saved back into the same features.csv; when the file is
# reloaded on a later run they would otherwise end up inside `samples`,
# feeding the model its own earlier predictions. errors='ignore' keeps the
# cell working on a file that does not contain those columns yet.
non_feature_cols = ['nu_CPFCNPJ', 'label', 'label_pred', 'confianca']
samples = df_dados.drop(non_feature_cols, axis=1, errors='ignore').values
labels = df_dados.label.values
print(samples.shape, labels.shape)
(2075, 11) (2075,)
In [4]:
# Pipeline: standardise the features, then classify with an SVM.
# The scaler is a pipeline step, so GridSearchCV re-fits it together with
# the SVM for every cross-validation split.
#
# probability=True is required for predict_proba later on. scikit-learn fits
# that probability model with an internal, shuffled cross-validation, so
# random_state is pinned to make the reported probabilities repeatable
# between runs.
estimators = [('std', StandardScaler()), ('svm', SVC(probability=True, random_state=0))]
pipe = Pipeline(estimators)

# Hyper-parameter grid: 2 values of C x 2 kernels = 4 candidates,
# each scored by F1 over 10 folds.
parameters = dict(svm__C=[1,10], svm__kernel=['linear', 'rbf'])
clf = GridSearchCV(pipe, parameters, scoring='f1', verbose=10, cv=10)
clf.fit(samples, labels)
print(clf.best_params_, clf.best_score_)
Fitting 10 folds for each of 4 candidates, totalling 40 fits
[CV] svm__kernel=linear, svm__C=1 ....................................
[CV] ..... svm__kernel=linear, svm__C=1, score=0.964912, total= 0.1s
[CV] svm__kernel=linear, svm__C=1 ....................................
[CV] ..... svm__kernel=linear, svm__C=1, score=0.955752, total= 0.1s
[CV] svm__kernel=linear, svm__C=1 ....................................
[CV] ..... svm__kernel=linear, svm__C=1, score=0.944444, total= 0.1s
[CV] svm__kernel=linear, svm__C=1 ....................................
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.1s remaining: 0.0s
[Parallel(n_jobs=1)]: Done 2 out of 2 | elapsed: 0.1s remaining: 0.0s
[Parallel(n_jobs=1)]: Done 3 out of 3 | elapsed: 0.2s remaining: 0.0s
[Parallel(n_jobs=1)]: Done 4 out of 4 | elapsed: 0.3s remaining: 0.0s
[CV] ..... svm__kernel=linear, svm__C=1, score=0.920354, total= 0.1s
[CV] svm__kernel=linear, svm__C=1 ....................................
[CV] ..... svm__kernel=linear, svm__C=1, score=0.963636, total= 0.1s
[CV] svm__kernel=linear, svm__C=1 ....................................
[CV] ..... svm__kernel=linear, svm__C=1, score=0.955752, total= 0.1s
[CV] svm__kernel=linear, svm__C=1 ....................................
[CV] ..... svm__kernel=linear, svm__C=1, score=0.907407, total= 0.1s
[CV] svm__kernel=linear, svm__C=1 ....................................
[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 0.3s remaining: 0.0s
[Parallel(n_jobs=1)]: Done 6 out of 6 | elapsed: 0.4s remaining: 0.0s
[Parallel(n_jobs=1)]: Done 7 out of 7 | elapsed: 0.4s remaining: 0.0s
[Parallel(n_jobs=1)]: Done 8 out of 8 | elapsed: 0.5s remaining: 0.0s
[CV] ..... svm__kernel=linear, svm__C=1, score=0.914286, total= 0.1s
[CV] svm__kernel=linear, svm__C=1 ....................................
[CV] ..... svm__kernel=linear, svm__C=1, score=0.929825, total= 0.1s
[CV] svm__kernel=linear, svm__C=1 ....................................
[CV] ..... svm__kernel=linear, svm__C=1, score=0.880734, total= 0.1s
[CV] svm__kernel=rbf, svm__C=1 .......................................
[Parallel(n_jobs=1)]: Done 9 out of 9 | elapsed: 0.6s remaining: 0.0s
[CV] ........ svm__kernel=rbf, svm__C=1, score=0.954955, total= 0.1s
[CV] svm__kernel=rbf, svm__C=1 .......................................
[CV] ........ svm__kernel=rbf, svm__C=1, score=0.982143, total= 0.1s
[CV] svm__kernel=rbf, svm__C=1 .......................................
[CV] ........ svm__kernel=rbf, svm__C=1, score=0.963636, total= 0.1s
[CV] svm__kernel=rbf, svm__C=1 .......................................
[CV] ........ svm__kernel=rbf, svm__C=1, score=0.972477, total= 0.1s
[CV] svm__kernel=rbf, svm__C=1 .......................................
[CV] ........ svm__kernel=rbf, svm__C=1, score=0.944444, total= 0.1s
[CV] svm__kernel=rbf, svm__C=1 .......................................
[CV] ........ svm__kernel=rbf, svm__C=1, score=0.972477, total= 0.1s
[CV] svm__kernel=rbf, svm__C=1 .......................................
[CV] ........ svm__kernel=rbf, svm__C=1, score=0.915888, total= 0.1s
[CV] svm__kernel=rbf, svm__C=1 .......................................
[CV] ........ svm__kernel=rbf, svm__C=1, score=0.912621, total= 0.1s
[CV] svm__kernel=rbf, svm__C=1 .......................................
[CV] ........ svm__kernel=rbf, svm__C=1, score=0.972973, total= 0.1s
[CV] svm__kernel=rbf, svm__C=1 .......................................
[CV] ........ svm__kernel=rbf, svm__C=1, score=0.915888, total= 0.1s
[CV] svm__kernel=linear, svm__C=10 ...................................
[CV] .... svm__kernel=linear, svm__C=10, score=0.964912, total= 0.1s
[CV] svm__kernel=linear, svm__C=10 ...................................
[CV] .... svm__kernel=linear, svm__C=10, score=0.938053, total= 0.1s
[CV] svm__kernel=linear, svm__C=10 ...................................
[CV] .... svm__kernel=linear, svm__C=10, score=0.954128, total= 0.1s
[CV] svm__kernel=linear, svm__C=10 ...................................
[CV] .... svm__kernel=linear, svm__C=10, score=0.939130, total= 0.1s
[CV] svm__kernel=linear, svm__C=10 ...................................
[CV] .... svm__kernel=linear, svm__C=10, score=0.955752, total= 0.1s
[CV] svm__kernel=linear, svm__C=10 ...................................
[CV] .... svm__kernel=linear, svm__C=10, score=0.955752, total= 0.1s
[CV] svm__kernel=linear, svm__C=10 ...................................
[CV] .... svm__kernel=linear, svm__C=10, score=0.907407, total= 0.1s
[CV] svm__kernel=linear, svm__C=10 ...................................
[CV] .... svm__kernel=linear, svm__C=10, score=0.934579, total= 0.1s
[CV] svm__kernel=linear, svm__C=10 ...................................
[CV] .... svm__kernel=linear, svm__C=10, score=0.939130, total= 0.1s
[CV] svm__kernel=linear, svm__C=10 ...................................
[CV] .... svm__kernel=linear, svm__C=10, score=0.880734, total= 0.1s
[CV] svm__kernel=rbf, svm__C=10 ......................................
[CV] ....... svm__kernel=rbf, svm__C=10, score=0.964286, total= 0.1s
[CV] svm__kernel=rbf, svm__C=10 ......................................
[CV] ....... svm__kernel=rbf, svm__C=10, score=0.982143, total= 0.1s
[CV] svm__kernel=rbf, svm__C=10 ......................................
[CV] ....... svm__kernel=rbf, svm__C=10, score=0.982143, total= 0.1s
[CV] svm__kernel=rbf, svm__C=10 ......................................
[CV] ....... svm__kernel=rbf, svm__C=10, score=0.982456, total= 0.1s
[CV] svm__kernel=rbf, svm__C=10 ......................................
[CV] ....... svm__kernel=rbf, svm__C=10, score=0.982143, total= 0.1s
[CV] svm__kernel=rbf, svm__C=10 ......................................
[CV] ....... svm__kernel=rbf, svm__C=10, score=0.981818, total= 0.1s
[CV] svm__kernel=rbf, svm__C=10 ......................................
[CV] ....... svm__kernel=rbf, svm__C=10, score=0.927273, total= 0.1s
[CV] svm__kernel=rbf, svm__C=10 ......................................
[CV] ....... svm__kernel=rbf, svm__C=10, score=0.953271, total= 0.1s
[CV] svm__kernel=rbf, svm__C=10 ......................................
[CV] ....... svm__kernel=rbf, svm__C=10, score=0.972973, total= 0.1s
[CV] svm__kernel=rbf, svm__C=10 ......................................
[CV] ....... svm__kernel=rbf, svm__C=10, score=0.945455, total= 0.1s
{'svm__kernel': 'rbf', 'svm__C': 10} 0.967414311082
[Parallel(n_jobs=1)]: Done 40 out of 40 | elapsed: 3.6s finished
In [6]:
# Read the winning hyper-parameters from the fitted grid search instead of
# hard-coding them, so they stay correct if the data or the grid changes.
best_c = clf.best_params_['svm__C']
best_kernel = clf.best_params_['svm__kernel']
In [7]:
# Predict a label for every row with the fitted grid search.
# Note these are in-sample predictions: `samples` is the same matrix
# that `clf` was fitted on.
predicted_labels = clf.predict(samples)
df_dados['label_pred'] = predicted_labels
df_dados.head()
Out[7]:
nu_CPFCNPJ
qtdAditivosPorCPFCNPJ
qtdContratos
vl_TotalContrato
vl_Contrato
participacoes
valor_total_pregao
valor_total_tomada
valor_total_convite
participacoes_pregao
participacoes_tomada
participacoes_convite
label
label_pred
0
10140642000140
26
8
415668.77
71257.0
5
0.0
0.0
71257.0
0.0
0.0
5.0
0
0
1
10142026000128
19
3
67887.20
64587.2
2
0.0
0.0
64587.2
0.0
0.0
2.0
0
0
2
10158142000135
11
1
80900.00
80900.0
1
0.0
0.0
80900.0
0.0
0.0
1.0
0
0
3
10171194000141
4
1
45285.00
45285.0
1
0.0
0.0
45285.0
0.0
0.0
1.0
0
0
4
10175041000172
45
4
144250.00
133450.0
5
31050.0
0.0
102400.0
1.0
0.0
4.0
0
0
In [8]:
# Class-membership probabilities for every row, one column per class.
probs = clf.predict_proba(samples)

# Keep only the first column (the probability of the first class in
# clf.classes_), rounded to two decimals.
# NOTE(review): this is P(first class), not the confidence of the predicted
# class -- rows predicted as the other class get a low value. Confirm that
# this is the intended meaning of 'confianca'.
df_dados['confianca'] = probs[:, 0].round(2)
df_dados.head()
Out[8]:
nu_CPFCNPJ
qtdAditivosPorCPFCNPJ
qtdContratos
vl_TotalContrato
vl_Contrato
participacoes
valor_total_pregao
valor_total_tomada
valor_total_convite
participacoes_pregao
participacoes_tomada
participacoes_convite
label
label_pred
confianca
0
10140642000140
26
8
415668.77
71257.0
5
0.0
0.0
71257.0
0.0
0.0
5.0
0
0
1.0
1
10142026000128
19
3
67887.20
64587.2
2
0.0
0.0
64587.2
0.0
0.0
2.0
0
0
1.0
2
10158142000135
11
1
80900.00
80900.0
1
0.0
0.0
80900.0
0.0
0.0
1.0
0
0
1.0
3
10171194000141
4
1
45285.00
45285.0
1
0.0
0.0
45285.0
0.0
0.0
1.0
0
0
1.0
4
10175041000172
45
4
144250.00
133450.0
5
31050.0
0.0
102400.0
1.0
0.0
4.0
0
0
1.0
In [9]:
# Persist the table, now including 'label_pred' and 'confianca'.
# This overwrites the same file that the notebook loads at the top.
df_dados.to_csv('../data/features.csv', sep=';')
In [ ]:
Content source: DiegoPereira/Teupassadotecondena
Similar notebooks: