Classification


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

%matplotlib inline

Carregamento das amostras


In [10]:
# Read the semicolon-separated feature table; the first CSV column is used as the index.
df_dados = pd.read_csv(
    '../data/features.csv',
    sep=';',
    index_col=0,
)
# Preview the first rows to sanity-check the columns that were loaded.
df_dados.head()


Out[10]:
nu_CPFCNPJ qtdAditivosPorCPFCNPJ qtdContratos vl_TotalContrato vl_Contrato participacoes valor_total_pregao valor_total_tomada valor_total_convite participacoes_pregao participacoes_tomada participacoes_convite label label_pred confianca
0 10140642000140 26 8 415668.77 71257.0 5 0.0 0.0 71257.0 0.0 0.0 5.0 0 0 1.0
1 10142026000128 19 3 67887.20 64587.2 2 0.0 0.0 64587.2 0.0 0.0 2.0 0 0 1.0
2 10158142000135 11 1 80900.00 80900.0 1 0.0 0.0 80900.0 0.0 0.0 1.0 0 0 1.0
3 10171194000141 4 1 45285.00 45285.0 1 0.0 0.0 45285.0 0.0 0.0 1.0 0 0 1.0
4 10175041000172 45 4 144250.00 133450.0 5 31050.0 0.0 102400.0 1.0 0.0 4.0 0 0 1.0

In [3]:
# Build the feature matrix and the target vector.
# 'label' is the target (see `labels` below) and 'nu_CPFCNPJ' appears to be a
# record identifier, so neither is used as a feature.
# 'label_pred' and 'confianca' are written back into features.csv by the last
# cells of this notebook. When the file is read again they must be excluded as
# well, otherwise the classifier is trained on its own previous predictions
# (target leakage). errors='ignore' keeps this working on a file that does not
# contain those columns yet.
non_feature_cols = ['nu_CPFCNPJ', 'label', 'label_pred', 'confianca']
samples = df_dados.drop(non_feature_cols, axis=1, errors='ignore').values
labels = df_dados.label.values
print(samples.shape, labels.shape)


(2075, 11) (2075,)

Classificação


In [4]:
# Two-step pipeline: standardise the features, then classify with an SVM.
# probability=True enables predict_proba (used further down). The probability
# calibration inside SVC is randomised, so random_state is pinned to make the
# reported probabilities reproducible across kernel restarts.
estimators = [
    ('std', StandardScaler()),
    ('svm', SVC(probability=True, random_state=42)),
]
pipe = Pipeline(estimators)

# Hyper-parameter grid: penalty C and kernel type of the 'svm' pipeline step.
parameters = dict(svm__C=[1, 10], svm__kernel=['linear', 'rbf'])

# Exhaustive search over the grid, scored by F1 with 10-fold cross-validation.
# verbose=10 logs every individual fit.
clf = GridSearchCV(pipe, parameters, scoring='f1', verbose=10, cv=10)
clf.fit(samples, labels)
print(clf.best_params_, clf.best_score_)


Fitting 10 folds for each of 4 candidates, totalling 40 fits
[CV] svm__kernel=linear, svm__C=1 ....................................
[CV] ..... svm__kernel=linear, svm__C=1, score=0.964912, total=   0.1s
[CV] svm__kernel=linear, svm__C=1 ....................................
[CV] ..... svm__kernel=linear, svm__C=1, score=0.955752, total=   0.1s
[CV] svm__kernel=linear, svm__C=1 ....................................
[CV] ..... svm__kernel=linear, svm__C=1, score=0.944444, total=   0.1s
[CV] svm__kernel=linear, svm__C=1 ....................................
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.3s remaining:    0.0s
[CV] ..... svm__kernel=linear, svm__C=1, score=0.920354, total=   0.1s
[CV] svm__kernel=linear, svm__C=1 ....................................
[CV] ..... svm__kernel=linear, svm__C=1, score=0.963636, total=   0.1s
[CV] svm__kernel=linear, svm__C=1 ....................................
[CV] ..... svm__kernel=linear, svm__C=1, score=0.955752, total=   0.1s
[CV] svm__kernel=linear, svm__C=1 ....................................
[CV] ..... svm__kernel=linear, svm__C=1, score=0.907407, total=   0.1s
[CV] svm__kernel=linear, svm__C=1 ....................................
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.5s remaining:    0.0s
[CV] ..... svm__kernel=linear, svm__C=1, score=0.914286, total=   0.1s
[CV] svm__kernel=linear, svm__C=1 ....................................
[CV] ..... svm__kernel=linear, svm__C=1, score=0.929825, total=   0.1s
[CV] svm__kernel=linear, svm__C=1 ....................................
[CV] ..... svm__kernel=linear, svm__C=1, score=0.880734, total=   0.1s
[CV] svm__kernel=rbf, svm__C=1 .......................................
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.6s remaining:    0.0s
[CV] ........ svm__kernel=rbf, svm__C=1, score=0.954955, total=   0.1s
[CV] svm__kernel=rbf, svm__C=1 .......................................
[CV] ........ svm__kernel=rbf, svm__C=1, score=0.982143, total=   0.1s
[CV] svm__kernel=rbf, svm__C=1 .......................................
[CV] ........ svm__kernel=rbf, svm__C=1, score=0.963636, total=   0.1s
[CV] svm__kernel=rbf, svm__C=1 .......................................
[CV] ........ svm__kernel=rbf, svm__C=1, score=0.972477, total=   0.1s
[CV] svm__kernel=rbf, svm__C=1 .......................................
[CV] ........ svm__kernel=rbf, svm__C=1, score=0.944444, total=   0.1s
[CV] svm__kernel=rbf, svm__C=1 .......................................
[CV] ........ svm__kernel=rbf, svm__C=1, score=0.972477, total=   0.1s
[CV] svm__kernel=rbf, svm__C=1 .......................................
[CV] ........ svm__kernel=rbf, svm__C=1, score=0.915888, total=   0.1s
[CV] svm__kernel=rbf, svm__C=1 .......................................
[CV] ........ svm__kernel=rbf, svm__C=1, score=0.912621, total=   0.1s
[CV] svm__kernel=rbf, svm__C=1 .......................................
[CV] ........ svm__kernel=rbf, svm__C=1, score=0.972973, total=   0.1s
[CV] svm__kernel=rbf, svm__C=1 .......................................
[CV] ........ svm__kernel=rbf, svm__C=1, score=0.915888, total=   0.1s
[CV] svm__kernel=linear, svm__C=10 ...................................
[CV] .... svm__kernel=linear, svm__C=10, score=0.964912, total=   0.1s
[CV] svm__kernel=linear, svm__C=10 ...................................
[CV] .... svm__kernel=linear, svm__C=10, score=0.938053, total=   0.1s
[CV] svm__kernel=linear, svm__C=10 ...................................
[CV] .... svm__kernel=linear, svm__C=10, score=0.954128, total=   0.1s
[CV] svm__kernel=linear, svm__C=10 ...................................
[CV] .... svm__kernel=linear, svm__C=10, score=0.939130, total=   0.1s
[CV] svm__kernel=linear, svm__C=10 ...................................
[CV] .... svm__kernel=linear, svm__C=10, score=0.955752, total=   0.1s
[CV] svm__kernel=linear, svm__C=10 ...................................
[CV] .... svm__kernel=linear, svm__C=10, score=0.955752, total=   0.1s
[CV] svm__kernel=linear, svm__C=10 ...................................
[CV] .... svm__kernel=linear, svm__C=10, score=0.907407, total=   0.1s
[CV] svm__kernel=linear, svm__C=10 ...................................
[CV] .... svm__kernel=linear, svm__C=10, score=0.934579, total=   0.1s
[CV] svm__kernel=linear, svm__C=10 ...................................
[CV] .... svm__kernel=linear, svm__C=10, score=0.939130, total=   0.1s
[CV] svm__kernel=linear, svm__C=10 ...................................
[CV] .... svm__kernel=linear, svm__C=10, score=0.880734, total=   0.1s
[CV] svm__kernel=rbf, svm__C=10 ......................................
[CV] ....... svm__kernel=rbf, svm__C=10, score=0.964286, total=   0.1s
[CV] svm__kernel=rbf, svm__C=10 ......................................
[CV] ....... svm__kernel=rbf, svm__C=10, score=0.982143, total=   0.1s
[CV] svm__kernel=rbf, svm__C=10 ......................................
[CV] ....... svm__kernel=rbf, svm__C=10, score=0.982143, total=   0.1s
[CV] svm__kernel=rbf, svm__C=10 ......................................
[CV] ....... svm__kernel=rbf, svm__C=10, score=0.982456, total=   0.1s
[CV] svm__kernel=rbf, svm__C=10 ......................................
[CV] ....... svm__kernel=rbf, svm__C=10, score=0.982143, total=   0.1s
[CV] svm__kernel=rbf, svm__C=10 ......................................
[CV] ....... svm__kernel=rbf, svm__C=10, score=0.981818, total=   0.1s
[CV] svm__kernel=rbf, svm__C=10 ......................................
[CV] ....... svm__kernel=rbf, svm__C=10, score=0.927273, total=   0.1s
[CV] svm__kernel=rbf, svm__C=10 ......................................
[CV] ....... svm__kernel=rbf, svm__C=10, score=0.953271, total=   0.1s
[CV] svm__kernel=rbf, svm__C=10 ......................................
[CV] ....... svm__kernel=rbf, svm__C=10, score=0.972973, total=   0.1s
[CV] svm__kernel=rbf, svm__C=10 ......................................
[CV] ....... svm__kernel=rbf, svm__C=10, score=0.945455, total=   0.1s
{'svm__kernel': 'rbf', 'svm__C': 10} 0.967414311082
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    3.6s finished

In [6]:
# Take the winning hyper-parameters from the fitted search instead of copying
# them by hand from the printed output, so they cannot go stale when the data
# or the grid changes.
best_c = clf.best_params_['svm__C']
best_kernel = clf.best_params_['svm__kernel']

In [7]:
# Predict a class for every row with the refitted best estimator and store it
# next to the true label. These are the same rows the search was fitted on, so
# the predictions are in-sample.
predicted_labels = clf.predict(samples)
df_dados['label_pred'] = predicted_labels
df_dados.head()


Out[7]:
nu_CPFCNPJ qtdAditivosPorCPFCNPJ qtdContratos vl_TotalContrato vl_Contrato participacoes valor_total_pregao valor_total_tomada valor_total_convite participacoes_pregao participacoes_tomada participacoes_convite label label_pred
0 10140642000140 26 8 415668.77 71257.0 5 0.0 0.0 71257.0 0.0 0.0 5.0 0 0
1 10142026000128 19 3 67887.20 64587.2 2 0.0 0.0 64587.2 0.0 0.0 2.0 0 0
2 10158142000135 11 1 80900.00 80900.0 1 0.0 0.0 80900.0 0.0 0.0 1.0 0 0
3 10171194000141 4 1 45285.00 45285.0 1 0.0 0.0 45285.0 0.0 0.0 1.0 0 0
4 10175041000172 45 4 144250.00 133450.0 5 31050.0 0.0 102400.0 1.0 0.0 4.0 0 0

In [8]:
# Per-class probability estimates for every row.
probs = clf.predict_proba(samples)
# Keep the first probability column, rounded to two decimals.
# NOTE(review): column 0 is presumably the probability of class 0 (first entry
# of clf.classes_), not the probability of the predicted class -- confirm this
# is the intended meaning of 'confianca'.
df_dados['confianca'] = probs[:, 0].round(2)
df_dados.head()


Out[8]:
nu_CPFCNPJ qtdAditivosPorCPFCNPJ qtdContratos vl_TotalContrato vl_Contrato participacoes valor_total_pregao valor_total_tomada valor_total_convite participacoes_pregao participacoes_tomada participacoes_convite label label_pred confianca
0 10140642000140 26 8 415668.77 71257.0 5 0.0 0.0 71257.0 0.0 0.0 5.0 0 0 1.0
1 10142026000128 19 3 67887.20 64587.2 2 0.0 0.0 64587.2 0.0 0.0 2.0 0 0 1.0
2 10158142000135 11 1 80900.00 80900.0 1 0.0 0.0 80900.0 0.0 0.0 1.0 0 0 1.0
3 10171194000141 4 1 45285.00 45285.0 1 0.0 0.0 45285.0 0.0 0.0 1.0 0 0 1.0
4 10175041000172 45 4 144250.00 133450.0 5 31050.0 0.0 102400.0 1.0 0.0 4.0 0 0 1.0

In [9]:
# Persist the table, now including 'label_pred' and 'confianca'.
# NOTE(review): this overwrites the same file that is loaded at the top of the
# notebook, so a later run reads a file that already holds these two columns.
df_dados.to_csv('../data/features.csv', sep=';')

In [ ]: