In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sb
from scipy.spatial.distance import cdist
%matplotlib inline
In [40]:
# Read the feature table: ';'-separated, first CSV column used as the index,
# and the nu_CPFCNPJ identifier parsed as text rather than as a number.
df_dados = pd.read_csv(
    '../data/features.csv',
    sep=';',
    index_col=0,
    dtype={'nu_CPFCNPJ': str},
)
print(df_dados.shape)
df_dados.head()
Out[40]:
In [41]:
# Read the tab-separated CEIS file with the sanctioned party's identifier as
# text, rename that column to nu_CPFCNPJ (the key df_dados is merged on) and
# discard rows where the identifier is missing.
df_inidon = (
    pd.read_csv(
        '../data/ceis102016.csv',
        sep='\t',
        dtype={'CPF ou CNPJ do Sancionado': str},
    )
    .rename(columns={'CPF ou CNPJ do Sancionado': 'nu_CPFCNPJ'})
    .dropna(subset=['nu_CPFCNPJ'])
)
print(df_inidon.shape)
df_inidon.head()
Out[41]:
In [42]:
# Keep only the rows of df_dados whose nu_CPFCNPJ appears in the CEIS list.
# The CEIS keys are de-duplicated first: an inner merge emits one output row
# per matching key pair, so an identifier that occurs several times in
# df_inidon would otherwise repeat the same df_dados row in df_merge.
df_merge = df_dados.merge(
    df_inidon[['nu_CPFCNPJ']].drop_duplicates(),
    on='nu_CPFCNPJ',
)
print(df_merge.shape)
df_merge.head()
Out[42]:
In [43]:
def plot_samples(samples, labels):
    """Scatter-plot the first two columns of ``samples``, coloured by label.

    Parameters
    ----------
    samples : array-like, shape (n_samples, >= 2)
        Points to draw; column 0 is the x axis and column 1 the y axis.
    labels : array-like, shape (n_samples,)
        Label of each row. Rows labelled 0 are drawn in red and rows
        labelled 1 in blue; rows with any other label are not drawn.

    Returns
    -------
    None
        The points are drawn on a newly created 10x6 matplotlib figure.
    """
    samples = np.asarray(samples)
    labels = np.asarray(labels)
    plt.figure(figsize=(10, 6))
    for label, color in zip([0, 1], ['red', 'blue']):
        mask = labels == label
        # Both coordinates come from the ``samples`` argument. Taking the y
        # values from a global ``samples_pca`` would plot another array's data
        # (or fail when that global is undefined or has a different length).
        plt.scatter(samples[mask, 0], samples[mask, 1], c=color)
def process_samples(samples, random_state=0):
    """Standardise ``samples``, split them into two k-means clusters and plot
    the clusters on the first two principal components.

    Prints the number of samples in each cluster and the cumulative
    explained-variance ratio of the two components, then calls
    ``plot_samples`` with the 2-D projection and the cluster labels.

    Parameters
    ----------
    samples : array-like, shape (n_samples, n_features)
        Numeric feature matrix.
    random_state : int or None, default 0
        Seed forwarded to ``KMeans``. Pass ``None`` for an unseeded run.

    Returns
    -------
    None
    """
    scaled = StandardScaler().fit_transform(samples)

    # Seed the clustering: without it the k-means initialisation is random,
    # so which cluster gets id 0 (red) or 1 (blue) can change between runs.
    clusterer = KMeans(n_clusters=2, random_state=random_state)
    labels = clusterer.fit_predict(scaled)
    print(np.bincount(labels))

    pca = PCA(n_components=2)
    samples_pca = pca.fit_transform(scaled)
    print(np.cumsum(pca.explained_variance_ratio_))

    plot_samples(samples_pca, labels)
In [54]:
# Cluster every entity in standardised feature space, project onto the first
# two principal components, and draw the clusters with the CEIS-listed
# entities (df_merge) overlaid. The figure is saved to ../figures/PCA.png.

# Identifier and label/prediction columns are not features. errors='ignore'
# lets the cell also run on a table that does not carry those columns yet.
non_feature_cols = ['nu_CPFCNPJ', 'label', 'label_pred', 'confianca']
samples = df_dados.drop(non_feature_cols, axis=1, errors='ignore').values
samples_merge = df_merge.drop(non_feature_cols, axis=1, errors='ignore').values

# The scaler is fitted on the full population only and then applied to both
# sets, so the CEIS entities live in the same standardised space.
std = StandardScaler()
std.fit(samples)
samples = std.transform(samples)
samples_merge = std.transform(samples_merge)

# random_state pins the k-means initialisation so the cluster ids (and hence
# the legend captions below) are the same on every run.
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(samples)
labels = kmeans.predict(samples)
print(np.bincount(labels))

# PCA is fitted on the full population; the CEIS entities are projected with
# the same components. samples_merge now holds their 2-D coordinates.
pca = PCA(n_components=2)
pca.fit(samples)
samples_pca = pca.transform(samples)
samples_merge = pca.transform(samples_merge)
print(np.cumsum(pca.explained_variance_ratio_))

plt.figure(figsize=(16,10))
# NOTE(review): k-means cluster ids carry no meaning of their own. Confirm
# that cluster 0 is the one that should be captioned 'Possíveis Inidôneas'
# (e.g. by checking which cluster the green CEIS points fall into).
p1 = plt.scatter(samples_pca[labels==0,0], samples_pca[labels==0,1], c='red', label='Possíveis Inidôneas')
# Caption corrected from the misspelt 'Enedôneas'.
p2 = plt.scatter(samples_pca[labels==1,0], samples_pca[labels==1,1], c='blue', label='Idôneas')
p3 = plt.scatter(samples_merge[:,0], samples_merge[:,1], s=100, c='green', label='Inidôneas do CEIS')
plt.legend(handles=[p1, p2, p3])
plt.savefig('../figures/PCA.png', dpi=300)
In [ ]:
# Cluster and plot using feature columns only. df_dados can carry a 'label'
# column (it is assigned later in this notebook and written back to
# features.csv) as well as 'label_pred' / 'confianca'; dropping only the
# identifier would feed those values into the clustering as features.
# errors='ignore' covers tables that do not have those columns.
samples = df_dados.drop(
    ['nu_CPFCNPJ', 'label', 'label_pred', 'confianca'],
    axis=1,
    errors='ignore',
).values
process_samples(samples)
In [24]:
# Label every entity by its distance, in standardised feature space, to the
# CEIS-listed entities: 0 when it lies within `threshold` of at least one of
# them, 1 otherwise.

# Identifier and label/prediction columns are not features; keeping them
# would let a previously stored 'label' influence the distances.
# errors='ignore' covers tables that do not have those columns.
non_feature_cols = ['nu_CPFCNPJ', 'label', 'label_pred', 'confianca']
samples = df_dados.drop(non_feature_cols, axis=1, errors='ignore').values
samples_merge = df_merge.drop(non_feature_cols, axis=1, errors='ignore').values

# Scale both sets with statistics fitted on the full population.
std = StandardScaler()
std.fit(samples)
samples = std.transform(samples)
samples_merge = std.transform(samples_merge)

# dist[i, j] is the distance from entity i to CEIS entity j
# (cdist's default metric is Euclidean).
dist = cdist(samples, samples_merge)

# Use the distance to the *closest* CEIS entity. Reading only dist[:, 0]
# would compare against the first CEIS row and ignore all the others.
nearest = dist.min(axis=1)

threshold = 1.0
# Build the label array here instead of mutating a `labels` array left over
# from another cell, so the cell also runs on a fresh kernel.
labels = np.where(nearest < threshold, 0, 1)
print(np.bincount(labels))
print(samples.shape, samples_merge.shape, dist.shape)
print(dist.min(), dist.max(), dist.mean())
# plot_samples is given the standardised feature matrix, not a PCA projection.
plot_samples(samples, labels)
In [25]:
# Store the computed labels as the 'label' column and preview the table.
df_dados = df_dados.assign(label=labels)
df_dados.head()
Out[25]:
In [26]:
# Write the table back to the ';'-separated features file that df_dados was
# loaded from; this overwrites that file.
df_dados.to_csv('../data/features.csv', sep=';')
In [ ]: