In [17]:
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
%matplotlib inline
In [18]:
# First, generate a few centroids and scatter some noisy points around each one.
N_CENTROIDS = 4
POINTS_PER_CENTROID = 25
SPREAD = 0.12  # scale applied to the standard-normal noise around a centroid

np.random.seed(seed=101)
somex = np.random.randn(N_CENTROIDS)
centroids = np.array([[x, np.random.randn()] for x in somex])

# Every centroid contributes itself plus POINTS_PER_CENTROID noisy neighbours.
# Collecting rows in a list lets the final array size follow from the constants
# above instead of a hand-computed total and a manually advanced row index.
rows = []
for centroid in centroids:
    rows.append(centroid)
    for _ in range(POINTS_PER_CENTROID):
        # x is drawn before y so the random stream is consumed in a fixed order.
        x = SPREAD * np.random.randn() + centroid[0]
        y = SPREAD * np.random.randn() + centroid[1]
        rows.append([x, y])
X = np.array(rows)

# Points first, centroids last, so the red diamonds are drawn on top.
plt.scatter(X[:, 0], X[:, 1], c='b', marker='*')
plt.scatter(centroids[:, 0], centroids[:, 1], c='r', marker='d')
Out[18]:
In [19]:
# Estimator for the synthetic points: four clusters, matching the four
# centroids generated above. random_state makes the fit repeatable when this
# cell is re-run, and n_init is pinned explicitly because its default value
# differs between scikit-learn releases.
model = KMeans(n_clusters=4, n_init=10, random_state=101)
In [20]:
# Train k-means on the synthetic points. fit() updates the estimator in place
# and returns that same object, so `model` does not need to be rebound; the
# trailing semicolon keeps the notebook from echoing the estimator's repr.
model.fit(X);
In [21]:
# Show the cluster index that the fitted model assigned to each row of X.
print(model.labels_)
In [22]:
# Draw the synthetic points on a fresh figure, coloured by assigned cluster.
fig, ax = plt.subplots()
point_colours = model.labels_.astype(float)
ax.scatter(X[:, 0], X[:, 1], c=point_colours)
plt.show()
In [ ]:
In [ ]:
In [ ]:
In [7]:
import pandas as pd
import numpy as np
from sklearn import svm, datasets
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn import svm
from mpl_toolkits.mplot3d import axes3d, Axes3D
In [8]:
# Load the dataset from the CSV file and preview its first rows.
df = pd.read_csv('evasao.csv')
df.head()
Out[8]:
In [9]:
# Keep only the rows flagged abandonou == 1 and the three columns used for
# clustering, selected in a single .loc step.
feature_columns = ['periodo', 'repetiu', 'desempenho']
df2 = df.loc[df.abandonou == 1, feature_columns]
In [10]:
# Preview the first rows of the filtered subset.
df2.head()
Out[10]:
In [11]:
# 3D scatter of the filtered rows, one axis per clustering feature.
fig = plt.figure()
# Create the 3D axes through the figure so they are actually attached to it.
# Constructing Axes3D(fig) directly no longer registers the axes on recent
# Matplotlib releases, which leaves the figure empty.
ax = fig.add_subplot(111, projection='3d')
ax.scatter(xs=df2['periodo'], ys=df2['repetiu'], zs=df2['desempenho'], c='r', s=8)
ax.set_xlabel('periodo')
ax.set_ylabel('repetiu')
ax.set_zlabel('desempenho')
Out[11]:
In [12]:
# Separate estimator for the df2 subset, again with four clusters.
# random_state makes the fit repeatable when this cell is re-run, and n_init is
# pinned explicitly because its default value differs between scikit-learn
# releases.
model_cp = KMeans(n_clusters=4, n_init=10, random_state=101)
Antes de padronizar a escala das variáveis, verifique se elas já não estão na mesma escala! Recomenda-se sempre colocar as variáveis na mesma escala, embora isso nem sempre gere o melhor resultado. Vamos começar usando as variáveis padronizadas e depois repetir sem padronizá-las.
Finalmente, vamos rodar sem fazer escala das variáveis.
In [13]:
# Cluster the unscaled features. The estimator fitted here must be `model_cp`,
# the one created for this dataset; fitting `model` instead would discard
# `model_cp` and overwrite the estimator trained on the synthetic points.
model_cp = model_cp.fit(df2)
cluster_df = df2.assign(cluster=model_cp.labels_)
print(model_cp.labels_)

# One frame per cluster label (labels 0..3 map to cluster1..cluster4).
cluster1 = cluster_df[cluster_df.cluster == 0]
cluster2 = cluster_df[cluster_df.cluster == 1]
cluster3 = cluster_df[cluster_df.cluster == 2]
cluster4 = cluster_df[cluster_df.cluster == 3]

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# (members, colour, marker, size) for each cluster, drawn in label order.
cluster_styles = [
    (cluster1, 'b', 'd', 18),
    (cluster2, 'r', '*', 18),
    (cluster3, 'g', 'o', 18),
    (cluster4, 'm', '+', 38),
]
for members, colour, marker, size in cluster_styles:
    ax.scatter(xs=members['periodo'], ys=members['repetiu'], zs=members['desempenho'],
               c=colour, marker=marker, s=size)

ax.set_xlabel('periodo')
ax.set_ylabel('repetiu')
ax.set_zlabel('desempenho')

# Cluster centres as black squares; the columns follow df2's column order
# (periodo, repetiu, desempenho).
centers = model_cp.cluster_centers_
ax.scatter(xs=centers[:, 0], ys=centers[:, 1], zs=centers[:, 2], c='k', marker='s', s=100)
Out[13]:
Faz mais sentido rodar a clusterização sem transformar as variáveis, pois já estão na mesma escala.
In [ ]: