In [1]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Data set: UCI "User Knowledge Modeling"
# https://archive.ics.uci.edu/ml/datasets/User+Knowledge+Modeling
# header=None: no row of the file is used as a header, so the columns are
# addressed by integer position (0, 1, 2, ...) in the rest of the notebook.
data = pd.read_csv(filepath_or_buffer='user_knowledge_level.csv', header=None)
data.head(n=5)
Out[2]:
In [3]:
# Replace the values in column 5 with their integer category codes so the
# column can be handed directly to the `c=` colour argument of the scatter
# plots further down.
data[5] = data[5].astype('category').cat.codes
data.head(n=5)
Out[3]:
In [4]:
# Exploratory pairwise scatter plots (not very informative), coloured by the
# label codes in column 5.
# The plotted columns are taken from the same slice (`[:5]`) that the
# clustering cells use, so every feature is covered; each unordered pair is
# drawn once, since the swapped-axes version shows the same points.
feature_columns = list(data.columns[:5])
for pos, i in enumerate(feature_columns):
    for j in feature_columns[pos + 1:]:
        fig, ax = plt.subplots()
        ax.scatter(data[i], data[j], c=data[5])
        # Label the axes so the individual figures can be told apart.
        ax.set_xlabel('column {}'.format(i))
        ax.set_ylabel('column {}'.format(j))
        plt.show()
        # Release the figure; many figures are created inside this loop.
        plt.close(fig)
In [5]:
# Fit k-means on the first five columns (column 5 holds the label and is
# deliberately excluded).
# n_init is pinned explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4 (with a FutureWarning in 1.2-1.3), so leaving it implicit
# makes the fitted clusters depend on the installed version even though
# random_state is fixed.
# NOTE(review): n_clusters=5 is kept as-is; confirm it matches the number of
# distinct labels in column 5.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0).fit(data.iloc[:, :5])
In [6]:
# Assign each row to a cluster using the fitted k-means model.
predictions = kmeans.predict(data.iloc[:, :5])

# Project the same five columns onto two principal components so the
# points can be drawn in 2-D; column 2 of the frame carries the cluster id.
reduced_data = PCA(n_components=2).fit_transform(data.iloc[:, :5])
reduced_data_df = pd.DataFrame(reduced_data)
reduced_data_df[2] = predictions

# k-means clusters
fig, ax = plt.subplots()
ax.scatter(reduced_data_df[0], reduced_data_df[1], c=reduced_data_df[2])
ax.set_title('k-means clusters')
ax.set_xlabel('principal component 1')
ax.set_ylabel('principal component 2')
plt.show()

# Original labels (column 5 of `data`) on the same projection.
# Cluster ids are arbitrary, so colours are not comparable between the two
# figures; compare the shape of the groupings instead.
fig, ax = plt.subplots()
ax.scatter(reduced_data_df[0], reduced_data_df[1], c=data[5])
ax.set_title('Original labels')
ax.set_xlabel('principal component 1')
ax.set_ylabel('principal component 2')
plt.show()
In [ ]: