In [7]:
import pandas
# HTRU2 column names: four summary statistics for each of the two signals
# (the integrated pulse profile and the DM-SNR curve), plus the binary target.
_statistics = ["mean", "standard_deviation", "excess_kurtosis", "skewness"]
features = (
    [stat + "_of_the_integrated_profile" for stat in _statistics]
    + [stat + "_of_the_DM-SNR_curve" for stat in _statistics]
    + ["class"]
)
# The CSV ships without a header row, hence the explicit names=.
data = pandas.read_csv('data/HTRU_2.csv', sep=",", names=features)
labels = data['class']
In [8]:
import numpy
def verify_missing_data(data, features):
    """Count the non-missing entries of each feature and print the counts.

    Parameters
    ----------
    data : pandas.DataFrame
        The loaded dataset.
    features : list of str
        Column names to inspect, in the order they should be reported.

    Returns
    -------
    list of int
        Number of valid (non-NaN) entries per feature, aligned with `features`.
    """
    # BUG FIX: the previous per-row check `type(value) is numpy.float64 or
    # type(value) is numpy.int64` counted NaN as present, because NaN *is* a
    # numpy.float64 — so missing values were never detected. notna() gives the
    # correct count and is vectorized instead of an O(rows) Python loop.
    valid_counts = [int(data[feature].notna().sum()) for feature in features]
    print(valid_counts)
    return valid_counts
# Sanity-check the freshly loaded dataset: report per-feature entry counts.
verify_missing_data(data, features)
Seguindo o exemplo disponível em: http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py
In [9]:
import numpy
# Basic dimensions used by the clustering benchmark below.
# NOTE(review): data.shape counts the 'class' column among the features,
# so number_features includes the target — confirm this is intended.
number_samples, number_features = data.shape
number_labels = numpy.unique(labels).size
In [12]:
from time import time
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
sample_size = 1900  # points sampled for the silhouette metric -- TODO: search for a better value
def bench_k_means(estimator, name, data, true_labels=None, silhouette_sample_size=None):
    """Fit a clustering estimator and print a row of evaluation metrics.

    Parameters
    ----------
    estimator : clustering estimator
        Must expose fit(), inertia_ and labels_ (e.g. sklearn KMeans).
    name : str
        Label printed in the first column of the report row.
    data : array-like
        Feature matrix passed to fit(). NOTE(review): callers below pass the
        full frame including the 'class' column — confirm that label leakage
        into the clustering input is intended.
    true_labels : array-like, optional
        Ground-truth labels for the supervised metrics. Defaults to the
        module-level `labels` for backward compatibility.
    silhouette_sample_size : int, optional
        Sample size for the silhouette score. Defaults to the module-level
        `sample_size`.
    """
    # Previously this function read the globals `labels` and `sample_size`
    # directly, which made it unusable with any other dataset; the keyword
    # defaults keep existing calls working unchanged.
    if true_labels is None:
        true_labels = labels
    if silhouette_sample_size is None:
        silhouette_sample_size = sample_size
    initial_time = time()
    estimator.fit(data)
    execution_time = time() - initial_time
    # metrics
    inertia = estimator.inertia_
    homogeneity_score = metrics.homogeneity_score(true_labels, estimator.labels_)
    completeness_score = metrics.completeness_score(true_labels, estimator.labels_)
    v_measure_score = metrics.v_measure_score(true_labels, estimator.labels_)
    adjusted_rand_score = metrics.adjusted_rand_score(true_labels, estimator.labels_)
    adjusted_mutual_info_score = metrics.adjusted_mutual_info_score(true_labels, estimator.labels_)
    silhouette_score = metrics.silhouette_score(data, estimator.labels_, metric='euclidean', sample_size=silhouette_sample_size)
    # show metrics (one tab-separated row matching the header printed by the caller)
    print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (name, execution_time, inertia, homogeneity_score, completeness_score, v_measure_score,
             adjusted_rand_score, adjusted_mutual_info_score, silhouette_score))
print('init\t\ttime\tinertia\t\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
bench_k_means(KMeans(init='k-means++', n_clusters=number_labels, n_init=10),
name="k-means++", data=data)
bench_k_means(KMeans(init='random', n_clusters=number_labels, n_init=10),
name="random", data=data)
# in this case the seeding of the centers is deterministic, hence we run the
# kmeans algorithm only once with n_init=1
pca = PCA(n_components=number_labels).fit(data)
bench_k_means(KMeans(init=pca.components_, n_clusters=number_labels, n_init=1),
name="PCA-based", data=data)
print(90 * '_')
Também disponível no exemplo: http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py