In [7]:
import pandas

# Column names for the HTRU2 pulsar dataset: 8 numeric features plus the class label.
features = [
    "mean_of_the_integrated_profile",
    "standard_deviation_of_the_integrated_profile",
    "excess_kurtosis_of_the_integrated_profile",
    "skewness_of_the_integrated_profile",
    "mean_of_the_DM-SNR_curve",
    "standard_deviation_of_the_DM-SNR_curve",
    "excess_kurtosis_of_the_DM-SNR_curve",
    "skewness_of_the_DM-SNR_curve",
    "class"
]

# The CSV has no header row, so the column names are supplied via `names=`.
# NOTE(review): path is relative — assumes the notebook runs from the repo root.
data = pandas.read_csv('data/HTRU_2.csv', sep=",", names=features)
# Ground-truth target: 1 = pulsar, 0 = non-pulsar (per the notes in this notebook).
labels = data['class']

Features:

  1. mean_of_the_integrated_profile
  2. standard_deviation_of_the_integrated_profile
  3. excess_kurtosis_of_the_integrated_profile
  4. skewness_of_the_integrated_profile
  5. mean_of_the_DM-SNR_curve
  6. standard_deviation_of_the_DM-SNR_curve
  7. excess_kurtosis_of_the_DM-SNR_curve
  8. skewness_of_the_DM-SNR_curve
  9. class

Labels

  • Pulsar: 1
  • Não Pulsar: 0

Missing Data?

A base de dados não possui missing data: todos os campos dos atributos são preenchidos com valores numpy.float64 e os rótulos com numpy.int64, como confirma a contagem abaixo.


In [8]:
import numpy

def verify_missing_data(data, features):
    missing_data = []
    
    for feature in features:
        count = 0
        for x in range(0, len(data)):
            if type(data[feature][x]) is numpy.float64 or type(data[feature][x]) is numpy.int64:
                count = count + 1
        missing_data.append(count)
    print(missing_data)
    
verify_missing_data(data, features)


[17898, 17898, 17898, 17898, 17898, 17898, 17898, 17898, 17898]

In [9]:
import numpy

# Dataset dimensions.  NOTE(review): `data` still includes the 'class' column,
# so number_features is 9 (8 predictors + the label) — confirm this is intended.
number_samples, number_features = data.shape
# Number of distinct target classes (2: pulsar / non-pulsar).
number_labels = len(numpy.unique(labels))

In [12]:
from time import time
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

sample_size = 1900 # Pesquisar melhor valor para esse parâmetro usado na métrica silhouette

def bench_k_means(estimator, name, data, true_labels=None, silhouette_sample_size=None):
    """Fit a clustering estimator and print a one-line benchmark summary.

    Prints fit time, inertia and six clustering quality metrics in the
    column order of the table header printed by the driver code below.

    Parameters
    ----------
    estimator : clustering estimator (e.g. sklearn.cluster.KMeans)
        Must expose ``.fit``, and ``.inertia_`` / ``.labels_`` after fitting.
    name : str
        Row label for the printed table.
    data : array-like
        Samples to cluster.
    true_labels : array-like, optional
        Ground-truth labels for the supervised metrics.  Defaults to the
        module-level ``labels`` (previously a hard-coded global read).
    silhouette_sample_size : int, optional
        Subsample size for the silhouette score.  Defaults to the
        module-level ``sample_size`` (previously a hard-coded global read).

    Returns
    -------
    dict
        The computed metrics keyed by name (new return value; existing
        callers that ignore it are unaffected).
    """
    if true_labels is None:
        true_labels = labels  # backward-compatible fallback to module global
    if silhouette_sample_size is None:
        silhouette_sample_size = sample_size  # backward-compatible fallback

    initial_time = time()
    estimator.fit(data)
    execution_time = time() - initial_time

    predicted = estimator.labels_  # fetched once instead of per metric
    results = {
        'time': execution_time,
        'inertia': estimator.inertia_,
        'homogeneity': metrics.homogeneity_score(true_labels, predicted),
        'completeness': metrics.completeness_score(true_labels, predicted),
        'v_measure': metrics.v_measure_score(true_labels, predicted),
        'ARI': metrics.adjusted_rand_score(true_labels, predicted),
        'AMI': metrics.adjusted_mutual_info_score(true_labels, predicted),
        # NOTE: silhouette subsampling is random, so this value varies per run
        # unless a random_state is threaded through.
        'silhouette': metrics.silhouette_score(data, predicted, metric='euclidean',
                                               sample_size=silhouette_sample_size),
    }

    # show metrics (same format/columns as before)
    print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (name, results['time'], results['inertia'], results['homogeneity'],
             results['completeness'], results['v_measure'], results['ARI'],
             results['AMI'], results['silhouette']))
    return results

# Benchmark table: one row per initialization strategy.
print(90 * '_')
print('init\t\ttime\tinertia\t\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')

# NOTE(review): `data` still contains the 'class' column, so the ground-truth
# label is being used as a clustering feature in every run below — confirm
# this is intended (dropping it would usually be expected here).
bench_k_means(KMeans(init='k-means++', n_clusters=number_labels, n_init=10),
              name="k-means++", data=data)

bench_k_means(KMeans(init='random', n_clusters=number_labels, n_init=10),
              name="random", data=data)

# in this case the seeding of the centers is deterministic, hence we run the
# kmeans algorithm only once with n_init=1
# (PCA components are used as initial centers; this needs n_components == n_clusters)
pca = PCA(n_components=number_labels).fit(data)
bench_k_means(KMeans(init=pca.components_, n_clusters=number_labels, n_init=1),
              name="PCA-based", data=data)
print(90 * '_')


__________________________________________________________________________________________
init		time	inertia		homo	compl	v-meas	ARI	AMI	silhouette
k-means++	0.44s	122775992	0.031	0.023	0.026	-0.078	0.023	0.599
random   	0.45s	122775992	0.031	0.023	0.026	-0.078	0.023	0.594
PCA-based	0.07s	122777789	0.032	0.023	0.027	-0.078	0.023	0.587
__________________________________________________________________________________________

Visualização

  • Adicionar visualização dos clusters

Também disponível no exemplo: http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py