In [108]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist, pdist
import seaborn as sns
sns.set()
%matplotlib inline

In [99]:
data = pd.read_csv("./data/UN.csv")
print(data.columns)
data

#Para ver la relación entre las variable podemos usar
#sns.pairplot(data)
#Que grafica scatter plots e histogramas de cada variable.


Index(['country', 'region', 'tfr', 'contraception', 'educationMale',
       'educationFemale', 'lifeMale', 'lifeFemale', 'infantMortality',
       'GDPperCapita', 'economicActivityMale', 'economicActivityFemale',
       'illiteracyMale', 'illiteracyFemale'],
      dtype='object')
Out[99]:
country region tfr contraception educationMale educationFemale lifeMale lifeFemale infantMortality GDPperCapita economicActivityMale economicActivityFemale illiteracyMale illiteracyFemale
0 Afghanistan Asia 6.90 NaN NaN NaN 45.0 46.0 154 2848 87.5 7.2 52.800 85.000
1 Albania Europe 2.60 NaN NaN NaN 68.0 74.0 32 863 NaN NaN NaN NaN
2 Algeria Africa 3.81 52 11.1 9.9 67.5 70.3 44 1531 76.4 7.8 26.100 51.000
3 American.Samoa Asia NaN NaN NaN NaN 68.0 73.0 11 NaN 58.8 42.4 0.264 0.360
4 Andorra Europe NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
5 Angola Africa 6.69 NaN NaN NaN 44.9 48.1 124 355 NaN NaN NaN NaN
6 Antigua America NaN 53 NaN NaN NaN NaN 24 6966 74.4 56.2 NaN NaN
7 Argentina America 2.62 NaN NaN NaN 69.6 76.8 22 8055 76.2 41.3 3.800 3.800
8 Armenia Europe 1.70 22 NaN NaN 67.2 74.0 25 354 65.0 52.0 0.300 0.500
9 Australia Oceania 1.89 76 16.3 16.1 75.4 81.2 6 20046 74.0 53.8 NaN NaN
10 Austria Europe 1.42 71 14.4 14.2 73.7 80.1 6 29006 69.5 47.7 NaN NaN
11 Azerbaijan Asia 2.30 17 NaN NaN 66.5 74.5 33 321 NaN NaN 0.300 0.500
12 Bahamas America 1.95 62 12.1 13.2 70.5 77.1 14 12545 81.2 67.0 1.500 2.000
13 Bahrain Asia 2.97 53 12.6 13.3 71.1 75.3 18 9073 88.2 29.2 10.900 20.600
14 Bangladesh Asia 3.14 49 NaN NaN 58.1 58.2 78 280 88.8 55.9 50.600 73.900
15 Barbados America 1.73 55 NaN NaN 73.6 78.7 9 7173 73.4 61.4 2.000 3.200
16 Belarus Europe 1.40 50 NaN NaN 64.4 74.8 15 994 76.4 61.3 0.300 0.600
17 Belgium Europe 1.62 79 15.6 15.4 73.9 80.6 7 26582 NaN NaN NaN NaN
18 Belize America 3.66 47 10.6 10.4 73.4 76.1 30 2569 79.0 34.0 21.252 23.472
19 Benin Africa 5.83 16 NaN NaN 52.4 57.2 84 391 90.0 57.8 51.300 74.200
20 Bhutan Asia 5.89 19 NaN NaN 51.6 54.9 104 166 NaN NaN 43.800 71.900
21 Bolivia America 4.36 45 NaN NaN 59.8 63.2 66 909 74.1 56.3 9.500 24.000
22 Bosnia Europe 1.40 NaN NaN NaN 70.5 75.9 13 271 NaN NaN NaN NaN
23 Botswana Africa 4.45 33 10.5 10.7 48.9 51.7 56 3640 75.4 41.7 19.500 40.100
24 Brazil America 2.17 74 NaN NaN 63.4 71.2 42 4510 84.0 53.6 16.700 16.800
25 Brunei Asia 2.70 NaN 11.8 12.1 73.4 78.1 9 16683 82.2 46.4 7.400 16.600
26 Bulgaria Europe 1.45 NaN 11.8 12.5 67.8 74.9 16 1518 60.7 53.8 0.924 2.376
27 Burkina.Faso Africa 6.57 8 3.3 2.0 45.1 47.0 97 165 88.9 79.4 70.500 90.800
28 Burundi Africa 6.28 9 5.1 4.0 45.5 48.8 114 205 90.1 90.6 50.700 77.500
29 Cambodia Asia 4.50 NaN NaN NaN 52.6 55.4 102 130 77.3 84.7 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
177 Swaziland Africa 4.46 20 11.5 10.8 57.7 62.3 65 1389 64.3 27.7 22.000 24.400
178 Sweden Europe 1.80 78 13.9 14.5 76.2 80.8 5 26253 80.0 75.6 NaN NaN
179 Switzerland Europe 1.46 71 14.5 13.5 75.3 81.8 5 42416 78.5 56.8 NaN NaN
180 Syria Asia 4.00 36 9.8 8.5 66.7 71.2 33 3573 NaN NaN 14.300 44.200
181 Tajikistan Asia 3.93 21 NaN NaN 64.2 70.2 56 122 75.0 60.0 0.300 0.400
182 Tanzania Africa 5.48 18 NaN NaN 50.0 52.8 80 139 NaN NaN 20.600 43.200
183 Thailand Asia 1.74 74 NaN NaN 66.3 72.3 30 2896 83.8 65.2 4.000 8.400
184 Togo Africa 6.08 12 NaN NaN 48.8 51.5 86 322 77.5 50.8 33.000 63.000
185 Tonga Oceania 4.02 74 NaN NaN 67.0 71.0 3 1787 74.2 45.4 0.264 0.504
186 Trinidad.and.Tobago America 2.10 53 10.1 11.3 71.5 76.2 14 4083 75.5 44.9 1.200 3.000
187 Tunisia Africa 2.92 60 NaN NaN 68.4 70.7 37 2030 75.4 20.3 21.400 45.400
188 Turkey Asia 2.50 63 10.6 8.7 66.5 71.7 44 2814 75.9 30.6 8.300 27.600
189 Turkmenistan Asia 3.58 20 NaN NaN 61.2 68.0 57 321 78.0 62.0 0.200 0.400
190 Tuvalu Oceania NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
191 Uganda Africa 7.10 15 NaN NaN 40.4 42.3 113 305 NaN NaN 26.300 49.800
192 Ukraine Europe 1.38 23 NaN NaN 63.6 74.0 18 694 69.1 57.1 0.330 2.160
193 United.Arab.Emirates Asia 3.46 NaN 9.8 10.3 73.9 76.5 15 17690 92.5 24.2 21.100 20.200
194 United.Kingdom Europe 1.72 82 16.1 16.6 74.5 79.8 6 18913 71.9 53.5 NaN NaN
195 United.States America 1.96 71 15.4 16.2 73.4 80.1 7 26037 74.9 59.3 2.244 2.232
196 Uruguay America 2.25 NaN NaN NaN 69.6 76.1 17 5602 74.0 46.7 3.100 2.300
197 Uzbekistan Asia 3.48 56 NaN NaN 64.3 70.7 43 435 75.0 61.0 0.200 0.400
198 Vanuatu Oceania 4.36 15 NaN NaN 65.5 69.5 38 1289 88.6 79.3 34.914 46.368
199 Venezuela America 2.98 52 10.2 10.7 70.0 75.7 21 3496 82.1 41.2 8.200 9.700
200 Viet.Nam Asia 2.97 65 NaN NaN 64.9 69.6 37 270 81.6 74.1 3.500 8.800
201 Virgin.Islands America 3.03 NaN NaN NaN NaN NaN 12 NaN 72.3 59.5 NaN NaN
202 Western.Sahara Africa 3.98 NaN NaN NaN 59.8 63.1 64 NaN NaN NaN NaN NaN
203 Yemen Asia 7.60 7 NaN NaN 57.4 58.4 80 732 80.6 1.9 32.406 69.552
204 Yugoslavia Europe 1.80 NaN NaN NaN 69.8 75.3 19 1487 NaN NaN 1.782 9.072
205 Zambia Africa 5.49 25 7.9 6.8 42.2 43.7 103 382 NaN NaN 14.400 28.700
206 Zimbabwe Africa 4.68 48 NaN NaN 47.6 49.4 68 786 77.7 46.7 9.600 20.100

207 rows × 14 columns


In [109]:
#Habiendo analizado los datos, se elige las siguientes features, y se elimina las entradas con NA
features = ["country", "lifeMale", "lifeFemale", "GDPperCapita", "infantMortality"]
data = data[features].copy()
data.dropna(inplace=True)
X = data[data.columns[1:]] #Me deshago del país, estos son los datos a modelizar

In [107]:
km = KMeans(3, init='k-means++') # initialize
#dtf = model.fit_transform(X)
#plt.scatter(x=dtf[:,0],y=dtf[:,1], cmap = plt.cm.get_cmap("RdYlGn"))
#plt.plot(data[c==0]["lifeMale"],data[c==0]["GDPperCapita"],'ro')
#plt.plot(data[c==1]["lifeMale"],data[c==1]["GDPperCapita"],'go')
#plt.plot(data[c==2]["lifeMale"],data[c==2]["GDPperCapita"],'bo')

Para encontrar significado a los datos, podemos reducir las dimensiones, pero en este caso no tiene mucho sentido o generar clusters de datos, que permite separar en diferentes casos. El método de K-Means necesita a priori saber la cantidad de clusters, para lo cual vamos a usar el método del codo


In [116]:
K = range(1,10)
KM = [KMeans(n_clusters=k).fit(X) for k in K] #Listas de modelos, todos con un k diferente.
centroids = [k.cluster_centers_ for k in KM]  #Centros de los clusters, donde voy a definir distancia

D_k = [cdist(X, cent, 'euclidean') for cent in centroids]  #Distancias de los puntos a los centros
dist = [np.min(D,axis=1) for D in D_k]
avgWithinSS = [sum(d)/X.shape[0] for d in dist] #Promedio pesado en la cantidad de datos


#Acá graficamos laa distancia promedio frente a la cantidad de clusters.
fig = plt.figure()
plt.plot(K, avgWithinSS, 'b*-')
plt.grid(True)
plt.xlabel('Cantidad de clusters')
plt.ylabel('Promedio de distancia a los centroides');



In [87]:
digits = load_digits()
digits.data.shape
model = Isomap(n_components=2)
model.fit(digits.data)
dtf = model.fit_transform(digits.data)
plt.scatter(x=dtf[:,0],y=dtf[:,1], c = digits.target, cmap = plt.cm.get_cmap("RdYlGn"))
plt.colorbar()


C:\Users\sadeus\Anaconda3\lib\site-packages\sklearn\base.py:175: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() instead
  args, varargs, kw, default = inspect.getargspec(init)
C:\Users\sadeus\Anaconda3\lib\site-packages\sklearn\base.py:175: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() instead
  args, varargs, kw, default = inspect.getargspec(init)
Out[87]:
<matplotlib.colorbar.Colorbar at 0x22bd73cc50>
C:\Users\sadeus\Anaconda3\lib\site-packages\matplotlib\collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  self.set_edgecolor(c)