In [1]:
import parser_macl as ps

In [2]:
# Re-import the helper module so that edits made to parser_macl on disk are
# picked up without restarting the kernel (Python 2 builtin `reload`).
reload(ps)

# Location of the ICSD ternary-compounds table, relative to this notebook.
ICSD_TERNARIES_CSV = "../ICSD/icsd-ternaries.csv"

# NOTE(review): `read_data` is defined in parser_macl; presumably it returns a
# pandas DataFrame (a later cell indexes a "Composition" column) -- confirm.
dt = ps.read_data(ICSD_TERNARIES_CSV)

In [3]:
from sklearn.preprocessing import StandardScaler

In [3]:
# Convert the loaded table into the array form consumed by scikit-learn below.
# NOTE(review): the exact layout is decided by parser_macl.get_array_form;
# presumably one row per compound and one column per feature -- confirm.
dt_array = ps.get_array_form(dt)

In [5]:
# Standardize every column of dt_array before clustering. The fitted scaler
# is kept around so that the same transformation can be re-applied later.
scaler = StandardScaler()
scaler.fit(dt_array)
dt_scaled = scaler.transform(dt_array)

In [6]:
from sklearn.cluster import KMeans

In [7]:
# K-means with 10 clusters; the fixed random_state makes the centroid
# initialisation (and therefore the cluster numbering) reproducible.
kmeans = KMeans(
    n_clusters=10,
    random_state=42,
)

In [8]:
# Fit the model on the standardized data and take the cluster index assigned
# to each row (this is what `fit_predict` returns).
labels = kmeans.fit(dt_scaled).labels_

In [9]:
from collections import Counter

# Number of rows assigned to each cluster label.
# Parenthesised form prints the same text under Python 2 and is also valid
# Python 3 syntax.
print(Counter(labels))


Counter({5: 4548, 4: 3914, 2: 3811, 8: 3171, 3: 2969, 7: 2966, 1: 2776, 6: 702, 0: 645, 9: 637})

In [10]:
# Boolean mask selecting the rows that k-means assigned to cluster 5, then the
# "Composition" entries of exactly those rows.
in_cluster_5 = (labels == 5)
dt["Composition"][in_cluster_5]


Out[10]:
10        Ge1 Pt1 Se1
61        Ag3 As1 Se3
63          Hg2 P2 S7
64          Hg2 P2 S6
125        In2 Co3 S2
156        Fe1 Bi1 O3
163        Zr1 Si1 S1
165       Zr1 Si1 Te1
166        Zr1 Ge1 S1
168       Zr1 Ge1 Te1
169        Zn1 In2 S4
203        Te2 Au1 I1
230        Ag1 As1 S2
242       Ag1 As1 Se2
260       In1 Mo6 Se8
263         Ta1 P1 S6
272         Ta2 C1 S2
304       Al2 Cd1 Se4
307       Al2 Cd1 Te4
311       Cd1 Ga2 Te4
312       Cd1 In2 Se4
314       Zn1 In2 Te4
321       Hf1 Si1 Te1
324       Hf1 Ge1 Te1
353         Ta3 O7 F1
370       In1 Cu1 Se2
371       In1 Cu1 Te2
374       Al1 Ag1 Se2
375       Al1 Ag1 Te2
380       In1 Ag1 Te2
             ...     
25980     Rb1 Zn4 As3
25981      Rb1 Zn4 P3
25990     Sr2 Zn1 Bi2
26000     Ce1 Mg1 Co4
26001       Cd1 B2 H8
26016      Ta7 B8 Ru6
26018      Ta7 B8 Rh6
26019    Dy4 Cr1 Ga12
26036     Er4 Sb6 Rh7
26037     Er2 Mg2 Ru1
26039     La1 Sc1 Sb1
26041     Sm1 Sc1 Sb1
26042     Tb1 Sc1 Sb1
26045     Er2 Mg3 Ru1
26053      K4 Zn1 Bi2
26060      Ta3 B2 Ru5
26061     Li3 La1 As2
26071     La8 In3 Co2
26073     La3 Ga2 Ru2
26074      Ta1 B1 Ru1
26075      Ta2 B2 Os1
26092      La1 P1 Rh1
26098       Sm1 B2 H8
26099      K6 Zn1 Bi5
26102      Sc2 B1 Ir6
26110     La1 Ga2 Rh3
26115     Sc5 In4 Co2
26117       Al1 B1 W1
26122     Ce2 Al1 Ru2
26132     Sm1 Mg2 Sb2
Name: Composition, dtype: object

In [7]:
# Work on the first 10000 rows only: a full pairwise distance matrix grows
# quadratically with the number of rows.
# NOTE(review): this slices the *unscaled* dt_array, whereas the clustering
# above was run on dt_scaled -- confirm that raw features are intended here.
dt_slice = dt_array[:10000]

In [5]:
from sklearn.metrics.pairwise import euclidean_distances

In [8]:
# Pairwise Euclidean distances between every pair of rows in dt_slice; with a
# single argument the result is a square (n_rows, n_rows) matrix.
dist = euclidean_distances(dt_slice)

In [16]:
# Sanity check: the pairwise distance matrix should be square, with one
# row/column per row of dt_slice.
# NOTE(review): the saved output of this cell shows (5000, 5000), which does
# not match the 10000-row slice taken above -- the output appears to come from
# an earlier run; re-run the notebook top to bottom to refresh it.
dist.shape


Out[16]:
(5000, 5000)

In [17]:
import matplotlib.pyplot as plt


/usr/local/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

In [18]:
%matplotlib inline
plt.figure(figsize=(10,10))
plt.imshow(euclidean_distances(dist))
plt.colorbar()


Out[18]:
<matplotlib.colorbar.Colorbar at 0x10ec593d0>

In [ ]: