In [1]:
import fingerprint as fp


/usr/local/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

In [2]:
# Load the full collection of structures from a local pickle file.
# NOTE(review): presumably fp.read_pickle wraps pickle.load -- confirm; if so,
# only load files from a trusted source, since unpickling can execute code.
s_all=fp.read_pickle("struct_all.pickle")

In [4]:
import pandas as pd
import tqdm

In [5]:
# Take the first 250 structures and compute one fingerprint per structure for
# each of the three observables ("ones", "Z", "Chi"), always with the same
# rmax/delta settings.
s250 = s_all[0:250]
f_ones_250 = [
    fp.get_phi(struct, obser="ones", rmax=12, delta=0.05)
    for struct in tqdm.tqdm_notebook(s250)
]
f_Z_250 = [
    fp.get_phi(struct, obser="Z", rmax=12, delta=0.05)
    for struct in tqdm.tqdm_notebook(s250)
]
f_Chi_250 = [
    fp.get_phi(struct, obser="Chi", rmax=12, delta=0.05)
    for struct in tqdm.tqdm_notebook(s250)
]





In [6]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

In [12]:
# Overlay the "Z" fingerprints of the first two structures on a shared axis,
# labelling each curve with the structure's composition.
plt.figure(figsize=(10,10))
# np.linspace requires an integer number of samples. 12/0.05 evaluates to a
# float, which older NumPy only tolerated with a deprecation warning and newer
# NumPy rejects with a TypeError, so convert it explicitly.
n_samples=int(round(12/0.05))
r=np.linspace(0.05,12,n_samples)
for i in range(2):
    lab=s250[i].composition
    plt.plot(r,f_Z_250[i],label=lab)
plt.legend()


Out[12]:
<matplotlib.legend.Legend at 0x12830d1d0>

In [14]:
# Tabulate the "ones" fingerprints: one DataFrame row per fingerprint.
# NOTE: df_ones is rebound a few cells later to a frame with a single
# "phi_ones" column, so this wide layout is not what later cells operate on.
df_ones=pd.DataFrame(f_ones_250)

In [15]:
# Gram matrix of the "ones" fingerprints: metric_ones[i, j] = f_i . f_j.
# A single matrix product replaces the original 250*250 separate np.dot calls,
# and the matrix size now follows the number of fingerprints instead of a
# hard-coded 250. Requires every fingerprint to be a 1-D array of equal length,
# which the original element-wise np.dot/reshape already relied on.
_phi_ones = np.vstack(f_ones_250)
metric_ones = np.dot(_phi_ones, _phi_ones.T)

In [16]:
# Gram matrix of the "Z" fingerprints: metric_Z[i, j] = fZ_i . fZ_j.
# The original line was copy-pasted from the previous cell and still read
# f_ones_250, which made metric_Z an exact duplicate of metric_ones.
metric_Z=np.array([np.dot(f_Z_250[i],f_Z_250[j]) for i in range(250) for j in range(250)]).reshape(250,250)

In [17]:
# Inner product of the first "ones" fingerprint with each of the 250
# fingerprints. NOTE: despite the variable name this is a dot product, not a
# metric distance.
_reference_phi = f_ones_250[0]
dist = np.array([np.dot(_reference_phi, f_ones_250[k]) for k in range(250)])

In [18]:
# Rebuild df_ones with a single column, "phi_ones", where each cell holds one
# whole fingerprint (one row per entry of f_ones_250).
df_ones=pd.DataFrame({"phi_ones":f_ones_250})

In [19]:
# Attach the reference dot products as a column, then build a copy of the
# frame ordered by that column (ascending) with the helper column removed.
df_ones["dist"]=dist

# DataFrame.sort(columns=...) is deprecated (see the FutureWarning this cell
# emitted) and was later removed from pandas; sort_values(by=...) is the
# drop-in replacement.
dt_sort=df_ones.sort_values(by="dist").drop("dist",axis=1)


/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:3: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  app.launch_new_instance()

In [20]:
# Collect the composition of each of the first 250 structures, in order.
comps = []
for idx in range(250):
    comps.append(s250[idx].composition)

In [21]:
# Add the compositions as a column; comps was built from s250 in index order,
# the same order the fingerprints in df_ones were computed in.
df_ones["Composition"]=comps

In [22]:
# Order the rows by descending dot product with the reference fingerprint.
# DataFrame.sort is deprecated (see the FutureWarning this cell emitted);
# sort_values(by=...) is the supported equivalent.
dt_sort=df_ones.sort_values(by="dist",ascending=False)


/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  if __name__ == '__main__':

In [23]:
# Relabel the sorted rows 0..n-1 in place so that label i is the i-th row of
# the sorted frame.
n_rows = dt_sort.shape[0]
dt_sort.set_index(np.arange(n_rows), inplace=True)

In [24]:
# Print the compositions of the first five rows of dt_sort.
# The original used the Python-2-only print statement; print(...) with a single
# argument produces the same output on Python 2 and is valid on Python 3.
for i in range(5):
    print(dt_sort["Composition"][i])


Nb1 Ag1 O3
Ni1 Ag1 F3
Mg1 Ag1 F3
Zn1 Ag1 F3
S2 Bi1 Ag1

In [25]:
# Pull the fingerprints out of dt_sort in its current row order, as a NumPy
# array with one fingerprint per element.
sorted_f_ones=dt_sort['phi_ones'].values

In [26]:
# Gram matrix of the fingerprints in sorted order:
# metric_ones[i, j] = sorted_f_ones[i] . sorted_f_ones[j].
# sorted_f_ones holds one 1-D array per element, so stack it into a 2-D matrix
# first; one matrix product then replaces the original 250*250 np.dot calls and
# removes the hard-coded size.
_phi_sorted = np.vstack(list(sorted_f_ones))
metric_ones = np.dot(_phi_sorted, _phi_sorted.T)

In [27]:
# Heat map of the (sorted) dot-product matrix with a colour scale.
# The diagonal is left untouched; zeroing it out was tried and disabled.
plt.figure(figsize=(15, 15))
gram_view = metric_ones[0:250, 0:250]
plt.imshow(gram_view)
plt.colorbar()


Out[27]:
<matplotlib.colorbar.Colorbar at 0x11e4e2950>

In [73]:
import sklearn

In [74]:
from sklearn.metrics.pairwise import euclidean_distances

In [79]:
# Pairwise Euclidean distances between the sorted fingerprints.
# euclidean_distances (imported from sklearn.metrics.pairwise in the cell
# above) evaluates the same |a|^2 + |b|^2 - 2 a.b expansion as the original
# double loop, but vectorised, and it clips tiny negative round-off before the
# square root, so near-identical fingerprints can no longer produce NaN.
# The progress bar is dropped because the computation is now a single call.
euclid = euclidean_distances(np.vstack(list(sorted_f_ones)))




In [82]:
# Heat map of the pairwise Euclidean distance matrix with a colour scale.
plt.figure(figsize=(15, 15))
distance_view = euclid[0:250, 0:250]
plt.imshow(distance_view)
plt.colorbar()


Out[82]:
<matplotlib.colorbar.Colorbar at 0x11a183790>

In [85]:
# Total number of structures loaded from the pickle.
len(s_all)


Out[85]:
17311

In [95]:
# Rank the "ones" fingerprints by squared Euclidean distance to the first
# fingerprint, then show the full pairwise distance matrix in that order.
dist2=np.array([np.dot(f_ones_250[0]-f_ones_250[i],f_ones_250[0]-f_ones_250[i]) for i in range(250)])
df_ones["dist2"]=dist2
# DataFrame.sort is deprecated (see the FutureWarning this cell emitted);
# sort_values(by=...) is the supported equivalent.
dt_sort=df_ones.sort_values(by="dist2").drop("dist2",axis=1)
dt_sort.set_index(np.arange(dt_sort.shape[0]),inplace=True)
sorted_f_ones=dt_sort['phi_ones'].values
# Vectorised replacement for the hand-written |a|^2 + |b|^2 - 2 a.b double
# loop; euclidean_distances (already imported from sklearn.metrics.pairwise)
# also clips negative round-off before the square root, avoiding NaN entries.
euclid=euclidean_distances(np.vstack(list(sorted_f_ones)))
plt.figure(figsize=(12,12))
plt.imshow(euclid[0:250,0:250])
plt.colorbar()


/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:3: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  app.launch_new_instance()

Out[95]:
<matplotlib.colorbar.Colorbar at 0x1346657d0>

In [99]:
# Zoom in on the first 100x100 block of the distance matrix; 'nearest'
# interpolation keeps each matrix entry as a sharp pixel.
plt.figure(figsize=(10, 10))
distance_corner = euclid[0:100, 0:100]
plt.imshow(distance_corner, interpolation='nearest')
plt.colorbar()


Out[99]:
<matplotlib.colorbar.Colorbar at 0x135c9d690>

In [96]:
# Show the compositions of the first 20 rows in the current dt_sort order.
dt_sort["Composition"][0:20]


Out[96]:
0      (O, Nb, Ag)
1      (F, Ni, Ag)
2      (F, Mg, Ag)
3      (F, Zn, Ag)
4      (O, Nb, Ag)
5       (O, W, Ag)
6      (O, Te, Ag)
7      (Hf, F, Ag)
8      (S, Na, Ag)
9     (Se, Ho, Ag)
10     (O, Te, Ag)
11    (Se, Er, Ag)
12     (O, Ru, Ag)
13     (O, Nb, Ag)
14     (F, Sn, Ag)
15    (As, Nd, Ag)
16      (O, W, Ag)
17     (Ge, O, Ag)
18     (S, Bi, Ag)
19     (F, Ru, Ag)
Name: Composition, dtype: object

In [102]:
# Same ranking as for the "ones" fingerprints, but for the "Chi" fingerprints:
# order by squared Euclidean distance to the first one and show the pairwise
# distance matrix in that order.
df_Chi=pd.DataFrame({"phi_Chi":f_Chi_250})
dist2=np.array([np.dot(f_Chi_250[0]-f_Chi_250[i],f_Chi_250[0]-f_Chi_250[i]) for i in range(250)])
df_Chi["dist2"]=dist2
# DataFrame.sort is deprecated (see the FutureWarning this cell emitted);
# sort_values(by=...) is the supported equivalent.
dt_sort=df_Chi.sort_values(by="dist2").drop("dist2",axis=1)
dt_sort.set_index(np.arange(dt_sort.shape[0]),inplace=True)
# NOTE: the name sorted_f_ones is kept for compatibility with the rest of the
# notebook, but from here on it holds the sorted *Chi* fingerprints.
sorted_f_ones=dt_sort['phi_Chi'].values
# Vectorised replacement for the hand-written double loop; euclidean_distances
# (already imported from sklearn.metrics.pairwise) also clips negative
# round-off before the square root, avoiding NaN entries.
euclid=euclidean_distances(np.vstack(list(sorted_f_ones)))
plt.figure(figsize=(12,12))
plt.imshow(euclid[0:250,0:250])
plt.colorbar()


/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:4: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)

Out[102]:
<matplotlib.colorbar.Colorbar at 0x193847510>

In [103]:
from sklearn.cluster import KMeans

In [104]:
# Widen the working set to the first 2000 structures and compute their "ones"
# fingerprints.
# NOTE: s250 is rebound here and now holds 2000 structures; the name is kept
# because later cells index s250 directly.
s250 = s_all[0:2000]
f_ones_2000 = [
    fp.get_phi(struct, obser="ones", rmax=12, delta=0.05)
    for struct in tqdm.tqdm_notebook(s250)
]




In [111]:
# Elbow scan: fit k-means on the 2000 "ones" fingerprints for k = 5..54 and
# record the inertia of each fit.
n_clusters_tried=np.arange(5,55)
inertia=np.zeros(len(n_clusters_tried),dtype=float)
for i in tqdm.tqdm_notebook(range(len(n_clusters_tried))):
    # A fixed random_state makes the curve reproducible across runs; fit() is
    # enough because the cluster labels are not used here.
    Km=KMeans(n_clusters=int(n_clusters_tried[i]),random_state=0)
    Km.fit(f_ones_2000)
    inertia[i]=Km.inertia_
# Plot against the actual number of clusters; the original plotted against the
# array index, so the x-axis read 0..49 instead of 5..54.
plt.plot(n_clusters_tried,inertia)



Out[111]:
[<matplotlib.lines.Line2D at 0x134f6c610>]

In [108]:
# Group the 2000 fingerprints by k-means cluster and show the pairwise
# Euclidean distance matrix in cluster order.
#
# `clust` was never assigned anywhere above this cell, so on a fresh kernel it
# raised NameError (it only existed as leftover kernel state). Compute it here
# with the same n_clusters=12 the following cell uses; random_state is fixed so
# the labels are reproducible.
Km=KMeans(n_clusters=12,random_state=0)
clust=Km.fit_predict(f_ones_2000)
comps=[s250[i].composition for i in range(2000)]
df_ones=pd.DataFrame({"phi_ones":f_ones_2000})
df_ones["cluster"]=clust
df_ones["Composition"]=comps
# DataFrame.sort is deprecated (see the FutureWarning this cell emitted);
# sort_values(by=...) is the supported equivalent.
df_sorted=df_ones.sort_values(by="cluster")
sorted_f_ones=df_sorted['phi_ones'].values
# One vectorised call replaces the 2000*2000 Python-level loop;
# euclidean_distances (already imported from sklearn.metrics.pairwise) also
# clips negative round-off before the square root, avoiding NaN entries.
euclid=euclidean_distances(np.vstack(list(sorted_f_ones)))
plt.figure(figsize=(12,12))
plt.imshow(euclid)
plt.colorbar()


/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:5: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)

Out[108]:
<matplotlib.colorbar.Colorbar at 0x136da7750>

In [136]:
# Cluster the 2000 "ones" fingerprints into 12 groups, sort the structures by
# cluster label and show the pairwise Euclidean distance matrix in that order.
# random_state is fixed so the cluster labels are reproducible across runs.
Km=KMeans(n_clusters=12,random_state=0)
clust=Km.fit_predict(f_ones_2000)
comps=[s250[i].composition.formula for i in range(2000)]
df_ones=pd.DataFrame({"phi_ones":f_ones_2000})
df_ones["cluster"]=clust
df_ones["Composition"]=comps
# DataFrame.sort is deprecated (see the FutureWarning this cell emitted);
# sort_values(by=...) is the supported equivalent.
df_sorted=df_ones.sort_values(by="cluster")
# set_index returns a new frame; the original call discarded the result, so
# the index was never actually replaced. Rebind to make it take effect.
df_sorted=df_sorted.set_index(np.arange(2000))
sorted_f_ones=df_sorted['phi_ones'].values
# One vectorised call replaces the 2000*2000 Python-level loop;
# euclidean_distances (already imported from sklearn.metrics.pairwise) also
# clips negative round-off before the square root, avoiding NaN entries.
euclid=euclidean_distances(np.vstack(list(sorted_f_ones)))
plt.figure(figsize=(12,12))
plt.imshow(euclid)
plt.colorbar()


/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:7: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)

Out[136]:
<matplotlib.colorbar.Colorbar at 0x118fab490>

In [139]:
# Replace the index with 0..1999 (rebinding, since set_index returns a new
# frame) so that row labels follow the current row order, then display it.
# NOTE(review): this renders the whole 2000-row frame; df_sorted.head() would
# keep the output short.
df_sorted=df_sorted.set_index(np.arange(2000))
df_sorted


Out[139]:
phi_ones cluster Composition
0 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 Ag4 Cl4 O8
1 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 K4 Na2 As2
2 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 Ba32 Ge8 As32
3 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 Rb8 Ag4 Br12
4 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 Ba1 Cu2 As2
5 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 Sr2 Ag2 O4
6 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 K20 Sn4 As12
7 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 K3 Ag3 As2
8 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 K1 Zn1 As1
9 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 Tl12 Ag4 Te8
10 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 K14 Nb2 As8
11 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 Ba32 Ti8 As32
12 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 K4 Zn1 As2
13 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 Rb14 Nb2 As8
14 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 K14 Ta2 As8
15 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 K16 Ga16 As24
16 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 K12 In8 As12
17 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 K16 Ga8 As16
18 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 Ba1 Ag2 Ge2
19 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 Tl2 Ag2 Cl6
20 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 K4 Ag8 Se6
21 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 Rb4 Ag8 S6
22 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 Ag3 C6 N9
23 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 Sr2 Ag2 As2
24 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 Tl4 Ag12 S8
25 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 Eu2 Ag2 Sb2
26 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 Tl1 Ag1 Cl3
27 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 Tl4 Ag4 Se4
28 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 K4 Ag2 P2
29 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 0 K18 Nb4 As12
... ... ... ...
1970 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Sr1 Zn2 As2
1971 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Ba1 Al2 Si2
1972 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Pr1 Al2 Si2
1973 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Ba1 Zn2 As2
1974 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 In2 Ag2 S4
1975 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Ti1 Co1 As1
1976 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Al2 Ag2 Se4
1977 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Mn1 Ni1 As1
1978 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Cu2 As2 Se4
1979 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Ca1 Cu4 As2
1980 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Li22 Mn2 As12
1981 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Li10 Si2 As6
1982 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Li14 V2 As8
1983 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Cd2 Ge2 As4
1984 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Ca1 Zn2 As2
1985 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Zr1 Ag1 B1
1986 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Ta1 Mn1 As1
1987 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Li2 Mn2 As2
1988 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Eu1 Cu4 As2
1989 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Gd1 Al2 Si2
1990 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Li1 Al1 Si1
1991 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Mn1 Nb1 As1
1992 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Ce1 Al2 Ge2
1993 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Li1 Zn1 As1
1994 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Na2 Al2 Si2
1995 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 In4 Ag4 S8
1996 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 In5 Ag1 Se8
1997 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 In6 As2 Se6
1998 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Ca3 As1 Br3
1999 [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 11 Ti1 As1 Rh1

2000 rows × 3 columns


In [144]:
# Compositions of the structures that were assigned to cluster 3.
in_cluster_3 = df_sorted["cluster"] == 3
df_sorted.loc[in_cluster_3, "Composition"]


Out[144]:
739      La20 B20 C20
740        Sc1 Co3 B2
741         Fe2 B2 O6
742        U4 As2 Se2
743        Ho1 Co3 B2
744       Mg2 Cu2 As2
745       Yb2 Cu2 As4
746       Rb2 Ca2 As2
747        Sm4 As4 S4
748         Sm2 B4 C4
749       Mg8 Cu8 As8
750       Er2 Cu2 As4
751       Li6 Mn2 As4
752       Tm4 Ni2 As4
753        Al1 Mo1 O3
754        U2 As4 Pd2
755       Tm2 Cu2 As4
756        Eu1 As1 O3
757        Ce4 As4 S4
758       Ho4 Ni2 As4
759        Y2 Cu2 As4
760       Ge1 Te4 As2
761        Tm1 Co3 B2
762        Er1 Co3 B2
763         Cr2 B2 O6
764       Dy2 Cu2 As4
765        U2 Cu4 As5
766         Ca2 B4 C4
767        U2 Co2 As4
768        Yb1 Co3 B2
            ...      
839        Ce4 Ag4 P8
840     Tl4 Ag32 Te22
841        Tb5 Ag1 S8
842       Cs4 Ag8 I12
843        Sm2 Ag2 O5
844        Sr1 Ag1 O3
845       Tb2 Ag2 Se4
846      In6 Ag6 Te12
847       Ho2 Ag2 Se4
848        Pr5 Ag1 S8
849        Ag2 Bi2 S4
850       Ag2 Bi2 Se4
851       Ag9 Te4 Br3
852        Cu1 Ag1 F3
853       Ag3 Bi3 Se6
854        Al1 Ag1 O3
855         Al2 V8 C6
856       Nd2 Ag2 As4
857         Y1 Ag1 O3
858       Mg1 Al2 Si2
859        Y2 Ag2 Se4
860        Zn1 Ag1 F3
861        Tl2 Ag2 I4
862       Ce4 Ag4 As8
863       Nb8 Ag8 O24
864       Ag1 As1 Se2
865        Ag1 Bi1 S2
866    Ag40 Te16 Br12
867     Cs4 Ag20 Se12
868        Na2 Ag2 F8
Name: Composition, dtype: object

In [ ]: