In [1]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
%matplotlib inline









    



/usr/local/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')



In [2]:

    
from pymatgen import Composition

Reading old Icsd ternaries csv file



In [3]:

    
old_icsd=pd.read_csv("../ICSD/icsd-ternaries.csv",sep='\t',names=["Entry Number","Space Group","Structure","Structure Type","Description","Authors","Reference"],header=None)
old_icsd.head()









    Out[3]:






  
    
      
      Entry Number
      Space Group
      Structure
      Structure Type
      Description
      Authors
      Reference
    
  
  
    
      0
      25
      F -4 3 m
      Ag8 Ge Te6
      Ag8SiTe6
      Structure cristalline de Ag8 Ge Te6 (gamma)
      Rysanek, N.;  Laruelle, P.;  Katty, A.
      Acta Crystallographica B (24,1968-38,1982) (19...
    
    
      1
      33
      P 1 21/n 1
      Pb (W O4)
      PbWO4(mP48)
      Pb W O4-III (a high-pressure form)
      Richter, P.W.;  Kruger, G.J.;  Pistorius, C.W....
      Acta Crystallographica B (24,1968-38,1982) (19...
    
    
      2
      68
      R -3 R
      Pb0.5 Mo3 Se4
      Mo6PbS8
      Etude structurale de combinaisons sulfurees et...
      Guillevic, J.;  Lestrat, H.;  Grandjean, D.
      Acta Crystallographica B (24,1968-38,1982) (19...
    
    
      3
      208
      I 41/a m d Z
      Cu Br Te
      NaN
      Crystal structure of copper bromide telluride
      Carkner, P.M.;  Haendler, H.M.
      Journal of Solid State Chemistry (1976) 18,  p...
    
    
      4
      286
      P 63
      Fe (I O3)3
      Fe(IO3)3
      Zur Kristallstruktur von Fe I3 O9
      Jansen, M.
      Journal of Solid State Chemistry (1976) 17,  p...



In [4]:

    
old_icsd["Structure Type"].value_counts()









    Out[4]:





Spinel-Al2MgO4             1922
ThCr2Si2                   1887
Perovskite-GdFeO3          1179
NaCl                       1080
Perovskite-CaTiO3           987
TiNiSi-MgSrSi               903
ZrNiAl-Fe2P                 864
Laves(cub)-Cu2Mg            808
Mn12Th                      739
Heusler-AlCu2Mn             691
Laves(2H)-MgZn2             670
PbClF/Cu2Sb                 625
Delafossite-NaCrS2          600
CaCu5                       468
BaCuSn2-CeNi(1-x)Si2        445
Chalcopyrite-CuFeS2         414
Pyrochlore-NaCa(Nb2O6)F     404
K4Si23                      398
Fluorite-CaF2               385
Th2Zn17                     361
AlB2                        337
Heusler(alloy)-AlLiSi       331
Sphalerite-ZnS(cF8)         301
MnP                         299
Perovskite-NdAlO3           289
Zircon-ZrSiO4               275
Si2U3                       263
CaFe2O4                     263
Th3P4                       261
Auricupride-AuCu3           257
                           ... 
NaAsSe2                       1
Perovskite-NaBaLiNiF6         1
Co2Sr3O6-x                    1
(La,Sr)(Ga,Mg)O3-x            1
Na2.4Fe11O16                  1
Fe2P2O7                       1
SrTb2Fe2O7                    1
Na2Si2O5(oP36)                1
Pb9Pd13                       1
Ni2SrP2                       1
Sr5Co4O12                     1
Ni12P5                        1
K1-xFeF3                      1
SrSn4                         1
CaSi2(hR6)                    1
K1.4Sn2.2Bi7.4Se14            1
OsO4                          1
Pt2Si3                        1
Na(UO2)4O2(OH)5(H2O)5         1
Li4SrN2                       1
MgSi(OH)6                     1
MoMn4SiC                      1
Bi2Sr2Nb2.5Fe.5O12            1
Ba3TiNb4O15                   1
SrCuO2+x                      1
Sr8(FeN3)2FeN2                1
BiPd3                         1
In2OPO4                       1
Ni3Sn4                        1
CaTiSiO5                      1
Name: Structure Type, dtype: int64



In [5]:

    
Structures=[Composition(j).formula for j in old_icsd["Structure"].values]



In [6]:

    
old_icsd["Structures"]=Structures



In [7]:

    
import fingerprint as fp
struct_all=s_all=fp.read_pickle("struct_all.pickle")



In [8]:

    
all_comps=[x.composition.formula for x in struct_all]



In [9]:

    
def find_overlap(struct_type):
    lis=old_icsd[old_icsd["Structure Type"]==struct_type]["Structures"]
    unique_lis=[str(x) for x in np.unique(lis)]
    overlap=[x for x in unique_lis if x in all_comps]
    print "{} matches found for {}".format(len(overlap),struct_type)
    return overlap



In [10]:

    
overlap_GdFeO3=find_overlap("Perovskite-GdFeO3")
overlap_122=find_overlap("ThCr2Si2")
overlap_CaTiO3=find_overlap("Perovskite-CaTiO3")
overlap_NaCl=find_overlap("NaCl")
overlap_spinel=find_overlap("Spinel-Al2MgO4")
overlap_delaf=find_overlap("Delafossite-NaCrS2")









    



57 matches found for Perovskite-GdFeO3
244 matches found for ThCr2Si2
143 matches found for Perovskite-CaTiO3
34 matches found for NaCl
18 matches found for Spinel-Al2MgO4
222 matches found for Delafossite-NaCrS2



In [11]:

    
def matching_struct(comp_list):
    return[x for x in struct_all if x.composition.formula in comp_list]



In [12]:

    
GdFeO3_structs=matching_struct(overlap_GdFeO3)
one22_structs=matching_struct(overlap_122)
CaTiO3_structs=matching_struct(overlap_CaTiO3)
NaCl_structs=matching_struct(overlap_NaCl)
Spinel_structs=matching_struct(overlap_spinel)
delaf_structs=matching_struct(overlap_delaf)



In [13]:

    
import itertools
matching_structs=[]
matching_structs.extend(itertools.chain(one22_structs,CaTiO3_structs,Spinel_structs,delaf_structs))
len(matching_structs)









    Out[13]:





627



In [15]:

    
dict_structs={}
dict_structs.update({x:0 for x in overlap_delaf})
dict_structs.update({x:1 for x in overlap_CaTiO3})
dict_structs.update({x:2 for x in overlap_122})
dict_structs.update({x:3 for x in overlap_spinel})
#dict_structs.update({x:4 for x in overlap_NaCl})
print len(dict_structs)



In [15]:

    
import tqdm



In [16]:

    
f_ones=[fp.get_phi(matching_structs[i],obser="ones",rmax=10,delta=0.05) for i in tqdm.tqdm_notebook(range(len(matching_structs)))]
f_Z=[fp.get_phi(matching_structs[i],obser="Z",rmax=10,delta=0.05) for i in tqdm.tqdm_notebook(range(len(matching_structs)))]
f_Chi=[fp.get_phi(matching_structs[i],obser="Chi",rmax=10,delta=0.05) for i in tqdm.tqdm_notebook(range(len(matching_structs)))]
Structures=[i.composition.formula for i in matching_structs]
labels=[dict_structs[i] for i in Structures]



In [17]:

    
fingerprints=np.array([list(itertools.chain(f_ones[i],f_Z[i],f_Chi[i])) for i in range(len(f_ones))])



In [18]:

    
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.metrics import adjusted_rand_score



In [19]:

    
Km=KMeans(n_clusters=4,n_init=250)
clust_km=Km.fit_predict(fingerprints)
print confusion_matrix(labels,clust_km)
print adjusted_rand_score(labels,clust_km)









    



[[ 18  69  78  57]
 [ 15  37  41  50]
 [133  99  11   1]
 [  4   3   6   5]]
0.164250307488



In [20]:

    
from sklearn.cluster import AgglomerativeClustering



In [21]:

    
Ag=AgglomerativeClustering(n_clusters=4)



In [22]:

    
Ag=AgglomerativeClustering(n_clusters=4)
clust_ag=Ag.fit_predict(fingerprints)
print confusion_matrix(labels,clust_ag)
print adjusted_rand_score(labels,clust_ag)









    



[[100  48  41  33]
 [ 49  21  43  30]
 [ 12 145   0  87]
 [  6  11   1   0]]
0.163697127181

Lets try supervised learning



In [23]:

    
from numpy import random
r=random.RandomState(42)
perm_state=r.permutation(len(matching_structs))
labels=np.array(labels)
labels_perm=labels[perm_state]
fingerprints_perm=fingerprints[perm_state]



In [24]:

    
from sklearn.svm import SVC



In [25]:

    
sv=SVC(random_state=42)
sv.fit(fingerprints_perm,labels_perm)
clust_svc=sv.predict(fingerprints_perm)



In [26]:

    
print confusion_matrix(labels_perm,clust_svc)
print adjusted_rand_score(labels_perm,clust_svc)









    



[[222   0   0   0]
 [  2 140   1   0]
 [  1   0 243   0]
 [  7   0   1  10]]
0.960100502991



In [27]:

    
from sklearn.svm import LinearSVC
lsv=LinearSVC(random_state=42)



In [28]:

    
lsv.fit(fingerprints_perm,labels_perm)









    Out[28]:





LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0)



In [29]:

    
clust_lsv=lsv.predict(fingerprints_perm)
print confusion_matrix(labels_perm,clust_lsv)
print adjusted_rand_score(labels_perm,clust_lsv)









    



[[222   0   0   0]
 [  0 143   0   0]
 [  0   0 244   0]
 [  0   0   0  18]]
1.0



In [30]:

    
from sklearn.linear_model import LogisticRegression



In [31]:

    
logist=LogisticRegression()

logist.fit(fingerprints_perm,labels_perm)

clust_logist=logist.predict(fingerprints_perm)
print confusion_matrix(labels_perm,clust_logist)
print adjusted_rand_score(labels_perm,clust_logist)









    



[[222   0   0   0]
 [  0 143   0   0]
 [  0   0 244   0]
 [  0   0   0  18]]
1.0



In [32]:

    
ones_perm=np.array(f_ones)[perm_state]
Z_perm=np.array(f_Z)[perm_state]
Chi_perm=np.array(f_Chi)[perm_state]



In [33]:

    
logist2=LogisticRegression()

logist2.fit(ones_perm,labels_perm)
clust_logist2=logist2.predict(ones_perm)
print confusion_matrix(labels_perm,clust_logist2)
print adjusted_rand_score(labels_perm,clust_logist2)









    



[[212   4   5   1]
 [  8 135   0   0]
 [  8   1 235   0]
 [  3   1   0  14]]
0.866898948153



In [34]:

    
logist3=LogisticRegression()

logist3.fit(Z_perm,labels_perm)
clust_logist3=logist3.predict(Z_perm)
print confusion_matrix(labels_perm,clust_logist3)
print adjusted_rand_score(labels_perm,clust_logist3)









    



[[204  13   5   0]
 [ 18 122   3   0]
 [  2   2 240   0]
 [  0   0   0  18]]
0.824594942179



In [35]:

    
logist4=LogisticRegression()

logist4.fit(Chi_perm,labels_perm)
clust_logist4=logist4.predict(Chi_perm)
print confusion_matrix(labels_perm,clust_logist4)
print adjusted_rand_score(labels_perm,clust_logist4)









    



[[220   2   0   0]
 [ 11 131   1   0]
 [  0   0 244   0]
 [  6   0   1  11]]
0.923791715633

Lets try Projection techniques



In [36]:

    
from sklearn.decomposition import PCA



In [37]:

    
label_names=["Delaffosite","Perovskite","122","Spinel"]



In [38]:

    
label_names=np.array(["Delaffosite","Perovskite","122","Spinel"])
c_arr=['r','g','y','b']



In [39]:

    
pca=PCA(n_components=2)

pca_fingerprint=pca.fit_transform(fingerprints_perm)

plt.figure(figsize=(10,10))
for i in range(4):
    plt.scatter(pca_fingerprint[(labels_perm==i),0],pca_fingerprint[(labels_perm==i),1],c=c_arr[i],label=label_names[i])
plt.legend()









    Out[39]:





<matplotlib.legend.Legend at 0x11b505890>



In [40]:

    
from sklearn.manifold import TSNE



In [41]:

    
ts=TSNE(n_components=2,random_state=42)
tsne_fingerprints=ts.fit_transform(fingerprints_perm)
plt.figure(figsize=(10,10))
for i in range(4):
    plt.scatter(tsne_fingerprints[(labels_perm==i),0],tsne_fingerprints[(labels_perm==i),1],c=c_arr[i],label=label_names[i])
plt.legend()









    Out[41]:





<matplotlib.legend.Legend at 0x119718b10>



In [42]:

    
ts2=TSNE(n_components=2,random_state=42)
tsne_fingerprints=ts2.fit_transform(fingerprints_perm)
pca=PCA(n_components=5)
pca_fingerprint=pca.fit_transform(fingerprints_perm)
combined_finger=np.hstack((pca_fingerprint,tsne_fingerprints))



In [43]:

    
combined_finger.shape









    Out[43]:





(627, 7)



In [44]:

    
sv2=SVC(random_state=42)
sv2.fit(combined_finger,labels_perm)
clust_svc_projected=sv2.predict(combined_finger)
print confusion_matrix(labels_perm,clust_svc_projected)
print adjusted_rand_score(labels_perm,clust_svc_projected)









    



[[222   0   0   0]
 [  0 143   0   0]
 [  0   0 244   0]
 [  1   0   0  17]]
0.997242139921



In [45]:

    
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
combined_finger_scaled=scaler.fit_transform(combined_finger)



In [46]:

    
combined_finger_scaled.shape









    Out[46]:





(627, 7)



In [47]:

    
Km2=KMeans(n_clusters=4,n_init=250)
clust_km_projected=Km2.fit_predict(combined_finger_scaled)
print confusion_matrix(labels_perm,clust_km_projected)
print adjusted_rand_score(labels_perm,clust_km_projected)









    



[[ 45  26  31 120]
 [ 31  47  24  41]
 [133  13  93   5]
 [ 11   6   0   1]]
0.177999189098

Lets create similarity matrices



In [48]:

    
euclid=np.array([[np.sqrt(np.dot(fingerprints[i],fingerprints[i])+np.dot(fingerprints[j],fingerprints[j])-2*np.dot(fingerprints[i],fingerprints[j])) for i in range(len(fingerprints))] for j in range(len(fingerprints))])
plt.figure(figsize=(12,12))
plt.imshow(euclid)
plt.colorbar()









    Out[48]:





<matplotlib.colorbar.Colorbar at 0x136fd8c10>



In [341]:

    
Counter(labels)









    Out[341]:





Counter({0: 222, 1: 143, 2: 244, 3: 18})



In [49]:

    
from sklearn.metrics.pairwise import euclidean_distances



In [54]:

    
#fig,axes=plt.subplots(2,4,figsize=(20,12))
for i in range(100):
#    ax=axes.flat[i]
    pca=PCA(n_components=2+i)
    pca_fingerprint=pca.fit_transform(fingerprints)
    print "Covariance ratio for {} components={}".format(2+i,np.sum(pca.explained_variance_ratio_))
#    dist_pca=euclidean_distances(pca_fingerprint)
#   im=ax.imshow(dist_pca)
    #ax.set_title("N_components={}".format(2+i),fontsize=25)
#cax = fig.add_axes([0.95, 0.1, 0.03, 0.85])
#fig.colorbar(im,cax=cax)









    



Covariance ratio for 2 components=0.199235337867
Covariance ratio for 3 components=0.270893320012
Covariance ratio for 4 components=0.333998989211
Covariance ratio for 5 components=0.38519232833
Covariance ratio for 6 components=0.430329714498
Covariance ratio for 7 components=0.470219742083
Covariance ratio for 8 components=0.507294357802
Covariance ratio for 9 components=0.537584685843
Covariance ratio for 10 components=0.562415871807
Covariance ratio for 11 components=0.585677571834
Covariance ratio for 12 components=0.607883101943
Covariance ratio for 13 components=0.629333408764
Covariance ratio for 14 components=0.648778658992
Covariance ratio for 15 components=0.666842205742
Covariance ratio for 16 components=0.684065038007
Covariance ratio for 17 components=0.699983593493
Covariance ratio for 18 components=0.714240880104
Covariance ratio for 19 components=0.72806467537
Covariance ratio for 20 components=0.741832891423
Covariance ratio for 21 components=0.752933697495
Covariance ratio for 22 components=0.763637461952
Covariance ratio for 23 components=0.773933316029
Covariance ratio for 24 components=0.78378565981
Covariance ratio for 25 components=0.793247992016
Covariance ratio for 26 components=0.802041733042
Covariance ratio for 27 components=0.810778959099
Covariance ratio for 28 components=0.819311646481
Covariance ratio for 29 components=0.827126715109
Covariance ratio for 30 components=0.83465551978
Covariance ratio for 31 components=0.841694439791
Covariance ratio for 32 components=0.848539842083
Covariance ratio for 33 components=0.854547971179
Covariance ratio for 34 components=0.860362991173
Covariance ratio for 35 components=0.866080303985
Covariance ratio for 36 components=0.871668209469
Covariance ratio for 37 components=0.877187344925
Covariance ratio for 38 components=0.882277802505
Covariance ratio for 39 components=0.887197283857
Covariance ratio for 40 components=0.891609520878
Covariance ratio for 41 components=0.895875348559
Covariance ratio for 42 components=0.899870189341
Covariance ratio for 43 components=0.903765696279
Covariance ratio for 44 components=0.907572843242
Covariance ratio for 45 components=0.911159908574
Covariance ratio for 46 components=0.914576386291
Covariance ratio for 47 components=0.917850384718
Covariance ratio for 48 components=0.920858193082
Covariance ratio for 49 components=0.923701213997
Covariance ratio for 50 components=0.926480487605
Covariance ratio for 51 components=0.929166070152
Covariance ratio for 52 components=0.931787105493
Covariance ratio for 53 components=0.934233057498
Covariance ratio for 54 components=0.936534376862
Covariance ratio for 55 components=0.938797094665
Covariance ratio for 56 components=0.940916630111
Covariance ratio for 57 components=0.942947305933
Covariance ratio for 58 components=0.944820231481
Covariance ratio for 59 components=0.946640929752
Covariance ratio for 60 components=0.948400102861
Covariance ratio for 61 components=0.950085490402
Covariance ratio for 62 components=0.951716362925
Covariance ratio for 63 components=0.953279818406
Covariance ratio for 64 components=0.95479733228
Covariance ratio for 65 components=0.956309371703
Covariance ratio for 66 components=0.957770989803
Covariance ratio for 67 components=0.959199904073
Covariance ratio for 68 components=0.960534076522
Covariance ratio for 69 components=0.961836699266
Covariance ratio for 70 components=0.963101799679
Covariance ratio for 71 components=0.964321166046
Covariance ratio for 72 components=0.965478132349
Covariance ratio for 73 components=0.966579362519
Covariance ratio for 74 components=0.967640108687
Covariance ratio for 75 components=0.968675274586
Covariance ratio for 76 components=0.969666285478
Covariance ratio for 77 components=0.970618143312
Covariance ratio for 78 components=0.971526414132
Covariance ratio for 79 components=0.972420277231
Covariance ratio for 80 components=0.973296949451
Covariance ratio for 81 components=0.974126934733
Covariance ratio for 82 components=0.974940965506
Covariance ratio for 83 components=0.975729411658
Covariance ratio for 84 components=0.976468413171
Covariance ratio for 85 components=0.977199954259
Covariance ratio for 86 components=0.977896750638
Covariance ratio for 87 components=0.978581079156
Covariance ratio for 88 components=0.979249176915
Covariance ratio for 89 components=0.979887427396
Covariance ratio for 90 components=0.980490706565
Covariance ratio for 91 components=0.981076269051
Covariance ratio for 92 components=0.981646632146
Covariance ratio for 93 components=0.982197807976
Covariance ratio for 94 components=0.982734992305
Covariance ratio for 95 components=0.983233271662
Covariance ratio for 96 components=0.983723688101
Covariance ratio for 97 components=0.984201275729
Covariance ratio for 98 components=0.984667993068
Covariance ratio for 99 components=0.985123098957
Covariance ratio for 100 components=0.985566009262
Covariance ratio for 101 components=0.985995187068



In [58]:

    
[x for x in overlap_CaTiO3 if "Ti" in x]









    Out[58]:





['Ba1 Ti1 O3',
 'Ca1 Ti1 O3',
 'Eu1 Ti1 O3',
 'La1 Ti1 O3',
 'Sr1 Ti1 O3',
 'Ti1 Pb1 O3']



In [3]:

    
BaTiO3_struct=matching_structs(['Ba1 Ti1 O3'])[0]



In [2]:

    
def matching_structs(comp_list):
    return[x for x in struct_all if x.composition.formula in comp_list]



In [4]:

    
print BaTiO3_struct









    



Full Formula (Ba1 Ti1 O3)
Reduced Formula: BaTiO3
abc   :   4.112467   4.112467   5.035613
angles:  90.000000  90.000000  90.000000
Sites (5)
  #  SP      a    b    c
---  ----  ---  ---  ---
  0  Ba    0    0    0
  1  Ti    0.5  0.5  0.5
  2  O     0    0.5  0.5
  3  O     0.5  0    0.5
  4  O     0    0    0.5



In [64]:

    
[x for x in overlap_CaTiO3 ]









    Out[64]:





['Ag3 S1 I1',
 'Al1 Bi1 O3',
 'Ba1 Ce1 O3',
 'Ba1 Ir1 O3',
 'Ba1 Li1 F3',
 'Ba1 Li1 H3',
 'Ba1 Mo1 O3',
 'Ba1 Nb1 O3',
 'Ba1 Pb1 O3',
 'Ba1 Pr1 O3',
 'Ba1 Sn1 O3',
 'Ba1 Th1 O3',
 'Ba1 Ti1 O3',
 'Ba1 Zr1 O3',
 'Ca1 Si1 O3',
 'Ca1 Ti1 O3',
 'Ca1 Zr1 O3',
 'Ca3 As1 N1',
 'Ca3 Bi1 N1',
 'Ca3 Ge1 O1',
 'Ca3 P1 N1',
 'Ca3 Sb1 N1',
 'Ce1 Al1 O3',
 'Ce1 Cr1 O3',
 'Ce1 V1 O3',
 'Cr1 Pb1 O3',
 'Cs1 Ca1 Br3',
 'Cs1 Ca1 F3',
 'Cs1 Ca1 H3',
 'Cs1 Cd1 Br3',
 'Cs1 Cd1 Cl3',
 'Cs1 Cd1 F3',
 'Cs1 Fe1 F3',
 'Cs1 Ge1 Br3',
 'Cs1 Ge1 Cl3',
 'Cs1 Hg1 Br3',
 'Cs1 Hg1 Cl3',
 'Cs1 Hg1 F3',
 'Cs1 Mg1 F3',
 'Cs1 Pb1 Br3',
 'Cs1 Pb1 Cl3',
 'Cs1 Pb1 F3',
 'Cs1 Sm1 Cl3',
 'Cs1 Sn1 Br3',
 'Cs1 Sn1 I3',
 'Cs1 Sr1 F3',
 'Cs1 Tm1 Cl3',
 'Eu1 Cs1 F3',
 'Eu1 Li1 H3',
 'Eu1 Ti1 O3',
 'Ga1 Bi1 O3',
 'Hf1 Pb1 O3',
 'In1 Bi1 O3',
 'K1 Ca1 F3',
 'K1 Cd1 F3',
 'K1 Co1 F3',
 'K1 Fe1 F3',
 'K1 Mg1 F3',
 'K1 Mg1 H3',
 'K1 Mn1 F3',
 'K1 Nb1 O3',
 'K1 Ni1 F3',
 'K1 Pd1 F3',
 'K1 Ta1 O3',
 'K1 U1 O3',
 'K1 V1 F3',
 'K1 Zn1 F3',
 'K3 Br1 O1',
 'K3 I1 O1',
 'La1 Al1 O3',
 'La1 Co1 O3',
 'La1 Cr1 O3',
 'La1 Fe1 O3',
 'La1 Mn1 O3',
 'La1 Ti1 O3',
 'La1 V1 O3',
 'La3 In1 C1',
 'La3 Tl1 C1',
 'Li3 Br1 O1',
 'Mn1 Tl1 Cl3',
 'Na1 Be1 H3',
 'Na1 Mg1 F3',
 'Na1 Nb1 O3',
 'Na1 Ta1 O3',
 'Na1 V1 F3',
 'Na1 W1 O3',
 'Na3 Cl1 O1',
 'Nb1 Ag1 O3',
 'Nd1 Co1 O3',
 'Nd1 Cr1 O3',
 'Nd1 V1 O3',
 'Pr1 Co1 O3',
 'Pr1 Cr1 O3',
 'Rb1 Ca1 F3',
 'Rb1 Ca1 H3',
 'Rb1 Cd1 F3',
 'Rb1 Fe1 F3',
 'Rb1 Ge1 I3',
 'Rb1 Hg1 F3',
 'Rb1 Mg1 F3',
 'Rb1 Mn1 F3',
 'Rb1 N1 O3',
 'Rb1 Pb1 F3',
 'Rb1 Pd1 F3',
 'Rb1 U1 O3',
 'Rb1 V1 F3',
 'Rb3 Br1 O1',
 'Sc1 Bi1 O3',
 'Sc1 Co3 C1',
 'Sc1 Rh3 C1',
 'Sc3 Ga1 C1',
 'Sc3 In1 C1',
 'Sc3 Tl1 C1',
 'Sm1 Co1 O3',
 'Sm1 Cr1 O3',
 'Sm1 V1 O3',
 'Sr1 Co1 O3',
 'Sr1 Cr1 O3',
 'Sr1 Fe1 O3',
 'Sr1 Hf1 O3',
 'Sr1 Li1 H3',
 'Sr1 Mo1 O3',
 'Sr1 Nb1 O3',
 'Sr1 Sn1 O3',
 'Sr1 Ti1 O3',
 'Sr1 V1 O3',
 'Sr3 Bi1 N1',
 'Sr3 Sb1 N1',
 'Ta1 Sn1 O3',
 'Tb1 Ba1 O3',
 'Tb1 Rh3 C1',
 'Th1 Ta1 N3',
 'Ti1 Pb1 O3',
 'V1 Pb1 O3',
 'Y1 Rh3 C1',
 'Y3 Ga1 C1',
 'Y3 In1 C1',
 'Y3 Tl1 C1',
 'Yb1 Cs1 Br3',
 'Yb1 Cs1 F3',
 'Yb1 Rb1 F3',
 'Zn1 Ag1 F3',
 'Zr1 Pb1 O3']



In [42]:

    
LaCoO3_struct=matching_struct(['La1 Co1 O3'])[0]



In [ ]:

    
LaCoO3_struct.



In [6]:

    
print LaCoO3_struct









    



Full Formula (La1 Co1 O3)
Reduced Formula: LaCoO3
abc   :   4.445931   4.445931   4.445931
angles:  90.000000  90.000000  90.000000
Sites (5)
  #  SP      a    b    c
---  ----  ---  ---  ---
  0  La    0.5  0.5  0.5
  1  Co    0    0    0
  2  O     0.5  0.5  0
  3  O     0.5  0    0.5
  4  O     0    0.5  0.5



In [7]:

    
st_list=[BaTiO3_struct,LaCoO3_struct]
st_list









    Out[7]:





[Structure Summary
 Lattice
     abc : 4.1124665499999997 4.1124665499999997 5.03561294
  angles : 90.0 90.0 90.0
  volume : 85.164205238601596
       A : 4.1124665499999997 0.0 2.5181594985290289e-16
       B : -2.5181594985290289e-16 4.1124665499999997 2.5181594985290289e-16
       C : 0.0 0.0 5.03561294
 PeriodicSite: Ba (0.0000, 0.0000, 0.0000) [0.0000, 0.0000, 0.0000]
 PeriodicSite: Ti (2.0562, 2.0562, 2.5178) [0.5000, 0.5000, 0.5000]
 PeriodicSite: O (-0.0000, 2.0562, 2.5178) [0.0000, 0.5000, 0.5000]
 PeriodicSite: O (2.0562, 0.0000, 2.5178) [0.5000, 0.0000, 0.5000]
 PeriodicSite: O (0.0000, 0.0000, 2.5178) [0.0000, 0.0000, 0.5000],
 Structure Summary
 Lattice
     abc : 4.4459305200000001 4.4459305200000001 4.4459305200000001
  angles : 90.0 90.0 90.0
  volume : 87.879588384417431
       A : 4.4459305200000001 0.0 2.7223472902747637e-16
       B : -2.7223472902747637e-16 4.4459305200000001 2.7223472902747637e-16
       C : 0.0 0.0 4.4459305200000001
 PeriodicSite: La (2.2230, 2.2230, 2.2230) [0.5000, 0.5000, 0.5000]
 PeriodicSite: Co (0.0000, 0.0000, 0.0000) [0.0000, 0.0000, 0.0000]
 PeriodicSite: O (2.2230, 2.2230, 0.0000) [0.5000, 0.5000, 0.0000]
 PeriodicSite: O (2.2230, 0.0000, 2.2230) [0.5000, 0.0000, 0.5000]
 PeriodicSite: O (-0.0000, 2.2230, 2.2230) [0.0000, 0.5000, 0.5000]]



In [18]:

    
from sklearn.metrics.pairwise import euclidean_distances



In [33]:

    
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
f_small_ones=[fp.get_phi(st_list[i],obser='ones',rmax=10,delta=0.05) for i in (0,1)]
plt.figure(figsize=(10,10))
labels=['BaTiO3','LaCoO3']
for i in range(2):
    plt.plot(f_small_ones[i],label=labels[i])
plt.legend()
plt.grid()
print euclidean_distances(f_small_ones[0],f_small_ones[1])









    



Average of last twenty bins -0.216141020749
Average of last twenty bins 0.204810079618
[[ 19.21078272]]






    



/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)



In [86]:

    
reload(fp)









    Out[86]:





<module 'fingerprint' from 'fingerprint.py'>



In [32]:

    
reload(fp)
f_small_ones=[fp.get_phi_scaled(st_list[i],obser='ones',debug=True,n_bins=200) for i in (0,1)]
plt.figure(figsize=(10,10))
labels=['BaTiO3','LaCoO3']
for i in range(2):
    plt.plot(f_small_ones[i],label=labels[i])
plt.legend()
plt.grid()
print (np.dot(f_small_ones[0],f_small_ones[0])+np.dot(f_small_ones[1],f_small_ones[1])-2*np.dot(f_small_ones[0],f_small_ones[1]))
print euclidean_distances(f_small_ones[0],f_small_ones[1])









    



0.0417308992617
Scale factor= 2.54873534051
Average of last twenty bins -0.0562769915004
0.0421786091774
Scale factor= 2.57527103505
Average of last twenty bins 0.00251314435054
27.8855270834
[[ 5.28067487]]






    



/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)



In [34]:

    
plt.plot(f_small_ones[0]-f_small_ones[1])









    Out[34]:





[<matplotlib.lines.Line2D at 0x1678975d0>]



In [ ]:

    
import tqdm
reload(fp)
f_ones_scaled=[fp.get_phi_scaled(matching_structs[i],obser="ones",delta=0.05) for i in tqdm.tqdm_notebook(range(len(matching_structs)))]
f_Z_scaled=[fp.get_phi_scaled(matching_structs[i],obser="Z",delta=0.05) for i in tqdm.tqdm_notebook(range(len(matching_structs)))]
f_Chi_scaled=[fp.get_phi_scaled(matching_structs[i],obser="Chi",delta=0.05) for i in tqdm.tqdm_notebook(range(len(matching_structs)))]



In [16]:

    
Structures_scaled=[i.composition.formula for i in matching_structs]
labels_scaled=[dict_structs[i] for i in Structures_scaled]



In [19]:

    
fingerprints_scaled=np.array([list(itertools.chain(f_ones_scaled[i],f_Z_scaled[i],f_Chi_scaled[i])) for i in range(len(f_ones_scaled))])
euclid_scaled=euclidean_distances(fingerprints_scaled)
plt.figure(figsize=(12,12))
plt.imshow(euclid_scaled)
plt.colorbar()









    Out[19]:





<matplotlib.colorbar.Colorbar at 0x1250d0450>



In [20]:

    
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.metrics import adjusted_rand_score
Km=KMeans(n_clusters=4,n_init=250)
clust_km=Km.fit_predict(fingerprints_scaled)
print confusion_matrix(labels_scaled,clust_km)
print adjusted_rand_score(labels_scaled,clust_km)









    



[[  3   0  88 131]
 [  2 139   0   2]
 [236   0   0   8]
 [  0   1   0  17]]
0.763526707028



In [21]:

    
from sklearn.cluster import AgglomerativeClustering
Ag=AgglomerativeClustering(n_clusters=4)
clust_ag=Ag.fit_predict(fingerprints_scaled)
print confusion_matrix(labels_scaled,clust_ag)
print adjusted_rand_score(labels_scaled,clust_ag)









    



[[  0 152   1  69]
 [136   7   0   0]
 [  0   0 244   0]
 [  0  18   0   0]]
0.811586871338

Lets dump these annoying spinels



In [22]:

    
not_spinel=np.array(labels_scaled)!=3
labels_scaled_new=np.array(labels_scaled)[not_spinel]
fingerprints_scaled_new=fingerprints_scaled[not_spinel]
len(labels_scaled_new),len(fingerprints_scaled_new)









    Out[22]:





(609, 609)



In [23]:

    
Km=KMeans(n_clusters=3,n_init=250)
clust_km=Km.fit_predict(fingerprints_scaled_new)
print confusion_matrix(labels_scaled_new,clust_km)
print adjusted_rand_score(labels_scaled_new,clust_km)









    



[[205  17   0]
 [  1   3 139]
 [  0 244   0]]
0.891086502917



In [24]:

    
Ag=AgglomerativeClustering(n_clusters=3)
clust_ag=Ag.fit_predict(fingerprints_scaled_new)
print confusion_matrix(labels_scaled_new,clust_ag)
print "Rand-score=",adjusted_rand_score(labels_scaled_new,clust_ag)









    



[[221   0   1]
 [  1 139   3]
 [  0   0 244]]
Rand-score= 0.976451581472

Let's try and visualize this using PCA and TSNE



In [25]:

    
from sklearn.decomposition import PCA
label_names=np.array(["Delaffosite","Perovskite","122","Spinel"])
c_arr=['r','g','y','b']



In [97]:



In [26]:

    
labels_scaled=np.array(labels_scaled)



In [39]:

    
pca=PCA(n_components=5)
pca_fingerprint=pca.fit_transform(fingerprints_scaled)
plt.figure(figsize=(10,10))
for i in range(4):
    plt.scatter(pca_fingerprint[(labels_scaled==i),0],pca_fingerprint[(labels_scaled==i),1],c=c_arr[i],label=label_names[i])
plt.legend()









    Out[39]:





<matplotlib.legend.Legend at 0x13b0dc5d0>



In [33]:

    
from sklearn.manifold import TSNE



In [34]:

    
ts=TSNE(n_components=2,random_state=42)
ts_fingerprint=ts.fit_transform(fingerprints_scaled)
plt.figure(figsize=(10,10))
for i in range(4):
    plt.scatter(ts_fingerprint[(labels_scaled==i),0],ts_fingerprint[(labels_scaled==i),1],c=c_arr[i],label=label_names[i])
plt.legend()









    Out[34]:





<matplotlib.legend.Legend at 0x19a03f4d0>



In [129]:

    
n_comp=100
cov=np.zeros(n_comp)
pca=PCA(n_components=n_comp)
pca_fingerprint=pca.fit_transform(fingerprints_scaled)
cov=np.cumsum(pca.explained_variance_ratio_)
plt.figure(figsize=(10,10))
plt.grid()
plt.xlabel("Number of components")
plt.ylabel("Covariance Explained")
plt.plot(np.arange(1,n_comp+1),cov,'ro')









    Out[129]:





[<matplotlib.lines.Line2D at 0x10a767ed0>]

Lets Keep 50 components of PCA and redo clustering



In [35]:

    
pca=PCA(n_components=50)
pca_fingerprint_50=pca.fit_transform(fingerprints_scaled)



In [36]:

    
Km=KMeans(n_clusters=4,n_init=250)
clust_km_pca=Km.fit_predict(pca_fingerprint_50)
print confusion_matrix(labels_scaled,clust_km_pca)
print adjusted_rand_score(labels_scaled,clust_km_pca)









    



[[131   3   0  88]
 [  2   2 139   0]
 [  8 236   0   0]
 [ 17   0   1   0]]
0.763526707028



In [37]:

    
Ag=AgglomerativeClustering(n_clusters=4)
clust_ag_pca=Ag.fit_predict(pca_fingerprint_50)
print confusion_matrix(labels_scaled,clust_ag_pca)
print adjusted_rand_score(labels_scaled,clust_ag_pca)









    



[[  0 152   1  69]
 [136   7   0   0]
 [  0   0 244   0]
 [  0  18   0   0]]
0.811586871338



In [159]:

    
comp_lim=100
pca=PCA(n_components=comp_lim)
accu=np.zeros(comp_lim)
x_temp=np.arange(1,comp_lim+1)
pca_fingerprint_running=pca.fit_transform(fingerprints_scaled)
for i in range(1,comp_lim+1):    
    fing=pca_fingerprint_running[:,0:i]
    Ag=AgglomerativeClustering(n_clusters=4)
    clust_ag_running=Ag.fit_predict(fing)
    accu[i-1]=adjusted_rand_score(labels_scaled,clust_ag_running)
plt.figure(figsize=(10,10))
plt.plot(x_temp,accu,'ro-')
print "Maximum score obtained for n_comp={}, max score={}".format(x_temp[np.argmax(accu)],np.amax(accu))









    



Maximum score obtained for n_comp=6, max score=0.839359726423



In [147]:

    
plt.figure(figsize=(10,10))
plt.plot(x_temp[4:],accu[4:],'ro-')
print "Maximum score obtained for n_comp={}, max score={}".format(np.argmax(accu),np.amax(accu))









    



Maximum score obtained for n_comp=5, max score=0.839359726423

Doing 6 component PCA gives best results!



In [38]:

    
pca=PCA(n_components=6)
pca_fingerprint_6=pca.fit_transform(fingerprints_scaled)
Ag=AgglomerativeClustering(n_clusters=4)
clust_ag_pca6=Ag.fit_predict(pca_fingerprint_6)
print confusion_matrix(labels_scaled,clust_ag_pca6)
print adjusted_rand_score(labels_scaled,clust_ag_pca6)









    



[[  0 165   3  54]
 [142   1   0   0]
 [  0   0 244   0]
 [  1  17   0   0]]
0.839359726423

Clustering all structures



In [40]:

    
import tqdm
reload(fp)
fall_ones_scaled=[fp.get_phi_scaled(struct_all[i],obser="ones",delta=0.05) for i in tqdm.tqdm_notebook(range(len(struct_all)))]
#fall_Z_scaled=[fp.get_phi_scaled(matching_structs[i],obser="Z",delta=0.05) for i in tqdm.tqdm_notebook(range(len(matching_structs)))]
#fall_Chi_scaled=[fp.get_phi_scaled(matching_structs[i],obser="Chi",delta=0.05) for i in tqdm.tqdm_notebook(range(len(matching_structs)))]









    









    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-40-beb52ef7cc73> in <module>()
      1 import tqdm
      2 reload(fp)
----> 3 fall_ones_scaled=[fp.get_phi_scaled(struct_all[i],obser="ones",delta=0.05) for i in tqdm.tqdm_notebook(range(len(struct_all)))]
      4 #fall_Z_scaled=[fp.get_phi_scaled(matching_structs[i],obser="Z",delta=0.05) for i in tqdm.tqdm_notebook(range(len(matching_structs)))]
      5 #fall_Chi_scaled=[fp.get_phi_scaled(matching_structs[i],obser="Chi",delta=0.05) for i in tqdm.tqdm_notebook(range(len(matching_structs)))]

/Users/bismayan/Google Drive/machine_learning_project_git/machine_learning_project/src/fingerprint.pyc in get_phi_scaled(struct, obser, n_bins, delta, sigma, kernsize, tol, debug)
    125             b_j=obs[names[j]]
    126             prefac_inner=b_i*b_j/(dist * dist)
--> 127             poin = np.floor(dist / delta)+kernsize
    128             finger[int(poin - kernsize):int(poin + kernsize + 1)] += ((prefac_inner) * kern)
    129 

KeyboardInterrupt:



In [43]:

    
num_atoms=[len(x.species) for x in struct_all]



In [45]:

    
from collections import Counter
num_counts=Counter(num_atoms)









    Out[45]:





Counter({3: 145,
         4: 502,
         5: 915,
         6: 586,
         7: 272,
         8: 715,
         9: 304,
         10: 651,
         11: 179,
         12: 938,
         13: 187,
         14: 921,
         15: 119,
         16: 600,
         17: 64,
         18: 615,
         19: 67,
         20: 983,
         21: 74,
         22: 667,
         23: 46,
         24: 805,
         25: 43,
         26: 265,
         27: 90,
         28: 813,
         29: 60,
         30: 367,
         31: 14,
         32: 412,
         33: 37,
         34: 168,
         35: 18,
         36: 515,
         37: 26,
         38: 127,
         39: 40,
         40: 432,
         41: 30,
         42: 201,
         43: 31,
         44: 286,
         45: 24,
         46: 74,
         47: 10,
         48: 263,
         49: 21,
         50: 46,
         51: 12,
         52: 179,
         53: 5,
         54: 71,
         55: 12,
         56: 285,
         57: 13,
         58: 39,
         59: 9,
         60: 174,
         61: 4,
         62: 23,
         63: 9,
         64: 156,
         65: 14,
         66: 42,
         67: 9,
         68: 167,
         69: 12,
         70: 51,
         71: 4,
         72: 179,
         73: 5,
         74: 9,
         75: 6,
         76: 82,
         77: 8,
         78: 35,
         79: 7,
         80: 179,
         81: 4,
         82: 15,
         83: 10,
         84: 74,
         85: 3,
         86: 18,
         87: 8,
         88: 108,
         89: 6,
         90: 22,
         91: 4,
         92: 23,
         93: 2,
         94: 14,
         95: 6,
         96: 60,
         97: 3,
         98: 13,
         99: 3,
         100: 26,
         102: 4,
         104: 41,
         106: 1,
         107: 3,
         108: 21,
         109: 2,
         110: 7,
         112: 21,
         114: 8,
         116: 11,
         117: 2,
         118: 2,
         120: 15,
         123: 1,
         124: 7,
         125: 1,
         126: 8,
         127: 1,
         128: 14,
         130: 2,
         132: 12,
         135: 1,
         136: 17,
         138: 1,
         140: 5,
         142: 1,
         144: 17,
         145: 1,
         146: 1,
         148: 4,
         150: 2,
         152: 7,
         154: 2,
         156: 7,
         160: 18,
         162: 1,
         164: 3,
         166: 1,
         168: 6,
         171: 1,
         172: 3,
         174: 3,
         176: 6,
         178: 2,
         180: 3,
         182: 1,
         184: 5,
         186: 2,
         188: 1,
         192: 5,
         195: 2,
         198: 1,
         200: 1,
         208: 2,
         240: 1,
         264: 3})



In [46]:

    
num_array=np.zeros(265)
for i in range(3,265):
    if num_counts.has_key(i):
        num_array[i]=num_counts[i]



In [47]:

    
num_array









    Out[47]:





array([   0.,    0.,    0.,  145.,  502.,  915.,  586.,  272.,  715.,
        304.,  651.,  179.,  938.,  187.,  921.,  119.,  600.,   64.,
        615.,   67.,  983.,   74.,  667.,   46.,  805.,   43.,  265.,
         90.,  813.,   60.,  367.,   14.,  412.,   37.,  168.,   18.,
        515.,   26.,  127.,   40.,  432.,   30.,  201.,   31.,  286.,
         24.,   74.,   10.,  263.,   21.,   46.,   12.,  179.,    5.,
         71.,   12.,  285.,   13.,   39.,    9.,  174.,    4.,   23.,
          9.,  156.,   14.,   42.,    9.,  167.,   12.,   51.,    4.,
        179.,    5.,    9.,    6.,   82.,    8.,   35.,    7.,  179.,
          4.,   15.,   10.,   74.,    3.,   18.,    8.,  108.,    6.,
         22.,    4.,   23.,    2.,   14.,    6.,   60.,    3.,   13.,
          3.,   26.,    0.,    4.,    0.,   41.,    0.,    1.,    3.,
         21.,    2.,    7.,    0.,   21.,    0.,    8.,    0.,   11.,
          2.,    2.,    0.,   15.,    0.,    0.,    1.,    7.,    1.,
          8.,    1.,   14.,    0.,    2.,    0.,   12.,    0.,    0.,
          1.,   17.,    0.,    1.,    0.,    5.,    0.,    1.,    0.,
         17.,    1.,    1.,    0.,    4.,    0.,    2.,    0.,    7.,
          0.,    2.,    0.,    7.,    0.,    0.,    0.,   18.,    0.,
          1.,    0.,    3.,    0.,    1.,    0.,    6.,    0.,    0.,
          1.,    3.,    0.,    3.,    0.,    6.,    0.,    2.,    0.,
          3.,    0.,    1.,    0.,    5.,    0.,    2.,    0.,    1.,
          0.,    0.,    0.,    5.,    0.,    0.,    2.,    0.,    0.,
          1.,    0.,    1.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    2.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    1.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    3.])



In [52]:

    
list(enumerate(np.cumsum(num_array)))









    Out[52]:





[(0, 0.0),
 (1, 0.0),
 (2, 0.0),
 (3, 145.0),
 (4, 647.0),
 (5, 1562.0),
 (6, 2148.0),
 (7, 2420.0),
 (8, 3135.0),
 (9, 3439.0),
 (10, 4090.0),
 (11, 4269.0),
 (12, 5207.0),
 (13, 5394.0),
 (14, 6315.0),
 (15, 6434.0),
 (16, 7034.0),
 (17, 7098.0),
 (18, 7713.0),
 (19, 7780.0),
 (20, 8763.0),
 (21, 8837.0),
 (22, 9504.0),
 (23, 9550.0),
 (24, 10355.0),
 (25, 10398.0),
 (26, 10663.0),
 (27, 10753.0),
 (28, 11566.0),
 (29, 11626.0),
 (30, 11993.0),
 (31, 12007.0),
 (32, 12419.0),
 (33, 12456.0),
 (34, 12624.0),
 (35, 12642.0),
 (36, 13157.0),
 (37, 13183.0),
 (38, 13310.0),
 (39, 13350.0),
 (40, 13782.0),
 (41, 13812.0),
 (42, 14013.0),
 (43, 14044.0),
 (44, 14330.0),
 (45, 14354.0),
 (46, 14428.0),
 (47, 14438.0),
 (48, 14701.0),
 (49, 14722.0),
 (50, 14768.0),
 (51, 14780.0),
 (52, 14959.0),
 (53, 14964.0),
 (54, 15035.0),
 (55, 15047.0),
 (56, 15332.0),
 (57, 15345.0),
 (58, 15384.0),
 (59, 15393.0),
 (60, 15567.0),
 (61, 15571.0),
 (62, 15594.0),
 (63, 15603.0),
 (64, 15759.0),
 (65, 15773.0),
 (66, 15815.0),
 (67, 15824.0),
 (68, 15991.0),
 (69, 16003.0),
 (70, 16054.0),
 (71, 16058.0),
 (72, 16237.0),
 (73, 16242.0),
 (74, 16251.0),
 (75, 16257.0),
 (76, 16339.0),
 (77, 16347.0),
 (78, 16382.0),
 (79, 16389.0),
 (80, 16568.0),
 (81, 16572.0),
 (82, 16587.0),
 (83, 16597.0),
 (84, 16671.0),
 (85, 16674.0),
 (86, 16692.0),
 (87, 16700.0),
 (88, 16808.0),
 (89, 16814.0),
 (90, 16836.0),
 (91, 16840.0),
 (92, 16863.0),
 (93, 16865.0),
 (94, 16879.0),
 (95, 16885.0),
 (96, 16945.0),
 (97, 16948.0),
 (98, 16961.0),
 (99, 16964.0),
 (100, 16990.0),
 (101, 16990.0),
 (102, 16994.0),
 (103, 16994.0),
 (104, 17035.0),
 (105, 17035.0),
 (106, 17036.0),
 (107, 17039.0),
 (108, 17060.0),
 (109, 17062.0),
 (110, 17069.0),
 (111, 17069.0),
 (112, 17090.0),
 (113, 17090.0),
 (114, 17098.0),
 (115, 17098.0),
 (116, 17109.0),
 (117, 17111.0),
 (118, 17113.0),
 (119, 17113.0),
 (120, 17128.0),
 (121, 17128.0),
 (122, 17128.0),
 (123, 17129.0),
 (124, 17136.0),
 (125, 17137.0),
 (126, 17145.0),
 (127, 17146.0),
 (128, 17160.0),
 (129, 17160.0),
 (130, 17162.0),
 (131, 17162.0),
 (132, 17174.0),
 (133, 17174.0),
 (134, 17174.0),
 (135, 17175.0),
 (136, 17192.0),
 (137, 17192.0),
 (138, 17193.0),
 (139, 17193.0),
 (140, 17198.0),
 (141, 17198.0),
 (142, 17199.0),
 (143, 17199.0),
 (144, 17216.0),
 (145, 17217.0),
 (146, 17218.0),
 (147, 17218.0),
 (148, 17222.0),
 (149, 17222.0),
 (150, 17224.0),
 (151, 17224.0),
 (152, 17231.0),
 (153, 17231.0),
 (154, 17233.0),
 (155, 17233.0),
 (156, 17240.0),
 (157, 17240.0),
 (158, 17240.0),
 (159, 17240.0),
 (160, 17258.0),
 (161, 17258.0),
 (162, 17259.0),
 (163, 17259.0),
 (164, 17262.0),
 (165, 17262.0),
 (166, 17263.0),
 (167, 17263.0),
 (168, 17269.0),
 (169, 17269.0),
 (170, 17269.0),
 (171, 17270.0),
 (172, 17273.0),
 (173, 17273.0),
 (174, 17276.0),
 (175, 17276.0),
 (176, 17282.0),
 (177, 17282.0),
 (178, 17284.0),
 (179, 17284.0),
 (180, 17287.0),
 (181, 17287.0),
 (182, 17288.0),
 (183, 17288.0),
 (184, 17293.0),
 (185, 17293.0),
 (186, 17295.0),
 (187, 17295.0),
 (188, 17296.0),
 (189, 17296.0),
 (190, 17296.0),
 (191, 17296.0),
 (192, 17301.0),
 (193, 17301.0),
 (194, 17301.0),
 (195, 17303.0),
 (196, 17303.0),
 (197, 17303.0),
 (198, 17304.0),
 (199, 17304.0),
 (200, 17305.0),
 (201, 17305.0),
 (202, 17305.0),
 (203, 17305.0),
 (204, 17305.0),
 (205, 17305.0),
 (206, 17305.0),
 (207, 17305.0),
 (208, 17307.0),
 (209, 17307.0),
 (210, 17307.0),
 (211, 17307.0),
 (212, 17307.0),
 (213, 17307.0),
 (214, 17307.0),
 (215, 17307.0),
 (216, 17307.0),
 (217, 17307.0),
 (218, 17307.0),
 (219, 17307.0),
 (220, 17307.0),
 (221, 17307.0),
 (222, 17307.0),
 (223, 17307.0),
 (224, 17307.0),
 (225, 17307.0),
 (226, 17307.0),
 (227, 17307.0),
 (228, 17307.0),
 (229, 17307.0),
 (230, 17307.0),
 (231, 17307.0),
 (232, 17307.0),
 (233, 17307.0),
 (234, 17307.0),
 (235, 17307.0),
 (236, 17307.0),
 (237, 17307.0),
 (238, 17307.0),
 (239, 17307.0),
 (240, 17308.0),
 (241, 17308.0),
 (242, 17308.0),
 (243, 17308.0),
 (244, 17308.0),
 (245, 17308.0),
 (246, 17308.0),
 (247, 17308.0),
 (248, 17308.0),
 (249, 17308.0),
 (250, 17308.0),
 (251, 17308.0),
 (252, 17308.0),
 (253, 17308.0),
 (254, 17308.0),
 (255, 17308.0),
 (256, 17308.0),
 (257, 17308.0),
 (258, 17308.0),
 (259, 17308.0),
 (260, 17308.0),
 (261, 17308.0),
 (262, 17308.0),
 (263, 17308.0),
 (264, 17311.0)]



In [53]:

    
structs_lim_50=[x for x in struct_all if len(x.species)<50]



In [55]:

    
structs_88=[x for x in struct_all if len(x.species)==88]



In [56]:

    
fomulae_88=[x.composition.formula for x in structs_88]



In [57]:

    
fomulae_88









    Out[57]:





[u'Sr8 Al16 Cl64',
 u'Zn16 As16 O56',
 u'Sr8 B16 F64',
 u'Ca8 B16 F64',
 u'Ba8 B16 F64',
 u'Rb12 B28 O48',
 u'Ba4 B32 O52',
 u'Cd8 B16 F64',
 u'Ca8 B16 H64',
 u'Be8 B16 H64',
 u'Ba8 Fe16 Br64',
 u'Ba4 Ti28 O56',
 u'Ba18 Rh16 O54',
 u'Ba24 Y4 I60',
 u'Ba24 Y4 Br60',
 u'Ba24 Y4 Cl60',
 u'Be16 P16 O56',
 u'Zn8 Co24 O56',
 u'Cr12 P16 O60',
 u'Cr8 S16 O64',
 u'Cs12 Ta20 O56',
 u'Cs12 Sb20 O56',
 u'Cu8 Mo24 I56',
 u'Li12 Cu16 F60',
 u'Dy16 Ti16 O56',
 u'Na20 Mn12 F56',
 u'Li12 Mn16 F60',
 u'Li20 Mn12 F56',
 u'V28 O24 F36',
 u'Li20 V12 F56',
 u'Li20 Fe12 F56',
 u'Fe16 Se16 O56',
 u'Fe12 P16 O60',
 u'Gd24 Ru8 O56',
 u'Gd16 Ti16 O56',
 u'Na32 Ge16 Te40',
 u'Na32 Ge16 Se40',
 u'Ge24 Pb8 O56',
 u'H40 N8 O40',
 u'H16 S16 O56',
 u'Rb16 Hg20 Te52',
 u'K12 Sb20 O56',
 u'La16 Si16 O56',
 u'La16 Ti16 O56',
 u'La17 Mn17 O54',
 u'La16 Mn18 O54',
 u'Li4 V24 O60',
 u'Th8 Mo16 O64',
 u'Te4 Mo20 O64',
 u'Sm24 Mo8 O56',
 u'U8 Mo16 O64',
 u'Zr32 N17 O39',
 u'Na32 Si16 Te40',
 u'Na12 Sb20 O56',
 u'Nb8 S16 O64',
 u'Nd16 Ti16 O56',
 u'Nd16 Si16 O56',
 u'Ni8 S16 O64',
 u'Zn16 P16 O56',
 u'Sr4 V24 O60',
 u'Pr16 Si16 O56',
 u'V12 P16 O60',
 u'Sr16 V16 O56',
 u'Sm16 Si16 O56',
 u'Sr20 U12 O56',
 u'Rb12 Sb20 O56',
 u'U8 W16 O64',
 u'Sm24 Ru8 O56',
 u'Sm16 Zr16 O56',
 u'Y16 Ti16 O56',
 u'Sb8 S16 O64',
 u'V24 Pb4 O60',
 u'Ta16 W8 O64',
 u'V16 P16 O56',
 u'Sn16 P16 O56',
 u'Ti8 S16 O64',
 u'Sc16 Ti16 O56',
 u'V8 S16 O64',
 u'W8 S16 O64',
 u'Sn8 S16 O64',
 u'Rb16 P24 Se48',
 u'Bi4 W24 Cl60',
 u'Bi4 Mo24 Cl60',
 u'Bi28 O24 F36',
 u'Cu8 W24 Br56',
 u'Cu8 Mo24 Br56',
 u'La28 Sb44 Br16',
 u'Tl8 W24 Br56',
 u'K8 Mo24 Br56',
 u'K8 W24 Br56',
 u'W16 C8 Cl64',
 u'Ca16 Nb16 O56',
 u'Ca16 P16 O56',
 u'Ca20 W12 O56',
 u'Ca8 Co24 O56',
 u'Ce8 S16 O64',
 u'Ce8 Se16 O64',
 u'Ce16 Si16 O56',
 u'Ce8 W16 O64',
 u'Cu8 W24 Cl56',
 u'P24 N28 Cl36',
 u'Nb24 In4 Cl60',
 u'Cu8 Mo24 Cl56',
 u'Sb16 Cl40 F32',
 u'Nb8 Se8 Cl72',
 u'Na32 Co16 O40',
 u'Co12 P16 O60',
 u'Mg8 Co24 O56']



In [58]:

    
finger_pyro=[fp.get_phi_scaled(structs_88[i],obser='ones') for i in tqdm.tqdm_notebook(range(len(structs_88)))]



In [59]:

    
euclid_scaled=euclidean_distances(finger_pyro)
plt.figure(figsize=(12,12))
plt.imshow(euclid_scaled)
plt.colorbar()









    Out[59]:





<matplotlib.colorbar.Colorbar at 0x1414acd50>



In [60]:

    
fall_ones_scaled=[fp.get_phi_scaled(structs_lim_50[i],obser="ones",delta=0.05) for i in tqdm.tqdm_notebook(range(len(structs_lim_50)))]









    









    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-60-b0025fbda97f> in <module>()
----> 1 fall_ones_scaled=[fp.get_phi_scaled(structs_lim_50[i],obser="ones",delta=0.05) for i in tqdm.tqdm_notebook(range(len(structs_lim_50)))]

/Users/bismayan/Google Drive/machine_learning_project_git/machine_learning_project/src/fingerprint.pyc in get_phi_scaled(struct, obser, n_bins, delta, sigma, kernsize, tol, debug)
    125             b_j=obs[names[j]]
    126             prefac_inner=b_i*b_j/(dist * dist)
--> 127             poin = np.floor(dist / delta)+kernsize
    128             finger[int(poin - kernsize):int(poin + kernsize + 1)] += ((prefac_inner) * kern)
    129 

KeyboardInterrupt:



In [69]:

    
def phi_getter(i):
    phi_ones=fp.get_phi_scaled(i,obser='ones')
    phi_Z=fp.get_phi_scaled(i,obser='Z')
    phi_Chi=fp.get_phi_scaled(i,obser='Chi')
    return list(itertools.chain(phi_ones,phi_Z,phi_Chi))



In [ ]:

    
from multiprocessing import Pool
p=Pool(4)
finger_lt50=np.array(p.map(phi_getter,structs_lim_50))
finger_lt50.shape



In [74]:

    
finger_lt50.shape









    Out[74]:





(10, 300)



In [ ]:

	Entry Number	Space Group	Structure	Structure Type	Description	Authors	Reference
0	25	F -4 3 m	Ag8 Ge Te6	Ag8SiTe6	Structure cristalline de Ag8 Ge Te6 (gamma)	Rysanek, N.; Laruelle, P.; Katty, A.	Acta Crystallographica B (24,1968-38,1982) (19...
1	33	P 1 21/n 1	Pb (W O4)	PbWO4(mP48)	Pb W O4-III (a high-pressure form)	Richter, P.W.; Kruger, G.J.; Pistorius, C.W....	Acta Crystallographica B (24,1968-38,1982) (19...
2	68	R -3 R	Pb0.5 Mo3 Se4	Mo6PbS8	Etude structurale de combinaisons sulfurees et...	Guillevic, J.; Lestrat, H.; Grandjean, D.	Acta Crystallographica B (24,1968-38,1982) (19...
3	208	I 41/a m d Z	Cu Br Te	NaN	Crystal structure of copper bromide telluride	Carkner, P.M.; Haendler, H.M.	Journal of Solid State Chemistry (1976) 18, p...
4	286	P 63	Fe (I O3)3	Fe(IO3)3	Zur Kristallstruktur von Fe I3 O9	Jansen, M.	Journal of Solid State Chemistry (1976) 17, p...