1.2 Testing Kmeans with supervised learning and rescaled fingerprints -checkpoint



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
%matplotlib inline


/usr/local/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

In [2]:
from pymatgen import Composition

Reading old Icsd ternaries csv file


In [3]:
old_icsd=pd.read_csv("../ICSD/icsd-ternaries.csv",sep='\t',names=["Entry Number","Space Group","Structure","Structure Type","Description","Authors","Reference"],header=None)
old_icsd.head()


Out[3]:
Entry Number Space Group Structure Structure Type Description Authors Reference
0 25 F -4 3 m Ag8 Ge Te6 Ag8SiTe6 Structure cristalline de Ag8 Ge Te6 (gamma) Rysanek, N.; Laruelle, P.; Katty, A. Acta Crystallographica B (24,1968-38,1982) (19...
1 33 P 1 21/n 1 Pb (W O4) PbWO4(mP48) Pb W O4-III (a high-pressure form) Richter, P.W.; Kruger, G.J.; Pistorius, C.W.... Acta Crystallographica B (24,1968-38,1982) (19...
2 68 R -3 R Pb0.5 Mo3 Se4 Mo6PbS8 Etude structurale de combinaisons sulfurees et... Guillevic, J.; Lestrat, H.; Grandjean, D. Acta Crystallographica B (24,1968-38,1982) (19...
3 208 I 41/a m d Z Cu Br Te NaN Crystal structure of copper bromide telluride Carkner, P.M.; Haendler, H.M. Journal of Solid State Chemistry (1976) 18, p...
4 286 P 63 Fe (I O3)3 Fe(IO3)3 Zur Kristallstruktur von Fe I3 O9 Jansen, M. Journal of Solid State Chemistry (1976) 17, p...

In [4]:
old_icsd["Structure Type"].value_counts()


Out[4]:
Spinel-Al2MgO4             1922
ThCr2Si2                   1887
Perovskite-GdFeO3          1179
NaCl                       1080
Perovskite-CaTiO3           987
TiNiSi-MgSrSi               903
ZrNiAl-Fe2P                 864
Laves(cub)-Cu2Mg            808
Mn12Th                      739
Heusler-AlCu2Mn             691
Laves(2H)-MgZn2             670
PbClF/Cu2Sb                 625
Delafossite-NaCrS2          600
CaCu5                       468
BaCuSn2-CeNi(1-x)Si2        445
Chalcopyrite-CuFeS2         414
Pyrochlore-NaCa(Nb2O6)F     404
K4Si23                      398
Fluorite-CaF2               385
Th2Zn17                     361
AlB2                        337
Heusler(alloy)-AlLiSi       331
Sphalerite-ZnS(cF8)         301
MnP                         299
Perovskite-NdAlO3           289
Zircon-ZrSiO4               275
Si2U3                       263
CaFe2O4                     263
Th3P4                       261
Auricupride-AuCu3           257
                           ... 
NaAsSe2                       1
Perovskite-NaBaLiNiF6         1
Co2Sr3O6-x                    1
(La,Sr)(Ga,Mg)O3-x            1
Na2.4Fe11O16                  1
Fe2P2O7                       1
SrTb2Fe2O7                    1
Na2Si2O5(oP36)                1
Pb9Pd13                       1
Ni2SrP2                       1
Sr5Co4O12                     1
Ni12P5                        1
K1-xFeF3                      1
SrSn4                         1
CaSi2(hR6)                    1
K1.4Sn2.2Bi7.4Se14            1
OsO4                          1
Pt2Si3                        1
Na(UO2)4O2(OH)5(H2O)5         1
Li4SrN2                       1
MgSi(OH)6                     1
MoMn4SiC                      1
Bi2Sr2Nb2.5Fe.5O12            1
Ba3TiNb4O15                   1
SrCuO2+x                      1
Sr8(FeN3)2FeN2                1
BiPd3                         1
In2OPO4                       1
Ni3Sn4                        1
CaTiSiO5                      1
Name: Structure Type, dtype: int64

In [5]:
Structures=[Composition(j).formula for j in old_icsd["Structure"].values]

In [6]:
old_icsd["Structures"]=Structures

In [7]:
import fingerprint as fp
struct_all=s_all=fp.read_pickle("struct_all.pickle")

In [8]:
all_comps=[x.composition.formula for x in struct_all]

In [9]:
def find_overlap(struct_type):
    lis=old_icsd[old_icsd["Structure Type"]==struct_type]["Structures"]
    unique_lis=[str(x) for x in np.unique(lis)]
    overlap=[x for x in unique_lis if x in all_comps]
    print "{} matches found for {}".format(len(overlap),struct_type)
    return overlap

In [10]:
overlap_GdFeO3=find_overlap("Perovskite-GdFeO3")
overlap_122=find_overlap("ThCr2Si2")
overlap_CaTiO3=find_overlap("Perovskite-CaTiO3")
overlap_NaCl=find_overlap("NaCl")
overlap_spinel=find_overlap("Spinel-Al2MgO4")
overlap_delaf=find_overlap("Delafossite-NaCrS2")


57 matches found for Perovskite-GdFeO3
244 matches found for ThCr2Si2
143 matches found for Perovskite-CaTiO3
34 matches found for NaCl
18 matches found for Spinel-Al2MgO4
222 matches found for Delafossite-NaCrS2

In [11]:
def matching_struct(comp_list):
    return[x for x in struct_all if x.composition.formula in comp_list]

In [12]:
GdFeO3_structs=matching_struct(overlap_GdFeO3)
one22_structs=matching_struct(overlap_122)
CaTiO3_structs=matching_struct(overlap_CaTiO3)
NaCl_structs=matching_struct(overlap_NaCl)
Spinel_structs=matching_struct(overlap_spinel)
delaf_structs=matching_struct(overlap_delaf)

In [13]:
import itertools
matching_structs=[]
matching_structs.extend(itertools.chain(one22_structs,CaTiO3_structs,Spinel_structs,delaf_structs))
len(matching_structs)


Out[13]:
627

In [15]:
dict_structs={}
dict_structs.update({x:0 for x in overlap_delaf})
dict_structs.update({x:1 for x in overlap_CaTiO3})
dict_structs.update({x:2 for x in overlap_122})
dict_structs.update({x:3 for x in overlap_spinel})
#dict_structs.update({x:4 for x in overlap_NaCl})
print len(dict_structs)


627

In [15]:
import tqdm

In [16]:
f_ones=[fp.get_phi(matching_structs[i],obser="ones",rmax=10,delta=0.05) for i in tqdm.tqdm_notebook(range(len(matching_structs)))]
f_Z=[fp.get_phi(matching_structs[i],obser="Z",rmax=10,delta=0.05) for i in tqdm.tqdm_notebook(range(len(matching_structs)))]
f_Chi=[fp.get_phi(matching_structs[i],obser="Chi",rmax=10,delta=0.05) for i in tqdm.tqdm_notebook(range(len(matching_structs)))]
Structures=[i.composition.formula for i in matching_structs]
labels=[dict_structs[i] for i in Structures]





In [17]:
fingerprints=np.array([list(itertools.chain(f_ones[i],f_Z[i],f_Chi[i])) for i in range(len(f_ones))])

In [18]:
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.metrics import adjusted_rand_score

In [19]:
Km=KMeans(n_clusters=4,n_init=250)
clust_km=Km.fit_predict(fingerprints)
print confusion_matrix(labels,clust_km)
print adjusted_rand_score(labels,clust_km)


[[ 18  69  78  57]
 [ 15  37  41  50]
 [133  99  11   1]
 [  4   3   6   5]]
0.164250307488

In [20]:
from sklearn.cluster import AgglomerativeClustering

In [21]:
Ag=AgglomerativeClustering(n_clusters=4)

In [22]:
Ag=AgglomerativeClustering(n_clusters=4)
clust_ag=Ag.fit_predict(fingerprints)
print confusion_matrix(labels,clust_ag)
print adjusted_rand_score(labels,clust_ag)


[[100  48  41  33]
 [ 49  21  43  30]
 [ 12 145   0  87]
 [  6  11   1   0]]
0.163697127181

Lets try supervised learning


In [23]:
from numpy import random
r=random.RandomState(42)
perm_state=r.permutation(len(matching_structs))
labels=np.array(labels)
labels_perm=labels[perm_state]
fingerprints_perm=fingerprints[perm_state]

In [24]:
from sklearn.svm import SVC

In [25]:
sv=SVC(random_state=42)
sv.fit(fingerprints_perm,labels_perm)
clust_svc=sv.predict(fingerprints_perm)

In [26]:
print confusion_matrix(labels_perm,clust_svc)
print adjusted_rand_score(labels_perm,clust_svc)


[[222   0   0   0]
 [  2 140   1   0]
 [  1   0 243   0]
 [  7   0   1  10]]
0.960100502991

In [27]:
from sklearn.svm import LinearSVC
lsv=LinearSVC(random_state=42)

In [28]:
lsv.fit(fingerprints_perm,labels_perm)


Out[28]:
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0)

In [29]:
clust_lsv=lsv.predict(fingerprints_perm)
print confusion_matrix(labels_perm,clust_lsv)
print adjusted_rand_score(labels_perm,clust_lsv)


[[222   0   0   0]
 [  0 143   0   0]
 [  0   0 244   0]
 [  0   0   0  18]]
1.0

In [30]:
from sklearn.linear_model import LogisticRegression

In [31]:
logist=LogisticRegression()

logist.fit(fingerprints_perm,labels_perm)

clust_logist=logist.predict(fingerprints_perm)
print confusion_matrix(labels_perm,clust_logist)
print adjusted_rand_score(labels_perm,clust_logist)


[[222   0   0   0]
 [  0 143   0   0]
 [  0   0 244   0]
 [  0   0   0  18]]
1.0

In [32]:
ones_perm=np.array(f_ones)[perm_state]
Z_perm=np.array(f_Z)[perm_state]
Chi_perm=np.array(f_Chi)[perm_state]

In [33]:
logist2=LogisticRegression()

logist2.fit(ones_perm,labels_perm)
clust_logist2=logist2.predict(ones_perm)
print confusion_matrix(labels_perm,clust_logist2)
print adjusted_rand_score(labels_perm,clust_logist2)


[[212   4   5   1]
 [  8 135   0   0]
 [  8   1 235   0]
 [  3   1   0  14]]
0.866898948153

In [34]:
logist3=LogisticRegression()

logist3.fit(Z_perm,labels_perm)
clust_logist3=logist3.predict(Z_perm)
print confusion_matrix(labels_perm,clust_logist3)
print adjusted_rand_score(labels_perm,clust_logist3)


[[204  13   5   0]
 [ 18 122   3   0]
 [  2   2 240   0]
 [  0   0   0  18]]
0.824594942179

In [35]:
logist4=LogisticRegression()

logist4.fit(Chi_perm,labels_perm)
clust_logist4=logist4.predict(Chi_perm)
print confusion_matrix(labels_perm,clust_logist4)
print adjusted_rand_score(labels_perm,clust_logist4)


[[220   2   0   0]
 [ 11 131   1   0]
 [  0   0 244   0]
 [  6   0   1  11]]
0.923791715633

Lets try Projection techniques


In [36]:
from sklearn.decomposition import PCA

In [37]:
label_names=["Delaffosite","Perovskite","122","Spinel"]

In [38]:
label_names=np.array(["Delaffosite","Perovskite","122","Spinel"])
c_arr=['r','g','y','b']

In [39]:
pca=PCA(n_components=2)

pca_fingerprint=pca.fit_transform(fingerprints_perm)

plt.figure(figsize=(10,10))
for i in range(4):
    plt.scatter(pca_fingerprint[(labels_perm==i),0],pca_fingerprint[(labels_perm==i),1],c=c_arr[i],label=label_names[i])
plt.legend()


Out[39]:
<matplotlib.legend.Legend at 0x11b505890>

In [40]:
from sklearn.manifold import TSNE

In [41]:
ts=TSNE(n_components=2,random_state=42)
tsne_fingerprints=ts.fit_transform(fingerprints_perm)
plt.figure(figsize=(10,10))
for i in range(4):
    plt.scatter(tsne_fingerprints[(labels_perm==i),0],tsne_fingerprints[(labels_perm==i),1],c=c_arr[i],label=label_names[i])
plt.legend()


Out[41]:
<matplotlib.legend.Legend at 0x119718b10>

In [42]:
ts2=TSNE(n_components=2,random_state=42)
tsne_fingerprints=ts2.fit_transform(fingerprints_perm)
pca=PCA(n_components=5)
pca_fingerprint=pca.fit_transform(fingerprints_perm)
combined_finger=np.hstack((pca_fingerprint,tsne_fingerprints))

In [43]:
combined_finger.shape


Out[43]:
(627, 7)

In [44]:
sv2=SVC(random_state=42)
sv2.fit(combined_finger,labels_perm)
clust_svc_projected=sv2.predict(combined_finger)
print confusion_matrix(labels_perm,clust_svc_projected)
print adjusted_rand_score(labels_perm,clust_svc_projected)


[[222   0   0   0]
 [  0 143   0   0]
 [  0   0 244   0]
 [  1   0   0  17]]
0.997242139921

In [45]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
combined_finger_scaled=scaler.fit_transform(combined_finger)

In [46]:
combined_finger_scaled.shape


Out[46]:
(627, 7)

In [47]:
Km2=KMeans(n_clusters=4,n_init=250)
clust_km_projected=Km2.fit_predict(combined_finger_scaled)
print confusion_matrix(labels_perm,clust_km_projected)
print adjusted_rand_score(labels_perm,clust_km_projected)


[[ 45  26  31 120]
 [ 31  47  24  41]
 [133  13  93   5]
 [ 11   6   0   1]]
0.177999189098

Lets create similarity matrices


In [48]:
euclid=np.array([[np.sqrt(np.dot(fingerprints[i],fingerprints[i])+np.dot(fingerprints[j],fingerprints[j])-2*np.dot(fingerprints[i],fingerprints[j])) for i in range(len(fingerprints))] for j in range(len(fingerprints))])
plt.figure(figsize=(12,12))
plt.imshow(euclid)
plt.colorbar()


Out[48]:
<matplotlib.colorbar.Colorbar at 0x136fd8c10>