In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
%matplotlib inline
In [2]:
from pymatgen import Composition
In [3]:
old_icsd = pd.read_csv("../ICSD/icsd-ternaries.csv", sep='\t', header=None,
                       names=["Entry Number", "Space Group", "Structure",
                              "Structure Type", "Description", "Authors", "Reference"])
old_icsd.head()
Out[3]:
In [4]:
old_icsd["Structure Type"].value_counts()
Out[4]:
In [5]:
Structures=[Composition(j).formula for j in old_icsd["Structure"].values]
In [6]:
old_icsd["Structures"]=Structures
In [7]:
import fingerprint as fp
struct_all = fp.read_pickle("struct_all.pickle")  # pickled list of pymatgen structures (helper in fingerprint.py)
In [8]:
all_comps=[x.composition.formula for x in struct_all]
In [9]:
def find_overlap(struct_type):
    # Compositions of the given ICSD structure type that also appear in struct_all
    lis = old_icsd[old_icsd["Structure Type"] == struct_type]["Structures"]
    unique_lis = [str(x) for x in np.unique(lis)]
    overlap = [x for x in unique_lis if x in all_comps]
    print "{} matches found for {}".format(len(overlap), struct_type)
    return overlap
In [10]:
overlap_GdFeO3=find_overlap("Perovskite-GdFeO3")
overlap_122=find_overlap("ThCr2Si2")
overlap_CaTiO3=find_overlap("Perovskite-CaTiO3")
overlap_NaCl=find_overlap("NaCl")
overlap_spinel=find_overlap("Spinel-Al2MgO4")
overlap_delaf=find_overlap("Delafossite-NaCrS2")
In [11]:
def matching_struct(comp_list):
    # Structures whose composition formula matches one of the overlap formulas
    return [x for x in struct_all if x.composition.formula in comp_list]
In [12]:
GdFeO3_structs=matching_struct(overlap_GdFeO3)
one22_structs=matching_struct(overlap_122)
CaTiO3_structs=matching_struct(overlap_CaTiO3)
NaCl_structs=matching_struct(overlap_NaCl)
Spinel_structs=matching_struct(overlap_spinel)
delaf_structs=matching_struct(overlap_delaf)
In [13]:
import itertools
matching_structs=[]
matching_structs.extend(itertools.chain(one22_structs,CaTiO3_structs,Spinel_structs,delaf_structs))
len(matching_structs)
Out[13]:
In [15]:
dict_structs={}
dict_structs.update({x:0 for x in overlap_delaf})
dict_structs.update({x:1 for x in overlap_CaTiO3})
dict_structs.update({x:2 for x in overlap_122})
dict_structs.update({x:3 for x in overlap_spinel})
#dict_structs.update({x:4 for x in overlap_NaCl})
print len(dict_structs)
In [15]:
import tqdm
In [16]:
f_ones=[fp.get_phi(matching_structs[i],obser="ones",rmax=10,delta=0.05) for i in tqdm.tqdm_notebook(range(len(matching_structs)))]
f_Z=[fp.get_phi(matching_structs[i],obser="Z",rmax=10,delta=0.05) for i in tqdm.tqdm_notebook(range(len(matching_structs)))]
f_Chi=[fp.get_phi(matching_structs[i],obser="Chi",rmax=10,delta=0.05) for i in tqdm.tqdm_notebook(range(len(matching_structs)))]
Structures=[i.composition.formula for i in matching_structs]
labels=[dict_structs[i] for i in Structures]
In [17]:
fingerprints=np.array([list(itertools.chain(f_ones[i],f_Z[i],f_Chi[i])) for i in range(len(f_ones))])
In [18]:
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.metrics import adjusted_rand_score
In [19]:
Km=KMeans(n_clusters=4,n_init=250)
clust_km=Km.fit_predict(fingerprints)
print confusion_matrix(labels,clust_km)
print adjusted_rand_score(labels,clust_km)
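Because k-means assigns arbitrary cluster indices, the rows and columns of the confusion matrix above are not aligned with the true labels (the adjusted Rand score is permutation-invariant, so it is unaffected). A minimal sketch of one way to realign them, assuming scipy is available (not part of the original notebook):
from scipy.optimize import linear_sum_assignment
cm = confusion_matrix(labels, clust_km)
row_ind, col_ind = linear_sum_assignment(-cm)  # match clusters to true labels, maximizing agreement
mapping = {cluster: true for true, cluster in zip(row_ind, col_ind)}
clust_km_aligned = np.array([mapping[c] for c in clust_km])
print(confusion_matrix(labels, clust_km_aligned))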
In [20]:
from sklearn.cluster import AgglomerativeClustering
In [21]:
Ag=AgglomerativeClustering(n_clusters=4)
In [22]:
Ag=AgglomerativeClustering(n_clusters=4)
clust_ag=Ag.fit_predict(fingerprints)
print confusion_matrix(labels,clust_ag)
print adjusted_rand_score(labels,clust_ag)
In [23]:
from numpy import random
r=random.RandomState(42)
perm_state=r.permutation(len(matching_structs))
labels=np.array(labels)
labels_perm=labels[perm_state]
fingerprints_perm=fingerprints[perm_state]
In [24]:
from sklearn.svm import SVC
In [25]:
sv=SVC(random_state=42)
sv.fit(fingerprints_perm,labels_perm)
clust_svc=sv.predict(fingerprints_perm)
In [26]:
print confusion_matrix(labels_perm,clust_svc)
print adjusted_rand_score(labels_perm,clust_svc)
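Note that the SVC is fit and then scored on the same data, so the numbers above measure fit rather than generalization. A hedged sketch of a held-out evaluation with recent scikit-learn versions (older releases expose train_test_split under sklearn.cross_validation instead of sklearn.model_selection):
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_tr, X_te, y_tr, y_te = train_test_split(fingerprints_perm, labels_perm,
                                          test_size=0.25, random_state=42)
sv_holdout = SVC(random_state=42).fit(X_tr, y_tr)
print(accuracy_score(y_te, sv_holdout.predict(X_te)))  # accuracy on unseen structures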
In [27]:
from sklearn.svm import LinearSVC
lsv=LinearSVC(random_state=42)
In [28]:
lsv.fit(fingerprints_perm,labels_perm)
Out[28]:
In [29]:
clust_lsv=lsv.predict(fingerprints_perm)
print confusion_matrix(labels_perm,clust_lsv)
print adjusted_rand_score(labels_perm,clust_lsv)
In [30]:
from sklearn.linear_model import LogisticRegression
In [31]:
logist=LogisticRegression()
logist.fit(fingerprints_perm,labels_perm)
clust_logist=logist.predict(fingerprints_perm)
print confusion_matrix(labels_perm,clust_logist)
print adjusted_rand_score(labels_perm,clust_logist)
In [32]:
ones_perm=np.array(f_ones)[perm_state]
Z_perm=np.array(f_Z)[perm_state]
Chi_perm=np.array(f_Chi)[perm_state]
In [33]:
logist2=LogisticRegression()
logist2.fit(ones_perm,labels_perm)
clust_logist2=logist2.predict(ones_perm)
print confusion_matrix(labels_perm,clust_logist2)
print adjusted_rand_score(labels_perm,clust_logist2)
In [34]:
logist3=LogisticRegression()
logist3.fit(Z_perm,labels_perm)
clust_logist3=logist3.predict(Z_perm)
print confusion_matrix(labels_perm,clust_logist3)
print adjusted_rand_score(labels_perm,clust_logist3)
In [35]:
logist4=LogisticRegression()
logist4.fit(Chi_perm,labels_perm)
clust_logist4=logist4.predict(Chi_perm)
print confusion_matrix(labels_perm,clust_logist4)
print adjusted_rand_score(labels_perm,clust_logist4)
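The three single-observable fits above are likewise scored on their training data. A sketch of comparing the observables with cross-validation instead, using scikit-learn's cross_val_score (an assumption of this write-up, not in the original notebook):
from sklearn.model_selection import cross_val_score
for name, X in [("ones", ones_perm), ("Z", Z_perm), ("Chi", Chi_perm)]:
    scores = cross_val_score(LogisticRegression(), X, labels_perm, cv=5)
    print("{}: mean CV accuracy {:.3f}".format(name, scores.mean()))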
In [36]:
from sklearn.decomposition import PCA
In [37]:
label_names=["Delafossite","Perovskite","122","Spinel"]
In [38]:
label_names=np.array(["Delafossite","Perovskite","122","Spinel"])
c_arr=['r','g','y','b']
In [39]:
pca=PCA(n_components=2)
pca_fingerprint=pca.fit_transform(fingerprints_perm)
plt.figure(figsize=(10,10))
for i in range(4):
    plt.scatter(pca_fingerprint[labels_perm == i, 0], pca_fingerprint[labels_perm == i, 1],
                c=c_arr[i], label=label_names[i])
plt.legend()
Out[39]:
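To see how much variance the 2D projection actually retains, the fitted PCA object exposes explained_variance_ratio_ (a quick check, not in the original notebook):
print(pca.explained_variance_ratio_)  # fraction of total variance captured by each component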
In [40]:
from sklearn.manifold import TSNE
In [41]:
ts=TSNE(n_components=2,random_state=42)
tsne_fingerprints=ts.fit_transform(fingerprints_perm)
plt.figure(figsize=(10,10))
for i in range(4):
    plt.scatter(tsne_fingerprints[labels_perm == i, 0], tsne_fingerprints[labels_perm == i, 1],
                c=c_arr[i], label=label_names[i])
plt.legend()
Out[41]:
In [42]:
ts2=TSNE(n_components=2,random_state=42)
tsne_fingerprints=ts2.fit_transform(fingerprints_perm)
pca=PCA(n_components=5)
pca_fingerprint=pca.fit_transform(fingerprints_perm)
combined_finger=np.hstack((pca_fingerprint,tsne_fingerprints))
In [43]:
combined_finger.shape
Out[43]:
In [44]:
sv2=SVC(random_state=42)
sv2.fit(combined_finger,labels_perm)
clust_svc_projected=sv2.predict(combined_finger)
print confusion_matrix(labels_perm,clust_svc_projected)
print adjusted_rand_score(labels_perm,clust_svc_projected)
In [45]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
combined_finger_scaled=scaler.fit_transform(combined_finger)
In [46]:
combined_finger_scaled.shape
Out[46]:
In [47]:
Km2=KMeans(n_clusters=4,n_init=250)
clust_km_projected=Km2.fit_predict(combined_finger_scaled)
print confusion_matrix(labels_perm,clust_km_projected)
print adjusted_rand_score(labels_perm,clust_km_projected)
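For comparison, k-means can also be run on the standardized raw fingerprints rather than on the PCA+t-SNE projection; a minimal sketch reusing the classes already imported above (not in the original notebook):
fingerprints_scaled = StandardScaler().fit_transform(fingerprints_perm)
clust_km_raw = KMeans(n_clusters=4, n_init=250).fit_predict(fingerprints_scaled)
print(adjusted_rand_score(labels_perm, clust_km_raw))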
In [48]:
# Pairwise Euclidean distance matrix between fingerprints: sqrt(x.x + y.y - 2*x.y)
euclid = np.array([[np.sqrt(np.dot(fingerprints[i], fingerprints[i])
                            + np.dot(fingerprints[j], fingerprints[j])
                            - 2*np.dot(fingerprints[i], fingerprints[j]))
                    for i in range(len(fingerprints))]
                   for j in range(len(fingerprints))])
plt.figure(figsize=(12,12))
plt.imshow(euclid)
plt.colorbar()
Out[48]:
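The nested-loop construction of the distance matrix scales poorly with the number of structures. An equivalent, more idiomatic alternative, assuming scikit-learn's pairwise_distances (not used in the original notebook):
from sklearn.metrics import pairwise_distances
euclid_fast = pairwise_distances(fingerprints, metric="euclidean")  # same matrix, computed vectorized
plt.figure(figsize=(12, 12))
plt.imshow(euclid_fast)
plt.colorbar()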