In [25]:
import numpy as np
import pandas as pd
# Load the training data from train.csv (relative path, resolved against the working directory).
train=pd.read_csv('train.csv')
import numpy as np
def kin_energy(random_vec):
    """Kinetic (informational) energy of a sample of discrete values.

    The energy is the sum of the squared relative frequencies of the
    distinct values found in ``random_vec``.

    Parameters
    ----------
    random_vec : array-like exposing ``.shape``
        Observed values. Frequencies are the per-value counts divided by
        ``random_vec.shape[0]``.

    Returns
    -------
    numpy.float64
        Sum of squared relative frequencies.
    """
    _, counts = np.unique(random_vec, return_counts=True)
    probabilities = counts / random_vec.shape[0]
    return np.square(probabilities).sum()
def ic(vector1, vector2):
    """Information coefficient (IC) of two samples of discrete values.

    The IC is the dot product of the two empirical probability vectors,
    taken class by class: for every class label, the relative frequency of
    that label in ``vector1`` is multiplied by its relative frequency in
    ``vector2`` and the products are summed. A label that is missing from
    either sample has probability 0 there and contributes nothing.

    Probabilities are matched by class label, not by their position in the
    sorted list of unique values, so the two samples must use the same
    coding for their classes.

    Parameters
    ----------
    vector1, vector2 : array-like
        One-dimensional samples (NumPy arrays, pandas Series, lists, ...).

    Returns
    -------
    numpy.float64
        Sum over shared labels of ``P1(label) * P2(label)``; ``0.0`` when
        the samples share no label.
    """
    values1 = np.asarray(vector1)
    values2 = np.asarray(vector2)

    classes1, counts1 = np.unique(values1, return_counts=True)
    classes2, counts2 = np.unique(values2, return_counts=True)
    prob1 = counts1 / values1.shape[0]
    prob2 = counts2 / values2.shape[0]

    # Only labels present in both samples have a non-zero product, so it is
    # enough to pick the probabilities of the shared labels from each side.
    _, idx1, idx2 = np.intersect1d(
        classes1, classes2, assume_unique=True, return_indices=True
    )
    return np.dot(prob1[idx1], prob2[idx2])
def o(vector1, vector2):
    """Onicescu information correlation of two samples.

    Computes ``ic(vector1, vector2)`` and divides it by the square root of
    the product of the two kinetic energies
    (``kin_energy(vector1) * kin_energy(vector2)``).
    """
    coefficient = ic(vector1, vector2)
    normaliser = np.sqrt(kin_energy(vector1) * kin_energy(vector2))
    return coefficient / normaliser
In [26]:
# Preview the first ten rows of the training data.
train.head(10)
Out[26]:
In [83]:
# Columns to analyse: P30 through P37 were picked because they look categorical;
# P1 and P5 are included in the selection as well.
names=["P1","P5","P30","P31","P32","P33","P34","P35","P36","P37"]
In [84]:
# Keep only the columns listed in `names`. The result follows the column
# order of `train`, and names that are absent from `train` are skipped.
tr = train[
    [column for column in train.columns if column in names]
]
In [85]:
# Preview the first ten rows of the selected columns.
tr.head(10)
Out[85]:
In [86]:
# Despite its name, `rows` holds the number of columns in `tr`; it is used
# as the side length of the square correlation matrix.
rows = len(tr.columns)
rows
Out[86]:
In [87]:
# Allocate a square rows-by-rows array of zeros to hold the pairwise correlations.
matrix= np.zeros((rows,rows))
In [88]:
# Fill the correlation matrix pair by pair. Only pairs with j >= i are
# visited; each value is written to both [i, j] and [j, i].
# `chech_simmetry` records, for every visited pair, whether o() returns
# exactly the same value when its two arguments are swapped.
chech_simmetry = []
for i in range(rows):
    first = tr[names[i]]
    for j in range(i, rows):
        second = tr[names[j]]
        cor = o(first, second)
        matrix[i, j] = cor
        matrix[j, i] = cor
        # `cor` already is the forward value, so only the swapped call is
        # evaluated here instead of recomputing both directions.
        chech_simmetry.append(cor == o(second, first))
In [89]:
# Wrap the correlation values in a DataFrame (rebinds `matrix`) and display it.
matrix=pd.DataFrame(matrix)
matrix
Out[89]:
In [90]:
# Label the columns with the feature names (modifies `matrix` in place).
matrix.columns=names
In [97]:
# Label the rows with the feature names. A single plain assignment is used:
# chaining a second target onto `matrix.index.set_names` would overwrite that
# method on the index object with the list of names.
matrix.index = names
In [98]:
# Display the labelled correlation matrix.
matrix
Out[98]:
In [101]:
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
In [123]:
# `corr` is a second name for the same object as `matrix` (no copy is made).
corr=matrix
In [124]:
# Display the object bound to `corr`.
corr
Out[124]:
In [127]:
import matplotlib
import matplotlib.pyplot as plt
# Switch Matplotlib to the 'ggplot' style sheet for the figures that follow.
matplotlib.style.use('ggplot')
In [159]:
def correlation_matrix(matrix, names, title='TFI KINETIC CORRELATION'):
    """Draw a square matrix as a heat map with labelled axes and a colour bar.

    Parameters
    ----------
    matrix : 2-D array-like
        Values to draw; passed straight to ``Axes.imshow``.
    names : sequence of str
        Tick labels, one per row/column, in the same order as the rows and
        columns of ``matrix``.
    title : str, optional
        Figure title. Defaults to ``'TFI KINETIC CORRELATION'``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    from matplotlib import pyplot as plt

    fig, ax1 = plt.subplots()

    # 'jet' resampled to 130 levels. pyplot.get_cmap is used because
    # matplotlib.cm.get_cmap is no longer available in recent Matplotlib,
    # and the colormap is now actually handed to imshow.
    cmap = plt.get_cmap('jet', 130)
    cax = ax1.imshow(matrix, cmap=cmap)
    ax1.grid(True)
    ax1.set_title(title)

    # Pin one tick per cell before labelling; otherwise the labels are
    # attached to the auto-located ticks and end up on the wrong cells.
    positions = list(range(len(names)))
    ax1.set_xticks(positions)
    ax1.set_yticks(positions)
    ax1.set_xticklabels(names, fontsize=12)
    ax1.set_yticklabels(names, fontsize=12)

    fig.colorbar(cax, ticks=[.1, .2, .3, .4, .5, .6, .7, .8, .9])
    plt.show()
# Draw the heat map for `matrix`, using `names` as the axis labels.
correlation_matrix(matrix,names)
In [162]:
def correlation_matrix(matrix, names, title='TFI PEARSON CORRELATION'):
    """Draw a square matrix as a heat map with labelled axes and a colour bar.

    Parameters
    ----------
    matrix : 2-D array-like
        Values to draw; passed straight to ``Axes.imshow``.
    names : sequence of str
        Tick labels, one per row/column, in the same order as the rows and
        columns of ``matrix``.
    title : str, optional
        Figure title. Defaults to ``'TFI PEARSON CORRELATION'``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    from matplotlib import pyplot as plt

    fig, ax1 = plt.subplots()

    # 'jet' resampled to 130 levels. pyplot.get_cmap is used because
    # matplotlib.cm.get_cmap is no longer available in recent Matplotlib,
    # and the colormap is now actually handed to imshow.
    cmap = plt.get_cmap('jet', 130)
    cax = ax1.imshow(matrix, cmap=cmap)
    ax1.grid(True)
    ax1.set_title(title)

    # Pin one tick per cell before labelling; otherwise the labels are
    # attached to the auto-located ticks and end up on the wrong cells.
    positions = list(range(len(names)))
    ax1.set_xticks(positions)
    ax1.set_yticks(positions)
    ax1.set_xticklabels(names, fontsize=12)
    ax1.set_yticklabels(names, fontsize=12)

    fig.colorbar(cax, ticks=[.1, .2, .3, .4, .5, .6, .7, .8, .9])
    plt.show()
# Draw the heat map for the pairwise correlations of the selected columns
# (DataFrame.corr uses Pearson correlation by default).
# NOTE(review): tr.corr() follows the column order of `tr`, while the tick
# labels follow `names` - confirm the two orders agree.
correlation_matrix(tr.corr(),names)
In [ ]: