In [1]:
import df_utils as du
import pandas as pd
In [2]:
# Sample data: five rows of three values each, written one row per line.
# Rows 4 and 5 repeat rows 1 and 2.
df = pd.DataFrame(
    [
        [-1, 0, 1],
        [1, 0, -1],
        [0.5, 0, 0.5],
        [-1, 0, 1],
        [1, 0, -1],
    ]
)
# Bare last expression so the notebook renders the frame.
df
Out[2]:
In [3]:
# The triple-quoted block below is a bare string literal: it is never executed.
# It holds the author's notes and a reference copy of the function's source
# (presumably mirroring df_utils.pairwise_correlation -- TODO confirm it is in sync).
"""
def pairwise_correlation(df):
#Print data first to make it easy to check.
print("data:")
print(df,"\n\nP value:")
metrix = pd.DataFrame()
labels=[]
#Use 'iterrows()' to get rows repeatedly.
#There are two for loops.
The outer for loop is used to get each row,
and the inner for loop is to make each row from the outer loop compare to each row(including itself).
#'np.corrcoef(X,Y)' is used to caculate the Pearson correlation coefficient.
It will yield the matrix:
P(X,X) P(X,Y)
P(Y,X) P(Y,Y)
So I use 'np.corrcoef(X,Y)[0,1]' just to get value P(X,Y) at positon (0,1) in matrix.
It can also be used if there are three or more arrays, like 'np.corrcoef(X,Y,Z,......)'.
#Because the output of rows from 'interrows()' is a series, I use 'tolist()' to make it bacome an array,
and then can be used in 'np.corrcoef(X,Y)'.
#I remain the original way I did printing out all the P valuse line by line.
After Monday class, I appended all Ps into a data frame,
and then change the labels of index and columns by using a list 'labels',
and use 'labels.append('row'+str(index+1))' in outer for loop to create labels.
for index, row in df.iterrows():
labels.append('row'+str(index+1))
for index2, row2 in df.iterrows():
P = np.corrcoef(row.tolist(), row2.tolist())[0,1]
print("P value of row%d and row%d is %.2f." %(index+1,index2+1,P))
metrix.loc[index,index2] = P
metrix.columns = labels
metrix.index = labels
print("\nTable of P values")
print(metrix)
return metrix
"""
# Run the imported implementation on the sample frame; per the reference
# source above it returns a table with one correlation per pair of rows.
du.pairwise_correlation(df)
Out[3]:
In [4]:
# The triple-quoted block below is a bare string literal: it is never executed.
# It holds the author's notes and a reference copy of the function's source
# (presumably mirroring df_utils.corr_rowi_rowj -- TODO confirm it is in sync).
"""
def corr_rowi_rowj(df,i,j):
#Print data first to make it easy to check.
print("data:")
print(df,"\n\nP value:")
#'np.corrcoef(X,Y)' is used to caculate the Pearson correlation coefficient.
It will yield the matrix:
P(X,X) P(X,Y)
P(Y,X) P(Y,Y)
So I use 'np.corrcoef(X,Y)[0,1]' just to get value P(X,Y) at positon (0,1) in matrix.
It can also be used if there are three or more arrays, like 'np.corrcoef(X,Y,Z,......)'.
#'iloc[i-1]'is used to locate the row.
'i-1' is used because the difference between index (from 0) and actual row is 1.
#Because the output of rows from 'iloc[i-1]' is a series, I use 'tolist()' to make it bacome an array,
and then can be used in 'np.corrcoef(X,Y)'.
P = np.corrcoef(df.iloc[i-1].tolist(), df.iloc[j-1].tolist())[0,1]
print("P value of row%d and row%d is %.2f." %(i,j,P))
return P
"""
du.corr_rowi_rowj(df,2,3)
# Note: the row arguments are 1-based, so this call compares row 2 (index 1)
# with row 3 (index 2).
Out[4]:
In [5]:
# The triple-quoted block below is a bare string literal: it is never executed.
# It holds the author's notes and a reference copy of the function's source
# (presumably mirroring df_utils.corr_rowi_vs_all1 -- TODO confirm it is in sync).
"""
def corr_rowi_vs_all1(df):
#Print data first to make it easy to check.
print("data:")
print(df,"\n\nP value:")
metrix = pd.DataFrame()
labels = []
#Use 'iterrows()' to get rows repeatedly.
#There are two for loops.
The outer for loop is used to get each row,
and the inner for loop is to make each row from the outer loop compare to each row(including itself).
#There is a if loop in the inner for loop to avoid calculating P between 2 same rows
by checking the index values between 2 rows. I use "P=1" instead of "1" to clearly show it.
#'np.corrcoef(X,Y)' is used to caculate the Pearson correlation coefficient.
It will yield the matrix:
P(X,X) P(X,Y)
P(Y,X) P(Y,Y)
So I use 'np.corrcoef(X,Y)[0,1]' just to get value P(X,Y) at positon (0,1) in matrix.
It can also be used if there are three or more arrays, like 'np.corrcoef(X,Y,Z,......)'.
#Because the output of rows from 'interrows()' is a series, I use 'tolist()' to make it bacome an array,
and then can be used in 'np.corrcoef(X,Y)'.
#I remain the original way I did printing out all the P valuse line by line.
After Monday class, I appended all Ps into a data frame,
and then change the labels of index and columns by using a list 'labels',
and use 'labels.append('row'+str(index+1))' in outer for loop to create labels.
for index, row in df.iterrows():
labels.append('row'+str(index+1))
for index2, row2 in df.iterrows():
if index==index2:
metrix.loc[index,index] = "P=1"
else:
P = np.corrcoef(row.tolist(), row2.tolist())[0,1]
print("P value of row%d and row%d is %.2f." %(index+1,index2+1,P))
metrix.loc[index, index2] = P
metrix.columns = labels
metrix.index = labels
print("\nTables for P values:")
print(metrix)
return metrix
"""
# Run the imported implementation on the sample frame; per the reference
# source above, diagonal cells hold the string "P=1" instead of a number.
du.corr_rowi_vs_all1(df)
Out[5]:
In [6]:
# The triple-quoted block below is a bare string literal: it is never executed.
# It holds the author's notes and a reference copy of the function's source
# (presumably mirroring df_utils.corr_rowi_vs_all2 -- TODO confirm it is in sync).
"""
def corr_rowi_vs_all2(df,i):
#Print data first to make it easy to check.
print("data:")
print(df,"\n\nP value:")
metrix = pd.DataFrame()
labels = []
#Use a for loop and 'iterrows()' to get rows repeatedly.
#There is a if loop in the for loop to avoid calculating P between 2 same rows
by checking the index values between 2 rows. I use "P=1" instead of "1" to clearly show it.
#'np.corrcoef(X,Y)' is used to caculate the Pearson correlation coefficient.
It will yield the matrix:
P(X,X) P(X,Y)
P(Y,X) P(Y,Y)
So I use 'np.corrcoef(X,Y)[0,1]' just to get value P(X,Y) at positon (0,1) in matrix.
It can also be used if there are three or more arrays, like 'np.corrcoef(X,Y,Z,......)'.
#Because the output of rows from 'interrows()' is a series, I use 'tolist()' to make it bacome an array,
and then can be used in 'np.corrcoef(X,Y)'.
#I remain the original way I did printing out all the P valuse line by line.
After Monday class, I appended all Ps into a data frame,
and then change the labels of index and columns by using a list 'labels',
and use 'labels.append('row'+str(index+1))' in outer for loop to create labels.
for index, row in df.iterrows():
labels.append('row'+str(index+1))
if (index+1)==i:
metrix.loc[index,index] = "P=1"
else:
P = np.corrcoef(df.iloc[i-1].tolist(), row.tolist())[0,1]
metrix.loc[i-1,index] = P
print("P value of row%d and row%d is %.2f." %(i,index+1,P))
metrix.columns = labels
metrix.index = ["row"+str(i)]
print("\nTable for P values:")
print(metrix)
return metrix
"""
du.corr_rowi_vs_all2(df,4)
# Note: the row argument is 1-based, so this call selects row 4 (index 3)
# and compares it against every row of df.
Out[6]:
In [7]:
def test_pairwise_correlation():
    """Check ``du.pairwise_correlation`` against a 3-row frame with known results.

    In the fixture, row 2 is the exact negative of row 1 (expected coefficient
    -1) and row 3 is uncorrelated with both (expected coefficient 0); a row
    compared with itself is expected to give 1.

    Raises:
        AssertionError: if an inspected cell of the returned table is not
            within ``tol`` of its expected value.
    """
    df = pd.DataFrame([[-1, 0, 1], [1, 0, -1], [.5, 0, .5]])

    # Compute the table once instead of once per assertion; every call
    # recomputes all pairs, so repeated calls only add work.
    corr = du.pairwise_correlation(df)

    # Compare with an absolute tolerance rather than int(): int() truncates
    # toward zero, so 0.9999999999999998 would become 0 and fail the check,
    # while any value strictly between -1 and 1 would pass as "uncorrelated".
    tol = 1e-9
    assert abs(corr.iloc[0, 0] - 1) < tol, "Diagonal elements not handled properly"
    assert abs(corr.iloc[0, 1] + 1) < tol, "Anticorrelated elements not handled properly"
    assert abs(corr.iloc[0, 2]) < tol, "Uncorrelated elements not handled properly"
    # The table must be symmetric: cell (2, 3) equals cell (3, 2).
    assert abs(corr.iloc[1, 2] - corr.iloc[2, 1]) < tol, "Data not appended properly"
In [8]:
# Run the notebook-local test defined in the previous cell; an AssertionError
# here means one of its checks failed.
test_pairwise_correlation()
In [9]:
def test_corr_rowi_rowj():
    """Check ``du.corr_rowi_rowj`` for a self, an anticorrelated and an uncorrelated pair.

    Row arguments are passed as 1-based row numbers, matching how the
    notebook calls the function elsewhere.

    Raises:
        AssertionError: if a returned coefficient is not within ``tol`` of
            its expected value.
    """
    df = pd.DataFrame([[-1, 0, 1], [1, 0, -1], [.5, 0, .5]])

    # Compare with an absolute tolerance rather than int(): int() truncates
    # toward zero, so 0.9999999999999998 would become 0 and fail the check,
    # while any value strictly between -1 and 1 would pass as "uncorrelated".
    tol = 1e-9
    assert abs(du.corr_rowi_rowj(df, 1, 1) - 1) < tol, "Diagonal elements not handled properly"
    assert abs(du.corr_rowi_rowj(df, 1, 2) + 1) < tol, "Anticorrelated elements not handled properly"
    assert abs(du.corr_rowi_rowj(df, 1, 3)) < tol, "Uncorrelated elements not handled properly"
In [10]:
# Run the notebook-local test defined in the previous cell; an AssertionError
# here means one of its checks failed.
test_corr_rowi_rowj()
In [11]:
def test_corr_rowi_vs_all1():
    """Check ``du.corr_rowi_vs_all1`` against a 3-row frame with known results.

    The diagonal of the returned table is expected to hold a string marker
    rather than a number; off-diagonal cells are expected to hold the
    correlation coefficients (-1 for rows 1/2, 0 for anything against row 3).

    Raises:
        AssertionError: if the diagonal cell is not a string or an
            off-diagonal cell is not within ``tol`` of its expected value.
    """
    df = pd.DataFrame([[-1, 0, 1], [1, 0, -1], [.5, 0, .5]])

    # Compute the table once instead of once per assertion.
    corr = du.corr_rowi_vs_all1(df)

    # isinstance() also accepts str subclasses, unlike an exact type() match.
    assert isinstance(corr.iloc[0, 0], str), "Diagonal elements not handled properly"

    # Compare with an absolute tolerance rather than int(): int() truncates
    # toward zero, so -0.9999999999999998 would become 0 and fail the check,
    # while any value strictly between -1 and 1 would pass as "uncorrelated".
    tol = 1e-9
    assert abs(corr.iloc[0, 1] + 1) < tol, "Anticorrelated elements not handled properly"
    assert abs(corr.iloc[0, 2]) < tol, "Uncorrelated elements not handled properly"
    # The off-diagonal part must be symmetric: cell (2, 3) equals cell (3, 2).
    assert abs(corr.iloc[1, 2] - corr.iloc[2, 1]) < tol, "Data not appended properly"
In [12]:
# Run the notebook-local test defined in the previous cell; an AssertionError
# here means one of its checks failed.
test_corr_rowi_vs_all1()
In [13]:
def test_corr_rowi_vs_all2():
    """Check ``du.corr_rowi_vs_all2`` for row 2 compared against every row.

    The result is expected to be a one-row table: the cell for row 2 against
    itself holds a string marker, the cell against row 1 holds -1 and the
    cell against row 3 holds 0.

    Raises:
        AssertionError: if the self-comparison cell is not a string or a
            numeric cell is not within ``tol`` of its expected value.
    """
    df = pd.DataFrame([[-1, 0, 1], [1, 0, -1], [.5, 0, .5]])

    # Compute the table once instead of once per assertion.
    corr = du.corr_rowi_vs_all2(df, 2)

    # isinstance() also accepts str subclasses, unlike an exact type() match.
    assert isinstance(corr.iloc[0, 1], str), "Diagonal elements not handled properly"

    # Compare with an absolute tolerance rather than int(): int() truncates
    # toward zero, so -0.9999999999999998 would become 0 and fail the check,
    # while any value strictly between -1 and 1 would pass as "uncorrelated".
    tol = 1e-9
    assert abs(corr.iloc[0, 0] + 1) < tol, "Anticorrelated elements not handled properly"
    assert abs(corr.iloc[0, 2]) < tol, "Uncorrelated elements not handled properly"
In [14]:
# Run the notebook-local test defined in the previous cell; an AssertionError
# here means one of its checks failed.
test_corr_rowi_vs_all2()
In [15]:
import test_df_utils as ts
In [16]:
# Run the version of this test that lives in the test_df_utils module
# (presumably the same checks as the notebook-local test -- confirm).
ts.test_pairwise_correlation()
In [17]:
# Run the version of this test that lives in the test_df_utils module
# (presumably the same checks as the notebook-local test -- confirm).
ts.test_corr_rowi_rowj()
In [18]:
# Run the version of this test that lives in the test_df_utils module
# (presumably the same checks as the notebook-local test -- confirm).
ts.test_corr_rowi_vs_all1()
In [19]:
# Run the version of this test that lives in the test_df_utils module
# (presumably the same checks as the notebook-local test -- confirm).
ts.test_corr_rowi_vs_all2()
In [ ]: