In [4]:
import sys
sys.path.append("./scripts")
from helpers import *
from proj1_helpers import *
from feature_processing import *
import numpy as np
import csv
from math import sqrt
from matplotlib import pyplot as plt
Prepare the data by standardizing the features and imputing missing values with the mean.
In [12]:
def prepare_data(dataPath):
    """Load the CSV at ``dataPath`` and return the processed feature columns.

    The file is read with the project helper ``load_csv_data`` and the raw
    feature matrix is passed to ``process_X``. According to the notebook's own
    notes, ``process_X`` standardizes the features and imputes missing values
    with the mean; that helper is defined outside this notebook, so the exact
    behaviour (and the meaning of its second argument ``2``) should be
    confirmed there.

    Returns columns 1 through 30 of the processed matrix, i.e. column 0 is
    dropped.
    """
    # Only the raw feature matrix is needed; the other two values are unused.
    _targets, raw_features, _row_ids = load_csv_data(dataPath)
    processed_features, _extra = process_X(raw_features, 2)
    # NOTE(review): column 0 is skipped -- presumably a bias/offset column
    # added by process_X; verify against feature_processing.
    return processed_features[:, 1:31]
Compute the correlation matrix: $\mathrm{Corr}(I, J) = \dfrac{E[IJ] - E[I]\,E[J]}{\sqrt{\mathrm{Var}(I)\,\mathrm{Var}(J)}}$
In [6]:
def compute_correlation(data):
    """Compute the Pearson correlation matrix between the columns of ``data``.

    Corr(I, J) = Cov(I, J) / sqrt(Var(I) * Var(J)), using population
    (divide-by-N) moments, which is what ``np.var`` computes by default.

    The columns are mean-centred *before* the products are formed. The
    algebraically equivalent form E[IJ] - E[I]E[J] subtracts two large,
    nearly equal numbers when a column has a large mean and loses most of
    its precision; centring first avoids that cancellation.

    Parameters
    ----------
    data : 2-D array-like of shape (n_samples, n_features)
        One column per feature.

    Returns
    -------
    numpy.ndarray of shape (n_features, n_features)
        Symmetric matrix of correlation coefficients. Every entry that
        involves a column whose variance is zero is NaN (the coefficient is
        undefined there) instead of raising ``ZeroDivisionError``.
    """
    data = np.asarray(data, dtype=float)
    # Indexing shape[1] keeps the requirement that the input is 2-D.
    n_samples, _n_features = data.shape[0], data.shape[1]

    centered = data - data.mean(axis=0)
    # 0/0 (constant columns, or no rows at all) is expected to give NaN here,
    # so silence the corresponding floating-point warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        # One matrix product yields every pairwise covariance at once.
        covariance = np.dot(centered.T, centered) / n_samples
        std = np.sqrt(np.diag(covariance))
        correlation_matrix = covariance / np.outer(std, std)
    return correlation_matrix
Format the correlation matrix as CSV rows and save it to a CSV file.
In [7]:
def parse_and_save(result, labels, filePath='./analysis/correlation.csv'):
    """Write a labelled matrix to a CSV file.

    The first line is ``Labels,<label 0>,<label 1>,...``. Each following line
    is the label of row ``i`` followed by the values of ``result[i]``. Values
    are written with ``str()`` and no quoting or escaping is applied, so labels
    must not contain commas or newlines.

    Parameters
    ----------
    result : numpy.ndarray, 2-D
        Matrix to save, one CSV line per row.
    labels : sequence
        One label per row/column. Any sequence works (list, tuple, ndarray),
        and every label is converted with ``str()`` for both the header and
        the row prefix.
    filePath : str, optional
        Destination file. Defaults to ``./analysis/correlation.csv``; the
        containing directory must already exist.

    Raises
    ------
    IndexError
        If ``labels`` has fewer entries than ``result`` has rows.
    """
    # Coerce once so the header and the row prefixes are formatted identically.
    labels = [str(label) for label in labels]

    header = ",".join(["Labels"] + labels)
    rows = []
    for i in range(result.shape[0]):
        cells = [labels[i]]
        cells.extend(str(result[i][j]) for j in range(result.shape[1]))
        rows.append(",".join(cells))

    with open(filePath, 'w') as out:
        out.write(header)
        out.write('\n')
        for row in rows:
            out.write(row)
            out.write('\n')
Visualize the correlation matrix as a heatmap.
In [8]:
import matplotlib.pyplot as plt
def plot_correlation(correlation_matrix, labels, filePath):
    """Render a correlation matrix as a heatmap and save it to ``filePath``.

    Parameters
    ----------
    correlation_matrix : 2-D array-like
        Square matrix of correlation coefficients.
    labels : sequence
        Tick labels, used for both axes in order.
    filePath : str
        Destination image path, passed to ``Figure.savefig``.

    The figure is closed after saving, so nothing is left open (or displayed
    inline) by this call. Only the figure created here is closed; any other
    open figures are left untouched.
    """
    fig, ax = plt.subplots(figsize=(12, 12))
    # Correlations lie in [-1, 1]; pinning the colour limits keeps the
    # midpoint of the diverging colormap at 0 regardless of the data range.
    image = ax.imshow(correlation_matrix, cmap='RdYlGn', interpolation='none',
                      aspect='auto', vmin=-1, vmax=1)
    fig.colorbar(image, ax=ax)

    tick_positions = list(range(len(labels)))
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(labels, rotation='vertical')
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(labels)

    fig.suptitle('Higgs Boson Correlation Heat Map', fontsize=15, fontweight='bold')
    fig.savefig(filePath)
    plt.close(fig)
In [14]:
def run(dataPath='../data/train.csv', fileName="./analysis/correlation.png"):
    """Run the correlation analysis end to end and save the heatmap.

    Steps: read the column names with ``get_header``, build the feature
    matrix with ``prepare_data``, compute its correlation matrix and plot it.

    Parameters
    ----------
    dataPath : str, optional
        CSV file to analyse. Defaults to ``../data/train.csv``.
    fileName : str, optional
        Where the heatmap image is written. Defaults to
        ``./analysis/correlation.png``.
    """
    # NOTE(review): get_header comes from a helper module; its labels are
    # assumed to line up one-to-one with the columns returned by
    # prepare_data -- confirm against the helper.
    features = get_header(dataPath)
    data = prepare_data(dataPath)
    correlation_matrix = compute_correlation(data)
    plot_correlation(correlation_matrix, features, fileName)
In [15]:
# Execute the pipeline: load the training data, compute the correlation
# matrix and save the heatmap image.
run()
In [ ]: