In [4]:
import sys
sys.path.append("./scripts")
from helpers import *
from proj1_helpers import *
from feature_processing import *
import numpy as np
import csv
from math import sqrt
from matplotlib import pyplot as plt

Prepare the data by standardizing the features and imputing missing values with the column mean.


In [12]:
def prepare_data(dataPath):
    """Load the training CSV and return a standardized feature matrix.

    Parameters
    ----------
    dataPath : str
        Path to the CSV file accepted by ``load_csv_data``.

    Returns
    -------
    np.ndarray
        The standardized features, columns 1..30 of the processed matrix.
    """
    labels, raw_features, row_ids = load_csv_data(dataPath)   # load raw data
    standardized, _ = process_X(raw_features, 2)              # standardize features (mode 2)
    # Keep columns 1..30 only — presumably column 0 is an offset/bias
    # column added by process_X; TODO confirm against feature_processing.
    return standardized[:, 1:31]

Compute the correlation matrix: $\mathrm{Corr}(I,J) = \dfrac{E[IJ] - E[I]\,E[J]}{\sqrt{\mathrm{Var}(I)\,\mathrm{Var}(J)}}$


In [6]:
def compute_correlation(data):
    """Compute the Pearson correlation matrix of the columns of ``data``.

    Uses Corr(I, J) = (E[IJ] - E[I]E[J]) / sqrt(Var(I) * Var(J)), with
    population (ddof=0) means and variances.

    Parameters
    ----------
    data : np.ndarray of shape (n_samples, n_features)
        Numeric feature matrix; every column must have non-zero variance
        (a constant column raises ZeroDivisionError, as in the original).

    Returns
    -------
    np.ndarray of shape (n_features, n_features)
        Symmetric correlation matrix.
    """
    dimension = data.shape[1]  # number of feature columns
    correlation_matrix = np.zeros((dimension, dimension))
    for i in range(dimension):
        col_i = data[:, i]
        # Hoisted out of the inner loop: these depend only on i.
        mean_col_i = np.mean(col_i)
        variance_col_i = np.var(col_i)
        # The matrix is symmetric, so compute only the upper triangle
        # (j >= i) and mirror it — the original full double loop did
        # every pair twice and overwrote the mirrored entry redundantly.
        for j in range(i, dimension):
            col_j = data[:, j]
            mean_IJ = np.mean(np.multiply(col_i, col_j))  # E[IJ]
            mean_col_j = np.mean(col_j)
            variance_col_j = np.var(col_j)
            # Corr(I,J) = (E(IJ) - E(I)E(J)) / sqrt(Var(I)Var(J))
            correlation_coefficient = float(mean_IJ - (mean_col_i * mean_col_j)) / sqrt(variance_col_i * variance_col_j)
            correlation_matrix[i][j] = correlation_coefficient
            correlation_matrix[j][i] = correlation_coefficient
    return correlation_matrix

Format the correlation matrix as rows of comma-separated values and save it to a CSV file.


In [7]:
def parse_and_save(result, labels, outputPath='./analysis/correlation.csv'):
    """Write the correlation matrix to a CSV file with labeled rows/columns.

    Parameters
    ----------
    result : np.ndarray
        Square correlation matrix (shape (d, d)).
    labels : np.ndarray of str
        One label per row/column of ``result`` (length d).
    outputPath : str, optional
        Destination CSV path. Defaults to the original hard-coded
        './analysis/correlation.csv' for backward compatibility.
    """
    import os  # local import keeps this cell self-contained

    # One CSV line per matrix row, prefixed with the row's label.
    rows = []
    for i in range(result.shape[0]):
        rows.append(",".join([str(labels[i])] + [str(result[i][j]) for j in range(result.shape[1])]))

    # Header line: "Labels" followed by every column label.
    header = ",".join(["Labels"] + [str(label) for label in labels])

    # Create the output directory if it is missing — the original
    # crashed with FileNotFoundError when ./analysis did not exist.
    directory = os.path.dirname(outputPath)
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(outputPath, 'w') as file:
        file.write(header)
        file.write('\n')
        for line in rows:
            file.write(line)
            file.write('\n')

Visualize the correlation matrix as a heatmap.


In [8]:
import matplotlib.pyplot as plt  # NOTE(review): redundant — pyplot is already imported in the first cell
def plot_correlation(correlation_matrix, labels, filePath):
    """Render the correlation matrix as a heatmap and save it to disk.

    Parameters
    ----------
    correlation_matrix : np.ndarray
        Square matrix of correlation coefficients.
    labels : sequence of str
        Tick labels for both axes; length should match the matrix
        dimension (presumably it does — TODO confirm, get_header may
        include non-feature columns).
    filePath : str
        Path where the PNG figure is written.
    """
    plt.figure(figsize=(12, 12))
    plt.imshow(correlation_matrix, cmap='RdYlGn', interpolation='none', aspect='auto')
    plt.colorbar()
    plt.xticks(range(len(labels)), labels, rotation='vertical')
    plt.yticks(range(len(labels)), labels);
    # Typo fix: "Bosson" -> "Boson" in the user-facing figure title.
    plt.suptitle('Higgs Boson Correlation Heat Map', fontsize=15, fontweight='bold')
    plt.savefig(filePath)
    plt.close('all')  # release figure memory after saving

In [14]:
def run():
    """Full pipeline: load and standardize the data, compute the column
    correlation matrix, and save the heatmap figure."""
    dataPath = '../data/train.csv'

    features = get_header(dataPath)   # column names used as axis labels
    data = prepare_data(dataPath)     # standardized feature matrix

    correlation_matrix = compute_correlation(data)
    plot_correlation(correlation_matrix, features, "./analysis/correlation.png")

In [15]:
run()


100%|██████████| 5/5 [00:02<00:00,  2.00it/s]

In [ ]: