In [1]:
# Load the raw training data. Downstream cells drop the
# zero-variance feature columns before any modeling.
import csv
import numpy as np
import pandas as pd
from sklearn import linear_model

train_filename = 'train.csv'
test_filename = 'test.csv'
pred_filename = 'regression.csv'

# Row 0 of the CSV holds the column names.
df = pd.read_csv(train_filename, header=0)

# Peek at the first few rows to sanity-check the load.
df.head(10)


Out[1]:
smiles feat_001 feat_002 feat_003 feat_004 feat_005 feat_006 feat_007 feat_008 feat_009 ... feat_248 feat_249 feat_250 feat_251 feat_252 feat_253 feat_254 feat_255 feat_256 gap
0 c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n... 0 0 0 0 1 0 1 0 0 ... 1 0 0 0 0 0 0 0 0 1.19
1 C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si... 1 0 0 0 1 0 1 0 0 ... 1 0 0 1 0 0 0 0 0 1.60
2 [nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2... 1 0 0 0 1 1 1 0 0 ... 1 0 0 0 1 0 0 0 0 1.49
3 [nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13... 1 0 0 0 1 1 1 0 0 ... 1 0 0 0 1 0 0 0 0 1.36
4 c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1 0 0 0 0 1 0 1 0 0 ... 1 0 0 0 0 0 0 0 0 1.98
5 C1=Cc2cnc3cc4cc(-c5scc6[nH]ccc56)c5ccccc5c4cc3... 1 0 0 0 1 1 1 0 0 ... 1 0 0 0 0 0 0 0 0 1.81
6 c1ncc(s1)-c1cnc2c(c1)oc1c2ccc2ccccc12 0 0 0 0 1 0 1 0 0 ... 1 0 0 0 0 0 0 0 0 2.91
7 c1sc(-c2ccc3c(c2)sc2c3c3=CCC=c3c3cccnc23)c2[se... 1 0 0 0 1 0 1 0 0 ... 1 0 0 0 0 0 0 0 0 2.17
8 c1ccc(o1)-c1cc2cc3cc4c5c[nH]cc5ccc4cc3cc2o1 0 0 0 0 1 1 1 0 0 ... 1 0 0 0 0 0 0 0 0 2.19
9 [nH]1ccc2c3c[nH]cc3c3cc(-c4cncs4)c4=CCC=c4c3c12 1 0 0 0 1 1 1 0 0 ... 1 0 0 0 0 0 0 0 0 1.71

10 rows × 258 columns


In [2]:
# Keep only the feature columns that actually vary: a zero-variance
# column carries no information for the regression. The "smiles"
# check runs first so np.var is never applied to the string column.
posVarColNames = []
for colName in df.columns.values:
    if colName == "smiles":
        continue
    if np.var(df[colName]) > 0:
        posVarColNames.append(colName)
df = df.reindex(columns=posVarColNames)

# Covariance matrix of the surviving columns; used next to spot
# heavily correlated features worth pruning.
cov = df.cov()
cov.head(10)


Out[2]:
feat_001 feat_005 feat_006 feat_007 feat_025 feat_037 feat_044 feat_068 feat_069 feat_072 ... feat_200 feat_208 feat_218 feat_225 feat_226 feat_243 feat_248 feat_251 feat_252 gap
feat_001 0.229833 -4.653836e-06 -0.022276 -0.006160 0.137995 0.128322 1.801034e-03 -0.039533 0.127190 -0.033189 ... 2.831680e-04 -0.005676 0.103999 -0.026799 -0.001413 -0.022276 -0.014717 0.078323 0.013356 -0.053720
feat_005 -0.000005 1.299984e-05 0.000006 0.000013 -0.000008 0.000005 6.540307e-08 0.000008 0.000005 0.000010 ... 1.028301e-08 0.000001 0.000004 0.000004 0.000013 0.000006 0.000012 0.000003 0.000000 0.000002
feat_006 -0.022276 6.209456e-06 0.249501 -0.006395 -0.023490 -0.023016 -7.360579e-04 -0.008038 -0.023756 -0.026795 ... -2.338214e-04 -0.002994 -0.017046 0.174740 0.001069 0.249501 -0.013587 -0.014564 0.014967 -0.003985
feat_007 -0.006160 1.268619e-05 -0.006395 0.023557 -0.008087 -0.002467 1.214485e-04 0.014997 -0.001800 0.018607 ... 1.909476e-05 0.000967 -0.007191 -0.005006 0.000091 -0.006395 0.022146 -0.003955 -0.002424 -0.000152
feat_025 0.137995 -7.988846e-06 -0.023490 -0.008087 0.236884 -0.016812 4.506808e-04 -0.035545 -0.018547 -0.020568 ... -1.319101e-04 -0.002839 0.178527 -0.024126 -0.000886 -0.023490 -0.020639 0.061949 -0.001952 -0.083165
feat_037 0.128322 4.659894e-06 -0.023016 -0.002467 -0.016812 0.229965 2.322625e-03 -0.021549 0.227936 -0.028729 ... -9.753642e-05 -0.005382 -0.014980 -0.021324 -0.002369 -0.023016 -0.003218 0.056245 -0.005554 -0.003679
feat_044 0.001801 6.540307e-08 -0.000736 0.000121 0.000451 0.002323 5.005694e-03 -0.002301 0.002301 -0.000964 ... -3.979525e-06 -0.000164 -0.000139 -0.000533 -0.000344 -0.000736 -0.001131 0.003930 -0.000128 -0.000328
feat_068 -0.039533 8.076284e-06 -0.008038 0.014997 -0.035545 -0.021549 -2.300521e-03 0.235298 -0.020153 0.009518 ... 2.755899e-04 -0.002885 -0.026424 -0.000338 0.001364 -0.008038 0.032505 -0.077976 -0.001217 -0.013140
feat_069 0.127190 4.618788e-06 -0.023756 -0.001800 -0.018547 0.227936 2.300533e-03 -0.020153 0.229060 -0.027517 ... -9.503528e-05 -0.005287 -0.014933 -0.021006 -0.002381 -0.023756 -0.002435 0.053775 -0.005436 -0.003285
feat_072 -0.033189 1.002054e-05 -0.026795 0.018607 -0.020568 -0.028729 -9.639461e-04 0.009518 -0.027517 0.176662 ... -2.487110e-04 -0.002833 -0.010378 -0.021258 0.000481 -0.026795 0.014087 -0.018271 -0.009906 -0.010017

10 rows × 32 columns


In [7]:
%matplotlib inline

import seaborn as sns
import matplotlib.pyplot as plt

In [19]:
sns.set(style="darkgrid")

# Single figure only — the original called plt.subplots twice,
# leaking a stray empty figure (and left unused rs/d variables
# copied from a seaborn gallery example).
f, ax = plt.subplots(figsize=(9, 9))

# Diverging palette so positive and negative values are visually distinct.
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# sns.corrplot and DataFrame.as_matrix() are both removed in current
# seaborn/pandas; heatmap is the replacement. Plot the feature
# correlation matrix directly — the original passed the *covariance
# matrix* in as data, which would have correlated the covariance
# matrix's columns rather than the features themselves.
sns.heatmap(df.corr(), cmap=cmap, square=True,
            xticklabels=False, yticklabels=False, ax=ax)
f.tight_layout()



In [23]:
# Persist the heatmap figure (f, created in the plotting cell above)
# to disk for the write-up.
f.savefig('covariance_plot.pdf')

In [ ]: