In [1]:
# Load the training data. Later cells prune everything except the SMILES
# strings and the feature columns that show positive variance.
import csv
import numpy as np
import pandas as pd
from sklearn import linear_model

# Input/output file locations used throughout the notebook.
train_filename = 'train.csv'
test_filename = 'test.csv'
pred_filename = 'regression.csv'

# Row 0 of the CSV supplies the column names.
df = pd.read_csv(train_filename, header=0)
df.head(10)
Out[1]:
In [2]:
# Keep only the feature columns that actually vary; a zero-variance column
# carries no information for regression. The "smiles" string column is
# excluded from the numeric check.
posVarColNames = []
for col in df.columns.values:
    if col != "smiles" and np.var(df[col]) > 0:
        posVarColNames.append(col)
df = df.reindex(columns=posVarColNames)

# Covariance matrix of the surviving features, used below to spot
# heavily correlated pairs.
cov = df.cov()
cov.head(10)
Out[2]:
In [7]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
In [19]:
sns.set(style="darkgrid")

# Heatmap of the pairwise correlations between the retained feature columns.
# Fixes vs. the original cell:
#  - sns.corrplot was removed from seaborn (>=0.9); sns.heatmap is the
#    modern replacement.
#  - DataFrame.as_matrix() was removed from pandas (>=1.0).
#  - The figure was created twice, leaking the first one; create it once.
#  - corrplot computed correlations of whatever it was handed, so passing
#    the covariance matrix plotted correlations *between covariance-matrix
#    columns*; plot the feature correlation matrix (df.corr()) directly.
f, ax = plt.subplots(figsize=(9, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Correlation is the scale-free view of the covariance computed above.
corr = df.corr()
sns.heatmap(corr, annot=False, cmap=cmap, square=True,
            xticklabels=False, yticklabels=False, ax=ax)
f.tight_layout()
In [23]:
# Persist the heatmap figure `f` from the previous cell to a PDF on disk.
f.savefig('covariance_plot.pdf')
In [ ]: