In [14]:
%matplotlib inline
# used for dataframe manipulation
import pandas as pd
import numpy as np
import json
# used for plotting
import seaborn as sns
import matplotlib.pyplot as plt
# project-local helper module
import util
In [5]:
def save_and_load(featurefile, train_dir, ffs = None, global_feat_dict=None):
    '''
    Load a cached feature set from disk, or generate one and cache it.

    arguments:
        featurefile: path prefix shared by the cache files
            (<prefix>_mat.npz, <prefix>_dict.save,
            <prefix>_t_train.npy, <prefix>_train_ids.npy)
        train_dir: forwarded to extract_feats when generating features;
            not used when loading from the cache
        ffs: forwarded to extract_feats; when None, nothing is generated
            and the cached files are loaded instead
        global_feat_dict: optional feature dictionary. When loading, it is
            returned in place of the dictionary stored on disk (the
            dictionary file is then not read). When generating, it is
            forwarded to extract_feats.
    returns:
        the tuple (X_train, global_feat_dict, t_train, train_ids)
    '''
    # load features from textfile if possible (so we don't waste time recalculating this!)
    if ffs is None:
        print("loading features from file: {}".format(featurefile))
        X_train = util.load_sparse_csr(featurefile + "_mat.npz")
        if global_feat_dict is None:
            # context manager so the handle is closed as soon as parsing ends
            with open(featurefile + "_dict.save") as dict_file:
                global_feat_dict = json.load(dict_file)
        t_train = np.load(featurefile + "_t_train.npy")
        train_ids = np.load(featurefile + "_train_ids.npy")
        print("loaded features")
        return X_train, global_feat_dict, t_train, train_ids

    print("generating feature set and saving to file: {}".format(featurefile))
    # NOTE(review): extract_feats is not defined or imported in the cells
    # visible here -- confirm it is in scope before taking this branch.
    X_train, global_feat_dict, t_train, train_ids = extract_feats(ffs, train_dir, global_feat_dict)
    # context manager guarantees the dictionary is flushed and closed
    # before anything tries to read it back
    with open(featurefile + "_dict.save", "w") as dict_file:
        json.dump(global_feat_dict, dict_file)
    # np.save appends ".npy" itself, matching the names used when loading
    np.save(featurefile + "_train_ids", train_ids)
    np.save(featurefile + "_t_train", t_train)
    # presumably save_sparse_csr appends ".npz" the same way -- TODO confirm
    util.save_sparse_csr(featurefile + "_mat", X_train)
    print("generated and saved features")
    return X_train, global_feat_dict, t_train, train_ids
In [6]:
def toPandasDataFrame(Xarray, feat_dict, classes = None):
    '''
    Build a dense pandas DataFrame from a sparse feature matrix.

    arguments:
        Xarray: a sparse matrix of features (must provide .toarray())
        feat_dict: the column labels for Xarray. When this is a dictionary
            whose values are exactly the column positions 0..n-1 (i.e. it
            maps feature name -> column index), the names are placed in
            column-index order so every label sits on its own column.
            Anything else is handed to pandas unchanged (for a dictionary
            that means its keys, in iteration order).
        classes: optional array of the virus type for each row in Xarray;
            if None, no 'class' column is added
    returns:
        a pandas dataframe with all features and, when classes is given,
        a final column 'class' specifying the virus TYPE as discussed in
        the spec
    '''
    columns = feat_dict
    if isinstance(feat_dict, dict):
        try:
            maps_name_to_position = (
                set(feat_dict.values()) == set(range(len(feat_dict)))
            )
        except TypeError:
            # unhashable values cannot be column positions
            maps_name_to_position = False
        if maps_name_to_position:
            # dict iteration order is unrelated to column position, so
            # order the names by the index each one maps to
            columns = sorted(feat_dict, key=feat_dict.get)

    data = pd.DataFrame(data=Xarray.toarray(), columns=columns)
    if classes is not None:
        data['class'] = pd.Series(classes)
    return data
In [7]:
# Load the cached "generative_features" feature set (ffs is left unset, so
# nothing is regenerated) and view it as a DataFrame.
X_train, global_feat_dict, t_train, train_ids = save_and_load(
    featurefile="generative_features",
    train_dir="train",
)
data = toPandasDataFrame(X_train, global_feat_dict)
# Preview the first 10 rows.
data.head(10)
Out[7]:
In [18]:
# Covariance matrix of the leading feature columns. The .loc label slice is
# inclusive of its end label, so (assuming unique column labels) this selects
# columns 0 through 50, i.e. 51 features.
cov = data.loc[:, :data.columns[50]].cov()
cov.head(10)
Out[18]:
In [19]:
sns.set(style="darkgrid")
# Create exactly one figure: every plt.subplots() call opens a new figure,
# and any extra one would be left open and rendered empty.
f, ax = plt.subplots(figsize=(9, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# .values yields the same ndarray as .as_matrix() and exists in both old and
# current pandas releases.
# NOTE(review): corrplot is given the already-computed covariance matrix as
# its data; confirm it does not derive correlations from its input a second
# time. Newer seaborn releases replace corrplot with heatmap.
sns.corrplot(cov.values, annot=False, sig_stars=False,
             diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()
In [20]:
# Write figure `f` to disk under the name "covariance". No extension is
# given, so the output format is left to matplotlib's default
# (presumably PNG -- confirm against the savefig.format setting).
f.savefig("covariance")
In [ ]: