In [14]:
%matplotlib inline

# used for dataframe manipulation
import pandas as pd
import numpy as np
import json

# used for plotting
import seaborn as sns
import matplotlib.pyplot as plt

# person files
import util

In [5]:
def save_and_load(featurefile, train_dir, ffs = None, global_feat_dict=None):
    # load features from textfile if possible (so we don't waste time recalculating this!)
    if ffs is None:
        print "loading features from file: {}".format(featurefile)
        X_train = util.load_sparse_csr(featurefile + "_mat.npz")
        global_feat_dict = json.load(open(featurefile + "_dict.save")) if global_feat_dict is None else global_feat_dict
        t_train = np.load(featurefile + "_t_train.npy")
        train_ids = np.load(featurefile + "_train_ids.npy")
        print "loaded features"
        return X_train, global_feat_dict, t_train, train_ids
    else:
        print "generating feature set and saving to file: {}".format(featurefile)
        X_train, global_feat_dict, t_train, train_ids = extract_feats(ffs, train_dir, global_feat_dict)
        json.dump(global_feat_dict, open(featurefile + "_dict.save", "w"))
        np.save(featurefile + "_train_ids", train_ids)
        np.save(featurefile + "_t_train", t_train)
        util.save_sparse_csr(featurefile + "_mat", X_train)
        print "generated and saved features"
        return X_train, global_feat_dict, t_train, train_ids

In [6]:
def toPandasDataFrame(Xarray, feat_dict, classes = None):
    '''
    arguments:
        a sparse numpy matrix of features
        a dictionary mapping column indexes to column names
        a numpy array of virus type for each element row in Xarray
            if none, it simply does not include this information in the matrix
        
    returns:
        a pandas dataframe with all features and a final column 'class'
        specifying the virus TYPE as discussed in the spec
    '''
    data = pd.DataFrame(data=Xarray.toarray(), columns=feat_dict)
    if classes is not None: 
        data['class'] = pd.Series(classes)
    return data

In [7]:
X_train, global_feat_dict, t_train, train_ids = save_and_load(featurefile="generative_features", train_dir="train")
data = toPandasDataFrame(X_train, global_feat_dict)
data.head(10)


loading features from file: generative_features
loaded features
Out[7]:
valve 00020142_least ytc8mzct yellow 0007fcc4_least 00010158_least sleep 00094ac0_least 000a4a98_least 154112_least ... zj9 zv9 00010222_least 0003012c_least 09bc329c3fb7972865ebce1fff08a3e8_least zzfwdfhj 00030108_least zv0 zv7 zv6
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
5 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
8 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

10 rows × 3289 columns


In [18]:
# calculate covariance matrix of the first 30 features
cov = data.loc[:, :data.columns[50]].cov()
cov.head(10)


Out[18]:
valve 00020142_least ytc8mzct yellow 0007fcc4_least 00010158_least sleep 00094ac0_least 000a4a98_least 154112_least ... 044116df442b1f1cf72a6919766ada744bf40452_least zdg zdf zda 000a0bf0_least 234_least zist ztuxmfifglituhm 000800ac_least 0002011c_least
valve 0.005185 -0.000080 -0.000097 -0.000002 -0.000105 -0.000001 -0.000002 -0.000001 -0.000001 -0.000002 ... -0.000014 -0.000245 -0.000028 -0.000007 -0.000005 -0.000007 -0.000012 -0.000013 -0.000003 -0.000011
00020142_least -0.000080 0.301841 0.362971 -0.000100 0.234855 -0.000060 -0.000100 -0.000060 -0.000060 -0.000080 ... -0.000662 0.549386 -0.001324 -0.000341 -0.000241 -0.000321 -0.000562 -0.000602 -0.000140 -0.000502
ytc8mzct -0.000097 0.362971 0.436583 -0.000121 0.281805 -0.000072 -0.000121 -0.000072 -0.000072 -0.000097 ... -0.000797 0.660770 -0.001594 -0.000411 -0.000290 -0.000387 -0.000676 -0.000725 -0.000169 -0.000604
yellow -0.000002 -0.000100 -0.000121 0.008101 -0.000131 -0.000002 -0.000003 -0.000002 -0.000002 -0.000002 ... -0.000017 -0.000307 -0.000035 -0.000009 -0.000006 -0.000008 -0.000015 -0.000016 -0.000004 -0.000013
0007fcc4_least -0.000105 0.234855 0.281805 -0.000131 0.300080 -0.000079 -0.000131 -0.000079 -0.000079 0.003785 ... -0.000867 0.646577 -0.001733 -0.000446 -0.000315 -0.000420 -0.000735 -0.000788 -0.000184 -0.000656
00010158_least -0.000001 -0.000060 -0.000072 -0.000002 -0.000079 0.002916 -0.000002 -0.000001 -0.000001 -0.000001 ... -0.000010 -0.000184 -0.000021 -0.000005 -0.000004 -0.000005 -0.000009 -0.000009 -0.000002 -0.000008
sleep -0.000002 -0.000100 -0.000121 -0.000003 -0.000131 -0.000002 0.008101 -0.000002 -0.000002 -0.000002 ... -0.000017 -0.000307 0.016173 -0.000009 -0.000006 -0.000008 -0.000015 -0.000016 -0.000004 -0.000013
00094ac0_least -0.000001 -0.000060 -0.000072 -0.000002 -0.000079 -0.000001 -0.000002 0.002916 -0.000001 -0.000001 ... -0.000010 -0.000184 -0.000021 -0.000005 -0.000004 0.007775 -0.000009 -0.000009 -0.000002 -0.000008
000a4a98_least -0.000001 -0.000060 -0.000072 -0.000002 -0.000079 -0.000001 -0.000002 -0.000001 0.002916 -0.000001 ... -0.000010 -0.000184 -0.000021 -0.000005 -0.000004 0.007775 -0.000009 -0.000009 -0.000002 -0.000008
154112_least -0.000002 -0.000080 -0.000097 -0.000002 0.003785 -0.000001 -0.000002 -0.000001 -0.000001 0.005185 ... -0.000014 -0.000245 -0.000028 -0.000007 -0.000005 -0.000007 -0.000012 -0.000013 -0.000003 -0.000011

10 rows × 51 columns


In [19]:
sns.set(style="darkgrid")
f, ax = plt.subplots(figsize=(9, 9))

f, ax = plt.subplots(figsize=(9, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.corrplot(cov.as_matrix(), annot=False, sig_stars=False,
             diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()



In [20]:
f.savefig("covariance")

In [ ]: