In [20]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from bokeh.plotting import figure, show, output_notebook
from sklearn.decomposition import PCA

from custom import load_sequence_and_metadata, seq2chararray, encode_array, compute_seq_lengths, get_density_interval


plt.style.use('fivethirtyeight')
sns.set_context('talk')

%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [2]:
# Load data
# Single call to the project helper imported from `custom`; its two return values are
# bound to `sequences` and `metadata`.
# NOTE(review): a later cell treats `metadata` as a DataFrame with 'Collection Date' and
# 'Name' columns — confirm against custom.load_sequence_and_metadata.
sequences, metadata = load_sequence_and_metadata()

In [13]:
# Derive the collection year of every record. The element-wise attribute lookup is kept
# so this works for any values that expose `.year`.
metadata['Year'] = metadata['Collection Date'].apply(lambda collected: collected.year)

# Select 'Name' before counting: the previous form counted every column of the frame per
# year and then threw away all but one. The resulting Series is the same.
sequences_per_year = metadata.groupby('Year')['Name'].count()

# Draw on an explicitly created Axes rather than relying on pyplot's implicit current axes.
fig, ax = plt.subplots()
sequences_per_year.plot(ax=ax)
ax.set_title('Number of Sequences Per Year')
ax.set_ylabel('Number of Sequences')
plt.show()



In [27]:
# Read the metadata table that carries the embedding coordinates.
meta_coords = pd.read_csv('data/metadata_with_embeddings.csv')

# Keep only the three coordinate columns (the name `array` is used again by the next cell).
array = meta_coords.loc[:, ['coords0', 'coords1', 'coords2']]

# The recorded output shows two length-3 arrays, one value per coordinate column.
# NOTE(review): presumably the lower/upper bounds of a 99% density interval — confirm
# against custom.get_density_interval.
lowp, highp = get_density_interval(99, array)
(lowp, highp)


Out[27]:
(array([-1.62802434, -4.02198646, -0.79146308]),
 array([ 3.00617146,  0.3219904 ,  3.30449963]))

In [30]:
# Display the three-column coordinate frame as the cell result; pandas abbreviates the
# middle rows in the rendered table.
array


Out[30]:
coords0 coords1 coords2
0 0.918940 -1.174675 0.871852
1 1.288200 -1.232958 0.798981
2 0.831635 -1.563137 2.395415
3 3.006171 -0.969287 1.302068
4 1.470726 0.321990 3.304500
5 1.227757 -1.556512 1.652497
6 0.022043 -2.085446 2.359227
7 -0.148276 -1.450171 1.902529
8 1.518494 -0.856068 1.071871
9 1.227757 -1.556512 1.652497
10 -0.118792 -1.631921 2.102484
11 1.801143 -1.598644 1.551610
12 1.470726 0.321990 3.304500
13 -0.118792 -1.631921 2.102484
14 2.441803 -0.911016 1.256986
15 1.801143 -1.598644 1.551610
16 1.470726 0.321990 3.304500
17 -0.046337 -1.960277 2.297898
18 -0.118792 -1.631921 2.102484
19 1.470726 0.321990 3.304500
20 -0.046337 -1.960277 2.297898
21 -0.118792 -1.631921 2.102484
22 -0.118792 -1.631921 2.102484
23 2.771851 -1.085633 1.298355
24 1.493231 -1.324761 1.873742
25 1.106338 -1.567368 2.446098
26 0.022043 -2.085446 2.359227
27 -0.137974 -1.954751 2.048233
28 2.803096 -1.060387 1.415775
29 1.557137 -0.937651 1.780598
... ... ... ...
13343 -0.023649 -2.053885 2.349242
13344 -0.096931 -1.959386 2.210831
13345 -0.145467 -1.361705 1.142086
13346 0.078348 -1.477327 2.253168
13347 -0.741128 -2.192901 2.914381
13348 -0.534840 -2.141206 2.858298
13349 1.179843 0.191758 2.854454
13350 0.184662 -1.652853 2.476905
13351 -0.010600 -1.319169 2.427523
13352 -0.024472 -1.635905 1.815091
13353 -0.579322 -0.916727 1.031197
13354 0.479878 -2.350614 2.619806
13355 0.151900 -1.849754 2.196189
13356 0.249665 -1.980036 2.394748
13357 0.068630 -1.922177 2.133150
13358 0.273144 -2.067154 2.432444
13359 -0.261155 -1.289115 1.311853
13360 0.135078 -1.212672 1.505061
13361 0.902746 -0.476329 2.201073
13362 0.862971 -3.604417 0.606099
13363 1.387338 -1.326220 1.562776
13364 1.401712 -1.112762 1.710803
13365 0.945322 -3.376436 0.436639
13366 0.580914 -3.405534 0.482366
13367 2.803096 -1.060387 1.415775
13368 1.193006 -1.211506 2.009974
13369 1.327766 -0.830434 2.577443
13370 2.722209 -1.042559 1.311239
13371 1.223868 -1.283525 2.154937
13372 1.327765 -0.830435 2.577443

13373 rows × 3 columns


In [ ]:
# Route Bokeh output to inline notebook cells. The Bokeh names (`figure`, `show`,
# `output_notebook`) are imported in the notebook's first cell with the other imports,
# so the full dependency list is visible in one place.
output_notebook()

In [ ]:
# Create an empty Bokeh figure and keep a handle to it in `p`; no glyphs are added to it
# in the cells visible here.
p = figure()

In [ ]: