In [1]:
# Uniqueness per level.
# Location of the tab-separated reference log analysed in this notebook
# (directory and file name are split only to keep the line readable).
log = (
    "/project/flatiron2/analysis_SHOGUN/data/references/linear/"
    "rep82_combined.fixed.ubt.gg.log"
)

In [3]:
import numpy as np
import bokeh
import pandas as pd


/export/scratch/miniconda3/envs/analysis_SHOGUN/lib/python3.5/site-packages/matplotlib/__init__.py:841: UserWarning: Found matplotlib configuration in ~/.matplotlib/. To conform with the XDG base directory standard, this configuration location has been deprecated on Linux, and the new location is now /home/grad00/hillm096/.config/matplotlib/. Please move your configuration there to ensure that matplotlib will continue to find it in the future.
  _get_xdg_config_dir())

In [17]:
# Parse the tab-delimited log into a DataFrame. header=None keeps the first
# line as data and labels the columns with integers (0 and 1).
df_gg = pd.read_csv(log, header=None, delimiter="\t")

In [18]:
# Preview the first five rows to sanity-check the parse.
df_gg.head(5)


Out[18]:
0 1
0 k__Viruses;p__ssRNA viruses;c__ssRNA positive-... 339
1 k__Viruses;p__ssDNA viruses;c__Inoviridae;o__;... 278
2 k__Viruses;p__ssRNA viruses;c__ssRNA positive-... 583
3 k__Viruses;p__ssDNA viruses;c__Geminiviridae;o... 170
4 k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacill... 206882

In [19]:
from bokeh.io import output_notebook
from bokeh.charts import Histogram, show

In [20]:
# Configure Bokeh to render plots inline in the notebook (this loads BokehJS).
output_notebook()


Loading BokehJS ...

In [28]:
# Take the second column (index 1) of the frame and cast it to a plain
# integer array. df_gg.values is an object array here because the frame
# mixes strings and integers, hence the explicit cast.
x = df_gg.values[:, 1].astype(int)

# Draw the distribution of those values as a histogram and display it inline.
# NOTE(review): bokeh.charts was split out into the separate `bkcharts`
# package in later Bokeh releases -- confirm the pinned Bokeh version before
# re-running this cell in a newer environment.
hist = Histogram(x)
show(hist)



In [32]:
# Depth of each lineage string in column 0: ranks are separated by ';', so the
# number of ranks is the number of separators plus one.
# Uses the vectorised .str accessor instead of a Python-level comprehension
# that bound `_` (conventionally "unused") as a live loop variable. ';' has no
# special meaning as a regex, so the count equals str.count(';'). Missing
# lineages stay NaN instead of raising AttributeError mid-loop.
df_gg['level'] = df_gg[0].str.count(";") + 1

In [33]:
# Inspect the derived column; pandas abbreviates the display of a long Series.
df_gg.loc[:, 'level']


Out[33]:
0        8
1        8
2        8
3        8
4        8
5        8
6        8
7        8
8        8
9        8
10       8
11       8
12       8
13       8
14       8
15       8
16       8
17       8
18       8
19       8
20       8
21       8
22       8
23       8
24       8
25       8
26       8
27       8
28       8
29       8
        ..
14533    8
14534    8
14535    8
14536    8
14537    8
14538    8
14539    8
14540    8
14541    8
14542    8
14543    8
14544    8
14545    8
14546    8
14547    8
14548    8
14549    8
14550    8
14551    8
14552    8
14553    8
14554    8
14555    8
14556    8
14557    8
14558    8
14559    8
14560    8
14561    8
14562    8
Name: level, Length: 14563, dtype: int64