In [91]:
from __future__ import division
import os
import re
import json

import numpy as np
import pandas as pd
%matplotlib nbagg
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go

In [36]:
def fig_3d(d, n):
    """Build a Plotly 3D surface figure from a DataFrame.

    :param d: DataFrame whose cell values are used as surface heights
        (rows/columns become the two horizontal axes).
    :param n: figure title.
    :return: a plotly ``go.Figure`` ready for ``py.iplot``.
    """
    p_data = [
        go.Surface(
            # FIX: DataFrame.as_matrix() is deprecated (removed in pandas
            # >= 1.0); .values returns the identical ndarray on all versions.
            z=d.values
        )
    ]
    
    layout = go.Layout(
        title=n,
        autosize=False,
        width=500,
        height=500,
        margin=dict(l=65, r=50, b=65, t=90)
    )
    return go.Figure(data=p_data, layout=layout)

def get_reads_data(run, data_dir='/Users/codeunsolved/NGS/NGS-Dashboard/data'):
    """Collect per-amplicon fragment read counts for every sample of a run.

    For each sample directory under ``<data_dir>/BRCA<run>/sample_cover/``,
    reads ``sample_data_pointer.json`` and extracts its ``'frag_reads'`` entry.

    :param run: run identifier, e.g. ``161116`` (appended to the 'BRCA' prefix).
    :param data_dir: base data directory. Parameterized so the function is no
        longer tied to one machine; the default keeps the original hard-coded
        path for backward compatibility.
    :return: DataFrame — rows are amplicons, columns are sample IDs.
    """
    run_data = {}
    dir_data = os.path.join(data_dir, 'BRCA%s' % run, 'sample_cover')
    for sample in os.listdir(dir_data):
        dir_sample = os.path.join(dir_data, sample)
        # Skip stray files (e.g. .DS_Store); only sample directories count.
        if os.path.isdir(dir_sample):
            path_sdp = os.path.join(dir_sample, 'sample_data_pointer.json')
            with open(path_sdp, 'rb') as sdp:
                run_data[sample] = json.loads(sdp.read())['frag_reads']
    return pd.DataFrame(run_data)

def norm_data(d, option='double'):
    """Min-max normalize a reads DataFrame.

    Options:
      'by_s'   -- scale each column (sample) to [0, 1]
      'by_a'   -- scale each row (amplicon) to [0, 1]
      'double' -- apply 'by_s' first, then 'by_a'
    Raises an Exception on any other option value.
    """
    if option == 'by_a':
        # Row-wise normalization is column-wise normalization of the transpose.
        return norm_data(d.T, option='by_s').T
    if option == 'by_s':
        span = d.max() - d.min()
        return (d - d.min()) / span
    if option == 'double':
        return norm_data(norm_data(d, 'by_s'), 'by_a')
    raise Exception("Unknown Option: %s" % option)
        
def coverage_uniforminity(d):
    """Coverage uniformity: percentage of amplicons per sample whose reads
    exceed 0.2x of that sample's mean reads.

    BUG FIX: the original body referenced the module-level global ``data``
    instead of the parameter ``d``, so any other DataFrame passed in was
    silently ignored.

    :param d: DataFrame, rows = amplicons, columns = samples.
    :return: DataFrame indexed by sample name with one column '0.2x'
        holding percentages rounded to 2 decimals.
    """
    return pd.DataFrame(
        [round(len(d[c][d[c] > d[c].mean() * 0.2]) / len(d[c]) * 100, 2) for c in d.columns],
        d.columns, columns=["0.2x"])

In [34]:
# Load the run-161116 fragment-read counts (columns = samples,
# rows = amplicons) and preview the first rows.
data = get_reads_data(161116)
data.head(3)


Out[34]:
NGS161111-2 NGS161111-3 NGS161111-4 NGS161111-5-2 NGS161111-6-2 NGS161111-7-2 NGS161111-8 NGS161114-1 NGS161114-11 NGS161114-12 ... NGS161114-15 NGS161114-16 NGS161114-17 NGS161114-2 NGS161114-3 NGS161114-4 NGS161114-9 NGS161115-6 NGS161115-7 NGS161115-8
chr13-B150A01-32890522-32890740 1980 1263 1073 1245 4843 5554 5384 4517 11863 7591 ... 7138 5091 2932 5764 5784 6835 8695 9452 7933 7434
chr13-B150A02-32893271-32893493 398 124 106 275 4704 3691 6198 5709 12019 7411 ... 8211 5164 5068 6477 5772 6763 9441 11375 8916 8193
chr13-B150A03-32899235-32899405 593 441 252 407 8463 9436 11415 10418 24115 11946 ... 9643 9094 9087 11650 10463 11651 13199 15900 11603 12512

3 rows × 22 columns


In [37]:
# Coverage uniformity at the 0.2x-of-mean threshold for every sample.
cu_0_2 = coverage_uniforminity(data)
print cu_0_2


                 0.2x
NGS161111-2     92.50
NGS161111-3     90.00
NGS161111-4     86.67
NGS161111-5-2   90.83
NGS161111-6-2  100.00
NGS161111-7-2   98.33
NGS161111-8    100.00
NGS161114-1    100.00
NGS161114-11   100.00
NGS161114-12   100.00
NGS161114-13   100.00
NGS161114-14    99.17
NGS161114-15   100.00
NGS161114-16   100.00
NGS161114-17    96.67
NGS161114-2    100.00
NGS161114-3    100.00
NGS161114-4    100.00
NGS161114-9    100.00
NGS161115-6    100.00
NGS161115-7    100.00
NGS161115-8    100.00

In [10]:
# Interactive 3D surface of the raw read counts, published to Plotly.
py.iplot(fig_3d(data, 'BRCA161116'), filename='BRCA161116')


Out[10]:

Method01 两次归一化

对同一批次的数据

  • 先按照sample进行归一化
  • 再按照amplicon进行归一化

归一化方法:Min-Max Normalization

后者这样做是否合理并不清楚


In [11]:
# 3D surface after per-sample (column-wise) min-max normalization.
py.iplot(fig_3d(norm_data(data, 'by_s'), 'BRCA161116_norm_sample'), filename='BRCA161116_norm_sample')


Out[11]:

In [12]:
# 3D surface after double normalization (by sample, then by amplicon).
py.iplot(fig_3d(norm_data(data, 'double'), 'BRCA161116_norm_double'), filename='BRCA161116_norm_double')


Out[12]:

In [13]:
# Doubly-normalized amplicon profile for one sample, as a line plot.
norm_data(data, 'double')['NGS161111-6-2'].plot()


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x106e06e10>

In [14]:
# Same doubly-normalized profile for a second sample, for comparison.
norm_data(data, 'double')['NGS161111-7-2'].plot()


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x106e06e10>

Method01 结论

先对样本进行归一化取得了较好的效果。见图 BRCA161116_norm_sample。

对同一个amplicon,不同sample之间的数据拉到了较统一的水平。

但是再进行对amplicon的归一化并不能达到预期效果。将数据用不同统计方法校准后也不行。

之前认为AB pool之间差异较大,实际上还是Amplicon之间的差异,并不是A或B pool内部有较好的均一性而之间就不行。

以后测试数据附上均一性评估的数据

Method02 按sample归一化后,比较相对比例

  • 先同Method01按sample归一化
  • 再选择一个sample的amplicon的reads数作为基准1,看其他sample与它的比例
  • 去检视每个sample是否有连续多个低于0.5的amplicon;只看单个amplicon的话假阳性会较高

当然这个方法还是需要数据具有较好的均一性。


In [111]:
def plot_brca_largeindel(d, dir_pic):
    """Plot per-amplicon read ratios relative to a reference sample, for
    large-indel screening (Method02).

    The reference sample is the one with the best 0.2x coverage uniformity;
    every other sample's reads are divided by its reads amplicon-by-amplicon.
    Samples with poor overall quality (max ratio < 0.8) or apparent
    over-amplification (min ratio > 1) are dropped with a warning.

    :param d: DataFrame of reads, rows = amplicons, columns = samples.
    :param dir_pic: subdirectory name under ./pic for output pictures
        (created if missing).
    """
    def choose_one(d):
        # Reference = sample with the highest 0.2x coverage uniformity.
        cu_sort = coverage_uniforminity(d).sort_values(by='0.2x')
        one = cu_sort.iloc[-1]
        if one.values[0] < 98:
            print("[WARNING] Max Coverage Uniformity 0.2x < 98%%: %s" % one)
        return one
    
    def get_plot_data():
        # BUG FIX: was choose_one(data) — referenced the module-level global
        # instead of the argument d, ignoring the caller's DataFrame.
        p_data = (d.T / d[choose_one(d).name]).T
        # Iterate over a snapshot of the column names: popping columns while
        # iterating the DataFrame itself can break or skip iteration.
        for s in list(p_data.columns):
            if p_data[s].max() < 0.8:
                print("[WARNING] Sample ID: %s's data quality is low, poped!" % s)
                p_data.pop(s)
            elif p_data[s].min() > 1:
                print("[WARNING] Sample ID: %s's data is over amplified, poped!" % s)
                # BUG FIX: the warning claimed the sample was popped, but the
                # original never removed the column; drop it as announced.
                p_data.pop(s)
        return p_data
        
    dir_pic = os.path.join('.', 'pic/%s' % dir_pic)
    if not os.path.exists(dir_pic):
        print("[WARNING] %s doesn't exist, create it!" % dir_pic)
        os.makedirs(dir_pic)
    
    plot_data = get_plot_data()
    
    # Violin plot of each amplicon's ratio distribution across samples,
    # then per-sample line plot of the ratios.
    plt.violinplot([plot_data.T[a] for a in plot_data.T])
    plt.show()
    plot_data.plot()

In [112]:
# Run the relative-ratio large-indel plots for run 161116.
plot_brca_largeindel(data, '161116')


[WARNING] Sample ID: NGS161111-2's data quality is low, poped!
[WARNING] Sample ID: NGS161111-3's data quality is low, poped!
[WARNING] Sample ID: NGS161111-4's data quality is low, poped!
[WARNING] Sample ID: NGS161111-5-2's data quality is low, poped!
[WARNING] Sample ID: NGS161114-11's data is over amplified, poped!