In [1]:
%pylab inline
In [2]:
import numpy as np
import scipy.stats as sst
import matplotlib.pyplot as plt
import os
import os.path as osp
from __future__ import print_function
from __future__ import division
import six
In [3]:
CWD = osp.join(osp.expanduser('~'), 'documents','grants_projects','roberto_projects', \
'guillaume_huguet_CNV','File_OK')
filename = 'Imagen_QC_CIA_MMAP_V2_Annotation.tsv'
fullfname = osp.join(CWD, filename)
In [4]:
arr = np.loadtxt(fullfname, dtype='str', comments=None, delimiter='\Tab',
converters=None, skiprows=0, usecols=None, unpack=False, ndmin=0)
In [5]:
EXPECTED_LINES = 19542
expected_nb_values = EXPECTED_LINES - 1
assert arr.shape[0] == EXPECTED_LINES
line0 = arr[0].split('\t')
print(line0)
danger = 'Pvalue_MMAP_V2_sans_intron_and_Intergenic'
score = 'SCORE'
i_danger = line0.index(danger)
i_score = line0.index(score)
print(i_danger, i_score)
In [6]:
# check that all lines have the same number of tab separated elements
larr = np.asarray([len(arr[i].split('\t')) for i in range(arr.shape[0])])
assert not (larr - larr[0]).any() # all element have the same value
In [7]:
dangers = np.asarray([line.split('\t')[i_danger] for line in arr[1:]])
scores = np.asarray([line.split('\t')[i_score] for line in arr[1:]])
# print(np.unique(scores))
assert len(dangers) == expected_nb_values
assert len(scores) == expected_nb_values
In [8]:
#print(len(scores[danger_not_empty]))
#f,b = plt.hist(dangers[danger_not_empty].astype('float'), bin=50)
def str2floats(value, sep=',', comma2point=False):
"""
takes a string with one or several p-values
returns:
combined score dangerosite
"""
if comma2point:
value = value.replace(',', '.')
lst = value.split(sep)
lst = [l.rstrip().lstrip() for l in lst if (l != '')]
try:
return [float(f) for f in lst]
except:
print("value: ", value, "value.split(sep): ", lst)
raise ValueError
def pH1_with_apriori(alpha, pi=0.5, beta=.8):
"""
Returns the probability of H1 if we know power and a priori
"""
return (1 - beta) * pi / ((1 - beta) * pi + alpha * (1 - pi))
def pH1_simple(alpha, pi=None, beta=None):
"""
pi and beta unsused:
"""
return (1. - alpha)
defaultarg = {'pi':0.5, 'beta':0.8}
def danger_score(pvalues, pval2score=pH1_simple, argvals=defaultarg):
"""
take a series of pvalue (one line), return a dangerosity score
"""
pvalues = np.asarray(str2floats(pvalues))
assert np.all( np.logical_and(pvalues <= 1., pvalues >=0)), \
[pvalues, np.logical_and((pvalues <= 1.), (pvalues >=0))]
""" how to combine the p-values?
I want a score that scales with the number of gene affected. Per gene, if the p is zero,
I want my score to be 1. and if the p value is close to 1, the score should be 0.
To start with, I'm setting the score to be 1-p:
res = (1. - pvalues).sum()
The problem with this approach: I would like something closer to the probability of
CNV to be dangerous, which requires an apriori on the probablility that the CNV is dangerous,
and the power of the test (leading to the p value that we are using).
p(CNV dangerous) = (1 - beta) * pi / ((1 - beta) * pi + alpha * (1 - pi))
"""
res = np.asarray([pval2score(p, **argvals) for p in pvalues]).sum()
return res
In [9]:
def test_danger_score(tests, verbose=True):
"""
tests: the list of strings that look like "danger"
results: expected results
"""
for idx, test in enumerate(tests):
pstring = test[0]
func = test[1]
arg = test[2]
result = test[3]
status = True
try:
dger = danger_score(pstring, func, arg)
if verbose: print('dger:',dger)
if abs(dger - result) < np.finfo('float').eps:
pass
else:
if verbose: print(test, " is failing", dger, results[idx])
status = False
except:
if result == "Failed":
pass
else:
if verbose: print(test, " is failing", results[idx])
status = False
return(status)
In [10]:
print(danger_score('1., 0., .5', pH1_with_apriori))
print(danger_score('1.', pH1_with_apriori))
print(danger_score('1.', pH1_with_apriori, {'pi':0.5, 'beta':0.8}))
In [11]:
one_beta_pi = (1. - defaultarg['beta'])*defaultarg['pi']
pH1_res_1 = one_beta_pi / (one_beta_pi + 1. - defaultarg['pi'])
assert abs(pH1_res_1 - pH1_with_apriori(1.)) < np.finfo('float').eps
tests = [('1., 0., .5', pH1_simple, defaultarg, 1.5),
('.2, -.01', pH1_simple, defaultarg, 'Failed'),
('0.1, 0.1, .8',pH1_simple, defaultarg, 2.),
('.0', pH1_simple, defaultarg, 1.),
('.0', pH1_with_apriori, defaultarg, 1.),
('1.', pH1_with_apriori, defaultarg, pH1_res_1)]
assert test_danger_score(tests, verbose=False)
QUESTION pour Guillaume: a quoi correspondent les '' dans la colonne "Pvalue_MMAP_V2_sans_intron_and_Intergenic" (danger)?
Ansewer: cnv for which we have no dangerosity information
In [12]:
danger_not_empty = dangers != ''
danger_scores = dangers[danger_not_empty]
danger_scores = np.asarray([danger_score(pstr, pH1_with_apriori) for pstr in danger_scores])
In [13]:
print("We have {:d} empty score !".format(EXPECTED_LINES -1 - len(danger_scores)))
In [14]:
danger_scores.max()
Out[14]:
In [15]:
h = plt.hist(danger_scores[danger_scores > sst.scoreatpercentile(danger_scores, 99)], bins=100)
In [16]:
#get the scores
scores = np.asarray([line.split('\t')[i_score] for line in arr[1:]])
assert len(scores) == expected_nb_values
In [17]:
# show first 42 values
scores[:42]
Out[17]:
In [18]:
len(np.unique(scores))
Out[18]:
In [19]:
# comma to point for floats
tmp = [str2floats(s, comma2point=True)[0] for s in scores]
In [20]:
a = [print(scores[i], tmp[i]) for i in range(1400,1410)]
In [21]:
#h = plt.hist(tmp, bins=100)
tmp = np.asarray(tmp)
print(tmp.shape)
assert tmp.shape[0] == EXPECTED_LINES - 1
# h = plt.hist(tmp[tmp < sst.scoreatpercentile(tmp, 95)], bins=100)
In [22]:
h = plt.hist(tmp[tmp > sst.scoreatpercentile(tmp, 99)], bins=100)
In [23]:
h = plt.hist(tmp[tmp < 50], bins=100)
In [24]:
print("# CNV with score == 0.: ", (tmp==0.).sum())
print("# CNV with score >=15 < 17.5 : ", np.logical_and(tmp >= 15., tmp < 17.5).sum())
tmp.max()
Out[24]:
QUESTION Guillaume:
Y a t il seulement 27 CNV qui ont un score de "zero" ?
In [25]:
# if the score is zero, put it to maximum value : it means the CNV has a maximum score
clean_score = tmp
clean_score[tmp==0.] = tmp.max()
h = plt.hist(tmp[tmp < 60], bins=100)
In [26]:
# from a copy paste of CNV_scores.xlsx
prob_cnv_vrai = \
"""15 20 80 25
17,5 24 76 25
20 36 64 25
22,5 44 56 25
25 83 17 60
27,5 77 23 44
30 95 5 60
"""
QUESTION Guillaume :
25 -> 27.5 : 83% true
27.5 -> 30 : 77% true
Y a t-il eu inversion ? ou s'agit il de bruit ? Reponse : il s'agit de bruit.
In [27]:
from collections import OrderedDict
p_cnv = OrderedDict({})
for line in prob_cnv_vrai.splitlines():
line = line.split('\t')
# print(line)
p_cnv[float(line[0].replace(',','.'))] = float(line[1])/100.
# if model with linear piecewise
# p_cnv[45.] = 1.
# CORRECTION for a possible inversion - or simply "smooth" the curve !
# p_cnv[25], p_cnv[27.5] = p_cnv[27.5] , p_cnv[25]
print(p_cnv)
QUESTION Guillaume
The last interval -- the one for which proba is 95.0 -- is [30.0, 32.5) ?
In [28]:
# Solution 1 : affine by pieces
def aff(bi, bs, vbi, vbs, x0, y0):
"""
returns the affine function
"""
a = (vbs - vbi)/(bs - bi)
b = y0 - a*x0
def this_aff(x):
return a*x + b
return this_aff
def create_score2prob(p_cnv):
# create a dict that has the affine function
pf_cnv = OrderedDict({})
kcnv = p_cnv.keys()
for idx,k in enumerate(kcnv[:-1]):
k1 = kcnv[idx+1]
pf_cnv[k] = aff(k, k1, p_cnv[k], p_cnv[k1], k, p_cnv[k])
# testing just this
def test_pf_cnv():
assert pf_cnv[15](15) == .20
assert abs(pf_cnv[15](17.5) - pf_cnv[17.5](17.5)) < np.finfo(float).eps
assert abs(pf_cnv[17.5](20) - pf_cnv[20.](20.)) < np.finfo(float).eps
assert abs(pf_cnv[20.](22.5) - pf_cnv[22.5](22.5)) < np.finfo(float).eps
#assert pf_cnv[30](45) == 1.
return True
test_pf_cnv()
# function linear piecewise:
def sc2prob(sc):
"""
returns the proba that cnv with score sc is a cnv;
"""
#
assert sc >= 15.
kcnv = pf_cnv.keys()
# if greater than the last key, returns 1.0
if sc >= kcnv[-1]:
return 1.0
idx = 0
while sc > kcnv[idx+1]: idx += 1
return pf_cnv[kcnv[idx]](sc)
return sc2prob
In [29]:
score2prob = create_score2prob(p_cnv)
scores = np.arange(15,50,1)
probs = [score2prob(sc) for sc in scores]
plt.plot(scores, probs)
Out[29]:
In [30]:
from scipy.interpolate import UnivariateSpline as spl
def create_score2prob_v2(p_cnv):
kpcnv = p_cnv.keys()
lin_funct = spl(kpcnv, p_cnv.values(), k=1)
#x = np.arange(0,50,1)
#plt.plot(x, lin_funct(x), '-', p_cnv.keys(), p_cnv.values(), '+')
(x1, x2) = (kpcnv[0], kpcnv[-1])
a = (lin_funct(x2) - lin_funct(x1))/(x2 - x1)
b = lin_funct(x2) - a*x2
x_where_y_is_1 = (1. - b)/a
x_where_y_is_0 = (-b)/a
def sc2prob(x):
if x < x_where_y_is_0:
return 0.
elif x < x_where_y_is_1:
return lin_funct(x)
else:
return 1.
return sc2prob
x = np.arange(0,50,1)
score2prob = create_score2prob_v2(p_cnv)
plt.plot(x, [score2prob(_) for _ in x], '-', p_cnv.keys(), p_cnv.values(), '+')
Out[30]:
In [31]:
p_scores = [score2prob(sc) for sc in clean_score]
assert len(p_scores) == EXPECTED_LINES -1
In [32]:
# re-loading
CWD = osp.join(osp.expanduser('~'), 'documents','grants_projects','roberto_projects', \
'guillaume_huguet_CNV','File_OK')
filename = 'Imagen_QC_CIA_MMAP_V2_Annotation.tsv'
fullfname = osp.join(CWD, filename)
# in numpy array
arr = np.loadtxt(fullfname, dtype='str', comments=None, delimiter='\Tab',
converters=None, skiprows=0, usecols=None, unpack=False, ndmin=0)
line0 = arr[0].split('\t')
DANGER = 'Pvalue_MMAP_V2_sans_intron_and_Intergenic'
SCORE = 'SCORE'
i_DANGER = line0.index(DANGER)
i_SCORE = line0.index(SCORE)
i_START = line0.index('START')
i_STOP = line0.index('STOP')
i_5pGENE = line0.index("5'gene")
i_3pGENE = line0.index("3'gene")
i_5pDIST = line0.index("5'dist(kb)")
i_3pDIST = line0.index("3'dist(kb)")
#i_LOC = line0.index('Location')
for idx in range(len(line0)):
print(line0[idx],': ', arr[1].split('\t')[idx])
In [33]:
# A function to create unique identifiers of subj, CNV, etc
def cnv_uiid(tsv_arr, columns_names, first_line=None):
"""
columns_names: list of column names with which the uiid are constructed
returns a list if given the full array,
returns a string if given only one line (and the first line)
"""
chr2rm = ''.join([',', '.', ' ']) # others: ['!', '?', ...] ?
if isinstance(tsv_arr, six.string_types) and (first_line != None):
# assume we are given one line of the tsv array
indexes = [first_line.split('\t').index(colname) for colname in columns_names]
uiid = '_'.join([(tsv_arr.split('\t')[ind]).translate(None,chr2rm) for ind in indexes])
else:
uiid = []
indexes = [tsv_arr[0].split('\t').index(colname) for colname in columns_names]
for line in tsv_arr[1:]:
ll = line.split('\t')
uiid.append('_'.join([ll[ind] for ind in indexes]))
return uiid
In [35]:
print(line0)
In [38]:
#names_from = ['START', 'STOP', "5'gene", "3'gene", "5'dist(kb)", "3'dist(kb)"]
#---------- ligne uniques:
names_from = ['IID_projet', 'IID_genotype', "CHR de Merge_CIA_610_660_QC", 'START', 'STOP']
cnv_names = cnv_uiid(arr, names_from)
print("with names from: ", names_from)
print("we have {} unique elements out of {} rows in the tsv".format(
len(np.unique(cnv_names)), len(cnv_names)))
#---------- CNV uniques ?
names_from = ["CHR de Merge_CIA_610_660_QC", 'START', 'STOP']
cnv_names = cnv_uiid(arr, names_from)
print("with names from: ", names_from)
print("we have {} unique elements out of {} rows in the tsv".format(
len(np.unique(cnv_names)), len(cnv_names)))
#---------- CNV uniques ?
"""
names_from = ['START', 'STOP', "5'gene", "3'gene", "5'dist(kb)", "3'dist(kb)"]
cnv_names = cnv_uiid(arr, names_from)
print("with names from: ", names_from)
print("we have {} unique elements out of {} rows in the tsv".format(
len(np.unique(cnv_names)), len(cnv_names)))
"""
#---------- sujets uniques ?
names_from = ['IID_projet'] # , 'IID_genotype']
cnv_names = cnv_uiid(arr, names_from)
print("with names from: ", names_from)
print("we have {} unique elements out of {} rows in the tsv".format(
len(np.unique(cnv_names)), len(cnv_names)))
dangers = np.asarray([line.split('\t')[i_DANGER] for line in arr[1:]])
scores = np.asarray([line.split('\t')[i_SCORE] for line in arr[1:]])
#danger_not_empty = dangers != ''
#print(danger_not_empty.sum())
#print(len(np.unique(cnv_name)))
#print(cnv_name[:10])
In [39]:
#gene_name_set = set([line.split('\t')[i_3pGENE] for line in arr[1:]]) | \
# set([line.split('\t')[i_5pGENE] for line in arr[1:]])
#print('len(gene_name_set): ', len(gene_name_set))
# this creates an dict where the keys are unique indentifiers of cnv, values
# are (dangerosity, probability_cnv)
cnv = OrderedDict({})
names_from = ["CHR de Merge_CIA_610_660_QC", 'START', 'STOP'] #, "5'gene", "3'gene", "5'dist(kb)", "3'dist(kb)"]
blank_dgr = 0
for line in arr[1:]:
lline = line.split('\t')
dgr = lline[i_DANGER]
scr = lline[i_SCORE]
cnv_iid = cnv_uiid(line, names_from, arr[0])
if dgr != '':
add_cnv = (danger_score(lline[i_DANGER], pH1_with_apriori),
score2prob(lline[i_SCORE]))
if cnv_iid in cnv.keys():
cnv[cnv_iid].append(add_cnv)
else:
cnv[cnv_iid] = [add_cnv]
else:
blank_dgr += 1
In [40]:
print(len(cnv), (blank_dgr))
print([k for k in cnv.keys()[:5]])
In [41]:
names_from = ["CHR de Merge_CIA_610_660_QC", 'START', 'STOP']
cnv_names = cnv_uiid(arr, names_from)
print("with names from: ", names_from)
print("we have {} unique elements out of {} rows in the tsv".format(
len(np.unique(cnv_names)), len(cnv_names)))
#print(cnv_names[:5])
# To be check : the 7337 unique cnv should reduce to 3374 when we remove those for which dgr == ''
In [42]:
__kcnv = cnv.keys()
print(len(__kcnv))
for _ in __kcnv[3330:3350]:
print(_,': ',cnv[_])
In [43]:
#for k in cnv.keys():
# if len(cnv[k]) > 1:
# if len(set([cnv[k][i][0] for i in range(len(cnv[k]))])) > 1:
# pass
# # print(cnv[k])
# #print([t[0] for t in cnv[k]])
In [44]:
#print(len(arr), line0)
In [45]:
cnv = OrderedDict({})
#names_from = ['START', 'STOP', "5'gene", "3'gene", "5'dist(kb)", "3'dist(kb)"]
names_from = ['IID_projet']
for line in arr[1:]:
lline = line.split('\t')
dgr = lline[i_DANGER]
scr = lline[i_SCORE]
sub_iid = cnv_uiid(line, names_from, arr[0])
if dgr != '':
add_cnv = (danger_score(lline[i_DANGER], pH1_with_apriori),
score2prob(lline[i_SCORE]))
if sub_iid in cnv.keys():
cnv[sub_iid].append(add_cnv)
else:
cnv[sub_iid] = [add_cnv]
In [46]:
print(len(cnv))
nbcnv = [len(cnv[sb]) for sb in cnv]
hist = plt.hist(nbcnv, bins=50)
print(np.max(np.asarray(nbcnv)))
In [47]:
# definition of dangerosity from a list of cnv
def dangerosity(listofcnvs):
"""
inputs: list tuples (danger_score, proba_cnv)
returns: a dangerosity score
"""
last = -1 #slicing the last
tmp = [np.asarray(t) for t in zip(*listofcnvs)]
return tmp[0].dot(tmp[1])
# or: return np.asarray([dgr*prob for (dgr,prob) in listofcnvs]).cumsum()[last]
In [48]:
for k in range(1,30, 30):
print(cnv[cnv.keys()[k]], ' yields ', dangerosity(cnv[cnv.keys()[k]]))
test_dangerosity_input = [[(1., .5), (1., .5), (1., .5), (1., .5)],
[(2., 1.)],
[(10000., 0.)]]
test_dangerosity_output = [2., 2., 0]
#print( [dangerosity(icnv) for icnv in test_dangerosity_input]) # == test_dangerosity_output
assert( [dangerosity(icnv) for icnv in test_dangerosity_input] == test_dangerosity_output)
In [49]:
outfile = 'dangerosity_cnv.txt'
fulloutfile = osp.join(CWD, outfile)
with open(fulloutfile, 'w') as outf:
for sub in cnv:
outf.write("\t".join([sub, str(dangerosity(cnv[sub]))]) + "\n")
In [ ]:
In [ ]: