In [14]:
%pylab inline
In [15]:
import numpy as np
import scipy.stats as sst
import matplotlib.pyplot as plt
import os
import os.path as osp
from __future__ import print_function
from __future__ import division
import six
import cnv_util as util
from datetime import datetime
reload(util)
Out[15]:
In [16]:
CWD = osp.join(osp.expanduser('~'), 'documents','grants_projects','roberto_projects', \
'guillaume_huguet_CNV','File_OK')
filename = 'Imagen_QC_CIA_MMAP_V2_Annotation.tsv'
fullfname = osp.join(CWD, filename)
In [17]:
arr = np.loadtxt(fullfname, dtype='str', comments=None, delimiter='\Tab',
converters=None, skiprows=0, usecols=None, unpack=False, ndmin=0)
In [18]:
EXPECTED_LINES = 19542
expected_nb_values = EXPECTED_LINES - 1
assert arr.shape[0] == EXPECTED_LINES
line0 = arr[0].split('\t')
print(line0)
danger = 'Pvalue_MMAP_V2_sans_intron_and_Intergenic'
score = 'SCORE'
i_danger = line0.index(danger)
i_score = line0.index(score)
print(i_danger, i_score)
In [19]:
# check that all lines have the same number of tab separated elements
larr = np.asarray([len(arr[i].split('\t')) for i in range(arr.shape[0])])
assert not (larr - larr[0]).any() # all element have the same value
In [20]:
dangers = np.asarray([line.split('\t')[i_danger] for line in arr[1:]])
scores = np.asarray([line.split('\t')[i_score] for line in arr[1:]])
# print(np.unique(scores))
assert len(dangers) == expected_nb_values
assert len(scores) == expected_nb_values
In [21]:
assert util._test_danger_score_1()
assert util._test_danger_score()
QUESTION pour Guillaume: a quoi correspondent les '' dans la colonne "Pvalue_MMAP_V2_sans_intron_and_Intergenic" (danger)?
Ansewer: cnv for which we have no dangerosity information
In [22]:
"""
danger_not_empty = dangers != ''
danger_scores = dangers[danger_not_empty]
danger_scores = np.asarray([util.danger_score(pstr, util.pH1_with_apriori)
for pstr in danger_scores])
""";
In [23]:
reload(util)
#get the scores
scores = np.asarray([line.split('\t')[i_score] for line in arr[1:]])
assert len(scores) == expected_nb_values
print(len(np.unique(scores)))
#tmp_score = np.asarray([util.str2floats(s, comma2point=True, sep=' ')[0] for s in scores])
assert scores.shape[0] == EXPECTED_LINES - 1
In [24]:
# h = plt.hist(tmp[tmp > sst.scoreatpercentile(tmp, 99)], bins=100)
# h = plt.hist(tmp[tmp < 50], bins=100)
"""
print("# CNV with score == 0.: ", (tmp==0.).sum())
print("# CNV with score >=15 < 17.5 : ", np.logical_and(tmp >= 15., tmp < 17.5).sum())
tmp.max()
""";
In [29]:
scoresf = np.asarray([util.str2floats(s, comma2point=True, sep=' ')[0]
for s in scores])
print(scoresf.max(), scoresf.min(),(scoresf==0).sum())
#clean_score = util.process_scores(scores)
#h = plt.hist(clean_score[clean_score < 60], bins=100)
#h = plt.hist(scoresf[scoresf < 60], bins=100)
h = plt.hist(scoresf, bins=100, range=(0,150))
In [27]:
# Creating a function from score to proba from Guillaume's values
p_cnv = util._build_dict_prob_cnv()
#print(p_cnv.keys())
#print(p_cnv.values())
#### Definition with a piecewise linear function
#score2prob = util.create_score2prob_lin_piecewise(p_cnv)
#scores = np.arange(15,50,1)
#probs = [score2prob(sc) for sc in scores]
#plt.plot(scores, probs)
#### Definition with a corrected regression line
score2prob = util.create_score2prob_lin(p_cnv)
#x = np.arange(0,50,1)
#plt.plot(x, [score2prob(_) for _ in x], '-', p_cnv.keys(), p_cnv.values(), '+')
In [28]:
p_scores = [score2prob(sc) for sc in clean_score]
assert len(p_scores) == EXPECTED_LINES -1
In [29]:
# re-loading
reload(util)
CWD = osp.join(osp.expanduser('~'), 'documents','grants_projects','roberto_projects', \
'guillaume_huguet_CNV','File_OK')
filename = 'Imagen_QC_CIA_MMAP_V2_Annotation.tsv'
fullfname = osp.join(CWD, filename)
# in numpy array
arr = np.loadtxt(fullfname, dtype='str', comments=None, delimiter='\Tab',
converters=None, skiprows=0, usecols=None, unpack=False, ndmin=0)
line0 = arr[0].split('\t')
i_DANGER = line0.index('Pvalue_MMAP_V2_sans_intron_and_Intergenic')
i_SCORE = line0.index('SCORE')
i_START = line0.index('START')
i_STOP = line0.index('STOP')
i_5pGENE = line0.index("5'gene")
i_3pGENE = line0.index("3'gene")
i_5pDIST = line0.index("5'dist(kb)")
i_3pDIST = line0.index("3'dist(kb)")
#i_LOC = line0.index('Location')
scores = np.asarray([line.split('\t')[i_SCORE] for line in arr[1:]])
clean_score = util.process_scores(scores)
max_score = clean_score.max()
In [30]:
print(line0)
In [31]:
#names_from = ['START', 'STOP', "5'gene", "3'gene", "5'dist(kb)", "3'dist(kb)"]
#---------- ligne uniques:
names_from = ['IID_projet', 'IID_genotype', "CHR de Merge_CIA_610_660_QC", 'START', 'STOP']
cnv_names = util.make_uiid(arr, names_from)
print("with names from: ", names_from)
print("we have {} unique elements out of {} rows in the tsv".format(
len(np.unique(cnv_names)), len(cnv_names)))
#---------- CNV uniques ?
names_from = ["CHR de Merge_CIA_610_660_QC", 'START', 'STOP']
cnv_names = util.make_uiid(arr, names_from)
print("with names from: ", names_from)
print("we have {} unique elements out of {} rows in the tsv".format(
len(np.unique(cnv_names)), len(cnv_names)))
#---------- sujets uniques ?
names_from = ['IID_projet'] # , 'IID_genotype']
cnv_names = util.make_uiid(arr, names_from)
print("with names from: ", names_from)
print("we have {} unique elements out of {} rows in the tsv".format(
len(np.unique(cnv_names)), len(cnv_names)))
dangers = np.asarray([line.split('\t')[i_DANGER] for line in arr[1:]])
scores = np.asarray([line.split('\t')[i_SCORE] for line in arr[1:]])
#danger_not_empty = dangers != ''
#print(danger_not_empty.sum())
#print(len(np.unique(cnv_name)))
#print(cnv_name[:10])
In [32]:
from collections import OrderedDict
cnv = OrderedDict()
names_from = ["CHR de Merge_CIA_610_660_QC", 'START', 'STOP']
#, "5'gene", "3'gene", "5'dist(kb)", "3'dist(kb)"]
blank_dgr = 0
for line in arr[1:]:
lline = line.split('\t')
dgr = lline[i_DANGER]
scr = lline[i_SCORE]
cnv_iid = util.make_uiid(line, names_from, arr[0])
if dgr != '':
add_cnv = (util.danger_score(lline[i_DANGER], util.pH1_with_apriori),
score2prob(util.process_one_score(lline[i_SCORE], max_score)))
if cnv_iid in cnv.keys():
cnv[cnv_iid].append(add_cnv)
else:
cnv[cnv_iid] = [add_cnv]
else:
blank_dgr += 1
In [33]:
print(len(cnv), (blank_dgr))
print([k for k in cnv.keys()[:5]])
print([k for k in cnv.values()[:5]])
In [34]:
for k in cnv.keys()[3340:3350]:
print(k,': ',cnv[k])
In [35]:
cnv = OrderedDict({})
#names_from = ['START', 'STOP', "5'gene", "3'gene", "5'dist(kb)", "3'dist(kb)"]
names_from = ['IID_projet']
for line in arr[1:]:
lline = line.split('\t')
dgr = lline[i_DANGER]
scr = lline[i_SCORE]
sub_iid = util.make_uiid(line, names_from, arr[0])
if dgr != '':
add_cnv = (util.danger_score(lline[i_DANGER], util.pH1_with_apriori),
score2prob(util.process_one_score(lline[i_SCORE], max_score)))
if sub_iid in cnv.keys():
cnv[sub_iid].append(add_cnv)
else:
cnv[sub_iid] = [add_cnv]
In [36]:
print(len(cnv))
nbcnv = [len(cnv[sb]) for sb in cnv]
hist = plt.hist(nbcnv, bins=50)
print(np.max(np.asarray(nbcnv)))
In [37]:
# definition of dangerosity from a list of cnv
def dangerosity(listofcnvs):
"""
inputs: list tuples (danger_score, proba_cnv)
returns: a dangerosity score
"""
last = -1 #slicing the last
tmp = [np.asarray(t) for t in zip(*listofcnvs)]
return tmp[0].dot(tmp[1])
# or: return np.asarray([dgr*prob for (dgr,prob) in listofcnvs]).cumsum()[last]
In [38]:
for k in range(1,30, 30):
print(cnv[cnv.keys()[k]], ' yields ', dangerosity(cnv[cnv.keys()[k]]))
test_dangerosity_input = [[(1., .5), (1., .5), (1., .5), (1., .5)],
[(2., 1.)],
[(10000., 0.)]]
test_dangerosity_output = [2., 2., 0]
#print( [dangerosity(icnv) for icnv in test_dangerosity_input]) # == test_dangerosity_output
assert( [dangerosity(icnv) for icnv in test_dangerosity_input] == test_dangerosity_output)
In [39]:
dtime = datetime.now().strftime("%y-%m-%d_h%H-%M")
outfile = dtime+'dangerosity_cnv.txt'
fulloutfile = osp.join(CWD, outfile)
with open(fulloutfile, 'w') as outf:
for sub in cnv:
outf.write("\t".join([sub, str(dangerosity(cnv[sub]))]) + "\n")